diff --git a/.bzrignore b/.bzrignore
index 392572a3f24..ea3f83a4bd5 100644
--- a/.bzrignore
+++ b/.bzrignore
@@ -1444,6 +1444,21 @@ storage/innobase/ib_config.h
 storage/innobase/ib_config.h.in
 storage/innobase/mkinstalldirs
 storage/innobase/stamp-h1
+storage/xtradb/autom4te-2.53.cache/*
+storage/xtradb/autom4te-2.53.cache/output.0
+storage/xtradb/autom4te-2.53.cache/requests
+storage/xtradb/autom4te-2.53.cache/traces.0
+storage/xtradb/autom4te.cache/*
+storage/xtradb/autom4te.cache/output.0
+storage/xtradb/autom4te.cache/requests
+storage/xtradb/autom4te.cache/traces.0
+storage/xtradb/configure.lineno
+storage/xtradb/conftest.s1
+storage/xtradb/conftest.subs
+storage/xtradb/ib_config.h
+storage/xtradb/ib_config.h.in
+storage/xtradb/mkinstalldirs
+storage/xtradb/stamp-h1
 storage/maria/*.MAD
 storage/maria/*.MAI
 storage/maria/ma_rt_test
diff --git a/BUILD/compile-innodb b/BUILD/compile-innodb
new file mode 100755
index 00000000000..82601f03ae9
--- /dev/null
+++ b/BUILD/compile-innodb
@@ -0,0 +1,24 @@
+#! /bin/sh
+#
+# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+path=`dirname $0`
+. "$path/SETUP.sh"
+
+extra_flags="$pentium_cflags $fast_cflags -g"
+extra_configs="$pentium_configs $static_link --with-plugins=innobase"
+
+. "$path/FINISH.sh"
diff --git a/BUILD/compile-innodb-debug b/BUILD/compile-innodb-debug
new file mode 100755
index 00000000000..efb4abf88d5
--- /dev/null
+++ b/BUILD/compile-innodb-debug
@@ -0,0 +1,24 @@
+#! /bin/sh
+#
+# Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+path=`dirname $0`
+. "$path/SETUP.sh" $@ --with-debug=full
+
+extra_flags="$pentium_cflags $debug_cflags"
+extra_configs="$pentium_configs $debug_configs --with-plugins=innobase"
+
+. "$path/FINISH.sh"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b0ce594a425..c6b8e7e2325 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -263,7 +263,7 @@ IF(WITH_FEDERATED_STORAGE_ENGINE)
   ADD_SUBDIRECTORY(storage/federated)
 ENDIF(WITH_FEDERATED_STORAGE_ENGINE)
 IF(WITH_INNOBASE_STORAGE_ENGINE)
-  ADD_SUBDIRECTORY(storage/innobase)
+  ADD_SUBDIRECTORY(storage/xtradb)
 ENDIF(WITH_INNOBASE_STORAGE_ENGINE)
 IF(WITH_MARIA_STORAGE_ENGINE)
   ADD_SUBDIRECTORY(storage/maria)
diff --git a/configure.in b/configure.in
index a46d25b1d12..57c32fbca64 100644
--- a/configure.in
+++ b/configure.in
@@ -1729,6 +1729,30 @@ then
   fi
 fi
 
+AC_CACHE_CHECK([whether the compiler provides atomic builtins],
+               [mysql_cv_gcc_atomic_builtins], [AC_TRY_RUN([
+  int main()
+  {
+    int foo= -10; int bar= 10;
+    if (!__sync_fetch_and_add(&foo, bar) || foo)
+      return -1;
+    bar= __sync_lock_test_and_set(&foo, bar);
+    if (bar || foo != 10)
+      return -1;
+    bar= __sync_val_compare_and_swap(&bar, foo, 15);
+    if (bar)
+      return -1;
+    return 0;
+  }
+], [mysql_cv_gcc_atomic_builtins=yes],
+   [mysql_cv_gcc_atomic_builtins=no],
+   [mysql_cv_gcc_atomic_builtins=no])])
+
+if test "x$mysql_cv_gcc_atomic_builtins" = xyes; then
+  AC_DEFINE(HAVE_GCC_ATOMIC_BUILTINS, 1,
+            [Define to 1 if compiler provides atomic builtins.])
+fi
+
 AC_ARG_WITH([atomic-ops],
             AC_HELP_STRING([--with-atomic-ops=rwlocks|smp|up],
             [Implement atomic operations using pthread rwlocks or atomic CPU
@@ -1742,28 +1766,9 @@ case "$with_atomic_ops" in
                 [Use pthread rwlocks for atomic ops]) ;;
     "smp") ;;
     "")
-        AC_CACHE_CHECK([whether the compiler provides atomic builtins],
-                       [mysql_cv_gcc_atomic_builtins], [AC_TRY_RUN([
-          int main()
-          {
-            int foo= -10; int bar= 10;
-            if (!__sync_fetch_and_add(&foo, bar) || foo)
-              return -1;
-            bar= __sync_lock_test_and_set(&foo, bar);
-            if (bar || foo != 10)
-              return -1;
-            bar= __sync_val_compare_and_swap(&bar, foo, 15);
-            if (bar)
-              return -1;
-            return 0;
-          }
-        ], [mysql_cv_gcc_atomic_builtins=yes_but_disabled],
-           [mysql_cv_gcc_atomic_builtins=no],
-           [mysql_cv_gcc_atomic_builtins=no])])
-
-        if test "x$mysql_cv_gcc_atomic_builtins" = xyes; then
-          AC_DEFINE(HAVE_GCC_ATOMIC_BUILTINS, 1,
-            [Define to 1 if compiler provides atomic builtins.])
+        if test "x$mysql_cv_gcc_atomic_builtins" = xyes_but_disabled; then
+          AC_DEFINE([MY_ATOMIC_MODE_GCC_BUILTINS], [1],
+            [Use GCC atomic builtins for atomic ops])
 	fi
 	;;
     *) AC_MSG_ERROR(["$with_atomic_ops" is not a valid value for --with-atomic-ops]) ;;
diff --git a/include/atomic/nolock.h b/include/atomic/nolock.h
index cafd916981d..550b53adcd9 100644
--- a/include/atomic/nolock.h
+++ b/include/atomic/nolock.h
@@ -14,7 +14,7 @@
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
 
 #if defined(__i386__) || defined(_MSC_VER) || \
-    defined(__x86_64__) || defined(HAVE_GCC_ATOMIC_BUILTINS)
+    defined(__x86_64__) || defined(MY_ATOMIC_MODE_GCC_BUILTINS)
 
 # ifdef MY_ATOMIC_MODE_DUMMY
 #  define LOCK_prefix ""
@@ -22,7 +22,7 @@
 #  define LOCK_prefix "lock"
 # endif
 
-# ifdef HAVE_GCC_ATOMIC_BUILTINS
+# ifdef MY_ATOMIC_MODE_GCC_BUILTINS
 #  include "gcc_builtins.h"
 # elif __GNUC__
 #  include "x86-gcc.h"
diff --git a/include/my_sys.h b/include/my_sys.h
index fd1c486cead..ad23abafa0a 100644
--- a/include/my_sys.h
+++ b/include/my_sys.h
@@ -791,6 +791,9 @@ extern size_t my_b_gets(IO_CACHE *info, char *to, size_t max_length);
 extern my_off_t my_b_filelength(IO_CACHE *info);
 extern size_t my_b_printf(IO_CACHE *info, const char* fmt, ...);
 extern size_t my_b_vprintf(IO_CACHE *info, const char* fmt, va_list ap);
+extern int init_strvar_from_file(char *var, int max_size, IO_CACHE *f,
+                                 const char *default_val);
+extern int init_intvar_from_file(int* var, IO_CACHE* f, int default_val);
 extern my_bool open_cached_file(IO_CACHE *cache,const char *dir,
                                 const char *prefix, size_t cache_size,
                                 myf cache_myflags);
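Editor's note: the two IO_CACHE helpers exported above were previously private to the replication code; declaring them in my_sys.h makes them reachable from other callers. A minimal, hypothetical usage sketch follows, assuming the conventional master.info-style semantics (each call consumes one line from the cache, falling back to the default when the line is absent). The file name and variable names are invented for illustration.

```c
/* Hypothetical sketch of the helpers now declared in my_sys.h.
   Assumes each call reads one line from the IO_CACHE, master.info-style. */
#include <my_global.h>
#include <my_sys.h>

int read_settings(const char *fname)
{
  IO_CACHE cache;
  char host[64];
  int port;
  File fd = my_open(fname, O_RDONLY, MYF(MY_WME));
  if (fd < 0)
    return 1;
  if (init_io_cache(&cache, fd, 0, READ_CACHE, 0L, 0, MYF(MY_WME)))
  {
    my_close(fd, MYF(0));
    return 1;
  }
  /* First line: host name; fall back to "localhost" if missing. */
  init_strvar_from_file(host, sizeof(host), &cache, "localhost");
  /* Second line: port number; fall back to 3306 if missing. */
  init_intvar_from_file(&port, &cache, 3306);
  end_io_cache(&cache);
  my_close(fd, MYF(0));
  return 0;
}
```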
diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
index 5b22fd07e26..7f311844c6a 100644
--- a/libmysqld/CMakeLists.txt
+++ b/libmysqld/CMakeLists.txt
@@ -132,9 +132,9 @@ IF(WITH_FEDERATED_STORAGE_ENGINE)
 ENDIF(WITH_FEDERATED_STORAGE_ENGINE)
 IF(WITH_INNOBASE_STORAGE_ENGINE)
-  INCLUDE(${CMAKE_SOURCE_DIR}/storage/innobase/CMakeLists.txt)
+  INCLUDE(${CMAKE_SOURCE_DIR}/storage/xtradb/CMakeLists.txt)
   FOREACH(rpath ${INNOBASE_SOURCES})
-    SET(LIB_SOURCES ${LIB_SOURCES} ../storage/innobase/${rpath})
+    SET(LIB_SOURCES ${LIB_SOURCES} ../storage/xtradb/${rpath})
   ENDFOREACH(rpath)
 ENDIF(WITH_INNOBASE_STORAGE_ENGINE)
diff --git a/mysql-test/include/innodb-index.inc b/mysql-test/include/innodb-index.inc
new file mode 100644
index 00000000000..37de3162abe
--- /dev/null
+++ b/mysql-test/include/innodb-index.inc
@@ -0,0 +1,26 @@
+--eval create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb default charset=$charset
+insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
+commit;
+--error ER_DUP_ENTRY
+alter table t1 add unique index (b);
+insert into t1 values(8,9,'fff','fff');
+select * from t1;
+show create table t1;
+alter table t1 add index (b);
+insert into t1 values(10,10,'kkk','iii');
+select * from t1;
+select * from t1 force index(b) order by b;
+explain select * from t1 force index(b) order by b;
+show create table t1;
+alter table t1 add unique index (c), add index (d);
+insert into t1 values(11,11,'aaa','mmm');
+select * from t1;
+select * from t1 force index(b) order by b;
+select * from t1 force index(c) order by c;
+select * from t1 force index(d) order by d;
+explain select * from t1 force index(b) order by b;
+explain select * from t1 force index(c) order by c;
+explain select * from t1 force index(d) order by d;
+show create table t1;
+check table t1;
+drop table t1;
diff --git a/mysql-test/r/index_merge_innodb.result b/mysql-test/r/index_merge_innodb.result
index 588de70e6e5..e71f547d36a 100644
--- a/mysql-test/r/index_merge_innodb.result
+++ b/mysql-test/r/index_merge_innodb.result
@@ -111,7 +111,7 @@ count(*)
 explain select count(*) from t1 where
 key1a = 2 and key1b is null and key2a = 2 and key2b is null;
 id select_type table type possible_keys key key_len ref rows Extra
-1 SIMPLE t1 index_merge i1,i2 i1,i2 10,10 NULL 4 Using intersect(i1,i2); Using where; Using index
+1 SIMPLE t1 index_merge i1,i2 i1,i2 10,10 NULL 3 Using intersect(i1,i2); Using where; Using index
 select count(*) from t1 where
 key1a = 2 and key1b is null and key2a = 2 and key2b is null;
 count(*)
@@ -119,7 +119,7 @@ count(*)
 explain select count(*) from t1 where
 key1a = 2 and key1b is null and key3a = 2 and key3b is null;
 id select_type table type possible_keys key key_len ref rows Extra
-1 SIMPLE t1 index_merge i1,i3 i1,i3 10,10 NULL 4 Using intersect(i1,i3); Using where; Using index
+1 SIMPLE t1 index_merge i1,i3 i1,i3 10,10 NULL 3 Using intersect(i1,i3); Using where; Using index
 select count(*) from t1 where
 key1a = 2 and key1b is null and key3a = 2 and key3b is null;
 count(*)
diff --git a/mysql-test/r/information_schema.result b/mysql-test/r/information_schema.result
index 9ff2160e909..95ed38697f3 100644
--- a/mysql-test/r/information_schema.result
+++ b/mysql-test/r/information_schema.result
@@ -42,7 +42,7 @@ WHERE table_schema IN ('mysql', 'INFORMATION_SCHEMA', 'test', 'mysqltest') AND
 table_name<>'ndb_binlog_index' AND
 table_name<>'ndb_apply_status' AND
 NOT (table_schema = 'INFORMATION_SCHEMA' AND table_name LIKE 'PBXT_%');
-select * from v1;
+select * from v1 ORDER BY c COLLATE utf8_bin;
 c
 CHARACTER_SETS
 COLLATIONS
@@ -54,6 +54,17 @@ EVENTS
 FILES
 GLOBAL_STATUS
 GLOBAL_VARIABLES
+INNODB_BUFFER_POOL_PAGES
+INNODB_BUFFER_POOL_PAGES_BLOB
+INNODB_BUFFER_POOL_PAGES_INDEX
+INNODB_CMP
+INNODB_CMPMEM
+INNODB_CMPMEM_RESET
+INNODB_CMP_RESET
+INNODB_LOCKS
+INNODB_LOCK_WAITS
+INNODB_RSEG
+INNODB_TRX
 KEY_COLUMN_USAGE
 PARTITIONS
 PLUGINS
@@ -72,6 +83,7 @@ TABLE_PRIVILEGES
 TRIGGERS
 USER_PRIVILEGES
 VIEWS
+XTRADB_ENHANCEMENTS
 columns_priv
 db
 event
@@ -87,6 +99,11 @@ proc
 procs_priv
 servers
 slow_log
+t1
+t2
+t3
+t4
+t5
 tables_priv
 time_zone
 time_zone_leap_second
@@ -94,11 +111,6 @@ time_zone_name
 time_zone_transition
 time_zone_transition_type
 user
-t1
-t4
-t2
-t3
-t5
 v1
 select c,table_name from v1
 inner join information_schema.TABLES v2 on (v1.c=v2.table_name)
@@ -800,6 +812,8 @@ TABLES CREATE_TIME datetime
 TABLES UPDATE_TIME datetime
 TABLES CHECK_TIME datetime
 TRIGGERS CREATED datetime
+INNODB_TRX trx_started datetime
+INNODB_TRX trx_wait_started datetime
 event execute_at datetime
 event last_executed datetime
 event starts datetime
@@ -848,6 +862,7 @@ TABLES TABLE_NAME select
 TABLE_CONSTRAINTS TABLE_NAME select
 TABLE_PRIVILEGES TABLE_NAME select
 VIEWS TABLE_NAME select
+INNODB_BUFFER_POOL_PAGES_INDEX table_name select
 delete from mysql.user where user='mysqltest_4';
 delete from mysql.db where user='mysqltest_4';
 flush privileges;
@@ -1223,12 +1238,12 @@ DROP PROCEDURE p1;
 DROP USER mysql_bug20230@localhost;
 SELECT MAX(table_name) FROM information_schema.tables WHERE table_schema IN ('mysql', 'INFORMATION_SCHEMA', 'test');
 MAX(table_name)
-VIEWS
+XTRADB_ENHANCEMENTS
 SELECT table_name from information_schema.tables
 WHERE table_name=(SELECT MAX(table_name)
 FROM information_schema.tables WHERE table_schema IN ('mysql', 'INFORMATION_SCHEMA', 'test'));
 table_name
-VIEWS
+XTRADB_ENHANCEMENTS
 DROP TABLE IF EXISTS bug23037;
 DROP FUNCTION IF EXISTS get_value;
 SELECT COLUMN_NAME, MD5(COLUMN_DEFAULT), LENGTH(COLUMN_DEFAULT) FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='bug23037';
diff --git a/mysql-test/r/information_schema_all_engines.result b/mysql-test/r/information_schema_all_engines.result
index 4ffaa12fd61..ed73f0a42e0 100644
--- a/mysql-test/r/information_schema_all_engines.result
+++ b/mysql-test/r/information_schema_all_engines.result
@@ -29,7 +29,19 @@ TABLE_PRIVILEGES
 TRIGGERS
 USER_PRIVILEGES
 VIEWS
+INNODB_BUFFER_POOL_PAGES
 PBXT_STATISTICS
+INNODB_CMP
+INNODB_RSEG
+XTRADB_ENHANCEMENTS
+INNODB_BUFFER_POOL_PAGES_INDEX
+INNODB_BUFFER_POOL_PAGES_BLOB
+INNODB_TRX
+INNODB_CMP_RESET
+INNODB_LOCK_WAITS
+INNODB_CMPMEM_RESET
+INNODB_LOCKS
+INNODB_CMPMEM
 SELECT t.table_name, c1.column_name
 FROM information_schema.tables t
 INNER JOIN
@@ -73,7 +85,19 @@ TABLE_PRIVILEGES TABLE_SCHEMA
 TRIGGERS TRIGGER_SCHEMA
 USER_PRIVILEGES GRANTEE
 VIEWS TABLE_SCHEMA
+INNODB_BUFFER_POOL_PAGES page_type
 PBXT_STATISTICS ID
+INNODB_CMP page_size
+INNODB_RSEG rseg_id
+XTRADB_ENHANCEMENTS name
+INNODB_BUFFER_POOL_PAGES_INDEX schema_name
+INNODB_BUFFER_POOL_PAGES_BLOB space_id
+INNODB_TRX trx_id
+INNODB_CMP_RESET page_size
+INNODB_LOCK_WAITS requesting_trx_id
+INNODB_CMPMEM_RESET page_size
+INNODB_LOCKS lock_id
+INNODB_CMPMEM page_size
 SELECT t.table_name, c1.column_name
 FROM information_schema.tables t
 INNER JOIN
@@ -117,7 +141,19 @@ TABLE_PRIVILEGES TABLE_SCHEMA
 TRIGGERS TRIGGER_SCHEMA
 USER_PRIVILEGES GRANTEE
 VIEWS TABLE_SCHEMA
+INNODB_BUFFER_POOL_PAGES page_type
 PBXT_STATISTICS ID
+INNODB_CMP page_size
+INNODB_RSEG rseg_id
+XTRADB_ENHANCEMENTS name
+INNODB_BUFFER_POOL_PAGES_INDEX schema_name
+INNODB_BUFFER_POOL_PAGES_BLOB space_id
+INNODB_TRX trx_id
+INNODB_CMP_RESET page_size
+INNODB_LOCK_WAITS requesting_trx_id
+INNODB_CMPMEM_RESET page_size
+INNODB_LOCKS lock_id
+INNODB_CMPMEM page_size
 select 1 as f1 from information_schema.tables where "CHARACTER_SETS"=
 (select cast(table_name as char) from information_schema.tables
 order by table_name limit 1) limit 1;
@@ -149,6 +185,17 @@ EVENTS information_schema.EVENTS 1
 FILES information_schema.FILES 1
 GLOBAL_STATUS information_schema.GLOBAL_STATUS 1
 GLOBAL_VARIABLES information_schema.GLOBAL_VARIABLES 1
+INNODB_BUFFER_POOL_PAGES information_schema.INNODB_BUFFER_POOL_PAGES 1
+INNODB_BUFFER_POOL_PAGES_BLOB information_schema.INNODB_BUFFER_POOL_PAGES_BLOB 1
+INNODB_BUFFER_POOL_PAGES_INDEX information_schema.INNODB_BUFFER_POOL_PAGES_INDEX 1
+INNODB_CMP information_schema.INNODB_CMP 1
+INNODB_CMPMEM information_schema.INNODB_CMPMEM 1
+INNODB_CMPMEM_RESET information_schema.INNODB_CMPMEM_RESET 1
+INNODB_CMP_RESET information_schema.INNODB_CMP_RESET 1
+INNODB_LOCKS information_schema.INNODB_LOCKS 1
+INNODB_LOCK_WAITS information_schema.INNODB_LOCK_WAITS 1
+INNODB_RSEG information_schema.INNODB_RSEG 1
+INNODB_TRX information_schema.INNODB_TRX 1
 KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1
 PARTITIONS information_schema.PARTITIONS 1
 PBXT_STATISTICS information_schema.PBXT_STATISTICS 1
@@ -168,6 +215,7 @@ TABLE_PRIVILEGES information_schema.TABLE_PRIVILEGES 1
 TRIGGERS information_schema.TRIGGERS 1
 USER_PRIVILEGES information_schema.USER_PRIVILEGES 1
 VIEWS information_schema.VIEWS 1
+XTRADB_ENHANCEMENTS information_schema.XTRADB_ENHANCEMENTS 1
 Database: information_schema
 +---------------------------------------+
 |                Tables                 |
 +---------------------------------------+
@@ -200,7 +248,19 @@ Database: information_schema
 | TRIGGERS                              |
 | USER_PRIVILEGES                       |
 | VIEWS                                 |
+| INNODB_BUFFER_POOL_PAGES              |
 | PBXT_STATISTICS                       |
+| INNODB_CMP                            |
+| INNODB_RSEG                           |
+| XTRADB_ENHANCEMENTS                   |
+| INNODB_BUFFER_POOL_PAGES_INDEX        |
+| INNODB_BUFFER_POOL_PAGES_BLOB         |
+| INNODB_TRX                            |
+| INNODB_CMP_RESET                      |
+| INNODB_LOCK_WAITS                     |
+| INNODB_CMPMEM_RESET                   |
+| INNODB_LOCKS                          |
+| INNODB_CMPMEM                         |
 +---------------------------------------+
 Database: INFORMATION_SCHEMA
 +---------------------------------------+
 |                Tables                 |
 +---------------------------------------+
@@ -234,7 +294,19 @@ Database: INFORMATION_SCHEMA
 | TRIGGERS                              |
 | USER_PRIVILEGES                       |
 | VIEWS                                 |
+| INNODB_BUFFER_POOL_PAGES              |
 | PBXT_STATISTICS                       |
+| INNODB_CMP                            |
+| INNODB_RSEG                           |
+| XTRADB_ENHANCEMENTS                   |
+| INNODB_BUFFER_POOL_PAGES_INDEX        |
+| INNODB_BUFFER_POOL_PAGES_BLOB         |
+| INNODB_TRX                            |
+| INNODB_CMP_RESET                      |
+| INNODB_LOCK_WAITS                     |
+| INNODB_CMPMEM_RESET                   |
+| INNODB_LOCKS                          |
+| INNODB_CMPMEM                         |
 +---------------------------------------+
 Wildcard: inf_rmation_schema
 +--------------------+
@@ -244,5 +316,5 @@ Wildcard: inf_rmation_schema
 +--------------------+
 SELECT table_schema, count(*) FROM information_schema.TABLES WHERE table_schema IN ('mysql', 'INFORMATION_SCHEMA', 'test', 'mysqltest') AND table_name<>'ndb_binlog_index' AND table_name<>'ndb_apply_status' GROUP BY TABLE_SCHEMA;
 table_schema count(*)
-information_schema 29
+information_schema 41
 mysql 22
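Editor's note: the expected results above enumerate the INFORMATION_SCHEMA tables that XtraDB adds (INNODB_LOCKS, INNODB_TRX, INNODB_CMP, INNODB_RSEG, XTRADB_ENHANCEMENTS, and so on). The patch itself only exercises them through mysqltest; as a quick illustration of consuming one from a client, here is a short libmysqlclient program that dumps INNODB_LOCKS. The connection parameters are placeholders, not part of the patch.

```c
/* Illustrative only: dump information_schema.INNODB_LOCKS with
   libmysqlclient.  Build: gcc locks.c $(mysql_config --cflags --libs) */
#include <stdio.h>
#include <mysql.h>

int main(void)
{
  MYSQL *con = mysql_init(NULL);
  MYSQL_RES *res;
  MYSQL_ROW row;
  unsigned int i, nfields;

  /* Placeholder credentials; adjust for a real server. */
  if (!mysql_real_connect(con, "localhost", "root", "", NULL, 0, NULL, 0))
  {
    fprintf(stderr, "connect: %s\n", mysql_error(con));
    return 1;
  }
  if (mysql_query(con, "SELECT * FROM information_schema.INNODB_LOCKS") ||
      !(res = mysql_store_result(con)))
  {
    fprintf(stderr, "query: %s\n", mysql_error(con));
    mysql_close(con);
    return 1;
  }
  nfields = mysql_num_fields(res);
  while ((row = mysql_fetch_row(res)))
  {
    for (i = 0; i < nfields; i++)
      printf("%s%s", row[i] ? row[i] : "NULL", i + 1 < nfields ? "\t" : "\n");
  }
  mysql_free_result(res);
  mysql_close(con);
  return 0;
}
```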
diff --git a/mysql-test/r/innodb-analyze.result b/mysql-test/r/innodb-analyze.result
new file mode 100644
index 00000000000..2aee004a2d6
--- /dev/null
+++ b/mysql-test/r/innodb-analyze.result
@@ -0,0 +1,2 @@
+Variable_name Value
+innodb_stats_sample_pages 1
diff --git a/mysql-test/r/innodb-autoinc.result b/mysql-test/r/innodb-autoinc.result
index 36d8e26de9e..8174ba52b41 100644
--- a/mysql-test/r/innodb-autoinc.result
+++ b/mysql-test/r/innodb-autoinc.result
@@ -581,11 +581,10 @@ c1
 DROP TABLE t1;
 SET @@SESSION.AUTO_INCREMENT_INCREMENT=1, @@SESSION.AUTO_INCREMENT_OFFSET=1;
 SET @@INSERT_ID=1;
-SHOW VARIABLES LIKE "%auto_inc%";
+SHOW VARIABLES LIKE "auto_inc%";
 Variable_name Value
 auto_increment_increment 1
 auto_increment_offset 1
-pbxt_auto_increment_mode 0
 CREATE TABLE t1 (c1 DOUBLE NOT NULL AUTO_INCREMENT, c2 INT, PRIMARY KEY (c1)) ENGINE=InnoDB;
 INSERT INTO t1 VALUES(NULL, 1);
 INSERT INTO t1 VALUES(NULL, 2);
diff --git a/mysql-test/r/innodb-index.result b/mysql-test/r/innodb-index.result
new file mode 100644
index 00000000000..e096334dff3
--- /dev/null
+++ b/mysql-test/r/innodb-index.result
@@ -0,0 +1,1137 @@
+SET @save_innodb_file_format_check=@@global.innodb_file_format_check;
+create table t1(a int not null, b int, c char(10) not null, d varchar(20)) engine = innodb;
+insert into t1 values (5,5,'oo','oo'),(4,4,'tr','tr'),(3,4,'ad','ad'),(2,3,'ak','ak');
+commit;
+alter table t1 add index b (b), add index b (b);
+ERROR 42000: Duplicate key name 'b'
+alter table t1 add index (b,b);
+ERROR 42S21: Duplicate column name 'b'
+alter table t1 add index d2 (d);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  KEY `d2` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+explain select * from t1 force index(d2) order by d;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL d2 23 NULL 4
+select * from t1 force index (d2) order by d;
+a b c d
+3 4 ad ad
+2 3 ak ak
+5 5 oo oo
+4 4 tr tr
+alter table t1 add unique index (b);
+ERROR 23000: Duplicate entry '4' for key 'b'
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  KEY `d2` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t1 add index (b);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  KEY `d2` (`d`),
+  KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+CREATE TABLE `t1#1`(a INT PRIMARY KEY) ENGINE=InnoDB;
+call mtr.add_suppression(" table `test`\\.`t1#[12]` already exists in InnoDB internal");
+alter table t1 add unique index (c), add index (d);
+ERROR HY000: Table 'test.t1#1' already exists
+rename table `t1#1` to `t1#2`;
+alter table t1 add unique index (c), add index (d);
+ERROR HY000: Table 'test.t1#2' already exists
+drop table `t1#2`;
+alter table t1 add unique index (c), add index (d);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  UNIQUE KEY `c` (`c`),
+  KEY `d2` (`d`),
+  KEY `b` (`b`),
+  KEY `d` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL c 10 NULL 4
+alter table t1 add primary key (a), drop index c;
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `d2` (`d`),
+  KEY `b` (`b`),
+  KEY `d` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t1 add primary key (c);
+ERROR 42000: Multiple primary key defined
+alter table t1 drop primary key, add primary key (b);
+ERROR 23000: Duplicate entry '4' for key 'PRIMARY'
+create unique index c on t1 (c);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `c` (`c`),
+  KEY `d2` (`d`),
+  KEY `b` (`b`),
+  KEY `d` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL c 10 NULL 4
+select * from t1 force index(c) order by c;
+a b c d
+3 4 ad ad
+2 3 ak ak
+5 5 oo oo
+4 4 tr tr
+alter table t1 drop index b, add index (b);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `c` (`c`),
+  KEY `d2` (`d`),
+  KEY `d` (`d`),
+  KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+insert into t1 values(6,1,'ggg','ggg');
+select * from t1;
+a b c d
+2 3 ak ak
+3 4 ad ad
+4 4 tr tr
+5 5 oo oo
+6 1 ggg ggg
+select * from t1 force index(b) order by b;
+a b c d
+6 1 ggg ggg
+2 3 ak ak
+3 4 ad ad
+4 4 tr tr
+5 5 oo oo
+select * from t1 force index(c) order by c;
+a b c d
+3 4 ad ad
+2 3 ak ak
+6 1 ggg ggg
+5 5 oo oo
+4 4 tr tr
+select * from t1 force index(d) order by d;
+a b c d
+3 4 ad ad
+2 3 ak ak
+6 1 ggg ggg
+5 5 oo oo
+4 4 tr tr
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 5 NULL 5
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL c 10 NULL 5
+explain select * from t1 force index(d) order by d;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL d 23 NULL 5
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) NOT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `c` (`c`),
+  KEY `d2` (`d`),
+  KEY `d` (`d`),
+  KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table t1;
+create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb;
+insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,3,'ad','ad'),(4,4,'afe','afe');
+commit;
+alter table t1 add index (c(2));
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `c` (`c`(2))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t1 add unique index (d(10));
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `d` (`d`(10)),
+  KEY `c` (`c`(2))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+insert into t1 values(5,1,'ggg','ggg');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 3 ad ad
+4 4 afe afe
+5 1 ggg ggg
+select * from t1 force index(c) order by c;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 3 ad ad
+4 4 afe afe
+5 1 ggg ggg
+select * from t1 force index(d) order by d;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 3 ad ad
+4 4 afe afe
+5 1 ggg ggg
+explain select * from t1 order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5 Using filesort
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5 Using filesort
+explain select * from t1 force index(d) order by d;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5 Using filesort
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `d` (`d`(10)),
+  KEY `c` (`c`(2))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t1 drop index d;
+insert into t1 values(8,9,'fff','fff');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 3 ad ad
+4 4 afe afe
+5 1 ggg ggg
+8 9 fff fff
+select * from t1 force index(c) order by c;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 3 ad ad
+4 4 afe afe
+8 9 fff fff
+5 1 ggg ggg
+explain select * from t1 order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using filesort
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using filesort
+explain select * from t1 order by d;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using filesort
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `c` (`c`(2))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table t1;
+create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb;
+insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
+commit;
+alter table t1 add unique index (b,c);
+insert into t1 values(8,9,'fff','fff');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+select * from t1 force index(b) order by b;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 16 NULL 5
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `b` (`b`,`c`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t1 add index (b,c);
+insert into t1 values(11,11,'kkk','kkk');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+11 11 kkk kkk
+select * from t1 force index(b) order by b;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+11 11 kkk kkk
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 16 NULL 6
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `b` (`b`,`c`),
+  KEY `b_2` (`b`,`c`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t1 add unique index (c,d);
+insert into t1 values(13,13,'yyy','aaa');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+11 11 kkk kkk
+13 13 yyy aaa
+select * from t1 force index(b) order by b;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+11 11 kkk kkk
+13 13 yyy aaa
+select * from t1 force index(c) order by c;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+11 11 kkk kkk
+13 13 yyy aaa
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 16 NULL 7
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL c 34 NULL 7
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `b` (`b`,`c`),
+  UNIQUE KEY `c` (`c`,`d`),
+  KEY `b_2` (`b`,`c`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table t1;
+create table t1(a int not null, b int not null, c int, primary key (a), key (b)) engine = innodb;
+create table t3(a int not null, c int not null, d int, primary key (a), key (c)) engine = innodb;
+create table t4(a int not null, d int not null, e int, primary key (a), key (d)) engine = innodb;
+create table t2(a int not null, b int not null, c int not null, d int not null, e int,
+foreign key (b) references t1(b) on delete cascade,
+foreign key (c) references t3(c), foreign key (d) references t4(d))
+engine = innodb;
+alter table t1 drop index b;
+ERROR HY000: Cannot drop index 'b': needed in a foreign key constraint
+alter table t3 drop index c;
+ERROR HY000: Cannot drop index 'c': needed in a foreign key constraint
+alter table t4 drop index d;
+ERROR HY000: Cannot drop index 'd': needed in a foreign key constraint
+alter table t2 drop index b;
+ERROR HY000: Cannot drop index 'b': needed in a foreign key constraint
+alter table t2 drop index b, drop index c, drop index d;
+ERROR HY000: Cannot drop index 'b': needed in a foreign key constraint
+create unique index dc on t2 (d,c);
+create index dc on t1 (b,c);
+alter table t2 add primary key (a);
+insert into t1 values (1,1,1);
+insert into t3 values (1,1,1);
+insert into t4 values (1,1,1);
+insert into t2 values (1,1,1,1,1);
+commit;
+alter table t4 add constraint dc foreign key (a) references t1(a);
+show create table t4;
+Table Create Table
+t4 CREATE TABLE `t4` (
+  `a` int(11) NOT NULL,
+  `d` int(11) NOT NULL,
+  `e` int(11) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `d` (`d`),
+  CONSTRAINT `dc` FOREIGN KEY (`a`) REFERENCES `t1` (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t3 add constraint dc foreign key (a) references t1(a);
+ERROR HY000: Can't create table '#sql-temporary' (errno: 121)
+show create table t3;
+Table Create Table
+t3 CREATE TABLE `t3` (
+  `a` int(11) NOT NULL,
+  `c` int(11) NOT NULL,
+  `d` int(11) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `c` (`c`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+alter table t2 drop index b, add index (b);
+show create table t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `a` int(11) NOT NULL,
+  `b` int(11) NOT NULL,
+  `c` int(11) NOT NULL,
+  `d` int(11) NOT NULL,
+  `e` int(11) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `dc` (`d`,`c`),
+  KEY `c` (`c`),
+  KEY `b` (`b`),
+  CONSTRAINT `t2_ibfk_1` FOREIGN KEY (`b`) REFERENCES `t1` (`b`) ON DELETE CASCADE,
+  CONSTRAINT `t2_ibfk_2` FOREIGN KEY (`c`) REFERENCES `t3` (`c`),
+  CONSTRAINT `t2_ibfk_3` FOREIGN KEY (`d`) REFERENCES `t4` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+delete from t1;
+ERROR 23000: Cannot delete or update a parent row: a foreign key constraint fails (`test`.`t4`, CONSTRAINT `dc` FOREIGN KEY (`a`) REFERENCES `t1` (`a`))
+drop index dc on t4;
+ERROR 42000: Can't DROP 'dc'; check that column/key exists
+alter table t3 drop foreign key dc;
+ERROR HY000: Error on rename of './test/t3' to '#sql2-temporary' (errno: 152)
+alter table t4 drop foreign key dc;
+select * from t2;
+a b c d e
+1 1 1 1 1
+delete from t1;
+select * from t2;
+a b c d e
+drop table t2,t4,t3,t1;
+create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb default charset=utf8;
+insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
+commit;
+alter table t1 add unique index (b);
+ERROR 23000: Duplicate entry '2' for key 'b'
+insert into t1 values(8,9,'fff','fff');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8
+alter table t1 add index (b);
+insert into t1 values(10,10,'kkk','iii');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+select * from t1 force index(b) order by b;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 5 NULL 6
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8
+alter table t1 add unique index (c), add index (d);
+insert into t1 values(11,11,'aaa','mmm');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+11 11 aaa mmm
+select * from t1 force index(b) order by b;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+11 11 aaa mmm
+select * from t1 force index(c) order by c;
+a b c d
+11 11 aaa mmm
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+select * from t1 force index(d) order by d;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+11 11 aaa mmm
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 5 NULL 7
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL c 31 NULL 7
+explain select * from t1 force index(d) order by d;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL d 63 NULL 7
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `c` (`c`),
+  KEY `b` (`b`),
+  KEY `d` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8
+check table t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+drop table t1;
+create table t1(a int not null, b int) engine = innodb;
+insert into t1 values (1,1),(1,1),(1,1),(1,1);
+alter table t1 add unique index (a);
+ERROR 23000: Duplicate entry '1' for key 'a'
+alter table t1 add unique index (b);
+ERROR 23000: Duplicate entry '1' for key 'b'
+alter table t1 add unique index (a), add unique index(b);
+ERROR 23000: Duplicate entry '1' for key 'a'
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table t1;
+create table t1(a int not null, c int not null,b int, primary key(a), unique key(c), key(b)) engine = innodb;
+alter table t1 drop index c, drop index b;
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `c` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table t1;
+create table t1(a int not null, b int, primary key(a)) engine = innodb;
+alter table t1 add index (b);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table t1;
+create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb;
+insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,3,'ac','ac'),(4,4,'afe','afe'),(5,4,'affe','affe');
+alter table t1 add unique index (b), add unique index (c), add unique index (d);
+ERROR 23000: Duplicate entry '4' for key 'b'
+alter table t1 add unique index (c), add unique index (b), add index (d);
+ERROR 23000: Duplicate entry 'ac' for key 'c'
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+drop table t1;
+create table t1(a int not null, b int not null, c int, primary key (a), key(c)) engine=innodb;
+insert into t1 values (5,1,5),(4,2,4),(3,3,3),(2,4,2),(1,5,1);
+alter table t1 add unique index (b);
+insert into t1 values (10,20,20),(11,19,19),(12,18,18),(13,17,17);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) NOT NULL,
+  `c` int(11) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `b` (`b`),
+  KEY `c` (`c`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+check table t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL c 5 NULL 9
+explain select * from t1 order by a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 4 NULL 9
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 4 NULL 9
+select * from t1 order by a;
+a b c
+1 5 1
+2 4 2
+3 3 3
+4 2 4
+5 1 5
+10 20 20
+11 19 19
+12 18 18
+13 17 17
+select * from t1 force index(b) order by b;
+a b c
+5 1 5
+4 2 4
+3 3 3
+2 4 2
+1 5 1
+13 17 17
+12 18 18
+11 19 19
+10 20 20
+select * from t1 force index(c) order by c;
+a b c
+1 5 1
+2 4 2
+3 3 3
+4 2 4
+5 1 5
+13 17 17
+12 18 18
+11 19 19
+10 20 20
+drop table t1;
+create table t1(a int not null, b int not null) engine=innodb;
+insert into t1 values (1,1);
+alter table t1 add primary key(b);
+insert into t1 values (2,2);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) NOT NULL,
+  PRIMARY KEY (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+check table t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+select * from t1;
+a b
+1 1
+2 2
+explain select * from t1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2
+explain select * from t1 order by a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 Using filesort
+explain select * from t1 order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 4 NULL 2
+checksum table t1;
+Table Checksum
+test.t1 582702641
+drop table t1;
+create table t1(a int not null) engine=innodb;
+insert into t1 values (1);
+alter table t1 add primary key(a);
+insert into t1 values (2);
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+check table t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+commit;
+select * from t1;
+a
+1
+2
+explain select * from t1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 4 NULL 2 Using index
+explain select * from t1 order by a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 4 NULL 2 Using index
+drop table t1;
+create table t2(d varchar(17) primary key) engine=innodb default charset=utf8;
+create table t3(a int primary key) engine=innodb;
+insert into t3 values(22),(44),(33),(55),(66);
+insert into t2 values ('jejdkrun87'),('adfd72nh9k'),
+('adfdpplkeock'),('adfdijnmnb78k'),('adfdijn0loKNHJik');
+create table t1(a int, b blob, c text, d text not null)
+engine=innodb default charset = utf8;
+insert into t1
+select a,left(repeat(d,100*a),65535),repeat(d,20*a),d from t2,t3;
+drop table t2, t3;
+select count(*) from t1 where a=44;
+count(*)
+5
+select a,
+length(b),b=left(repeat(d,100*a),65535),length(c),c=repeat(d,20*a),d from t1;
+a length(b) b=left(repeat(d,100*a),65535) length(c) c=repeat(d,20*a) d
+22 22000 1 4400 1 adfd72nh9k
+22 35200 1 7040 1 adfdijn0loKNHJik
+22 28600 1 5720 1 adfdijnmnb78k
+22 26400 1 5280 1 adfdpplkeock
+22 22000 1 4400 1 jejdkrun87
+33 33000 1 6600 1 adfd72nh9k
+33 52800 1 10560 1 adfdijn0loKNHJik
+33 42900 1 8580 1 adfdijnmnb78k
+33 39600 1 7920 1 adfdpplkeock
+33 33000 1 6600 1 jejdkrun87
+44 44000 1 8800 1 adfd72nh9k
+44 65535 1 14080 1 adfdijn0loKNHJik
+44 57200 1 11440 1 adfdijnmnb78k
+44 52800 1 10560 1 adfdpplkeock
+44 44000 1 8800 1 jejdkrun87
+55 55000 1 11000 1 adfd72nh9k
+55 65535 1 17600 1 adfdijn0loKNHJik
+55 65535 1 14300 1 adfdijnmnb78k
+55 65535 1 13200 1 adfdpplkeock
+55 55000 1 11000 1 jejdkrun87
+66 65535 1 13200 1 adfd72nh9k
+66 65535 1 21120 1 adfdijn0loKNHJik
+66 65535 1 17160 1 adfdijnmnb78k
+66 65535 1 15840 1 adfdpplkeock
+66 65535 1 13200 1 jejdkrun87
+alter table t1 add primary key (a), add key (b(20));
+ERROR 23000: Duplicate entry '22' for key 'PRIMARY'
+delete from t1 where a%2;
+check table t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+alter table t1 add primary key (a,b(255),c(255)), add key (b(767));
+select count(*) from t1 where a=44;
+count(*)
+5
+select a,
+length(b),b=left(repeat(d,100*a),65535),length(c),c=repeat(d,20*a),d from t1;
+a length(b) b=left(repeat(d,100*a),65535) length(c) c=repeat(d,20*a) d
+22 22000 1 4400 1 adfd72nh9k
+22 35200 1 7040 1 adfdijn0loKNHJik
+22 28600 1 5720 1 adfdijnmnb78k
+22 26400 1 5280 1 adfdpplkeock
+22 22000 1 4400 1 jejdkrun87
+44 44000 1 8800 1 adfd72nh9k
+44 65535 1 14080 1 adfdijn0loKNHJik
+44 57200 1 11440 1 adfdijnmnb78k
+44 52800 1 10560 1 adfdpplkeock
+44 44000 1 8800 1 jejdkrun87
+66 65535 1 13200 1 adfd72nh9k
+66 65535 1 21120 1 adfdijn0loKNHJik
+66 65535 1 17160 1 adfdijnmnb78k
+66 65535 1 15840 1 adfdpplkeock
+66 65535 1 13200 1 jejdkrun87
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL DEFAULT '0',
+  `b` blob NOT NULL,
+  `c` text NOT NULL,
+  `d` text NOT NULL,
+  PRIMARY KEY (`a`,`b`(255),`c`(255)),
+  KEY `b` (`b`(767))
+) ENGINE=InnoDB DEFAULT CHARSET=utf8
+check table t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+explain select * from t1 where b like 'adfd%';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL b NULL NULL NULL 15 Using where
+create table t2(a int, b varchar(255), primary key(a,b)) engine=innodb;
+insert into t2 select a,left(b,255) from t1;
+drop table t1;
+rename table t2 to t1;
+set innodb_lock_wait_timeout=1;
+begin;
+select a from t1 limit 1 for update;
+a
+22
+set innodb_lock_wait_timeout=1;
+create index t1ba on t1 (b,a);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction
+commit;
+begin;
+select a from t1 limit 1 lock in share mode;
+a
+22
+create index t1ba on t1 (b,a);
+drop index t1ba on t1;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction
+commit;
+explain select a from t1 order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL t1ba 261 NULL 15 Using index
+select a,sleep(2+a/100) from t1 order by b limit 3;
+select sleep(1);
+sleep(1)
+0
+drop index t1ba on t1;
+a sleep(2+a/100)
+22 0
+44 0
+66 0
+explain select a from t1 order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 261 NULL 15 Using index; Using filesort
+select a from t1 order by b limit 3;
+a
+22
+66
+44
+commit;
+drop table t1;
+set global innodb_file_per_table=on;
+set global innodb_file_format='Barracuda';
+create table t1(a blob,b blob,c blob,d blob,e blob,f blob,g blob,h blob,
+i blob,j blob,k blob,l blob,m blob,n blob,o blob,p blob,
+q blob,r blob,s blob,t blob,u blob)
+engine=innodb row_format=dynamic;
+create index t1a on t1 (a(1));
+create index t1b on t1 (b(1));
+create index t1c on t1 (c(1));
+create index t1d on t1 (d(1));
+create index t1e on t1 (e(1));
+create index t1f on t1 (f(1));
+create index t1g on t1 (g(1));
+create index t1h on t1 (h(1));
+create index t1i on t1 (i(1));
+create index t1j on t1 (j(1));
+create index t1k on t1 (k(1));
+create index t1l on t1 (l(1));
+create index t1m on t1 (m(1));
+create index t1n on t1 (n(1));
+create index t1o on t1 (o(1));
+create index t1p on t1 (p(1));
+create index t1q on t1 (q(1));
+create index t1r on t1 (r(1));
+create index t1s on t1 (s(1));
+create index t1t on t1 (t(1));
+create index t1u on t1 (u(1));
+ERROR HY000: Too big row
+create index t1ut on t1 (u(1), t(1));
+ERROR HY000: Too big row
+create index t1st on t1 (s(1), t(1));
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` blob,
+  `b` blob,
+  `c` blob,
+  `d` blob,
+  `e` blob,
+  `f` blob,
+  `g` blob,
+  `h` blob,
+  `i` blob,
+  `j` blob,
+  `k` blob,
+  `l` blob,
+  `m` blob,
+  `n` blob,
+  `o` blob,
+  `p` blob,
+  `q` blob,
+  `r` blob,
+  `s` blob,
+  `t` blob,
+  `u` blob,
+  KEY `t1a` (`a`(1)),
+  KEY `t1b` (`b`(1)),
+  KEY `t1c` (`c`(1)),
+  KEY `t1d` (`d`(1)),
+  KEY `t1e` (`e`(1)),
+  KEY `t1f` (`f`(1)),
+  KEY `t1g` (`g`(1)),
+  KEY `t1h` (`h`(1)),
+  KEY `t1i` (`i`(1)),
+  KEY `t1j` (`j`(1)),
+  KEY `t1k` (`k`(1)),
+  KEY `t1l` (`l`(1)),
+  KEY `t1m` (`m`(1)),
+  KEY `t1n` (`n`(1)),
+  KEY `t1o` (`o`(1)),
+  KEY `t1p` (`p`(1)),
+  KEY `t1q` (`q`(1)),
+  KEY `t1r` (`r`(1)),
+  KEY `t1s` (`s`(1)),
+  KEY `t1t` (`t`(1)),
+  KEY `t1st` (`s`(1),`t`(1))
+) ENGINE=InnoDB DEFAULT CHARSET=latin1 ROW_FORMAT=DYNAMIC
+create index t1u on t1 (u(1));
+ERROR HY000: Too big row
+alter table t1 row_format=compact;
+create index t1u on t1 (u(1));
+drop table t1;
+set global innodb_file_per_table=0;
+set global innodb_file_format=Antelope;
+SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
+SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
+CREATE TABLE t1(
+c1 BIGINT(12) NOT NULL,
+PRIMARY KEY (c1)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1;
+CREATE TABLE t2(
+c1 BIGINT(16) NOT NULL,
+c2 BIGINT(12) NOT NULL,
+c3 BIGINT(12) NOT NULL,
+PRIMARY KEY (c1)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1;
+ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
+FOREIGN KEY (c3) REFERENCES t1(c1);
+SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
+SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`),
+  KEY `fk_t2_ca` (`c3`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`) REFERENCES `t1` (`c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+CREATE INDEX i_t2_c3_c2 ON t2(c3, c2);
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`),
+  KEY `i_t2_c3_c2` (`c3`,`c2`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`) REFERENCES `t1` (`c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
+SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
+INSERT INTO t2 VALUES(0,0,0);
+ERROR 23000: Cannot add or update a child row: a foreign key constraint fails (`test`.`t2`, CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`) REFERENCES `t1` (`c1`))
+INSERT INTO t1 VALUES(0);
+INSERT INTO t2 VALUES(0,0,0);
+DROP TABLE t2;
+CREATE TABLE t2(
+c1 BIGINT(16) NOT NULL,
+c2 BIGINT(12) NOT NULL,
+c3 BIGINT(12) NOT NULL,
+PRIMARY KEY (c1,c2,c3)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1;
+ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
+FOREIGN KEY (c3) REFERENCES t1(c1);
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`,`c2`,`c3`),
+  KEY `fk_t2_ca` (`c3`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`) REFERENCES `t1` (`c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+CREATE INDEX i_t2_c3_c2 ON t2(c3, c2);
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`,`c2`,`c3`),
+  KEY `i_t2_c3_c2` (`c3`,`c2`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`) REFERENCES `t1` (`c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+INSERT INTO t2 VALUES(0,0,1);
+ERROR 23000: Cannot add or update a child row: a foreign key constraint fails (`test`.`t2`, CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`) REFERENCES `t1` (`c1`))
+INSERT INTO t2 VALUES(0,0,0);
+DELETE FROM t1;
+ERROR 23000: Cannot delete or update a parent row: a foreign key constraint fails (`test`.`t2`, CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`) REFERENCES `t1` (`c1`))
+DELETE FROM t2;
+DROP TABLE t2;
+DROP TABLE t1;
+CREATE TABLE t1(
+c1 BIGINT(12) NOT NULL,
+c2 INT(4) NOT NULL,
+PRIMARY KEY (c2,c1)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1;
+CREATE TABLE t2(
+c1 BIGINT(16) NOT NULL,
+c2 BIGINT(12) NOT NULL,
+c3 BIGINT(12) NOT NULL,
+PRIMARY KEY (c1)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1;
+ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
+FOREIGN KEY (c3,c2) REFERENCES t1(c1,c1);
+ERROR HY000: Can't create table '#sql-temporary' (errno: 150)
+ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
+FOREIGN KEY (c3,c2) REFERENCES t1(c1,c2);
+ERROR HY000: Can't create table '#sql-temporary' (errno: 150)
+ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
+FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1);
+ERROR HY000: Can't create table '#sql-temporary' (errno: 150)
+ALTER TABLE t1 MODIFY COLUMN c2 BIGINT(12) NOT NULL;
+ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
+FOREIGN KEY (c3,c2) REFERENCES t1(c1,c2);
+ERROR HY000: Can't create table '#sql-temporary' (errno: 150)
+ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca
+FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1);
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `c1` bigint(12) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  PRIMARY KEY (`c2`,`c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`),
+  KEY `fk_t2_ca` (`c3`,`c2`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`, `c2`) REFERENCES `t1` (`c2`, `c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+CREATE INDEX i_t2_c2_c1 ON t2(c2, c1);
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`),
+  KEY `fk_t2_ca` (`c3`,`c2`),
+  KEY `i_t2_c2_c1` (`c2`,`c1`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`, `c2`) REFERENCES `t1` (`c2`, `c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+CREATE INDEX i_t2_c3_c1_c2 ON t2(c3, c1, c2);
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`),
+  KEY `fk_t2_ca` (`c3`,`c2`),
+  KEY `i_t2_c2_c1` (`c2`,`c1`),
+  KEY `i_t2_c3_c1_c2` (`c3`,`c1`,`c2`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`, `c2`) REFERENCES `t1` (`c2`, `c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+CREATE INDEX i_t2_c3_c2 ON t2(c3, c2);
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+  `c1` bigint(16) NOT NULL,
+  `c2` bigint(12) NOT NULL,
+  `c3` bigint(12) NOT NULL,
+  PRIMARY KEY (`c1`),
+  KEY `i_t2_c2_c1` (`c2`,`c1`),
+  KEY `i_t2_c3_c1_c2` (`c3`,`c1`,`c2`),
+  KEY `i_t2_c3_c2` (`c3`,`c2`),
+  CONSTRAINT `fk_t2_ca` FOREIGN KEY (`c3`, `c2`) REFERENCES `t1` (`c2`, `c1`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+DROP TABLE t2;
+DROP TABLE t1;
+SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check;
diff --git a/mysql-test/r/innodb-index_ucs2.result b/mysql-test/r/innodb-index_ucs2.result
new file mode 100644
index 00000000000..c8a1e8c7da1
--- /dev/null
+++ b/mysql-test/r/innodb-index_ucs2.result
@@ -0,0 +1,116 @@
+create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb default charset=ucs2;
+insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe');
+commit;
+alter table t1 add unique index (b);
+ERROR 23000: Duplicate entry '2' for key 'b'
+insert into t1 values(8,9,'fff','fff');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=ucs2
+alter table t1 add index (b);
+insert into t1 values(10,10,'kkk','iii');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+select * from t1 force index(b) order by b;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 5 NULL 6
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  KEY `b` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=ucs2
+alter table t1 add unique index (c), add index (d);
+insert into t1 values(11,11,'aaa','mmm');
+select * from t1;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+11 11 aaa mmm
+select * from t1 force index(b) order by b;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+11 11 aaa mmm
+select * from t1 force index(c) order by c;
+a b c d
+11 11 aaa mmm
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+select * from t1 force index(d) order by d;
+a b c d
+1 1 ab ab
+2 2 ac ac
+3 2 ad ad
+4 4 afe afe
+8 9 fff fff
+10 10 kkk iii
+11 11 aaa mmm
+explain select * from t1 force index(b) order by b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL b 5 NULL 7
+explain select * from t1 force index(c) order by c;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL c 21 NULL 7
+explain select * from t1 force index(d) order by d;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL d 43 NULL 7
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+  `a` int(11) NOT NULL,
+  `b` int(11) DEFAULT NULL,
+  `c` char(10) DEFAULT NULL,
+  `d` varchar(20) DEFAULT NULL,
+  PRIMARY KEY (`a`),
+  UNIQUE KEY `c` (`c`),
+  KEY `b` (`b`),
+  KEY `d` (`d`)
+) ENGINE=InnoDB DEFAULT CHARSET=ucs2
+check table t1;
+Table Op Msg_type Msg_text
+test.t1 check status OK
+drop table t1;
diff --git a/mysql-test/r/innodb-timeout.result b/mysql-test/r/innodb-timeout.result
new file mode 100644
index 00000000000..be9a688cd72
--- /dev/null
+++ b/mysql-test/r/innodb-timeout.result
@@ -0,0 +1,38 @@
+set global innodb_lock_wait_timeout=42;
+select @@innodb_lock_wait_timeout;
+@@innodb_lock_wait_timeout
+42
+set innodb_lock_wait_timeout=1;
+select @@innodb_lock_wait_timeout;
+@@innodb_lock_wait_timeout
+1
+select @@innodb_lock_wait_timeout;
+@@innodb_lock_wait_timeout
+42
+set global innodb_lock_wait_timeout=347;
+select @@innodb_lock_wait_timeout;
+@@innodb_lock_wait_timeout
+42
+set innodb_lock_wait_timeout=1;
+select @@innodb_lock_wait_timeout;
+@@innodb_lock_wait_timeout
+1
+select @@innodb_lock_wait_timeout;
+@@innodb_lock_wait_timeout
+347
+create table t1(a int primary key)engine=innodb;
+begin;
+insert into t1 values(1),(2),(3);
+select * from t1 for update;
+commit;
+a
+1
+2
+3
+begin;
+insert into t1 values(4);
+select * from t1 for update;
+commit;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction
+drop table t1;
+set global innodb_lock_wait_timeout=50;
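Editor's note: the innodb-timeout.result file above verifies that innodb_lock_wait_timeout is now settable per session and that a timed-out statement fails with "Lock wait timeout exceeded; try restarting transaction" instead of hanging. Below is a hedged sketch of how a C client might act on that, using the standard error code 1205 for that message; the retry policy is illustrative, not part of the patch.

```c
/* Sketch: set a short per-session lock wait timeout and retry on
   ER_LOCK_WAIT_TIMEOUT (1205, "Lock wait timeout exceeded; try restarting
   transaction").  The retry policy here is illustrative only. */
#include <stdio.h>
#include <mysql.h>

#define ER_LOCK_WAIT_TIMEOUT 1205

static int run_with_retry(MYSQL *con, const char *stmt, int attempts)
{
  while (attempts-- > 0)
  {
    if (mysql_query(con, "BEGIN"))
      return 1;
    if (mysql_query(con, stmt) == 0)
      return mysql_query(con, "COMMIT");
    if (mysql_errno(con) != ER_LOCK_WAIT_TIMEOUT)
      return 1;                      /* a real error: give up */
    mysql_query(con, "ROLLBACK");    /* timed out: restart the transaction */
  }
  return 1;
}

int usage_example(MYSQL *con)
{
  /* Per-session setting, as exercised by the test above. */
  if (mysql_query(con, "SET innodb_lock_wait_timeout=1"))
    return 1;
  return run_with_retry(con, "SELECT * FROM t1 FOR UPDATE", 3);
}
```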
+set global innodb_file_per_table=on; +create table t6(a int primary key) engine=innodb +key_block_size=1 row_format=redundant; +Warnings: +Warning 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope. +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=1. +set global innodb_file_format=`1`; +create table t7(a int primary key) engine=innodb +key_block_size=1 row_format=redundant; +Warnings: +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=1 unless ROW_FORMAT=COMPRESSED. +create table t8(a int primary key) engine=innodb +key_block_size=1 row_format=fixed; +Warnings: +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=1 unless ROW_FORMAT=COMPRESSED. +Warning 1478 InnoDB: assuming ROW_FORMAT=COMPACT. +create table t9(a int primary key) engine=innodb +key_block_size=1 row_format=compact; +Warnings: +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=1 unless ROW_FORMAT=COMPRESSED. +create table t10(a int primary key) engine=innodb +key_block_size=1 row_format=dynamic; +Warnings: +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=1 unless ROW_FORMAT=COMPRESSED. +create table t11(a int primary key) engine=innodb +key_block_size=1 row_format=compressed; +create table t12(a int primary key) engine=innodb +key_block_size=1; +create table t13(a int primary key) engine=innodb +row_format=compressed; +create table t14(a int primary key) engine=innodb key_block_size=9; +Warnings: +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=9. +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t0 Compact +test t00 Compact +test t1 Compact +test t10 Dynamic +test t11 Compressed +test t12 Compressed +test t13 Compressed +test t14 Compact +test t2 Redundant +test t3 Compact +test t4 Compact +test t5 Redundant +test t6 Redundant +test t7 Redundant +test t8 Compact +test t9 Compact +drop table t0,t00,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14; +alter table t1 key_block_size=0; +Warnings: +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=0. +alter table t1 row_format=dynamic; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t1 Dynamic +alter table t1 row_format=compact; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t1 Compact +alter table t1 row_format=redundant; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t1 Redundant +drop table t1; +create table t1(a int not null, b text, index(b(10))) engine=innodb +key_block_size=1; +create table t2(b text)engine=innodb; +insert into t2 values(concat('1abcdefghijklmnopqrstuvwxyz', repeat('A',5000))); +insert into t1 select 1, b from t2; +commit; +begin; +update t1 set b=repeat('B',100); +select a,left(b,40) from t1 natural join t2; +a left(b,40) +1 1abcdefghijklmnopqrstuvwxyzAAAAAAAAAAAAA +rollback; +select a,left(b,40) from t1 natural join t2; +a left(b,40) +1 1abcdefghijklmnopqrstuvwxyzAAAAAAAAAAAAA +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t1 Compressed +test t2 Compact +drop table t1,t2; +SET SESSION innodb_strict_mode = off; +CREATE TABLE t1( +c TEXT NOT NULL, d TEXT NOT NULL, +PRIMARY KEY (c(767),d(767))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1 CHARSET=ASCII; +ERROR 42000: Row size too large. 
The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs +CREATE TABLE t1( +c TEXT NOT NULL, d TEXT NOT NULL, +PRIMARY KEY (c(767),d(767))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=2 CHARSET=ASCII; +ERROR 42000: Row size too large. The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs +CREATE TABLE t1( +c TEXT NOT NULL, d TEXT NOT NULL, +PRIMARY KEY (c(767),d(767))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4 CHARSET=ASCII; +drop table t1; +CREATE TABLE t1(c TEXT, PRIMARY KEY (c(440))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1 CHARSET=ASCII; +ERROR 42000: Row size too large. The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs +CREATE TABLE t1(c TEXT, PRIMARY KEY (c(438))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1 CHARSET=ASCII; +INSERT INTO t1 VALUES(REPEAT('A',512)),(REPEAT('B',512)); +DROP TABLE t1; +create table t1( c1 int not null, c2 blob, c3 blob, c4 blob, +primary key(c1, c2(22), c3(22))) +engine = innodb row_format = dynamic; +begin; +insert into t1 values(1, repeat('A', 20000), repeat('B', 20000), +repeat('C', 20000)); +update t1 set c3 = repeat('D', 20000) where c1 = 1; +commit; +select count(*) from t1 where c2 = repeat('A', 20000); +count(*) +1 +select count(*) from t1 where c3 = repeat('D', 20000); +count(*) +1 +select count(*) from t1 where c4 = repeat('C', 20000); +count(*) +1 +update t1 set c3 = repeat('E', 20000) where c1 = 1; +drop table t1; +set global innodb_file_format=`0`; +select @@innodb_file_format; +@@innodb_file_format +Antelope +set global innodb_file_format=`1`; +select @@innodb_file_format; +@@innodb_file_format +Barracuda +set global innodb_file_format=`2`; +ERROR HY000: Incorrect arguments to SET +set global innodb_file_format=`-1`; +ERROR HY000: Incorrect arguments to SET +set global innodb_file_format=`Antelope`; +set global innodb_file_format=`Barracuda`; +set global innodb_file_format=`Cheetah`; +ERROR HY000: Incorrect arguments to SET +set global innodb_file_format=`abc`; +ERROR HY000: Incorrect arguments to SET +set global innodb_file_format=`1a`; +ERROR HY000: Incorrect arguments to SET +set global innodb_file_format=``; +ERROR HY000: Incorrect arguments to SET +set global innodb_file_per_table = on; +set global innodb_file_format = `1`; +set innodb_strict_mode = off; +create table t1 (id int primary key) engine = innodb key_block_size = 0; +Warnings: +Warning 1478 InnoDB: ignoring KEY_BLOCK_SIZE=0. +drop table t1; +set innodb_strict_mode = on; +create table t1 (id int primary key) engine = innodb key_block_size = 0; +ERROR HY000: Can't create table 'test.t1' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 0. Valid values are [1, 2, 4, 8, 16] +Error 1005 Can't create table 'test.t1' (errno: 1478) +create table t2 (id int primary key) engine = innodb key_block_size = 9; +ERROR HY000: Can't create table 'test.t2' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. 
Valid values are [1, 2, 4, 8, 16] +Error 1005 Can't create table 'test.t2' (errno: 1478) +create table t3 (id int primary key) engine = innodb key_block_size = 1; +create table t4 (id int primary key) engine = innodb key_block_size = 2; +create table t5 (id int primary key) engine = innodb key_block_size = 4; +create table t6 (id int primary key) engine = innodb key_block_size = 8; +create table t7 (id int primary key) engine = innodb key_block_size = 16; +create table t8 (id int primary key) engine = innodb row_format = compressed; +create table t9 (id int primary key) engine = innodb row_format = dynamic; +create table t10(id int primary key) engine = innodb row_format = compact; +create table t11(id int primary key) engine = innodb row_format = redundant; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t10 Compact +test t11 Redundant +test t3 Compressed +test t4 Compressed +test t5 Compressed +test t6 Compressed +test t7 Compressed +test t8 Compressed +test t9 Dynamic +drop table t3, t4, t5, t6, t7, t8, t9, t10, t11; +create table t1 (id int primary key) engine = innodb +key_block_size = 8 row_format = compressed; +create table t2 (id int primary key) engine = innodb +key_block_size = 8 row_format = redundant; +ERROR HY000: Can't create table 'test.t2' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: cannot specify ROW_FORMAT = REDUNDANT with KEY_BLOCK_SIZE. +Error 1005 Can't create table 'test.t2' (errno: 1478) +create table t3 (id int primary key) engine = innodb +key_block_size = 8 row_format = compact; +ERROR HY000: Can't create table 'test.t3' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE. +Error 1005 Can't create table 'test.t3' (errno: 1478) +create table t4 (id int primary key) engine = innodb +key_block_size = 8 row_format = dynamic; +ERROR HY000: Can't create table 'test.t4' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: cannot specify ROW_FORMAT = DYNAMIC with KEY_BLOCK_SIZE. +Error 1005 Can't create table 'test.t4' (errno: 1478) +create table t5 (id int primary key) engine = innodb +key_block_size = 8 row_format = default; +ERROR HY000: Can't create table 'test.t5' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE. +Error 1005 Can't create table 'test.t5' (errno: 1478) +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t1 Compressed +drop table t1; +create table t1 (id int primary key) engine = innodb +key_block_size = 9 row_format = redundant; +ERROR HY000: Can't create table 'test.t1' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16] +Error 1478 InnoDB: cannot specify ROW_FORMAT = REDUNDANT with KEY_BLOCK_SIZE. +Error 1005 Can't create table 'test.t1' (errno: 1478) +create table t2 (id int primary key) engine = innodb +key_block_size = 9 row_format = compact; +ERROR HY000: Can't create table 'test.t2' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16] +Error 1478 InnoDB: cannot specify ROW_FORMAT = COMPACT with KEY_BLOCK_SIZE. 
+Error 1005 Can't create table 'test.t2' (errno: 1478) +create table t2 (id int primary key) engine = innodb +key_block_size = 9 row_format = dynamic; +ERROR HY000: Can't create table 'test.t2' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: invalid KEY_BLOCK_SIZE = 9. Valid values are [1, 2, 4, 8, 16] +Error 1478 InnoDB: cannot specify ROW_FORMAT = DYNAMIC with KEY_BLOCK_SIZE. +Error 1005 Can't create table 'test.t2' (errno: 1478) +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +set global innodb_file_per_table = off; +create table t1 (id int primary key) engine = innodb key_block_size = 1; +ERROR HY000: Can't create table 'test.t1' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table. +Error 1005 Can't create table 'test.t1' (errno: 1478) +create table t2 (id int primary key) engine = innodb key_block_size = 2; +ERROR HY000: Can't create table 'test.t2' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table. +Error 1005 Can't create table 'test.t2' (errno: 1478) +create table t3 (id int primary key) engine = innodb key_block_size = 4; +ERROR HY000: Can't create table 'test.t3' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table. +Error 1005 Can't create table 'test.t3' (errno: 1478) +create table t4 (id int primary key) engine = innodb key_block_size = 8; +ERROR HY000: Can't create table 'test.t4' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table. +Error 1005 Can't create table 'test.t4' (errno: 1478) +create table t5 (id int primary key) engine = innodb key_block_size = 16; +ERROR HY000: Can't create table 'test.t5' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_per_table. +Error 1005 Can't create table 'test.t5' (errno: 1478) +create table t6 (id int primary key) engine = innodb row_format = compressed; +ERROR HY000: Can't create table 'test.t6' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: ROW_FORMAT=COMPRESSED requires innodb_file_per_table. +Error 1005 Can't create table 'test.t6' (errno: 1478) +create table t7 (id int primary key) engine = innodb row_format = dynamic; +ERROR HY000: Can't create table 'test.t7' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: ROW_FORMAT=DYNAMIC requires innodb_file_per_table. +Error 1005 Can't create table 'test.t7' (errno: 1478) +create table t8 (id int primary key) engine = innodb row_format = compact; +create table t9 (id int primary key) engine = innodb row_format = redundant; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t8 Compact +test t9 Redundant +drop table t8, t9; +set global innodb_file_per_table = on; +set global innodb_file_format = `0`; +create table t1 (id int primary key) engine = innodb key_block_size = 1; +ERROR HY000: Can't create table 'test.t1' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope. 
+Error 1005 Can't create table 'test.t1' (errno: 1478) +create table t2 (id int primary key) engine = innodb key_block_size = 2; +ERROR HY000: Can't create table 'test.t2' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope. +Error 1005 Can't create table 'test.t2' (errno: 1478) +create table t3 (id int primary key) engine = innodb key_block_size = 4; +ERROR HY000: Can't create table 'test.t3' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope. +Error 1005 Can't create table 'test.t3' (errno: 1478) +create table t4 (id int primary key) engine = innodb key_block_size = 8; +ERROR HY000: Can't create table 'test.t4' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope. +Error 1005 Can't create table 'test.t4' (errno: 1478) +create table t5 (id int primary key) engine = innodb key_block_size = 16; +ERROR HY000: Can't create table 'test.t5' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: KEY_BLOCK_SIZE requires innodb_file_format > Antelope. +Error 1005 Can't create table 'test.t5' (errno: 1478) +create table t6 (id int primary key) engine = innodb row_format = compressed; +ERROR HY000: Can't create table 'test.t6' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: ROW_FORMAT=COMPRESSED requires innodb_file_format > Antelope. +Error 1005 Can't create table 'test.t6' (errno: 1478) +create table t7 (id int primary key) engine = innodb row_format = dynamic; +ERROR HY000: Can't create table 'test.t7' (errno: 1478) +show errors; +Level Code Message +Error 1478 InnoDB: ROW_FORMAT=DYNAMIC requires innodb_file_format > Antelope. 
+Error 1005 Can't create table 'test.t7' (errno: 1478) +create table t8 (id int primary key) engine = innodb row_format = compact; +create table t9 (id int primary key) engine = innodb row_format = redundant; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +table_schema table_name row_format +test t8 Compact +test t9 Redundant +drop table t8, t9; +set global innodb_file_per_table=0; +set global innodb_file_format=Antelope; +set global innodb_file_per_table=on; +set global innodb_file_format=`Barracuda`; +set global innodb_file_format_check=`Antelope`; +create table normal_table ( +c1 int +) engine = innodb; +select @@innodb_file_format_check; +@@innodb_file_format_check +Antelope +create table zip_table ( +c1 int +) engine = innodb key_block_size = 8; +select @@innodb_file_format_check; +@@innodb_file_format_check +Barracuda +set global innodb_file_format_check=`Antelope`; +select @@innodb_file_format_check; +@@innodb_file_format_check +Antelope +show table status; +select @@innodb_file_format_check; +@@innodb_file_format_check +Barracuda +drop table normal_table, zip_table; diff --git a/mysql-test/r/innodb.result b/mysql-test/r/innodb.result index 1f46ade27e0..c6379d60ba4 100644 --- a/mysql-test/r/innodb.result +++ b/mysql-test/r/innodb.result @@ -1508,7 +1508,7 @@ t2 CREATE TABLE `t2` ( ) ENGINE=InnoDB DEFAULT CHARSET=latin1 drop index id2 on t2; drop index id on t2; -Got one of the listed errors +ERROR HY000: Cannot drop index 'id': needed in a foreign key constraint show create table t2; Table Create Table t2 CREATE TABLE `t2` ( @@ -1738,7 +1738,7 @@ count(*) drop table t1; show status like "Innodb_buffer_pool_pages_total"; Variable_name Value -Innodb_buffer_pool_pages_total 512 +Innodb_buffer_pool_pages_total 511 show status like "Innodb_page_size"; Variable_name Value Innodb_page_size 16384 @@ -1784,7 +1784,7 @@ innodb_sync_spin_loops 20 SET @old_innodb_thread_concurrency= @@global.innodb_thread_concurrency; show variables like "innodb_thread_concurrency"; Variable_name Value -innodb_thread_concurrency 8 +innodb_thread_concurrency 0 set global innodb_thread_concurrency=1001; Warnings: Warning 1292 Truncated incorrect thread_concurrency value: '1001' @@ -1970,7 +1970,7 @@ explain select count(*) from t1 where v between 'a' and 'a ' and v between 'a ' id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 ref v v 13 const # Using where; Using index alter table t1 add unique(v); -ERROR 23000: Duplicate entry '{ ' for key 'v_2' +ERROR 23000: Duplicate entry 'v' for key 'v_2' alter table t1 add key(v); select concat('*',v,'*',c,'*',t,'*') as qq from t1 where v='a'; qq @@ -2377,6 +2377,8 @@ t1 CREATE TABLE `t1` ( ) ENGINE=InnoDB DEFAULT CHARSET=latin1 drop table t1; create table t1 (v varchar(10), c char(10)) row_format=fixed; +Warnings: +Warning 1478 InnoDB: assuming ROW_FORMAT=COMPACT. 
show create table t1; Table Create Table t1 CREATE TABLE `t1` ( @@ -3190,6 +3192,7 @@ t1 CREATE TABLE `t1` ( CONSTRAINT `t1_t2` FOREIGN KEY (`id`) REFERENCES `t2` (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=349 DEFAULT CHARSET=latin1 DROP TABLE t1,t2; +set innodb_strict_mode=on; CREATE TABLE t1 ( c01 CHAR(255), c02 CHAR(255), c03 CHAR(255), c04 CHAR(255), c05 CHAR(255), c06 CHAR(255), c07 CHAR(255), c08 CHAR(255), diff --git a/mysql-test/r/innodb_bug34300.result b/mysql-test/r/innodb_bug34300.result index b2732e3047d..ae9fee81ad7 100644 --- a/mysql-test/r/innodb_bug34300.result +++ b/mysql-test/r/innodb_bug34300.result @@ -1,8 +1,4 @@ -SELECT f4, f8 FROM bug34300; f4 f8 xxx zzz -ALTER TABLE bug34300 ADD COLUMN (f10 INT); -SELECT f4, f8 FROM bug34300; f4 f8 xxx zzz -DROP TABLE bug34300; diff --git a/mysql-test/r/innodb_bug36169.result b/mysql-test/r/innodb_bug36169.result new file mode 100644 index 00000000000..0c0cea08955 --- /dev/null +++ b/mysql-test/r/innodb_bug36169.result @@ -0,0 +1,5 @@ +SET @save_innodb_file_format=@@global.innodb_file_format; +SET @save_innodb_file_format_check=@@global.innodb_file_format_check; +SET @save_innodb_file_per_table=@@global.innodb_file_per_table; +SET GLOBAL innodb_file_format='Barracuda'; +SET GLOBAL innodb_file_per_table=ON; diff --git a/mysql-test/r/innodb_bug36172.result b/mysql-test/r/innodb_bug36172.result new file mode 100644 index 00000000000..195775f74c8 --- /dev/null +++ b/mysql-test/r/innodb_bug36172.result @@ -0,0 +1 @@ +SET storage_engine=InnoDB; diff --git a/mysql-test/r/innodb_bug40360.result b/mysql-test/r/innodb_bug40360.result new file mode 100644 index 00000000000..ef4cf463903 --- /dev/null +++ b/mysql-test/r/innodb_bug40360.result @@ -0,0 +1,4 @@ +SET TX_ISOLATION='READ-COMMITTED'; +CREATE TABLE bug40360 (a INT) engine=innodb; +INSERT INTO bug40360 VALUES (1); +DROP TABLE bug40360; diff --git a/mysql-test/r/innodb_bug41904.result b/mysql-test/r/innodb_bug41904.result new file mode 100644 index 00000000000..6070d32d181 --- /dev/null +++ b/mysql-test/r/innodb_bug41904.result @@ -0,0 +1,4 @@ +CREATE TABLE bug41904 (id INT PRIMARY KEY, uniquecol CHAR(15)) ENGINE=InnoDB; +INSERT INTO bug41904 VALUES (1,NULL), (2,NULL); +CREATE UNIQUE INDEX ui ON bug41904 (uniquecol); +DROP TABLE bug41904; diff --git a/mysql-test/r/innodb_information_schema.result b/mysql-test/r/innodb_information_schema.result new file mode 100644 index 00000000000..396cae579ce --- /dev/null +++ b/mysql-test/r/innodb_information_schema.result @@ -0,0 +1,23 @@ +lock_mode lock_type lock_table lock_index lock_rec lock_data +X RECORD `test`.```t'\"_str` `PRIMARY` 2 '1', 'abc', '''abc', 'abc''', 'a''bc', 'a''bc''', '''abc''''' +X RECORD `test`.```t'\"_str` `PRIMARY` 2 '1', 'abc', '''abc', 'abc''', 'a''bc', 'a''bc''', '''abc''''' +X RECORD `test`.```t'\"_str` `PRIMARY` 3 '2', 'abc', '"abc', 'abc"', 'a"bc', 'a"bc"', '"abc""' +X RECORD `test`.```t'\"_str` `PRIMARY` 3 '2', 'abc', '"abc', 'abc"', 'a"bc', 'a"bc"', '"abc""' +X RECORD `test`.```t'\"_str` `PRIMARY` 4 '3', 'abc', '\\abc', 'abc\\', 'a\\bc', 'a\\bc\\', '\\abc\\\\' +X RECORD `test`.```t'\"_str` `PRIMARY` 4 '3', 'abc', '\\abc', 'abc\\', 'a\\bc', 'a\\bc\\', '\\abc\\\\' +X RECORD `test`.```t'\"_str` `PRIMARY` 5 '4', 'abc', '\0abc', 'abc\0', 'a\0bc', 'a\0bc\0', 'a\0bc\0\0' +X RECORD `test`.```t'\"_str` `PRIMARY` 5 '4', 'abc', '\0abc', 'abc\0', 'a\0bc', 'a\0bc\0', 'a\0bc\0\0' +X RECORD `test`.`t_min` `PRIMARY` 2 -128, 0, -32768, 0, -8388608, 0, -2147483648, 0, -9223372036854775808, 0 +X RECORD `test`.`t_min` `PRIMARY` 2 -128, 0, 
-32768, 0, -8388608, 0, -2147483648, 0, -9223372036854775808, 0 +X RECORD `test`.`t_max` `PRIMARY` 2 127, 255, 32767, 65535, 8388607, 16777215, 2147483647, 4294967295, 9223372036854775807, 18446744073709551615 +X RECORD `test`.`t_max` `PRIMARY` 2 127, 255, 32767, 65535, 8388607, 16777215, 2147483647, 4294967295, 9223372036854775807, 18446744073709551615 +X RECORD `test`.```t'\"_str` `PRIMARY` 1 supremum pseudo-record +X RECORD `test`.```t'\"_str` `PRIMARY` 1 supremum pseudo-record +lock_table COUNT(*) +`test`.`t_max` 2 +`test`.`t_min` 2 +`test`.```t'\"_str` 10 +lock_table COUNT(*) +"test"."t_max" 2 +"test"."t_min" 2 +"test"."`t'\""_str" 10 diff --git a/mysql-test/r/innodb_xtradb_bug317074.result b/mysql-test/r/innodb_xtradb_bug317074.result new file mode 100644 index 00000000000..0c0cea08955 --- /dev/null +++ b/mysql-test/r/innodb_xtradb_bug317074.result @@ -0,0 +1,5 @@ +SET @save_innodb_file_format=@@global.innodb_file_format; +SET @save_innodb_file_format_check=@@global.innodb_file_format_check; +SET @save_innodb_file_per_table=@@global.innodb_file_per_table; +SET GLOBAL innodb_file_format='Barracuda'; +SET GLOBAL innodb_file_per_table=ON; diff --git a/mysql-test/r/mysqlbinlog_row_big.result b/mysql-test/r/mysqlbinlog_row_big.result index f2ca72f9936..05488be6b08 100644 --- a/mysql-test/r/mysqlbinlog_row_big.result +++ b/mysql-test/r/mysqlbinlog_row_big.result @@ -1,4 +1,9 @@ # +# We need big packets. +# +SET @old_global_max_allowed_packet=@@global.max_allowed_packet; +SET @@global.max_allowed_packet= 1024*1024*1024; +# # Preparatory cleanup. # DROP TABLE IF EXISTS t1; @@ -7,10 +12,6 @@ DROP TABLE IF EXISTS t1; # SET timestamp=1000000000; # -# We need big packets. -# -SET @@global.max_allowed_packet= 1024*1024*1024; -# # Delete all existing binary logs. # RESET MASTER; @@ -71,4 +72,5 @@ FLUSH LOGS; # Cleanup. # DROP TABLE t1; +SET @@global.max_allowed_packet=@old_global_max_allowed_packet; remove_file $MYSQLTEST_VARDIR/tmp/mysqlbinlog_big_1.out diff --git a/mysql-test/r/row-checksum-old.result b/mysql-test/r/row-checksum-old.result index 3cf5a7104b9..ef523463860 100644 --- a/mysql-test/r/row-checksum-old.result +++ b/mysql-test/r/row-checksum-old.result @@ -72,6 +72,8 @@ Table Checksum test.t1 4108368782 drop table if exists t1; create table t1 (a int null, v varchar(100)) engine=innodb checksum=0 row_format=fixed; +Warnings: +Warning 1478 InnoDB: assuming ROW_FORMAT=COMPACT. insert into t1 values(null, null), (1, "hello"); checksum table t1; Table Checksum diff --git a/mysql-test/r/row-checksum.result b/mysql-test/r/row-checksum.result index 31ae094859b..fb8a1260a1d 100644 --- a/mysql-test/r/row-checksum.result +++ b/mysql-test/r/row-checksum.result @@ -72,6 +72,8 @@ Table Checksum test.t1 3885665021 drop table if exists t1; create table t1 (a int null, v varchar(100)) engine=innodb checksum=0 row_format=fixed; +Warnings: +Warning 1478 InnoDB: assuming ROW_FORMAT=COMPACT. 
insert into t1 values(null, null), (1, "hello"); checksum table t1; Table Checksum diff --git a/mysql-test/r/variables-big.result b/mysql-test/r/variables-big.result index 35882c7e284..2c178999a2c 100644 --- a/mysql-test/r/variables-big.result +++ b/mysql-test/r/variables-big.result @@ -1,20 +1,20 @@ set session transaction_prealloc_size=1024*1024*1024*1; -show processlist; -Id User Host db Command Time State Info -# root localhost test Query 0 NULL show processlist +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; +TRUE +1 set session transaction_prealloc_size=1024*1024*1024*2; -show processlist; -Id User Host db Command Time State Info -# root localhost test Query 0 NULL show processlist +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; +TRUE +1 set session transaction_prealloc_size=1024*1024*1024*3; -show processlist; -Id User Host db Command Time State Info -# root localhost test Query 0 NULL show processlist +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; +TRUE +1 set session transaction_prealloc_size=1024*1024*1024*4; -show processlist; -Id User Host db Command Time State Info -# root localhost test Query 0 NULL show processlist +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; +TRUE +1 set session transaction_prealloc_size=1024*1024*1024*5; -show processlist; -Id User Host db Command Time State Info -# root localhost test Query 0 NULL show processlist +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; +TRUE +1 diff --git a/mysql-test/t/information_schema.test b/mysql-test/t/information_schema.test index 5be5d9e3893..d2db589ad91 100644 --- a/mysql-test/t/information_schema.test +++ b/mysql-test/t/information_schema.test @@ -47,7 +47,7 @@ create view v1 (c) as table_name<>'ndb_binlog_index' AND table_name<>'ndb_apply_status' AND NOT (table_schema = 'INFORMATION_SCHEMA' AND table_name LIKE 'PBXT_%'); -select * from v1; +select * from v1 ORDER BY c COLLATE utf8_bin; select c,table_name from v1 inner join information_schema.TABLES v2 on (v1.c=v2.table_name) diff --git a/mysql-test/t/innodb-analyze.test b/mysql-test/t/innodb-analyze.test new file mode 100644 index 00000000000..88eb42991b0 --- /dev/null +++ b/mysql-test/t/innodb-analyze.test @@ -0,0 +1,65 @@ +# +# Test that mysqld does not crash when running ANALYZE TABLE with +# different values of the parameter innodb_stats_sample_pages. 
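+# innodb_stats_sample_pages controls how many index pages InnoDB samples +# when estimating index cardinality for ANALYZE TABLE; out-of-range values +# are expected to be clamped (0 is adjusted to 1, as checked below).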
+# + +-- source include/have_innodb.inc + +# we care only that the following SQL commands do not produce errors +# and do not crash the server +-- disable_query_log +-- disable_result_log +-- enable_warnings + +SET @save_innodb_stats_sample_pages=@@innodb_stats_sample_pages; +SET GLOBAL innodb_stats_sample_pages=0; + +# check that the value has been adjusted to 1 +-- enable_result_log +SHOW VARIABLES LIKE 'innodb_stats_sample_pages'; +-- disable_result_log + +CREATE TABLE innodb_analyze ( + a INT, + b INT, + KEY(a), + KEY(b,a) +) ENGINE=InnoDB; + +# test with empty table + +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=2; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=4; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=8; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=16; +ANALYZE TABLE innodb_analyze; + +INSERT INTO innodb_analyze VALUES +(1,1), (1,1), (1,2), (1,3), (1,4), (1,5), +(8,1), (8,8), (8,2), (7,1), (1,4), (3,5); + +SET GLOBAL innodb_stats_sample_pages=1; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=2; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=4; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=8; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=16; +ANALYZE TABLE innodb_analyze; + +SET GLOBAL innodb_stats_sample_pages=@save_innodb_stats_sample_pages; +DROP TABLE innodb_analyze; diff --git a/mysql-test/t/innodb-autoinc.test b/mysql-test/t/innodb-autoinc.test index af23a900ee0..87ad470949d 100644 --- a/mysql-test/t/innodb-autoinc.test +++ b/mysql-test/t/innodb-autoinc.test @@ -396,7 +396,7 @@ DROP TABLE t1; # SET @@SESSION.AUTO_INCREMENT_INCREMENT=1, @@SESSION.AUTO_INCREMENT_OFFSET=1; SET @@INSERT_ID=1; -SHOW VARIABLES LIKE "%auto_inc%"; +SHOW VARIABLES LIKE "auto_inc%"; CREATE TABLE t1 (c1 DOUBLE NOT NULL AUTO_INCREMENT, c2 INT, PRIMARY KEY (c1)) ENGINE=InnoDB; INSERT INTO t1 VALUES(NULL, 1); INSERT INTO t1 VALUES(NULL, 2); diff --git a/mysql-test/t/innodb-index.test b/mysql-test/t/innodb-index.test new file mode 100644 index 00000000000..f81d70f5dd6 --- /dev/null +++ b/mysql-test/t/innodb-index.test @@ -0,0 +1,516 @@ +-- source include/have_innodb.inc + +SET @save_innodb_file_format_check=@@global.innodb_file_format_check; + +create table t1(a int not null, b int, c char(10) not null, d varchar(20)) engine = innodb; +insert into t1 values (5,5,'oo','oo'),(4,4,'tr','tr'),(3,4,'ad','ad'),(2,3,'ak','ak'); +commit; +--error ER_DUP_KEYNAME +alter table t1 add index b (b), add index b (b); +--error ER_DUP_FIELDNAME +alter table t1 add index (b,b); +alter table t1 add index d2 (d); +show create table t1; +explain select * from t1 force index(d2) order by d; +select * from t1 force index (d2) order by d; +--error ER_DUP_ENTRY +alter table t1 add unique index (b); +show create table t1; +alter table t1 add index (b); +show create table t1; + +# Check how existing tables interfere with temporary tables. 
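+# Fast index creation builds the new indexes in an internal table named +# after the original with a `#1` (or, on retry, `#2`) suffix; if a user +# table already occupies that name, the ALTER below is expected to fail +# with error 156 (table exists) rather than crash.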
+CREATE TABLE `t1#1`(a INT PRIMARY KEY) ENGINE=InnoDB; + +call mtr.add_suppression(" table `test`\\.`t1#[12]` already exists in InnoDB internal"); + +--error 156 +alter table t1 add unique index (c), add index (d); +rename table `t1#1` to `t1#2`; +--error 156 +alter table t1 add unique index (c), add index (d); +drop table `t1#2`; + +alter table t1 add unique index (c), add index (d); +show create table t1; +explain select * from t1 force index(c) order by c; +alter table t1 add primary key (a), drop index c; +show create table t1; +--error ER_MULTIPLE_PRI_KEY +alter table t1 add primary key (c); +--error ER_DUP_ENTRY +alter table t1 drop primary key, add primary key (b); +create unique index c on t1 (c); +show create table t1; +explain select * from t1 force index(c) order by c; +select * from t1 force index(c) order by c; +alter table t1 drop index b, add index (b); +show create table t1; +insert into t1 values(6,1,'ggg','ggg'); +select * from t1; +select * from t1 force index(b) order by b; +select * from t1 force index(c) order by c; +select * from t1 force index(d) order by d; +explain select * from t1 force index(b) order by b; +explain select * from t1 force index(c) order by c; +explain select * from t1 force index(d) order by d; +show create table t1; +drop table t1; + +create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb; +insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,3,'ad','ad'),(4,4,'afe','afe'); +commit; +alter table t1 add index (c(2)); +show create table t1; +alter table t1 add unique index (d(10)); +show create table t1; +insert into t1 values(5,1,'ggg','ggg'); +select * from t1; +select * from t1 force index(c) order by c; +select * from t1 force index(d) order by d; +explain select * from t1 order by b; +explain select * from t1 force index(c) order by c; +explain select * from t1 force index(d) order by d; +show create table t1; +alter table t1 drop index d; +insert into t1 values(8,9,'fff','fff'); +select * from t1; +select * from t1 force index(c) order by c; +explain select * from t1 order by b; +explain select * from t1 force index(c) order by c; +explain select * from t1 order by d; +show create table t1; +drop table t1; + +create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb; +insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,2,'ad','ad'),(4,4,'afe','afe'); +commit; +alter table t1 add unique index (b,c); +insert into t1 values(8,9,'fff','fff'); +select * from t1; +select * from t1 force index(b) order by b; +explain select * from t1 force index(b) order by b; +show create table t1; +alter table t1 add index (b,c); +insert into t1 values(11,11,'kkk','kkk'); +select * from t1; +select * from t1 force index(b) order by b; +explain select * from t1 force index(b) order by b; +show create table t1; +alter table t1 add unique index (c,d); +insert into t1 values(13,13,'yyy','aaa'); +select * from t1; +select * from t1 force index(b) order by b; +select * from t1 force index(c) order by c; +explain select * from t1 force index(b) order by b; +explain select * from t1 force index(c) order by c; +show create table t1; +drop table t1; + +create table t1(a int not null, b int not null, c int, primary key (a), key (b)) engine = innodb; +create table t3(a int not null, c int not null, d int, primary key (a), key (c)) engine = innodb; +create table t4(a int not null, d int not null, e int, primary key (a), key (d)) engine = innodb; +create table t2(a int not null, b int 
not null, c int not null, d int not null, e int, +foreign key (b) references t1(b) on delete cascade, +foreign key (c) references t3(c), foreign key (d) references t4(d)) +engine = innodb; +--error ER_DROP_INDEX_FK +alter table t1 drop index b; +--error ER_DROP_INDEX_FK +alter table t3 drop index c; +--error ER_DROP_INDEX_FK +alter table t4 drop index d; +--error ER_DROP_INDEX_FK +alter table t2 drop index b; +--error ER_DROP_INDEX_FK +alter table t2 drop index b, drop index c, drop index d; +# Apparently, the following makes mysql_alter_table() drop index d. +create unique index dc on t2 (d,c); +create index dc on t1 (b,c); +# This should preserve the foreign key constraints. +alter table t2 add primary key (a); +insert into t1 values (1,1,1); +insert into t3 values (1,1,1); +insert into t4 values (1,1,1); +insert into t2 values (1,1,1,1,1); +commit; +alter table t4 add constraint dc foreign key (a) references t1(a); +show create table t4; +--replace_regex /'test\.#sql-[0-9a-f_]*'/'#sql-temporary'/ +# a foreign key 'test/dc' already exists +--error ER_CANT_CREATE_TABLE +alter table t3 add constraint dc foreign key (a) references t1(a); +show create table t3; +alter table t2 drop index b, add index (b); +show create table t2; +--error ER_ROW_IS_REFERENCED_2 +delete from t1; +--error ER_CANT_DROP_FIELD_OR_KEY +drop index dc on t4; +# there is no foreign key dc on t3 +--replace_regex /'\.\/test\/#sql2-[0-9a-f-]*'/'#sql2-temporary'/ +--error ER_ERROR_ON_RENAME +alter table t3 drop foreign key dc; +alter table t4 drop foreign key dc; +select * from t2; +delete from t1; +select * from t2; + +drop table t2,t4,t3,t1; + +-- let charset = utf8 +-- source include/innodb-index.inc + +create table t1(a int not null, b int) engine = innodb; +insert into t1 values (1,1),(1,1),(1,1),(1,1); +--error ER_DUP_ENTRY +alter table t1 add unique index (a); +--error ER_DUP_ENTRY +alter table t1 add unique index (b); +--error ER_DUP_ENTRY +alter table t1 add unique index (a), add unique index(b); +show create table t1; +drop table t1; + +create table t1(a int not null, c int not null,b int, primary key(a), unique key(c), key(b)) engine = innodb; +alter table t1 drop index c, drop index b; +show create table t1; +drop table t1; + +create table t1(a int not null, b int, primary key(a)) engine = innodb; +alter table t1 add index (b); +show create table t1; +drop table t1; + +create table t1(a int not null, b int, c char(10), d varchar(20), primary key (a)) engine = innodb; +insert into t1 values (1,1,'ab','ab'),(2,2,'ac','ac'),(3,3,'ac','ac'),(4,4,'afe','afe'),(5,4,'affe','affe'); +--error ER_DUP_ENTRY +alter table t1 add unique index (b), add unique index (c), add unique index (d); +--error ER_DUP_ENTRY +alter table t1 add unique index (c), add unique index (b), add index (d); +show create table t1; +drop table t1; + +create table t1(a int not null, b int not null, c int, primary key (a), key(c)) engine=innodb; +insert into t1 values (5,1,5),(4,2,4),(3,3,3),(2,4,2),(1,5,1); +alter table t1 add unique index (b); +insert into t1 values (10,20,20),(11,19,19),(12,18,18),(13,17,17); +show create table t1; +check table t1; +explain select * from t1 force index(c) order by c; +explain select * from t1 order by a; +explain select * from t1 force index(b) order by b; +select * from t1 order by a; +select * from t1 force index(b) order by b; +select * from t1 force index(c) order by c; +drop table t1; + +create table t1(a int not null, b int not null) engine=innodb; +insert into t1 values (1,1); +alter table t1 add primary 
key(b); +insert into t1 values (2,2); +show create table t1; +check table t1; +select * from t1; +explain select * from t1; +explain select * from t1 order by a; +explain select * from t1 order by b; +checksum table t1; +drop table t1; + +create table t1(a int not null) engine=innodb; +insert into t1 values (1); +alter table t1 add primary key(a); +insert into t1 values (2); +show create table t1; +check table t1; +commit; +select * from t1; +explain select * from t1; +explain select * from t1 order by a; +drop table t1; + +create table t2(d varchar(17) primary key) engine=innodb default charset=utf8; +create table t3(a int primary key) engine=innodb; + +insert into t3 values(22),(44),(33),(55),(66); + +insert into t2 values ('jejdkrun87'),('adfd72nh9k'), +('adfdpplkeock'),('adfdijnmnb78k'),('adfdijn0loKNHJik'); + +create table t1(a int, b blob, c text, d text not null) +engine=innodb default charset = utf8; + +# r2667 The following test is disabled because MySQL behavior changed. +# r2667 The test was added with this comment: +# r2667 +# r2667 ------------------------------------------------------------------------ +# r2667 r1699 | marko | 2007-08-10 19:53:19 +0300 (Fri, 10 Aug 2007) | 5 lines +# r2667 +# r2667 branches/zip: Add changes that accidentally omitted from r1698: +# r2667 +# r2667 innodb-index.test, innodb-index.result: Add a test for creating +# r2667 a PRIMARY KEY on a column that contains a NULL value. +# r2667 ------------------------------------------------------------------------ +# r2667 +# r2667 but in BZR-r2667: +# r2667 http://bazaar.launchpad.net/~mysql/mysql-server/mysql-5.1/revision/davi%40mysql.com-20080617141221-8yre8ys9j4uw3xx5?start_revid=joerg%40mysql.com-20080630105418-7qoe5ehomgrcdb89 +# r2667 MySQL changed the behavior to do a full table copy when creating a PRIMARY INDEX +# r2667 on a nullable column instead of calling ::add_index() which would fail (and +# r2667 this is what we were testing here). Before r2667 the code execution path was +# r2667 like this (when adding a PRIMARY INDEX on a nullable column with ALTER TABLE): +# r2667 +# r2667 mysql_alter_table() +# r2667 compare_tables() // would return ALTER_TABLE_INDEX_CHANGED +# r2667 ::add_index() // would fail with "primary index cannot contain NULL" +# r2667 +# r2667 after r2667 the code execution path is the following: +# r2667 +# r2667 mysql_alter_table() +# r2667 compare_tables() // returns ALTER_TABLE_DATA_CHANGED +# r2667 full copy is done, without calling ::add_index() +# r2667 +# r2667 To enable, remove "# r2667: " below.
+# r2667 +# r2667: insert into t1 values (null,null,null,'null'); +insert into t1 +select a,left(repeat(d,100*a),65535),repeat(d,20*a),d from t2,t3; +drop table t2, t3; +select count(*) from t1 where a=44; +select a, +length(b),b=left(repeat(d,100*a),65535),length(c),c=repeat(d,20*a),d from t1; +# r2667: --error ER_PRIMARY_CANT_HAVE_NULL +# r2667: alter table t1 add primary key (a), add key (b(20)); +# r2667: delete from t1 where d='null'; +--error ER_DUP_ENTRY +alter table t1 add primary key (a), add key (b(20)); +delete from t1 where a%2; +check table t1; +alter table t1 add primary key (a,b(255),c(255)), add key (b(767)); +select count(*) from t1 where a=44; +select a, +length(b),b=left(repeat(d,100*a),65535),length(c),c=repeat(d,20*a),d from t1; +show create table t1; +check table t1; +explain select * from t1 where b like 'adfd%'; + +# +# Test locking +# + +create table t2(a int, b varchar(255), primary key(a,b)) engine=innodb; +insert into t2 select a,left(b,255) from t1; +drop table t1; +rename table t2 to t1; + +connect (a,localhost,root,,); +connect (b,localhost,root,,); +connection a; +set innodb_lock_wait_timeout=1; +begin; +# Obtain an IX lock on the table +select a from t1 limit 1 for update; +connection b; +set innodb_lock_wait_timeout=1; +# This would require an S lock on the table, conflicting with the IX lock. +--error ER_LOCK_WAIT_TIMEOUT +create index t1ba on t1 (b,a); +connection a; +commit; +begin; +# Obtain an IS lock on the table +select a from t1 limit 1 lock in share mode; +connection b; +# This will require an S lock on the table. No conflict with the IS lock. +create index t1ba on t1 (b,a); +# This would require an X lock on the table, conflicting with the IS lock. +--error ER_LOCK_WAIT_TIMEOUT +drop index t1ba on t1; +connection a; +commit; +explain select a from t1 order by b; +--send +select a,sleep(2+a/100) from t1 order by b limit 3; + +# The following DROP INDEX will succeed, although the SELECT above has +# opened a read view. However, during the execution of the SELECT, +# MySQL should hold a table lock that should block the execution +# of the DROP INDEX below. + +connection b; +select sleep(1); +drop index t1ba on t1; + +# After the index was dropped, subsequent SELECTs will use the same +# read view, but they should not be accessing the dropped index any more. + +connection a; +reap; +explain select a from t1 order by b; +select a from t1 order by b limit 3; +commit; + +connection default; +disconnect a; +disconnect b; + +drop table t1; + +let $per_table=`select @@innodb_file_per_table`; +let $format=`select @@innodb_file_format`; +set global innodb_file_per_table=on; +set global innodb_file_format='Barracuda'; +# Test creating a table that could lead to undo log overflow. +# In the undo log, we write a 768-byte prefix (REC_MAX_INDEX_COL_LEN) +# of each externally stored column that appears as a column prefix in an index. +# For this test case, it would suffice to write 1 byte, though.
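+# (Rough arithmetic, assuming the default 16KiB page size: 20 indexed +# prefixes * 768 bytes is about 15KiB of prefix data in one undo record, +# which still fits; a 21st indexed column pushes the record past the page +# size, so creating index t1u below is expected to fail with error 139.)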
+create table t1(a blob,b blob,c blob,d blob,e blob,f blob,g blob,h blob, + i blob,j blob,k blob,l blob,m blob,n blob,o blob,p blob, + q blob,r blob,s blob,t blob,u blob) + engine=innodb row_format=dynamic; +create index t1a on t1 (a(1)); +create index t1b on t1 (b(1)); +create index t1c on t1 (c(1)); +create index t1d on t1 (d(1)); +create index t1e on t1 (e(1)); +create index t1f on t1 (f(1)); +create index t1g on t1 (g(1)); +create index t1h on t1 (h(1)); +create index t1i on t1 (i(1)); +create index t1j on t1 (j(1)); +create index t1k on t1 (k(1)); +create index t1l on t1 (l(1)); +create index t1m on t1 (m(1)); +create index t1n on t1 (n(1)); +create index t1o on t1 (o(1)); +create index t1p on t1 (p(1)); +create index t1q on t1 (q(1)); +create index t1r on t1 (r(1)); +create index t1s on t1 (s(1)); +create index t1t on t1 (t(1)); +--error 139 +create index t1u on t1 (u(1)); +--error 139 +create index t1ut on t1 (u(1), t(1)); +create index t1st on t1 (s(1), t(1)); +show create table t1; +--error 139 +create index t1u on t1 (u(1)); +alter table t1 row_format=compact; +create index t1u on t1 (u(1)); + +drop table t1; +eval set global innodb_file_per_table=$per_table; +eval set global innodb_file_format=$format; + +# +# Test to check whether CREATE INDEX handles implicit foreign key +# constraint modifications (Issue #70, Bug #38786) +# +SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0; +SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0; + +CREATE TABLE t1( + c1 BIGINT(12) NOT NULL, + PRIMARY KEY (c1) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +CREATE TABLE t2( + c1 BIGINT(16) NOT NULL, + c2 BIGINT(12) NOT NULL, + c3 BIGINT(12) NOT NULL, + PRIMARY KEY (c1) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca + FOREIGN KEY (c3) REFERENCES t1(c1); + +SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; +SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; + +SHOW CREATE TABLE t2; + +CREATE INDEX i_t2_c3_c2 ON t2(c3, c2); + +SHOW CREATE TABLE t2; + +SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; +SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; + +--error ER_NO_REFERENCED_ROW_2 +INSERT INTO t2 VALUES(0,0,0); +INSERT INTO t1 VALUES(0); +INSERT INTO t2 VALUES(0,0,0); + +DROP TABLE t2; + +CREATE TABLE t2( + c1 BIGINT(16) NOT NULL, + c2 BIGINT(12) NOT NULL, + c3 BIGINT(12) NOT NULL, + PRIMARY KEY (c1,c2,c3) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca + FOREIGN KEY (c3) REFERENCES t1(c1); + +SHOW CREATE TABLE t2; + +CREATE INDEX i_t2_c3_c2 ON t2(c3, c2); + +SHOW CREATE TABLE t2; +--error ER_NO_REFERENCED_ROW_2 +INSERT INTO t2 VALUES(0,0,1); +INSERT INTO t2 VALUES(0,0,0); +--error ER_ROW_IS_REFERENCED_2 +DELETE FROM t1; +DELETE FROM t2; + +DROP TABLE t2; +DROP TABLE t1; + +CREATE TABLE t1( + c1 BIGINT(12) NOT NULL, + c2 INT(4) NOT NULL, + PRIMARY KEY (c2,c1) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +CREATE TABLE t2( + c1 BIGINT(16) NOT NULL, + c2 BIGINT(12) NOT NULL, + c3 BIGINT(12) NOT NULL, + PRIMARY KEY (c1) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; + +--replace_regex /'test\.#sql-[0-9_a-f-]*'/'#sql-temporary'/ +--error ER_CANT_CREATE_TABLE +ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca + FOREIGN KEY (c3,c2) REFERENCES t1(c1,c1); +--replace_regex /'test\.#sql-[0-9_a-f-]*'/'#sql-temporary'/ +--error ER_CANT_CREATE_TABLE +ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca + FOREIGN KEY (c3,c2) REFERENCES t1(c1,c2); +--replace_regex /'test\.#sql-[0-9_a-f-]*'/'#sql-temporary'/ +--error ER_CANT_CREATE_TABLE +ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca + 
FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1); +ALTER TABLE t1 MODIFY COLUMN c2 BIGINT(12) NOT NULL; +--replace_regex /'test\.#sql-[0-9_a-f-]*'/'#sql-temporary'/ +--error ER_CANT_CREATE_TABLE +ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca + FOREIGN KEY (c3,c2) REFERENCES t1(c1,c2); + +ALTER TABLE t2 ADD CONSTRAINT fk_t2_ca + FOREIGN KEY (c3,c2) REFERENCES t1(c2,c1); +SHOW CREATE TABLE t1; +SHOW CREATE TABLE t2; +CREATE INDEX i_t2_c2_c1 ON t2(c2, c1); +SHOW CREATE TABLE t2; +CREATE INDEX i_t2_c3_c1_c2 ON t2(c3, c1, c2); +SHOW CREATE TABLE t2; +CREATE INDEX i_t2_c3_c2 ON t2(c3, c2); +SHOW CREATE TABLE t2; + +DROP TABLE t2; +DROP TABLE t1; +SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check; diff --git a/mysql-test/t/innodb-index_ucs2.test b/mysql-test/t/innodb-index_ucs2.test new file mode 100644 index 00000000000..fff9a4da1a8 --- /dev/null +++ b/mysql-test/t/innodb-index_ucs2.test @@ -0,0 +1,5 @@ +-- source include/have_innodb.inc +-- source include/have_ucs2.inc + +-- let charset = ucs2 +-- source include/innodb-index.inc diff --git a/mysql-test/t/innodb-timeout.test b/mysql-test/t/innodb-timeout.test new file mode 100644 index 00000000000..f23fe3cff2d --- /dev/null +++ b/mysql-test/t/innodb-timeout.test @@ -0,0 +1,64 @@ +-- source include/have_innodb.inc + +let $timeout=`select @@innodb_lock_wait_timeout`; +set global innodb_lock_wait_timeout=42; + +connect (a,localhost,root,,); +connect (b,localhost,root,,); + +connection a; +select @@innodb_lock_wait_timeout; +set innodb_lock_wait_timeout=1; +select @@innodb_lock_wait_timeout; + +connection b; +select @@innodb_lock_wait_timeout; +set global innodb_lock_wait_timeout=347; +select @@innodb_lock_wait_timeout; +set innodb_lock_wait_timeout=1; +select @@innodb_lock_wait_timeout; + +connect (c,localhost,root,,); +connection c; +select @@innodb_lock_wait_timeout; +connection default; +disconnect c; + +connection a; +create table t1(a int primary key)engine=innodb; +begin; +insert into t1 values(1),(2),(3); + +connection b; +--send +select * from t1 for update; + +connection a; +commit; + +connection b; +reap; + +connection a; +begin; +insert into t1 values(4); + +connection b; +--send +select * from t1 for update; + +connection a; +sleep 2; +commit; + +connection b; +--error ER_LOCK_WAIT_TIMEOUT +reap; +drop table t1; + +connection default; + +disconnect a; +disconnect b; + +eval set global innodb_lock_wait_timeout=$timeout; diff --git a/mysql-test/t/innodb-use-sys-malloc-master.opt b/mysql-test/t/innodb-use-sys-malloc-master.opt new file mode 100644 index 00000000000..889834add01 --- /dev/null +++ b/mysql-test/t/innodb-use-sys-malloc-master.opt @@ -0,0 +1,2 @@ +--innodb-use-sys-malloc=true +--innodb-use-sys-malloc=true diff --git a/mysql-test/t/innodb-use-sys-malloc.test b/mysql-test/t/innodb-use-sys-malloc.test new file mode 100644 index 00000000000..325dd19d086 --- /dev/null +++ b/mysql-test/t/innodb-use-sys-malloc.test @@ -0,0 +1,48 @@ +--source include/have_innodb.inc + +#display current value of innodb_use_sys_malloc +SELECT @@GLOBAL.innodb_use_sys_malloc; +--echo 1 Expected + +#try changing it. Should fail. +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +SET @@GLOBAL.innodb_use_sys_malloc=0; +--echo Expected error 'Read only variable' + +SELECT @@GLOBAL.innodb_use_sys_malloc; +--echo 1 Expected + + +#do some stuff to see if it works. 
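+# (innodb_use_sys_malloc=1 means InnoDB allocates memory through the +# system allocator rather than its own internal allocator; the simple +# DML below is only a smoke test that the server behaves normally in +# this mode.)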
+--disable_warnings +drop table if exists t1; +--enable_warnings + +create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1; +insert into t1 values (1),(2),(3),(4),(5),(6),(7); +select * from t1; +drop table t1; +--source include/have_innodb.inc + +#display current value of innodb_use_sys_malloc +SELECT @@GLOBAL.innodb_use_sys_malloc; +--echo 1 Expected + +#try changing it. Should fail. +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +SET @@GLOBAL.innodb_use_sys_malloc=0; +--echo Expected error 'Read only variable' + +SELECT @@GLOBAL.innodb_use_sys_malloc; +--echo 1 Expected + + +#do some stuff to see if it works. +--disable_warnings +drop table if exists t1; +--enable_warnings + +create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1; +insert into t1 values (1),(2),(3),(4),(5),(6),(7); +select * from t1; +drop table t1; diff --git a/mysql-test/t/innodb-zip.test b/mysql-test/t/innodb-zip.test new file mode 100644 index 00000000000..f3f00aea26d --- /dev/null +++ b/mysql-test/t/innodb-zip.test @@ -0,0 +1,347 @@ +-- source include/have_innodb.inc + +let $per_table=`select @@innodb_file_per_table`; +let $format=`select @@innodb_file_format`; +let $innodb_file_format_check_orig=`select @@innodb_file_format_check`; +set global innodb_file_per_table=off; +set global innodb_file_format=`0`; + +create table t0(a int primary key) engine=innodb row_format=compressed; +create table t00(a int primary key) engine=innodb +key_block_size=4 row_format=compressed; +create table t1(a int primary key) engine=innodb row_format=dynamic; +create table t2(a int primary key) engine=innodb row_format=redundant; +create table t3(a int primary key) engine=innodb row_format=compact; +create table t4(a int primary key) engine=innodb key_block_size=9; +create table t5(a int primary key) engine=innodb +key_block_size=1 row_format=redundant; + +set global innodb_file_per_table=on; +create table t6(a int primary key) engine=innodb +key_block_size=1 row_format=redundant; +set global innodb_file_format=`1`; +create table t7(a int primary key) engine=innodb +key_block_size=1 row_format=redundant; +create table t8(a int primary key) engine=innodb +key_block_size=1 row_format=fixed; +create table t9(a int primary key) engine=innodb +key_block_size=1 row_format=compact; +create table t10(a int primary key) engine=innodb +key_block_size=1 row_format=dynamic; +create table t11(a int primary key) engine=innodb +key_block_size=1 row_format=compressed; +create table t12(a int primary key) engine=innodb +key_block_size=1; +create table t13(a int primary key) engine=innodb +row_format=compressed; +create table t14(a int primary key) engine=innodb key_block_size=9; + +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; + +drop table t0,t00,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14; +alter table t1 key_block_size=0; +alter table t1 row_format=dynamic; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +alter table t1 row_format=compact; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +alter table t1 row_format=redundant; +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +drop table t1; + +create table t1(a int not null, b text, index(b(10))) engine=innodb +key_block_size=1; + +create table t2(b text)engine=innodb; +insert into t2 values(concat('1abcdefghijklmnopqrstuvwxyz', repeat('A',5000))); + +insert into t1 select 1, b 
from t2; +commit; + +connect (a,localhost,root,,); +connect (b,localhost,root,,); + +connection a; +begin; +update t1 set b=repeat('B',100); + +connection b; +select a,left(b,40) from t1 natural join t2; + +connection a; +rollback; + +connection b; +select a,left(b,40) from t1 natural join t2; + +connection default; +disconnect a; +disconnect b; + +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +drop table t1,t2; + +# The following should fail even in non-strict mode. +SET SESSION innodb_strict_mode = off; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE t1( + c TEXT NOT NULL, d TEXT NOT NULL, + PRIMARY KEY (c(767),d(767))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1 CHARSET=ASCII; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE t1( + c TEXT NOT NULL, d TEXT NOT NULL, + PRIMARY KEY (c(767),d(767))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=2 CHARSET=ASCII; +CREATE TABLE t1( + c TEXT NOT NULL, d TEXT NOT NULL, + PRIMARY KEY (c(767),d(767))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4 CHARSET=ASCII; +drop table t1; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE t1(c TEXT, PRIMARY KEY (c(440))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1 CHARSET=ASCII; +# The maximum key size for a compressed row actually depends on the +# version of libz used, because the maximum compressed size of a key +# must be accounted for, and that differs between libz versions: +# some allow a size of 439, some only 438. +CREATE TABLE t1(c TEXT, PRIMARY KEY (c(438))) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1 CHARSET=ASCII; +INSERT INTO t1 VALUES(REPEAT('A',512)),(REPEAT('B',512)); +DROP TABLE t1; + +# +# Test blob column inheritance (mantis issue#36) +# + +create table t1( c1 int not null, c2 blob, c3 blob, c4 blob, + primary key(c1, c2(22), c3(22))) + engine = innodb row_format = dynamic; +begin; +insert into t1 values(1, repeat('A', 20000), repeat('B', 20000), + repeat('C', 20000)); + +update t1 set c3 = repeat('D', 20000) where c1 = 1; +commit; + +# one blob column which is unchanged in the update and part of the PK +# one blob column which is changed and part of the PK +# one blob column which is not part of the PK and is unchanged +select count(*) from t1 where c2 = repeat('A', 20000); +select count(*) from t1 where c3 = repeat('D', 20000); +select count(*) from t1 where c4 = repeat('C', 20000); + +update t1 set c3 = repeat('E', 20000) where c1 = 1; +drop table t1; + +# +# Test innodb_file_format +# +set global innodb_file_format=`0`; +select @@innodb_file_format; +set global innodb_file_format=`1`; +select @@innodb_file_format; +-- error ER_WRONG_ARGUMENTS +set global innodb_file_format=`2`; +-- error ER_WRONG_ARGUMENTS +set global innodb_file_format=`-1`; +set global innodb_file_format=`Antelope`; +set global innodb_file_format=`Barracuda`; +-- error ER_WRONG_ARGUMENTS +set global innodb_file_format=`Cheetah`; +-- error ER_WRONG_ARGUMENTS +set global innodb_file_format=`abc`; +-- error ER_WRONG_ARGUMENTS +set global innodb_file_format=`1a`; +-- error ER_WRONG_ARGUMENTS +set global innodb_file_format=``; + +#test strict mode.
+# this does not work anymore, has been removed from mysqltest +# -- enable_errors +set global innodb_file_per_table = on; +set global innodb_file_format = `1`; + +set innodb_strict_mode = off; +create table t1 (id int primary key) engine = innodb key_block_size = 0; +drop table t1; + +#set strict_mode +set innodb_strict_mode = on; + +#Test different values of KEY_BLOCK_SIZE + +--error ER_CANT_CREATE_TABLE +create table t1 (id int primary key) engine = innodb key_block_size = 0; +show errors; + +--error ER_CANT_CREATE_TABLE +create table t2 (id int primary key) engine = innodb key_block_size = 9; +show errors; + + +create table t3 (id int primary key) engine = innodb key_block_size = 1; +create table t4 (id int primary key) engine = innodb key_block_size = 2; +create table t5 (id int primary key) engine = innodb key_block_size = 4; +create table t6 (id int primary key) engine = innodb key_block_size = 8; +create table t7 (id int primary key) engine = innodb key_block_size = 16; + +#check various ROW_FORMAT values. +create table t8 (id int primary key) engine = innodb row_format = compressed; +create table t9 (id int primary key) engine = innodb row_format = dynamic; +create table t10(id int primary key) engine = innodb row_format = compact; +create table t11(id int primary key) engine = innodb row_format = redundant; + +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +drop table t3, t4, t5, t6, t7, t8, t9, t10, t11; + +#test different values of ROW_FORMAT with KEY_BLOCK_SIZE +create table t1 (id int primary key) engine = innodb +key_block_size = 8 row_format = compressed; + +--error ER_CANT_CREATE_TABLE +create table t2 (id int primary key) engine = innodb +key_block_size = 8 row_format = redundant; +show errors; + +--error ER_CANT_CREATE_TABLE +create table t3 (id int primary key) engine = innodb +key_block_size = 8 row_format = compact; +show errors; + +--error ER_CANT_CREATE_TABLE +create table t4 (id int primary key) engine = innodb +key_block_size = 8 row_format = dynamic; +show errors; + +--error ER_CANT_CREATE_TABLE +create table t5 (id int primary key) engine = innodb +key_block_size = 8 row_format = default; +show errors; + +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +drop table t1; + +#test multiple errors +--error ER_CANT_CREATE_TABLE +create table t1 (id int primary key) engine = innodb +key_block_size = 9 row_format = redundant; +show errors; + +--error ER_CANT_CREATE_TABLE +create table t2 (id int primary key) engine = innodb +key_block_size = 9 row_format = compact; +show errors; + +--error ER_CANT_CREATE_TABLE +create table t2 (id int primary key) engine = innodb +key_block_size = 9 row_format = dynamic; +show errors; + +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; + +#test valid values with innodb_file_per_table unset +set global innodb_file_per_table = off; + +--error ER_CANT_CREATE_TABLE +create table t1 (id int primary key) engine = innodb key_block_size = 1; +show errors; +--error ER_CANT_CREATE_TABLE +create table t2 (id int primary key) engine = innodb key_block_size = 2; +show errors; +--error ER_CANT_CREATE_TABLE +create table t3 (id int primary key) engine = innodb key_block_size = 4; +show errors; +--error ER_CANT_CREATE_TABLE +create table t4 (id int primary key) engine = innodb key_block_size = 8; +show errors; +--error ER_CANT_CREATE_TABLE +create table t5 (id int primary key) engine = innodb 
key_block_size = 16; +show errors; +--error ER_CANT_CREATE_TABLE +create table t6 (id int primary key) engine = innodb row_format = compressed; +show errors; +--error ER_CANT_CREATE_TABLE +create table t7 (id int primary key) engine = innodb row_format = dynamic; +show errors; +create table t8 (id int primary key) engine = innodb row_format = compact; +create table t9 (id int primary key) engine = innodb row_format = redundant; + +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +drop table t8, t9; + +#test valid values with innodb_file_format unset +set global innodb_file_per_table = on; +set global innodb_file_format = `0`; + +--error ER_CANT_CREATE_TABLE +create table t1 (id int primary key) engine = innodb key_block_size = 1; +show errors; +--error ER_CANT_CREATE_TABLE +create table t2 (id int primary key) engine = innodb key_block_size = 2; +show errors; +--error ER_CANT_CREATE_TABLE +create table t3 (id int primary key) engine = innodb key_block_size = 4; +show errors; +--error ER_CANT_CREATE_TABLE +create table t4 (id int primary key) engine = innodb key_block_size = 8; +show errors; +--error ER_CANT_CREATE_TABLE +create table t5 (id int primary key) engine = innodb key_block_size = 16; +show errors; +--error ER_CANT_CREATE_TABLE +create table t6 (id int primary key) engine = innodb row_format = compressed; +show errors; +--error ER_CANT_CREATE_TABLE +create table t7 (id int primary key) engine = innodb row_format = dynamic; +show errors; +create table t8 (id int primary key) engine = innodb row_format = compact; +create table t9 (id int primary key) engine = innodb row_format = redundant; + +SELECT table_schema, table_name, row_format +FROM information_schema.tables WHERE engine='innodb'; +drop table t8, t9; + +eval set global innodb_file_per_table=$per_table; +eval set global innodb_file_format=$format; +# +# Testing of tablespace tagging +# +-- disable_info +set global innodb_file_per_table=on; +set global innodb_file_format=`Barracuda`; +set global innodb_file_format_check=`Antelope`; +create table normal_table ( + c1 int +) engine = innodb; +select @@innodb_file_format_check; +create table zip_table ( + c1 int +) engine = innodb key_block_size = 8; +select @@innodb_file_format_check; +set global innodb_file_format_check=`Antelope`; +select @@innodb_file_format_check; +-- disable_result_log +show table status; +-- enable_result_log +select @@innodb_file_format_check; +drop table normal_table, zip_table; +-- disable_result_log + +# +# restore environment to the state it was before this test execution +# + +-- disable_query_log +eval set global innodb_file_format=$format; +eval set global innodb_file_per_table=$per_table; +eval set global innodb_file_format_check=$innodb_file_format_check_orig; diff --git a/mysql-test/t/innodb.test b/mysql-test/t/innodb.test index b0353ed5268..b4ba75dd0d1 100644 --- a/mysql-test/t/innodb.test +++ b/mysql-test/t/innodb.test @@ -1126,7 +1126,7 @@ show create table t2; create index id2 on t2 (id); show create table t2; drop index id2 on t2; ---error 1025,1025 +--error ER_DROP_INDEX_FK drop index id on t2; show create table t2; drop table t2; @@ -1294,6 +1294,7 @@ drop table t1; # Test for testable InnoDB status variables. This test # uses previous ones(pages_created, rows_deleted, ...). 
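+# XtraDB and the stock InnoDB plugin can report a slightly different
+# buffer pool page total for the same pool size (an assumption here:
+# one build prints 512 where the other prints 511), so normalise the
+# value before printing it.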
+--replace_result 512 511 show status like "Innodb_buffer_pool_pages_total"; show status like "Innodb_page_size"; show status like "Innodb_rows_deleted"; @@ -2357,6 +2358,7 @@ DROP TABLE t1,t2; # # Bug #21101 (Prints wrong error message if max row size is too large) # +set innodb_strict_mode=on; --error 1118 CREATE TABLE t1 ( c01 CHAR(255), c02 CHAR(255), c03 CHAR(255), c04 CHAR(255), diff --git a/mysql-test/t/innodb_bug34300.test b/mysql-test/t/innodb_bug34300.test index c4dbce02e05..0068fcd7b60 100644 --- a/mysql-test/t/innodb_bug34300.test +++ b/mysql-test/t/innodb_bug34300.test @@ -9,6 +9,7 @@ -- disable_result_log # set packet size and reconnect +SET @save_max_allowed_packet=@@global.max_allowed_packet; SET @@global.max_allowed_packet=16777216; --connect (newconn, localhost, root,,) @@ -21,7 +22,6 @@ CREATE TABLE bug34300 ( INSERT INTO bug34300 VALUES ('xxx', repeat('a', 8459264), 'zzz'); --- enable_query_log -- enable_result_log SELECT f4, f8 FROM bug34300; @@ -31,11 +31,6 @@ ALTER TABLE bug34300 ADD COLUMN (f10 INT); SELECT f4, f8 FROM bug34300; DROP TABLE bug34300; - disconnect newconn; connection default; ---disable_result_log ---disable_query_log -SET @@global.max_allowed_packet=default; ---enable_result_log ---enable_query_log +SET @@global.max_allowed_packet=@save_max_allowed_packet; diff --git a/mysql-test/t/innodb_bug36169.test b/mysql-test/t/innodb_bug36169.test new file mode 100644 index 00000000000..6fdf4015417 --- /dev/null +++ b/mysql-test/t/innodb_bug36169.test @@ -0,0 +1,1162 @@ +# +# Bug#36169 create innodb compressed table with too large row size crashed +# http://bugs.mysql.com/36169 +# + +-- source include/have_innodb.inc + +SET @save_innodb_file_format=@@global.innodb_file_format; +SET @save_innodb_file_format_check=@@global.innodb_file_format_check; +SET @save_innodb_file_per_table=@@global.innodb_file_per_table; +SET GLOBAL innodb_file_format='Barracuda'; +SET GLOBAL innodb_file_per_table=ON; + +# +# The following is copied from http://bugs.mysql.com/36169 +# (http://bugs.mysql.com/file.php?id=9121) +# Probably it can be simplified but that is not obvious. 
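+#
+# A minimal sketch of the failure class (illustrative only, not taken
+# from the bug report): with KEY_BLOCK_SIZE=1 even a single wide index
+# prefix exceeds the maximum compressed row size, and the CREATE must
+# fail cleanly with ER_TOO_BIG_ROWSIZE instead of crashing, e.g.:
+#   CREATE TABLE t(c TEXT, PRIMARY KEY (c(767)))
+#   ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1;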
+# + +# we care only that the following SQL commands do produce errors +# as expected and do not crash the server +-- disable_query_log +-- disable_result_log + +# Generating 10 tables +# Creating a table with 94 columns and 24 indexes +DROP TABLE IF EXISTS `table0`; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE IF NOT EXISTS `table0` +(`col0` BOOL, +`col1` BOOL, +`col2` TINYINT, +`col3` DATE, +`col4` TIME, +`col5` SET ('test1','test2','test3'), +`col6` TIME, +`col7` TEXT, +`col8` DECIMAL, +`col9` SET ('test1','test2','test3'), +`col10` FLOAT, +`col11` DOUBLE PRECISION, +`col12` ENUM ('test1','test2','test3'), +`col13` TINYBLOB, +`col14` YEAR, +`col15` SET ('test1','test2','test3'), +`col16` NUMERIC, +`col17` NUMERIC, +`col18` BLOB, +`col19` DATETIME, +`col20` DOUBLE PRECISION, +`col21` DECIMAL, +`col22` DATETIME, +`col23` NUMERIC, +`col24` NUMERIC, +`col25` LONGTEXT, +`col26` TINYBLOB, +`col27` TIME, +`col28` TINYBLOB, +`col29` ENUM ('test1','test2','test3'), +`col30` SMALLINT, +`col31` REAL, +`col32` FLOAT, +`col33` CHAR (175), +`col34` TINYTEXT, +`col35` TINYTEXT, +`col36` TINYBLOB, +`col37` TINYBLOB, +`col38` TINYTEXT, +`col39` MEDIUMBLOB, +`col40` TIMESTAMP, +`col41` DOUBLE, +`col42` SMALLINT, +`col43` LONGBLOB, +`col44` VARCHAR (80), +`col45` MEDIUMTEXT, +`col46` NUMERIC, +`col47` BIGINT, +`col48` DATE, +`col49` TINYBLOB, +`col50` DATE, +`col51` BOOL, +`col52` MEDIUMINT, +`col53` FLOAT, +`col54` TINYBLOB, +`col55` LONGTEXT, +`col56` SMALLINT, +`col57` ENUM ('test1','test2','test3'), +`col58` DATETIME, +`col59` MEDIUMTEXT, +`col60` VARCHAR (232), +`col61` NUMERIC, +`col62` YEAR, +`col63` SMALLINT, +`col64` TIMESTAMP, +`col65` BLOB, +`col66` LONGBLOB, +`col67` INT, +`col68` LONGTEXT, +`col69` ENUM ('test1','test2','test3'), +`col70` INT, +`col71` TIME, +`col72` TIMESTAMP, +`col73` TIMESTAMP, +`col74` VARCHAR (170), +`col75` SET ('test1','test2','test3'), +`col76` TINYBLOB, +`col77` BIGINT, +`col78` NUMERIC, +`col79` DATETIME, +`col80` YEAR, +`col81` NUMERIC, +`col82` LONGBLOB, +`col83` TEXT, +`col84` CHAR (83), +`col85` DECIMAL, +`col86` FLOAT, +`col87` INT, +`col88` VARCHAR (145), +`col89` DATE, +`col90` DECIMAL, +`col91` DECIMAL, +`col92` MEDIUMBLOB, +`col93` TIME, +KEY `idx0` (`col69`,`col90`,`col8`), +KEY `idx1` (`col60`), +KEY `idx2` (`col60`,`col70`,`col74`), +KEY `idx3` (`col22`,`col32`,`col72`,`col30`), +KEY `idx4` (`col29`), +KEY `idx5` (`col19`,`col45`(143)), +KEY `idx6` (`col46`,`col48`,`col5`,`col39`(118)), +KEY `idx7` (`col48`,`col61`), +KEY `idx8` (`col93`), +KEY `idx9` (`col31`), +KEY `idx10` (`col30`,`col21`), +KEY `idx11` (`col67`), +KEY `idx12` (`col44`,`col6`,`col8`,`col38`(226)), +KEY `idx13` (`col71`,`col41`,`col15`,`col49`(88)), +KEY `idx14` (`col78`), +KEY `idx15` (`col63`,`col67`,`col64`), +KEY `idx16` (`col17`,`col86`), +KEY `idx17` (`col77`,`col56`,`col10`,`col55`(24)), +KEY `idx18` (`col62`), +KEY `idx19` (`col31`,`col57`,`col56`,`col53`), +KEY `idx20` (`col46`), +KEY `idx21` (`col83`(54)), +KEY `idx22` (`col51`,`col7`(120)), +KEY `idx23` (`col7`(163),`col31`,`col71`,`col14`) +)engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + +# Creating a table with 10 columns and 32 indexes +DROP TABLE IF EXISTS `table1`; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE IF NOT EXISTS `table1` +(`col0` CHAR (113), +`col1` FLOAT, +`col2` BIGINT, +`col3` DECIMAL, +`col4` BLOB, +`col5` LONGTEXT, +`col6` SET ('test1','test2','test3'), +`col7` BIGINT, +`col8` BIGINT, +`col9` TINYBLOB, +KEY `idx0` (`col5`(101),`col7`,`col8`), +KEY `idx1` (`col8`), +KEY `idx2` 
(`col4`(177),`col9`(126),`col6`,`col3`), +KEY `idx3` (`col5`(160)), +KEY `idx4` (`col9`(242)), +KEY `idx5` (`col4`(139),`col2`,`col3`), +KEY `idx6` (`col7`), +KEY `idx7` (`col6`,`col2`,`col0`,`col3`), +KEY `idx8` (`col9`(66)), +KEY `idx9` (`col5`(253)), +KEY `idx10` (`col1`,`col7`,`col2`), +KEY `idx11` (`col9`(242),`col0`,`col8`,`col5`(163)), +KEY `idx12` (`col8`), +KEY `idx13` (`col0`,`col9`(37)), +KEY `idx14` (`col0`), +KEY `idx15` (`col5`(111)), +KEY `idx16` (`col8`,`col0`,`col5`(13)), +KEY `idx17` (`col4`(139)), +KEY `idx18` (`col5`(189),`col2`,`col3`,`col9`(136)), +KEY `idx19` (`col0`,`col3`,`col1`,`col8`), +KEY `idx20` (`col8`), +KEY `idx21` (`col0`,`col7`,`col9`(227),`col3`), +KEY `idx22` (`col0`), +KEY `idx23` (`col2`), +KEY `idx24` (`col3`), +KEY `idx25` (`col2`,`col3`), +KEY `idx26` (`col0`), +KEY `idx27` (`col5`(254)), +KEY `idx28` (`col3`), +KEY `idx29` (`col3`), +KEY `idx30` (`col7`,`col3`,`col0`,`col4`(220)), +KEY `idx31` (`col4`(1),`col0`) +)engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + +# Creating a table with 141 columns and 18 indexes +DROP TABLE IF EXISTS `table2`; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE IF NOT EXISTS `table2` +(`col0` BOOL, +`col1` MEDIUMINT, +`col2` VARCHAR (209), +`col3` MEDIUMBLOB, +`col4` CHAR (13), +`col5` DOUBLE, +`col6` TINYTEXT, +`col7` REAL, +`col8` SMALLINT, +`col9` BLOB, +`col10` TINYINT, +`col11` DECIMAL, +`col12` BLOB, +`col13` DECIMAL, +`col14` LONGBLOB, +`col15` SMALLINT, +`col16` LONGBLOB, +`col17` TINYTEXT, +`col18` FLOAT, +`col19` CHAR (78), +`col20` MEDIUMTEXT, +`col21` SET ('test1','test2','test3'), +`col22` MEDIUMINT, +`col23` INT, +`col24` MEDIUMBLOB, +`col25` ENUM ('test1','test2','test3'), +`col26` TINYBLOB, +`col27` VARCHAR (116), +`col28` TIMESTAMP, +`col29` BLOB, +`col30` SMALLINT, +`col31` DOUBLE PRECISION, +`col32` DECIMAL, +`col33` DECIMAL, +`col34` TEXT, +`col35` MEDIUMINT, +`col36` MEDIUMINT, +`col37` BIGINT, +`col38` VARCHAR (253), +`col39` TINYBLOB, +`col40` MEDIUMBLOB, +`col41` BIGINT, +`col42` DOUBLE, +`col43` TEXT, +`col44` BLOB, +`col45` TIME, +`col46` MEDIUMINT, +`col47` DOUBLE PRECISION, +`col48` SET ('test1','test2','test3'), +`col49` DOUBLE PRECISION, +`col50` VARCHAR (97), +`col51` TEXT, +`col52` NUMERIC, +`col53` ENUM ('test1','test2','test3'), +`col54` MEDIUMTEXT, +`col55` MEDIUMINT, +`col56` DATETIME, +`col57` DATETIME, +`col58` MEDIUMTEXT, +`col59` CHAR (244), +`col60` LONGBLOB, +`col61` MEDIUMBLOB, +`col62` DOUBLE, +`col63` SMALLINT, +`col64` BOOL, +`col65` SMALLINT, +`col66` VARCHAR (212), +`col67` TIME, +`col68` REAL, +`col69` BOOL, +`col70` BIGINT, +`col71` DATE, +`col72` TINYINT, +`col73` ENUM ('test1','test2','test3'), +`col74` DATE, +`col75` TIME, +`col76` DATETIME, +`col77` BOOL, +`col78` TINYTEXT, +`col79` MEDIUMINT, +`col80` NUMERIC, +`col81` LONGTEXT, +`col82` SET ('test1','test2','test3'), +`col83` DOUBLE PRECISION, +`col84` NUMERIC, +`col85` VARCHAR (184), +`col86` DOUBLE PRECISION, +`col87` MEDIUMTEXT, +`col88` MEDIUMBLOB, +`col89` BOOL, +`col90` SMALLINT, +`col91` TINYINT, +`col92` ENUM ('test1','test2','test3'), +`col93` BOOL, +`col94` TIMESTAMP, +`col95` BOOL, +`col96` MEDIUMTEXT, +`col97` DECIMAL, +`col98` BOOL, +`col99` DECIMAL, +`col100` MEDIUMINT, +`col101` DOUBLE PRECISION, +`col102` TINYINT, +`col103` BOOL, +`col104` MEDIUMINT, +`col105` DECIMAL, +`col106` NUMERIC, +`col107` TIMESTAMP, +`col108` MEDIUMBLOB, +`col109` TINYBLOB, +`col110` SET ('test1','test2','test3'), +`col111` YEAR, +`col112` TIMESTAMP, +`col113` CHAR (201), +`col114` BOOL, +`col115` TINYINT, +`col116` 
DOUBLE, +`col117` TINYINT, +`col118` TIMESTAMP, +`col119` SET ('test1','test2','test3'), +`col120` SMALLINT, +`col121` TINYBLOB, +`col122` TIMESTAMP, +`col123` BLOB, +`col124` DATE, +`col125` SMALLINT, +`col126` ENUM ('test1','test2','test3'), +`col127` MEDIUMBLOB, +`col128` DOUBLE PRECISION, +`col129` REAL, +`col130` VARCHAR (159), +`col131` MEDIUMBLOB, +`col132` BIGINT, +`col133` INT, +`col134` SET ('test1','test2','test3'), +`col135` CHAR (198), +`col136` SET ('test1','test2','test3'), +`col137` MEDIUMTEXT, +`col138` SMALLINT, +`col139` BLOB, +`col140` LONGBLOB, +KEY `idx0` (`col14`(139),`col24`(208),`col38`,`col35`), +KEY `idx1` (`col48`,`col118`,`col29`(131),`col100`), +KEY `idx2` (`col86`,`col67`,`col43`(175)), +KEY `idx3` (`col19`), +KEY `idx4` (`col40`(220),`col67`), +KEY `idx5` (`col99`,`col56`), +KEY `idx6` (`col68`,`col28`,`col137`(157)), +KEY `idx7` (`col51`(160),`col99`,`col45`,`col39`(9)), +KEY `idx8` (`col15`,`col52`,`col90`,`col94`), +KEY `idx9` (`col24`(3),`col139`(248),`col108`(118),`col41`), +KEY `idx10` (`col36`,`col92`,`col114`), +KEY `idx11` (`col115`,`col9`(116)), +KEY `idx12` (`col130`,`col93`,`col134`), +KEY `idx13` (`col123`(65)), +KEY `idx14` (`col44`(90),`col86`,`col119`), +KEY `idx15` (`col69`), +KEY `idx16` (`col132`,`col81`(118),`col18`), +KEY `idx17` (`col24`(250),`col7`,`col92`,`col45`) +)engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + +# Creating a table with 199 columns and 1 indexes +DROP TABLE IF EXISTS `table3`; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE IF NOT EXISTS `table3` +(`col0` SMALLINT, +`col1` SET ('test1','test2','test3'), +`col2` TINYTEXT, +`col3` DOUBLE, +`col4` NUMERIC, +`col5` DATE, +`col6` BIGINT, +`col7` DOUBLE, +`col8` TEXT, +`col9` INT, +`col10` REAL, +`col11` TINYINT, +`col12` NUMERIC, +`col13` NUMERIC, +`col14` TIME, +`col15` DOUBLE, +`col16` REAL, +`col17` MEDIUMBLOB, +`col18` YEAR, +`col19` TINYTEXT, +`col20` YEAR, +`col21` CHAR (250), +`col22` TINYINT, +`col23` TINYINT, +`col24` SMALLINT, +`col25` DATETIME, +`col26` MEDIUMINT, +`col27` LONGBLOB, +`col28` VARCHAR (106), +`col29` FLOAT, +`col30` MEDIUMTEXT, +`col31` TINYBLOB, +`col32` BIGINT, +`col33` YEAR, +`col34` REAL, +`col35` MEDIUMBLOB, +`col36` LONGTEXT, +`col37` LONGBLOB, +`col38` BIGINT, +`col39` FLOAT, +`col40` TIME, +`col41` DATETIME, +`col42` BOOL, +`col43` BIGINT, +`col44` SMALLINT, +`col45` TIME, +`col46` DOUBLE PRECISION, +`col47` TIME, +`col48` TINYTEXT, +`col49` DOUBLE PRECISION, +`col50` BIGINT, +`col51` NUMERIC, +`col52` TINYBLOB, +`col53` DATE, +`col54` DECIMAL, +`col55` SMALLINT, +`col56` TINYTEXT, +`col57` ENUM ('test1','test2','test3'), +`col58` YEAR, +`col59` TIME, +`col60` TINYINT, +`col61` DECIMAL, +`col62` DOUBLE, +`col63` DATE, +`col64` LONGTEXT, +`col65` DOUBLE, +`col66` VARCHAR (88), +`col67` MEDIUMTEXT, +`col68` DATE, +`col69` MEDIUMINT, +`col70` DECIMAL, +`col71` MEDIUMTEXT, +`col72` LONGTEXT, +`col73` REAL, +`col74` DOUBLE, +`col75` TIME, +`col76` DATE, +`col77` DECIMAL, +`col78` MEDIUMBLOB, +`col79` NUMERIC, +`col80` BIGINT, +`col81` YEAR, +`col82` SMALLINT, +`col83` MEDIUMINT, +`col84` TINYINT, +`col85` MEDIUMBLOB, +`col86` TIME, +`col87` MEDIUMBLOB, +`col88` LONGTEXT, +`col89` BOOL, +`col90` BLOB, +`col91` LONGBLOB, +`col92` YEAR, +`col93` BLOB, +`col94` INT, +`col95` TINYTEXT, +`col96` TINYINT, +`col97` DECIMAL, +`col98` ENUM ('test1','test2','test3'), +`col99` MEDIUMINT, +`col100` TINYINT, +`col101` MEDIUMBLOB, +`col102` TINYINT, +`col103` SET ('test1','test2','test3'), +`col104` TIMESTAMP, +`col105` TEXT, +`col106` DATETIME, +`col107` 
MEDIUMTEXT, +`col108` CHAR (220), +`col109` TIME, +`col110` VARCHAR (131), +`col111` DECIMAL, +`col112` FLOAT, +`col113` SMALLINT, +`col114` BIGINT, +`col115` LONGBLOB, +`col116` SET ('test1','test2','test3'), +`col117` ENUM ('test1','test2','test3'), +`col118` BLOB, +`col119` MEDIUMTEXT, +`col120` SET ('test1','test2','test3'), +`col121` DATETIME, +`col122` FLOAT, +`col123` VARCHAR (242), +`col124` YEAR, +`col125` MEDIUMBLOB, +`col126` TIME, +`col127` BOOL, +`col128` TINYBLOB, +`col129` DOUBLE, +`col130` TINYINT, +`col131` BIGINT, +`col132` SMALLINT, +`col133` INT, +`col134` DOUBLE PRECISION, +`col135` MEDIUMBLOB, +`col136` SET ('test1','test2','test3'), +`col137` TINYTEXT, +`col138` DOUBLE PRECISION, +`col139` NUMERIC, +`col140` BLOB, +`col141` SET ('test1','test2','test3'), +`col142` INT, +`col143` VARCHAR (26), +`col144` BLOB, +`col145` REAL, +`col146` SET ('test1','test2','test3'), +`col147` LONGBLOB, +`col148` TEXT, +`col149` BLOB, +`col150` CHAR (189), +`col151` LONGTEXT, +`col152` INT, +`col153` FLOAT, +`col154` LONGTEXT, +`col155` DATE, +`col156` LONGBLOB, +`col157` TINYBLOB, +`col158` REAL, +`col159` DATE, +`col160` TIME, +`col161` YEAR, +`col162` DOUBLE, +`col163` VARCHAR (90), +`col164` FLOAT, +`col165` NUMERIC, +`col166` ENUM ('test1','test2','test3'), +`col167` DOUBLE PRECISION, +`col168` DOUBLE PRECISION, +`col169` TINYBLOB, +`col170` TIME, +`col171` SMALLINT, +`col172` TINYTEXT, +`col173` SMALLINT, +`col174` DOUBLE, +`col175` VARCHAR (14), +`col176` VARCHAR (90), +`col177` REAL, +`col178` MEDIUMINT, +`col179` TINYBLOB, +`col180` FLOAT, +`col181` TIMESTAMP, +`col182` REAL, +`col183` DOUBLE PRECISION, +`col184` BIGINT, +`col185` INT, +`col186` MEDIUMTEXT, +`col187` TIME, +`col188` FLOAT, +`col189` TIME, +`col190` INT, +`col191` FLOAT, +`col192` MEDIUMINT, +`col193` TINYINT, +`col194` MEDIUMTEXT, +`col195` DATE, +`col196` TIME, +`col197` YEAR, +`col198` CHAR (206), +KEY `idx0` (`col39`,`col23`) +)engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + +# Creating a table with 133 columns and 16 indexes +DROP TABLE IF EXISTS `table4`; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE IF NOT EXISTS `table4` +(`col0` VARCHAR (60), +`col1` NUMERIC, +`col2` LONGTEXT, +`col3` MEDIUMTEXT, +`col4` LONGTEXT, +`col5` LONGBLOB, +`col6` LONGBLOB, +`col7` DATETIME, +`col8` TINYTEXT, +`col9` BLOB, +`col10` BOOL, +`col11` BIGINT, +`col12` TEXT, +`col13` VARCHAR (213), +`col14` TINYBLOB, +`col15` BOOL, +`col16` MEDIUMTEXT, +`col17` DOUBLE, +`col18` TEXT, +`col19` BLOB, +`col20` SET ('test1','test2','test3'), +`col21` TINYINT, +`col22` DATETIME, +`col23` TINYINT, +`col24` ENUM ('test1','test2','test3'), +`col25` REAL, +`col26` BOOL, +`col27` FLOAT, +`col28` LONGBLOB, +`col29` DATETIME, +`col30` FLOAT, +`col31` SET ('test1','test2','test3'), +`col32` LONGBLOB, +`col33` NUMERIC, +`col34` YEAR, +`col35` VARCHAR (146), +`col36` BIGINT, +`col37` DATETIME, +`col38` DATE, +`col39` SET ('test1','test2','test3'), +`col40` CHAR (112), +`col41` FLOAT, +`col42` YEAR, +`col43` TIME, +`col44` DOUBLE, +`col45` NUMERIC, +`col46` FLOAT, +`col47` DECIMAL, +`col48` BIGINT, +`col49` DECIMAL, +`col50` YEAR, +`col51` MEDIUMTEXT, +`col52` LONGBLOB, +`col53` SET ('test1','test2','test3'), +`col54` BLOB, +`col55` FLOAT, +`col56` REAL, +`col57` REAL, +`col58` TEXT, +`col59` MEDIUMBLOB, +`col60` INT, +`col61` INT, +`col62` DATE, +`col63` TEXT, +`col64` DATE, +`col65` ENUM ('test1','test2','test3'), +`col66` DOUBLE PRECISION, +`col67` TINYTEXT, +`col68` TINYBLOB, +`col69` FLOAT, +`col70` BLOB, +`col71` DATETIME, +`col72` DOUBLE, 
+`col73` LONGTEXT, +`col74` TIME, +`col75` DATETIME, +`col76` VARCHAR (122), +`col77` MEDIUMTEXT, +`col78` MEDIUMTEXT, +`col79` BOOL, +`col80` LONGTEXT, +`col81` TINYTEXT, +`col82` NUMERIC, +`col83` DOUBLE PRECISION, +`col84` DATE, +`col85` YEAR, +`col86` BLOB, +`col87` TINYTEXT, +`col88` DOUBLE PRECISION, +`col89` MEDIUMINT, +`col90` MEDIUMTEXT, +`col91` NUMERIC, +`col92` DATETIME, +`col93` NUMERIC, +`col94` SET ('test1','test2','test3'), +`col95` TINYTEXT, +`col96` SET ('test1','test2','test3'), +`col97` YEAR, +`col98` MEDIUMINT, +`col99` TEXT, +`col100` TEXT, +`col101` TIME, +`col102` VARCHAR (225), +`col103` TINYTEXT, +`col104` TEXT, +`col105` MEDIUMTEXT, +`col106` TINYINT, +`col107` TEXT, +`col108` LONGBLOB, +`col109` LONGTEXT, +`col110` TINYTEXT, +`col111` CHAR (56), +`col112` YEAR, +`col113` ENUM ('test1','test2','test3'), +`col114` TINYBLOB, +`col115` DATETIME, +`col116` DATE, +`col117` TIME, +`col118` MEDIUMTEXT, +`col119` DOUBLE PRECISION, +`col120` FLOAT, +`col121` TIMESTAMP, +`col122` MEDIUMINT, +`col123` YEAR, +`col124` DATE, +`col125` TEXT, +`col126` FLOAT, +`col127` TINYTEXT, +`col128` BOOL, +`col129` NUMERIC, +`col130` TIMESTAMP, +`col131` INT, +`col132` MEDIUMBLOB, +KEY `idx0` (`col130`), +KEY `idx1` (`col30`,`col55`,`col19`(31)), +KEY `idx2` (`col104`(186)), +KEY `idx3` (`col131`), +KEY `idx4` (`col64`,`col93`,`col2`(11)), +KEY `idx5` (`col34`,`col121`,`col22`), +KEY `idx6` (`col33`,`col55`,`col83`), +KEY `idx7` (`col17`,`col87`(245),`col99`(17)), +KEY `idx8` (`col65`,`col120`), +KEY `idx9` (`col82`), +KEY `idx10` (`col9`(72)), +KEY `idx11` (`col88`), +KEY `idx12` (`col128`,`col9`(200),`col71`,`col66`), +KEY `idx13` (`col77`(126)), +KEY `idx14` (`col105`(26),`col13`,`col117`), +KEY `idx15` (`col4`(246),`col130`,`col115`,`col3`(141)) +)engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + +# Creating a table with 176 columns and 13 indexes +DROP TABLE IF EXISTS `table5`; +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE IF NOT EXISTS `table5` +(`col0` MEDIUMTEXT, +`col1` VARCHAR (90), +`col2` TINYTEXT, +`col3` TIME, +`col4` BOOL, +`col5` TINYTEXT, +`col6` BOOL, +`col7` TIMESTAMP, +`col8` TINYBLOB, +`col9` TINYINT, +`col10` YEAR, +`col11` SET ('test1','test2','test3'), +`col12` TEXT, +`col13` CHAR (248), +`col14` BIGINT, +`col15` TEXT, +`col16` TINYINT, +`col17` NUMERIC, +`col18` SET ('test1','test2','test3'), +`col19` LONGBLOB, +`col20` FLOAT, +`col21` INT, +`col22` TEXT, +`col23` BOOL, +`col24` DECIMAL, +`col25` DOUBLE PRECISION, +`col26` FLOAT, +`col27` TINYBLOB, +`col28` NUMERIC, +`col29` MEDIUMBLOB, +`col30` DATE, +`col31` LONGTEXT, +`col32` DATE, +`col33` FLOAT, +`col34` BIGINT, +`col35` TINYTEXT, +`col36` MEDIUMTEXT, +`col37` TIME, +`col38` INT, +`col39` TINYINT, +`col40` SET ('test1','test2','test3'), +`col41` CHAR (130), +`col42` SMALLINT, +`col43` INT, +`col44` MEDIUMTEXT, +`col45` VARCHAR (126), +`col46` INT, +`col47` DOUBLE PRECISION, +`col48` BIGINT, +`col49` MEDIUMTEXT, +`col50` TINYBLOB, +`col51` MEDIUMINT, +`col52` TEXT, +`col53` VARCHAR (208), +`col54` VARCHAR (207), +`col55` NUMERIC, +`col56` DATETIME, +`col57` ENUM ('test1','test2','test3'), +`col58` NUMERIC, +`col59` TINYBLOB, +`col60` VARCHAR (73), +`col61` MEDIUMTEXT, +`col62` TINYBLOB, +`col63` DATETIME, +`col64` NUMERIC, +`col65` MEDIUMINT, +`col66` DATETIME, +`col67` NUMERIC, +`col68` TINYINT, +`col69` VARCHAR (58), +`col70` DECIMAL, +`col71` MEDIUMTEXT, +`col72` DATE, +`col73` TIME, +`col74` DOUBLE PRECISION, +`col75` DECIMAL, +`col76` MEDIUMBLOB, +`col77` REAL, +`col78` YEAR, +`col79` YEAR, +`col80` 
LONGBLOB, +`col81` BLOB, +`col82` BIGINT, +`col83` ENUM ('test1','test2','test3'), +`col84` NUMERIC, +`col85` SET ('test1','test2','test3'), +`col86` MEDIUMTEXT, +`col87` LONGBLOB, +`col88` TIME, +`col89` ENUM ('test1','test2','test3'), +`col90` DECIMAL, +`col91` FLOAT, +`col92` DATETIME, +`col93` TINYTEXT, +`col94` TIMESTAMP, +`col95` TIMESTAMP, +`col96` TEXT, +`col97` REAL, +`col98` VARCHAR (198), +`col99` TIME, +`col100` TINYINT, +`col101` BIGINT, +`col102` LONGBLOB, +`col103` LONGBLOB, +`col104` MEDIUMINT, +`col105` MEDIUMTEXT, +`col106` TIMESTAMP, +`col107` SMALLINT, +`col108` NUMERIC, +`col109` DECIMAL, +`col110` FLOAT, +`col111` DECIMAL, +`col112` REAL, +`col113` TINYTEXT, +`col114` FLOAT, +`col115` VARCHAR (7), +`col116` LONGTEXT, +`col117` DATE, +`col118` BIGINT, +`col119` TEXT, +`col120` BIGINT, +`col121` BLOB, +`col122` CHAR (110), +`col123` NUMERIC, +`col124` MEDIUMBLOB, +`col125` NUMERIC, +`col126` NUMERIC, +`col127` BOOL, +`col128` TIME, +`col129` TINYBLOB, +`col130` TINYBLOB, +`col131` DATE, +`col132` INT, +`col133` VARCHAR (123), +`col134` CHAR (238), +`col135` VARCHAR (225), +`col136` LONGTEXT, +`col137` LONGBLOB, +`col138` REAL, +`col139` TINYBLOB, +`col140` DATETIME, +`col141` TINYTEXT, +`col142` LONGBLOB, +`col143` BIGINT, +`col144` VARCHAR (236), +`col145` TEXT, +`col146` YEAR, +`col147` DECIMAL, +`col148` TEXT, +`col149` MEDIUMBLOB, +`col150` TINYINT, +`col151` BOOL, +`col152` VARCHAR (72), +`col153` INT, +`col154` VARCHAR (165), +`col155` TINYINT, +`col156` MEDIUMTEXT, +`col157` DOUBLE PRECISION, +`col158` TIME, +`col159` MEDIUMBLOB, +`col160` LONGBLOB, +`col161` DATETIME, +`col162` DOUBLE PRECISION, +`col163` BLOB, +`col164` ENUM ('test1','test2','test3'), +`col165` TIMESTAMP, +`col166` DATE, +`col167` TINYBLOB, +`col168` TINYBLOB, +`col169` LONGBLOB, +`col170` DATETIME, +`col171` BIGINT, +`col172` VARCHAR (30), +`col173` LONGTEXT, +`col174` TIME, +`col175` FLOAT, +KEY `idx0` (`col16`,`col156`(139),`col97`,`col120`), +KEY `idx1` (`col24`,`col0`(108)), +KEY `idx2` (`col117`,`col173`(34),`col132`,`col82`), +KEY `idx3` (`col2`(86)), +KEY `idx4` (`col2`(43)), +KEY `idx5` (`col83`,`col35`(87),`col111`), +KEY `idx6` (`col6`,`col134`,`col92`), +KEY `idx7` (`col56`), +KEY `idx8` (`col30`,`col53`,`col129`(66)), +KEY `idx9` (`col53`,`col113`(211),`col32`,`col15`(75)), +KEY `idx10` (`col34`), +KEY `idx11` (`col126`), +KEY `idx12` (`col24`) +)engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + +# Creating a table with 179 columns and 46 indexes +DROP TABLE IF EXISTS `table6`; +-- error ER_TOO_BIG_ROWSIZE +--error ER_TOO_BIG_ROWSIZE +CREATE TABLE IF NOT EXISTS `table6` +(`col0` ENUM ('test1','test2','test3'), +`col1` MEDIUMBLOB, +`col2` MEDIUMBLOB, +`col3` DATETIME, +`col4` DATE, +`col5` YEAR, +`col6` REAL, +`col7` NUMERIC, +`col8` MEDIUMBLOB, +`col9` TEXT, +`col10` TIMESTAMP, +`col11` DOUBLE, +`col12` DOUBLE, +`col13` SMALLINT, +`col14` TIMESTAMP, +`col15` DECIMAL, +`col16` DATE, +`col17` TEXT, +`col18` LONGBLOB, +`col19` BIGINT, +`col20` FLOAT, +`col21` DATETIME, +`col22` TINYINT, +`col23` MEDIUMBLOB, +`col24` SET ('test1','test2','test3'), +`col25` TIME, +`col26` TEXT, +`col27` LONGTEXT, +`col28` BIGINT, +`col29` REAL, +`col30` YEAR, +`col31` MEDIUMBLOB, +`col32` MEDIUMINT, +`col33` FLOAT, +`col34` TEXT, +`col35` DATE, +`col36` TIMESTAMP, +`col37` REAL, +`col38` BLOB, +`col39` BLOB, +`col40` BLOB, +`col41` TINYBLOB, +`col42` INT, +`col43` TINYINT, +`col44` REAL, +`col45` BIGINT, +`col46` TIMESTAMP, +`col47` BLOB, +`col48` ENUM ('test1','test2','test3'), +`col49` BOOL, 
+`col50` CHAR (109), +`col51` DOUBLE, +`col52` DOUBLE PRECISION, +`col53` ENUM ('test1','test2','test3'), +`col54` FLOAT, +`col55` DOUBLE PRECISION, +`col56` CHAR (166), +`col57` TEXT, +`col58` TIME, +`col59` DECIMAL, +`col60` TEXT, +`col61` ENUM ('test1','test2','test3'), +`col62` LONGTEXT, +`col63` YEAR, +`col64` DOUBLE, +`col65` CHAR (87), +`col66` DATE, +`col67` BOOL, +`col68` MEDIUMBLOB, +`col69` DATETIME, +`col70` DECIMAL, +`col71` TIME, +`col72` REAL, +`col73` LONGTEXT, +`col74` BLOB, +`col75` REAL, +`col76` INT, +`col77` INT, +`col78` FLOAT, +`col79` DOUBLE, +`col80` MEDIUMINT, +`col81` ENUM ('test1','test2','test3'), +`col82` VARCHAR (221), +`col83` BIGINT, +`col84` TINYINT, +`col85` BIGINT, +`col86` FLOAT, +`col87` MEDIUMBLOB, +`col88` CHAR (126), +`col89` MEDIUMBLOB, +`col90` DATETIME, +`col91` TINYINT, +`col92` DOUBLE, +`col93` NUMERIC, +`col94` DATE, +`col95` BLOB, +`col96` DATETIME, +`col97` TIME, +`col98` LONGBLOB, +`col99` INT, +`col100` SET ('test1','test2','test3'), +`col101` TINYBLOB, +`col102` INT, +`col103` MEDIUMBLOB, +`col104` MEDIUMTEXT, +`col105` FLOAT, +`col106` TINYBLOB, +`col107` VARCHAR (26), +`col108` TINYINT, +`col109` TIME, +`col110` TINYBLOB, +`col111` LONGBLOB, +`col112` TINYTEXT, +`col113` FLOAT, +`col114` TINYINT, +`col115` NUMERIC, +`col116` TIME, +`col117` SET ('test1','test2','test3'), +`col118` DATE, +`col119` SMALLINT, +`col120` BLOB, +`col121` TINYTEXT, +`col122` REAL, +`col123` YEAR, +`col124` REAL, +`col125` BOOL, +`col126` BLOB, +`col127` REAL, +`col128` MEDIUMBLOB, +`col129` TIMESTAMP, +`col130` LONGBLOB, +`col131` MEDIUMBLOB, +`col132` YEAR, +`col133` YEAR, +`col134` INT, +`col135` MEDIUMINT, +`col136` MEDIUMINT, +`col137` TINYTEXT, +`col138` TINYBLOB, +`col139` BLOB, +`col140` SET ('test1','test2','test3'), +`col141` ENUM ('test1','test2','test3'), +`col142` ENUM ('test1','test2','test3'), +`col143` TINYTEXT, +`col144` DATETIME, +`col145` TEXT, +`col146` DOUBLE PRECISION, +`col147` DECIMAL, +`col148` MEDIUMTEXT, +`col149` TINYTEXT, +`col150` SET ('test1','test2','test3'), +`col151` MEDIUMTEXT, +`col152` CHAR (126), +`col153` DOUBLE, +`col154` CHAR (243), +`col155` SET ('test1','test2','test3'), +`col156` SET ('test1','test2','test3'), +`col157` DATETIME, +`col158` DOUBLE, +`col159` NUMERIC, +`col160` DECIMAL, +`col161` FLOAT, +`col162` LONGBLOB, +`col163` LONGTEXT, +`col164` INT, +`col165` TIME, +`col166` CHAR (27), +`col167` VARCHAR (63), +`col168` TEXT, +`col169` TINYBLOB, +`col170` TINYBLOB, +`col171` ENUM ('test1','test2','test3'), +`col172` INT, +`col173` TIME, +`col174` DECIMAL, +`col175` DOUBLE, +`col176` MEDIUMBLOB, +`col177` LONGBLOB, +`col178` CHAR (43), +KEY `idx0` (`col131`(219)), +KEY `idx1` (`col67`,`col122`,`col59`,`col87`(33)), +KEY `idx2` (`col83`,`col42`,`col57`(152)), +KEY `idx3` (`col106`(124)), +KEY `idx4` (`col173`,`col80`,`col165`,`col89`(78)), +KEY `idx5` (`col174`,`col145`(108),`col23`(228),`col141`), +KEY `idx6` (`col157`,`col140`), +KEY `idx7` (`col130`(188),`col15`), +KEY `idx8` (`col52`), +KEY `idx9` (`col144`), +KEY `idx10` (`col155`), +KEY `idx11` (`col62`(230),`col1`(109)), +KEY `idx12` (`col151`(24),`col95`(85)), +KEY `idx13` (`col114`), +KEY `idx14` (`col42`,`col98`(56),`col146`), +KEY `idx15` (`col147`,`col39`(254),`col35`), +KEY `idx16` (`col79`), +KEY `idx17` (`col65`), +KEY `idx18` (`col149`(165),`col168`(119),`col32`,`col117`), +KEY `idx19` (`col64`), +KEY `idx20` (`col93`), +KEY `idx21` (`col64`,`col113`,`col104`(182)), +KEY `idx22` (`col52`,`col111`(189)), +KEY `idx23` (`col45`), +KEY `idx24` 
(`col154`,`col107`,`col110`(159)), +KEY `idx25` (`col149`(1),`col87`(131)), +KEY `idx26` (`col58`,`col115`,`col63`), +KEY `idx27` (`col95`(9),`col0`,`col87`(113)), +KEY `idx28` (`col92`,`col130`(1)), +KEY `idx29` (`col151`(129),`col137`(254),`col13`), +KEY `idx30` (`col49`), +KEY `idx31` (`col28`), +KEY `idx32` (`col83`,`col146`), +KEY `idx33` (`col155`,`col90`,`col17`(245)), +KEY `idx34` (`col174`,`col169`(44),`col107`), +KEY `idx35` (`col113`), +KEY `idx36` (`col52`), +KEY `idx37` (`col16`,`col120`(190)), +KEY `idx38` (`col28`), +KEY `idx39` (`col131`(165)), +KEY `idx40` (`col135`,`col26`(86)), +KEY `idx41` (`col69`,`col94`), +KEY `idx42` (`col105`,`col151`(38),`col97`), +KEY `idx43` (`col88`), +KEY `idx44` (`col176`(100),`col42`,`col73`(189),`col94`), +KEY `idx45` (`col2`(27),`col27`(116)) +)engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + +SET GLOBAL innodb_file_format=@save_innodb_file_format; +SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check; +SET GLOBAL innodb_file_per_table=@save_innodb_file_per_table; + +DROP TABLE IF EXISTS table0; +DROP TABLE IF EXISTS table1; +DROP TABLE IF EXISTS table2; +DROP TABLE IF EXISTS table3; +DROP TABLE IF EXISTS table4; +DROP TABLE IF EXISTS table5; +DROP TABLE IF EXISTS table6; + diff --git a/mysql-test/t/innodb_bug36172.test b/mysql-test/t/innodb_bug36172.test new file mode 100644 index 00000000000..ee35d678921 --- /dev/null +++ b/mysql-test/t/innodb_bug36172.test @@ -0,0 +1,33 @@ +# +# Test case for bug 36172 +# + +-- source include/not_embedded.inc +-- source include/have_innodb.inc + +SET storage_engine=InnoDB; + +# we do not really care about what gets printed, we are only +# interested in getting success or failure according to our +# expectations + +-- disable_query_log +-- disable_result_log + +SET @save_innodb_file_format=@@global.innodb_file_format; +SET @save_innodb_file_format_check=@@global.innodb_file_format_check; +SET @save_innodb_file_per_table=@@global.innodb_file_per_table; +SET GLOBAL innodb_file_format='Barracuda'; +SET GLOBAL innodb_file_per_table=on; + +DROP TABLE IF EXISTS `table0`; +CREATE TABLE `table0` ( `col0` tinyint(1) DEFAULT NULL, `col1` tinyint(1) DEFAULT NULL, `col2` tinyint(4) DEFAULT NULL, `col3` date DEFAULT NULL, `col4` time DEFAULT NULL, `col5` set('test1','test2','test3') DEFAULT NULL, `col6` time DEFAULT NULL, `col7` text, `col8` decimal(10,0) DEFAULT NULL, `col9` set('test1','test2','test3') DEFAULT NULL, `col10` float DEFAULT NULL, `col11` double DEFAULT NULL, `col12` enum('test1','test2','test3') DEFAULT NULL, `col13` tinyblob, `col14` year(4) DEFAULT NULL, `col15` set('test1','test2','test3') DEFAULT NULL, `col16` decimal(10,0) DEFAULT NULL, `col17` decimal(10,0) DEFAULT NULL, `col18` blob, `col19` datetime DEFAULT NULL, `col20` double DEFAULT NULL, `col21` decimal(10,0) DEFAULT NULL, `col22` datetime DEFAULT NULL, `col23` decimal(10,0) DEFAULT NULL, `col24` decimal(10,0) DEFAULT NULL, `col25` longtext, `col26` tinyblob, `col27` time DEFAULT NULL, `col28` tinyblob, `col29` enum('test1','test2','test3') DEFAULT NULL, `col30` smallint(6) DEFAULT NULL, `col31` double DEFAULT NULL, `col32` float DEFAULT NULL, `col33` char(175) DEFAULT NULL, `col34` tinytext, `col35` tinytext, `col36` tinyblob, `col37` tinyblob, `col38` tinytext, `col39` mediumblob, `col40` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, `col41` double DEFAULT NULL, `col42` smallint(6) DEFAULT NULL, `col43` longblob, `col44` varchar(80) DEFAULT NULL, `col45` mediumtext, `col46` decimal(10,0) 
DEFAULT NULL, `col47` bigint(20) DEFAULT NULL, `col48` date DEFAULT NULL, `col49` tinyblob, `col50` date DEFAULT NULL, `col51` tinyint(1) DEFAULT NULL, `col52` mediumint(9) DEFAULT NULL, `col53` float DEFAULT NULL, `col54` tinyblob, `col55` longtext, `col56` smallint(6) DEFAULT NULL, `col57` enum('test1','test2','test3') DEFAULT NULL, `col58` datetime DEFAULT NULL, `col59` mediumtext, `col60` varchar(232) DEFAULT NULL, `col61` decimal(10,0) DEFAULT NULL, `col62` year(4) DEFAULT NULL, `col63` smallint(6) DEFAULT NULL, `col64` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', `col65` blob, `col66` longblob, `col67` int(11) DEFAULT NULL, `col68` longtext, `col69` enum('test1','test2','test3') DEFAULT NULL, `col70` int(11) DEFAULT NULL, `col71` time DEFAULT NULL, `col72` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', `col73` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', `col74` varchar(170) DEFAULT NULL, `col75` set('test1','test2','test3') DEFAULT NULL, `col76` tinyblob, `col77` bigint(20) DEFAULT NULL, `col78` decimal(10,0) DEFAULT NULL, `col79` datetime DEFAULT NULL, `col80` year(4) DEFAULT NULL, `col81` decimal(10,0) DEFAULT NULL, `col82` longblob, `col83` text, `col84` char(83) DEFAULT NULL, `col85` decimal(10,0) DEFAULT NULL, `col86` float DEFAULT NULL, `col87` int(11) DEFAULT NULL, `col88` varchar(145) DEFAULT NULL, `col89` date DEFAULT NULL, `col90` decimal(10,0) DEFAULT NULL, `col91` decimal(10,0) DEFAULT NULL, `col92` mediumblob, `col93` time DEFAULT NULL, KEY `idx0` (`col69`,`col90`,`col8`), KEY `idx1` (`col60`), KEY `idx2` (`col60`,`col70`,`col74`), KEY `idx3` (`col22`,`col32`,`col72`,`col30`), KEY `idx4` (`col29`), KEY `idx5` (`col19`,`col45`(143)), KEY `idx6` (`col46`,`col48`,`col5`,`col39`(118)), KEY `idx7` (`col48`,`col61`), KEY `idx8` (`col93`), KEY `idx9` (`col31`), KEY `idx10` (`col30`,`col21`), KEY `idx11` (`col67`), KEY `idx12` (`col44`,`col6`,`col8`,`col38`(226)), KEY `idx13` (`col71`,`col41`,`col15`,`col49`(88)), KEY `idx14` (`col78`), KEY `idx15` (`col63`,`col67`,`col64`), KEY `idx16` (`col17`,`col86`), KEY `idx17` (`col77`,`col56`,`col10`,`col55`(24)), KEY `idx18` (`col62`), KEY `idx19` (`col31`,`col57`,`col56`,`col53`), KEY `idx20` (`col46`), KEY `idx21` (`col83`(54)), KEY `idx22` (`col51`,`col7`(120)), KEY `idx23` (`col7`(163),`col31`,`col71`,`col14`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1 ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=2; +insert ignore into `table0` set `col23` = 7887371.5084383683, `col24` = 4293854615.6906948000, `col25` = 'vitalist', `col26` = 'widespread', `col27` = '3570490', `col28` = 'habitual', `col30` = -5471, `col31` = 4286985783.6771750000, `col32` = 6354540.9826654866, `col33` = 'defoliation', `col34` = 'logarithms', `col35` = 'tegument\'s', `col36` = 'scouting\'s', `col37` = 'intermittency', `col38` = 'elongates', `col39` = 'prophecies', `col40` = '20560103035939', `col41` = 4292809130.0544143000, `col42` = 22057, `col43` = 'Hess\'s', `col44` = 'bandstand', `col45` = 'phenylketonuria', `col46` = 6338767.4018677324, `col47` = 5310247, `col48` = '12592418', `col49` = 'churchman\'s', `col50` = '32226125', `col51` = -58, `col52` = -6207968, `col53` = 1244839.3255104220, `col54` = 'robotized', `col55` = 'monotonous', `col56` = -26909, `col58` = '20720107023550', `col59` = 'suggestiveness\'s', `col60` = 'gemology', `col61` = 4287800670.2229986000, `col62` = '1944', `col63` = -16827, `col64` = '20700107212324', `col65` = 'Nicolais', `col66` = 'apteryx', `col67` = 6935317, `col68` = 'stroganoff', `col70` = 3316430, `col71` = '3277608', `col72` = 
'19300511045918', `col73` = '20421201003327', `col74` = 'attenuant', `col75` = '15173', `col76` = 'upstroke\'s', `col77` = 8118987, `col78` = 6791516.2735374002, `col79` = '20780701144624', `col80` = '2134', `col81` = 4290682351.3127537000, `col82` = 'unexplainably', `col83` = 'Storm', `col84` = 'Greyso\'s', `col85` = 4289119212.4306774000, `col86` = 7617575.8796655172, `col87` = -6325335, `col88` = 'fondue\'s', `col89` = '40608940', `col90` = 1659421.8093508712, `col91` = 8346904.6584368423, `col92` = 'reloads', `col93` = '5188366';
+CHECK TABLE table0 EXTENDED;
+INSERT IGNORE INTO `table0` SET `col19` = '19940127002709', `col20` = 2383927.9055146948, `col21` = 4293243420.5621204000, `col22` = '20511211123705', `col23` = 4289899778.6573381000, `col24` = 4293449279.0540481000, `col25` = 'emphysemic', `col26` = 'dentally', `col27` = '2347406', `col28` = 'eruct', `col30` = 1222, `col31` = 4294372994.9941406000, `col32` = 4291385574.1173744000, `col33` = 'borrowing\'s', `col34` = 'septics', `col35` = 'ratter\'s', `col36` = 'Kaye', `col37` = 'Florentia', `col38` = 'allium', `col39` = 'barkeep', `col40` = '19510407003441', `col41` = 4293559200.4215522000, `col42` = 22482, `col43` = 'decussate', `col44` = 'Brom\'s', `col45` = 'violated', `col46` = 4925506.4635456400, `col47` = 930549, `col48` = '51296066', `col49` = 'voluminously', `col50` = '29306676', `col51` = -88, `col52` = -2153690, `col53` = 4290250202.1464887000, `col54` = 'expropriation', `col55` = 'Aberdeen\'s', `col56` = 20343, `col58` = '19640415171532', `col59` = 'extern', `col60` = 'Ubana', `col61` = 4290487961.8539081000, `col62` = '2147', `col63` = -24271, `col64` = '20750801194548', `col65` = 'Cunaxa\'s', `col66` = 'pasticcio', `col67` = 2795817, `col68` = 'Indore\'s', `col70` = 6864127, `col71` = '1817832', `col72` = '20540506114211', `col73` = '20040101012300', `col74` = 'rationalized', `col75` = '45522', `col76` = 'indene', `col77` = -6964559, `col78` = 4247535.5266884370, `col79` = '20720416124357', `col80` = '2143', `col81` = 4292060102.4466386000, `col82` = 'striving', `col83` = 'boneblack\'s', `col84` = 'redolent', `col85` = 6489697.9009369183, `col86` = 4287473465.9731131000, `col87` = 7726015, `col88` = 'perplexed', `col89` = '17153791', `col90` = 5478587.1108127078, `col91` = 4287091404.7004304000, `col92` = 'Boulez\'s', `col93` = '2931278';
+CHECK TABLE table0 EXTENDED;
+
+SET GLOBAL innodb_file_format=@save_innodb_file_format;
+SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check;
+SET GLOBAL innodb_file_per_table=@save_innodb_file_per_table;
+DROP TABLE table0; diff --git a/mysql-test/t/innodb_bug40360.test b/mysql-test/t/innodb_bug40360.test new file mode 100644 index 00000000000..e88837aab4f --- /dev/null +++ b/mysql-test/t/innodb_bug40360.test @@ -0,0 +1,16 @@
+#
+# Make sure http://bugs.mysql.com/40360 remains fixed.
+#
+
+-- source include/not_embedded.inc
+
+-- source include/have_innodb.inc
+
+SET TX_ISOLATION='READ-COMMITTED';
+
+# This is the default since MySQL 5.1.29
+SET BINLOG_FORMAT='STATEMENT';
+
+CREATE TABLE bug40360 (a INT) engine=innodb;
+
+INSERT INTO bug40360 VALUES (1);
+
+DROP TABLE bug40360; diff --git a/mysql-test/t/innodb_bug41904.test b/mysql-test/t/innodb_bug41904.test new file mode 100644 index 00000000000..365c5229adc --- /dev/null +++ b/mysql-test/t/innodb_bug41904.test @@ -0,0 +1,14 @@
+#
+# Make sure http://bugs.mysql.com/41904 remains fixed.
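+# Bug 41904 (as the test below implies): creating a UNIQUE index on a
+# column that already holds several NULLs must succeed, because NULLs
+# do not compare equal and so are not duplicates for uniqueness purposes.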
+# + +-- source include/not_embedded.inc +-- source include/have_innodb.inc + +CREATE TABLE bug41904 (id INT PRIMARY KEY, uniquecol CHAR(15)) ENGINE=InnoDB; + +INSERT INTO bug41904 VALUES (1,NULL), (2,NULL); + +CREATE UNIQUE INDEX ui ON bug41904 (uniquecol); + +DROP TABLE bug41904; diff --git a/mysql-test/t/innodb_information_schema.test b/mysql-test/t/innodb_information_schema.test new file mode 100644 index 00000000000..eaed653854a --- /dev/null +++ b/mysql-test/t/innodb_information_schema.test @@ -0,0 +1,145 @@ +# +# Test that user data is correctly "visualized" in +# INFORMATION_SCHEMA.innodb_locks.lock_data +# + +-- source include/have_innodb.inc + +-- disable_query_log +-- disable_result_log + +SET storage_engine=InnoDB; + +-- disable_warnings +DROP TABLE IF EXISTS t_min, t_max; +-- enable_warnings + +let $table_def = +( + c01 TINYINT, + c02 TINYINT UNSIGNED, + c03 SMALLINT, + c04 SMALLINT UNSIGNED, + c05 MEDIUMINT, + c06 MEDIUMINT UNSIGNED, + c07 INT, + c08 INT UNSIGNED, + c09 BIGINT, + c10 BIGINT UNSIGNED, + PRIMARY KEY(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10) +); + +-- eval CREATE TABLE t_min $table_def; +INSERT INTO t_min VALUES +(-128, 0, + -32768, 0, + -8388608, 0, + -2147483648, 0, + -9223372036854775808, 0); + +-- eval CREATE TABLE t_max $table_def; +INSERT INTO t_max VALUES +(127, 255, + 32767, 65535, + 8388607, 16777215, + 2147483647, 4294967295, + 9223372036854775807, 18446744073709551615); + +CREATE TABLE ```t'\"_str` ( + c1 VARCHAR(32), + c2 VARCHAR(32), + c3 VARCHAR(32), + c4 VARCHAR(32), + c5 VARCHAR(32), + c6 VARCHAR(32), + c7 VARCHAR(32), + PRIMARY KEY(c1, c2, c3, c4, c5, c6, c7) +); +INSERT INTO ```t'\"_str` VALUES +('1', 'abc', '''abc', 'abc''', 'a''bc', 'a''bc''', '''abc'''''); +INSERT INTO ```t'\"_str` VALUES +('2', 'abc', '"abc', 'abc"', 'a"bc', 'a"bc"', '"abc""'); +INSERT INTO ```t'\"_str` VALUES +('3', 'abc', '\\abc', 'abc\\', 'a\\bc', 'a\\bc\\', '\\abc\\\\'); +INSERT INTO ```t'\"_str` VALUES +('4', 'abc', 0x00616263, 0x61626300, 0x61006263, 0x6100626300, 0x610062630000); + +-- connect (con_lock,localhost,root,,) +-- connect (con_min_trylock,localhost,root,,) +-- connect (con_max_trylock,localhost,root,,) +-- connect (con_str_insert_supremum,localhost,root,,) +-- connect (con_str_lock_row1,localhost,root,,) +-- connect (con_str_lock_row2,localhost,root,,) +-- connect (con_str_lock_row3,localhost,root,,) +-- connect (con_str_lock_row4,localhost,root,,) +-- connect (con_verify_innodb_locks,localhost,root,,) + +-- connection con_lock +SET autocommit=0; +SELECT * FROM t_min FOR UPDATE; +SELECT * FROM t_max FOR UPDATE; +SELECT * FROM ```t'\"_str` FOR UPDATE; + +-- connection con_min_trylock +-- send +SELECT * FROM t_min FOR UPDATE; + +-- connection con_max_trylock +-- send +SELECT * FROM t_max FOR UPDATE; + +-- connection con_str_insert_supremum +-- send +INSERT INTO ```t'\"_str` VALUES +('z', 'z', 'z', 'z', 'z', 'z', 'z'); + +-- connection con_str_lock_row1 +-- send +SELECT * FROM ```t'\"_str` WHERE c1 = '1' FOR UPDATE; + +-- connection con_str_lock_row2 +-- send +SELECT * FROM ```t'\"_str` WHERE c1 = '2' FOR UPDATE; + +-- connection con_str_lock_row3 +-- send +SELECT * FROM ```t'\"_str` WHERE c1 = '3' FOR UPDATE; + +-- connection con_str_lock_row4 +-- send +SELECT * FROM ```t'\"_str` WHERE c1 = '4' FOR UPDATE; + +# Give time to the above 2 queries to execute before continuing. 
+# Without this sleep it sometimes happens that the SELECT from innodb_locks
+# executes before some of them, resulting in fewer rows than expected
+# being selected from innodb_locks.
+-- sleep 0.1
+
+-- enable_result_log
+-- connection con_verify_innodb_locks
+SELECT lock_mode, lock_type, lock_table, lock_index, lock_rec, lock_data
+FROM INFORMATION_SCHEMA.INNODB_LOCKS ORDER BY lock_data;
+
+SELECT lock_table,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_LOCKS
+GROUP BY lock_table;
+
+set @save_sql_mode = @@sql_mode;
+SET SQL_MODE='ANSI_QUOTES';
+SELECT lock_table,COUNT(*) FROM INFORMATION_SCHEMA.INNODB_LOCKS
+GROUP BY lock_table;
+SET @@sql_mode=@save_sql_mode;
+-- disable_result_log
+
+-- connection default
+
+-- disconnect con_lock
+-- disconnect con_min_trylock
+-- disconnect con_max_trylock
+-- disconnect con_str_insert_supremum
+-- disconnect con_str_lock_row1
+-- disconnect con_str_lock_row2
+-- disconnect con_str_lock_row3
+-- disconnect con_str_lock_row4
+-- disconnect con_verify_innodb_locks
+
+DROP TABLE t_min, t_max, ```t'\"_str`; diff --git a/mysql-test/t/innodb_xtradb_bug317074.test b/mysql-test/t/innodb_xtradb_bug317074.test new file mode 100644 index 00000000000..554acc7196c --- /dev/null +++ b/mysql-test/t/innodb_xtradb_bug317074.test @@ -0,0 +1,45 @@
+-- source include/have_innodb.inc
+
+SET @save_innodb_file_format=@@global.innodb_file_format;
+SET @save_innodb_file_format_check=@@global.innodb_file_format_check;
+SET @save_innodb_file_per_table=@@global.innodb_file_per_table;
+SET GLOBAL innodb_file_format='Barracuda';
+SET GLOBAL innodb_file_per_table=ON;
+
+-- disable_query_log
+-- disable_result_log
+
+DROP TABLE IF EXISTS `test1`;
+CREATE TABLE IF NOT EXISTS `test1` (
+ `a` int primary key auto_increment,
+ `b` int default 0,
+ `c` char(100) default 'testtest'
+) ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+delimiter |;
+CREATE PROCEDURE insert_many(p1 int)
+BEGIN
+SET @x = 0;
+SET @y = 0;
+REPEAT
+ insert into test1 set b=1;
+ SET @x = @x + 1;
+ SET @y = @y + 1;
+ IF @y >= 100 THEN
+ commit;
+ SET @y = 0;
+ END IF;
+UNTIL @x >= p1 END REPEAT;
+END|
+delimiter ;|
+call insert_many(100000);
+DROP PROCEDURE insert_many;
+
+# The bug was a hang at the following statement
+ALTER TABLE test1 ENGINE=MyISAM;
+
+SET GLOBAL innodb_file_format=@save_innodb_file_format;
+SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check;
+SET GLOBAL innodb_file_per_table=@save_innodb_file_per_table;
+
+DROP TABLE test1; diff --git a/mysql-test/t/mysqlbinlog_row_big.test b/mysql-test/t/mysqlbinlog_row_big.test index 14e818e7214..f4e095af021 100644 --- a/mysql-test/t/mysqlbinlog_row_big.test +++ b/mysql-test/t/mysqlbinlog_row_big.test @@ -23,6 +23,16 @@
# This is a big test.
--source include/big_test.inc
+--echo #
+--echo # We need big packets.
+--echo #
+connect (con1, localhost, root,,);
+connection con1;
+SET @old_global_max_allowed_packet=@@global.max_allowed_packet;
+SET @@global.max_allowed_packet= 1024*1024*1024;
+connect (con2, localhost, root,,);
+connection con2;
+
--echo #
--echo # Preparatory cleanup.
--echo #
@@ -35,11 +45,6 @@ DROP TABLE IF EXISTS t1;
--echo #
--echo #
SET timestamp=1000000000;
---echo #
---echo # We need big packets.
---echo #
-SET @@global.max_allowed_packet= 1024*1024*1024;
-
--echo #
--echo # Delete all existing binary logs.
--echo # @@ -122,9 +127,14 @@ let $MYSQLD_DATADIR= `select @@datadir`; --echo # DROP TABLE t1; +connection con1; +SET @@global.max_allowed_packet=@old_global_max_allowed_packet; + --echo remove_file \$MYSQLTEST_VARDIR/$mysqlbinlog_output # # NOTE: If you want to see the *huge* mysqlbinlog output, disable next line: # --remove_file $MYSQLTEST_VARDIR/$mysqlbinlog_output +disconnect con1 +disconnect con2 diff --git a/mysql-test/t/partition_innodb.test b/mysql-test/t/partition_innodb.test index 2abbceffbb0..722fad2919e 100644 --- a/mysql-test/t/partition_innodb.test +++ b/mysql-test/t/partition_innodb.test @@ -27,14 +27,14 @@ UPDATE t1 SET DATA = data*2 WHERE id = 3; # grouping/referencing in replace_regex is very slow on long strings, # removing all before/after the interesting row before grouping/referencing ---replace_regex /.*---TRANSACTION [0-9]+ [0-9]+, .*, OS thread id [0-9]+// /MySQL thread id [0-9]+, query id [0-9]+ .*// /.*([0-9]+ lock struct\(s\)), heap size [0-9]+, ([0-9]+ row lock\(s\)).*/\1 \2/ +--replace_regex /.*LIST OF TRANSACTIONS FOR EACH SESSION:// /MySQL thread id [0-9]+, query id [0-9]+ .*// /.*([0-9]+ lock struct\(s\)), heap size [0-9]+, ([0-9]+ row lock\(s\)).*/\1 \2/ SHOW ENGINE InnoDB STATUS; UPDATE t1 SET data = data*2 WHERE data = 2; # grouping/referencing in replace_regex is very slow on long strings, # removing all before/after the interesting row before grouping/referencing ---replace_regex /.*---TRANSACTION [0-9]+ [0-9]+, .*, OS thread id [0-9]+// /MySQL thread id [0-9]+, query id [0-9]+ .*// /.*([0-9]+ lock struct\(s\)), heap size [0-9]+, ([0-9]+ row lock\(s\)).*/\1 \2/ +--replace_regex /.*LIST OF TRANSACTIONS FOR EACH SESSION:// /MySQL thread id [0-9]+, query id [0-9]+ .*// /.*([0-9]+ lock struct\(s\)), heap size [0-9]+, ([0-9]+ row lock\(s\)).*/\1 \2/ SHOW ENGINE InnoDB STATUS; SET @@session.tx_isolation = @old_tx_isolation; diff --git a/mysql-test/t/type_bit_innodb.test b/mysql-test/t/type_bit_innodb.test index e7e66da8927..dc5947e25e1 100644 --- a/mysql-test/t/type_bit_innodb.test +++ b/mysql-test/t/type_bit_innodb.test @@ -41,7 +41,7 @@ create table t1 (a bit) engine=innodb; insert into t1 values (b'0'), (b'1'), (b'000'), (b'100'), (b'001'); select hex(a) from t1; # It is not deterministic which duplicate will be seen first ---replace_regex /(.*Duplicate entry )'.*'( for key.*)/\1''\2/ +--replace_regex /entry '(.*)' for/entry '' for/ --error ER_DUP_ENTRY alter table t1 add unique (a); drop table t1; diff --git a/mysql-test/t/variables-big.test b/mysql-test/t/variables-big.test index 2dacabfece7..efefa055eab 100644 --- a/mysql-test/t/variables-big.test +++ b/mysql-test/t/variables-big.test @@ -8,19 +8,20 @@ # Bug #27322 failure to allocate transaction_prealloc_size causes crash # +set @pid_temp = (select ID from information_schema.processlist); set session transaction_prealloc_size=1024*1024*1024*1; --replace_column 1 # -show processlist; +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; set session transaction_prealloc_size=1024*1024*1024*2; --replace_column 1 # -show processlist; +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; --replace_column 1 # set session transaction_prealloc_size=1024*1024*1024*3; --replace_column 1 # -show processlist; +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; set session transaction_prealloc_size=1024*1024*1024*4; --replace_column 1 # -show processlist; +select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; set 
session transaction_prealloc_size=1024*1024*1024*5;
--replace_column 1 #
-show processlist;
+select @pid_temp = (select ID from information_schema.processlist) as 'TRUE'; diff --git a/mysys/mf_iocache2.c b/mysys/mf_iocache2.c index 728501e6c50..87346f26b60 100644 --- a/mysys/mf_iocache2.c +++ b/mysys/mf_iocache2.c @@ -464,3 +464,52 @@ process_flags:
err:
  return (size_t) -1;
}
+
+int init_strvar_from_file(char *var, int max_size, IO_CACHE *f,
+                          const char *default_val)
+{
+  uint length;
+  DBUG_ENTER("init_strvar_from_file");
+
+  if ((length=my_b_gets(f,var, max_size)))
+  {
+    char* last_p = var + length -1;
+    if (*last_p == '\n')
+      *last_p = 0; /* if we stopped on newline, kill it */
+    else
+    {
+      /*
+        If we truncated a line or stopped on last char, remove all chars
+        up to and including newline.
+      */
+      int c;
+      while (((c=my_b_get(f)) != '\n' && c != my_b_EOF))
+        ;
+    }
+    DBUG_RETURN(0);
+  }
+  else if (default_val)
+  {
+    strmake(var, default_val, max_size-1);
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(1);
+}
+
+int init_intvar_from_file(int* var, IO_CACHE* f, int default_val)
+{
+  char buf[32];
+  DBUG_ENTER("init_intvar_from_file");
+
+  if (my_b_gets(f, buf, sizeof(buf)))
+  {
+    *var = atoi(buf);
+    DBUG_RETURN(0);
+  }
+  else if (default_val)
+  {
+    *var = default_val;
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(1);
+} diff --git a/mysys/thr_mutex.c b/mysys/thr_mutex.c index 80f21e53473..c46b68761db 100644 --- a/mysys/thr_mutex.c +++ b/mysys/thr_mutex.c @@ -149,6 +149,35 @@ static inline void remove_from_active_list(safe_mutex_t *mp)
  mp->prev= mp->next= 0;
}
+/*
+  We initialise the hashes for deadlock detection lazily.
+  This greatly helps with performance when lots of mutexes are initialised but
+  only a few of them are actually used (e.g. XtraDB).
+*/
+static int safe_mutex_lazy_init_deadlock_detection(safe_mutex_t *mp)
+{
+  if (!my_multi_malloc(MY_FAE | MY_WME,
+                       &mp->locked_mutex, sizeof(*mp->locked_mutex),
+                       &mp->used_mutex, sizeof(*mp->used_mutex), NullS))
+  {
+    return 1; /* Error */
+  }
+
+  pthread_mutex_lock(&THR_LOCK_mutex);
+  mp->id= ++safe_mutex_id;
+  pthread_mutex_unlock(&THR_LOCK_mutex);
+  hash_init(mp->locked_mutex, &my_charset_bin,
+            1000,
+            offsetof(safe_mutex_deadlock_t, id),
+            sizeof(mp->id),
+            0, 0, HASH_UNIQUE);
+  hash_init(mp->used_mutex, &my_charset_bin,
+            1000,
+            offsetof(safe_mutex_t, id),
+            sizeof(mp->id),
+            0, 0, HASH_UNIQUE);
+  return 0;
+}
int safe_mutex_init(safe_mutex_t *mp,
                    const pthread_mutexattr_t *attr __attribute__((unused)),
@@ -167,35 +196,8 @@ int safe_mutex_init(safe_mutex_t *mp,
  mp->line= line;
  /* Skip the very common '&' prefix from the autogenerated name */
  mp->name= name[0] == '&' ? name + 1 : name;
+ /* Deadlock detection is initialised lazily, on first use.
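+    safe_mutex_lock() calls safe_mutex_lazy_init_deadlock_detection() the
+    first time the mutex is locked with detection enabled; until then
+    mp->used_mutex stays NULL and safe_mutex_free_deadlock_data() skips it.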
*/ - if (safe_mutex_deadlock_detector && !( my_flags & MYF_NO_DEADLOCK_DETECTION)) - { - if (!my_multi_malloc(MY_FAE | MY_WME, - &mp->locked_mutex, sizeof(*mp->locked_mutex), - &mp->used_mutex, sizeof(*mp->used_mutex), NullS)) - { - /* Disable deadlock handling for this mutex */ - my_flags|= MYF_NO_DEADLOCK_DETECTION; - } - else - { - pthread_mutex_lock(&THR_LOCK_mutex); - mp->id= ++safe_mutex_id; - pthread_mutex_unlock(&THR_LOCK_mutex); - hash_init(mp->locked_mutex, &my_charset_bin, - 1000, - offsetof(safe_mutex_deadlock_t, id), - sizeof(mp->id), - 0, 0, HASH_UNIQUE); - hash_init(mp->used_mutex, &my_charset_bin, - 1000, - offsetof(safe_mutex_t, id), - sizeof(mp->id), - 0, 0, HASH_UNIQUE); - } - } - else - my_flags|= MYF_NO_DEADLOCK_DETECTION; mp->create_flags= my_flags; #ifdef SAFE_MUTEX_DETECT_DESTROY @@ -310,7 +312,8 @@ int safe_mutex_lock(safe_mutex_t *mp, myf my_flags, const char *file, /* Deadlock detection */ mp->prev= mp->next= 0; - if (!(mp->active_flags & (MYF_TRY_LOCK | MYF_NO_DEADLOCK_DETECTION))) + if (!(mp->active_flags & (MYF_TRY_LOCK | MYF_NO_DEADLOCK_DETECTION)) && + (mp->used_mutex != NULL || !safe_mutex_lazy_init_deadlock_detection(mp))) { safe_mutex_t **mutex_in_use= my_thread_var_mutex_in_use(); @@ -643,7 +646,7 @@ int safe_mutex_destroy(safe_mutex_t *mp, const char *file, uint line) void safe_mutex_free_deadlock_data(safe_mutex_t *mp) { /* Free all entries that points to this one */ - if (!(mp->create_flags & MYF_NO_DEADLOCK_DETECTION)) + if (!(mp->create_flags & MYF_NO_DEADLOCK_DETECTION) && mp->used_mutex != NULL) { pthread_mutex_lock(&THR_LOCK_mutex); my_hash_iterate(mp->used_mutex, diff --git a/sql/log_event.cc b/sql/log_event.cc index 87917b6ac86..485dc2fddc3 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -9313,3 +9313,29 @@ st_print_event_info::st_print_event_info() open_cached_file(&body_cache, NULL, NULL, 0, flags); } #endif + +#if defined(MYSQL_SERVER) +/* + Access to the current replication position. + + There is a dummy replacement for this in the embedded library that returns + FALSE; this is used by XtraDB to allow it to access replication stuff while + still being able to use the same plugin in both stand-alone and embedded. 
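+
+  A hypothetical caller (illustrative only, not part of this patch) might
+  read the position like:
+
+    const char *log_name, *relay_name;
+    ulonglong log_pos, relay_pos;
+    if (rpl_get_position_info(&log_name, &log_pos, &relay_name, &relay_pos))
+      fprintf(stderr, "replicated up to %s:%llu\n", log_name, log_pos);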
+*/ +bool rpl_get_position_info(const char **log_file_name, ulonglong *log_pos, + const char **group_relay_log_name, + ulonglong *relay_log_pos) +{ +#if defined(EMBEDDED_LIBRARY) || !defined(HAVE_REPLICATION) + return FALSE; +#else + const Relay_log_info *rli= &(active_mi->rli); + *log_file_name= rli->group_master_log_name; + *log_pos= rli->group_master_log_pos + + (rli->future_event_relay_log_pos - rli->group_relay_log_pos); + *group_relay_log_name= rli->group_relay_log_name; + *relay_log_pos= rli->future_event_relay_log_pos; + return TRUE; +#endif +} +#endif diff --git a/sql/log_event.h b/sql/log_event.h index bda53da8ab0..45dcf297697 100644 --- a/sql/log_event.h +++ b/sql/log_event.h @@ -3917,6 +3917,10 @@ static inline bool copy_event_cache_to_file_and_reinit(IO_CACHE *cache, reinit_io_cache(cache, WRITE_CACHE, 0, FALSE, TRUE); } +bool rpl_get_position_info(const char **log_file_name, ulonglong *log_pos, + const char **group_relay_log_name, + ulonglong *relay_log_pos); + /** @} (end of group Replication) */ diff --git a/sql/rpl_mi.cc b/sql/rpl_mi.cc index cb8b0e02ef9..b8af53849f1 100644 --- a/sql/rpl_mi.cc +++ b/sql/rpl_mi.cc @@ -22,11 +22,6 @@ #ifdef HAVE_REPLICATION -// Defined in slave.cc -int init_intvar_from_file(int* var, IO_CACHE* f, int default_val); -int init_strvar_from_file(char *var, int max_size, IO_CACHE *f, - const char *default_val); - Master_info::Master_info() :Slave_reporting_capability("I/O"), ssl(0), ssl_verify_server_cert(0), fd(-1), io_thd(0), inited(0), diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index 82628143bea..3fbe9b16d23 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -23,11 +23,6 @@ static int count_relay_log_space(Relay_log_info* rli); -// Defined in slave.cc -int init_intvar_from_file(int* var, IO_CACHE* f, int default_val); -int init_strvar_from_file(char *var, int max_size, IO_CACHE *f, - const char *default_val); - Relay_log_info::Relay_log_info() :Slave_reporting_capability("SQL"), diff --git a/sql/slave.cc b/sql/slave.cc index ec86a4c48e0..b6138078e35 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -755,57 +755,6 @@ const char *print_slave_db_safe(const char* db) DBUG_RETURN((db ? db : "")); } -int init_strvar_from_file(char *var, int max_size, IO_CACHE *f, - const char *default_val) -{ - uint length; - DBUG_ENTER("init_strvar_from_file"); - - if ((length=my_b_gets(f,var, max_size))) - { - char* last_p = var + length -1; - if (*last_p == '\n') - *last_p = 0; // if we stopped on newline, kill it - else - { - /* - If we truncated a line or stopped on last char, remove all chars - up to and including newline. - */ - int c; - while (((c=my_b_get(f)) != '\n' && c != my_b_EOF)) - ; - } - DBUG_RETURN(0); - } - else if (default_val) - { - strmake(var, default_val, max_size-1); - DBUG_RETURN(0); - } - DBUG_RETURN(1); -} - - -int init_intvar_from_file(int* var, IO_CACHE* f, int default_val) -{ - char buf[32]; - DBUG_ENTER("init_intvar_from_file"); - - - if (my_b_gets(f, buf, sizeof(buf))) - { - *var = atoi(buf); - DBUG_RETURN(0); - } - else if (default_val) - { - *var = default_val; - DBUG_RETURN(0); - } - DBUG_RETURN(1); -} - /* Note that we rely on the master's version (3.23, 4.0.14 etc) instead of relying on the binlog's version. 
This is not perfect: imagine an upgrade diff --git a/storage/innobase/plug.in b/storage/innobase/plug.in.disabled similarity index 100% rename from storage/innobase/plug.in rename to storage/innobase/plug.in.disabled diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt new file mode 100644 index 00000000000..be4206e0076 --- /dev/null +++ b/storage/xtradb/CMakeLists.txt @@ -0,0 +1,97 @@ +# Copyright (C) 2006 MySQL AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX") +SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX") +ADD_DEFINITIONS(-D_WIN32 -D_LIB) + +# Bug 19424 - InnoDB: Possibly a memory overrun of the buffer being freed (64-bit Visual C) +# Removing Win64 compiler optimizations for all innodb/mem/* files. +IF(CMAKE_GENERATOR MATCHES "Visual Studio" AND CMAKE_SIZEOF_VOID_P MATCHES 8) + SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/storage/xtradb/mem/mem0mem.c + ${CMAKE_SOURCE_DIR}/storage/xtradb/mem/mem0pool.c + PROPERTIES COMPILE_FLAGS -Od) +ENDIF(CMAKE_GENERATOR MATCHES "Visual Studio" AND CMAKE_SIZEOF_VOID_P MATCHES 8) + +INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/zlib + ${CMAKE_SOURCE_DIR}/storage/xtradb/include + ${CMAKE_SOURCE_DIR}/storage/xtradb/handler + ${CMAKE_SOURCE_DIR}/sql + ${CMAKE_SOURCE_DIR}/regex + ${CMAKE_SOURCE_DIR}/extra/yassl/include) + +SET(INNOBASE_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c + buf/buf0buddy.c buf/buf0buf.c buf/buf0flu.c buf/buf0lru.c buf/buf0rea.c + data/data0data.c data/data0type.c + dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c + dyn/dyn0dyn.c + eval/eval0eval.c eval/eval0proc.c + fil/fil0fil.c + fsp/fsp0fsp.c + fut/fut0fut.c fut/fut0lst.c + ha/ha0ha.c ha/hash0hash.c ha/ha0storage.c + ibuf/ibuf0ibuf.c + pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c + lock/lock0lock.c lock/lock0iter.c + log/log0log.c log/log0recv.c + mach/mach0data.c + mem/mem0mem.c mem/mem0pool.c + mtr/mtr0log.c mtr/mtr0mtr.c + os/os0file.c os/os0proc.c os/os0sync.c os/os0thread.c + page/page0cur.c page/page0page.c page/page0zip.c + que/que0que.c + handler/ha_innodb.cc handler/handler0alter.cc handler/i_s.cc handler/mysql_addons.cc + read/read0read.c + rem/rem0cmp.c rem/rem0rec.c + row/row0ext.c row/row0ins.c row/row0merge.c row/row0mysql.c + row/row0purge.c row/row0row.c row/row0sel.c row/row0uins.c + row/row0umod.c row/row0undo.c row/row0upd.c row/row0vers.c + srv/srv0que.c srv/srv0srv.c srv/srv0start.c + sync/sync0arr.c sync/sync0rw.c sync/sync0sync.c + thr/thr0loc.c + trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c + trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c + usr/usr0sess.c + ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c ut/ut0list.c ut/ut0wqueue.c) + +IF(NOT SOURCE_SUBLIBS) 
+ ADD_LIBRARY(innobase ${INNOBASE_SOURCES}) + ADD_DEPENDENCIES(innobase GenError) + SET_TARGET_PROPERTIES(innobase PROPERTIES COMPILE_FLAGS "-DMYSQL_SERVER") + + IF(INNODB_DYNAMIC_PLUGIN) + # The dynamic plugin requires CMake 2.6.0 or later. Otherwise, the /DELAYLOAD property + # will not be set + CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0 FATAL_ERROR) + ADD_LIBRARY(ha_innodb SHARED ${INNOBASE_SOURCES} ha_innodb.def handler/win_delay_loader.cc) + ADD_DEPENDENCIES(ha_innodb GenError mysqld) + # If build type is not specified as Release, default to Debug + # This is a workaround to a problem in CMake 2.6, which does not + # set the path of mysqld.lib correctly + IF(CMAKE_BUILD_TYPE MATCHES Release) + SET(CMAKE_BUILD_TYPE "Release") + ELSE(CMAKE_BUILD_TYPE MATCHES Release) + SET(CMAKE_BUILD_TYPE "Debug") + ENDIF(CMAKE_BUILD_TYPE MATCHES Release) + TARGET_LINK_LIBRARIES(ha_innodb strings zlib) + TARGET_LINK_LIBRARIES(ha_innodb ${CMAKE_SOURCE_DIR}/sql/${CMAKE_BUILD_TYPE}/mysqld.lib) + SET_TARGET_PROPERTIES(ha_innodb PROPERTIES OUTPUT_NAME ha_innodb) + SET_TARGET_PROPERTIES(ha_innodb PROPERTIES LINK_FLAGS "/MAP /MAPINFO:EXPORTS") + SET_TARGET_PROPERTIES(ha_innodb PROPERTIES LINK_FLAGS "/ENTRY:\"_DllMainCRTStartup@12\"") + SET_TARGET_PROPERTIES(ha_innodb PROPERTIES COMPILE_FLAGS "-DMYSQL_DYNAMIC_PLUGIN") + SET_TARGET_PROPERTIES(ha_innodb PROPERTIES LINK_FLAGS "/DELAYLOAD:mysqld.exe") + ENDIF(INNODB_DYNAMIC_PLUGIN) + +ENDIF(NOT SOURCE_SUBLIBS) diff --git a/storage/xtradb/COPYING.Google b/storage/xtradb/COPYING.Google new file mode 100644 index 00000000000..5ade2b0e381 --- /dev/null +++ b/storage/xtradb/COPYING.Google @@ -0,0 +1,30 @@ +Portions of this software contain modifications contributed by Google, Inc. +These contributions are used with the following license: + +Copyright (c) 2008, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the Google Inc. nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/storage/xtradb/ChangeLog b/storage/xtradb/ChangeLog new file mode 100644 index 00000000000..f662b02eea5 --- /dev/null +++ b/storage/xtradb/ChangeLog @@ -0,0 +1,775 @@ +2009-03-05 The InnoDB Team + + * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result, + mysql-test/innodb-autoinc.test: + Fix Bug#43203 Overflow from auto incrementing causes server segv + +2009-02-25 The InnoDB Team + + * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result, + mysql-test/innodb-autoinc.test: + Fix Bug#42714 AUTO_INCREMENT errors in 5.1.31 + +2009-02-23 The InnoDB Team + + * btr/btr0cur.c: + Fix Bug#43043 Crash on BLOB delete operation + +2009-02-20 The InnoDB Team + + * handler/ha_innodb.cc: + Make innodb_use_sys_malloc=ON the default. + +2009-02-20 The InnoDB Team + + * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result, + mysql-test/innodb-autoinc.test: + Fix Bug#42400 InnoDB autoinc code can't handle floating-point columns + +2009-02-18 The InnoDB Team + + * include/ut0mem.h, os/os0proc.c, ut/ut0mem.c: + Protect ut_total_allocated_memory with ut_list_mutex in + os_mem_alloc_large() and os_mem_free_large(). The lack of this mutex + protection could cause an assertion failure during fast index + creation. Also, add UNIV_MEM_ALLOC and UNIV_MEM_FREE instrumentation + to os_mem_alloc_large() and os_mem_free_large(), so that Valgrind can + detect more errors. + +2009-02-11 The InnoDB Team + + * handler/ha_innodb.cc: + Make innodb_thread_concurrency=0 the default. The old default value + was 8. A non-zero setting may be useful when InnoDB is showing severe + scalability problems under multiple concurrent connections. + +2009-02-10 The InnoDB Team + + * handler/ha_innodb.cc, handler/ha_innodb.h: + Fix Bug#41676 Table names are case insensitive in locking + +2009-02-10 The InnoDB Team + + * mem/mem0dbg.c, mem/mem0mem.c, mem/mem0pool.c: + When innodb_use_sys_malloc is set, ignore + innodb_additional_mem_pool_size, because nothing will be allocated + from mem_comm_pool. + +2009-02-10 The InnoDB Team + + * ut/ut0mem.c: + Map ut_malloc_low(), ut_realloc(), and ut_free() directly to malloc(), + realloc(), and free() when innodb_use_sys_malloc is set. As a side + effect, ut_total_allocated_memory ("Total memory allocated" in the + "BUFFER POOL AND MEMORY" section of SHOW ENGINE INNODB STATUS) will + exclude any memory allocated by these functions when + innodb_use_sys_malloc is set. + +2009-02-10 The InnoDB Team + + * btr/btr0cur.c, btr/btr0sea.c, buf/buf0buf.c, handler/ha_innodb.cc, + include/buf0buf.ic, include/os0sync.h, include/srv0srv.h, + include/sync0rw.h, include/sync0rw.ic, include/sync0sync.h, + include/sync0sync.ic, include/univ.i, row/row0sel.c, srv/srv0srv.c, + srv/srv0start.c, sync/sync0arr.c, sync/sync0rw.c, sync/sync0sync.c: + On those platforms that support it, implement the synchronization + primitives of InnoDB mutexes and read/write locks with GCC atomic + builtins instead of Pthreads mutexes and InnoDB mutexes. These changes + are based on a patch supplied by Mark Callaghan of Google under a BSD + license. + +2009-01-30 The InnoDB Team + + * btr/btr0cur.c, btr/btr0sea.c, buf/buf0buf.c, handler/ha_innodb.cc, + include/btr0sea.h, include/buf0buf.h, include/sync0sync.h, + sync/sync0sync.c: + Make the configuration parameter innodb_adaptive_hash_index dynamic, + so that it can be changed at runtime. 
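The 2009-02-10 entry above builds InnoDB's mutexes and rw-locks on GCC atomic builtins. As a rough sketch of what those builtins provide (illustrative only; the real InnoDB mutex adds spin loops, wait arrays and instrumentation), a minimal test-and-set lock in C:

    #include <sched.h>

    typedef struct { volatile int word; } toy_mutex_t;

    static void toy_mutex_enter(toy_mutex_t* m)
    {
        /* __sync_lock_test_and_set() atomically stores 1 and returns the
        previous value, so looping until it returns 0 acquires the lock */
        while (__sync_lock_test_and_set(&m->word, 1)) {
            sched_yield(); /* crude back-off while the lock is held */
        }
    }

    static void toy_mutex_exit(toy_mutex_t* m)
    {
        __sync_lock_release(&m->word); /* atomically stores 0 again */
    }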
+ +2009-01-29 The InnoDB Team + + * handler/ha_innodb.cc, ibuf/ibuf0ibuf.c, include/ibuf0ibuf.h, + include/ibuf0ibuf.ic: + Implement the settable global variable innodb_change_buffering, + with the allowed values 'none' and 'inserts'. The default value + 'inserts' enables the buffering of inserts to non-unique secondary + index trees when the B-tree leaf page is not in the buffer pool. + +2009-01-27 The InnoDB Team + + * buf/buf0lru.c: + Fix a race condition in buf_LRU_invalidate_tablespace(): The + compressed page size (zip_size) was read while the block descriptor + was no longer protected by a mutex. This could lead to corruption + when a table is dropped on a busy system that contains compressed + tables. + +2009-01-26 The InnoDB Team + + * btr/btr0sea.c, buf/buf0buf.c, include/buf0buf.h, include/buf0buf.ic, + include/mtr0log.ic, include/row0upd.ic, mtr/mtr0mtr.c: + Implement buf_block_align() with pointer arithmetics, as it is in the + built-in InnoDB distributed with MySQL. Do not acquire the buffer pool + mutex before buf_block_align(). This removes a scalability bottleneck + in the adaptive hash index lookup. In CHECK TABLE, check that + buf_pool->page_hash is consistent with buf_block_align(). + +2009-01-23 The InnoDB Team + + * btr/btr0sea.c: + Fix Bug#42279 Race condition in btr_search_drop_page_hash_when_freed() + +2009-01-23 The InnoDB Team + + * buf/buf0buf.c, include/buf0buf.h: + Remove the unused mode BUF_GET_NOWAIT of buf_page_get_gen() + +2009-01-20 The InnoDB Team + + * include/rem0rec.h, include/rem0rec.ic: + Fix Bug#41571 MySQL segfaults after innodb recovery + +2009-01-20 The InnoDB Team + + * lock/lock0lock.c: + Fix Bug#42152 Race condition in lock_is_table_exclusive() + +2009-01-14 The InnoDB Team + + * include/trx0roll.h, trx/trx0roll.c, trx/trx0trx.c: + Fix Bug#38187 Error 153 when creating savepoints + +2009-01-14 The InnoDB Team + + * dict/dict0load.c: + Fix Bug#42075 dict_load_indexes failure in dict_load_table will + corrupt the dictionary cache + +2009-01-13 The InnoDB Team + + * buf/buf0buddy.c, dict/dict0dict.c, dict/dict0mem.c, fil/fil0fil.c, + ha/ha0storage.c, handler/ha_innodb.cc, handler/win_delay_loader.cc, + include/buf0buf.ic, include/dict0dict.ic, include/hash0hash.h, + thr/thr0loc.c, trx/trx0i_s.c: + Add the parameter ASSERTION to HASH_SEARCH() macro, and use it for + light validation of the traversed items in hash table lookups when + UNIV_DEBUG is enabled. + +2009-01-09 The InnoDB Team + + * buf/buf0flu.c, include/buf0flu.h, include/buf0flu.ic: + Remove unused code from the functions + buf_flush_insert_into_flush_list() and + buf_flush_insert_sorted_into_flush_list(). + +2009-01-09 The InnoDB Team + + * buf/buf0flu.c: + Simplify the functions buf_flush_try_page() and buf_flush_batch(). Add + debug assertions and an explanation to buf_flush_write_block_low(). + +2009-01-07 The InnoDB Team + + * row/row0merge.c: + Fix a bug in recovery when dropping temporary indexes. 
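The 2009-01-26 entry above turns a mutex-protected lookup into plain pointer arithmetic. The idea, sketched with invented names (chunk_t, PAGE_SIZE) rather than the real buf_pool structures: when the page frames of a chunk are contiguous and the block descriptors sit in a parallel array, the descriptor for any pointer into a frame can be computed without taking the buffer pool mutex:

    #include <stddef.h>

    #define PAGE_SIZE 16384

    typedef struct { int id; /* real descriptor fields elided */ } block_t;

    typedef struct {
        unsigned char* frames; /* n_pages contiguous page frames */
        block_t*       blocks; /* blocks[i] describes frames + i * PAGE_SIZE */
        size_t         n_pages;
    } chunk_t;

    static block_t* block_align(const chunk_t* chunk, const unsigned char* ptr)
    {
        /* byte offset into the chunk, truncated to a frame number */
        size_t offs = (size_t) (ptr - chunk->frames);

        return(&chunk->blocks[offs / PAGE_SIZE]);
    }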
+ +2009-01-07 The InnoDB Team + + * handler/ha_innodb.cc, handler/ha_innodb.h, handler/handler0alter.cc: + Fix Bug#41680 calls to trx_allocate_for_mysql are not consistent + +2009-01-07 The InnoDB Team + + * mysql-test/innodb_bug41904.result, mysql-test/innodb_bug41904.test, + row/row0merge.c: + Fix Bug#41904 create unique index problem + +2009-01-02 The InnoDB Team + + * handler/ha_innodb.cc, include/srv0srv.h, mem/mem0pool.c, + mysql-test/innodb-use-sys-malloc-master.opt, + mysql-test/innodb-use-sys-malloc.result, + mysql-test/innodb-use-sys-malloc.test, srv/srv0srv.c, srv/srv0start.c: + Implement the configuration parameter innodb_use_sys_malloc (false by + default), for disabling InnoDB's internal memory allocator and using + system malloc/free instead. The "BUFFER POOL AND MEMORY" section of + SHOW ENGINE INNODB STATUS will report "in additional pool allocated + 0" when innodb_use_sys_malloc is set. + +2008-12-30 The InnoDB Team + + * btr/btr0btr.c: + When setting the PAGE_LEVEL of a compressed B-tree page from or to 0, + compress the page at the same time. This is necessary, because the + column information stored on the compressed page will differ between + leaf and non-leaf pages. Leaf pages are identified by PAGE_LEVEL=0. + This bug can make InnoDB crash when all rows of a compressed table are + deleted. + +2008-12-17 The InnoDB Team + + * include/row0sel.h, include/row0upd.h, pars/pars0pars.c, + row/row0mysql.c, row/row0sel.c, row/row0upd.c: + Remove update-in-place select from the internal SQL interpreter. It + was only used for updating the InnoDB internal data dictionary when + renaming or dropping tables. It could have caused deadlocks when + acquiring latches on insert buffer bitmap pages. + +2008-12-17 The InnoDB Team + + * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c, ha/ha0ha.c, + ha/hash0hash.c, include/buf0buf.h, include/ha0ha.h, include/ha0ha.ic, + include/hash0hash.h, include/univ.i: + Introduce the preprocessor symbol UNIV_AHI_DEBUG for enabling adaptive + hash index debugging independently of UNIV_DEBUG. + +2008-12-16 The InnoDB Team + + * btr/btr0cur.c: + Do not update the free bits in the insert buffer bitmap when inserting + or deleting from the insert buffer B-tree. Assert that records in the + insert buffer B-tree are never updated. + +2008-12-12 The InnoDB Team + + * buf/buf0buf.c, fil/fil0fil.c, fsp/fsp0fsp.c, ibuf/ibuf0ibuf.c, + include/fil0fil.h, include/ibuf0ibuf.h, include/ibuf0ibuf.ic, + include/ibuf0types.h: + Clean up the insert buffer subsystem so that only one insert + buffer B-tree exists. + Originally, there were provisions in InnoDB for multiple insert + buffer B-trees, apparently one for each tablespace. + When Heikki Tuuri implemented multiple InnoDB tablespaces in + MySQL/InnoDB 4.1, he made the insert buffer live only in the + system tablespace (space 0) but left the provisions in the code. + +2008-12-11 The InnoDB Team + + * include/srv0srv.h, os/os0proc.c, srv/srv0srv.c: + Fix the issue that the InnoDB plugin fails if innodb_buffer_pool_size + is set larger than 4096M on 64-bit Windows. This bug should not + have affected other 64-bit systems. + +2008-12-09 The InnoDB Team + + * handler/ha_innodb.cc: + Fix Bug#40386 Not flushing query cache after truncate.
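The innodb_use_sys_malloc parameter in the 2009-01-02 entry above is essentially a runtime dispatch between two allocators. A minimal sketch of that dispatch, with pool_alloc()/pool_free() as hypothetical stand-ins for the internal memory pool (the real code lives in ut0mem.c and mem0pool.c):

    #include <stdlib.h>

    static int use_sys_malloc = 0; /* mirrors innodb_use_sys_malloc */

    /* stubs standing in for the built-in allocator */
    static void* pool_alloc(size_t n) { return(malloc(n)); }
    static void  pool_free(void* p)   { free(p); }

    static void* ut_malloc_sketch(size_t n)
    {
        /* when the flag is set, bypass mem_comm_pool entirely */
        return(use_sys_malloc ? malloc(n) : pool_alloc(n));
    }

    static void ut_free_sketch(void* p)
    {
        if (use_sys_malloc) {
            free(p);
        } else {
            pool_free(p);
        }
    }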
+ +2008-12-09 The InnoDB Team + + * handler/ha_innodb.cc, srv/srv0srv.c, trx/trx0trx.c: + Fix Bug#40760 "set global innodb_thread_concurrency = 0;" is not safe + +2008-12-04 The InnoDB Team + + * handler/ha_innodb.cc, handler/mysql_addons.cc, + include/mysql_addons.h, trx/trx0i_s.c, win-plugin/win-plugin.diff: + Remove dependencies to MySQL internals (defining MYSQL_SERVER). + +2008-12-02 The InnoDB Team + + * page/page0cur.c: + When allocating space for a record from the free list of previously + purged records, zero out the DB_TRX_ID and DB_ROLL_PTR of the purged + record if the new record would not overwrite these fields. This fixes + a harmless content mismatch reported by page_zip_validate(). + +2008-12-02 The InnoDB Team + + * row/row0merge.c: + Replace the WHILE 1 with WHILE 1=1 in the SQL procedure, so that the + loop will actually be entered and temporary indexes be dropped during + crash recovery. + +2008-12-01 The InnoDB Team + + InnoDB Plugin 1.0.2 released + +2008-10-31 The InnoDB Team + + * dict/dict0mem.c, include/dict0mem.h, include/lock0lock.h, + include/row0mysql.h, include/trx0trx.h, include/univ.i, + include/ut0vec.h, include/ut0vec.ic, lock/lock0lock.c, + row/row0mysql.c, trx/trx0trx.c: + Fix Bug#26316 Triggers create duplicate entries on auto-increment + columns + +2008-10-30 The InnoDB Team + + * handler/ha_innodb.cc, handler/handler0vars.h, + handler/win_delay_loader.cc, mysql-test/innodb_bug40360.result, + mysql-test/innodb_bug40360.test: + Fix Bug#40360 Binlog related errors with binlog off + +2008-10-29 The InnoDB Team + + * include/data0type.ic: + Fix Bug#40369 dtype_get_sql_null_size() returns 0 or 1, not the size + +2008-10-29 The InnoDB Team + + * handler/ha_innodb.cc, include/srv0srv.h, srv/srv0srv.c: + Fix Bug#38189 innodb_stats_on_metadata missing + +2008-10-28 The InnoDB Team + + * CMakeLists.txt, ha_innodb.def, handler/ha_innodb.cc, + handler/handler0alter.cc, handler/handler0vars.h, handler/i_s.cc, + handler/win_delay_loader.cc, win-plugin/*: + Implemented the delayloading of externals for the plugin on Windows. + This makes it possible to build a dynamic plugin (ha_innodb.dll) on + Windows. + +2008-10-27 The InnoDB Team + + * CMakeLists.txt: + Fix Bug#19424 InnoDB: Possibly a memory overrun of the buffer being + freed (64-bit Visual C) + +2008-10-23 The InnoDB Team + + * ibuf/ibuf0ibuf.c: + ibuf_delete_rec(): When the cursor to the insert buffer record + cannot be restored, do not complain if the tablespace does not + exist, because the insert buffer record may have been discarded by + some other thread. This bug has existed in MySQL/InnoDB since + version 4.1, when innodb_file_per_table was implemented. + This may fix Bug#27276 InnoDB Error: ibuf cursor restoration fails. + +2008-10-22 The InnoDB Team + + * dict/dict0dict.c, dict/dict0mem.c, handler/ha_innodb.cc, + handler/ha_innodb.h, include/dict0dict.h, include/dict0mem.h, + row/row0mysql.c: + Fix Bug#39830 Table autoinc value not updated on first insert + Fix Bug#35498 Cannot get table test/table1 auto-inccounter value in + ::info + Fix Bug#36411 "Failed to read auto-increment value from storage + engine" in 5.1.24 auto-inc + +2008-10-22 The InnoDB Team + + * handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c: + Fix Bug#40224 New AUTOINC changes mask reporting of deadlock/timeout + errors + +2008-10-16 The InnoDB Team + + * dict/dict0dict.c, mysql-test/innodb-index.result, + mysql-test/innodb-index.test: + Skip the undo log size check when creating REDUNDANT and COMPACT + tables. 
In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED, column + prefix indexes require that prefixes of externally stored columns + be written to the undo log. This may make the undo log record + bigger than the record on the B-tree page. The maximum size of an + undo log record is the page size. That must be checked for, in + dict_index_add_to_cache(). However, this restriction must not + be enforced on REDUNDANT or COMPACT tables. + +2008-10-15 The InnoDB Team + + * btr/btr0cur.c, include/btr0cur.h, row/row0ext.c, row/row0sel.c, + row/row0upd.c: + When the server crashes while freeing an externally stored column + of a compressed table, the BTR_EXTERN_LEN field in the BLOB + pointer will be written as 0. Tolerate this in the functions that + deal with externally stored columns. This fixes problems after + crash recovery, in the rollback of incomplete transactions, and in + the purge of delete-marked records. + +2008-10-15 The InnoDB Team + + * btr/btr0btr.c, include/page0zip.h, page/page0zip.c, include/univ.i: + When a B-tree node of a compressed table is split or merged, the + compression may fail. In this case, the entire compressed page + will be copied and the excess records will be deleted. However, + page_zip_copy(), now renamed to page_zip_copy_recs(), copied too + many fields in the page header, overwriting PAGE_BTR_SEG_LEAF and + PAGE_BTR_SEG_TOP when splitting the B-tree root. This caused + corruption of compressed tables. Furthermore, the lock table and + the adaptive hash index would be corrupted, because we forgot to + update them when invoking page_zip_copy_recs(). + + Introduce the symbol UNIV_ZIP_DEBUG for triggering the copying of + compressed pages more often, for debugging purposes. + +2008-10-10 The InnoDB Team + + * handler/handler0alter.cc, include/row0merge.h, row/row0merge.c, + row/row0mysql.c: + Fix some locking issues, mainly in fast index creation. The + InnoDB data dictionary cache should be latched whenever a + transaction is holding locks on any data dictionary tables. + Otherwise, lock waits or deadlocks could occur. Furthermore, the + data dictionary transaction must be committed (and the locks + released) before the data dictionary latch is released. + + ha_innobase::add_index(): Lock the data dictionary before renaming + or dropping the created indexes, because neither operation will + commit the data dictionary transaction. + + ha_innobase::final_drop_index(): Commit the transactions before + unlocking the data dictionary. + +2008-10-09 The InnoDB Team + + * buf/buf0lru.c: + Fix Bug#39939 DROP TABLE/DISCARD TABLESPACE takes long time in + buf_LRU_invalidate_tablespace() + +2008-10-08 The InnoDB Team + + * dict/dict0crea.c, trx/trx0roll.c, include/row0mysql.h, + row/row0merge.c, row/row0mysql.c: + When dropping a table, hold the data dictionary latch until the + transaction has been committed. The data dictionary latch is + supposed to prevent lock waits and deadlocks in the data + dictionary tables. Due to this bug, DROP TABLE could cause a + deadlock or hang. Note that because of Bug#33650 and Bug#39833, + MySQL may also drop a (temporary) table when executing CREATE INDEX + or ALTER TABLE ... ADD INDEX. 
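The 2008-10-16 entry above amounts to an admission rule applied when an index is added to the cache. A schematic version (names and the page-size constant are illustrative; the real check in dict_index_add_to_cache() derives the worst case from the column definitions):

    #include <stddef.h>

    enum row_format { FMT_REDUNDANT, FMT_COMPACT, FMT_DYNAMIC, FMT_COMPRESSED };

    #define PAGE_SIZE 16384 /* an undo log record must fit in one page */

    static int undo_rec_size_ok(enum row_format fmt, size_t max_undo_rec_size)
    {
        if (fmt == FMT_REDUNDANT || fmt == FMT_COMPACT) {
            /* these formats never write whole column prefixes of
            externally stored columns to the undo log, so the
            restriction is not enforced */
            return(1);
        }

        return(max_undo_rec_size <= PAGE_SIZE);
    }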
+ +2008-10-04 The InnoDB Team + + * handler/ha_innodb.cc, mysql-test/innodb_bug39438-master.opt, + mysql-test/innodb_bug39438.result, mysql-test/innodb_bug39438.test: + Fix Bug#39438 Testcase for Bug#39436 crashes on 5.1 in + fil_space_get_latch + +2008-10-04 The InnoDB Team + + * include/lock0lock.h, lock/lock0lock.c, + mysql-test/innodb_bug38231.result, mysql-test/innodb_bug38231.test, + row/row0mysql.c: + Fix Bug#38231 Innodb crash in lock_reset_all_on_table() on TRUNCATE + + LOCK / UNLOCK + +2008-10-04 The InnoDB Team + + * handler/ha_innodb.cc: + Fix Bug#35498 Cannot get table test/table1 auto-inccounter value in + ::info + +2008-10-04 The InnoDB Team + + * handler/ha_innodb.cc, handler/ha_innodb.h: + Fix Bug#37788 InnoDB Plugin: AUTO_INCREMENT wrong for compressed + tables + +2008-10-04 The InnoDB Team + + * dict/dict0dict.c, handler/ha_innodb.cc, handler/ha_innodb.h, + include/dict0dict.h, include/dict0mem.h, row/row0mysql.c: + Fix Bug#39830 Table autoinc value not updated on first insert + +2008-10-03 The InnoDB Team + + * mysql-test/innodb-index.test, mysql-test/innodb-index.result, + mysql-test/innodb-timeout.test, mysql-test/innodb-timeout.result, + srv/srv0srv.c, include/srv0srv.h, handler/ha_innodb.cc, + include/ha_prototypes.h: + Fix Bug#36285 innodb_lock_wait_timeout is not dynamic, not per session + +2008-09-19 The InnoDB Team + + * os/os0proc.c: + Fix a memory leak on Windows. The memory leak was due to wrong + parameters passed to the VirtualFree() call. As a result, the + call failed with Windows error 87. + +2008-09-17 The InnoDB Team + + * mysql-test/innodb.result, mysql-test/innodb-zip.result, + mysql-test/innodb-zip.test, mysql-test/innodb.test, ibuf/ibuf0ibuf.c, + dict/dict0crea.c, dict/dict0load.c, dict/dict0boot.c, + include/dict0dict.h, include/trx0trx.h, dict/dict0dict.c, + trx/trx0trx.c, include/ha_prototypes.h, handler/ha_innodb.cc: + When creating an index in innodb_strict_mode, check that the + maximum record size will never exceed the B-tree page size limit. + For uncompressed tables, there should always be enough space for + two records in an empty B-tree page. For compressed tables, there + should be enough space for storing two node pointer records or one + data record in an empty page in uncompressed format. + The purpose of this check is to guarantee that INSERT or UPDATE + will never fail because a record is too big. + +2008-09-17 The InnoDB Team + + * btr/btr0cur.c, data/data0data.c, include/page0zip.h, + include/page0zip.ic, page/page0zip.c, mysql-test/innodb_bug36172.test: + Prevent infinite B-tree page splits in compressed tables by + ensuring that there will always be enough space for two node + pointer records in an empty B-tree page. Also, require that at + least one data record will fit in an empty compressed page. This + will reduce the maximum size of records in compressed tables.
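The two 2008-09-17 entries above reduce to a fit test against an empty page. A hedged sketch of that rule (all sizes and names invented for illustration):

    #include <stddef.h>

    static int rec_size_ok(size_t empty_page_space, /* free space on an empty page */
                           size_t max_rec_size,     /* worst-case data record */
                           size_t max_node_ptr_size,/* worst-case node pointer */
                           int    is_compressed)
    {
        if (is_compressed) {
            /* two node pointer records must fit, and at least one
            data record, both measured in uncompressed format */
            return(2 * max_node_ptr_size <= empty_page_space
                   && max_rec_size <= empty_page_space);
        }

        /* uncompressed: an empty page must hold two maximum-size records */
        return(2 * max_rec_size <= empty_page_space);
    }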
+ +2008-09-09 The InnoDB Team + + * mysql-test/innodb.result: + Fix the failing innodb test by merging changes that MySQL made to + that file (r2646.12.1 in MySQL BZR repository) + +2008-09-09 The InnoDB Team + + * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result, + mysql-test/innodb-autoinc.test: + Fix Bug#38839 auto increment does not work properly with InnoDB after + update + +2008-09-09 The InnoDB Team + + * dict/dict0dict.c, handler/handler0alter.cc, include/dict0dict.h, + mysql-test/innodb-index.result, mysql-test/innodb-index.test: + Fix Bug#38786 InnoDB plugin crashes on drop table/create table with FK + +2008-08-21 The InnoDB Team + + * handler/ha_innodb.cc, include/ha_prototypes.h, row/row0sel.c: + Fix Bug#37885 row_search_for_mysql may gap lock unnecessarily with SQL + comments in query + +2008-08-21 The InnoDB Team + + * handler/ha_innodb.cc: + Fix Bug#38185 ha_innobase::info can hold locks even when called with + HA_STATUS_NO_LOCK + +2008-08-18 The InnoDB Team + + * buf/buf0buf.c, buf/buf0lru.c, include/buf0buf.ic, include/univ.i: + Introduce UNIV_LRU_DEBUG for debugging the LRU buffer pool cache + +2008-08-08 The InnoDB Team + + * buf/buf0lru.c, include/buf0buf.h: + Fix two recovery bugs that could lead to a crash in debug builds with + small buffer size + +2008-08-07 The InnoDB Team + + * btr/btr0cur.c, handler/ha_innodb.cc, include/srv0srv.h, + srv/srv0srv.c: + Add a parameter innodb_stats_sample_pages to allow users to control + the number of index dives when InnoDB estimates the cardinality of + an index (ANALYZE TABLE, SHOW TABLE STATUS etc) + +2008-08-07 The InnoDB Team + + * trx/trx0i_s.c: + Fix a bug that would lead to a crash if a SELECT was issued from the + INFORMATION_SCHEMA tables while transactions were rolling back at + the same time + +2008-08-06 The InnoDB Team + + * btr/btr0btr.c, btr/btr0cur.c, ibuf/ibuf0ibuf.c, include/btr0cur.h, + include/trx0roll.h, include/trx0types.h, row/row0purge.c, + row/row0uins.c, row/row0umod.c, trx/trx0roll.c: + In the rollback of incomplete transactions after crash recovery, + tolerate clustered index records whose externally stored columns + have not been written. + +2008-07-30 The InnoDB Team + + * trx/trx0trx.c: + Fix a race in recovery where the recovery thread recovering a + PREPARED trx and the background rollback thread can both try + to free the trx after its status is set to COMMITTED_IN_MEMORY. + +2008-07-29 The InnoDB Team + + * include/trx0rec.h, row/row0purge.c, row/row0vers.c, trx/trx0rec.c: + Fix a BLOB corruption bug + +2008-07-15 The InnoDB Team + + * btr/btr0sea.c, dict/dict0dict.c, include/btr0sea.h: + Fix a timing hole where a thread dropping an index could free the + in-memory index struct while another thread was still using that + structure to remove entries belonging to one of the index's pages + from the adaptive hash index.
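The innodb_stats_sample_pages entry above (2008-08-07) controls a sample-and-scale estimate: dive into a few leaf pages, count the distinct key values seen there, and extrapolate to the whole leaf level. Roughly (not InnoDB's exact estimator, which also accounts for records spanning page boundaries):

    typedef unsigned long long u64;

    static u64 estimate_n_distinct(u64 n_leaf_pages,
                                   u64 n_sampled_pages,
                                   u64 n_distinct_in_sample)
    {
        if (n_sampled_pages == 0) {
            return(0);
        }

        /* scale the sampled distinct count up to all leaf pages */
        return(n_distinct_in_sample * n_leaf_pages / n_sampled_pages);
    }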
+ +2008-07-04 The InnoDB Team + + * mysql-test/innodb-index.result: + Fix the failing innodb-index test by adjusting the result to a new + MySQL behavior (the change occurred in BZR-r2667) + +2008-07-03 The InnoDB Team + + * mysql-test/innodb-zip.result, mysql-test/innodb-zip.test: + Remove the negative test cases that produce warnings + +2008-07-02 The InnoDB Team + + * mysql-test/innodb-replace.result, mysql-test/innodb-index.test: + Disable part of innodb-index test because MySQL changed its behavior + and is not calling ::add_index() anymore when adding a primary index on + a non-NULL column + +2008-07-01 The InnoDB Team + + * mysql-test/innodb-replace.result, mysql-test/innodb-replace.test: + Fix the failing innodb-replace test by merging changes that MySQL + made to that file (r2659 in MySQL BZR repository) + +2008-07-01 The InnoDB Team + + * lock/lock0lock.c: + Fix Bug#36942 Performance problem in lock_get_n_rec_locks (SHOW INNODB + STATUS) + +2008-07-01 The InnoDB Team + + * ha/ha0ha.c: + Fix Bug#36941 Performance problem in ha_print_info (SHOW INNODB + STATUS) + +2008-07-01 The InnoDB Team + + * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result, + mysql-test/innodb-autoinc.test: + Fix Bug#37531 After truncate, auto_increment behaves incorrectly for + InnoDB + +2008-06-19 The InnoDB Team + + * handler/ha_innodb.cc: + Rewrite the function innodb_plugin_init() to support parameters in + different order (in static and dynamic InnoDB) and to support more + parameters in the static InnoDB + +2008-06-19 The InnoDB Team + + * handler/handler0alter.cc: + Fix a bug in ::add_index() which set the transaction state to "active" + but never restored it to the original value. This bug caused warnings + to be printed by the rpl.rpl_ddl mysql-test. + +2008-06-19 The InnoDB Team + + * mysql-test/patches: + Add a directory which contains patches that need to be applied to + the MySQL source in order to get some mysql-tests to succeed. The patches + cannot be committed in the MySQL repository because they are specific to + the InnoDB plugin.
+ +2008-06-19 The InnoDB Team + + * mysql-test/innodb-zip.result, mysql-test/innodb-zip.test, + row/row0row.c: + Fix an anomaly when updating a record with BLOB prefix + +2008-06-18 The InnoDB Team + + * include/trx0sys.h, srv/srv0start.c, trx/trx0sys.c: + Fix a bug in recovery which was a side effect of the file_format_check + changes + +2008-06-09 The InnoDB Team + + * mysql-test/innodb.result: + Fix the failing innodb test by merging changes that MySQL made to that + file + +2008-06-06 The InnoDB Team + + * buf/buf0buf.c, handler/ha_innodb.cc, include/buf0buf.h, + include/srv0srv.h, srv/srv0srv.c: + Fix Bug#36600 SHOW STATUS takes a lot of CPU in + buf_get_latched_pages_number + + * handler/ha_innodb.cc, os/os0file.c: + Fix Bug#11894 innodb_file_per_table crashes w/ Windows .sym symbolic + link hack + + * include/ut0ut.h, srv/srv0srv.c, ut/ut0ut.c: + Fix Bug#36819 ut_usectime does not handle errors from gettimeofday + + * handler/ha_innodb.cc: + Fix Bug#35602 Failed to read auto-increment value from storage engine + + * srv/srv0start.c: + Fix Bug#36149 Read buffer overflow in srv0start.c found during "make + test" + +2008-05-08 The InnoDB Team + + * btr/btr0btr.c, mysql-test/innodb_bug36172.result, + mysql-test/innodb_bug36172.test: + Fix Bug#36172 insert into compressed innodb table crashes + +2008-05-08 The InnoDB Team + + InnoDB Plugin 1.0.1 released + +2008-05-06 The InnoDB Team + + * handler/ha_innodb.cc, include/srv0srv.h, include/sync0sync.h, + include/trx0sys.h, mysql-test/innodb-zip.result, + mysql-test/innodb-zip.test, srv/srv0srv.c, srv/srv0start.c, + sync/sync0sync.c, trx/trx0sys.c: + Implement the system tablespace tagging + + * handler/ha_innodb.cc, handler/i_s.cc, include/univ.i, + srv/srv0start.c: + Add InnoDB version in INFORMATION_SCHEMA.PLUGINS.PLUGIN_VERSION, + in the startup message and in a server variable innodb_version. + + * sync/sync0sync.c: + Fix a bug in the sync debug code where a lock with level + SYNC_LEVEL_VARYING would cause an assertion failure when a thread + tried to release it. 
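The essence of the Bug#36819 fix mentioned above is to stop assuming that gettimeofday() always succeeds. A defensive sketch in that spirit (bounded retry; the actual patch differs in its details):

    #include <sys/time.h>

    static int ut_usectime_sketch(unsigned long* sec, unsigned long* usec)
    {
        struct timeval tv;
        int i;

        for (i = 0; i < 10; i++) { /* retry a bounded number of times */
            if (gettimeofday(&tv, NULL) == 0) {
                *sec  = (unsigned long) tv.tv_sec;
                *usec = (unsigned long) tv.tv_usec;
                return(0);
            }
        }

        return(-1); /* report persistent failure to the caller */
    }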
+ +2008-04-30 The InnoDB Team + + * Makefile.am: + Fix Bug#36434 ha_innodb.so is installed in the wrong directory + + * handler/ha_innodb.cc: + Merge change from MySQL (Fix Bug#35406 5.1-opt crashes on select from + I_S.REFERENTIAL_CONSTRAINTS): + ChangeSet@1.2563, 2008-03-18 19:42:04+04:00, gluh@mysql.com +1 -0 + + * scripts/install_innodb_plugins.sql: + Added + + * mysql-test/innodb.result: + Merge change from MySQL (this fixes the failing innodb test): + ChangeSet@1.1810.3601.4, 2008-02-07 02:33:21+04:00 + + * row/row0sel.c: + Fix Bug#35226 RBR event crashes slave + + * handler/ha_innodb.cc: + Change the fix for Bug#32440 to show bytes instead of kilobytes in + INFORMATION_SCHEMA.TABLES.DATA_FREE + + * handler/ha_innodb.cc, mysql-test/innodb.result, + mysql-test/innodb.test: + Fix Bug#29507 TRUNCATE shows to many rows effected + + * handler/ha_innodb.cc, mysql-test/innodb.result, + mysql-test/innodb.test: + Fix Bug#35537 Innodb doesn't increment handler_update and + handler_delete + +2008-04-29 The InnoDB Team + + * handler/i_s.cc, include/srv0start.h, srv/srv0start.c: + Fix Bug#36310 InnoDB plugin crash + +2008-04-23 The InnoDB Team + + * mysql-test/innodb_bug36169.result, mysql-test/innodb_bug36169.test, + row/row0mysql.c: + Fix Bug#36169 create innodb compressed table with too large row size + crashed + + * (outside the source tree): + Fix Bug#36222 New InnoDB plugin 1.0 has wrong MKDIR_P defined in + Makefile.in + +2008-04-15 The InnoDB Team + + InnoDB Plugin 1.0.0 released diff --git a/storage/xtradb/Makefile.am b/storage/xtradb/Makefile.am new file mode 100644 index 00000000000..8f64aedb9b0 --- /dev/null +++ b/storage/xtradb/Makefile.am @@ -0,0 +1,195 @@ +# Copyright (C) 2001, 2004, 2006 MySQL AB & Innobase Oy +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# Process this file with automake to create Makefile.in + +MYSQLDATAdir= $(localstatedir) +MYSQLSHAREdir= $(pkgdatadir) +MYSQLBASEdir= $(prefix) +MYSQLLIBdir= $(pkglibdir) +pkgplugindir= $(pkglibdir)/plugin +INCLUDES= -I$(top_srcdir)/include -I$(top_builddir)/include \ + -I$(top_srcdir)/regex \ + -I$(top_srcdir)/storage/xtradb/include \ + -I$(top_srcdir)/sql \ + -I$(srcdir) + +DEFS= @DEFS@ + + +noinst_HEADERS= include/btr0btr.h include/btr0btr.ic \ + include/btr0cur.h include/btr0cur.ic \ + include/btr0pcur.h include/btr0pcur.ic \ + include/btr0sea.h include/btr0sea.ic \ + include/btr0types.h include/buf0buddy.h \ + include/buf0buddy.ic include/buf0buf.h \ + include/buf0buf.ic include/buf0flu.h \ + include/buf0flu.ic include/buf0lru.h \ + include/buf0lru.ic include/buf0rea.h \ + include/buf0types.h include/data0data.h \ + include/data0data.ic include/data0type.h \ + include/data0type.ic include/data0types.h \ + include/db0err.h include/dict0boot.h \ + include/dict0boot.ic include/dict0crea.h \ + include/dict0crea.ic include/dict0dict.h \ + include/dict0dict.ic include/dict0load.h \ + include/dict0load.ic include/dict0mem.h \ + include/dict0mem.ic include/dict0types.h \ + include/dyn0dyn.h include/dyn0dyn.ic \ + include/eval0eval.h include/eval0eval.ic \ + include/eval0proc.h include/eval0proc.ic \ + include/fil0fil.h include/fsp0fsp.h \ + include/fsp0fsp.ic include/fut0fut.h \ + include/fut0fut.ic include/fut0lst.h \ + include/fut0lst.ic include/ha0ha.h \ + include/ha0ha.ic \ + include/ha0storage.h \ + include/ha0storage.ic \ + include/hash0hash.h \ + include/hash0hash.ic include/ibuf0ibuf.h \ + include/ibuf0ibuf.ic include/ibuf0types.h \ + include/lock0iter.h \ + include/lock0lock.h include/lock0lock.ic \ + include/lock0priv.h include/lock0priv.ic \ + include/lock0types.h include/log0log.h \ + include/log0log.ic include/log0recv.h \ + include/log0recv.ic include/mach0data.h \ + include/mach0data.ic include/mem0dbg.h \ + include/mem0dbg.ic mem/mem0dbg.c \ + include/mem0mem.h include/mem0mem.ic \ + include/mem0pool.h include/mem0pool.ic \ + include/mtr0log.h include/mtr0log.ic \ + include/mtr0mtr.h include/mtr0mtr.ic \ + include/mtr0types.h \ + include/mysql_addons.h \ + include/os0file.h \ + include/os0proc.h include/os0proc.ic \ + include/os0sync.h include/os0sync.ic \ + include/os0thread.h include/os0thread.ic \ + include/page0cur.h include/page0cur.ic \ + include/page0page.h include/page0page.ic \ + include/page0zip.h include/page0zip.ic \ + include/page0types.h include/pars0grm.h \ + include/pars0opt.h include/pars0opt.ic \ + include/pars0pars.h include/pars0pars.ic \ + include/pars0sym.h include/pars0sym.ic \ + include/pars0types.h include/que0que.h \ + include/que0que.ic include/que0types.h \ + include/read0read.h include/read0read.ic \ + include/read0types.h include/rem0cmp.h \ + include/rem0cmp.ic include/rem0rec.h \ + include/rem0rec.ic include/rem0types.h \ + include/row0ext.h include/row0ext.ic \ + include/row0ins.h include/row0ins.ic \ + include/row0merge.h \ + include/row0mysql.h include/row0mysql.ic \ + include/row0purge.h include/row0purge.ic \ + include/row0row.h include/row0row.ic \ + include/row0sel.h include/row0sel.ic \ + include/row0types.h include/row0uins.h \ + include/row0uins.ic include/row0umod.h \ + include/row0umod.ic include/row0undo.h \ + 
include/row0undo.ic include/row0upd.h \ + include/row0upd.ic include/row0vers.h \ + include/row0vers.ic include/srv0que.h \ + include/srv0srv.h include/srv0srv.ic \ + include/srv0start.h include/sync0arr.h \ + include/sync0arr.ic include/sync0rw.h \ + include/sync0rw.ic include/sync0sync.h \ + include/sync0sync.ic include/sync0types.h \ + include/thr0loc.h include/thr0loc.ic \ + include/trx0i_s.h \ + include/trx0purge.h include/trx0purge.ic \ + include/trx0rec.h include/trx0rec.ic \ + include/trx0roll.h include/trx0roll.ic \ + include/trx0rseg.h include/trx0rseg.ic \ + include/trx0sys.h include/trx0sys.ic \ + include/trx0trx.h include/trx0trx.ic \ + include/trx0types.h include/trx0undo.h \ + include/trx0undo.ic include/trx0xa.h \ + include/univ.i include/usr0sess.h \ + include/usr0sess.ic include/usr0types.h \ + include/ut0byte.h include/ut0byte.ic \ + include/ut0dbg.h include/ut0lst.h \ + include/ut0mem.h include/ut0mem.ic \ + include/ut0rnd.h include/ut0rnd.ic \ + include/ut0sort.h include/ut0ut.h \ + include/ut0ut.ic include/ut0vec.h \ + include/ut0vec.ic include/ut0list.h \ + include/ut0list.ic include/ut0wqueue.h \ + include/ha_prototypes.h handler/ha_innodb.h \ + include/handler0alter.h \ + handler/i_s.h handler/innodb_patch_info.h + +EXTRA_LIBRARIES= libinnobase.a +noinst_LIBRARIES= @plugin_innobase_static_target@ +libinnobase_a_SOURCES= btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c \ + btr/btr0sea.c buf/buf0buddy.c \ + buf/buf0buf.c buf/buf0flu.c \ + buf/buf0lru.c buf/buf0rea.c data/data0data.c \ + data/data0type.c dict/dict0boot.c \ + dict/dict0crea.c dict/dict0dict.c \ + dict/dict0load.c dict/dict0mem.c dyn/dyn0dyn.c \ + eval/eval0eval.c eval/eval0proc.c \ + fil/fil0fil.c fsp/fsp0fsp.c fut/fut0fut.c \ + fut/fut0lst.c ha/ha0ha.c \ + ha/ha0storage.c \ + ha/hash0hash.c \ + ibuf/ibuf0ibuf.c lock/lock0iter.c \ + lock/lock0lock.c \ + log/log0log.c log/log0recv.c mach/mach0data.c \ + mem/mem0mem.c mem/mem0pool.c mtr/mtr0log.c \ + mtr/mtr0mtr.c os/os0file.c os/os0proc.c \ + os/os0sync.c os/os0thread.c page/page0cur.c \ + page/page0page.c page/page0zip.c \ + pars/lexyy.c pars/pars0grm.c \ + pars/pars0opt.c pars/pars0pars.c \ + pars/pars0sym.c que/que0que.c read/read0read.c \ + rem/rem0cmp.c rem/rem0rec.c row/row0ext.c \ + row/row0ins.c row/row0merge.c \ + row/row0mysql.c row/row0purge.c row/row0row.c \ + row/row0sel.c row/row0uins.c row/row0umod.c \ + row/row0undo.c row/row0upd.c row/row0vers.c \ + srv/srv0que.c srv/srv0srv.c srv/srv0start.c \ + sync/sync0arr.c sync/sync0rw.c \ + sync/sync0sync.c thr/thr0loc.c \ + trx/trx0i_s.c \ + trx/trx0purge.c \ + trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c \ + trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c \ + usr/usr0sess.c ut/ut0byte.c ut/ut0dbg.c \ + ut/ut0list.c ut/ut0mem.c ut/ut0rnd.c \ + ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c \ + handler/ha_innodb.cc handler/handler0alter.cc \ + handler/i_s.cc \ + handler/mysql_addons.cc + +libinnobase_a_CXXFLAGS= $(AM_CFLAGS) +libinnobase_a_CFLAGS= $(AM_CFLAGS) + +EXTRA_LTLIBRARIES= ha_innodb.la +pkgplugin_LTLIBRARIES= @plugin_innobase_shared_target@ + +ha_innodb_la_LDFLAGS= -module -rpath $(pkgplugindir) +ha_innodb_la_CXXFLAGS= $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS) +ha_innodb_la_CFLAGS= $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS) +ha_innodb_la_SOURCES= $(libinnobase_a_SOURCES) + +EXTRA_DIST= CMakeLists.txt plug.in \ + pars/make_bison.sh pars/make_flex.sh \ + pars/pars0grm.y pars/pars0lex.l + +# Don't update the files from bitkeeper +%::SCCS/s.% diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.c new 
file mode 100644 index 00000000000..2029a95cc19 --- /dev/null +++ b/storage/xtradb/btr/btr0btr.c @@ -0,0 +1,3656 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#ifdef UNIV_NONINL +#include "btr0btr.ic" +#endif + +#include "fsp0fsp.h" +#include "page0page.h" +#include "page0zip.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" + +/* +Latching strategy of the InnoDB B-tree +-------------------------------------- +A tree latch protects all non-leaf nodes of the tree. Each node of a tree +also has a latch of its own. + +A B-tree operation normally first acquires an S-latch on the tree. It +searches down the tree and releases the tree latch when it has the +leaf node latch. To save CPU time we do not acquire any latch on +non-leaf nodes of the tree during a search; those pages are only bufferfixed. + +If an operation needs to restructure the tree, it acquires an X-latch on +the tree before searching to a leaf node. If it needs, for example, to +split a leaf, +(1) InnoDB decides the split point in the leaf, +(2) allocates a new page, +(3) inserts the appropriate node pointer into the first non-leaf level, +(4) releases the tree X-latch, +(5) and then moves records from the leaf to the newly allocated page. + +Node pointers +------------- +Leaf pages of a B-tree contain the index records stored in the +tree. On levels n > 0 we store 'node pointers' to pages on level +n - 1. For each page there is exactly one node pointer stored: +thus our tree is an ordinary B-tree, not a B-link tree. + +A node pointer contains a prefix P of an index record. The prefix +is long enough so that it determines an index record uniquely. +The file page number of the child page is added as the last +field. In the child page we can store node pointers or index records +which are >= P in alphabetical order, but < P1 if there is +a next node pointer on the level, and P1 is its prefix. + +If a node pointer with a prefix P points to a non-leaf child, +then the leftmost record in the child must have the same +prefix P. If it points to a leaf node, the child is not required +to contain any record with a prefix equal to P. The leaf case +is decided this way to allow arbitrary deletions in a leaf node +without touching upper levels of the tree. + +We have predefined a special minimum record which we +define as the smallest record in any alphabetical order. +A minimum record is denoted by setting a bit in the record +header.
A minimum record acts as the prefix of a node pointer +which points to a leftmost node on any level of the tree. + +File page allocation +-------------------- +In the root node of a B-tree there are two file segment headers. +The leaf pages of a tree are allocated from one file segment, to +make them consecutive on disk if possible. From the other file segment +we allocate pages for the non-leaf levels of the tree. +*/ + +#ifdef UNIV_BTR_DEBUG +/****************************************************************** +Checks a file segment header within a B-tree root page. */ +static +ibool +btr_root_fseg_validate( +/*===================*/ + /* out: TRUE if valid */ + const fseg_header_t* seg_header, /* in: segment header */ + ulint space) /* in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space); + ut_a(offset >= FIL_PAGE_DATA); + ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + return(TRUE); +} +#endif /* UNIV_BTR_DEBUG */ + +/****************************************************************** +Gets the root node of a tree and x-latches it. */ +static +buf_block_t* +btr_root_block_get( +/*===============*/ + /* out: root page, x-latched */ + dict_index_t* index, /* in: index tree */ + mtr_t* mtr) /* in: mtr */ +{ + ulint space; + ulint zip_size; + ulint root_page_no; + buf_block_t* block; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + root_page_no = dict_index_get_page(index); + + block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr); + ut_a((ibool)!!page_is_comp(buf_block_get_frame(block)) + == dict_table_is_comp(index->table)); +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + const page_t* root = buf_block_get_frame(block); + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } +#endif /* UNIV_BTR_DEBUG */ + + return(block); +} + +/****************************************************************** +Gets the root node of a tree and x-latches it. */ +UNIV_INTERN +page_t* +btr_root_get( +/*=========*/ + /* out: root page, x-latched */ + dict_index_t* index, /* in: index tree */ + mtr_t* mtr) /* in: mtr */ +{ + return(buf_block_get_frame(btr_root_block_get(index, mtr))); +} + +/***************************************************************** +Gets pointer to the previous user record in the tree. It is assumed that +the caller has appropriate latches on the page and its neighbor. 
*/ +UNIV_INTERN +rec_t* +btr_get_prev_user_rec( +/*==================*/ + /* out: previous user record, NULL if there is none */ + rec_t* rec, /* in: record on leaf level */ + mtr_t* mtr) /* in: mtr holding a latch on the page, and if + needed, also to the previous page */ +{ + page_t* page; + page_t* prev_page; + ulint prev_page_no; + + if (!page_rec_is_infimum(rec)) { + + rec_t* prev_rec = page_rec_get_prev(rec); + + if (!page_rec_is_infimum(prev_rec)) { + + return(prev_rec); + } + } + + page = page_align(rec); + prev_page_no = btr_page_get_prev(page, mtr); + + if (prev_page_no != FIL_NULL) { + + ulint space; + ulint zip_size; + buf_block_t* prev_block; + + space = page_get_space_id(page); + zip_size = fil_space_get_zip_size(space); + + prev_block = buf_page_get_with_no_latch(space, zip_size, + prev_page_no, mtr); + prev_page = buf_block_get_frame(prev_block); + /* The caller must already have a latch to the brother */ + ut_ad(mtr_memo_contains(mtr, prev_block, + MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, prev_block, + MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_page) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + return(page_rec_get_prev(page_get_supremum_rec(prev_page))); + } + + return(NULL); +} + +/***************************************************************** +Gets pointer to the next user record in the tree. It is assumed that the +caller has appropriate latches on the page and its neighbor. */ +UNIV_INTERN +rec_t* +btr_get_next_user_rec( +/*==================*/ + /* out: next user record, NULL if there is none */ + rec_t* rec, /* in: record on leaf level */ + mtr_t* mtr) /* in: mtr holding a latch on the page, and if + needed, also to the next page */ +{ + page_t* page; + page_t* next_page; + ulint next_page_no; + + if (!page_rec_is_supremum(rec)) { + + rec_t* next_rec = page_rec_get_next(rec); + + if (!page_rec_is_supremum(next_rec)) { + + return(next_rec); + } + } + + page = page_align(rec); + next_page_no = btr_page_get_next(page, mtr); + + if (next_page_no != FIL_NULL) { + ulint space; + ulint zip_size; + buf_block_t* next_block; + + space = page_get_space_id(page); + zip_size = fil_space_get_zip_size(space); + + next_block = buf_page_get_with_no_latch(space, zip_size, + next_page_no, mtr); + next_page = buf_block_get_frame(next_block); + /* The caller must already have a latch to the brother */ + ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains(mtr, next_block, + MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + return(page_rec_get_next(page_get_infimum_rec(next_page))); + } + + return(NULL); +} + +/****************************************************************** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
*/ +static +void +btr_page_create( +/*============*/ + buf_block_t* block, /* in/out: page to be created */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dict_index_t* index, /* in: index */ + ulint level, /* in: the B-tree level of the page */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_create_zip(block, index, level, mtr); + } else { + page_create(block, mtr, dict_table_is_comp(index->table)); + /* Set the level of the new index page */ + btr_page_set_level(page, NULL, level, mtr); + } + + block->check_index_page_at_flush = TRUE; + + btr_page_set_index_id(page, page_zip, index->id, mtr); +} + +/****************************************************************** +Allocates a new file page to be used in an ibuf tree. Takes the page from +the free list of the tree, which must contain pages! */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + /* out: new allocated block, x-latched */ + dict_index_t* index, /* in: index tree */ + mtr_t* mtr) /* in: mtr */ +{ + fil_addr_t node_addr; + page_t* root; + page_t* new_page; + buf_block_t* new_block; + + root = btr_root_get(index, mtr); + + node_addr = flst_get_first(root + PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST, mtr); + ut_a(node_addr.page != FIL_NULL); + + new_block = buf_page_get(dict_index_get_space(index), + dict_table_zip_size(index->table), + node_addr.page, RW_X_LATCH, mtr); + new_page = buf_block_get_frame(new_block); + buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + + flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, + mtr); + ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + mtr)); + + return(new_block); +} + +/****************************************************************** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ + /* out: new allocated block, x-latched; + NULL if out of space */ + dict_index_t* index, /* in: index */ + ulint hint_page_no, /* in: hint of a good page */ + byte file_direction, /* in: direction where a possible + page split is made */ + ulint level, /* in: level where the page is placed + in the tree */ + mtr_t* mtr) /* in: mtr */ +{ + fseg_header_t* seg_header; + page_t* root; + buf_block_t* new_block; + ulint new_page_no; + + if (dict_index_is_ibuf(index)) { + + return(btr_page_alloc_for_ibuf(index, mtr)); + } + + root = btr_root_get(index, mtr); + + if (level == 0) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + } else { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + } + + /* Parameter TRUE below states that the caller has made the + reservation for free extents, and thus we know that a page can + be allocated: */ + + new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no, + file_direction, TRUE, mtr); + if (new_page_no == FIL_NULL) { + + return(NULL); + } + + new_block = buf_page_get(dict_index_get_space(index), + dict_table_zip_size(index->table), + new_page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW); + + return(new_block); +} + +/****************************************************************** +Gets the number of pages in a B-tree. 
*/ +UNIV_INTERN +ulint +btr_get_size( +/*=========*/ + /* out: number of pages */ + dict_index_t* index, /* in: index */ + ulint flag) /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ +{ + fseg_header_t* seg_header; + page_t* root; + ulint n; + ulint dummy; + mtr_t mtr; + + mtr_start(&mtr); + + mtr_s_lock(dict_index_get_lock(index), &mtr); + + root = btr_root_get(index, &mtr); + + if (flag == BTR_N_LEAF_PAGES) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fseg_n_reserved_pages(seg_header, &n, &mtr); + + } else if (flag == BTR_TOTAL_SIZE) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + n = fseg_n_reserved_pages(seg_header, &dummy, &mtr); + + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + n += fseg_n_reserved_pages(seg_header, &dummy, &mtr); + } else { + ut_error; + } + + mtr_commit(&mtr); + + return(n); +} + +/****************************************************************** +Frees a page used in an ibuf tree. Puts the page on the free list of the +ibuf tree. */ +static +void +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: block to be freed, x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* root; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + root = btr_root_get(index, mtr); + + flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + buf_block_get_frame(block) + + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + + ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + mtr)); +} + +/****************************************************************** +Frees a file page used in an index tree. Can also be used for (BLOB) +external storage pages, because page level 0 can be given as an +argument. */ +UNIV_INTERN +void +btr_page_free_low( +/*==============*/ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: block to be freed, x-latched */ + ulint level, /* in: page level */ + mtr_t* mtr) /* in: mtr */ +{ + fseg_header_t* seg_header; + page_t* root; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* The page becomes invalid for optimistic searches: increment the frame + modify clock */ + + buf_block_modify_clock_inc(block); + + if (dict_index_is_ibuf(index)) { + + btr_page_free_for_ibuf(index, block, mtr); + + return; + } + + root = btr_root_get(index, mtr); + + if (level == 0) { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + } else { + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + } + + fseg_free_page(seg_header, + buf_block_get_space(block), + buf_block_get_page_no(block), mtr); +} + +/****************************************************************** +Frees a file page used in an index tree. NOTE: this cannot be used to free +field external storage pages, because the page must contain info on its +level. */ +UNIV_INTERN +void +btr_page_free( +/*==========*/ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: block to be freed, x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + ulint level; + + level = btr_page_get_level(buf_block_get_frame(block), mtr); + + btr_page_free_low(index, block, level, mtr); +} + +/****************************************************************** +Sets the child node file address in a node pointer.
*/ +UNIV_INLINE +void +btr_node_ptr_set_child_page_no( +/*===========================*/ + rec_t* rec, /* in: node pointer record */ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint page_no,/* in: child node address */ + mtr_t* mtr) /* in: mtr */ +{ + byte* field; + ulint len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_is_leaf(page_align(rec))); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + /* The child address is in the last field */ + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); + + ut_ad(len == REC_NODE_PTR_SIZE); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_node_ptr(page_zip, rec, + rec_offs_data_size(offsets), + page_no, mtr); + } else { + mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr); + } +} + +/**************************************************************** +Returns the child page of a node pointer and x-latches it. */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + /* out: child page, x-latched */ + const rec_t* node_ptr,/* in: node pointer */ + dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr) /* in: mtr */ +{ + ulint page_no; + ulint space; + + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + space = page_get_space_id(page_align(node_ptr)); + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + + return(btr_block_get(space, dict_table_zip_size(index->table), + page_no, RW_X_LATCH, mtr)); +} + +/**************************************************************** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. 
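+A minimal call sketch (illustrative only; assumes "cursor" is positioned
+on a user record of a non-root page and that mtr holds the tree x-latch):
+
+	heap = mem_heap_create(100);
+	offsets = btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
+	node_ptr = btr_cur_get_rec(cursor);
+	mem_heap_free(heap);
+
+On return the cursor has been repositioned on the node pointer record, so
+btr_cur_get_rec(cursor) yields the father record described by offsets.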
*/ +static +ulint* +btr_page_get_father_node_ptr( +/*=========================*/ + /* out: rec_get_offsets() of the + node pointer record */ + ulint* offsets,/* in: work area for the return value */ + mem_heap_t* heap, /* in: memory heap to use */ + btr_cur_t* cursor, /* in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + dtuple_t* tuple; + rec_t* user_rec; + rec_t* node_ptr; + ulint level; + ulint page_no; + dict_index_t* index; + + page_no = buf_block_get_page_no(btr_cur_get_block(cursor)); + index = btr_cur_get_index(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + + ut_ad(dict_index_get_page(index) != page_no); + + level = btr_page_get_level(btr_cur_get_page(cursor), mtr); + user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level); + + btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, cursor, 0, mtr); + + node_ptr = btr_cur_get_rec(cursor); + ut_ad(!page_rec_is_comp(node_ptr) + || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets) + != page_no)) { + rec_t* print_rec; + fputs("InnoDB: Dump of the child page:\n", stderr); + buf_page_print(page_align(user_rec), 0); + fputs("InnoDB: Dump of the parent page:\n", stderr); + buf_page_print(page_align(node_ptr), 0); + + fputs("InnoDB: Corruption of an index tree: table ", stderr); + ut_print_name(stderr, NULL, TRUE, index->table_name); + fputs(", index ", stderr); + ut_print_name(stderr, NULL, FALSE, index->name); + fprintf(stderr, ",\n" + "InnoDB: father ptr page no %lu, child page no %lu\n", + (ulong) + btr_node_ptr_get_child_page_no(node_ptr, offsets), + (ulong) page_no); + print_rec = page_rec_get_next( + page_get_infimum_rec(page_align(user_rec))); + offsets = rec_get_offsets(print_rec, index, + offsets, ULINT_UNDEFINED, &heap); + page_rec_print(print_rec, offsets); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(node_ptr, offsets); + + fputs("InnoDB: You should dump + drop + reimport the table" + " to fix the\n" + "InnoDB: corruption. If the crash happens at " + "the database startup, see\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "forcing-recovery.html about\n" + "InnoDB: forcing recovery. " + "Then dump + drop + reimport.\n", stderr); + + ut_error; + } + + return(offsets); +} + +/**************************************************************** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. 
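+A minimal call sketch, following the pattern of btr_page_get_father()
+below (illustrative only; "block" is a child page of "index"):
+
+	heap = mem_heap_create(100);
+	offsets = btr_page_get_father_block(NULL, heap, index,
+					    block, mtr, &cursor);
+	mem_heap_free(heap);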
*/ +static +ulint* +btr_page_get_father_block( +/*======================*/ + /* out: rec_get_offsets() of the + node pointer record */ + ulint* offsets,/* in: work area for the return value */ + mem_heap_t* heap, /* in: memory heap to use */ + dict_index_t* index, /* in: b-tree index */ + buf_block_t* block, /* in: child page in the index */ + mtr_t* mtr, /* in: mtr */ + btr_cur_t* cursor) /* out: cursor on node pointer record, + its page x-latched */ +{ + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr)); +} + +/**************************************************************** +Seeks to the upper level node pointer to a page. +It is assumed that mtr holds an x-latch on the tree. */ +static +void +btr_page_get_father( +/*================*/ + dict_index_t* index, /* in: b-tree index */ + buf_block_t* block, /* in: child page in the index */ + mtr_t* mtr, /* in: mtr */ + btr_cur_t* cursor) /* out: cursor on node pointer record, + its page x-latched */ +{ + mem_heap_t* heap; + rec_t* rec + = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame( + block))); + btr_cur_position(index, rec, block, cursor); + + heap = mem_heap_create(100); + btr_page_get_father_node_ptr(NULL, heap, cursor, mtr); + mem_heap_free(heap); +} + +/**************************************************************** +Creates the root node for a new index tree. */ +UNIV_INTERN +ulint +btr_create( +/*=======*/ + /* out: page number of the created root, + FIL_NULL if did not succeed */ + ulint type, /* in: type of the index */ + ulint space, /* in: space where created */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + dulint index_id,/* in: index id */ + dict_index_t* index, /* in: index */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint page_no; + buf_block_t* block; + buf_frame_t* frame; + page_t* page; + page_zip_des_t* page_zip; + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (type & DICT_IBUF) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, 0, + IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr); + + buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW); + + ut_ad(buf_block_get_page_no(ibuf_hdr_block) + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + page_no = fseg_alloc_free_page(buf_block_get_frame( + ibuf_hdr_block) + + IBUF_HEADER + + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, mtr); + ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO); + + block = buf_page_get(space, zip_size, page_no, + RW_X_LATCH, mtr); + } else { + block = fseg_create(space, 0, + PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr); + } + + if (block == NULL) { + + return(FIL_NULL); + } + + page_no = buf_block_get_page_no(block); + frame = buf_block_get_frame(block); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + + if (type & DICT_IBUF) { + /* It is an insert buffer tree: initialize the free list */ + + ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO); + + flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + /* It is a non-ibuf tree: create a file segment for leaf + pages */ + fseg_create(space, page_no, + PAGE_HEADER + PAGE_BTR_SEG_LEAF, 
mtr);
+		/* The fseg create acquires a second latch on the page,
+		therefore we must declare it: */
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+	}
+
+	/* Create a new index page on the allocated segment page */
+	page_zip = buf_block_get_page_zip(block);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		page = page_create_zip(block, index, 0, mtr);
+	} else {
+		page = page_create(block, mtr,
+				   dict_table_is_comp(index->table));
+		/* Set the level of the new index page */
+		btr_page_set_level(page, NULL, 0, mtr);
+	}
+
+	block->check_index_page_at_flush = TRUE;
+
+	/* Set the index id of the page */
+	btr_page_set_index_id(page, page_zip, index_id, mtr);
+
+	/* Set the next node and previous node fields */
+	btr_page_set_next(page, page_zip, FIL_NULL, mtr);
+	btr_page_set_prev(page, page_zip, FIL_NULL, mtr);
+
+	/* We reset the free bits for the page to allow creation of several
+	trees in the same mtr, otherwise the latch on a bitmap page would
+	prevent it because of the latching order */
+
+	if (!(type & DICT_CLUSTERED)) {
+		ibuf_reset_free_bits(block);
+	}
+
+	/* In the following assertion we test that two records of maximum
+	allowed size fit on the root page: this fact is needed to ensure
+	correctness of split algorithms */
+
+	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
+
+	return(page_no);
+}
+
+/****************************************************************
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+	ulint	space,		/* in: space where created */
+	ulint	zip_size,	/* in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	root_page_no)	/* in: root page number */
+{
+	ibool	finished;
+	page_t*	root;
+	mtr_t	mtr;
+
+leaf_loop:
+	mtr_start(&mtr);
+
+	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
+#ifdef UNIV_BTR_DEBUG
+	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+				    + root, space));
+	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+				    + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+	/* NOTE: page hash indexes are dropped when a page is freed inside
+	fsp0fsp. */
+
+	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+				  &mtr);
+	mtr_commit(&mtr);
+
+	if (!finished) {
+
+		goto leaf_loop;
+	}
+top_loop:
+	mtr_start(&mtr);
+
+	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
+#ifdef UNIV_BTR_DEBUG
+	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+				    + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+	finished = fseg_free_step_not_header(
+		root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
+	mtr_commit(&mtr);
+
+	if (!finished) {
+
+		goto top_loop;
+	}
+}
+
+/****************************************************************
+Frees the B-tree root page. The rest of the tree MUST already have been
+freed. 
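+A minimal sketch of the intended calling order when a whole tree is
+dropped (illustrative only; the caller is assumed to start and commit
+the mini-transaction):
+
+	btr_free_but_not_root(space, zip_size, root_page_no);
+	mtr_start(&mtr);
+	btr_free_root(space, zip_size, root_page_no, &mtr);
+	mtr_commit(&mtr);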
*/ +UNIV_INTERN +void +btr_free_root( +/*==========*/ + ulint space, /* in: space where created */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no, /* in: root page number */ + mtr_t* mtr) /* in: a mini-transaction which has already + been started */ +{ + buf_block_t* block; + fseg_header_t* header; + + block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr); + + btr_search_drop_page_hash_index(block); + + header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP; +#ifdef UNIV_BTR_DEBUG + ut_a(btr_root_fseg_validate(header, space)); +#endif /* UNIV_BTR_DEBUG */ + + while (!fseg_free_step(header, mtr)); +} + +/***************************************************************** +Reorganizes an index page. */ +static +ibool +btr_page_reorganize_low( +/*====================*/ + ibool recovery,/* in: TRUE if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + buf_block_t* block, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + buf_block_t* temp_block; + page_t* temp_page; + ulint log_mode; + ulint data_size1; + ulint data_size2; + ulint max_ins_size1; + ulint max_ins_size2; + ibool success = FALSE; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + data_size1 = page_get_data_size(page); + max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); + + /* Write the log record */ + mlog_open_and_write_index(mtr, page, index, page_is_comp(page) + ? MLOG_COMP_PAGE_REORGANIZE + : MLOG_PAGE_REORGANIZE, 0); + + /* Turn logging off */ + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + temp_block = buf_block_alloc(0); + temp_page = temp_block->frame; + + /* Copy the old page to temporary space */ + buf_frame_copy(temp_page, page); + + if (UNIV_LIKELY(!recovery)) { + btr_search_drop_page_hash_index(block); + } + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, dict_table_is_comp(index->table)); + block->check_index_page_at_flush = TRUE; + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, temp_block, + page_get_infimum_rec(temp_page), + index, mtr); + /* Copy max trx id to recreated page */ + page_set_max_trx_id(block, NULL, page_get_max_trx_id(temp_page)); + + if (UNIV_LIKELY_NULL(page_zip) + && UNIV_UNLIKELY + (!page_zip_compress(page_zip, page, index, NULL))) { + + /* Restore the old page and exit. 
*/ + buf_frame_copy(page, temp_page); + + goto func_exit; + } + + if (UNIV_LIKELY(!recovery)) { + /* Update the record lock bitmaps */ + lock_move_reorganize_page(block, temp_block); + } + + data_size2 = page_get_data_size(page); + max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1); + + if (UNIV_UNLIKELY(data_size1 != data_size2) + || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) { + buf_page_print(page, 0); + buf_page_print(temp_page, 0); + fprintf(stderr, + "InnoDB: Error: page old data size %lu" + " new data size %lu\n" + "InnoDB: Error: page old max ins size %lu" + " new max ins size %lu\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + (unsigned long) data_size1, (unsigned long) data_size2, + (unsigned long) max_ins_size1, + (unsigned long) max_ins_size2); + } else { + success = TRUE; + } + +func_exit: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + buf_block_free(temp_block); + + /* Restore logging mode */ + mtr_set_log_mode(mtr, log_mode); + + return(success); +} + +/***************************************************************** +Reorganizes an index page. +IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf +page of a non-clustered index, the caller must update the insert +buffer free bits in the same mini-transaction in such a way that the +modification will be redo-logged. */ +UNIV_INTERN +ibool +btr_page_reorganize( +/*================*/ + /* out: TRUE on success, FALSE on failure */ + buf_block_t* block, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + return(btr_page_reorganize_low(FALSE, block, index, mtr)); +} + +/*************************************************************** +Parses a redo log record of reorganizing a page. */ +UNIV_INTERN +byte* +btr_parse_page_reorganize( +/*======================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), + /* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + buf_block_t* block, /* in: page to be reorganized, or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + /* The record is empty, except for the record initial part */ + + if (UNIV_LIKELY(block != NULL)) { + btr_page_reorganize_low(TRUE, block, index, mtr); + } + + return(ptr); +} + +/***************************************************************** +Empties an index page. @see btr_page_create().*/ +static +void +btr_page_empty( +/*===========*/ + buf_block_t* block, /* in: page to be emptied */ + page_zip_des_t* page_zip,/* out: compressed page, or NULL */ + dict_index_t* index, /* in: index of the page */ + ulint level, /* in: the B-tree level of the page */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) 
is preserved intact */ + + if (UNIV_LIKELY_NULL(page_zip)) { + page_create_zip(block, index, level, mtr); + } else { + page_create(block, mtr, dict_table_is_comp(index->table)); + btr_page_set_level(page, NULL, level, mtr); + } + + block->check_index_page_at_flush = TRUE; +} + +/***************************************************************** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. */ +UNIV_INTERN +rec_t* +btr_root_raise_and_insert( +/*======================*/ + /* out: inserted record */ + btr_cur_t* cursor, /* in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + const dtuple_t* tuple, /* in: tuple to insert */ + ulint n_ext, /* in: number of externally stored columns */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + page_t* root; + page_t* new_page; + ulint new_page_no; + rec_t* rec; + mem_heap_t* heap; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root_block; + buf_block_t* new_block; + + root = btr_cur_get_page(cursor); + root_block = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root_block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip || page_zip_validate(root_page_zip, root)); +#endif /* UNIV_ZIP_DEBUG */ + index = btr_cur_get_index(cursor); +#ifdef UNIV_BTR_DEBUG + if (!dict_index_is_ibuf(index)) { + ulint space = dict_index_get_space(index); + + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + root, space)); + ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + root, space)); + } + + ut_a(dict_index_get_page(index) == page_get_page_no(root)); +#endif /* UNIV_BTR_DEBUG */ + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX)); + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root, mtr); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr); + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + + /* Set the next node and previous node fields of new page */ + btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr); + btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr); + + /* Copy the records from root to the new page one by one. */ + + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_copy_rec_list_end(new_block, root_block, + page_get_infimum_rec(root), + index, mtr))) { + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_page_zip, new_page, + root_page_zip, root, index, mtr); + + /* Update the lock table and possible hash index. 
*/ + + lock_move_rec_list_end(new_block, root_block, + page_get_infimum_rec(root)); + + btr_search_move_or_delete_hash_entries(new_block, root_block, + index); + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + lock_update_root_raise(new_block, root_block); + + /* Create a memory heap where the node pointer is stored */ + heap = mem_heap_create(100); + + rec = page_rec_get_next(page_get_infimum_rec(new_page)); + new_page_no = buf_block_get_page_no(new_block); + + /* Build the node pointer (= node key and page address) for the + child */ + + node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap, + level); + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root_block, root_page_zip, index, level + 1, mtr); + + /* Set the next node and previous node fields, although + they should already have been set. The previous node field + must be FIL_NULL if root_page_zip != NULL, because the + REC_INFO_MIN_REC_FLAG (of the first user record) will be + set if and only if btr_page_get_prev() == FIL_NULL. */ + btr_page_set_next(root, root_page_zip, FIL_NULL, mtr); + btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root_block, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + index, 0, mtr); + + /* The root page should only contain the node pointer + to new_page at this point. Thus, the data should fit. */ + ut_a(node_ptr_rec); + + /* Free the memory heap */ + mem_heap_free(heap); + + /* We play safe and reset the free bits for the new page */ + +#if 0 + fprintf(stderr, "Root raise new page no %lu\n", new_page_no); +#endif + + if (!dict_index_is_clust(index)) { + ibuf_reset_free_bits(new_block); + } + + /* Reposition the cursor to the child node */ + page_cur_search(new_block, index, tuple, + PAGE_CUR_LE, page_cursor); + + /* Split the child and insert tuple */ + return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr)); +} + +/***************************************************************** +Decides if the page should be split at the convergence point of inserts +converging to the left. */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_left( +/*===========================*/ + /* out: TRUE if split recommended */ + btr_cur_t* cursor, /* in: cursor at which to insert */ + rec_t** split_rec) /* out: if split recommended, + the first record on upper half page, + or NULL if tuple to be inserted should + be first */ +{ + page_t* page; + rec_t* insert_point; + rec_t* infimum; + + page = btr_cur_get_page(cursor); + insert_point = btr_cur_get_rec(cursor); + + if (page_header_get_ptr(page, PAGE_LAST_INSERT) + == page_rec_get_next(insert_point)) { + + infimum = page_get_infimum_rec(page); + + /* If the convergence is in the middle of a page, include also + the record immediately before the new insert to the upper + page. Otherwise, we could repeatedly move from page to page + lots of records smaller than the convergence point. 
*/ + + if (infimum != insert_point + && page_rec_get_next(infimum) != insert_point) { + + *split_rec = insert_point; + } else { + *split_rec = page_rec_get_next(insert_point); + } + + return(TRUE); + } + + return(FALSE); +} + +/***************************************************************** +Decides if the page should be split at the convergence point of inserts +converging to the right. */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_right( +/*============================*/ + /* out: TRUE if split recommended */ + btr_cur_t* cursor, /* in: cursor at which to insert */ + rec_t** split_rec) /* out: if split recommended, + the first record on upper half page, + or NULL if tuple to be inserted should + be first */ +{ + page_t* page; + rec_t* insert_point; + + page = btr_cur_get_page(cursor); + insert_point = btr_cur_get_rec(cursor); + + /* We use eager heuristics: if the new insert would be right after + the previous insert on the same page, we assume that there is a + pattern of sequential inserts here. */ + + if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT) + == insert_point)) { + + rec_t* next_rec; + + next_rec = page_rec_get_next(insert_point); + + if (page_rec_is_supremum(next_rec)) { +split_at_new: + /* Split at the new record to insert */ + *split_rec = NULL; + } else { + rec_t* next_next_rec = page_rec_get_next(next_rec); + if (page_rec_is_supremum(next_next_rec)) { + + goto split_at_new; + } + + /* If there are >= 2 user records up from the insert + point, split all but 1 off. We want to keep one because + then sequential inserts can use the adaptive hash + index, as they can do the necessary checks of the right + search position just by looking at the records on this + page. */ + + *split_rec = next_next_rec; + } + + return(TRUE); + } + + return(FALSE); +} + +/***************************************************************** +Calculates a split record such that the tuple will certainly fit on +its half-page when the split is performed. We assume in this function +only that the cursor page has at least one user record. */ +static +rec_t* +btr_page_get_sure_split_rec( +/*========================*/ + /* out: split record, or NULL if tuple + will be the first record on upper half-page */ + btr_cur_t* cursor, /* in: cursor at which insert should be made */ + const dtuple_t* tuple, /* in: tuple to insert */ + ulint n_ext) /* in: number of externally stored columns */ +{ + page_t* page; + page_zip_des_t* page_zip; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + ulint total_space; + ulint incl_data; + rec_t* ins_rec; + rec_t* rec; + rec_t* next_rec; + ulint n; + mem_heap_t* heap; + ulint* offsets; + + page = btr_cur_get_page(cursor); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + page_zip = btr_cur_get_page_zip(cursor); + if (UNIV_LIKELY_NULL(page_zip)) { + /* Estimate the free space of an empty compressed page. 
*/ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, + page_zip_get_size(page_zip)); + + if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) { + free_space = (ulint) free_space_zip; + } + } + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = page_get_n_recs(page) + 1; + ut_ad(total_n_recs >= 2); + total_space = total_data + page_dir_calc_reserved_space(total_n_recs); + + n = 0; + incl_data = 0; + ins_rec = btr_cur_get_rec(cursor); + rec = page_get_infimum_rec(page); + + heap = NULL; + offsets = NULL; + + /* We start to include records to the left half, and when the + space reserved by them exceeds half of total_space, then if + the included records fit on the left page, they will be put there + if something was left over also for the right page, + otherwise the last included record will be the first on the right + half page */ + + do { + /* Decide the next record to include */ + if (rec == ins_rec) { + rec = NULL; /* NULL denotes that tuple is + now included */ + } else if (rec == NULL) { + rec = page_rec_get_next(ins_rec); + } else { + rec = page_rec_get_next(rec); + } + + if (rec == NULL) { + /* Include tuple */ + incl_data += insert_size; + } else { + offsets = rec_get_offsets(rec, cursor->index, + offsets, ULINT_UNDEFINED, + &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(rec); +} + +/***************************************************************** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. 
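+A minimal call sketch, mirroring its use in btr_page_split_and_insert()
+below (illustrative only; "offsets" must describe split_rec, and both
+may be NULL when the tuple would become the first record on the upper
+half-page):
+
+	insert_will_fit = btr_page_insert_fits(cursor, split_rec,
+					       offsets, tuple, n_ext, heap);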
*/ +static +ibool +btr_page_insert_fits( +/*=================*/ + /* out: TRUE if fits */ + btr_cur_t* cursor, /* in: cursor at which insert + should be made */ + const rec_t* split_rec,/* in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + const ulint* offsets,/* in: rec_get_offsets( + split_rec, cursor->index) */ + const dtuple_t* tuple, /* in: tuple to insert */ + ulint n_ext, /* in: number of externally stored columns */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + ulint* offs; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec == !offsets); + ut_ad(!offsets + || !page_is_comp(page) == !rec_offs_comp(offsets)); + ut_ad(!offsets + || rec_offs_validate(split_rec, cursor->index, offsets)); + + insert_size = rec_get_converted_size(cursor->index, tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = page_get_n_recs(page) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (split_rec == NULL) { + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) { + + rec = page_rec_get_next(page_get_infimum_rec(page)); + end_rec = split_rec; + } else { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + } + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(TRUE); + } + + offs = NULL; + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + offs = rec_get_offsets(rec, cursor->index, offs, + ULINT_UNDEFINED, &heap); + + total_data -= rec_offs_size(offs); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(TRUE); + } + + rec = page_rec_get_next_const(rec); + } + + return(FALSE); +} + +/*********************************************************** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +UNIV_INTERN +void +btr_insert_on_non_leaf_level( +/*=========================*/ + dict_index_t* index, /* in: index */ + ulint level, /* in: level, must be > 0 */ + dtuple_t* tuple, /* in: the record to be inserted */ + mtr_t* mtr) /* in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + ulint err; + rec_t* rec; + + ut_ad(level > 0); + + btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE, + BTR_CONT_MODIFY_TREE, + &cursor, 0, mtr); + + err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + ut_a(err == DB_SUCCESS); +} + +/****************************************************************** +Attaches the halves of an index page on the appropriate level in an +index tree. 
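+A minimal call sketch, as invoked from btr_page_split_and_insert() below
+(illustrative only; "first_rec" is the first record that ends up on the
+upper half-page):
+
+	btr_attach_half_pages(cursor->index, block,
+			      first_rec, new_block, direction, mtr);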
*/ +static +void +btr_attach_half_pages( +/*==================*/ + dict_index_t* index, /* in: the index tree */ + buf_block_t* block, /* in/out: page to be split */ + rec_t* split_rec, /* in: first record on upper + half page */ + buf_block_t* new_block, /* in/out: the new half page */ + ulint direction, /* in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /* in: mtr */ +{ + ulint space; + ulint zip_size; + ulint prev_page_no; + ulint next_page_no; + ulint level; + page_t* page = buf_block_get_frame(block); + page_t* lower_page; + page_t* upper_page; + ulint lower_page_no; + ulint upper_page_no; + page_zip_des_t* lower_page_zip; + page_zip_des_t* upper_page_zip; + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + ulint* offsets; + + lower_page = buf_block_get_frame(new_block); + lower_page_no = buf_block_get_page_no(new_block); + lower_page_zip = buf_block_get_page_zip(new_block); + upper_page = buf_block_get_frame(block); + upper_page_no = buf_block_get_page_no(block); + upper_page_zip = buf_block_get_page_zip(block); + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(NULL, heap, index, + block, mtr, &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_rec(&cursor), + btr_cur_get_page_zip(&cursor), + offsets, lower_page_no, mtr); + mem_heap_empty(heap); + } else { + lower_page = buf_block_get_frame(block); + lower_page_no = buf_block_get_page_no(block); + lower_page_zip = buf_block_get_page_zip(block); + upper_page = buf_block_get_frame(new_block); + upper_page_no = buf_block_get_page_no(new_block); + upper_page_zip = buf_block_get_page_zip(new_block); + } + + /* Get the level of the split pages */ + level = btr_page_get_level(buf_block_get_frame(block), mtr); + ut_ad(level + == btr_page_get_level(buf_block_get_frame(new_block), mtr)); + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr(index, split_rec, + upper_page_no, heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. 
*/ + + btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + /* Get the previous and next pages of page */ + + prev_page_no = btr_page_get_prev(page, mtr); + next_page_no = btr_page_get_next(page, mtr); + space = buf_block_get_space(block); + zip_size = buf_block_get_zip_size(block); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block = btr_block_get(space, zip_size, + prev_page_no, + RW_X_LATCH, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_block->frame) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_block->frame, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(buf_block_get_frame(prev_block), + buf_block_get_page_zip(prev_block), + lower_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block = btr_block_get(space, zip_size, + next_page_no, + RW_X_LATCH, mtr); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_block->frame) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_block->frame, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(buf_block_get_frame(next_block), + buf_block_get_page_zip(next_block), + upper_page_no, mtr); + } + + btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr); + btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr); + + btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr); + btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr); +} + +/***************************************************************** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch +is released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore +enough free disk space must be guaranteed to be available before +this function is called. */ +UNIV_INTERN +rec_t* +btr_page_split_and_insert( +/*======================*/ + /* out: inserted record; NOTE: the tree + x-latch is released! NOTE: 2 free disk + pages must be available! 
*/
+	btr_cur_t*	cursor,	/* in: cursor at which to insert; when the
+				function returns, the cursor is positioned
+				on the predecessor of the inserted record */
+	const dtuple_t*	tuple,	/* in: tuple to insert */
+	ulint		n_ext,	/* in: number of externally stored columns */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	buf_block_t*	block;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	ulint		page_no;
+	byte		direction;
+	ulint		hint_page_no;
+	buf_block_t*	new_block;
+	page_t*		new_page;
+	page_zip_des_t*	new_page_zip;
+	rec_t*		split_rec;
+	buf_block_t*	left_block;
+	buf_block_t*	right_block;
+	buf_block_t*	insert_block;
+	page_t*		insert_page;
+	page_cur_t*	page_cursor;
+	rec_t*		first_rec;
+	byte*		buf = 0; /* remove warning */
+	rec_t*		move_limit;
+	ibool		insert_will_fit;
+	ibool		insert_left;
+	ulint		n_iterations = 0;
+	rec_t*		rec;
+	mem_heap_t*	heap;
+	ulint		n_uniq;
+	ulint*		offsets;
+
+	heap = mem_heap_create(1024);
+	n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+func_start:
+	mem_heap_empty(heap);
+	offsets = NULL;
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+				MTR_MEMO_X_LOCK));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	page_zip = buf_block_get_page_zip(block);
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_get_n_recs(page) >= 1);
+
+	page_no = buf_block_get_page_no(block);
+
+	/* 1. Decide the split record; split_rec == NULL means that the
+	tuple to be inserted should be the first record on the upper
+	half-page */
+
+	if (n_iterations > 0) {
+		direction = FSP_UP;
+		hint_page_no = page_no + 1;
+		split_rec = btr_page_get_sure_split_rec(cursor, tuple, n_ext);
+
+	} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+		direction = FSP_UP;
+		hint_page_no = page_no + 1;
+
+	} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
+		direction = FSP_DOWN;
+		hint_page_no = page_no - 1;
+	} else {
+		direction = FSP_UP;
+		hint_page_no = page_no + 1;
+
+		if (page_get_n_recs(page) == 1) {
+			page_cur_t	pcur;
+
+			/* There is only one record in the index page,
+			therefore we can't split the node in the middle
+			by default. We need to determine whether the
+			new record will be inserted to the left or right. */
+
+			/* Read the first (and only) record in the page. */
+			page_cur_set_before_first(block, &pcur);
+			page_cur_move_to_next(&pcur);
+			first_rec = page_cur_get_rec(&pcur);
+
+			offsets = rec_get_offsets(
+				first_rec, cursor->index, offsets,
+				n_uniq, &heap);
+
+			/* If the new record is less than the existing record,
+			the split in the middle will copy the existing
+			record to the new node. */
+			if (cmp_dtuple_rec(tuple, first_rec, offsets) < 0) {
+				split_rec = page_get_middle_rec(page);
+			} else {
+				split_rec = NULL;
+			}
+		} else {
+			split_rec = page_get_middle_rec(page);
+		}
+	}
+
+	/* 2. Allocate a new page to the index */
+	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
+				   btr_page_get_level(page, mtr), mtr);
+	new_page = buf_block_get_frame(new_block);
+	new_page_zip = buf_block_get_page_zip(new_block);
+	btr_page_create(new_block, new_page_zip, cursor->index,
+			btr_page_get_level(page, mtr), mtr);
+
+	/* 3. 
Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + offsets = rec_get_offsets(split_rec, cursor->index, offsets, + n_uniq, &heap); + + insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0; + + if (UNIV_UNLIKELY(!insert_left && new_page_zip + && n_iterations > 0)) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_right; + } + } else { +insert_right: + insert_left = FALSE; + buf = mem_alloc(rec_get_converted_size(cursor->index, + tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index, + tuple, n_ext); + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + } + + /* 4. Do first the modifications in the tree structure */ + + btr_attach_half_pages(cursor->index, block, + first_rec, new_block, direction, mtr); + + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + + if (split_rec) { + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, + offsets, tuple, n_ext, heap); + } else { + mem_free(buf); + insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, NULL, + NULL, tuple, n_ext, heap); + } + + if (insert_will_fit && page_is_leaf(page)) { + + mtr_memo_release(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + } + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_move_rec_list_start(new_block, block, move_limit, + cursor->index, mtr))) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_end(move_limit - page + new_page, + new_block, cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + + btr_search_move_or_delete_hash_entries( + new_block, block, cursor->index); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index, mtr); + } + + left_block = new_block; + right_block = block; + + lock_update_split_left(right_block, left_block); + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_move_rec_list_end(new_block, block, move_limit, + cursor->index, mtr))) { + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. 
*/ + ut_a(new_page_zip); + + page_zip_copy_recs(new_page_zip, new_page, + page_zip, page, cursor->index, mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_end(new_block, block, move_limit); + + btr_search_move_or_delete_hash_entries( + new_block, block, cursor->index); + + /* Delete the records from the source page. */ + + page_delete_rec_list_end(move_limit, block, + cursor->index, + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + } + + left_block = block; + right_block = new_block; + + lock_update_split_right(right_block, left_block); + } + +#ifdef UNIV_ZIP_DEBUG + if (UNIV_LIKELY_NULL(page_zip)) { + ut_a(page_zip_validate(page_zip, page)); + ut_a(page_zip_validate(new_page_zip, new_page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + + if (insert_left) { + insert_block = left_block; + } else { + insert_block = right_block; + } + + insert_page = buf_block_get_frame(insert_block); + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_search(insert_block, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_LIKELY(rec != NULL)) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization */ + + if (UNIV_UNLIKELY + (!btr_page_reorganize(insert_block, cursor->index, mtr))) { + + goto insert_failed; + } + + page_cur_search(insert_block, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, + n_ext, mtr); + + if (UNIV_UNLIKELY(rec == NULL)) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits for new_page */ + if (!dict_index_is_clust(cursor->index)) { + ibuf_reset_free_bits(new_block); + } + + /* fprintf(stderr, "Split second round %lu\n", + page_get_page_no(page)); */ + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) { + ibuf_update_free_bits_for_two_pages_low( + buf_block_get_zip_size(left_block), + left_block, right_block, mtr); + } + +#if 0 + fprintf(stderr, "Split and insert done %lu %lu\n", + buf_block_get_page_no(left_block), + buf_block_get_page_no(right_block)); +#endif + + ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index)); + + mem_heap_free(heap); + return(rec); +} + +/***************************************************************** +Removes a page from the level list of pages. 
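+A minimal call sketch, as used from btr_compress() below (illustrative
+only; "page" must belong to "space" and be x-latched in mtr):
+
+	btr_level_list_remove(space, zip_size, page, mtr);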
*/ +static +void +btr_level_list_remove( +/*==================*/ + ulint space, /* in: space where removed */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* page, /* in: page to remove */ + mtr_t* mtr) /* in: mtr */ +{ + ulint prev_page_no; + ulint next_page_no; + + ut_ad(page && mtr); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(space == page_get_space_id(page)); + /* Get the previous and next page numbers of page */ + + prev_page_no = btr_page_get_prev(page, mtr); + next_page_no = btr_page_get_next(page, mtr); + + /* Update page links of the level */ + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block + = btr_block_get(space, zip_size, prev_page_no, + RW_X_LATCH, mtr); + page_t* prev_page + = buf_block_get_frame(prev_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(prev_page) == page_is_comp(page)); + ut_a(btr_page_get_next(prev_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_next(prev_page, + buf_block_get_page_zip(prev_block), + next_page_no, mtr); + } + + if (next_page_no != FIL_NULL) { + buf_block_t* next_block + = btr_block_get(space, zip_size, next_page_no, + RW_X_LATCH, mtr); + page_t* next_page + = buf_block_get_frame(next_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + btr_page_set_prev(next_page, + buf_block_get_page_zip(next_block), + prev_page_no, mtr); + } +} + +/******************************************************************** +Writes the redo log record for setting an index record as the predefined +minimum record. */ +UNIV_INLINE +void +btr_set_min_rec_mark_log( +/*=====================*/ + rec_t* rec, /* in: record */ + byte type, /* in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(rec, type, mtr); + + /* Write rec offset as a 2-byte ulint */ + mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES); +} + +/******************************************************************** +Parses the redo log record for setting an index record as the predefined +minimum record. */ +UNIV_INTERN +byte* +btr_parse_set_min_rec_mark( +/*=======================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + rec_t* rec; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + if (page) { + ut_a(!page_is_comp(page) == !comp); + + rec = page + mach_read_from_2(ptr); + + btr_set_min_rec_mark(rec, mtr); + } + + return(ptr + 2); +} + +/******************************************************************** +Sets a record as the predefined minimum record. 
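+A minimal call sketch (illustrative only; "rec" should be the first user
+record on the leftmost page of its level, x-latched in mtr):
+
+	rec = page_rec_get_next(page_get_infimum_rec(page));
+	btr_set_min_rec_mark(rec, mtr);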
*/
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+	rec_t*	rec,	/* in: record */
+	mtr_t*	mtr)	/* in: mtr */
+{
+	ulint	info_bits;
+
+	if (UNIV_LIKELY(page_rec_is_comp(rec))) {
+		info_bits = rec_get_info_bits(rec, TRUE);
+
+		rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
+
+		btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
+	} else {
+		info_bits = rec_get_info_bits(rec, FALSE);
+
+		rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
+
+		btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
+	}
+}
+
+/*****************************************************************
+Deletes the node pointer to a page on the upper level. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+	dict_index_t*	index,	/* in: index tree */
+	buf_block_t*	block,	/* in: page whose node pointer is deleted */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	btr_cur_t	cursor;
+	ibool		compressed;
+	ulint		err;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	/* Delete node pointer on father page */
+	btr_page_get_father(index, block, mtr, &cursor);
+
+	compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE,
+						mtr);
+	ut_a(err == DB_SUCCESS);
+
+	if (!compressed) {
+		btr_cur_compress_if_useful(&cursor, mtr);
+	}
+}
+
+/*****************************************************************
+If the page is the only one on its level, this function moves its records
+to the father page, thus reducing the tree height. */
+static
+void
+btr_lift_page_up(
+/*=============*/
+	dict_index_t*	index,	/* in: index tree */
+	buf_block_t*	block,	/* in: page which is the only one on its
+				level; must not be empty: use
+				btr_discard_only_page_on_level if the last
+				record from the page should be removed */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	buf_block_t*	father_block;
+	page_t*		father_page;
+	ulint		page_level;
+	page_zip_des_t*	father_page_zip;
+	page_t*		page		= buf_block_get_frame(block);
+	ulint		root_page_no;
+	buf_block_t*	blocks[BTR_MAX_LEVELS];
+	ulint		n_blocks;	/* last used index in blocks[] */
+	ulint		i;
+
+	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
+	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	page_level = btr_page_get_level(page, mtr);
+	root_page_no = dict_index_get_page(index);
+
+	{
+		btr_cur_t	cursor;
+		mem_heap_t*	heap	= mem_heap_create(100);
+		ulint*		offsets;
+		buf_block_t*	b;
+
+		offsets = btr_page_get_father_block(NULL, heap, index,
+						    block, mtr, &cursor);
+		father_block = btr_cur_get_block(&cursor);
+		father_page_zip = buf_block_get_page_zip(father_block);
+		father_page = buf_block_get_frame(father_block);
+
+		n_blocks = 0;
+
+		/* Store all ancestor pages so we can reset their
+		levels later on. We have to do all the searches on
+		the tree now because later on, after we've replaced
+		the first level, the tree is in an inconsistent state
+		and can not be searched. */
+		for (b = father_block;
+		     buf_block_get_page_no(b) != root_page_no; ) {
+			ut_a(n_blocks < BTR_MAX_LEVELS);
+
+			offsets = btr_page_get_father_block(offsets, heap,
+							    index, b,
+							    mtr, &cursor);
+
+			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+		}
+
+		mem_heap_free(heap);
+	}
+
+	btr_search_drop_page_hash_index(block);
+
+	/* Make the father empty */
+	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+
+	/* Copy the records to the father page one by one. 
*/ + if (0 +#ifdef UNIV_ZIP_COPY + || father_page_zip +#endif /* UNIV_ZIP_COPY */ + || UNIV_UNLIKELY + (!page_copy_rec_list_end(father_block, block, + page_get_infimum_rec(page), + index, mtr))) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(father_page_zip); + ut_a(page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(father_page_zip, father_page, + page_zip, page, index, mtr); + + /* Update the lock table and possible hash index. */ + + lock_move_rec_list_end(father_block, block, + page_get_infimum_rec(page)); + + btr_search_move_or_delete_hash_entries(father_block, block, + index); + } + + lock_update_copy_and_discard(father_block, block); + + /* Go upward to root page, decrementing levels by one. */ + for (i = 0; i < n_blocks; i++, page_level++) { + page_t* page = buf_block_get_frame(blocks[i]); + page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]); + + ut_ad(btr_page_get_level(page, mtr) == page_level + 1); + + btr_page_set_level(page, page_zip, page_level, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + /* We play it safe and reset the free bits for the father */ + if (!dict_index_is_clust(index)) { + ibuf_reset_free_bits(father_block); + } + ut_ad(page_validate(father_page, index)); + ut_ad(btr_check_node_ptr(index, father_block, mtr)); +} + +/***************************************************************** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the brother +reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to the +brothers, if they exist. 
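+A minimal call sketch (illustrative only; "cursor" is positioned on a
+non-empty page and mtr holds the latches described above):
+
+	ibool	merged = btr_compress(cursor, mtr);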
*/ +UNIV_INTERN +ibool +btr_compress( +/*=========*/ + /* out: TRUE on success */ + btr_cur_t* cursor, /* in: cursor on the page to merge or lift; + the page must not be empty: in record delete + use btr_discard_page if the page would become + empty */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + ulint space; + ulint zip_size; + ulint left_page_no; + ulint right_page_no; + buf_block_t* merge_block; + page_t* merge_page; + page_zip_des_t* merge_page_zip; + ibool is_left; + buf_block_t* block; + page_t* page; + btr_cur_t father_cursor; + mem_heap_t* heap; + ulint* offsets; + ulint data_size; + ulint n_recs; + ulint max_ins_size; + ulint max_ins_size_reorg; + ulint level; + + block = btr_cur_get_block(cursor); + page = btr_cur_get_page(cursor); + index = btr_cur_get_index(cursor); + ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table)); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + level = btr_page_get_level(page, mtr); + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + left_page_no = btr_page_get_prev(page, mtr); + right_page_no = btr_page_get_next(page, mtr); + +#if 0 + fprintf(stderr, "Merge left page %lu right %lu \n", + left_page_no, right_page_no); +#endif + + heap = mem_heap_create(100); + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &father_cursor); + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = left_page_no != FIL_NULL; + + if (is_left) { + + merge_block = btr_block_get(space, zip_size, left_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else if (right_page_no != FIL_NULL) { + + merge_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else { + /* The page is the only one on the level, lift the records + to the father */ + btr_lift_page_up(index, block, mtr); + mem_heap_free(heap); + return(TRUE); + } + + n_recs = page_get_n_recs(page); + data_size = page_get_data_size(page); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(merge_page) == page_is_comp(page)); +#endif /* UNIV_BTR_DEBUG */ + + max_ins_size_reorg = page_get_max_insert_size_after_reorganize( + merge_page, n_recs); + if (data_size > max_ins_size_reorg) { + + /* No space for merge */ +err_exit: + /* We play it safe and reset the free bits. 
*/ + if (zip_size + && page_is_leaf(merge_page) + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(merge_block); + } + + mem_heap_free(heap); + return(FALSE); + } + + ut_ad(page_validate(merge_page, index)); + + max_ins_size = page_get_max_insert_size(merge_page, n_recs); + + if (UNIV_UNLIKELY(data_size > max_ins_size)) { + + /* We have to reorganize merge_page */ + + if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block, + index, mtr))) { + + goto err_exit; + } + + max_ins_size = page_get_max_insert_size(merge_page, n_recs); + + ut_ad(page_validate(merge_page, index)); + ut_ad(max_ins_size == max_ins_size_reorg); + + if (UNIV_UNLIKELY(data_size > max_ins_size)) { + + /* Add fault tolerance, though this should + never happen */ + + goto err_exit; + } + } + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (UNIV_LIKELY_NULL(merge_page_zip)) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page)); + ut_a(page_zip_validate(page_zip, page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* Move records to the merge page */ + if (is_left) { + rec_t* orig_pred = page_copy_rec_list_start( + merge_block, block, page_get_supremum_rec(page), + index, mtr); + + if (UNIV_UNLIKELY(!orig_pred)) { + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, mtr); + + btr_node_ptr_delete(index, block, mtr); + lock_update_merge_left(merge_block, orig_pred, block); + } else { + rec_t* orig_succ; +#ifdef UNIV_BTR_DEBUG + byte fil_page_prev[4]; +#endif /* UNIV_BTR_DEBUG */ + + if (UNIV_LIKELY_NULL(merge_page_zip)) { + /* The function page_zip_compress(), which will be + invoked by page_copy_rec_list_end() below, + requires that FIL_PAGE_PREV be FIL_NULL. + Clear the field, but prepare to restore it. */ +#ifdef UNIV_BTR_DEBUG + memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4); +#endif /* UNIV_BTR_DEBUG */ +#if FIL_NULL != 0xffffffff +# error "FIL_NULL != 0xffffffff" +#endif + memset(merge_page + FIL_PAGE_PREV, 0xff, 4); + } + + orig_succ = page_copy_rec_list_end(merge_block, block, + page_get_infimum_rec(page), + cursor->index, mtr); + + if (UNIV_UNLIKELY(!orig_succ)) { + ut_a(merge_page_zip); +#ifdef UNIV_BTR_DEBUG + /* FIL_PAGE_PREV was restored from merge_page_zip. */ + ut_a(!memcmp(fil_page_prev, + merge_page + FIL_PAGE_PREV, 4)); +#endif /* UNIV_BTR_DEBUG */ + goto err_exit; + } + + btr_search_drop_page_hash_index(block); + +#ifdef UNIV_BTR_DEBUG + if (UNIV_LIKELY_NULL(merge_page_zip)) { + /* Restore FIL_PAGE_PREV in order to avoid an assertion + failure in btr_level_list_remove(), which will set + the field again to FIL_NULL. Even though this makes + merge_page and merge_page_zip inconsistent for a + split second, it is harmless, because the pages + are X-latched. 
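/* Editorial sketch (not part of the patch): the space test that guards the
merge above, modeled on a toy page. Field names are illustrative; in the
real code the two bounds come from page_get_max_insert_size() and
page_get_max_insert_size_after_reorganize(). */

#include <stddef.h>

typedef struct toy_space {
	size_t	capacity;	/* total usable bytes on the page */
	size_t	used;		/* bytes occupied by live records */
	size_t	garbage;	/* bytes a reorganize would reclaim */
} toy_space;

/* 0: no space for merge; 1: fits as-is; 2: fits only after a reorganize,
in which case the caller reorganizes first and then retries. */
static int
merge_fit_class(const toy_space* merge_page, size_t data_size)
{
	size_t	max_ins = merge_page->capacity - merge_page->used
		- merge_page->garbage;
	size_t	max_ins_reorg = merge_page->capacity - merge_page->used;

	if (data_size > max_ins_reorg) {
		return(0);
	}

	return(data_size <= max_ins ? 1 : 2);
}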
*/ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } +#endif /* UNIV_BTR_DEBUG */ + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, mtr); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_rec(&father_cursor), + btr_cur_get_page_zip(&father_cursor), + offsets, right_page_no, mtr); + btr_node_ptr_delete(index, merge_block, mtr); + + lock_update_merge_right(merge_block, orig_succ, block); + } + + mem_heap_free(heap); + + if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (zip_size) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + UNIV_PAGE_SIZE, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page)); +#endif /* UNIV_ZIP_DEBUG */ + + /* Free the file page */ + btr_page_free(index, block, mtr); + + ut_ad(btr_check_node_ptr(index, merge_block, mtr)); + return(TRUE); +} + +/***************************************************************** +Discards a page that is the only page on its level. This will empty +the whole B-tree, leaving just an empty root page. This function +should never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. 
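/* Editorial sketch (not part of the patch): the rule from the free-bits
comment in btr_compress() above, as one function. In a separately committed
mini-transaction the insert buffer bitmap may only move down (or be reset),
never up; compressed pages take the pessimistic reset. */

static unsigned
safe_ibuf_free_bits(unsigned current_bits, unsigned computed_bits,
		    int page_is_compressed)
{
	if (page_is_compressed) {
		return(0);	/* pessimistic: reset the free bits */
	}

	/* clamp upward moves away; decreasing is always safe */
	return(computed_bits < current_bits ? computed_bits : current_bits);
}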
*/
+static
+void
+btr_discard_only_page_on_level(
+/*===========================*/
+	dict_index_t*	index,	/* in: index tree */
+	buf_block_t*	block,	/* in: page which is the only one on
+				its level */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	ulint	page_level = 0;
+
+	while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
+		btr_cur_t	cursor;
+		buf_block_t*	father;
+		const page_t*	page = buf_block_get_frame(block);
+
+		ut_a(page_get_n_recs(page) == 1);
+		ut_a(page_level == btr_page_get_level(page, mtr));
+		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+
+		ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+		btr_search_drop_page_hash_index(block);
+
+		btr_page_get_father(index, block, mtr, &cursor);
+		father = btr_cur_get_block(&cursor);
+
+		lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);
+
+		/* Free the file page */
+		btr_page_free(index, block, mtr);
+
+		block = father;
+		page_level++;
+	}
+
+	/* block is the root page, which must be empty, except
+	for the node pointer to the (now discarded) block(s). */
+
+#ifdef UNIV_BTR_DEBUG
+	if (!dict_index_is_ibuf(index)) {
+		const page_t*	root	= buf_block_get_frame(block);
+		const ulint	space	= dict_index_get_space(index);
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+					    + root, space));
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+					    + root, space));
+	}
+#endif /* UNIV_BTR_DEBUG */
+
+	btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
+
+	/* We play it safe and reset the free bits for the root */
+	if (!dict_index_is_clust(index)) {
+		ibuf_reset_free_bits(block);
+	}
+}
+
+/*****************************************************************
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty.
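/* Editorial sketch (not part of the patch): the upward walk of
btr_discard_only_page_on_level() above, over a toy parent-linked node.
toy_free_page() is a stub standing in for btr_page_free(). */

#include <assert.h>
#include <stddef.h>

typedef struct toy_node {
	struct toy_node*	father;	/* NULL at the root */
	int			n_recs;
} toy_node;

static void toy_free_page(toy_node* node) { (void) node; }

/* Free every one-record page from `node` up to, but not including, the
root; the root itself is emptied in place, never freed. */
static toy_node*
discard_chain_up_to_root(toy_node* node)
{
	while (node->father != NULL) {
		toy_node*	father = node->father;

		assert(node->n_recs == 1);
		toy_free_page(node);
		node = father;
	}

	node->n_recs = 0;
	return(node);
}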
*/ +UNIV_INTERN +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /* in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + ulint space; + ulint zip_size; + ulint left_page_no; + ulint right_page_no; + buf_block_t* merge_block; + page_t* merge_page; + buf_block_t* block; + page_t* page; + rec_t* node_ptr; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + + ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block)); + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + /* Decide the page which will inherit the locks */ + + left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr); + right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr); + + if (left_page_no != FIL_NULL) { + merge_block = btr_block_get(space, zip_size, left_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else if (right_page_no != FIL_NULL) { + merge_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, mtr); + merge_page = buf_block_get_frame(merge_block); +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(merge_page, mtr) + == buf_block_get_page_no(block)); +#endif /* UNIV_BTR_DEBUG */ + } else { + btr_discard_only_page_on_level(index, block, mtr); + + return; + } + + page = buf_block_get_frame(block); + ut_a(page_is_comp(merge_page) == page_is_comp(page)); + btr_search_drop_page_hash_index(block); + + if (left_page_no == FIL_NULL && !page_is_leaf(page)) { + + /* We have to mark the leftmost node pointer on the right + side page as the predefined minimum record */ + node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page)); + + ut_ad(page_rec_is_user_rec(node_ptr)); + + /* This will make page_zip_validate() fail on merge_page + until btr_level_list_remove() completes. This is harmless, + because everything will take place within a single + mini-transaction and because writing to the redo log + is an atomic operation (performed by mtr_commit()). */ + btr_set_min_rec_mark(node_ptr, mtr); + } + + btr_node_ptr_delete(index, block, mtr); + + /* Remove the page from the level list */ + btr_level_list_remove(space, zip_size, page, mtr); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block); + ut_a(!merge_page_zip + || page_zip_validate(merge_page_zip, merge_page)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + + /* Free the file page */ + btr_page_free(index, block, mtr); + + ut_ad(btr_check_node_ptr(index, merge_block, mtr)); +} + +#ifdef UNIV_BTR_PRINT +/***************************************************************** +Prints size info of a B-tree. 
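/* Editorial sketch (not part of the patch): the doubly linked level list
that btr_level_list_remove() maintains in both btr_compress() and
btr_discard_page() above, with pointers in place of the on-page
FIL_PAGE_PREV/FIL_PAGE_NEXT page numbers. */

#include <stddef.h>

typedef struct lpage {
	struct lpage*	prev;
	struct lpage*	next;
} lpage;

/* Unlink `page` from its level, patching both neighbours' links. */
static void
toy_level_list_remove(lpage* page)
{
	if (page->prev != NULL) {
		page->prev->next = page->next;
	}
	if (page->next != NULL) {
		page->next->prev = page->prev;
	}
	page->prev = page->next = NULL;
}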
*/ +UNIV_INTERN +void +btr_print_size( +/*===========*/ + dict_index_t* index) /* in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!(index->type & DICT_UNIVERSAL)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/**************************************************************** +Prints recursively index tree pages. */ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: index page */ + ulint width, /* in: print this many entries from start + and end */ + mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */ + ulint** offsets,/* in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /* in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n", + (ulong) btr_page_get_level(page, mtr), + (ulong) buf_block_get_page_no(block)); + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets(node_ptr, index, *offsets, + ULINT_UNDEFINED, heap); + btr_print_recursive(index, + btr_node_ptr_get_child(node_ptr, + index, + *offsets, + &mtr2), + width, heap, offsets, &mtr2); + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/****************************************************************** +Prints directories and other info of all nodes in the tree. */ +UNIV_INTERN +void +btr_print_index( +/*============*/ + dict_index_t* index, /* in: index */ + ulint width) /* in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + btr_validate_index(index, NULL); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/**************************************************************** +Checks that the node pointer to a page is appropriate. 
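/* Editorial sketch (not part of the patch): the width limiting used by
btr_print_recursive() above; only the first and last `width` entries of a
node are printed (and descended into), the middle is skipped. */

#include <stdio.h>

static void
print_node_entries(const int* keys, int n, int width, int level)
{
	int	i;

	for (i = 0; i < n; i++) {
		if (i <= width || i >= n - width) {
			printf("level %d entry %d: key %d\n",
			       level, i, keys[i]);
			/* a real tree would recurse into child i here,
			each child in its own mini-transaction */
		}
	}
}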
*/ +UNIV_INTERN +ibool +btr_check_node_ptr( +/*===============*/ + /* out: TRUE */ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: index page */ + mtr_t* mtr) /* in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + ulint* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + if (dict_index_get_page(index) == buf_block_get_page_no(block)) { + + return(TRUE); + } + + heap = mem_heap_create(256); + offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, + &cursor); + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page, mtr)); + + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets)); +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**************************************************************** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /* in: index page */ + const rec_t* rec, /* in: index record */ + const dict_index_t* index) /* in: index */ +{ + fputs("InnoDB: Record in ", stderr); + dict_index_name_print(stderr, NULL, index); + fprintf(stderr, ", page %lu, at offset %lu\n", + page_get_page_no(page), (ulint) page_offset(rec)); +} + +/**************************************************************** +Checks the size and number of fields in a record based on the definition of +the index. */ +UNIV_INTERN +ibool +btr_index_rec_validate( +/*===================*/ + /* out: TRUE if ok */ + const rec_t* rec, /* in: index record */ + const dict_index_t* index, /* in: index */ + ibool dump_on_error) /* in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + ulint n; + ulint i; + const page_t* page; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + + if (UNIV_UNLIKELY((ibool)!!page_is_comp(page) + != dict_table_is_comp(index->table))) { + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n", + (ulong) !!page_is_comp(page), + (ulong) dict_table_is_comp(index->table)); + + return(FALSE); + } + + n = dict_index_get_n_fields(index); + + if (!page_is_comp(page) + && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) { + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n", + (ulong) rec_get_n_fields_old(rec), (ulong) n); + + if (dump_on_error) { + buf_page_print(page, 0); + + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + for (i = 0; i < n; i++) { + ulint fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(index, i)); + + rec_get_nth_field_offs(offsets, i, &len); + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + A prefix index of the column is of fixed, but different + length. 
When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if ((dict_index_get_nth_field(index, i)->prefix_len == 0 + && len != UNIV_SQL_NULL && fixed_size + && len != fixed_size) + || (dict_index_get_nth_field(index, i)->prefix_len > 0 + && len != UNIV_SQL_NULL + && len + > dict_index_get_nth_field(index, i)->prefix_len)) { + + btr_index_rec_validate_report(page, rec, index); + fprintf(stderr, + "InnoDB: field %lu len is %lu," + " should be %lu\n", + (ulong) i, (ulong) len, (ulong) fixed_size); + + if (dump_on_error) { + buf_page_print(page, 0); + + fputs("InnoDB: corrupt record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(FALSE); + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(TRUE); +} + +/**************************************************************** +Checks the size and number of fields in records based on the definition of +the index. */ +static +ibool +btr_index_page_validate( +/*====================*/ + /* out: TRUE if ok */ + buf_block_t* block, /* in: index page */ + dict_index_t* index) /* in: index */ +{ + page_cur_t cur; + ibool ret = TRUE; + + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + + for (;;) { + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + + return(FALSE); + } + + page_cur_move_to_next(&cur); + } + + return(ret); +} + +/**************************************************************** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + /* out: TRUE if ok */ + dict_index_t* index, /* in: index */ + ulint level, /* in: B-tree level */ + const buf_block_t* block) /* in: index page */ +{ + fprintf(stderr, "InnoDB: Error in page %lu of ", + buf_block_get_page_no(block)); + dict_index_name_print(stderr, NULL, index); + if (level) { + fprintf(stderr, ", index tree level %lu", level); + } + putc('\n', stderr); +} + +/**************************************************************** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + /* out: TRUE if ok */ + const dict_index_t* index, /* in: index */ + ulint level, /* in: B-tree level */ + const buf_block_t* block1, /* in: first index page */ + const buf_block_t* block2) /* in: second index page */ +{ + fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ", + buf_block_get_page_no(block1), + buf_block_get_page_no(block2)); + dict_index_name_print(stderr, NULL, index); + if (level) { + fprintf(stderr, ", index tree level %lu", level); + } + putc('\n', stderr); +} + +/**************************************************************** +Validates index tree level. 
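/* Editorial sketch (not part of the patch): the per-field length rule that
btr_index_rec_validate() above enforces, as one predicate. TOY_SQL_NULL
stands in for UNIV_SQL_NULL. */

#include <stddef.h>

#define TOY_SQL_NULL	((size_t) -1)

/* Nonzero if the stored length is consistent with the definition: a full
fixed-size column must have exactly the fixed length, a prefix column must
not exceed its prefix length, and NULL values pass both checks. */
static int
field_len_ok(size_t len, size_t fixed_size, size_t prefix_len)
{
	if (len == TOY_SQL_NULL) {
		return(1);
	}
	if (prefix_len == 0) {
		return(fixed_size == 0 || len == fixed_size);
	}
	return(len <= prefix_len);
}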
*/ +static +ibool +btr_validate_level( +/*===============*/ + /* out: TRUE if ok */ + dict_index_t* index, /* in: index tree */ + trx_t* trx, /* in: transaction or NULL */ + ulint level) /* in: level number */ +{ + ulint space; + ulint zip_size; + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + ulint right_page_no; + ulint left_page_no; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + ibool ret = TRUE; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + ulint* offsets = NULL; + ulint* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + + mtr_start(&mtr); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + block = btr_root_block_get(index, &mtr); + page = buf_block_get_frame(block); + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + + while (level != btr_page_get_level(page, &mtr)) { + const rec_t* node_ptr; + + ut_a(space == buf_block_get_space(block)); + ut_a(space == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(!page_is_leaf(page)); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + node_ptr = page_cur_get_rec(&cursor); + offsets = rec_get_offsets(node_ptr, index, offsets, + ULINT_UNDEFINED, &heap); + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr); + page = buf_block_get_frame(block); + } + + /* Now we are on the desired level. Loop through the pages on that + level. */ +loop: + if (trx_is_interrupted(trx)) { + mtr_commit(&mtr); + mem_heap_free(heap); + return(ret); + } + mem_heap_empty(heap); + offsets = offsets2 = NULL; + mtr_x_lock(dict_index_get_lock(index), &mtr); + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + /* Check ordering etc. of records */ + + if (!page_validate(page, index)) { + btr_validate_report1(index, level, block); + + ret = FALSE; + } else if (level == 0) { + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. 
*/ + + if (!btr_index_page_validate(block, index)) { + + ret = FALSE; + } + } + + ut_a(btr_page_get_level(page, &mtr) == level); + + right_page_no = btr_page_get_next(page, &mtr); + left_page_no = btr_page_get_prev(page, &mtr); + + ut_a(page_get_n_recs(page) > 0 || (level == 0 + && page_get_page_no(page) + == dict_index_get_page(index))); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + right_block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, &mtr); + right_page = buf_block_get_frame(right_block); + if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr) + != page_get_page_no(page))) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + + ret = FALSE; + } + + if (UNIV_UNLIKELY(page_is_comp(right_page) + != page_is_comp(page))) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: 'compact' flag mismatch\n", stderr); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + + ret = FALSE; + + goto node_ptr_fails; + } + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + right_rec = page_rec_get_next(page_get_infimum_rec( + right_page)); + offsets = rec_get_offsets(rec, index, + offsets, ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, + offsets2, ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec, + offsets, offsets2, + index) >= 0)) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + buf_page_print(page, 0); + buf_page_print(right_page, 0); + + fputs("InnoDB: record ", stderr); + rec = page_rec_get_prev(page_get_supremum_rec(page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + rec_print(stderr, rec, index); + putc('\n', stderr); + + ret = FALSE; + } + } + + if (level > 0 && left_page_no == FIL_NULL) { + ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page))); + } + + if (buf_block_get_page_no(block) != dict_index_get_page(index)) { + + /* Check father node pointers */ + + rec_t* node_ptr; + + offsets = btr_page_get_father_block(offsets, heap, index, + block, &mtr, &node_cur); + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + btr_cur_position( + index, page_rec_get_prev(page_get_supremum_rec(page)), + block, &node_cur); + offsets = btr_page_get_father_node_ptr(offsets, heap, + &node_cur, &mtr); + + if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur)) + || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, + offsets) + != buf_block_get_page_no(block))) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + buf_page_print(father_page, 0); + buf_page_print(page, 0); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %lu\n", + (ulong) btr_node_ptr_get_child_page_no( + rec, offsets)); + + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + ret = FALSE; + + goto node_ptr_fails; + } + + if (!page_is_leaf(page)) { + node_ptr_tuple = dict_index_build_node_ptr( + index, + 
page_rec_get_next(page_get_infimum_rec(page)), + 0, heap, btr_page_get_level(page, &mtr)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, + offsets)) { + const rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + btr_validate_report1(index, level, block); + + buf_page_print(father_page, 0); + buf_page_print(page, 0); + + fputs("InnoDB: Error: node ptrs differ" + " on levels > 0\n" + "InnoDB: node ptr ", stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + ret = FALSE; + + goto node_ptr_fails; + } + } + + if (left_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_next( + page_get_infimum_rec(father_page))); + ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL); + } + + if (right_page_no == FIL_NULL) { + ut_a(node_ptr == page_rec_get_prev( + page_get_supremum_rec(father_page))); + ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL); + } else { + const rec_t* right_node_ptr + = page_rec_get_next(node_ptr); + + offsets = btr_page_get_father_block( + offsets, heap, index, right_block, + &mtr, &right_node_cur); + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { + ret = FALSE; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print(father_page, 0); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + ret = FALSE; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print(father_page, 0); + buf_page_print(right_father_page, 0); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page, &mtr)) { + + ret = FALSE; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + + buf_page_print(father_page, 0); + buf_page_print(right_father_page, 0); + buf_page_print(page, 0); + buf_page_print(right_page, 0); + } + } + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr_commit(&mtr); + + if (right_page_no != FIL_NULL) { + mtr_start(&mtr); + + block = btr_block_get(space, zip_size, right_page_no, + RW_X_LATCH, &mtr); + page = buf_block_get_frame(block); + + goto loop; + } + + mem_heap_free(heap); + return(ret); +} + +/****************************************************************** +Checks the consistency of an index tree. 
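/* Editorial sketch (not part of the patch): the cross-page ordering test
of btr_validate_level() above, on a toy level list keyed by integers; in
the real code, cmp_rec_rec() >= 0 across a page boundary is the error. */

#include <stddef.h>

typedef struct vpage {
	struct vpage*	next;		/* right sibling, NULL at level end */
	int		first_key;	/* smallest user record key */
	int		last_key;	/* largest user record key */
} vpage;

/* Nonzero if keys strictly increase across every page boundary. */
static int
level_order_ok(const vpage* p)
{
	for (; p != NULL && p->next != NULL; p = p->next) {
		if (p->last_key >= p->next->first_key) {
			return(0);	/* records in wrong order */
		}
	}
	return(1);
}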
*/
+UNIV_INTERN
+ibool
+btr_validate_index(
+/*===============*/
+				/* out: TRUE if ok */
+	dict_index_t*	index,	/* in: index */
+	trx_t*		trx)	/* in: transaction or NULL */
+{
+	mtr_t	mtr;
+	page_t*	root;
+	ulint	i;
+	ulint	n;
+
+	mtr_start(&mtr);
+	mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+	root = btr_root_get(index, &mtr);
+	n = btr_page_get_level(root, &mtr);
+
+	for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
+		if (!btr_validate_level(index, trx, n - i)) {
+
+			mtr_commit(&mtr);
+
+			return(FALSE);
+		}
+	}
+
+	mtr_commit(&mtr);
+
+	return(TRUE);
+}
diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.c
new file mode 100644
index 00000000000..70cf06342aa
--- /dev/null
+++ b/storage/xtradb/btr/btr0cur.c
@@ -0,0 +1,4809 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The index tree cursor
+
+All changes that row operations make to a B-tree or the records
+there must go through this module! Undo log records are written here
+for every modify or insert of a clustered index record.
+
+			NOTE!!!
+To make sure we do not run out of disk space during a pessimistic
+insert or update, we have to reserve a number of pages in the tablespace
+equal to 2 x the height of the index tree before we start the operation,
+because if leaf splitting has been started, it is difficult to undo, except
+by crashing the database and doing a roll-forward.
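/* Editorial sketch (not part of the patch): the reservation rule stated
above, as arithmetic. A pessimistic operation may split a page on every
level of the tree, and the factor of two leaves headroom for the pages
those splits allocate. */

static unsigned long
pessimistic_page_reserve(unsigned long tree_height)
{
	return(2 * tree_height);
}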
+ +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" + +#ifdef UNIV_NONINL +#include "btr0cur.ic" +#endif + +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "trx0roll.h" /* trx_is_recv() */ +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" + +#ifdef UNIV_DEBUG +/* If the following is set to TRUE, this module prints a lot of +trace information of individual record operations */ +UNIV_INTERN ibool btr_cur_print_record_ops = FALSE; +#endif /* UNIV_DEBUG */ + +UNIV_INTERN ulint btr_cur_n_non_sea = 0; +UNIV_INTERN ulint btr_cur_n_sea = 0; +UNIV_INTERN ulint btr_cur_n_non_sea_old = 0; +UNIV_INTERN ulint btr_cur_n_sea_old = 0; + +/* In the optimistic insert, if the insert does not fit, but this much space +can be released by page reorganize, then it is reorganized */ + +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32) + +/* The structure of a BLOB part header */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /* BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /* next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 + +/* A BLOB field reference full of zero, for use in assertions and tests. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]; + +/*********************************************************************** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: record in a clustered index */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr); /* in: mtr, or NULL if not logged */ +/*********************************************************************** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /* in: cursor positioned on a page */ + ulint height, /* in: height of the page in tree; + 0 means leaf node */ + ulint root_height); /* in: root node height in tree */ +/*************************************************************** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. 
*/ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /* in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /* in: record */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + const upd_t* update, /* in: update vector */ + enum trx_rb_ctx rb_ctx, /* in: rollback context */ + mtr_t* mtr); /* in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/*************************************************************** +Frees the externally stored fields for a record. */ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /* in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + enum trx_rb_ctx rb_ctx, /* in: rollback context */ + mtr_t* mtr); /* in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +/*************************************************************** +Gets the externally stored size of a record, in units of a database page. */ +static +ulint +btr_rec_get_externally_stored_len( +/*==============================*/ + /* out: externally stored part, + in units of a database page */ + rec_t* rec, /* in: record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ + +/********************************************************** +The following function is used to set the deleted bit of a record. */ +UNIV_INLINE +void +btr_rec_set_deleted_flag( +/*=====================*/ + /* out: TRUE on success; + FALSE on page_zip overflow */ + rec_t* rec, /* in/out: physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page (or NULL) */ + ulint flag) /* in: nonzero if delete marked */ +{ + if (page_rec_is_comp(rec)) { + rec_set_deleted_flag_new(rec, page_zip, flag); + } else { + ut_ad(!page_zip); + rec_set_deleted_flag_old(rec, flag); + } +} + +/*==================== B-TREE SEARCH =========================*/ + +/************************************************************************ +Latches the leaf page or pages requested. */ +static +void +btr_cur_latch_leaves( +/*=================*/ + page_t* page, /* in: leaf page where the search + converged */ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number of the leaf */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr) /* in: mtr */ +{ + ulint mode; + ulint left_page_no; + ulint right_page_no; + buf_block_t* get_block; + + ut_ad(page && mtr); + + switch (latch_mode) { + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + mode = latch_mode == BTR_SEARCH_LEAF ? 
RW_S_LATCH : RW_X_LATCH;
+		get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+		get_block->check_index_page_at_flush = TRUE;
+		return;
+	case BTR_MODIFY_TREE:
+		/* x-latch also brothers from left to right */
+		left_page_no = btr_page_get_prev(page, mtr);
+
+		if (left_page_no != FIL_NULL) {
+			get_block = btr_block_get(space, zip_size,
+						  left_page_no,
+						  RW_X_LATCH, mtr);
+#ifdef UNIV_BTR_DEBUG
+			ut_a(page_is_comp(get_block->frame)
+			     == page_is_comp(page));
+			ut_a(btr_page_get_next(get_block->frame, mtr)
+			     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+			get_block->check_index_page_at_flush = TRUE;
+		}
+
+		get_block = btr_block_get(space, zip_size, page_no,
+					  RW_X_LATCH, mtr);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+		get_block->check_index_page_at_flush = TRUE;
+
+		right_page_no = btr_page_get_next(page, mtr);
+
+		if (right_page_no != FIL_NULL) {
+			get_block = btr_block_get(space, zip_size,
+						  right_page_no,
+						  RW_X_LATCH, mtr);
+#ifdef UNIV_BTR_DEBUG
+			ut_a(page_is_comp(get_block->frame)
+			     == page_is_comp(page));
+			ut_a(btr_page_get_prev(get_block->frame, mtr)
+			     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+			get_block->check_index_page_at_flush = TRUE;
+		}
+
+		return;
+
+	case BTR_SEARCH_PREV:
+	case BTR_MODIFY_PREV:
+		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
+		/* latch also left brother */
+		left_page_no = btr_page_get_prev(page, mtr);
+
+		if (left_page_no != FIL_NULL) {
+			get_block = btr_block_get(space, zip_size,
+						  left_page_no, mode, mtr);
+			cursor->left_block = get_block;
+#ifdef UNIV_BTR_DEBUG
+			ut_a(page_is_comp(get_block->frame)
+			     == page_is_comp(page));
+			ut_a(btr_page_get_next(get_block->frame, mtr)
+			     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+			get_block->check_index_page_at_flush = TRUE;
+		}
+
+		get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+		get_block->check_index_page_at_flush = TRUE;
+		return;
+	}
+
+	ut_error;
+}
+
+/************************************************************************
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value.
+
+If mode is PAGE_CUR_LE, cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+	dict_index_t*	index,	/* in: index */
+	ulint		level,	/* in: the tree level of search */
+	const dtuple_t*	tuple,	/* in: data tuple; NOTE: n_fields_cmp in
+				tuple must be set so that it cannot get
+				compared to the node ptr page number field! */
+	ulint		mode,	/* in: PAGE_CUR_L, ...;
+				Inserts should always be made using
+				PAGE_CUR_LE to search the position!
*/ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ..., ORed with + BTR_INSERT and BTR_ESTIMATE; + cursor->left_block is used to store a pointer + to the left neighbor page, in the cases + BTR_SEARCH_PREV and BTR_MODIFY_PREV; + NOTE that if has_search_latch + is != 0, we maybe do not have a latch set + on the cursor page, we assume + the caller uses his search latch + to protect the record! */ + btr_cur_t* cursor, /* in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ + ulint has_search_latch,/* in: info on the latch mode the + caller currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + mtr_t* mtr) /* in: mtr */ +{ + page_cur_t* page_cursor; + page_t* page; + buf_block_t* guess; + rec_t* node_ptr; + ulint page_no; + ulint space; + ulint up_match; + ulint up_bytes; + ulint low_match; + ulint low_bytes; + ulint height; + ulint savepoint; + ulint page_mode; + ulint insert_planned; + ulint estimate; + ulint ignore_sec_unique; + ulint root_height = 0; /* remove warning */ +#ifdef BTR_CUR_ADAPT + btr_search_t* info; +#endif + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + /* Currently, PAGE_CUR_LE is the only search mode used for searches + ending to upper levels */ + + ut_ad(level == 0 || mode == PAGE_CUR_LE); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(!dict_index_is_ibuf(index) || ibuf_inside()); + ut_ad(dtuple_check_typed(tuple)); + +#ifdef UNIV_DEBUG + cursor->up_match = ULINT_UNDEFINED; + cursor->low_match = ULINT_UNDEFINED; +#endif + insert_planned = latch_mode & BTR_INSERT; + estimate = latch_mode & BTR_ESTIMATE; + ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE; + latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE + | BTR_IGNORE_SEC_UNIQUE); + + ut_ad(!insert_planned || (mode == PAGE_CUR_LE)); + + cursor->flag = BTR_CUR_BINARY; + cursor->index = index; + +#ifndef BTR_CUR_ADAPT + guess = NULL; +#else + info = btr_search_get_info(index); + + guess = info->root_guess; + +#ifdef BTR_CUR_HASH_ADAPT + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +#endif + if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED + && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ + && !estimate +#ifdef PAGE_CUR_LE_OR_EXTENDS + && mode != PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + /* If !has_search_latch, we do a dirty read of + btr_search_enabled below, and btr_search_guess_on_hash() + will have to check it again. 
*/ + && UNIV_LIKELY(btr_search_enabled) + && btr_search_guess_on_hash(index, info, tuple, mode, + latch_mode, cursor, + has_search_latch, mtr)) { + + /* Search using the hash index succeeded */ + + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + btr_cur_n_sea++; + + return; + } +#endif /* BTR_CUR_HASH_ADAPT */ +#endif /* BTR_CUR_ADAPT */ + btr_cur_n_non_sea++; + + /* If the hash search did not succeed, do binary search down the + tree */ + + if (has_search_latch) { + /* Release possible search latch to obey latching order */ + rw_lock_s_unlock(&btr_search_latch); + } + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + savepoint = mtr_set_savepoint(mtr); + + if (latch_mode == BTR_MODIFY_TREE) { + mtr_x_lock(dict_index_get_lock(index), mtr); + + } else if (latch_mode == BTR_CONT_MODIFY_TREE) { + /* Do nothing */ + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + } else { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + + space = dict_index_get_space(index); + page_no = dict_index_get_page(index); + + up_match = 0; + up_bytes = 0; + low_match = 0; + low_bytes = 0; + + height = ULINT_UNDEFINED; + + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ + + switch (mode) { + case PAGE_CUR_GE: + page_mode = PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode = PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode = mode; + break; + } + + /* Loop and search until we arrive at the desired level */ + + for (;;) { + ulint zip_size; + buf_block_t* block; + ulint rw_latch; + ulint buf_mode; + + zip_size = dict_table_zip_size(index->table); + rw_latch = RW_NO_LATCH; + buf_mode = BUF_GET; + + if (height == 0 && latch_mode <= BTR_MODIFY_LEAF) { + + rw_latch = latch_mode; + + if (insert_planned + && ibuf_should_try(index, ignore_sec_unique)) { + + /* Try insert to the insert buffer if the + page is not in the buffer pool */ + + buf_mode = BUF_GET_IF_IN_POOL; + } + } + +retry_page_get: + block = buf_page_get_gen(space, zip_size, page_no, + rw_latch, guess, buf_mode, + __FILE__, __LINE__, mtr); + if (block == NULL) { + /* This must be a search to perform an insert; + try insert to the insert buffer */ + + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + ut_ad(insert_planned); + ut_ad(cursor->thr); + + if (ibuf_insert(tuple, index, space, zip_size, + page_no, cursor->thr)) { + /* Insertion to the insert buffer succeeded */ + cursor->flag = BTR_CUR_INSERT_TO_IBUF; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + goto func_exit; + } + + /* Insert to the insert buffer did not succeed: + retry page get */ + + buf_mode = BUF_GET; + + goto retry_page_get; + } + + page = buf_block_get_frame(block); + + block->check_index_page_at_flush = TRUE; + + if (rw_latch != RW_NO_LATCH) { +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* 
UNIV_ZIP_DEBUG */ + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + } + + ut_ad(0 == ut_dulint_cmp(index->id, + btr_page_get_index_id(page))); + + if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + root_height = height; + cursor->tree_height = root_height + 1; +#ifdef BTR_CUR_ADAPT + if (block != guess) { + info->root_guess = block; + } +#endif + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) { + + btr_cur_latch_leaves(page, space, zip_size, + page_no, latch_mode, + cursor, mtr); + } + + if ((latch_mode != BTR_MODIFY_TREE) + && (latch_mode != BTR_CONT_MODIFY_TREE)) { + + /* Release the tree s-latch */ + + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + + page_mode = mode; + } + + page_cur_search_with_match(block, index, tuple, page_mode, + &up_match, &up_bytes, + &low_match, &low_bytes, + page_cursor); + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + /* If this is the desired level, leave the loop */ + + ut_ad(height == btr_page_get_level( + page_cur_get_page(page_cursor), mtr)); + + if (level == height) { + + if (level > 0) { + /* x-latch the page */ + page = btr_page_get(space, zip_size, + page_no, RW_X_LATCH, mtr); + ut_a((ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + } + + break; + } + + ut_ad(height > 0); + + height--; + + guess = NULL; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (level == 0) { + cursor->low_match = low_match; + cursor->low_bytes = low_bytes; + cursor->up_match = up_match; + cursor->up_bytes = up_bytes; + +#ifdef BTR_CUR_ADAPT + /* We do a dirty read of btr_search_enabled here. We + will properly check btr_search_enabled again in + btr_search_build_page_hash_index() before building a + page hash index, while holding btr_search_latch. */ + if (UNIV_LIKELY(btr_search_enabled)) { + + btr_search_info_update(index, cursor); + } +#endif + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_GE); + ut_ad(cursor->up_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + ut_ad(cursor->low_match != ULINT_UNDEFINED + || mode != PAGE_CUR_LE); + } + +func_exit: + if (has_search_latch) { + + rw_lock_s_lock(&btr_search_latch); + } +} + +/********************************************************************* +Opens a cursor at either end of an index. 
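/* Editorial sketch (not part of the patch): the search-mode mapping used
on the non-leaf levels in the descent loop of btr_cur_search_to_nth_level()
above; GE becomes L and G becomes LE so that the descent follows the node
pointer whose subtree covers the tuple, and the original mode is applied
only on the leaf level. Toy enum, not the PAGE_CUR_* constants. */

typedef enum { CUR_G, CUR_GE, CUR_L, CUR_LE } toy_search_mode;

static toy_search_mode
nonleaf_mode(toy_search_mode leaf_mode)
{
	switch (leaf_mode) {
	case CUR_GE:
		return(CUR_L);
	case CUR_G:
		return(CUR_LE);
	default:
		return(leaf_mode);	/* L and LE are used unchanged */
	}
}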
*/ +UNIV_INTERN +void +btr_cur_open_at_index_side( +/*=======================*/ + ibool from_left, /* in: TRUE if open to the low end, + FALSE if to the high end */ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: latch mode */ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr) /* in: mtr */ +{ + page_cur_t* page_cursor; + ulint page_no; + ulint space; + ulint zip_size; + ulint height; + ulint root_height = 0; /* remove warning */ + rec_t* node_ptr; + ulint estimate; + ulint savepoint; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + estimate = latch_mode & BTR_ESTIMATE; + latch_mode = latch_mode & ~BTR_ESTIMATE; + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + savepoint = mtr_set_savepoint(mtr); + + if (latch_mode == BTR_MODIFY_TREE) { + mtr_x_lock(dict_index_get_lock(index), mtr); + } else { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + page_no = dict_index_get_page(index); + + height = ULINT_UNDEFINED; + + for (;;) { + buf_block_t* block; + page_t* page; + block = buf_page_get_gen(space, zip_size, page_no, + RW_NO_LATCH, NULL, BUF_GET, + __FILE__, __LINE__, mtr); + page = buf_block_get_frame(block); + ut_ad(0 == ut_dulint_cmp(index->id, + btr_page_get_index_id(page))); + + block->check_index_page_at_flush = TRUE; + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + root_height = height; + } + + if (height == 0) { + btr_cur_latch_leaves(page, space, zip_size, page_no, + latch_mode, cursor, mtr); + + /* In versions <= 3.23.52 we had forgotten to + release the tree latch here. If in an index scan + we had to scan far to find a record visible to the + current transaction, that could starve others + waiting for the tree latch. */ + + if ((latch_mode != BTR_MODIFY_TREE) + && (latch_mode != BTR_CONT_MODIFY_TREE)) { + + /* Release the tree s-latch */ + + mtr_release_s_latch_at_savepoint( + mtr, savepoint, + dict_index_get_lock(index)); + } + } + + if (from_left) { + page_cur_set_before_first(block, page_cursor); + } else { + page_cur_set_after_last(block, page_cursor); + } + + if (height == 0) { + if (estimate) { + btr_cur_add_path_info(cursor, height, + root_height); + } + + break; + } + + ut_ad(height > 0); + + if (from_left) { + page_cur_move_to_next(page_cursor); + } else { + page_cur_move_to_prev(page_cursor); + } + + if (estimate) { + btr_cur_add_path_info(cursor, height, root_height); + } + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/************************************************************************** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INTERN +void +btr_cur_open_at_rnd_pos( +/*====================*/ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... 
*/ + btr_cur_t* cursor, /* in/out: B-tree cursor */ + mtr_t* mtr) /* in: mtr */ +{ + page_cur_t* page_cursor; + ulint page_no; + ulint space; + ulint zip_size; + ulint height; + rec_t* node_ptr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + if (latch_mode == BTR_MODIFY_TREE) { + mtr_x_lock(dict_index_get_lock(index), mtr); + } else { + mtr_s_lock(dict_index_get_lock(index), mtr); + } + + page_cursor = btr_cur_get_page_cur(cursor); + cursor->index = index; + + space = dict_index_get_space(index); + zip_size = dict_table_zip_size(index->table); + page_no = dict_index_get_page(index); + + height = ULINT_UNDEFINED; + + for (;;) { + buf_block_t* block; + page_t* page; + + block = buf_page_get_gen(space, zip_size, page_no, + RW_NO_LATCH, NULL, BUF_GET, + __FILE__, __LINE__, mtr); + page = buf_block_get_frame(block); + ut_ad(0 == ut_dulint_cmp(index->id, + btr_page_get_index_id(page))); + + if (height == ULINT_UNDEFINED) { + /* We are in the root node */ + + height = btr_page_get_level(page, mtr); + } + + if (height == 0) { + btr_cur_latch_leaves(page, space, zip_size, page_no, + latch_mode, cursor, mtr); + } + + page_cur_open_on_rnd_user_rec(block, page_cursor); + + if (height == 0) { + + break; + } + + ut_ad(height > 0); + + height--; + + node_ptr = page_cur_get_rec(page_cursor); + offsets = rec_get_offsets(node_ptr, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*==================== B-TREE INSERT =========================*/ + +/***************************************************************** +Inserts a record if there is enough space, or if enough space can +be freed by reorganizing. Differs from btr_cur_optimistic_insert because +no heuristics is applied to whether it pays to use CPU time for +reorganizing the page or not. */ +static +rec_t* +btr_cur_insert_if_possible( +/*=======================*/ + /* out: pointer to inserted record if succeed, + else NULL */ + btr_cur_t* cursor, /* in: cursor on page after which to insert; + cursor stays valid */ + const dtuple_t* tuple, /* in: tuple to insert; the size info need not + have been stored to tuple */ + ulint n_ext, /* in: number of externally stored columns */ + mtr_t* mtr) /* in: mtr */ +{ + page_cur_t* page_cursor; + buf_block_t* block; + rec_t* rec; + + ut_ad(dtuple_check_typed(tuple)); + + block = btr_cur_get_block(cursor); + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, n_ext, mtr); + + if (UNIV_UNLIKELY(!rec)) { + /* If record did not fit, reorganize */ + + if (btr_page_reorganize(block, cursor->index, mtr)) { + + page_cur_search(block, cursor->index, tuple, + PAGE_CUR_LE, page_cursor); + + rec = page_cur_tuple_insert(page_cursor, tuple, + cursor->index, n_ext, mtr); + } + } + + return(rec); +} + +/***************************************************************** +For an insert, checks the locks and does the undo logging if desired. 
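/* Editorial sketch (not part of the patch): the try/reorganize/retry shape
of btr_cur_insert_if_possible() above, on a toy page with a byte budget;
exactly one reorganize, no further heuristics. */

#include <stddef.h>

typedef struct toy_pg {
	size_t	capacity;
	size_t	used;
	size_t	garbage;	/* reclaimable by a reorganize */
} toy_pg;

static int
toy_try_insert(toy_pg* pg, size_t rec_size)
{
	if (rec_size > pg->capacity - pg->used - pg->garbage) {
		return(0);	/* does not fit in the free area */
	}
	pg->used += rec_size;
	return(1);
}

static void
toy_reorganize(toy_pg* pg)
{
	pg->garbage = 0;	/* defragment: reclaim deleted space */
}

static int
toy_insert_if_possible(toy_pg* pg, size_t rec_size)
{
	if (toy_try_insert(pg, rec_size)) {
		return(1);
	}
	toy_reorganize(pg);
	return(toy_try_insert(pg, rec_size));
}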
*/ +UNIV_INLINE +ulint +btr_cur_ins_lock_and_undo( +/*======================*/ + /* out: DB_SUCCESS, DB_WAIT_LOCK, + DB_FAIL, or error number */ + ulint flags, /* in: undo logging and locking flags: if + not zero, the parameters index and thr + should be specified */ + btr_cur_t* cursor, /* in: cursor on page after which to insert */ + const dtuple_t* entry, /* in: entry to insert */ + que_thr_t* thr, /* in: query thread or NULL */ + ibool* inherit)/* out: TRUE if the inserted new record maybe + should inherit LOCK_GAP type locks from the + successor record */ +{ + dict_index_t* index; + ulint err; + rec_t* rec; + dulint roll_ptr; + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + err = lock_rec_insert_check_and_lock(flags, rec, + btr_cur_get_block(cursor), + index, thr, inherit); + + if (err != DB_SUCCESS) { + + return(err); + } + + if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) { + + err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP, + thr, index, entry, + NULL, 0, NULL, + &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + /* Now we can fill in the roll ptr field in entry */ + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + + row_upd_index_entry_sys_field(entry, index, + DATA_ROLL_PTR, roll_ptr); + } + } + + return(DB_SUCCESS); +} + +#ifdef UNIV_DEBUG +/***************************************************************** +Report information about a transaction. */ +static +void +btr_cur_trx_report( +/*===============*/ + trx_t* trx, /* in: transaction */ + const dict_index_t* index, /* in: index */ + const char* op) /* in: operation */ +{ + fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", + TRX_ID_PREP_PRINTF(trx->id)); + fputs(op, stderr); + dict_index_name_print(stderr, trx, index); + putc('\n', stderr); +} +#endif /* UNIV_DEBUG */ + +/***************************************************************** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. 
*/ +UNIV_INTERN +ulint +btr_cur_optimistic_insert( +/*======================*/ + /* out: DB_SUCCESS, DB_WAIT_LOCK, + DB_FAIL, or error number */ + ulint flags, /* in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /* in: cursor on page after which to insert; + cursor stays valid */ + dtuple_t* entry, /* in/out: entry to insert */ + rec_t** rec, /* out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /* in: number of externally stored columns */ + que_thr_t* thr, /* in: query thread or NULL */ + mtr_t* mtr) /* in: mtr; if this function returns + DB_SUCCESS on a leaf page of a secondary + index in a compressed tablespace, the + mtr must be committed before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + ulint max_size; + rec_t* dummy_rec; + ibool leaf; + ibool reorg; + ibool inherit; + ulint zip_size; + ulint rec_size; + mem_heap_t* heap = NULL; + ulint err; + + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = cursor->index; + zip_size = buf_block_get_zip_size(block); +#ifdef UNIV_DEBUG_VALGRIND + if (zip_size) { + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + if (!dtuple_check_typed_no_assert(entry)) { + fputs("InnoDB: Error in a tuple to insert into ", stderr); + dict_index_name_print(stderr, thr_get_trx(thr), index); + } +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), index, "insert into "); + dtuple_print(stderr, entry); + } +#endif /* UNIV_DEBUG */ + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + max_size = page_get_max_insert_size_after_reorganize(page, 1); + leaf = page_is_leaf(page); + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), zip_size)) { + + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (UNIV_UNLIKELY(zip_size)) { + /* Estimate the free space of an empty compressed page. + Subtract one byte for the encoded heap_no in the + modification log. */ + ulint free_space_zip = page_zip_empty_size( + cursor->index->n_fields, zip_size) - 1; + ulint n_uniq = dict_index_get_n_unique_in_tree(index); + + ut_ad(dict_table_is_comp(index->table)); + + /* There should be enough room for two node pointer + records on an empty non-leaf page. This prevents + infinite page splits. */ + + if (UNIV_LIKELY(entry->n_fields >= n_uniq) + && UNIV_UNLIKELY(REC_NODE_PTR_SIZE + + rec_get_converted_size_comp_prefix( + index, entry->fields, n_uniq, + NULL) + /* On a compressed page, there is + a two-byte entry in the dense + page directory for every record. + But there is no record header. 
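+This is why REC_N_NEW_EXTRA_BYTES - 2 is subtracted in the size estimate below.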
*/ + - (REC_N_NEW_EXTRA_BYTES - 2) + > free_space_zip / 2)) { + + if (big_rec_vec) { + dtuple_convert_back_big_rec( + index, entry, big_rec_vec); + } + + if (heap) { + mem_heap_free(heap); + } + + return(DB_TOO_BIG_RECORD); + } + } + + /* If there have been many consecutive inserts, and we are on the leaf + level, check if we have to split the page to reserve enough free space + for future updates of records. */ + + if (dict_index_is_clust(index) + && (page_get_n_recs(page) >= 2) + && UNIV_LIKELY(leaf) + && (dict_index_get_space_reserve() + rec_size > max_size) + && (btr_page_get_split_rec_to_right(cursor, &dummy_rec) + || btr_page_get_split_rec_to_left(cursor, &dummy_rec))) { +fail: + err = DB_FAIL; +fail_err: + + if (big_rec_vec) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(err); + } + + if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + || max_size < rec_size) + && UNIV_LIKELY(page_get_n_recs(page) > 1) + && page_get_max_insert_size(page, 1) < rec_size) { + + goto fail; + } + + /* Check locks and write to the undo log, if specified */ + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &inherit); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + goto fail_err; + } + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + + { + const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor); + *rec = page_cur_tuple_insert(page_cursor, entry, index, + n_ext, mtr); + reorg = page_cursor_rec != page_cur_get_rec(page_cursor); + + if (UNIV_UNLIKELY(reorg)) { + ut_a(zip_size); + ut_a(*rec); + } + } + + if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) { + /* If the record did not fit, reorganize */ + if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) { + ut_a(zip_size); + + goto fail; + } + + ut_ad(zip_size + || page_get_max_insert_size(page, 1) == max_size); + + reorg = TRUE; + + page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor); + + *rec = page_cur_tuple_insert(page_cursor, entry, index, + n_ext, mtr); + + if (UNIV_UNLIKELY(!*rec)) { + if (UNIV_LIKELY(zip_size != 0)) { + + goto fail; + } + + fputs("InnoDB: Error: cannot insert tuple ", stderr); + dtuple_print(stderr, entry); + fputs(" into ", stderr); + dict_index_name_print(stderr, thr_get_trx(thr), index); + fprintf(stderr, "\nInnoDB: max insert size %lu\n", + (ulong) max_size); + ut_error; + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) { + btr_search_update_hash_node_on_insert(cursor); + } else { + btr_search_update_hash_on_insert(cursor); + } +#endif + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + +#if 0 + fprintf(stderr, "Insert into page %lu, max ins size %lu," + " rec %lu ind type %lu\n", + buf_block_get_page_no(block), max_size, + rec_size + PAGE_DIR_SLOT_SIZE, index->type); +#endif + if (leaf + && !dict_index_is_clust(index) + && !dict_index_is_ibuf(index)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. 
*/ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (zip_size) { + /* Update the bits in the same mini-transaction. */ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/***************************************************************** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. */ +UNIV_INTERN +ulint +btr_cur_pessimistic_insert( +/*=======================*/ + /* out: DB_SUCCESS or error number */ + ulint flags, /* in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /* in: cursor after which to insert; + cursor stays valid */ + dtuple_t* entry, /* in/out: entry to insert */ + rec_t** rec, /* out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /* in: number of externally stored columns */ + que_thr_t* thr, /* in: query thread or NULL */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index = cursor->index; + ulint zip_size = dict_table_zip_size(index->table); + big_rec_t* big_rec_vec = NULL; + mem_heap_t* heap = NULL; + ulint err; + ibool dummy_inh; + ibool success; + ulint n_extents = 0; + ulint n_reserved; + + ut_ad(dtuple_check_typed(entry)); + + *big_rec = NULL; + + ut_ad(mtr_memo_contains(mtr, + dict_index_get_lock(btr_cur_get_index(cursor)), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + /* Try first an optimistic insert; reset the cursor flag: we do not + assume anything of how it was positioned */ + + cursor->flag = BTR_CUR_BINARY; + + err = btr_cur_optimistic_insert(flags, cursor, entry, rec, + big_rec, n_ext, thr, mtr); + if (err != DB_FAIL) { + + return(err); + } + + /* Retry with a pessimistic insert. Check locks and write to undo log, + if specified */ + + err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &dummy_inh); + + if (err != DB_SUCCESS) { + + return(err); + } + + if (!(flags & BTR_NO_UNDO_LOG_FLAG)) { + /* First reserve enough free space for the file segments + of the index tree, so that the insert will not fail because + of lack of space */ + + n_extents = cursor->tree_height / 16 + 3; + + success = fsp_reserve_free_extents(&n_reserved, index->space, + n_extents, FSP_NORMAL, mtr); + if (!success) { + return(DB_OUT_OF_FILE_SPACE); + } + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + dict_table_is_comp(index->table), + dict_index_get_n_fields(index), + zip_size)) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. 
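+In debug builds an assertion fails here; in release builds the earlier big-rec conversion is undone so that it can be redone consistently below.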
*/ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext); + + if (big_rec_vec == NULL) { + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, + n_reserved); + } + return(DB_TOO_BIG_RECORD); + } + } + + if (dict_index_get_page(index) + == buf_block_get_page_no(btr_cur_get_block(cursor))) { + + /* The page is the root page */ + *rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr); + } else { + *rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec); + +#ifdef BTR_CUR_ADAPT + btr_search_update_hash_on_insert(cursor); +#endif + if (!(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*==================== B-TREE UPDATE =========================*/ + +/***************************************************************** +For an update, checks the locks and does the undo logging. */ +UNIV_INLINE +ulint +btr_cur_upd_lock_and_undo( +/*======================*/ + /* out: DB_SUCCESS, DB_WAIT_LOCK, or error + number */ + ulint flags, /* in: undo logging and locking flags */ + btr_cur_t* cursor, /* in: cursor on record to update */ + const upd_t* update, /* in: update vector */ + ulint cmpl_info,/* in: compiler info on secondary index + updates */ + que_thr_t* thr, /* in: query thread */ + dulint* roll_ptr)/* out: roll pointer */ +{ + dict_index_t* index; + rec_t* rec; + ulint err; + + ut_ad(cursor && update && thr && roll_ptr); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + + if (!dict_index_is_clust(index)) { + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + err = DB_SUCCESS; + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + err = lock_clust_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, index, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), thr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (err != DB_SUCCESS) { + + return(err); + } + } + + /* Append the info about the update in the undo log */ + + err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr, + index, NULL, update, + cmpl_info, rec, roll_ptr); + return(err); +} + +/*************************************************************** +Writes a redo log record of updating a record in-place. */ +UNIV_INLINE +void +btr_cur_update_in_place_log( +/*========================*/ + ulint flags, /* in: flags */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: index where cursor positioned */ + const upd_t* update, /* in: update vector */ + trx_t* trx, /* in: transaction */ + dulint roll_ptr, /* in: roll ptr */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + page_t* page = page_align(rec); + ut_ad(flags < 256); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page) + ? 
MLOG_COMP_REC_UPDATE_IN_PLACE + : MLOG_REC_UPDATE_IN_PLACE, + 1 + DATA_ROLL_PTR_LEN + 14 + 2 + + MLOG_BUF_MARGIN); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } + + /* The code below assumes index is a clustered index: change index to + the clustered index if we are updating a secondary index record (or we + could as well skip writing the sys col values to the log in this case + because they are not needed for a secondary index record update) */ + + index = dict_table_get_first_index(index->table); + + mach_write_to_1(log_ptr, flags); + log_ptr++; + + log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, + mtr); + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + row_upd_index_write_log(update, log_ptr, mtr); +} + +/*************************************************************** +Parses a redo log record of updating a record in-place. */ +UNIV_INTERN +byte* +btr_cur_parse_update_in_place( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dict_index_t* index) /* in: index corresponding to page */ +{ + ulint flags; + rec_t* rec; + upd_t* update; + ulint pos; + dulint trx_id; + dulint roll_ptr; + ulint rec_offset; + mem_heap_t* heap; + ulint* offsets; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + flags = mach_read_from_1(ptr); + ptr++; + + ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + rec_offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(rec_offset <= UNIV_PAGE_SIZE); + + heap = mem_heap_create(256); + + ptr = row_upd_index_parse(ptr, end_ptr, heap, &update); + + if (!ptr || !page) { + + goto func_exit; + } + + ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); + rec = page + rec_offset; + + /* We do not need to reserve btr_search_latch, as the page is only + being recovered, and there cannot be a hash index to it. */ + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets, + pos, trx_id, roll_ptr); + } + + row_upd_rec_in_place(rec, index, offsets, update, page_zip); + +func_exit: + mem_heap_free(heap); + + return(ptr); +} + +/***************************************************************** +See if there is enough place in the page modification log to log +an update-in-place. */ +static +ibool +btr_cur_update_alloc_zip( +/*=====================*/ + /* out: TRUE if enough place */ + page_zip_des_t* page_zip,/* in/out: compressed page */ + buf_block_t* block, /* in/out: buffer page */ + dict_index_t* index, /* in: the index corresponding to the block */ + ulint length, /* in: size needed */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ut_a(page_zip == buf_block_get_page_zip(block)); + ut_ad(page_zip); + ut_ad(!dict_index_is_ibuf(index)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, 0)) { + return(TRUE); + } + + if (!page_zip->m_nonempty) { + /* The page has been freshly compressed, so + recompressing it will not help. 
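+Recompression is attempted below only when the modification log is non-empty, because page_zip_compress() rebuilds the compressed image from the uncompressed frame and thereby empties the log.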
*/ + return(FALSE); + } + + if (!page_zip_compress(page_zip, buf_block_get_frame(block), + index, mtr)) { + /* Unable to compress the page */ + return(FALSE); + } + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the page_zip_compress() above did not reduce + the free space available on the page. */ + + if (!page_zip_available(page_zip, dict_index_is_clust(index), + length, 0)) { + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + ibuf_reset_free_bits(block); + } + return(FALSE); + } + + return(TRUE); +} + +/***************************************************************** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. */ +UNIV_INTERN +ulint +btr_cur_update_in_place( +/*====================*/ + /* out: DB_SUCCESS or error number */ + ulint flags, /* in: undo logging and locking flags */ + btr_cur_t* cursor, /* in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + const upd_t* update, /* in: update vector */ + ulint cmpl_info,/* in: compiler info on secondary index + updates */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ +{ + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + ulint err; + rec_t* rec; + dulint roll_ptr = ut_dulint_zero; + trx_t* trx; + ulint was_delete_marked; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + + trx = thr_get_trx(thr); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(trx, index, "update "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + + /* Check that enough space is available on the compressed page. 
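+If there is not, btr_cur_update_alloc_zip() fails and DB_ZIP_OVERFLOW is returned, so that the caller can fall back to a pessimistic code path.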
*/ + if (UNIV_LIKELY_NULL(page_zip) + && !btr_cur_update_alloc_zip(page_zip, block, index, + rec_offs_size(offsets), mtr)) { + return(DB_ZIP_OVERFLOW); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + thr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); + } + + if (block->is_hashed) { + /* The function row_upd_changes_ord_field_binary works only + if the update vector was built for a clustered index, we must + NOT call it if index is secondary */ + + if (!dict_index_is_clust(index) + || row_upd_changes_ord_field_binary(NULL, index, update)) { + + /* Remove possible hash index pointer to this record */ + btr_search_update_hash_on_delete(cursor); + } + + rw_lock_x_lock(&btr_search_latch); + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields(rec, NULL, + index, offsets, trx, roll_ptr); + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + + row_upd_rec_in_place(rec, index, offsets, update, page_zip); + + if (block->is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } + + if (page_zip && !dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_zip(block, mtr); + } + + btr_cur_update_in_place_log(flags, rec, index, update, + trx, roll_ptr, mtr); + + if (was_delete_marked + && !rec_get_deleted_flag(rec, page_is_comp( + buf_block_get_frame(block)))) { + /* The new updated record owns its possible externally + stored fields */ + + btr_cur_unmark_extern_fields(page_zip, + rec, index, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(DB_SUCCESS); +} + +/***************************************************************** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. 
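+The simple case in which no updated field changes size and none is stored externally is delegated to btr_cur_update_in_place(); externally stored fields always force the pessimistic path.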
*/ +UNIV_INTERN +ulint +btr_cur_optimistic_update( +/*======================*/ + /* out: DB_SUCCESS, or DB_OVERFLOW if the + updated record does not fit, DB_UNDERFLOW + if the page would become too empty, or + DB_ZIP_OVERFLOW if there is not enough + space left on the compressed page */ + ulint flags, /* in: undo logging and locking flags */ + btr_cur_t* cursor, /* in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + const upd_t* update, /* in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/* in: compiler info on secondary index + updates */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + ulint err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + rec_t* orig_rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + dtuple_t* new_entry; + dulint roll_ptr; + trx_t* trx; + mem_heap_t* heap; + ulint i; + ulint n_ext; + ulint* offsets; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + orig_rec = rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + + heap = mem_heap_create(1024); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), index, "update "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + if (!row_upd_changes_field_size_or_external(index, offsets, update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + mem_heap_free(heap); + return(btr_cur_update_in_place(flags, cursor, update, + cmpl_info, thr, mtr)); + } + + if (rec_offs_any_extern(offsets)) { +any_extern: + /* Externally stored fields are treated in pessimistic + update */ + + mem_heap_free(heap); + return(DB_OVERFLOW); + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + page_cursor = btr_cur_get_page_cur(cursor); + + new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, + &n_ext, heap); + /* We checked above that there are no externally stored fields. */ + ut_a(!n_ext); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. 
*/ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + FALSE, heap); + old_rec_size = rec_offs_size(offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_LIKELY_NULL(page_zip) + && !btr_cur_update_alloc_zip(page_zip, block, index, + new_rec_size, mtr)) { + err = DB_ZIP_OVERFLOW; + goto err_exit; + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + + err = DB_OVERFLOW; + goto err_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT)) { + + /* The page would become too empty */ + + err = DB_UNDERFLOW; + goto err_exit; + } + + max_size = old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1); + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto err_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, thr, + &roll_ptr); + if (err != DB_SUCCESS) { +err_exit: + mem_heap_free(heap); + return(err); + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + + lock_rec_store_on_page_infimum(block, rec); + + btr_search_update_hash_on_delete(cursor); + + /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above + invokes rec_offs_make_valid() to point to the copied record that + the fields of new_entry point to. We have to undo it here. */ + ut_ad(rec_offs_validate(NULL, index, offsets)); + rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets); + + page_cur_delete_rec(page_cursor, index, offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + trx = thr_get_trx(thr); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, + roll_ptr); + row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, + trx->id); + } + + /* There are no externally stored columns in new_entry */ + rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr); + ut_a(rec); /* <- We calculated above the insert would fit */ + + if (page_zip && !dict_index_is_clust(index) + && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_zip(block, mtr); + } + + /* Restore the old explicit lock state on the record */ + + lock_rec_restore_from_page_infimum(block, rec, block); + + page_cur_move_to_next(page_cursor); + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/***************************************************************** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. 
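+If the updated record is not the first user record on its page, no supremum can have inherited its locks in the split, and the function returns without doing anything.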
*/
+static
+void
+btr_cur_pess_upd_restore_supremum(
+/*==============================*/
+ buf_block_t* block, /* in: buffer block of rec */
+ const rec_t* rec, /* in: updated record */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_t* page;
+ buf_block_t* prev_block;
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+
+ page = buf_block_get_frame(block);
+
+ if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
+ /* Updated record is not the first user record on its page */
+
+ return;
+ }
+
+ space = buf_block_get_space(block);
+ zip_size = buf_block_get_zip_size(block);
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ ut_ad(prev_page_no != FIL_NULL);
+ prev_block = buf_page_get_with_no_latch(space, zip_size,
+ prev_page_no, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(prev_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ /* We must already have an x-latch on prev_block! */
+ ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
+
+ lock_rec_reset_and_inherit_gap_locks(prev_block, block,
+ PAGE_HEAP_NO_SUPREMUM,
+ page_rec_get_heap_no(rec));
+}
+
+/*****************************************************************
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist. We assume
+here that the ordering fields of the record do not change. */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_update(
+/*=======================*/
+ /* out: DB_SUCCESS or error code */
+ ulint flags, /* in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /* in: cursor on the record to update */
+ mem_heap_t** heap, /* in/out: pointer to memory heap, or NULL */
+ big_rec_t** big_rec,/* out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
+ const upd_t* update, /* in: update vector; this is also allowed
+ to contain trx id and roll ptr fields, but
+ the values in update vector have no effect */
+ ulint cmpl_info,/* in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /* in: query thread */
+ mtr_t* mtr) /* in: mtr; must be committed before
+ latching any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ big_rec_t* dummy_big_rec;
+ dict_index_t* index;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ page_cur_t* page_cursor;
+ dtuple_t* new_entry;
+ ulint err;
+ ulint optim_err;
+ dulint roll_ptr;
+ trx_t* trx;
+ ibool was_first;
+ ulint n_extents = 0;
+ ulint n_reserved;
+ ulint n_ext;
+ ulint* offsets = NULL;
+
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ /* The insert buffer tree should never be updated in place.
*/ + ut_ad(!dict_index_is_ibuf(index)); + + optim_err = btr_cur_optimistic_update(flags, cursor, update, + cmpl_info, thr, mtr); + + switch (optim_err) { + case DB_UNDERFLOW: + case DB_OVERFLOW: + case DB_ZIP_OVERFLOW: + break; + default: + return(optim_err); + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, + thr, &roll_ptr); + if (err != DB_SUCCESS) { + + return(err); + } + + if (optim_err == DB_OVERFLOW) { + ulint reserve_flag; + + /* First reserve enough free space for the file segments + of the index tree, so that the update will not fail because + of lack of space */ + + n_extents = cursor->tree_height / 16 + 3; + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + reserve_flag = FSP_CLEANING; + } else { + reserve_flag = FSP_NORMAL; + } + + if (!fsp_reserve_free_extents(&n_reserved, index->space, + n_extents, reserve_flag, mtr)) { + return(DB_OUT_OF_FILE_SPACE); + } + } + + if (!*heap) { + *heap = mem_heap_create(1024); + } + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap); + + trx = thr_get_trx(thr); + + new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, + &n_ext, *heap); + /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above + invokes rec_offs_make_valid() to point to the copied record that + the fields of new_entry point to. We have to undo it here. */ + ut_ad(rec_offs_validate(NULL, index, offsets)); + rec_offs_make_valid(rec, index, offsets); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. If the + clustered index record is delete-marked, then its externally + stored fields cannot have been purged yet, because then the + purge would also have removed the clustered index record + itself. Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + FALSE, *heap); + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, + roll_ptr); + row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, + trx->id); + } + + if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update, if they are not + inherited values. They can be inherited if we have + updated the primary key to another value, and then + update it back again. */ + + ut_ad(big_rec_vec == NULL); + + btr_rec_free_updated_extern_fields( + index, rec, page_zip, offsets, update, + trx_is_recv(trx) ? 
RB_RECOVERY : RB_NORMAL, mtr); + } + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec)); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap); + n_ext += btr_push_update_extern_fields(new_entry, update, *heap); + + if (UNIV_LIKELY_NULL(page_zip)) { + ut_ad(page_is_comp(page)); + if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + TRUE, + dict_index_get_n_fields(index), + page_zip_get_size(page_zip))) { + + goto make_external; + } + } else if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(page), 0, 0)) { +make_external: + big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext); + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + err = DB_TOO_BIG_RECORD; + goto return_after_reservations; + } + } + + /* Store state of explicit locks on rec on the page infimum record, + before deleting rec. The page infimum acts as a dummy carrier of the + locks, taking care also of lock releases, before we can move the locks + back on the actual record. There is a special case: if we are + inserting on the root page and the insert causes a call of + btr_root_raise_and_insert. Therefore we cannot in the lock system + delete the lock structs set on the root page even if the root + page carries just node pointers. */ + + lock_rec_store_on_page_infimum(block, rec); + + btr_search_update_hash_on_delete(cursor); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_delete_rec(page_cursor, index, offsets, mtr); + + page_cur_move_to_prev(page_cursor); + + rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr); + + if (rec) { + lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), + rec, block); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, heap); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* The new inserted record owns its possible externally + stored fields */ + btr_cur_unmark_extern_fields(page_zip, + rec, index, offsets, mtr); + } + + btr_cur_compress_if_useful(cursor, mtr); + + if (page_zip && !dict_index_is_clust(index) + && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_zip(block, mtr); + } + + err = DB_SUCCESS; + goto return_after_reservations; + } else { + ut_a(optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) + && page_is_leaf(page)) { + ibuf_reset_free_bits(block); + } + } + + /* Was the record to be updated positioned as the first user + record on its page? 
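+The answer, stored in was_first below, decides whether the supremum fix-up of btr_cur_pess_upd_restore_supremum() must be applied after the insert: it is needed only when the record was not the first one.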
*/ + was_first = page_cur_is_before_first(page_cursor); + + /* The first parameter means that no lock checking and undo logging + is made in the insert */ + + err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG, + cursor, new_entry, &rec, + &dummy_big_rec, n_ext, NULL, mtr); + ut_a(rec); + ut_a(err == DB_SUCCESS); + ut_a(dummy_big_rec == NULL); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) { + /* The new inserted record owns its possible externally + stored fields */ + buf_block_t* rec_block = btr_cur_get_block(cursor); + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); + page = buf_block_get_frame(rec_block); +#endif /* UNIV_ZIP_DEBUG */ + page_zip = buf_block_get_page_zip(rec_block); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, heap); + btr_cur_unmark_extern_fields(page_zip, + rec, index, offsets, mtr); + } + + lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), + rec, block); + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. */ + + if (!was_first) { + btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor), + rec, mtr); + } + +return_after_reservations: +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + *big_rec = big_rec_vec; + + return(err); +} + +/*==================== B-TREE DELETE MARK AND UNMARK ===============*/ + +/******************************************************************** +Writes the redo log record for delete marking or unmarking of an index +record. */ +UNIV_INLINE +void +btr_cur_del_mark_set_clust_rec_log( +/*===============================*/ + ulint flags, /* in: flags */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: index of the record */ + ibool val, /* in: value to set */ + trx_t* trx, /* in: deleting transaction */ + dulint roll_ptr,/* in: roll ptr to the undo log record */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + ut_ad(flags < 256); + ut_ad(val <= 1); + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? MLOG_COMP_REC_CLUST_DELETE_MARK + : MLOG_REC_CLUST_DELETE_MARK, + 1 + 1 + DATA_ROLL_PTR_LEN + + 14 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery */ + return; + } + + mach_write_to_1(log_ptr, flags); + log_ptr++; + mach_write_to_1(log_ptr, val); + log_ptr++; + + log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr, + mtr); + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + mlog_close(mtr, log_ptr); +} + +/******************************************************************** +Parses the redo log record for delete marking or unmarking of a clustered +index record. 
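+Returns NULL if the log record is incomplete in the parse buffer. The page may be passed as NULL, in which case the record is parsed but not applied.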
*/ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_clust_rec( +/*=================================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dict_index_t* index) /* in: index corresponding to page */ +{ + ulint flags; + ulint val; + ulint pos; + dulint trx_id; + dulint roll_ptr; + ulint offset; + rec_t* rec; + + ut_ad(!page + || !!page_is_comp(page) == dict_table_is_comp(index->table)); + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + flags = mach_read_from_1(ptr); + ptr++; + val = mach_read_from_1(ptr); + ptr++; + + ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (page) { + rec = page + offset; + + /* We do not need to reserve btr_search_latch, as the page + is only being recovered, and there cannot be a hash index to + it. */ + + btr_rec_set_deleted_flag(rec, page_zip, val); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + row_upd_rec_sys_fields_in_recovery( + rec, page_zip, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + pos, trx_id, roll_ptr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + } + + return(ptr); +} + +/*************************************************************** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. 
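+The delete mark is set under btr_search_latch when the block carries an adaptive hash index, and the change is redo-logged with btr_cur_del_mark_set_clust_rec_log().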
*/ +UNIV_INTERN +ulint +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, or error + number */ + ulint flags, /* in: undo logging and locking flags */ + btr_cur_t* cursor, /* in: cursor */ + ibool val, /* in: value to set */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + buf_block_t* block; + dulint roll_ptr; + ulint err; + rec_t* rec; + page_zip_des_t* page_zip; + trx_t* trx; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + index = cursor->index; + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), index, "del mark "); + rec_print_new(stderr, rec, offsets); + } +#endif /* UNIV_DEBUG */ + + ut_ad(dict_index_is_clust(index)); + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + + err = lock_clust_rec_modify_check_and_lock(flags, + btr_cur_get_block(cursor), + rec, index, offsets, thr); + + if (err != DB_SUCCESS) { + + goto func_exit; + } + + err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr, + index, NULL, NULL, 0, rec, + &roll_ptr); + if (err != DB_SUCCESS) { + + goto func_exit; + } + + block = btr_cur_get_block(cursor); + + if (block->is_hashed) { + rw_lock_x_lock(&btr_search_latch); + } + + page_zip = buf_block_get_page_zip(block); + + btr_rec_set_deleted_flag(rec, page_zip, val); + + trx = thr_get_trx(thr); + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + row_upd_rec_sys_fields(rec, page_zip, + index, offsets, trx, roll_ptr); + } + + if (block->is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } + + btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx, + roll_ptr, mtr); + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/******************************************************************** +Writes the redo log record for a delete mark setting of a secondary +index record. */ +UNIV_INLINE +void +btr_cur_del_mark_set_sec_rec_log( +/*=============================*/ + rec_t* rec, /* in: record */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + ut_ad(val <= 1); + + log_ptr = mlog_open(mtr, 11 + 1 + 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr); + mach_write_to_1(log_ptr, val); + log_ptr++; + + mach_write_to_2(log_ptr, page_offset(rec)); + log_ptr += 2; + + mlog_close(mtr, log_ptr); +} + +/******************************************************************** +Parses the redo log record for delete marking or unmarking of a secondary +index record. 
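+Unlike in the clustered index variant, there are no system fields to restore, so only the delete mark bit of the record is set.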
*/ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_sec_rec( +/*===============================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip)/* in/out: compressed page, or NULL */ +{ + ulint val; + ulint offset; + rec_t* rec; + + if (end_ptr < ptr + 3) { + + return(NULL); + } + + val = mach_read_from_1(ptr); + ptr++; + + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (page) { + rec = page + offset; + + /* We do not need to reserve btr_search_latch, as the page + is only being recovered, and there cannot be a hash index to + it. */ + + btr_rec_set_deleted_flag(rec, page_zip, val); + } + + return(ptr); +} + +/*************************************************************** +Sets a secondary index record delete mark to TRUE or FALSE. */ +UNIV_INTERN +ulint +btr_cur_del_mark_set_sec_rec( +/*=========================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, or error + number */ + ulint flags, /* in: locking flag */ + btr_cur_t* cursor, /* in: cursor */ + ibool val, /* in: value to set */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + rec_t* rec; + ulint err; + + block = btr_cur_get_block(cursor); + rec = btr_cur_get_rec(cursor); + +#ifdef UNIV_DEBUG + if (btr_cur_print_record_ops && thr) { + btr_cur_trx_report(thr_get_trx(thr), cursor->index, + "del mark "); + rec_print(stderr, rec, cursor->index); + } +#endif /* UNIV_DEBUG */ + + err = lock_sec_rec_modify_check_and_lock(flags, + btr_cur_get_block(cursor), + rec, cursor->index, thr); + if (err != DB_SUCCESS) { + + return(err); + } + + ut_ad(!!page_rec_is_comp(rec) + == dict_table_is_comp(cursor->index->table)); + + if (block->is_hashed) { + rw_lock_x_lock(&btr_search_latch); + } + + btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val); + + if (block->is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } + + btr_cur_del_mark_set_sec_rec_log(rec, val, mtr); + + return(DB_SUCCESS); +} + +/*************************************************************** +Clear a secondary index record's delete mark. This function is only +used by the insert buffer insert merge mechanism. */ +UNIV_INTERN +void +btr_cur_del_unmark_for_ibuf( +/*========================*/ + rec_t* rec, /* in/out: record to delete unmark */ + page_zip_des_t* page_zip, /* in/out: compressed page + corresponding to rec, or NULL + when the tablespace is + uncompressed */ + mtr_t* mtr) /* in: mtr */ +{ + /* We do not need to reserve btr_search_latch, as the page has just + been read to the buffer pool and there cannot be a hash index to it. */ + + btr_rec_set_deleted_flag(rec, page_zip, FALSE); + + btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr); +} + +/*==================== B-TREE RECORD REMOVE =========================*/ + +/***************************************************************** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! 
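+The function merely combines the heuristic check btr_cur_compress_recommendation() with the actual merge performed by btr_compress().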
*/ +UNIV_INTERN +ibool +btr_cur_compress_if_useful( +/*=======================*/ + /* out: TRUE if compression occurred */ + btr_cur_t* cursor, /* in: cursor on the page to compress; + cursor does not stay valid if compression + occurs */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mtr_memo_contains(mtr, + dict_index_get_lock(btr_cur_get_index(cursor)), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + return(btr_cur_compress_recommendation(cursor, mtr) + && btr_compress(cursor, mtr)); +} + +/*********************************************************** +Removes the record on which the tree cursor is positioned on a leaf page. +It is assumed that the mtr has an x-latch on the page where the cursor is +positioned, but no latch on the whole tree. */ +UNIV_INTERN +ibool +btr_cur_optimistic_delete( +/*======================*/ + /* out: TRUE if success, i.e., the page + did not become too empty */ + btr_cur_t* cursor, /* in: cursor on leaf page, on the record to + delete; cursor stays valid: if deletion + succeeds, on function exit it points to the + successor of the deleted record */ + mtr_t* mtr) /* in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +{ + buf_block_t* block; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool no_compress_needed; + rec_offs_init(offsets_); + + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + /* This is intended only for leaf page deletions */ + + block = btr_cur_get_block(cursor); + + ut_ad(page_is_leaf(buf_block_get_frame(block))); + + rec = btr_cur_get_rec(cursor); + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + no_compress_needed = !rec_offs_any_extern(offsets) + && btr_cur_can_delete_without_compress( + cursor, rec_offs_size(offsets), mtr); + + if (no_compress_needed) { + + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + ulint max_ins = 0; + + lock_update_delete(block, rec); + + btr_search_update_hash_on_delete(cursor); + + if (!page_zip) { + max_ins = page_get_max_insert_size_after_reorganize( + page, 1); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + cursor->index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (dict_index_is_clust(cursor->index) + || dict_index_is_ibuf(cursor->index) + || !page_is_leaf(page)) { + /* The insert buffer does not handle + inserts to clustered indexes, to + non-leaf pages of secondary index B-trees, + or to the insert buffer. */ + } else if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins, mtr); + } + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(no_compress_needed); +} + +/***************************************************************** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. 
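+Unless the caller has already reserved them, enough free extents for possible node pointer updates are reserved first; a failure to do so is reported as DB_OUT_OF_FILE_SPACE through the err output parameter.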
*/ +UNIV_INTERN +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + /* out: TRUE if compression occurred */ + ulint* err, /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /* in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /* in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + enum trx_rb_ctx rb_ctx, /* in: rollback context */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + dtuple_t* node_ptr; + ulint n_extents = 0; + ulint n_reserved; + ibool success; + ibool ret = FALSE; + ulint level; + mem_heap_t* heap; + ulint* offsets; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + n_extents = cursor->tree_height / 32 + 1; + + success = fsp_reserve_free_extents(&n_reserved, + index->space, + n_extents, + FSP_CLEANING, mtr); + if (!success) { + *err = DB_OUT_OF_FILE_SPACE; + + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, page_zip, + rb_ctx, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + if (UNIV_UNLIKELY(page_get_n_recs(page) < 2) + && UNIV_UNLIKELY(dict_index_get_page(index) + != buf_block_get_page_no(block))) { + + /* If there is only one record, drop the whole page in + btr_discard_page, if this is not the root page */ + + btr_discard_page(cursor, mtr); + + *err = DB_SUCCESS; + ret = TRUE; + + goto return_after_reservations; + } + + lock_update_delete(block, rec); + level = btr_page_get_level(page, mtr); + + if (level > 0 + && UNIV_UNLIKELY(rec == page_rec_get_next( + page_get_infimum_rec(page)))) { + + rec_t* next_rec = page_rec_get_next(rec); + + if (btr_page_get_prev(page, mtr) == FIL_NULL) { + + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + /* This will make page_zip_validate() fail until + page_cur_delete_rec() completes. This is harmless, + because everything will take place within a single + mini-transaction and because writing to the redo log + is an atomic operation (performed by mtr_commit()). 
*/ + btr_set_min_rec_mark(next_rec, mtr); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the father node pointer + so that it is equal to the new leftmost node pointer + on the page */ + + btr_node_ptr_delete(index, block, mtr); + + node_ptr = dict_index_build_node_ptr( + index, next_rec, buf_block_get_page_no(block), + heap, level); + + btr_insert_on_non_leaf_level(index, + level + 1, node_ptr, mtr); + } + } + + btr_search_update_hash_on_delete(cursor); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(btr_check_node_ptr(index, block, mtr)); + + *err = DB_SUCCESS; + +return_after_reservations: + mem_heap_free(heap); + + if (ret == FALSE) { + ret = btr_cur_compress_if_useful(cursor, mtr); + } + + if (n_extents > 0) { + fil_space_release_free_extents(index->space, n_reserved); + } + + return(ret); +} + +/*********************************************************************** +Adds path information to the cursor for the current page, for which +the binary search has been performed. */ +static +void +btr_cur_add_path_info( +/*==================*/ + btr_cur_t* cursor, /* in: cursor positioned on a page */ + ulint height, /* in: height of the page in tree; + 0 means leaf node */ + ulint root_height) /* in: root node height in tree */ +{ + btr_path_t* slot; + rec_t* rec; + + ut_a(cursor->path_arr); + + if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) { + /* Do nothing; return empty path */ + + slot = cursor->path_arr; + slot->nth_rec = ULINT_UNDEFINED; + + return; + } + + if (height == 0) { + /* Mark end of slots for path */ + slot = cursor->path_arr + root_height + 1; + slot->nth_rec = ULINT_UNDEFINED; + } + + rec = btr_cur_get_rec(cursor); + + slot = cursor->path_arr + (root_height - height); + + slot->nth_rec = page_rec_get_n_recs_before(rec); + slot->n_recs = page_get_n_recs(page_align(rec)); +} + +/*********************************************************************** +Estimates the number of rows in a given index range. 
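+The estimate is computed by positioning a cursor at each end of the range with the BTR_ESTIMATE flag, which records the search path in cursor.path_arr, and then comparing the two paths level by level.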
*/ +UNIV_INTERN +ib_int64_t +btr_estimate_n_rows_in_range( +/*=========================*/ + /* out: estimated number of rows */ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple1, /* in: range start, may also be empty tuple */ + ulint mode1, /* in: search mode for range start */ + const dtuple_t* tuple2, /* in: range end, may also be empty tuple */ + ulint mode2) /* in: search mode for range end */ +{ + btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS]; + btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS]; + btr_cur_t cursor; + btr_path_t* slot1; + btr_path_t* slot2; + ibool diverged; + ibool diverged_lot; + ulint divergence_level; + ib_int64_t n_rows; + ulint i; + mtr_t mtr; + + mtr_start(&mtr); + + cursor.path_arr = path1; + + if (dtuple_get_n_fields(tuple1) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple1, mode1, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + } else { + btr_cur_open_at_index_side(TRUE, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, &mtr); + } + + mtr_commit(&mtr); + + mtr_start(&mtr); + + cursor.path_arr = path2; + + if (dtuple_get_n_fields(tuple2) > 0) { + + btr_cur_search_to_nth_level(index, 0, tuple2, mode2, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, 0, &mtr); + } else { + btr_cur_open_at_index_side(FALSE, index, + BTR_SEARCH_LEAF | BTR_ESTIMATE, + &cursor, &mtr); + } + + mtr_commit(&mtr); + + /* We have the path information for the range in path1 and path2 */ + + n_rows = 1; + diverged = FALSE; /* This becomes true when the path is not + the same any more */ + diverged_lot = FALSE; /* This becomes true when the paths are + not the same or adjacent any more */ + divergence_level = 1000000; /* This is the level where paths diverged + a lot */ + for (i = 0; ; i++) { + ut_ad(i < BTR_PATH_ARRAY_N_SLOTS); + + slot1 = path1 + i; + slot2 = path2 + i; + + if (slot1->nth_rec == ULINT_UNDEFINED + || slot2->nth_rec == ULINT_UNDEFINED) { + + if (i > divergence_level + 1) { + /* In trees whose height is > 1 our algorithm + tends to underestimate: multiply the estimate + by 2: */ + + n_rows = n_rows * 2; + } + + /* Do not estimate the number of rows in the range + to over 1 / 2 of the estimated rows in the whole + table */ + + if (n_rows > index->table->stat_n_rows / 2) { + n_rows = index->table->stat_n_rows / 2; + + /* If there are just 0 or 1 rows in the table, + then we estimate all rows are in the range */ + + if (n_rows == 0) { + n_rows = index->table->stat_n_rows; + } + } + + return(n_rows); + } + + if (!diverged && slot1->nth_rec != slot2->nth_rec) { + + diverged = TRUE; + + if (slot1->nth_rec < slot2->nth_rec) { + n_rows = slot2->nth_rec - slot1->nth_rec; + + if (n_rows > 1) { + diverged_lot = TRUE; + divergence_level = i; + } + } else { + /* Maybe the tree has changed between + searches */ + + return(10); + } + + } else if (diverged && !diverged_lot) { + + if (slot1->nth_rec < slot1->n_recs + || slot2->nth_rec > 1) { + + diverged_lot = TRUE; + divergence_level = i; + + n_rows = 0; + + if (slot1->nth_rec < slot1->n_recs) { + n_rows += slot1->n_recs + - slot1->nth_rec; + } + + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + } + } else if (diverged_lot) { + + n_rows = (n_rows * (slot1->n_recs + slot2->n_recs)) + / 2; + } + } +} + +/*********************************************************************** +Estimates the number of different key values in a given index, for +each n-column prefix of the index where n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals. 
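+For example, if 20 sampled leaf pages contain 200 borders between
+different values of the full key and the index has 1000 leaf pages,
+the formula below scales this to roughly 200 * 1000 / 20 = 10000
+distinct values (ignoring pages used for external storage), before the
+add_on term compensates for borders that the sample could not see.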
*/ +UNIV_INTERN +void +btr_estimate_number_of_different_key_vals( +/*======================================*/ + dict_index_t* index) /* in: index */ +{ + btr_cur_t cursor; + page_t* page; + rec_t* rec; + ulint n_cols; + ulint matched_fields; + ulint matched_bytes; + ib_int64_t* n_diff; + ullint n_sample_pages; /* number of pages to sample */ + ulint not_empty_flag = 0; + ulint total_external_size = 0; + ulint i; + ulint j; + ullint add_on; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_rec_[REC_OFFS_NORMAL_SIZE]; + ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets_rec = offsets_rec_; + ulint* offsets_next_rec= offsets_next_rec_; + rec_offs_init(offsets_rec_); + rec_offs_init(offsets_next_rec_); + + n_cols = dict_index_get_n_unique(index); + + n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t)); + + /* It makes no sense to test more pages than are contained + in the index, thus we lower the number if it is too high */ + if (srv_stats_sample_pages > index->stat_index_size) { + if (index->stat_index_size > 0) { + n_sample_pages = index->stat_index_size; + } else { + n_sample_pages = 1; + } + } else { + n_sample_pages = srv_stats_sample_pages; + } + + /* We sample some pages in the index to get an estimate */ + + for (i = 0; i < n_sample_pages; i++) { + rec_t* supremum; + mtr_start(&mtr); + + btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr); + + /* Count the number of different key values for each prefix of + the key on this index page. If the prefix does not determine + the index record uniquely in the B-tree, then we subtract one + because otherwise our algorithm would give a wrong estimate + for an index where there is just one key value. */ + + page = btr_cur_get_page(&cursor); + + supremum = page_get_supremum_rec(page); + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (rec != supremum) { + not_empty_flag = 1; + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + ULINT_UNDEFINED, &heap); + } + + while (rec != supremum) { + rec_t* next_rec = page_rec_get_next(rec); + if (next_rec == supremum) { + break; + } + + matched_fields = 0; + matched_bytes = 0; + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, + n_cols, &heap); + + cmp_rec_rec_with_match(rec, next_rec, + offsets_rec, offsets_next_rec, + index, &matched_fields, + &matched_bytes); + + for (j = matched_fields + 1; j <= n_cols; j++) { + /* We add one if this index record has + a different prefix from the previous */ + + n_diff[j]++; + } + + total_external_size + += btr_rec_get_externally_stored_len( + rec, offsets_rec); + + rec = next_rec; + /* Initialize offsets_rec for the next round + and assign the old offsets_rec buffer to + offsets_next_rec. */ + { + ulint* offsets_tmp = offsets_rec; + offsets_rec = offsets_next_rec; + offsets_next_rec = offsets_tmp; + } + } + + + if (n_cols == dict_index_get_n_unique_in_tree(index)) { + + /* If there is more than one leaf page in the tree, + we add one because we know that the first record + on the page certainly had a different prefix than the + last record on the previous index page in the + alphabetical order. Before this fix, if there was + just one big record on each clustered index page, the + algorithm grossly underestimated the number of rows + in the table. 
*/ + + if (btr_page_get_prev(page, &mtr) != FIL_NULL + || btr_page_get_next(page, &mtr) != FIL_NULL) { + + n_diff[n_cols]++; + } + } + + offsets_rec = rec_get_offsets(rec, index, offsets_rec, + ULINT_UNDEFINED, &heap); + total_external_size += btr_rec_get_externally_stored_len( + rec, offsets_rec); + mtr_commit(&mtr); + } + + /* If we saw k borders between different key values on + n_sample_pages leaf pages, we can estimate how many + there will be in index->stat_n_leaf_pages */ + + /* We must take into account that our sample actually represents + also the pages used for external storage of fields (those pages are + included in index->stat_n_leaf_pages) */ + + for (j = 0; j <= n_cols; j++) { + index->stat_n_diff_key_vals[j] + = ((n_diff[j] + * (ib_int64_t)index->stat_n_leaf_pages + + n_sample_pages - 1 + + total_external_size + + not_empty_flag) + / (n_sample_pages + + total_external_size)); + + /* If the tree is small, smaller than + 10 * n_sample_pages + total_external_size, then + the above estimate is ok. For bigger trees it is common that we + do not see any borders between key values in the few pages + we pick. But still there may be n_sample_pages + different key values, or even more. Let us try to approximate + that: */ + + add_on = index->stat_n_leaf_pages + / (10 * (n_sample_pages + + total_external_size)); + + if (add_on > n_sample_pages) { + add_on = n_sample_pages; + } + + index->stat_n_diff_key_vals[j] += add_on; + } + + mem_free(n_diff); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/*************************************************************** +Gets the externally stored size of a record, in units of a database page. */ +static +ulint +btr_rec_get_externally_stored_len( +/*==============================*/ + /* out: externally stored part, + in units of a database page */ + rec_t* rec, /* in: record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + byte* data; + ulint local_len; + ulint extern_len; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align(extern_len, + UNIV_PAGE_SIZE); + } + } + + return(total_extern_len / UNIV_PAGE_SIZE); +} + +/*********************************************************************** +Sets the ownership bit of an externally stored field in a record. 
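+Note the inverted encoding implemented below: BTR_EXTERN_OWNER_FLAG in
+the most significant byte of the stored field length is cleared when
+the record owns the field, and set when it does not.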
*/ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: clustered index record */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint i, /* in: field number */ + ibool val, /* in: value to set */ + mtr_t* mtr) /* in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG); + } else { + byte_val = byte_val | BTR_EXTERN_OWNER_FLAG; + } + + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr); + } else if (UNIV_LIKELY(mtr != NULL)) { + + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val, + MLOG_1BYTE, mtr); + } else { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + } +} + +/*********************************************************************** +Marks not updated extern fields as not-owned by this record. The ownership +is transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. */ +UNIV_INTERN +void +btr_cur_mark_extern_inherited_fields( +/*=================================*/ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: record in a clustered index */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + const upd_t* update, /* in: update vector */ + mtr_t* mtr) /* in: mtr, or NULL if not logged */ +{ + ulint n; + ulint j; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + + return; + } + + n = rec_offs_n_fields(offsets); + + for (i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + /* Check it is not in updated fields */ + + if (update) { + for (j = 0; j < upd_get_n_fields(update); + j++) { + if (upd_get_nth_field(update, j) + ->field_no == i) { + + goto updated; + } + } + } + + btr_cur_set_ownership_of_extern_field( + page_zip, rec, index, offsets, i, FALSE, mtr); +updated: + ; + } + } +} + +/*********************************************************************** +The complement of the previous function: in an update entry may inherit +some externally stored fields from a record. We must mark them as inherited +in entry, so that they are not freed in a rollback. 
*/ +UNIV_INTERN +void +btr_cur_mark_dtuple_inherited_extern( +/*=================================*/ + dtuple_t* entry, /* in/out: updated entry to be + inserted to clustered index */ + const upd_t* update) /* in: update vector */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + dfield_t* dfield = dtuple_get_nth_field(entry, i); + byte* data; + ulint len; + ulint j; + + if (!dfield_is_ext(dfield)) { + continue; + } + + /* Check if it is in updated fields */ + + for (j = 0; j < upd_get_n_fields(update); j++) { + if (upd_get_nth_field(update, j)->field_no == i) { + + goto is_updated; + } + } + + data = dfield_get_data(dfield); + len = dfield_get_len(dfield); + data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN] + |= BTR_EXTERN_INHERITED_FLAG; + +is_updated: + ; + } +} + +/*********************************************************************** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: record in a clustered index */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + mtr_t* mtr) /* in: mtr, or NULL if not logged */ +{ + ulint n; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + n = rec_offs_n_fields(offsets); + + if (!rec_offs_any_extern(offsets)) { + + return; + } + + for (i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + btr_cur_set_ownership_of_extern_field( + page_zip, rec, index, offsets, i, TRUE, mtr); + } + } +} + +/*********************************************************************** +Marks all extern fields in a dtuple as owned by the record. */ +UNIV_INTERN +void +btr_cur_unmark_dtuple_extern_fields( +/*================================*/ + dtuple_t* entry) /* in/out: clustered index entry */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (dfield_is_ext(dfield)) { + byte* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN] + &= ~BTR_EXTERN_OWNER_FLAG; + } + } +} + +/*********************************************************************** +Flags the data tuple fields that are marked as extern storage in the +update vector. We use this function to remember which fields we must +mark as extern storage in a record inserted for an update. */ +UNIV_INTERN +ulint +btr_push_update_extern_fields( +/*==========================*/ + /* out: number of flagged external columns */ + dtuple_t* tuple, /* in/out: data tuple */ + const upd_t* update, /* in: update vector */ + mem_heap_t* heap) /* in: memory heap */ +{ + ulint n_pushed = 0; + ulint n; + const upd_field_t* uf; + + ut_ad(tuple); + ut_ad(update); + + uf = update->fields; + n = upd_get_n_fields(update); + + for (; n--; uf++) { + if (dfield_is_ext(&uf->new_val)) { + dfield_t* field + = dtuple_get_nth_field(tuple, uf->field_no); + + if (!dfield_is_ext(field)) { + dfield_set_ext(field); + n_pushed++; + } + + switch (uf->orig_len) { + byte* data; + ulint len; + byte* buf; + case 0: + break; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. 
In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(field, (byte*) dfield_get_data(field)
+ + dfield_get_len(field)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(field);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+
+ data = dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ buf = mem_heap_alloc(heap, uf->orig_len);
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ uf->orig_len
+ - BTR_EXTERN_FIELD_REF_SIZE);
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(field, buf, uf->orig_len);
+ dfield_set_ext(field);
+ }
+ }
+ }
+
+ return(n_pushed);
+}
+
+/***********************************************************************
+Returns the length of a BLOB part stored on the header page. */
+static
+ulint
+btr_blob_get_part_len(
+/*==================*/
+ /* out: part length */
+ const byte* blob_header) /* in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
+}
+
+/***********************************************************************
+Returns the page number where the next BLOB part is stored. */
+static
+ulint
+btr_blob_get_next_page_no(
+/*======================*/
+ /* out: page number or FIL_NULL if
+ no more pages */
+ const byte* blob_header) /* in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
+}
+
+/***********************************************************************
+Deallocate a buffer block that was reserved for a BLOB part. */
+static
+void
+btr_blob_free(
+/*==========*/
+ buf_block_t* block, /* in: buffer block */
+ ibool all, /* in: TRUE=remove also the compressed page
+ if there is one */
+ mtr_t* mtr) /* in: mini-transaction to commit */
+{
+ ulint space = buf_block_get_space(block);
+ ulint page_no = buf_block_get_page_no(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ mtr_commit(mtr);
+
+ buf_pool_mutex_enter();
+ mutex_enter(&block->mutex);
+
+ /* Only free the block if it is still allocated to
+ the same file page. */
+
+ if (buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE
+ && buf_block_get_space(block) == space
+ && buf_block_get_page_no(block) == page_no) {
+
+ if (buf_LRU_free_block(&block->page, all, NULL)
+ != BUF_LRU_FREED
+ && all && block->page.zip.data) {
+ /* Attempt to deallocate the uncompressed page
+ if the whole block cannot be deallocated. */
+
+ buf_LRU_free_block(&block->page, FALSE, NULL);
+ }
+ }
+
+ buf_pool_mutex_exit();
+ mutex_exit(&block->mutex);
+}
+
+/***********************************************************************
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
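+On an uncompressed page each stored part is preceded by a header at
+FIL_PAGE_DATA holding the length of the part (BTR_BLOB_HDR_PART_LEN)
+and the page number of the next part (BTR_BLOB_HDR_NEXT_PAGE_NO); the
+field reference left in the record carries the space id, the first page
+number, the offset on that page and the 8-byte external length
+(BTR_EXTERN_SPACE_ID, BTR_EXTERN_PAGE_NO, BTR_EXTERN_OFFSET,
+BTR_EXTERN_LEN), as written out below.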
*/ +UNIV_INTERN +ulint +btr_store_big_rec_extern_fields( +/*============================*/ + /* out: DB_SUCCESS or error */ + dict_index_t* index, /* in: index of rec; the index tree + MUST be X-latched */ + buf_block_t* rec_block, /* in/out: block containing rec */ + rec_t* rec, /* in/out: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index); + the "external storage" flags in offsets + will not correspond to rec when + this function returns */ + big_rec_t* big_rec_vec, /* in: vector containing fields + to be stored externally */ + mtr_t* local_mtr __attribute__((unused))) /* in: mtr + containing the latch to rec and to the + tree */ +{ + ulint rec_page_no; + byte* field_ref; + ulint extern_len; + ulint store_len; + ulint page_no; + ulint space_id; + ulint zip_size; + ulint prev_page_no; + ulint hint_page_no; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + page_zip_des_t* page_zip; + z_stream c_stream; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); + ut_a(dict_index_is_clust(index)); + + page_zip = buf_block_get_page_zip(rec_block); + ut_a(dict_table_zip_size(index->table) + == buf_block_get_zip_size(rec_block)); + + space_id = buf_block_get_space(rec_block); + zip_size = buf_block_get_zip_size(rec_block); + rec_page_no = buf_block_get_page_no(rec_block); + ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); + + if (UNIV_LIKELY_NULL(page_zip)) { + int err; + + /* Zlib deflate needs 128 kilobytes for the default + window size, plus 512 << memLevel, plus a few + kilobytes for small objects. We use reduced memLevel + to limit the memory consumption, and preallocate the + heap, hoping to avoid memory fragmentation. 
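+(With windowBits 15 and memLevel 7 as passed to deflateInit2() below,
+this amounts to roughly (1 << (15 + 2)) + (1 << (7 + 9))
+= 128 KiB + 64 KiB = 192 KiB, hence the 250000-byte heap.)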
*/ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + + /* We have to create a file segment to the tablespace + for each field and put the pointer to the field in rec */ + + for (i = 0; i < big_rec_vec->n_fields; i++) { + ut_ad(rec_offs_nth_extern(offsets, + big_rec_vec->fields[i].field_no)); + { + ulint local_len; + field_ref = rec_get_nth_field( + rec, offsets, big_rec_vec->fields[i].field_no, + &local_len); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + field_ref += local_len; + } + extern_len = big_rec_vec->fields[i].len; + + ut_a(extern_len > 0); + + prev_page_no = FIL_NULL; + + if (UNIV_LIKELY_NULL(page_zip)) { + int err = deflateReset(&c_stream); + ut_a(err == Z_OK); + + c_stream.next_in = (void*) big_rec_vec->fields[i].data; + c_stream.avail_in = extern_len; + } + + for (;;) { + buf_block_t* block; + page_t* page; + + mtr_start(&mtr); + + if (prev_page_no == FIL_NULL) { + hint_page_no = 1 + rec_page_no; + } else { + hint_page_no = prev_page_no + 1; + } + + block = btr_page_alloc(index, hint_page_no, + FSP_NO_DIR, 0, &mtr); + if (UNIV_UNLIKELY(block == NULL)) { + + mtr_commit(&mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + deflateEnd(&c_stream); + mem_heap_free(heap); + } + + return(DB_OUT_OF_FILE_SPACE); + } + + page_no = buf_block_get_page_no(block); + page = buf_block_get_frame(block); + + if (prev_page_no != FIL_NULL) { + buf_block_t* prev_block; + page_t* prev_page; + + prev_block = buf_page_get(space_id, zip_size, + prev_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(prev_block, + SYNC_EXTERN_STORAGE); + prev_page = buf_block_get_frame(prev_block); + + if (UNIV_LIKELY_NULL(page_zip)) { + mlog_write_ulint( + prev_page + FIL_PAGE_NEXT, + page_no, MLOG_4BYTES, &mtr); + memcpy(buf_block_get_page_zip( + prev_block) + ->data + FIL_PAGE_NEXT, + prev_page + FIL_PAGE_NEXT, 4); + } else { + mlog_write_ulint( + prev_page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO, + page_no, MLOG_4BYTES, &mtr); + } + + } + + if (UNIV_LIKELY_NULL(page_zip)) { + int err; + page_zip_des_t* blob_page_zip; + + mach_write_to_2(page + FIL_PAGE_TYPE, + prev_page_no == FIL_NULL + ? FIL_PAGE_TYPE_ZBLOB + : FIL_PAGE_TYPE_ZBLOB2); + + c_stream.next_out = page + + FIL_PAGE_DATA; + c_stream.avail_out + = page_zip_get_size(page_zip) + - FIL_PAGE_DATA; + + err = deflate(&c_stream, Z_FINISH); + ut_a(err == Z_OK || err == Z_STREAM_END); + ut_a(err == Z_STREAM_END + || c_stream.avail_out == 0); + + /* Write the "next BLOB page" pointer */ + mlog_write_ulint(page + FIL_PAGE_NEXT, + FIL_NULL, MLOG_4BYTES, &mtr); + /* Initialize the unused "prev page" pointer */ + mlog_write_ulint(page + FIL_PAGE_PREV, + FIL_NULL, MLOG_4BYTES, &mtr); + /* Write a back pointer to the record + into the otherwise unused area. This + information could be useful in + debugging. Later, we might want to + implement the possibility to relocate + BLOB pages. Then, we would need to be + able to adjust the BLOB pointer in the + record. We do not store the heap + number of the record, because it can + change in page_zip_reorganize() or + btr_page_reorganize(). However, also + the page number of the record may + change when B-tree nodes are split or + merged. 
*/ + mlog_write_ulint(page + + FIL_PAGE_FILE_FLUSH_LSN, + space_id, + MLOG_4BYTES, &mtr); + mlog_write_ulint(page + + FIL_PAGE_FILE_FLUSH_LSN + 4, + rec_page_no, + MLOG_4BYTES, &mtr); + + /* Zero out the unused part of the page. */ + memset(page + page_zip_get_size(page_zip) + - c_stream.avail_out, + 0, c_stream.avail_out); + mlog_log_string(page + FIL_PAGE_TYPE, + page_zip_get_size(page_zip) + - FIL_PAGE_TYPE, + &mtr); + /* Copy the page to compressed storage, + because it will be flushed to disk + from there. */ + blob_page_zip = buf_block_get_page_zip(block); + ut_ad(blob_page_zip); + ut_ad(page_zip_get_size(blob_page_zip) + == page_zip_get_size(page_zip)); + memcpy(blob_page_zip->data, page, + page_zip_get_size(page_zip)); + + if (err == Z_OK && prev_page_no != FIL_NULL) { + + goto next_zip_page; + } + + rec_block = buf_page_get(space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(rec_block, + SYNC_NO_ORDER_CHECK); + + if (err == Z_STREAM_END) { + mach_write_to_4(field_ref + + BTR_EXTERN_LEN, 0); + mach_write_to_4(field_ref + + BTR_EXTERN_LEN + 4, + c_stream.total_in); + } else { + memset(field_ref + BTR_EXTERN_LEN, + 0, 8); + } + + if (prev_page_no == FIL_NULL) { + mach_write_to_4(field_ref + + BTR_EXTERN_SPACE_ID, + space_id); + + mach_write_to_4(field_ref + + BTR_EXTERN_PAGE_NO, + page_no); + + mach_write_to_4(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_NEXT); + } + + page_zip_write_blob_ptr( + page_zip, rec, index, offsets, + big_rec_vec->fields[i].field_no, &mtr); + +next_zip_page: + prev_page_no = page_no; + + /* Commit mtr and release the + uncompressed page frame to save memory. */ + btr_blob_free(block, FALSE, &mtr); + + if (err == Z_STREAM_END) { + break; + } + } else { + mlog_write_ulint(page + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_BLOB, + MLOG_2BYTES, &mtr); + + if (extern_len > (UNIV_PAGE_SIZE + - FIL_PAGE_DATA + - BTR_BLOB_HDR_SIZE + - FIL_PAGE_DATA_END)) { + store_len = UNIV_PAGE_SIZE + - FIL_PAGE_DATA + - BTR_BLOB_HDR_SIZE + - FIL_PAGE_DATA_END; + } else { + store_len = extern_len; + } + + mlog_write_string(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_SIZE, + (const byte*) + big_rec_vec->fields[i].data + + big_rec_vec->fields[i].len + - extern_len, + store_len, &mtr); + mlog_write_ulint(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN, + store_len, MLOG_4BYTES, &mtr); + mlog_write_ulint(page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO, + FIL_NULL, MLOG_4BYTES, &mtr); + + extern_len -= store_len; + + rec_block = buf_page_get(space_id, zip_size, + rec_page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(rec_block, + SYNC_NO_ORDER_CHECK); + + mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0, + MLOG_4BYTES, &mtr); + mlog_write_ulint(field_ref + + BTR_EXTERN_LEN + 4, + big_rec_vec->fields[i].len + - extern_len, + MLOG_4BYTES, &mtr); + + if (prev_page_no == FIL_NULL) { + mlog_write_ulint(field_ref + + BTR_EXTERN_SPACE_ID, + space_id, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(field_ref + + BTR_EXTERN_PAGE_NO, + page_no, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(field_ref + + BTR_EXTERN_OFFSET, + FIL_PAGE_DATA, + MLOG_4BYTES, &mtr); + } + + prev_page_no = page_no; + + mtr_commit(&mtr); + + if (extern_len == 0) { + break; + } + } + } + } + + if (UNIV_LIKELY_NULL(page_zip)) { + deflateEnd(&c_stream); + mem_heap_free(heap); + } + + return(DB_SUCCESS); +} + +/*********************************************************************** +Check the FIL_PAGE_TYPE on an uncompressed BLOB page. 
*/
+static
+void
+btr_check_blob_fil_page_type(
+/*=========================*/
+ ulint space_id, /* in: space id */
+ ulint page_no, /* in: page number */
+ const page_t* page, /* in: page */
+ ibool read) /* in: TRUE=read, FALSE=purge */
+{
+ ulint type = fil_page_get_type(page);
+
+ ut_a(space_id == page_get_space_id(page));
+ ut_a(page_no == page_get_page_no(page));
+
+ if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
+ ulint flags = fil_space_get_flags(space_id);
+
+ if (UNIV_LIKELY
+ ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
+ /* Old versions of InnoDB did not initialize
+ FIL_PAGE_TYPE on BLOB pages. Do not print
+ anything about the type mismatch when reading
+ a BLOB page that is in Antelope format.*/
+ return;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: FIL_PAGE_TYPE=%lu"
+ " on BLOB %s space %lu page %lu flags %lx\n",
+ (ulong) type, read ? "read" : "purge",
+ (ulong) space_id, (ulong) page_no, (ulong) flags);
+ ut_error;
+ }
+}
+
+/***********************************************************************
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field;
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /* in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /* in/out: field reference */
+ const rec_t* rec, /* in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const ulint* offsets, /* in: rec_get_offsets(rec, index),
+ or NULL */
+ page_zip_des_t* page_zip, /* in: compressed page corresponding
+ to rec, or NULL if rec == NULL */
+ ulint i, /* in: field number of field_ref;
+ ignored if rec == NULL */
+ enum trx_rb_ctx rb_ctx, /* in: rollback context */
+ mtr_t* local_mtr __attribute__((unused))) /* in: mtr
+ containing the latch to data and an
+ X-latch to the index tree */
+{
+ page_t* page;
+ ulint space_id;
+ ulint rec_zip_size = dict_table_zip_size(index->table);
+ ulint ext_zip_size;
+ ulint page_no;
+ ulint next_page_no;
+ mtr_t mtr;
+#ifdef UNIV_DEBUG
+ ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+
+ if (rec) {
+ ulint local_len;
+ const byte* f = rec_get_nth_field(rec, offsets,
+ i, &local_len);
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+ f += local_len;
+ ut_ad(f == field_ref);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* In the rollback of uncommitted transactions, we may
+ encounter a clustered index record whose BLOBs have
+ not been written. There is nothing to free then. */
+ ut_a(rb_ctx == RB_RECOVERY);
+ return;
+ }
+
+ space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
+
+ if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
+ ext_zip_size = fil_space_get_zip_size(space_id);
+ /* This must be an undo log record in the system tablespace,
+ that is, in row_purge_upd_exist_or_extern().
+ Currently, externally stored records are stored in the
+ same tablespace as the referring records. */
+ ut_ad(!page_get_space_id(page_align(field_ref)));
+ ut_ad(!rec);
+ ut_ad(!page_zip);
+ } else {
+ ext_zip_size = rec_zip_size;
+ }
+
+ if (!rec) {
+ /* This is a call from row_purge_upd_exist_or_extern(). */
+ ut_ad(!page_zip);
+ rec_zip_size = 0;
+ }
+
+ for (;;) {
+ buf_block_t* rec_block;
+ buf_block_t* ext_block;
+
+ mtr_start(&mtr);
+
+ rec_block = buf_page_get(page_get_space_id(
+ page_align(field_ref)),
+ rec_zip_size,
+ page_get_page_no(
+ page_align(field_ref)),
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
+ page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ if (/* There is no external storage data */
+ page_no == FIL_NULL
+ /* This field does not own the externally stored field */
+ || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_OWNER_FLAG)
+ /* Rollback and inherited field */
+ || (rb_ctx != RB_NONE
+ && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_INHERITED_FLAG))) {
+
+ /* Do not free */
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ ext_block = buf_page_get(space_id, ext_zip_size, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
+ page = buf_block_get_frame(ext_block);
+
+ if (ext_zip_size) {
+ /* Note that page_zip will be NULL
+ in row_purge_upd_exist_or_extern(). */
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ default:
+ ut_error;
+ }
+ next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+
+ btr_page_free_low(index, ext_block, 0, &mtr);
+
+ if (UNIV_LIKELY(page_zip != NULL)) {
+ mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no);
+ mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
+ 0);
+ page_zip_write_blob_ptr(page_zip, rec, index,
+ offsets, i, &mtr);
+ } else {
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ next_page_no,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_LEN + 4, 0,
+ MLOG_4BYTES, &mtr);
+ }
+ } else {
+ ut_a(!page_zip);
+ btr_check_blob_fil_page_type(space_id, page_no, page,
+ FALSE);
+
+ next_page_no = mach_read_from_4(
+ page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO);
+
+ /* We must supply the page level (= 0) as an argument
+ because we did not store it on the page (we save the
+ space overhead from an index page header). */
+
+ btr_page_free_low(index, ext_block, 0, &mtr);
+
+ mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no,
+ MLOG_4BYTES, &mtr);
+ /* Zero out the BLOB length. If the server
+ crashes during the execution of this function,
+ trx_rollback_or_clean_all_recovered() could
+ dereference the half-deleted BLOB, fetching a
+ wrong prefix for the BLOB. */
+ mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
+ 0,
+ MLOG_4BYTES, &mtr);
+ }
+
+ /* Commit mtr and release the BLOB block to save memory. */
+ btr_blob_free(ext_block, TRUE, &mtr);
+ }
+}
+
+/***************************************************************
+Frees the externally stored fields for a record.
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /* in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /* in/out: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + enum trx_rb_ctx rb_ctx, /* in: rollback context */ + mtr_t* mtr) /* in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)); + /* Free possible externally stored fields in the record */ + + ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + byte* data + = rec_get_nth_field(rec, offsets, i, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, page_zip, i, rb_ctx, mtr); + } + } +} + +/*************************************************************** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /* in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + const upd_t* update, /* in: update vector */ + enum trx_rb_ctx rb_ctx, /* in: rollback context */ + mtr_t* mtr) /* in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)); + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const upd_field_t* ufield = upd_get_nth_field(update, i); + + if (rec_offs_nth_extern(offsets, ufield->field_no)) { + ulint len; + byte* data = rec_get_nth_field( + rec, offsets, ufield->field_no, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, page_zip, + ufield->field_no, rb_ctx, mtr); + } + } +} + +/*********************************************************************** +Copies the prefix of an uncompressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. 
*/ +static +ulint +btr_copy_blob_prefix( +/*=================*/ + /* out: number of bytes written to buf */ + byte* buf, /* out: the externally stored part of + the field, or a prefix of it */ + ulint len, /* in: length of buf, in bytes */ + ulint space_id,/* in: space id of the BLOB pages */ + ulint page_no,/* in: page number of the first BLOB page */ + ulint offset) /* in: offset on the first BLOB page */ +{ + ulint copied_len = 0; + + for (;;) { + mtr_t mtr; + buf_block_t* block; + const page_t* page; + const byte* blob_header; + ulint part_len; + ulint copy_len; + + mtr_start(&mtr); + + block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE); + page = buf_block_get_frame(block); + + btr_check_blob_fil_page_type(space_id, page_no, page, TRUE); + + blob_header = page + offset; + part_len = btr_blob_get_part_len(blob_header); + copy_len = ut_min(part_len, len - copied_len); + + memcpy(buf + copied_len, + blob_header + BTR_BLOB_HDR_SIZE, copy_len); + copied_len += copy_len; + + page_no = btr_blob_get_next_page_no(blob_header); + + mtr_commit(&mtr); + + if (page_no == FIL_NULL || copy_len != part_len) { + return(copied_len); + } + + /* On other BLOB pages except the first the BLOB header + always is at the page data start: */ + + offset = FIL_PAGE_DATA; + + ut_ad(copied_len <= len); + } +} + +/*********************************************************************** +Copies the prefix of a compressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. */ +static +void +btr_copy_zblob_prefix( +/*==================*/ + z_stream* d_stream,/* in/out: the decompressing stream */ + ulint zip_size,/* in: compressed BLOB page size */ + ulint space_id,/* in: space id of the BLOB pages */ + ulint page_no,/* in: page number of the first BLOB page */ + ulint offset) /* in: offset on the first BLOB page */ +{ + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + + ut_ad(ut_is_2pow(zip_size)); + ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE); + ut_ad(zip_size <= UNIV_PAGE_SIZE); + ut_ad(space_id); + + for (;;) { + buf_page_t* bpage; + int err; + ulint next_page_no; + + /* There is no latch on bpage directly. Instead, + bpage is protected by the B-tree page latch that + is being held on the clustered index record, or, + in row_merge_copy_blobs(), by an exclusive table lock. */ + bpage = buf_page_get_zip(space_id, zip_size, page_no); + + if (UNIV_UNLIKELY(!bpage)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot load" + " compressed BLOB" + " page %lu space %lu\n", + (ulong) page_no, (ulong) space_id); + return; + } + + if (UNIV_UNLIKELY + (fil_page_get_type(bpage->zip.data) != page_type)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Unexpected type %lu of" + " compressed BLOB" + " page %lu space %lu\n", + (ulong) fil_page_get_type(bpage->zip.data), + (ulong) page_no, (ulong) space_id); + goto end_of_blob; + } + + next_page_no = mach_read_from_4(bpage->zip.data + offset); + + if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) { + /* When the BLOB begins at page header, + the compressed data payload does not + immediately follow the next page pointer. 
*/ + offset = FIL_PAGE_DATA; + } else { + offset += 4; + } + + d_stream->next_in = bpage->zip.data + offset; + d_stream->avail_in = zip_size - offset; + + err = inflate(d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + if (!d_stream->avail_out) { + goto end_of_blob; + } + break; + case Z_STREAM_END: + if (next_page_no == FIL_NULL) { + goto end_of_blob; + } + /* fall through */ + default: +inflate_error: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: inflate() of" + " compressed BLOB" + " page %lu space %lu returned %d (%s)\n", + (ulong) page_no, (ulong) space_id, + err, d_stream->msg); + case Z_BUF_ERROR: + goto end_of_blob; + } + + if (next_page_no == FIL_NULL) { + if (!d_stream->avail_in) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unexpected end of" + " compressed BLOB" + " page %lu space %lu\n", + (ulong) page_no, + (ulong) space_id); + } else { + err = inflate(d_stream, Z_FINISH); + switch (err) { + case Z_STREAM_END: + case Z_BUF_ERROR: + break; + default: + goto inflate_error; + } + } + +end_of_blob: + buf_page_release_zip(bpage); + return; + } + + buf_page_release_zip(bpage); + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + page_no = next_page_no; + offset = FIL_PAGE_NEXT; + page_type = FIL_PAGE_TYPE_ZBLOB2; + } +} + +/*********************************************************************** +Copies the prefix of an externally stored field of a record. The +clustered index record that points to this BLOB must be protected by a +lock or a page latch. */ +static +ulint +btr_copy_externally_stored_field_prefix_low( +/*========================================*/ + /* out: number of bytes written to buf */ + byte* buf, /* out: the externally stored part of + the field, or a prefix of it */ + ulint len, /* in: length of buf, in bytes */ + ulint zip_size,/* in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint space_id,/* in: space id of the first BLOB page */ + ulint page_no,/* in: page number of the first BLOB page */ + ulint offset) /* in: offset on the first BLOB page */ +{ + if (UNIV_UNLIKELY(len == 0)) { + return(0); + } + + if (UNIV_UNLIKELY(zip_size)) { + int err; + z_stream d_stream; + mem_heap_t* heap; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); + + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + + d_stream.next_out = buf; + d_stream.avail_out = len; + d_stream.avail_in = 0; + + btr_copy_zblob_prefix(&d_stream, zip_size, + space_id, page_no, offset); + inflateEnd(&d_stream); + mem_heap_free(heap); + return(d_stream.total_out); + } else { + return(btr_copy_blob_prefix(buf, len, space_id, + page_no, offset)); + } +} + +/*********************************************************************** +Copies the prefix of an externally stored field of a record. The +clustered index record must be protected by a lock or a page latch. 
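+The function returns 0 when the 4-byte length word in the field
+reference is zero, which signals a half-deleted BLOB: the externally
+stored part of the column is being or has already been deleted.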
*/ +UNIV_INTERN +ulint +btr_copy_externally_stored_field_prefix( +/*====================================*/ + /* out: the length of the copied field, + or 0 if the column was being or has been + deleted */ + byte* buf, /* out: the field, or a prefix of it */ + ulint len, /* in: length of buf, in bytes */ + ulint zip_size,/* in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + const byte* data, /* in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint local_len)/* in: length of data, in bytes */ +{ + ulint space_id; + ulint page_no; + ulint offset; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. */ + + return(0); + } + + space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + len - local_len, + zip_size, + space_id, page_no, + offset)); +} + +/*********************************************************************** +Copies an externally stored field of a record to mem heap. The +clustered index record must be protected by a lock or a page latch. */ +static +byte* +btr_copy_externally_stored_field( +/*=============================*/ + /* out: the whole field copied to heap */ + ulint* len, /* out: length of the whole field */ + const byte* data, /* in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint zip_size,/* in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint local_len,/* in: length of data */ + mem_heap_t* heap) /* in: mem heap */ +{ + ulint space_id; + ulint page_no; + ulint offset; + ulint extern_len; + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID); + + page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO); + + offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4); + + buf = mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + space_id, + page_no, offset); + + return(buf); +} + +/*********************************************************************** +Copies an externally stored field of a record to mem heap. 
*/ +UNIV_INTERN +byte* +btr_rec_copy_externally_stored_field( +/*=================================*/ + /* out: the field copied to heap */ + const rec_t* rec, /* in: record in a clustered index; + must be protected by a lock or a page latch */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint zip_size,/* in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint no, /* in: field number */ + ulint* len, /* out: length of the field */ + mem_heap_t* heap) /* in: mem heap */ +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + return(btr_copy_externally_stored_field(len, data, + zip_size, local_len, heap)); +} diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.c new file mode 100644 index 00000000000..b14efefe13f --- /dev/null +++ b/storage/xtradb/btr/btr0pcur.c @@ -0,0 +1,584 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#include "btr0pcur.h" + +#ifdef UNIV_NONINL +#include "btr0pcur.ic" +#endif + +#include "ut0byte.h" +#include "rem0cmp.h" +#include "trx0trx.h" + +/****************************************************************** +Allocates memory for a persistent cursor object and initializes the cursor. */ +UNIV_INTERN +btr_pcur_t* +btr_pcur_create_for_mysql(void) +/*============================*/ + /* out, own: persistent cursor */ +{ + btr_pcur_t* pcur; + + pcur = mem_alloc(sizeof(btr_pcur_t)); + + pcur->btr_cur.index = NULL; + btr_pcur_init(pcur); + + return(pcur); +} + +/****************************************************************** +Frees the memory for a persistent cursor object. 
*/
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor) /* in, own: persistent cursor */
+{
+ if (cursor->old_rec_buf != NULL) {
+
+ mem_free(cursor->old_rec_buf);
+
+ cursor->old_rec_buf = NULL;
+ }
+
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_n_fields = 0;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+ mem_free(cursor);
+}
+
+/******************************************************************
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /* in: persistent cursor */
+ mtr_t* mtr) /* in: mtr */
+{
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ rec_t* rec;
+ dict_index_t* index;
+ page_t* page;
+ ulint offs;
+
+ ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ block = btr_pcur_get_block(cursor);
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ page_cursor = btr_pcur_get_page_cur(cursor);
+
+ rec = page_cur_get_rec(page_cursor);
+ page = page_align(rec);
+ offs = page_offset(rec);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_a(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) {
+ /* It must be an empty index tree; NOTE that in this case
+ we do not store the modify_clock, but always do a search
+ if we restore the cursor position */
+
+ ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+ if (page_rec_is_supremum_low(offs)) {
+
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
+ }
+
+ return;
+ }
+
+ if (page_rec_is_supremum_low(offs)) {
+
+ rec = page_rec_get_prev(rec);
+
+ cursor->rel_pos = BTR_PCUR_AFTER;
+
+ } else if (page_rec_is_infimum_low(offs)) {
+
+ rec = page_rec_get_next(rec);
+
+ cursor->rel_pos = BTR_PCUR_BEFORE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_ON;
+ }
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+ cursor->old_rec = dict_index_copy_rec_order_prefix(
+ index, rec, &cursor->old_n_fields,
+ &cursor->old_rec_buf, &cursor->buf_size);
+
+ cursor->block_when_stored = block;
+ cursor->modify_clock = buf_block_get_modify_clock(block);
+}
+
+/******************************************************************
+Copies the stored position of a pcur to another pcur.
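+The old_rec pointer is rebased onto the receiving cursor's own copy of
+old_rec_buf, so the two cursors can afterwards be freed independently.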
*/ +UNIV_INTERN +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /* in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate) /* in: pcur from which the info is + copied */ +{ + if (pcur_receive->old_rec_buf) { + mem_free(pcur_receive->old_rec_buf); + } + + ut_memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t)); + + if (pcur_donate->old_rec_buf) { + + pcur_receive->old_rec_buf = mem_alloc(pcur_donate->buf_size); + + ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf, + pcur_donate->buf_size); + pcur_receive->old_rec = pcur_receive->old_rec_buf + + (pcur_donate->old_rec - pcur_donate->old_rec_buf); + } + + pcur_receive->old_n_fields = pcur_donate->old_n_fields; +} + +/****************************************************************** +Restores the stored position of a persistent cursor bufferfixing the page and +obtaining the specified latches. If the cursor position was saved when the +(1) cursor was positioned on a user record: this function restores the position +to the last record LESS OR EQUAL to the stored record; +(2) cursor was positioned on a page infimum record: restores the position to +the last record LESS than the user record which was the successor of the page +infimum; +(3) cursor was positioned on the page supremum: restores to the first record +GREATER than the user record which was the predecessor of the supremum. +(4) cursor was positioned before the first or after the last in an empty tree: +restores to before first or after the last in the tree. */ +UNIV_INTERN +ibool +btr_pcur_restore_position( +/*======================*/ + /* out: TRUE if the cursor position + was stored when it was on a user record + and it can be restored on a user record + whose ordering fields are identical to + the ones of the original user record */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... 
*/ + btr_pcur_t* cursor, /* in: detached persistent cursor */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + dtuple_t* tuple; + ulint mode; + ulint old_mode; + mem_heap_t* heap; + + index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor)); + + if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED) + || UNIV_UNLIKELY(cursor->pos_state != BTR_PCUR_WAS_POSITIONED + && cursor->pos_state != BTR_PCUR_IS_POSITIONED)) { + ut_print_buf(stderr, cursor, sizeof(btr_pcur_t)); + putc('\n', stderr); + if (cursor->trx_if_known) { + trx_print(stderr, cursor->trx_if_known, 0); + } + + ut_error; + } + + if (UNIV_UNLIKELY + (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) { + + /* In these cases we do not try an optimistic restoration, + but always do a search */ + + btr_cur_open_at_index_side( + cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE, + index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr); + + cursor->block_when_stored = btr_pcur_get_block(cursor); + + return(FALSE); + } + + ut_a(cursor->old_rec); + ut_a(cursor->old_n_fields); + + if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF) + || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) { + /* Try optimistic restoration */ + + if (UNIV_LIKELY(buf_page_optimistic_get( + latch_mode, + cursor->block_when_stored, + cursor->modify_clock, mtr))) { + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + buf_block_dbg_add_level(btr_pcur_get_block(cursor), + SYNC_TREE_NODE); + + if (cursor->rel_pos == BTR_PCUR_ON) { +#ifdef UNIV_DEBUG + const rec_t* rec; + const ulint* offsets1; + const ulint* offsets2; +#endif /* UNIV_DEBUG */ + cursor->latch_mode = latch_mode; +#ifdef UNIV_DEBUG + rec = btr_pcur_get_rec(cursor); + + heap = mem_heap_create(256); + offsets1 = rec_get_offsets( + cursor->old_rec, index, NULL, + cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, NULL, + cursor->old_n_fields, &heap); + + ut_ad(!cmp_rec_rec(cursor->old_rec, + rec, offsets1, offsets2, + index)); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ + return(TRUE); + } + + return(FALSE); + } + } + + /* If optimistic restoration did not succeed, open the cursor anew */ + + heap = mem_heap_create(256); + + tuple = dict_index_build_data_tuple(index, cursor->old_rec, + cursor->old_n_fields, heap); + + /* Save the old search mode of the cursor */ + old_mode = cursor->search_mode; + + if (UNIV_LIKELY(cursor->rel_pos == BTR_PCUR_ON)) { + mode = PAGE_CUR_LE; + } else if (cursor->rel_pos == BTR_PCUR_AFTER) { + mode = PAGE_CUR_G; + } else { + ut_ad(cursor->rel_pos == BTR_PCUR_BEFORE); + mode = PAGE_CUR_L; + } + + btr_pcur_open_with_no_init(index, tuple, mode, latch_mode, + cursor, 0, mtr); + + /* Restore the old search mode */ + cursor->search_mode = old_mode; + + if (cursor->rel_pos == BTR_PCUR_ON + && btr_pcur_is_on_user_rec(cursor) + && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor), + rec_get_offsets( + btr_pcur_get_rec(cursor), index, + NULL, ULINT_UNDEFINED, &heap))) { + + /* We have to store the NEW value for the modify clock, since + the cursor can now be on a different page! 
But we can retain + the value of old_rec */ + + cursor->block_when_stored = btr_pcur_get_block(cursor); + cursor->modify_clock = buf_block_get_modify_clock( + cursor->block_when_stored); + cursor->old_stored = BTR_PCUR_OLD_STORED; + + mem_heap_free(heap); + + return(TRUE); + } + + mem_heap_free(heap); + + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. */ + + btr_pcur_store_position(cursor, mtr); + + return(FALSE); +} + +/****************************************************************** +If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY, +releases the page latch and bufferfix reserved by the cursor. +NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes +made by the current mini-transaction to the data protected by the +cursor latch, as then the latch must not be released until mtr_commit. */ +UNIV_INTERN +void +btr_pcur_release_leaf( +/*==================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + block = btr_pcur_get_block(cursor); + + btr_leaf_page_release(block, cursor->latch_mode, mtr); + + cursor->latch_mode = BTR_NO_LATCHES; + + cursor->pos_state = BTR_PCUR_WAS_POSITIONED; +} + +/************************************************************* +Moves the persistent cursor to the first record on the next page. Releases the +latch on the current page, and bufferunfixes it. Note that there must not be +modifications on the current page, as then the x-latch can be released only in +mtr_commit. */ +UNIV_INTERN +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /* in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr) /* in: mtr */ +{ + ulint next_page_no; + ulint space; + ulint zip_size; + page_t* page; + buf_block_t* next_block; + page_t* next_page; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_after_last_on_page(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + page = btr_pcur_get_page(cursor); + next_page_no = btr_page_get_next(page, mtr); + space = buf_block_get_space(btr_pcur_get_block(cursor)); + zip_size = buf_block_get_zip_size(btr_pcur_get_block(cursor)); + + ut_ad(next_page_no != FIL_NULL); + + next_block = btr_block_get(space, zip_size, next_page_no, + cursor->latch_mode, mtr); + next_page = buf_block_get_frame(next_block); +#ifdef UNIV_BTR_DEBUG + ut_a(page_is_comp(next_page) == page_is_comp(page)); + ut_a(btr_page_get_prev(next_page, mtr) + == buf_block_get_page_no(btr_pcur_get_block(cursor))); +#endif /* UNIV_BTR_DEBUG */ + next_block->check_index_page_at_flush = TRUE; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + cursor->latch_mode, mtr); + + page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor)); + + page_check_dir(next_page); +} + +/************************************************************* +Moves the persistent cursor backward if it is on the first record of the page. +Commits mtr. Note that to prevent a possible deadlock, the operation +first stores the position of the cursor, commits mtr, acquires the necessary +latches and restores the cursor position again before returning. 
The +alphabetical position of the cursor is guaranteed to be sensible on +return, but it may happen that the cursor is not positioned on the last +record of any page, because the structure of the tree may have changed +during the time when the cursor had no latches. */ +UNIV_INTERN +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /* in: persistent cursor, must be on the first + record of the current page */ + mtr_t* mtr) /* in: mtr */ +{ + ulint prev_page_no; + ulint space; + page_t* page; + buf_block_t* prev_block; + ulint latch_mode; + ulint latch_mode2; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_before_first_on_page(cursor)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr)); + + latch_mode = cursor->latch_mode; + + if (latch_mode == BTR_SEARCH_LEAF) { + + latch_mode2 = BTR_SEARCH_PREV; + + } else if (latch_mode == BTR_MODIFY_LEAF) { + + latch_mode2 = BTR_MODIFY_PREV; + } else { + latch_mode2 = 0; /* To eliminate compiler warning */ + ut_error; + } + + btr_pcur_store_position(cursor, mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + btr_pcur_restore_position(latch_mode2, cursor, mtr); + + page = btr_pcur_get_page(cursor); + + prev_page_no = btr_page_get_prev(page, mtr); + space = buf_block_get_space(btr_pcur_get_block(cursor)); + + if (prev_page_no == FIL_NULL) { + } else if (btr_pcur_is_before_first_on_page(cursor)) { + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(btr_pcur_get_block(cursor), + latch_mode, mtr); + + page_cur_set_after_last(prev_block, + btr_pcur_get_page_cur(cursor)); + } else { + + /* The repositioned cursor did not end on an infimum record on + a page. Cursor repositioning acquired a latch also on the + previous page, but we do not need the latch: release it. */ + + prev_block = btr_pcur_get_btr_cur(cursor)->left_block; + + btr_leaf_page_release(prev_block, latch_mode, mtr); + } + + cursor->latch_mode = latch_mode; + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/************************************************************* +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. */ +UNIV_INTERN +ibool +btr_pcur_move_to_prev( +/*==================*/ + /* out: TRUE if the cursor was not before first + in tree */ + btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + if (btr_pcur_is_before_first_on_page(cursor)) { + + if (btr_pcur_is_before_first_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_backward_from_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_prev_on_page(cursor); + + return(TRUE); +} + +/****************************************************************** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. 
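+
+A minimal usage sketch (illustrative only; pcur, rec and mtr stand for
+variables assumed to be set up by the caller):
+
+ btr_pcur_open_on_user_rec(index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (btr_pcur_is_on_user_rec(&pcur)) {
+ rec = btr_pcur_get_rec(&pcur);
+ }
+
+Note that the PAGE_CUR_L / PAGE_CUR_LE case is not implemented yet and
+asserts with ut_error below.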
*/ +UNIV_INTERN +void +btr_pcur_open_on_user_rec( +/*======================*/ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple, /* in: tuple on which search done */ + ulint mode, /* in: PAGE_CUR_L, ... */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /* in: memory buffer for persistent + cursor */ + mtr_t* mtr) /* in: mtr */ +{ + btr_pcur_open(index, tuple, mode, latch_mode, cursor, mtr); + + if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) { + + if (btr_pcur_is_after_last_on_page(cursor)) { + + btr_pcur_move_to_next_user_rec(cursor, mtr); + } + } else { + ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L)); + + /* Not implemented yet */ + + ut_error; + } +} diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c new file mode 100644 index 00000000000..8aafd738542 --- /dev/null +++ b/storage/xtradb/btr/btr0sea.c @@ -0,0 +1,1872 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "btr0sea.h" +#ifdef UNIV_NONINL +#include "btr0sea.ic" +#endif + +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "ha0ha.h" + +/* Flag: has the search system been enabled? +Protected by btr_search_latch and btr_search_enabled_mutex. */ +UNIV_INTERN char btr_search_enabled = TRUE; + +static mutex_t btr_search_enabled_mutex; + +/* A dummy variable to fool the compiler */ +UNIV_INTERN ulint btr_search_this_is_zero = 0; + +#ifdef UNIV_SEARCH_PERF_STAT +UNIV_INTERN ulint btr_search_n_succ = 0; +UNIV_INTERN ulint btr_search_n_hash_fail = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/* padding to prevent other memory update +hotspots from residing on the same memory +cache line as btr_search_latch */ +UNIV_INTERN byte btr_sea_pad1[64]; + +/* The latch protecting the adaptive search system: this latch protects the +(1) positions of records on those pages where a hash index has been built. +NOTE: It does not protect values of non-ordering fields within a record from +being updated in-place! We can use fact (1) to perform unique searches to +indexes. 
*/
+
+/* We will allocate the latch from dynamic memory to get it to the
+same DRAM page as other hotspot semaphores */
+UNIV_INTERN rw_lock_t* btr_search_latch_temp;
+
+/* padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte btr_sea_pad2[64];
+
+UNIV_INTERN btr_search_sys_t* btr_search_sys;
+
+/* If the number of records on the page divided by this parameter
+would have been successfully accessed using a hash index, the index
+is then built on the page, assuming the global limit has been reached */
+
+#define BTR_SEARCH_PAGE_BUILD_LIMIT 16
+
+/* The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+
+#define BTR_SEARCH_BUILD_LIMIT 100
+
+/************************************************************************
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+ dict_index_t* index, /* in: index for which to build, or NULL if
+ not known */
+ buf_block_t* block, /* in: index page, s- or x-latched */
+ ulint n_fields,/* in: hash this many full fields */
+ ulint n_bytes,/* in: hash this many bytes from the next
+ field */
+ ibool left_side);/* in: hash for searches from left side? */
+
+/*********************************************************************
+This function should be called before reserving any btr search mutex, if
+the intended operation might add nodes to the search system hash table.
+Because of the latching order, once we have reserved the btr search system
+latch, we cannot allocate a free frame from the buffer pool. Checks that
+there is a free buffer frame allocated for the hash table heap in the btr
+search system. If not, allocates a free frame for the heap. This check makes
+it probable that, when we have reserved the btr search system latch and we
+need to allocate a new node to the hash table, it will succeed. However, the
+check will not guarantee success. */
+static
+void
+btr_search_check_free_space_in_heap(void)
+/*=====================================*/
+{
+ hash_table_t* table;
+ mem_heap_t* heap;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ table = btr_search_sys->hash_index;
+
+ heap = table->heap;
+
+ /* Note that we peek the value of heap->free_block without reserving
+ the latch: this is ok, because we will not guarantee that there will
+ be enough free space in the hash table. */
+
+ if (heap->free_block == NULL) {
+ buf_block_t* block = buf_block_alloc(0);
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (heap->free_block == NULL) {
+ heap->free_block = block;
+ } else {
+ buf_block_free(block);
+ }
+
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+}
+
+/*********************************************************************
+Creates and initializes the adaptive search system at a database start.
*/ +UNIV_INTERN +void +btr_search_sys_create( +/*==================*/ + ulint hash_size) /* in: hash index hash table size */ +{ + /* We allocate the search latch from dynamic memory: + see above at the global variable definition */ + + btr_search_latch_temp = mem_alloc(sizeof(rw_lock_t)); + + rw_lock_create(&btr_search_latch, SYNC_SEARCH_SYS); + mutex_create(&btr_search_enabled_mutex, SYNC_SEARCH_SYS_CONF); + + btr_search_sys = mem_alloc(sizeof(btr_search_sys_t)); + + btr_search_sys->hash_index = ha_create(hash_size, 0, 0); +} + +/************************************************************************ +Disable the adaptive hash search system and empty the index. */ +UNIV_INTERN +void +btr_search_disable(void) +/*====================*/ +{ + mutex_enter(&btr_search_enabled_mutex); + rw_lock_x_lock(&btr_search_latch); + + btr_search_enabled = FALSE; + + /* Clear all block->is_hashed flags and remove all entries + from btr_search_sys->hash_index. */ + buf_pool_drop_hash_index(); + + /* btr_search_enabled_mutex should guarantee this. */ + ut_ad(!btr_search_enabled); + + rw_lock_x_unlock(&btr_search_latch); + mutex_exit(&btr_search_enabled_mutex); +} + +/************************************************************************ +Enable the adaptive hash search system. */ +UNIV_INTERN +void +btr_search_enable(void) +/*====================*/ +{ + mutex_enter(&btr_search_enabled_mutex); + rw_lock_x_lock(&btr_search_latch); + + btr_search_enabled = TRUE; + + rw_lock_x_unlock(&btr_search_latch); + mutex_exit(&btr_search_enabled_mutex); +} + +/********************************************************************* +Creates and initializes a search info struct. */ +UNIV_INTERN +btr_search_t* +btr_search_info_create( +/*===================*/ + /* out, own: search info struct */ + mem_heap_t* heap) /* in: heap where created */ +{ + btr_search_t* info; + + info = mem_heap_alloc(heap, sizeof(btr_search_t)); + +#ifdef UNIV_DEBUG + info->magic_n = BTR_SEARCH_MAGIC_N; +#endif /* UNIV_DEBUG */ + + info->ref_count = 0; + info->root_guess = NULL; + + info->hash_analysis = 0; + info->n_hash_potential = 0; + + info->last_hash_succ = FALSE; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_succ = 0; + info->n_hash_fail = 0; + info->n_patt_succ = 0; + info->n_searches = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + + /* Set some sensible values */ + info->n_fields = 1; + info->n_bytes = 0; + + info->left_side = TRUE; + + return(info); +} + +/********************************************************************* +Returns the value of ref_count. The value is protected by +btr_search_latch. */ +UNIV_INTERN +ulint +btr_search_info_get_ref_count( +/*==========================*/ + /* out: ref_count value. */ + btr_search_t* info) /* in: search info. */ +{ + ulint ret; + + ut_ad(info); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(&btr_search_latch); + ret = info->ref_count; + rw_lock_s_unlock(&btr_search_latch); + + return(ret); +} + +/************************************************************************* +Updates the search info of an index about hash successes. NOTE that info +is NOT protected by any semaphore, to save CPU time! Do not assume its fields +are consistent. 
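+
+As an illustrative example of the recommendation logic below: in an index
+with n_unique = 3, a search ending with cursor->low_match = 1 and
+cursor->up_match = 2 sets a new recommendation of n_fields = low_match + 1
+= 2, n_bytes = 0, left_side = TRUE; that is, a hash prefix of two full
+fields would have guided this search to the right position.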
*/ +static +void +btr_search_info_update_hash( +/*========================*/ + btr_search_t* info, /* in/out: search info */ + btr_cur_t* cursor) /* in: cursor which was just positioned */ +{ + dict_index_t* index; + ulint n_unique; + int cmp; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = cursor->index; + + if (dict_index_is_ibuf(index)) { + /* So many deletes are performed on an insert buffer tree + that we do not consider a hash index useful on it: */ + + return; + } + + n_unique = dict_index_get_n_unique_in_tree(index); + + if (info->n_hash_potential == 0) { + + goto set_new_recomm; + } + + /* Test if the search would have succeeded using the recommended + hash prefix */ + + if (info->n_fields >= n_unique && cursor->up_match >= n_unique) { +increment_potential: + info->n_hash_potential++; + + return; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->low_match, cursor->low_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto set_new_recomm; + } + + cmp = ut_pair_cmp(info->n_fields, info->n_bytes, + cursor->up_match, cursor->up_bytes); + + if (info->left_side ? cmp <= 0 : cmp > 0) { + + goto increment_potential; + } + +set_new_recomm: + /* We have to set a new recommendation; skip the hash analysis + for a while to avoid unnecessary CPU time usage when there is no + chance for success */ + + info->hash_analysis = 0; + + cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes, + cursor->low_match, cursor->low_bytes); + if (cmp == 0) { + info->n_hash_potential = 0; + + /* For extra safety, we set some sensible values here */ + + info->n_fields = 1; + info->n_bytes = 0; + + info->left_side = TRUE; + + } else if (cmp > 0) { + info->n_hash_potential = 1; + + if (cursor->up_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match < cursor->up_match) { + + info->n_fields = cursor->low_match + 1; + info->n_bytes = 0; + } else { + info->n_fields = cursor->low_match; + info->n_bytes = cursor->low_bytes + 1; + } + + info->left_side = TRUE; + } else { + info->n_hash_potential = 1; + + if (cursor->low_match >= n_unique) { + + info->n_fields = n_unique; + info->n_bytes = 0; + + } else if (cursor->low_match > cursor->up_match) { + + info->n_fields = cursor->up_match + 1; + info->n_bytes = 0; + } else { + info->n_fields = cursor->up_match; + info->n_bytes = cursor->up_bytes + 1; + } + + info->left_side = FALSE; + } +} + +/************************************************************************* +Updates the block search info on hash successes. NOTE that info and +block->n_hash_helps, n_fields, n_bytes, side are NOT protected by any +semaphore, to save CPU time! Do not assume the fields are consistent. 
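+
+A worked example of the limits applied below: with
+BTR_SEARCH_PAGE_BUILD_LIMIT = 16 and BTR_SEARCH_BUILD_LIMIT = 100, a page
+holding 320 records is recommended for hashing only after more than
+320 / 16 = 20 qualifying searches have hit it with unchanged parameters
+and info->n_hash_potential has reached 100.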
*/ +static +ibool +btr_search_update_block_hash_info( +/*==============================*/ + /* out: TRUE if building a (new) hash index on + the block is recommended */ + btr_search_t* info, /* in: search info */ + buf_block_t* block, /* in: buffer block */ + btr_cur_t* cursor __attribute__((unused))) + /* in: cursor */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); + ut_ad(rw_lock_own(&block->lock, RW_LOCK_SHARED) + || rw_lock_own(&block->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(cursor); + + info->last_hash_succ = FALSE; + + ut_a(buf_block_state_valid(block)); + ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); + + if ((block->n_hash_helps > 0) + && (info->n_hash_potential > 0) + && (block->n_fields == info->n_fields) + && (block->n_bytes == info->n_bytes) + && (block->left_side == info->left_side)) { + + if ((block->is_hashed) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + + /* The search would presumably have succeeded using + the hash index */ + + info->last_hash_succ = TRUE; + } + + block->n_hash_helps++; + } else { + block->n_hash_helps = 1; + block->n_fields = info->n_fields; + block->n_bytes = info->n_bytes; + block->left_side = info->left_side; + } + +#ifdef UNIV_DEBUG + if (cursor->index->table->does_not_fit_in_memory) { + block->n_hash_helps = 0; + } +#endif /* UNIV_DEBUG */ + + if ((block->n_hash_helps > page_get_n_recs(block->frame) + / BTR_SEARCH_PAGE_BUILD_LIMIT) + && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) { + + if ((!block->is_hashed) + || (block->n_hash_helps + > 2 * page_get_n_recs(block->frame)) + || (block->n_fields != block->curr_n_fields) + || (block->n_bytes != block->curr_n_bytes) + || (block->left_side != block->curr_left_side)) { + + /* Build a new hash index on the page */ + + return(TRUE); + } + } + + return(FALSE); +} + +/************************************************************************* +Updates a hash node reference when it has been unsuccessfully used in a +search which could have succeeded with the used hash parameters. This can +happen because when building a hash index for a page, we do not check +what happens at page boundaries, and therefore there can be misleading +hash nodes. Also, collisions in the fold value can lead to misleading +references. This function lazily fixes these imperfections in the hash +index. 
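+
+For instance, two different key prefixes can fold to the same value, so a
+hash-guided search can land on a record other than the one searched for;
+this function then inserts a node for the fold of the record actually
+under the cursor, repairing the reference for later searches.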
*/ +static +void +btr_search_update_hash_ref( +/*=======================*/ + btr_search_t* info, /* in: search info */ + buf_block_t* block, /* in: buffer block where cursor positioned */ + btr_cur_t* cursor) /* in: cursor */ +{ + ulint fold; + rec_t* rec; + dulint index_id; + + ut_ad(cursor->flag == BTR_CUR_HASH_FAIL); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(page_align(btr_cur_get_rec(cursor)) + == buf_block_get_frame(block)); + + if (!block->is_hashed) { + + return; + } + + ut_a(block->index == cursor->index); + ut_a(!dict_index_is_ibuf(cursor->index)); + + if ((info->n_hash_potential > 0) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_user_rec(rec)) { + + return; + } + + index_id = cursor->index->id; + fold = rec_fold(rec, + rec_get_offsets(rec, cursor->index, offsets_, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, index_id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ha_insert_for_fold(btr_search_sys->hash_index, fold, + block, rec); + } +} + +/************************************************************************* +Updates the search info. */ +UNIV_INTERN +void +btr_search_info_update_slow( +/*========================*/ + btr_search_t* info, /* in/out: search info */ + btr_cur_t* cursor) /* in: cursor which was just positioned */ +{ + buf_block_t* block; + ibool build_index; + ulint* params; + ulint* params2; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + block = btr_cur_get_block(cursor); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + build_index = btr_search_update_block_hash_info(info, block, cursor); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + rw_lock_x_lock(&btr_search_latch); + + btr_search_update_hash_ref(info, block, cursor); + + rw_lock_x_unlock(&btr_search_latch); + } + + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. We have + to check inside the function call that they make sense. We + also malloc an array and store the values there to make sure + the compiler does not let the function call parameters change + inside the called function. It might be that the compiler + would optimize the call just to pass pointers to block. 
*/ + + params = mem_alloc(3 * sizeof(ulint)); + params[0] = block->n_fields; + params[1] = block->n_bytes; + params[2] = block->left_side; + + /* Make sure the compiler cannot deduce the values and do + optimizations */ + + params2 = params + btr_search_this_is_zero; + + btr_search_build_page_hash_index(cursor->index, + block, + params2[0], + params2[1], + params2[2]); + mem_free(params); + } +} + +/********************************************************************** +Checks if a guessed position for a tree cursor is right. Note that if +mode is PAGE_CUR_LE, which is used in inserts, and the function returns +TRUE, then cursor->up_match and cursor->low_match both have sensible values. */ +static +ibool +btr_search_check_guess( +/*===================*/ + /* out: TRUE if success */ + btr_cur_t* cursor, /* in: guessed cursor position */ + ibool can_only_compare_to_cursor_rec, + /* in: if we do not have a latch on the page + of cursor, but only a latch on + btr_search_latch, then ONLY the columns + of the record UNDER the cursor are + protected, not the next or previous record + in the chain: we cannot look at the next or + previous record to check our guess! */ + const dtuple_t* tuple, /* in: data tuple */ + ulint mode, /* in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, + or PAGE_CUR_GE */ + mtr_t* mtr) /* in: mtr */ +{ + rec_t* rec; + ulint n_unique; + ulint match; + ulint bytes; + int cmp; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ibool success = FALSE; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique_in_tree(cursor->index); + + rec = btr_cur_get_rec(cursor); + + ut_ad(page_rec_is_user_rec(rec)); + + match = 0; + bytes = 0; + + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, rec, + offsets, &match, &bytes); + + if (mode == PAGE_CUR_GE) { + if (cmp == 1) { + goto exit_func; + } + + cursor->up_match = match; + + if (match >= n_unique) { + success = TRUE; + goto exit_func; + } + } else if (mode == PAGE_CUR_LE) { + if (cmp == -1) { + goto exit_func; + } + + cursor->low_match = match; + + } else if (mode == PAGE_CUR_G) { + if (cmp != -1) { + goto exit_func; + } + } else if (mode == PAGE_CUR_L) { + if (cmp != 1) { + goto exit_func; + } + } + + if (can_only_compare_to_cursor_rec) { + /* Since we could not determine if our guess is right just by + looking at the record under the cursor, return FALSE */ + goto exit_func; + } + + match = 0; + bytes = 0; + + if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { + rec_t* prev_rec; + + ut_ad(!page_rec_is_infimum(rec)); + + prev_rec = page_rec_get_prev(rec); + + if (page_rec_is_infimum(prev_rec)) { + success = btr_page_get_prev(page_align(prev_rec), mtr) + == FIL_NULL; + + goto exit_func; + } + + offsets = rec_get_offsets(prev_rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec, + offsets, &match, &bytes); + if (mode == PAGE_CUR_GE) { + success = cmp == 1; + } else { + success = cmp != -1; + } + + goto exit_func; + } else { + rec_t* next_rec; + + ut_ad(!page_rec_is_supremum(rec)); + + next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + if (btr_page_get_next(page_align(next_rec), mtr) + == FIL_NULL) { + + cursor->up_match = 0; + success = TRUE; + } + + goto exit_func; + } + + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_unique, &heap); + cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec, + offsets, &match, &bytes); + 
if (mode == PAGE_CUR_LE) { + success = cmp == -1; + cursor->up_match = match; + } else { + success = cmp != 1; + } + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +/********************************************************************** +Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. */ +UNIV_INTERN +ibool +btr_search_guess_on_hash( +/*=====================*/ + /* out: TRUE if succeeded */ + dict_index_t* index, /* in: index */ + btr_search_t* info, /* in: index search info */ + const dtuple_t* tuple, /* in: logical record */ + ulint mode, /* in: PAGE_CUR_L, ... */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ...; + NOTE that only if has_search_latch + is 0, we will have a latch set on + the cursor page, otherwise we assume + the caller uses his search latch + to protect the record! */ + btr_cur_t* cursor, /* out: tree cursor */ + ulint has_search_latch,/* in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, RW_X_LATCH, or 0 */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + rec_t* rec; + ulint fold; + dulint index_id; +#ifdef notdefined + btr_cur_t cursor2; + btr_pcur_t pcur; +#endif + ut_ad(index && info && tuple && cursor && mtr); + ut_ad((latch_mode == BTR_SEARCH_LEAF) + || (latch_mode == BTR_MODIFY_LEAF)); + + /* Note that, for efficiency, the struct info may not be protected by + any latch here! */ + + if (UNIV_UNLIKELY(info->n_hash_potential == 0)) { + + return(FALSE); + } + + cursor->n_fields = info->n_fields; + cursor->n_bytes = info->n_bytes; + + if (UNIV_UNLIKELY(dtuple_get_n_fields(tuple) + < cursor->n_fields + (cursor->n_bytes > 0))) { + + return(FALSE); + } + + index_id = index->id; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_succ++; +#endif + fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id); + + cursor->fold = fold; + cursor->flag = BTR_CUR_HASH; + + if (UNIV_LIKELY(!has_search_latch)) { + rw_lock_s_lock(&btr_search_latch); + + if (UNIV_UNLIKELY(!btr_search_enabled)) { + goto failure_unlock; + } + } + + ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX); + ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0); + + rec = ha_search_and_get_data(btr_search_sys->hash_index, fold); + + if (UNIV_UNLIKELY(!rec)) { + goto failure_unlock; + } + + block = buf_block_align(rec); + + if (UNIV_LIKELY(!has_search_latch)) { + + if (UNIV_UNLIKELY( + !buf_page_get_known_nowait(latch_mode, block, + BUF_MAKE_YOUNG, + __FILE__, __LINE__, + mtr))) { + goto failure_unlock; + } + + rw_lock_s_unlock(&btr_search_latch); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + } + + if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) { + ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH); + + if (UNIV_LIKELY(!has_search_latch)) { + + btr_leaf_page_release(block, latch_mode, mtr); + } + + goto failure; + } + + ut_ad(page_rec_is_user_rec(rec)); + + btr_cur_position(index, rec, block, cursor); + + /* Check the validity of the guess within the page */ + + /* If we only have the latch on btr_search_latch, not on the + page, it only protects the columns of the record the cursor + is positioned on. We cannot look at the next of the previous + record to determine if our guess for the cursor position is + right. 
*/ + if (UNIV_EXPECT + (ut_dulint_cmp(index_id, btr_page_get_index_id(block->frame)), 0) + || !btr_search_check_guess(cursor, + has_search_latch, + tuple, mode, mtr)) { + if (UNIV_LIKELY(!has_search_latch)) { + btr_leaf_page_release(block, latch_mode, mtr); + } + + goto failure; + } + + if (UNIV_LIKELY(info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5)) { + + info->n_hash_potential++; + } + +#ifdef notdefined + /* These lines of code can be used in a debug version to check + the correctness of the searched cursor position: */ + + info->last_hash_succ = FALSE; + + /* Currently, does not work if the following fails: */ + ut_ad(!has_search_latch); + + btr_leaf_page_release(block, latch_mode, mtr); + + btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, + &cursor2, 0, mtr); + if (mode == PAGE_CUR_GE + && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) { + + /* If mode is PAGE_CUR_GE, then the binary search + in the index tree may actually take us to the supremum + of the previous page */ + + info->last_hash_succ = FALSE; + + btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode, + &pcur, mtr); + ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor)); + } else { + ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor)); + } + + /* NOTE that it is theoretically possible that the above assertions + fail if the page of the cursor gets removed from the buffer pool + meanwhile! Thus it might not be a bug. */ +#endif + info->last_hash_succ = TRUE; + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_succ++; +#endif + if (UNIV_LIKELY(!has_search_latch) + && buf_page_peek_if_too_old(&block->page)) { + + buf_page_make_young(&block->page); + } + + /* Increment the page get statistics though we did not really + fix the page: for user info only */ + + buf_pool->n_page_gets++; + + return(TRUE); + + /*-------------------------------------------*/ +failure_unlock: + if (UNIV_LIKELY(!has_search_latch)) { + rw_lock_s_unlock(&btr_search_latch); + } +failure: + cursor->flag = BTR_CUR_HASH_FAIL; + +#ifdef UNIV_SEARCH_PERF_STAT + info->n_hash_fail++; + + if (info->n_hash_succ > 0) { + info->n_hash_succ--; + } +#endif + info->last_hash_succ = FALSE; + + return(FALSE); +} + +/************************************************************************ +Drops a page hash index. 
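+
+The fold values of all user records on the page are first collected under
+an s-latch on btr_search_latch; the removal itself runs under the x-latch,
+and if another thread has meanwhile rebuilt the hash index on the page
+with different parameters, the collection is retried from the start.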
*/ +UNIV_INTERN +void +btr_search_drop_page_hash_index( +/*============================*/ + buf_block_t* block) /* in: block containing index page, + s- or x-latched, or an index page + for which we know that + block->buf_fix_count == 0 */ +{ + hash_table_t* table; + ulint n_fields; + ulint n_bytes; + const page_t* page; + const rec_t* rec; + ulint fold; + ulint prev_fold; + dulint index_id; + ulint n_cached; + ulint n_recs; + ulint* folds; + ulint i; + mem_heap_t* heap; + const dict_index_t* index; + ulint* offsets; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + +retry: + rw_lock_s_lock(&btr_search_latch); + page = block->frame; + + if (UNIV_LIKELY(!block->is_hashed)) { + + rw_lock_s_unlock(&btr_search_latch); + + return; + } + + table = btr_search_sys->hash_index; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX) + || (block->page.buf_fix_count == 0)); +#endif /* UNIV_SYNC_DEBUG */ + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + index = block->index; + ut_a(!dict_index_is_ibuf(index)); + + /* NOTE: The fields of block must not be accessed after + releasing btr_search_latch, as the index page might only + be s-latched! */ + + rw_lock_s_unlock(&btr_search_latch); + + ut_a(n_fields + n_bytes > 0); + + n_recs = page_get_n_recs(page); + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + folds = mem_alloc(n_recs * sizeof(ulint)); + + n_cached = 0; + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + + index_id = btr_page_get_index_id(page); + + ut_a(0 == ut_dulint_cmp(index_id, index->id)); + + prev_fold = 0; + + heap = NULL; + offsets = NULL; + + while (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0)); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + + folds[n_cached] = fold; + n_cached++; +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + prev_fold = fold; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + rw_lock_x_lock(&btr_search_latch); + + if (UNIV_UNLIKELY(!block->is_hashed)) { + /* Someone else has meanwhile dropped the hash index */ + + goto cleanup; + } + + ut_a(block->index == index); + + if (UNIV_UNLIKELY(block->curr_n_fields != n_fields) + || UNIV_UNLIKELY(block->curr_n_bytes != n_bytes)) { + + /* Someone else has meanwhile built a new hash index on the + page, with different parameters */ + + rw_lock_x_unlock(&btr_search_latch); + + mem_free(folds); + goto retry; + } + + for (i = 0; i < n_cached; i++) { + + ha_remove_all_nodes_to_page(table, folds[i], page); + } + + ut_a(index->search_info->ref_count > 0); + index->search_info->ref_count--; + + block->is_hashed = FALSE; + block->index = NULL; + +cleanup: +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (UNIV_UNLIKELY(block->n_pointers)) { + /* Corruption */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Corruption of adaptive hash index." 
+ " After dropping\n" + "InnoDB: the hash index to a page of %s," + " still %lu hash nodes remain.\n", + index->name, (ulong) block->n_pointers); + rw_lock_x_unlock(&btr_search_latch); + + btr_search_validate(); + } else { + rw_lock_x_unlock(&btr_search_latch); + } +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + rw_lock_x_unlock(&btr_search_latch); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + mem_free(folds); +} + +/************************************************************************ +Drops a page hash index when a page is freed from a fseg to the file system. +Drops possible hash index if the page happens to be in the buffer pool. */ +UNIV_INTERN +void +btr_search_drop_page_hash_when_freed( +/*=================================*/ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no) /* in: page number */ +{ + buf_block_t* block; + mtr_t mtr; + + if (!buf_page_peek_if_search_hashed(space, page_no)) { + + return; + } + + mtr_start(&mtr); + + /* We assume that if the caller has a latch on the page, then the + caller has already dropped the hash index for the page, and we never + get here. Therefore we can acquire the s-latch to the page without + having to fear a deadlock. */ + + block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL, + BUF_GET_IF_IN_POOL, __FILE__, __LINE__, + &mtr); + /* Because the buffer pool mutex was released by + buf_page_peek_if_search_hashed(), it is possible that the + block was removed from the buffer pool by another thread + before buf_page_get_gen() got a chance to acquire the buffer + pool mutex again. Thus, we must check for a NULL return. */ + + if (UNIV_LIKELY(block != NULL)) { + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH); + + btr_search_drop_page_hash_index(block); + } + + mtr_commit(&mtr); +} + +/************************************************************************ +Builds a hash index on a page with the given parameters. If the page already +has a hash index with different parameters, the old hash index is removed. +If index is non-NULL, this function checks if n_fields and n_bytes are +sensible values, and does not build a hash index if not. */ +static +void +btr_search_build_page_hash_index( +/*=============================*/ + dict_index_t* index, /* in: index for which to build */ + buf_block_t* block, /* in: index page, s- or x-latched */ + ulint n_fields,/* in: hash this many full fields */ + ulint n_bytes,/* in: hash this many bytes from the next + field */ + ibool left_side)/* in: hash for searches from left side? 
*/ +{ + hash_table_t* table; + page_t* page; + rec_t* rec; + rec_t* next_rec; + ulint fold; + ulint next_fold; + dulint index_id; + ulint n_cached; + ulint n_recs; + ulint* folds; + rec_t** recs; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index); + ut_a(!dict_index_is_ibuf(index)); + + table = btr_search_sys->hash_index; + page = buf_block_get_frame(block); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + rw_lock_s_lock(&btr_search_latch); + + if (block->is_hashed && ((block->curr_n_fields != n_fields) + || (block->curr_n_bytes != n_bytes) + || (block->curr_left_side != left_side))) { + + rw_lock_s_unlock(&btr_search_latch); + + btr_search_drop_page_hash_index(block); + } else { + rw_lock_s_unlock(&btr_search_latch); + } + + n_recs = page_get_n_recs(page); + + if (n_recs == 0) { + + return; + } + + /* Check that the values for hash index build are sensible */ + + if (n_fields + n_bytes == 0) { + + return; + } + + if (dict_index_get_n_unique_in_tree(index) < n_fields + || (dict_index_get_n_unique_in_tree(index) == n_fields + && n_bytes > 0)) { + return; + } + + /* Calculate and cache fold values and corresponding records into + an array for fast insertion to the hash index */ + + folds = mem_alloc(n_recs * sizeof(ulint)); + recs = mem_alloc(n_recs * sizeof(rec_t*)); + + n_cached = 0; + + index_id = btr_page_get_index_id(page); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + offsets = rec_get_offsets(rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + + if (!page_rec_is_supremum(rec)) { + ut_a(n_fields <= rec_offs_n_fields(offsets)); + + if (n_bytes > 0) { + ut_a(n_fields < rec_offs_n_fields(offsets)); + } + } + + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + + if (left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + for (;;) { + next_rec = page_rec_get_next(rec); + + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + + break; + } + + offsets = rec_get_offsets(next_rec, index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index_id); + + if (fold != next_fold) { + /* Insert an entry into the hash index */ + + if (left_side) { + + folds[n_cached] = next_fold; + recs[n_cached] = next_rec; + n_cached++; + } else { + folds[n_cached] = fold; + recs[n_cached] = rec; + n_cached++; + } + } + + rec = next_rec; + fold = next_fold; + } + + btr_search_check_free_space_in_heap(); + + rw_lock_x_lock(&btr_search_latch); + + if (UNIV_UNLIKELY(!btr_search_enabled)) { + goto exit_func; + } + + if (block->is_hashed && ((block->curr_n_fields != n_fields) + || (block->curr_n_bytes != n_bytes) + || (block->curr_left_side != left_side))) { + goto exit_func; + } + + /* This counter is decremented every time we drop page + hash index entries and is incremented here. Since we can + rebuild hash index for a page that is already hashed, we + have to take care not to increment the counter in that + case. 
*/ + if (!block->is_hashed) { + index->search_info->ref_count++; + } + + block->is_hashed = TRUE; + block->n_hash_helps = 0; + + block->curr_n_fields = n_fields; + block->curr_n_bytes = n_bytes; + block->curr_left_side = left_side; + block->index = index; + + for (i = 0; i < n_cached; i++) { + + ha_insert_for_fold(table, folds[i], block, recs[i]); + } + +exit_func: + rw_lock_x_unlock(&btr_search_latch); + + mem_free(folds); + mem_free(recs); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/************************************************************************ +Moves or deletes hash entries for moved records. If new_page is already hashed, +then the hash index for page, if any, is dropped. If new_page is not hashed, +and page is hashed, then a new hash index is built to new_page with the same +parameters as page (this often happens when a page is split). */ +UNIV_INTERN +void +btr_search_move_or_delete_hash_entries( +/*===================================*/ + buf_block_t* new_block, /* in: records are copied + to this page */ + buf_block_t* block, /* in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index) /* in: record descriptor */ +{ + ulint n_fields; + ulint n_bytes; + ibool left_side; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); + ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_a(!new_block->is_hashed || new_block->index == index); + ut_a(!block->is_hashed || block->index == index); + ut_a(!(new_block->is_hashed || block->is_hashed) + || !dict_index_is_ibuf(index)); + + rw_lock_s_lock(&btr_search_latch); + + if (new_block->is_hashed) { + + rw_lock_s_unlock(&btr_search_latch); + + btr_search_drop_page_hash_index(block); + + return; + } + + if (block->is_hashed) { + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + left_side = block->curr_left_side; + + new_block->n_fields = block->curr_n_fields; + new_block->n_bytes = block->curr_n_bytes; + new_block->left_side = left_side; + + rw_lock_s_unlock(&btr_search_latch); + + ut_a(n_fields + n_bytes > 0); + + btr_search_build_page_hash_index(index, new_block, n_fields, + n_bytes, left_side); + ut_ad(n_fields == block->curr_n_fields); + ut_ad(n_bytes == block->curr_n_bytes); + ut_ad(left_side == block->curr_left_side); + return; + } + + rw_lock_s_unlock(&btr_search_latch); +} + +/************************************************************************ +Updates the page hash index when a single record is deleted from a page. 
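+
+The fold value of the record is computed while the record still exists on
+the page; the matching hash node, if any, is then removed under an x-latch
+on btr_search_latch.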
*/ +UNIV_INTERN +void +btr_search_update_hash_on_delete( +/*=============================*/ + btr_cur_t* cursor) /* in: cursor which was positioned on the + record to delete using btr_cur_search_..., + the record is not yet deleted */ +{ + hash_table_t* table; + buf_block_t* block; + rec_t* rec; + ulint fold; + dulint index_id; + ibool found; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t* heap = NULL; + rec_offs_init(offsets_); + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!block->is_hashed) { + + return; + } + + ut_a(block->index == cursor->index); + ut_a(block->curr_n_fields + block->curr_n_bytes > 0); + ut_a(!dict_index_is_ibuf(cursor->index)); + + table = btr_search_sys->hash_index; + + index_id = cursor->index->id; + fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, offsets_, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, block->curr_n_bytes, index_id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + rw_lock_x_lock(&btr_search_latch); + + found = ha_search_and_delete_if_found(table, fold, rec); + + rw_lock_x_unlock(&btr_search_latch); +} + +/************************************************************************ +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_node_on_insert( +/*==================================*/ + btr_cur_t* cursor) /* in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +{ + hash_table_t* table; + buf_block_t* block; + rec_t* rec; + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!block->is_hashed) { + + return; + } + + ut_a(block->index == cursor->index); + ut_a(!dict_index_is_ibuf(cursor->index)); + + rw_lock_x_lock(&btr_search_latch); + + if ((cursor->flag == BTR_CUR_HASH) + && (cursor->n_fields == block->curr_n_fields) + && (cursor->n_bytes == block->curr_n_bytes) + && !block->curr_left_side) { + + table = btr_search_sys->hash_index; + + ha_search_and_update_if_found(table, cursor->fold, rec, + block, page_rec_get_next(rec)); + + rw_lock_x_unlock(&btr_search_latch); + } else { + rw_lock_x_unlock(&btr_search_latch); + + btr_search_update_hash_on_insert(cursor); + } +} + +/************************************************************************ +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_insert( +/*=============================*/ + btr_cur_t* cursor) /* in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +{ + hash_table_t* table; + buf_block_t* block; + rec_t* rec; + rec_t* ins_rec; + rec_t* next_rec; + dulint index_id; + ulint fold; + ulint ins_fold; + ulint next_fold = 0; /* remove warning (??? bug ???) 
*/ + ulint n_fields; + ulint n_bytes; + ibool left_side; + ibool locked = FALSE; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + table = btr_search_sys->hash_index; + + btr_search_check_free_space_in_heap(); + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!block->is_hashed) { + + return; + } + + ut_a(block->index == cursor->index); + ut_a(!dict_index_is_ibuf(cursor->index)); + + index_id = cursor->index->id; + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + left_side = block->curr_left_side; + + ins_rec = page_rec_get_next(rec); + next_rec = page_rec_get_next(ins_rec); + + offsets = rec_get_offsets(ins_rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index_id); + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets(next_rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index_id); + } + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + n_fields + (n_bytes > 0), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id); + } else { + if (left_side) { + + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + + goto check_next_rec; + } + + if (fold != ins_fold) { + + if (!locked) { + + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + } + + if (!left_side) { + ha_insert_for_fold(table, fold, block, rec); + } else { + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + } + +check_next_rec: + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + + if (!locked) { + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + } + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + } + + goto function_exit; + } + + if (ins_fold != next_fold) { + + if (!locked) { + + rw_lock_x_lock(&btr_search_latch); + + locked = TRUE; + } + + if (!left_side) { + + ha_insert_for_fold(table, ins_fold, block, ins_rec); + /* + fputs("Hash insert for ", stderr); + dict_index_name_print(stderr, cursor->index); + fprintf(stderr, " fold %lu\n", ins_fold); + */ + } else { + ha_insert_for_fold(table, next_fold, block, next_rec); + } + } + +function_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + if (locked) { + rw_lock_x_unlock(&btr_search_latch); + } +} + +/************************************************************************ +Validates the search system. */ +UNIV_INTERN +ibool +btr_search_validate(void) +/*=====================*/ + /* out: TRUE if ok */ +{ + ha_node_t* node; + ulint n_page_dumps = 0; + ibool ok = TRUE; + ulint i; + ulint cell_count; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + /* How many cells to check before temporarily releasing + btr_search_latch. */ + ulint chunk_size = 10000; + + rec_offs_init(offsets_); + + rw_lock_x_lock(&btr_search_latch); + buf_pool_mutex_enter(); + + cell_count = hash_get_n_cells(btr_search_sys->hash_index); + + for (i = 0; i < cell_count; i++) { + /* We release btr_search_latch every once in a while to + give other queries a chance to run. 
*/ + if ((i != 0) && ((i % chunk_size) == 0)) { + buf_pool_mutex_exit(); + rw_lock_x_unlock(&btr_search_latch); + os_thread_yield(); + rw_lock_x_lock(&btr_search_latch); + buf_pool_mutex_enter(); + } + + node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node; + + for (; node != NULL; node = node->next) { + const buf_block_t* block + = buf_block_align(node->data); + const buf_block_t* hash_block; + + if (UNIV_LIKELY(buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE)) { + + /* The space and offset are only valid + for file blocks. It is possible that + the block is being freed + (BUF_BLOCK_REMOVE_HASH, see the + assertion and the comment below) */ + hash_block = buf_block_hash_get( + buf_block_get_space(block), + buf_block_get_page_no(block)); + } else { + hash_block = NULL; + } + + if (hash_block) { + ut_a(hash_block == block); + } else { + /* When a block is being freed, + buf_LRU_search_and_free_block() first + removes the block from + buf_pool->page_hash by calling + buf_LRU_block_remove_hashed_page(). + After that, it invokes + btr_search_drop_page_hash_index() to + remove the block from + btr_search_sys->hash_index. */ + + ut_a(buf_block_get_state(block) + == BUF_BLOCK_REMOVE_HASH); + } + + ut_a(!dict_index_is_ibuf(block->index)); + + offsets = rec_get_offsets((const rec_t*) node->data, + block->index, offsets, + block->curr_n_fields + + (block->curr_n_bytes > 0), + &heap); + + if (!block->is_hashed || node->fold + != rec_fold((rec_t*)(node->data), + offsets, + block->curr_n_fields, + block->curr_n_bytes, + btr_page_get_index_id(block->frame))) { + const page_t* page = block->frame; + + ok = FALSE; + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error in an adaptive hash" + " index pointer to page %lu\n" + "InnoDB: ptr mem address %p" + " index id %lu %lu," + " node fold %lu, rec fold %lu\n", + (ulong) page_get_page_no(page), + node->data, + (ulong) ut_dulint_get_high( + btr_page_get_index_id(page)), + (ulong) ut_dulint_get_low( + btr_page_get_index_id(page)), + (ulong) node->fold, + (ulong) rec_fold((rec_t*)(node->data), + offsets, + block->curr_n_fields, + block->curr_n_bytes, + btr_page_get_index_id( + page))); + + fputs("InnoDB: Record ", stderr); + rec_print_new(stderr, (rec_t*)node->data, + offsets); + fprintf(stderr, "\nInnoDB: on that page." + " Page mem address %p, is hashed %lu," + " n fields %lu, n bytes %lu\n" + "InnoDB: side %lu\n", + (void*) page, (ulong) block->is_hashed, + (ulong) block->curr_n_fields, + (ulong) block->curr_n_bytes, + (ulong) block->curr_left_side); + + if (n_page_dumps < 20) { + buf_page_print(page, 0); + n_page_dumps++; + } + } + } + } + + for (i = 0; i < cell_count; i += chunk_size) { + ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1); + + /* We release btr_search_latch every once in a while to + give other queries a chance to run. 
*/ + if (i != 0) { + buf_pool_mutex_exit(); + rw_lock_x_unlock(&btr_search_latch); + os_thread_yield(); + rw_lock_x_lock(&btr_search_latch); + buf_pool_mutex_enter(); + } + + if (!ha_validate(btr_search_sys->hash_index, i, end_index)) { + ok = FALSE; + } + } + + buf_pool_mutex_exit(); + rw_lock_x_unlock(&btr_search_latch); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(ok); +} diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c new file mode 100644 index 00000000000..99ee7554132 --- /dev/null +++ b/storage/xtradb/buf/buf0buddy.c @@ -0,0 +1,682 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#define THIS_MODULE +#include "buf0buddy.h" +#ifdef UNIV_NONINL +# include "buf0buddy.ic" +#endif +#undef THIS_MODULE +#include "buf0buf.h" +#include "buf0lru.h" +#include "buf0flu.h" +#include "page0zip.h" + +/* Statistic counters */ + +#ifdef UNIV_DEBUG +/** Number of frames allocated from the buffer pool to the buddy system. +Protected by buf_pool_mutex. */ +static ulint buf_buddy_n_frames; +#endif /* UNIV_DEBUG */ +/** Statistics of the buddy system, indexed by block size. +Protected by buf_pool_mutex. */ +UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES + 1]; + +/************************************************************************** +Get the offset of the buddy of a compressed page frame. */ +UNIV_INLINE +byte* +buf_buddy_get( +/*==========*/ + /* out: the buddy relative of page */ + byte* page, /* in: compressed page */ + ulint size) /* in: page size in bytes */ +{ + ut_ad(ut_is_2pow(size)); + ut_ad(size >= BUF_BUDDY_LOW); + ut_ad(size < BUF_BUDDY_HIGH); + ut_ad(!ut_align_offset(page, size)); + + if (((ulint) page) & size) { + return(page - size); + } else { + return(page + size); + } +} + +/************************************************************************** +Add a block to the head of the appropriate buddy free list. 
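+
+All blocks on buf_pool->zip_free[i] have size BUF_BUDDY_LOW << i. A free
+block and its buddy, computed by buf_buddy_get() above by toggling the
+size bit of the address, belong to the same free list.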
*/ +UNIV_INLINE +void +buf_buddy_add_to_free( +/*==================*/ + buf_page_t* bpage, /* in,own: block to be freed */ + ulint i) /* in: index of buf_pool->zip_free[] */ +{ +#ifdef UNIV_DEBUG_VALGRIND + buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + + if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i); +#endif /* UNIV_DEBUG_VALGRIND */ + + ut_ad(buf_pool->zip_free[i].start != bpage); + UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage); + +#ifdef UNIV_DEBUG_VALGRIND + if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i); + UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); +#endif /* UNIV_DEBUG_VALGRIND */ +} + +/************************************************************************** +Remove a block from the appropriate buddy free list. */ +UNIV_INLINE +void +buf_buddy_remove_from_free( +/*=======================*/ + buf_page_t* bpage, /* in: block to be removed */ + ulint i) /* in: index of buf_pool->zip_free[] */ +{ +#ifdef UNIV_DEBUG_VALGRIND + buf_page_t* prev = UT_LIST_GET_PREV(list, bpage); + buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); + + if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i); + if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i); + + ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE); + ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE); +#endif /* UNIV_DEBUG_VALGRIND */ + + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); + UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage); + +#ifdef UNIV_DEBUG_VALGRIND + if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i); + if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i); +#endif /* UNIV_DEBUG_VALGRIND */ +} + +/************************************************************************** +Try to allocate a block from buf_pool->zip_free[]. */ +static +void* +buf_buddy_alloc_zip( +/*================*/ + /* out: allocated block, or NULL + if buf_pool->zip_free[] was empty */ + ulint i) /* in: index of buf_pool->zip_free[] */ +{ + buf_page_t* bpage; + + ut_ad(buf_pool_mutex_own()); + ut_a(i < BUF_BUDDY_SIZES); + +#if defined UNIV_DEBUG && !defined UNIV_DEBUG_VALGRIND + /* Valgrind would complain about accessing free memory. */ + UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i]); +#endif /* UNIV_DEBUG && !UNIV_DEBUG_VALGRIND */ + bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]); + + if (bpage) { + UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); + + buf_buddy_remove_from_free(bpage, i); + } else if (i + 1 < BUF_BUDDY_SIZES) { + /* Attempt to split. */ + bpage = buf_buddy_alloc_zip(i + 1); + + if (bpage) { + buf_page_t* buddy = (buf_page_t*) + (((char*) bpage) + (BUF_BUDDY_LOW << i)); + + ut_ad(!buf_pool_contains_zip(buddy)); + ut_d(memset(buddy, i, BUF_BUDDY_LOW << i)); + buddy->state = BUF_BLOCK_ZIP_FREE; + buf_buddy_add_to_free(buddy, i); + } + } + +#ifdef UNIV_DEBUG + if (bpage) { + memset(bpage, ~i, BUF_BUDDY_LOW << i); + } +#endif /* UNIV_DEBUG */ + + UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i); + + return(bpage); +} + +/************************************************************************** +Deallocate a buffer frame of UNIV_PAGE_SIZE. 
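[When zip_free[i] is empty, buf_buddy_alloc_zip() recursively takes a block of the next order and splits it: the lower half is returned to the caller and the upper half is freed at order i. A toy version of just that recursion; one free slot per order is enough for the sketch, and all names are illustrative:

#include <stdio.h>

#define LOW	128	/* smallest block, like BUF_BUDDY_LOW */
#define ORDERS	5	/* orders 0..5, sizes LOW << 0 .. LOW << 5 */

/* One free offset per order suffices here. -1 means empty. */
static long free_list[ORDERS + 1];

/* Allocate a block of size LOW << i, splitting a larger block when
order i is empty -- the same recursion as buf_buddy_alloc_zip(). */
static long alloc_order(int i)
{
	long blk;

	if (i > ORDERS) return -1;

	if (free_list[i] >= 0) {
		blk = free_list[i];
		free_list[i] = -1;
		return blk;
	}

	blk = alloc_order(i + 1);		/* attempt to split */
	if (blk < 0) return -1;

	free_list[i] = blk + (LOW << i);	/* upper half becomes free */
	return blk;				/* lower half is allocated */
}

int main(void)
{
	int i;

	for (i = 0; i < ORDERS; i++) free_list[i] = -1;
	free_list[ORDERS] = 0;		/* one big free block at offset 0 */

	printf("alloc order 0 -> %ld\n", alloc_order(0));	/* 0 */
	printf("alloc order 0 -> %ld\n", alloc_order(0));	/* 128 */
	return 0;
}]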
*/ +static +void +buf_buddy_block_free( +/*=================*/ + void* buf) /* in: buffer frame to deallocate */ +{ + const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); + buf_page_t* bpage; + buf_block_t* block; + + ut_ad(buf_pool_mutex_own()); + ut_ad(!mutex_own(&buf_pool_zip_mutex)); + ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE)); + + HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage, + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY + && bpage->in_zip_hash && !bpage->in_page_hash), + ((buf_block_t*) bpage)->frame == buf); + ut_a(bpage); + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY); + ut_ad(!bpage->in_page_hash); + ut_ad(bpage->in_zip_hash); + ut_d(bpage->in_zip_hash = FALSE); + HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage); + + ut_d(memset(buf, 0, UNIV_PAGE_SIZE)); + UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE); + + block = (buf_block_t*) bpage; + mutex_enter(&block->mutex); + buf_LRU_block_free_non_file_page(block); + mutex_exit(&block->mutex); + + ut_ad(buf_buddy_n_frames > 0); + ut_d(buf_buddy_n_frames--); +} + +/************************************************************************** +Allocate a buffer block to the buddy allocator. */ +static +void +buf_buddy_block_register( +/*=====================*/ + buf_block_t* block) /* in: buffer frame to allocate */ +{ + const ulint fold = BUF_POOL_ZIP_FOLD(block); + ut_ad(buf_pool_mutex_own()); + ut_ad(!mutex_own(&buf_pool_zip_mutex)); + + buf_block_set_state(block, BUF_BLOCK_MEMORY); + + ut_a(block->frame); + ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE)); + + ut_ad(!block->page.in_page_hash); + ut_ad(!block->page.in_zip_hash); + ut_d(block->page.in_zip_hash = TRUE); + HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page); + + ut_d(buf_buddy_n_frames++); +} + +/************************************************************************** +Allocate a block from a bigger object. */ +static +void* +buf_buddy_alloc_from( +/*=================*/ + /* out: allocated block */ + void* buf, /* in: a block that is free to use */ + ulint i, /* in: index of buf_pool->zip_free[] */ + ulint j) /* in: size of buf as an index + of buf_pool->zip_free[] */ +{ + ulint offs = BUF_BUDDY_LOW << j; + ut_ad(j <= BUF_BUDDY_SIZES); + ut_ad(j >= i); + ut_ad(!ut_align_offset(buf, offs)); + + /* Add the unused parts of the block to the free lists. */ + while (j > i) { + buf_page_t* bpage; + + offs >>= 1; + j--; + + bpage = (buf_page_t*) ((byte*) buf + offs); + ut_d(memset(bpage, j, BUF_BUDDY_LOW << j)); + bpage->state = BUF_BLOCK_ZIP_FREE; +#if defined UNIV_DEBUG && !defined UNIV_DEBUG_VALGRIND + /* Valgrind would complain about accessing free memory. */ + UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[j]); +#endif /* UNIV_DEBUG && !UNIV_DEBUG_VALGRIND */ + buf_buddy_add_to_free(bpage, j); + } + + return(buf); +} + +/************************************************************************** +Allocate a block. The thread calling this function must hold +buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex. +The buf_pool_mutex may only be released and reacquired if lru != NULL. 
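[buf_buddy_alloc_from() keeps halving the tail of an oversized block and pushing each remainder onto the matching free list until only the requested size stays allocated at offset 0. A sketch that prints the remainders this carving produces; BUDDY_LOW is an illustrative stand-in for BUF_BUDDY_LOW:

#include <stdio.h>

#define BUDDY_LOW 128

/* Mirror of the carving loop in buf_buddy_alloc_from(): allocate
order i out of a block of order j >= i and report the free
remainders that would be added to buf_pool->zip_free[]. */
static void carve(unsigned long i, unsigned long j)
{
	unsigned long offs = BUDDY_LOW << j;

	while (j > i) {
		offs >>= 1;
		j--;
		printf("free remainder: offset %5lu, size %5lu (order %lu)\n",
		       offs, (unsigned long) (BUDDY_LOW << j), j);
	}
	printf("allocated:      offset     0, size %5lu (order %lu)\n",
	       (unsigned long) (BUDDY_LOW << i), i);
}

int main(void)
{
	carve(0, 4);	/* take 128 bytes out of a 2048-byte block:
			remainders of 1024, 512, 256 and 128 bytes */
	return 0;
}]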
*/ +UNIV_INTERN +void* +buf_buddy_alloc_low( +/*================*/ + /* out: allocated block, + possibly NULL if lru==NULL */ + ulint i, /* in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + ibool* lru) /* in: pointer to a variable that will be assigned + TRUE if storage was allocated from the LRU list + and buf_pool_mutex was temporarily released, + or NULL if the LRU list should not be used */ +{ + buf_block_t* block; + + ut_ad(buf_pool_mutex_own()); + ut_ad(!mutex_own(&buf_pool_zip_mutex)); + + if (i < BUF_BUDDY_SIZES) { + /* Try to allocate from the buddy system. */ + block = buf_buddy_alloc_zip(i); + + if (block) { + + goto func_exit; + } + } + + /* Try allocating from the buf_pool->free list. */ + block = buf_LRU_get_free_only(); + + if (block) { + + goto alloc_big; + } + + if (!lru) { + + return(NULL); + } + + /* Try replacing an uncompressed page in the buffer pool. */ + buf_pool_mutex_exit(); + block = buf_LRU_get_free_block(0); + *lru = TRUE; + buf_pool_mutex_enter(); + +alloc_big: + buf_buddy_block_register(block); + + block = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES); + +func_exit: + buf_buddy_stat[i].used++; + return(block); +} + +/************************************************************************** +Try to relocate the control block of a compressed page. */ +static +ibool +buf_buddy_relocate_block( +/*=====================*/ + /* out: TRUE if relocated */ + buf_page_t* bpage, /* in: block to relocate */ + buf_page_t* dpage) /* in: free block to relocate to */ +{ + buf_page_t* b; + + ut_ad(buf_pool_mutex_own()); + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_FILE_PAGE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + case BUF_BLOCK_ZIP_DIRTY: + /* Cannot relocate dirty pages. */ + return(FALSE); + + case BUF_BLOCK_ZIP_PAGE: + break; + } + + mutex_enter(&buf_pool_zip_mutex); + + if (!buf_page_can_relocate(bpage)) { + mutex_exit(&buf_pool_zip_mutex); + return(FALSE); + } + + buf_relocate(bpage, dpage); + ut_d(bpage->state = BUF_BLOCK_ZIP_FREE); + + /* relocate buf_pool->zip_clean */ + b = UT_LIST_GET_PREV(list, dpage); + UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage); + + if (b) { + UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, dpage); + } else { + UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage); + } + + mutex_exit(&buf_pool_zip_mutex); + return(TRUE); +} + +/************************************************************************** +Try to relocate a block. */ +static +ibool +buf_buddy_relocate( +/*===============*/ + /* out: TRUE if relocated */ + void* src, /* in: block to relocate */ + void* dst, /* in: free block to relocate to */ + ulint i) /* in: index of buf_pool->zip_free[] */ +{ + buf_page_t* bpage; + const ulint size = BUF_BUDDY_LOW << i; + ullint usec = ut_time_us(NULL); + + ut_ad(buf_pool_mutex_own()); + ut_ad(!mutex_own(&buf_pool_zip_mutex)); + ut_ad(!ut_align_offset(src, size)); + ut_ad(!ut_align_offset(dst, size)); + UNIV_MEM_ASSERT_W(dst, size); + + /* We assume that all memory from buf_buddy_alloc() + is used for either compressed pages or buf_page_t + objects covering compressed pages. */ + + /* We look inside the allocated objects returned by + buf_buddy_alloc() and assume that anything of + PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains + a valid space_id and page_no in the page header. Should the + fields be invalid, we will be unable to relocate the block. 
+ We also assume that anything that fits sizeof(buf_page_t) + actually is a properly initialized buf_page_t object. */ + + if (size >= PAGE_ZIP_MIN_SIZE) { + /* This is a compressed page. */ + mutex_t* mutex; + + /* The src block may be split into smaller blocks, + some of which may be free. Thus, the + mach_read_from_4() calls below may attempt to read + from free memory. The memory is "owned" by the buddy + allocator (and it has been allocated from the buffer + pool), so there is nothing wrong about this. The + mach_read_from_4() calls here will only trigger bogus + Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */ + bpage = buf_page_hash_get( + mach_read_from_4((const byte*) src + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID), + mach_read_from_4((const byte*) src + + FIL_PAGE_OFFSET)); + + if (!bpage || bpage->zip.data != src) { + /* The block has probably been freshly + allocated by buf_LRU_get_free_block() but not + added to buf_pool->page_hash yet. Obviously, + it cannot be relocated. */ + + return(FALSE); + } + + if (page_zip_get_size(&bpage->zip) != size) { + /* The block is of different size. We would + have to relocate all blocks covered by src. + For the sake of simplicity, give up. */ + ut_ad(page_zip_get_size(&bpage->zip) < size); + + return(FALSE); + } + + /* The block must have been allocated, but it may + contain uninitialized data. */ + UNIV_MEM_ASSERT_W(src, size); + + mutex = buf_page_get_mutex(bpage); + + mutex_enter(mutex); + + if (buf_page_can_relocate(bpage)) { + /* Relocate the compressed page. */ + ut_a(bpage->zip.data == src); + memcpy(dst, src, size); + bpage->zip.data = dst; + mutex_exit(mutex); +success: + UNIV_MEM_INVALID(src, size); + { + buf_buddy_stat_t* buddy_stat + = &buf_buddy_stat[i]; + buddy_stat->relocated++; + buddy_stat->relocated_usec + += ut_time_us(NULL) - usec; + } + return(TRUE); + } + + mutex_exit(mutex); + } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) { + /* This must be a buf_page_t object. */ + UNIV_MEM_ASSERT_RW(src, size); + if (buf_buddy_relocate_block(src, dst)) { + + goto success; + } + } + + return(FALSE); +} + +/************************************************************************** +Deallocate a block. */ +UNIV_INTERN +void +buf_buddy_free_low( +/*===============*/ + void* buf, /* in: block to be freed, must not be + pointed to by the buffer pool */ + ulint i) /* in: index of buf_pool->zip_free[] */ +{ + buf_page_t* bpage; + buf_page_t* buddy; + + ut_ad(buf_pool_mutex_own()); + ut_ad(!mutex_own(&buf_pool_zip_mutex)); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(buf_buddy_stat[i].used > 0); + + buf_buddy_stat[i].used--; +recombine: + UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i); + ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE); + + if (i == BUF_BUDDY_SIZES) { + buf_buddy_block_free(buf); + return; + } + + ut_ad(i < BUF_BUDDY_SIZES); + ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); + ut_ad(!buf_pool_contains_zip(buf)); + + /* Try to combine adjacent blocks. */ + + buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i); + +#ifndef UNIV_DEBUG_VALGRIND + /* Valgrind would complain about accessing free memory. */ + + if (buddy->state != BUF_BLOCK_ZIP_FREE) { + + goto buddy_nonfree; + } + + /* The field buddy->state can only be trusted for free blocks. + If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if + it is in the free list. 
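[Before probing buf_pool->page_hash, buf_buddy_relocate() recovers the identity of a compressed page by reading the big-endian space id and page number straight out of the page header. A sketch of that header read; the field offsets are as defined in fil0fil.h, read4() is a stand-in for mach_read_from_4(), and the sample values are made up:

#include <stdio.h>
#include <string.h>

/* Standard InnoDB page-header offsets (fil0fil.h). */
#define FIL_PAGE_OFFSET			4
#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34

/* Big-endian 32-bit read, equivalent to mach_read_from_4(). */
static unsigned long read4(const unsigned char* b)
{
	return ((unsigned long) b[0] << 24) | ((unsigned long) b[1] << 16)
	     | ((unsigned long) b[2] << 8)  |  (unsigned long) b[3];
}

int main(void)
{
	unsigned char page[64];

	memset(page, 0, sizeof page);

	/* Pretend this compressed page is (space 5, page_no 42). */
	page[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID + 3] = 5;
	page[FIL_PAGE_OFFSET + 3] = 42;

	/* This is how buf_buddy_relocate() recovers the page identity
	before looking it up in buf_pool->page_hash. */
	printf("space %lu, page_no %lu\n",
	       read4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID),
	       read4(page + FIL_PAGE_OFFSET));
	return 0;
}]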
*/ +#endif /* !UNIV_DEBUG_VALGRIND */ + + for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) { + UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); + + if (bpage == buddy) { +buddy_free: + /* The buddy is free: recombine */ + buf_buddy_remove_from_free(bpage, i); +buddy_free2: + ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE); + ut_ad(!buf_pool_contains_zip(buddy)); + i++; + buf = ut_align_down(buf, BUF_BUDDY_LOW << i); + + goto recombine; + } + + ut_a(bpage != buf); + + { + buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); + UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); + bpage = next; + } + } + +#ifndef UNIV_DEBUG_VALGRIND +buddy_nonfree: + /* Valgrind would complain about accessing free memory. */ + ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i])); +#endif /* UNIV_DEBUG_VALGRIND */ + + /* The buddy is not free. Is there a free block of this size? */ + bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + + if (bpage) { + /* Remove the block from the free list, because a successful + buf_buddy_relocate() will overwrite bpage->list. */ + + UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); + buf_buddy_remove_from_free(bpage, i); + + /* Try to relocate the buddy of buf to the free block. */ + if (buf_buddy_relocate(buddy, bpage, i)) { + + ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); + goto buddy_free2; + } + + buf_buddy_add_to_free(bpage, i); + + /* Try to relocate the buddy of the free block to buf. */ + buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage), + BUF_BUDDY_LOW << i); + +#if defined UNIV_DEBUG && !defined UNIV_DEBUG_VALGRIND + { + const buf_page_t* b; + + /* The buddy must not be (completely) free, because + we always recombine adjacent free blocks. + (Parts of the buddy can be free in + buf_pool->zip_free[j] with j < i.)*/ + for (b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + b; b = UT_LIST_GET_NEXT(list, b)) { + + ut_a(b != buddy); + } + } +#endif /* UNIV_DEBUG && !UNIV_DEBUG_VALGRIND */ + + if (buf_buddy_relocate(buddy, buf, i)) { + + buf = bpage; + UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); + ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); + goto buddy_free; + } + } + + /* Free the block to the buddy list. */ + bpage = buf; +#ifdef UNIV_DEBUG + if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) { + /* This area has most likely been allocated for at + least one compressed-only block descriptor. Check + that there are no live objects in the area. This is + not a complete check: it may yield false positives as + well as false negatives. Also, due to buddy blocks + being recombined, it is possible (although unlikely) + that this branch is never reached. */ + + char* c; + +# ifndef UNIV_DEBUG_VALGRIND + /* Valgrind would complain about accessing + uninitialized memory. Besides, Valgrind performs a + more exhaustive check, at every memory access. */ + const buf_page_t* b = buf; + const buf_page_t* const b_end = (buf_page_t*) + ((char*) b + (BUF_BUDDY_LOW << i)); + + for (; b < b_end; b++) { + /* Avoid false positives (and cause false + negatives) by checking for b->space < 1000. */ + + if ((b->state == BUF_BLOCK_ZIP_PAGE + || b->state == BUF_BLOCK_ZIP_DIRTY) + && b->space > 0 && b->space < 1000) { + fprintf(stderr, + "buddy dirty %p %u (%u,%u) %p,%lu\n", + (void*) b, + b->state, b->space, b->offset, + buf, i); + } + } +# endif /* !UNIV_DEBUG_VALGRIND */ + + /* Scramble the block. This should make any pointers + invalid and trigger a segmentation violation. 
Because + the scrambling can be reversed, it may be possible to + track down the object pointing to the freed data by + dereferencing the unscrambled bpage->LRU or + bpage->list pointers. */ + for (c = (char*) buf + (BUF_BUDDY_LOW << i); + c-- > (char*) buf; ) { + *c = ~*c ^ i; + } + } else { + /* Fill large blocks with a constant pattern. */ + memset(bpage, i, BUF_BUDDY_LOW << i); + } +#endif /* UNIV_DEBUG */ + bpage->state = BUF_BLOCK_ZIP_FREE; + buf_buddy_add_to_free(bpage, i); +} diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c new file mode 100644 index 00000000000..67013780ac6 --- /dev/null +++ b/storage/xtradb/buf/buf0buf.c @@ -0,0 +1,3920 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0buf.h" + +#ifdef UNIV_NONINL +#include "buf0buf.ic" +#endif + +#include "buf0buddy.h" +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "trx0undo.h" +#include "srv0srv.h" +#include "page0zip.h" + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + +Performance improvement: +------------------------ +Thread scheduling in NT may be so slow that the OS wait mechanism should +not be used even in waiting for disk reads to complete. +Rather, we should put waiting query threads to the queue of +waiting jobs, and let the OS thread do something useful while the i/o +is processed. In this way we could remove most OS thread switches in +an i/o-intensive benchmark like TPC-C. + +A possibility is to put a user space thread library between the database +and NT. User space thread libraries might be very fast. + +SQL Server 7.0 can be configured to use 'fibers' which are lightweight +threads in NT. These should be studied. + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. 
The control info which does not need to be stored +in the file along with the file page, resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool mutex. + +The buf_pool mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +We intend to make the buffer buf_pool size on-line reconfigurable, +that is, the buf_pool size can be changed without closing the database. +Then the database administrator may adjust it to be bigger +at night, for example. The control block array must +contain enough control blocks for the maximum buffer buf_pool size +which is used in the particular database. +If the buf_pool size is cut, we exploit the virtual memory mechanism of +the OS, and just refrain from using frames at high addresses. Then the OS +can swap them to disk. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. + +The free list (buf_pool->free) contains blocks which are currently not +used. + +The common LRU list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool.
This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for the read-ahead +of pages, and it can also be used when there is a scan of a full +table which cannot fit in memory. Putting the pages near the end +of the LRU list, we make sure that most of the buf_pool stays in the +main memory, undisturbed. + +The unzip_LRU list contains a subset of the common LRU list. The +blocks on the unzip_LRU list hold a compressed file page and the +corresponding uncompressed page frame. A block is in unzip_LRU if and +only if the predicate buf_page_belongs_to_unzip_LRU(&block->page) +holds. The blocks in unzip_LRU will be in the same order as they are in +the common LRU list. That is, each manipulation of the common LRU +list will result in the same manipulation of the unzip_LRU list. + +The chain of modified blocks (buf_pool->flush_list) contains the blocks +holding file pages that have been modified in memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. + +The chain of unmodified compressed blocks (buf_pool->zip_clean) +contains the control blocks (buf_page_t) of those compressed pages +that are not in buf_pool->flush_list and for which no uncompressed +page has been allocated in the buffer pool. The control blocks for +uncompressed pages are accessible via buf_block_t objects that are +reachable via buf_pool->chunks[]. + +The chains of free memory blocks (buf_pool->zip_free[]) are used by +the buddy allocator (buf0buddy.c) to keep track of currently unused +memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These +blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type +BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer +pool. The buddy allocator is solely used for allocating control +blocks for compressed pages (buf_page_t) and compressed page frames. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix field is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and resets the io_fix field +when the io operation completes. + +A thread may request the above operation using the function +buf_page_get(). It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information whether a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leaves +of a B-tree are scanned in an ascending or descending order. +When a page is first referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example.
So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. +*/ + +/* Value in microseconds */ +static const int WAIT_FOR_READ = 5000; + +/* The buffer buf_pool of the database */ +UNIV_INTERN buf_pool_t* buf_pool = NULL; + +/* mutex protecting the buffer pool struct and control blocks, except the +read-write lock in them */ +UNIV_INTERN mutex_t buf_pool_mutex; +/* mutex protecting the control blocks of compressed-only pages +(of type buf_page_t, not buf_block_t) */ +UNIV_INTERN mutex_t buf_pool_zip_mutex; + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +static ulint buf_dbg_counter = 0; /* This is used to insert validation + operations in execution in the + debug version */ +/** Flag to forbid the release of the buffer pool mutex. +Protected by buf_pool_mutex. */ +UNIV_INTERN ulint buf_pool_mutex_exit_forbidden = 0; +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_DEBUG +/* If this is set TRUE, the program prints info whenever +read-ahead or flush occurs */ +UNIV_INTERN ibool buf_debug_prints = FALSE; +#endif /* UNIV_DEBUG */ + +/* A chunk of buffers. The buffer pool is allocated in chunks. */ +struct buf_chunk_struct{ + ulint mem_size; /* allocated size of the chunk */ + ulint size; /* size of frames[] and blocks[] */ + void* mem; /* pointer to the memory area which + was allocated for the frames */ + buf_block_t* blocks; /* array of buffer control blocks */ +}; + +/************************************************************************ +Calculates a page checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. */ +UNIV_INTERN +ulint +buf_calc_page_new_checksum( +/*=======================*/ + /* out: checksum */ + const byte* page) /* in: buffer page */ +{ + ulint checksum; + + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + ..._ARCH_LOG_NO, are written outside the buffer pool to the first + pages of data files, we have to skip them in the page checksum + calculation. + We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the + checksum is stored, and also the last 8 bytes of the page because + there we store the old formula checksum.
*/ + + checksum = ut_fold_binary(page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET) + + ut_fold_binary(page + FIL_PAGE_DATA, + UNIV_PAGE_SIZE - FIL_PAGE_DATA + - FIL_PAGE_END_LSN_OLD_CHKSUM); + checksum = checksum & 0xFFFFFFFFUL; + + return(checksum); +} + +/************************************************************************ +In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only +looked at the first few bytes of the page. This calculates that old +checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! */ +UNIV_INTERN +ulint +buf_calc_page_old_checksum( +/*=======================*/ + /* out: checksum */ + const byte* page) /* in: buffer page */ +{ + ulint checksum; + + checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); + + checksum = checksum & 0xFFFFFFFFUL; + + return(checksum); +} + +/************************************************************************ +Checks if a page is corrupt. */ +UNIV_INTERN +ibool +buf_page_is_corrupted( +/*==================*/ + /* out: TRUE if corrupted */ + const byte* read_buf, /* in: a database page */ + ulint zip_size) /* in: size of compressed page; + 0 for uncompressed pages */ +{ + ulint checksum_field; + ulint old_checksum_field; +#ifndef UNIV_HOTBACKUP + ib_uint64_t current_lsn; +#endif + if (UNIV_LIKELY(!zip_size) + && memcmp(read_buf + FIL_PAGE_LSN + 4, + read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + + /* Stored log sequence numbers at the start and the end + of the page do not match */ + + return(TRUE); + } + +#ifndef UNIV_HOTBACKUP + if (recv_lsn_checks_on && log_peek_lsn(&current_lsn)) { + if (current_lsn < mach_read_ull(read_buf + FIL_PAGE_LSN)) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: page %lu log sequence number" + " %llu\n" + "InnoDB: is in the future! Current system " + "log sequence number %llu.\n" + "InnoDB: Your database may be corrupt or " + "you may have copied the InnoDB\n" + "InnoDB: tablespace but not the InnoDB " + "log files. See\n" + "InnoDB: http://dev.mysql.com/doc/refman/" + "5.1/en/forcing-recovery.html\n" + "InnoDB: for more information.\n", + (ulong) mach_read_from_4(read_buf + + FIL_PAGE_OFFSET), + mach_read_ull(read_buf + FIL_PAGE_LSN), + current_lsn); + } + } +#endif + + /* If we use checksum validation, make an additional check before + returning TRUE to ensure that the checksum is not equal to + BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums + disabled. Otherwise, skip checksum calculation and return FALSE */ + + if (UNIV_LIKELY(srv_use_checksums)) { + checksum_field = mach_read_from_4(read_buf + + FIL_PAGE_SPACE_OR_CHKSUM); + + if (UNIV_UNLIKELY(zip_size)) { + return(checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != page_zip_calc_checksum(read_buf, zip_size)); + } + + old_checksum_field = mach_read_from_4( + read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM); + + /* There are 2 valid formulas for old_checksum_field: + + 1. Very old versions of InnoDB only stored 8 byte lsn to the + start and the end of the page. + + 2. Newer InnoDB versions store the old formula checksum + there.
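[The trailer check that follows treats the old-checksum slot as valid if it holds any of the three values InnoDB has historically written there. A compact sketch of just that acceptance rule; the 0xDEADBEEFUL magic matches BUF_NO_CHECKSUM_MAGIC in buf0buf.h, everything else is illustrative:

#include <stdio.h>

#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL

/* Decision logic of the old-checksum test in buf_page_is_corrupted():
the slot is acceptable if it holds any historically valid value. */
static int old_checksum_ok(unsigned long stored,
			   unsigned long lsn_low32,
			   unsigned long old_formula)
{
	return stored == lsn_low32		/* very old: raw lsn bytes */
	    || stored == BUF_NO_CHECKSUM_MAGIC	/* checksums disabled */
	    || stored == old_formula;		/* old-formula checksum */
}

int main(void)
{
	/* A page written with checksums disabled still validates: */
	printf("%d\n", old_checksum_ok(BUF_NO_CHECKSUM_MAGIC, 7, 123));
	/* A value matching none of the three marks the page corrupted: */
	printf("%d\n", !old_checksum_ok(99, 7, 123));
	return 0;
}]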
*/ + + if (old_checksum_field != mach_read_from_4(read_buf + + FIL_PAGE_LSN) + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC + && old_checksum_field + != buf_calc_page_old_checksum(read_buf)) { + + return(TRUE); + } + + /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id + (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */ + + if (checksum_field != 0 + && checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_page_new_checksum(read_buf)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/************************************************************************ +Prints a page to stderr. */ +UNIV_INTERN +void +buf_page_print( +/*===========*/ + const byte* read_buf, /* in: a database page */ + ulint zip_size) /* in: compressed page size, or + 0 for uncompressed pages */ +{ + dict_index_t* index; + ulint checksum; + ulint old_checksum; + ulint size = zip_size; + + if (!size) { + size = UNIV_PAGE_SIZE; + } + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Page dump in ascii and hex (%lu bytes):\n", + (ulong) size); + ut_print_buf(stderr, read_buf, size); + fputs("\nInnoDB: End of page dump\n", stderr); + + if (zip_size) { + /* Print compressed page. */ + + switch (fil_page_get_type(read_buf)) { + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + checksum = srv_use_checksums + ? page_zip_calc_checksum(read_buf, zip_size) + : BUF_NO_CHECKSUM_MAGIC; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Compressed BLOB page" + " checksum %lu, stored %lu\n" + "InnoDB: Page lsn %lu %lu\n" + "InnoDB: Page number (if stored" + " to page already) %lu,\n" + "InnoDB: space id (if stored" + " to page already) %lu\n", + (ulong) checksum, + (ulong) mach_read_from_4( + read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + (ulong) mach_read_from_4( + read_buf + FIL_PAGE_LSN), + (ulong) mach_read_from_4( + read_buf + (FIL_PAGE_LSN + 4)), + (ulong) mach_read_from_4( + read_buf + FIL_PAGE_OFFSET), + (ulong) mach_read_from_4( + read_buf + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + return; + default: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unknown page type %lu," + " assuming FIL_PAGE_INDEX\n", + fil_page_get_type(read_buf)); + /* fall through */ + case FIL_PAGE_INDEX: + checksum = srv_use_checksums + ? page_zip_calc_checksum(read_buf, zip_size) + : BUF_NO_CHECKSUM_MAGIC; + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Compressed page checksum %lu," + " stored %lu\n" + "InnoDB: Page lsn %lu %lu\n" + "InnoDB: Page number (if stored" + " to page already) %lu,\n" + "InnoDB: space id (if stored" + " to page already) %lu\n", + (ulong) checksum, + (ulong) mach_read_from_4( + read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + (ulong) mach_read_from_4( + read_buf + FIL_PAGE_LSN), + (ulong) mach_read_from_4( + read_buf + (FIL_PAGE_LSN + 4)), + (ulong) mach_read_from_4( + read_buf + FIL_PAGE_OFFSET), + (ulong) mach_read_from_4( + read_buf + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + return; + case FIL_PAGE_TYPE_XDES: + /* This is an uncompressed page. */ + break; + } + } + + checksum = srv_use_checksums + ? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; + old_checksum = srv_use_checksums + ? 
buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Page checksum %lu, prior-to-4.0.14-form" + " checksum %lu\n" + "InnoDB: stored checksum %lu, prior-to-4.0.14-form" + " stored checksum %lu\n" + "InnoDB: Page lsn %lu %lu, low 4 bytes of lsn" + " at page end %lu\n" + "InnoDB: Page number (if stored to page already) %lu,\n" + "InnoDB: space id (if created with >= MySQL-4.1.1" + " and stored already) %lu\n", + (ulong) checksum, (ulong) old_checksum, + (ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM), + (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4), + (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + (ulong) mach_read_from_4(read_buf + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + + if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_INSERT) { + fprintf(stderr, + "InnoDB: Page may be an insert undo log page\n"); + } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_UPDATE) { + fprintf(stderr, + "InnoDB: Page may be an update undo log page\n"); + } + + switch (fil_page_get_type(read_buf)) { + case FIL_PAGE_INDEX: + fprintf(stderr, + "InnoDB: Page may be an index page where" + " index id is %lu %lu\n", + (ulong) ut_dulint_get_high( + btr_page_get_index_id(read_buf)), + (ulong) ut_dulint_get_low( + btr_page_get_index_id(read_buf))); + +#ifdef UNIV_HOTBACKUP + /* If the code is in ibbackup, dict_sys may be uninitialized, + i.e., NULL */ + + if (dict_sys == NULL) { + break; + } +#endif /* UNIV_HOTBACKUP */ + + index = dict_index_find_on_id_low( + btr_page_get_index_id(read_buf)); + if (index) { + fputs("InnoDB: (", stderr); + dict_index_name_print(stderr, NULL, index); + fputs(")\n", stderr); + } + break; + case FIL_PAGE_INODE: + fputs("InnoDB: Page may be an 'inode' page\n", stderr); + break; + case FIL_PAGE_IBUF_FREE_LIST: + fputs("InnoDB: Page may be an insert buffer free list page\n", + stderr); + break; + case FIL_PAGE_TYPE_ALLOCATED: + fputs("InnoDB: Page may be a freshly allocated page\n", + stderr); + break; + case FIL_PAGE_IBUF_BITMAP: + fputs("InnoDB: Page may be an insert buffer bitmap page\n", + stderr); + break; + case FIL_PAGE_TYPE_SYS: + fputs("InnoDB: Page may be a system page\n", + stderr); + break; + case FIL_PAGE_TYPE_TRX_SYS: + fputs("InnoDB: Page may be a transaction system page\n", + stderr); + break; + case FIL_PAGE_TYPE_FSP_HDR: + fputs("InnoDB: Page may be a file space header page\n", + stderr); + break; + case FIL_PAGE_TYPE_XDES: + fputs("InnoDB: Page may be an extent descriptor page\n", + stderr); + break; + case FIL_PAGE_TYPE_BLOB: + fputs("InnoDB: Page may be a BLOB page\n", + stderr); + break; + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + fputs("InnoDB: Page may be a compressed BLOB page\n", + stderr); + break; + } +} + +/************************************************************************ +Initializes a buffer control block when the buf_pool is created. 
*/ +static +void +buf_block_init( +/*===========*/ + buf_block_t* block, /* in: pointer to control block */ + byte* frame) /* in: pointer to buffer frame */ +{ + UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block); + + block->frame = frame; + + block->page.state = BUF_BLOCK_NOT_USED; + block->page.buf_fix_count = 0; + block->page.io_fix = BUF_IO_NONE; + + block->modify_clock = 0; + +#ifdef UNIV_DEBUG_FILE_ACCESSES + block->page.file_page_was_freed = FALSE; +#endif /* UNIV_DEBUG_FILE_ACCESSES */ + + block->check_index_page_at_flush = FALSE; + block->index = NULL; + +#ifdef UNIV_DEBUG + block->page.in_page_hash = FALSE; + block->page.in_zip_hash = FALSE; + block->page.in_flush_list = FALSE; + block->page.in_free_list = FALSE; + block->in_unzip_LRU_list = FALSE; +#endif /* UNIV_DEBUG */ + block->page.in_LRU_list = FALSE; +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers = 0; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + page_zip_des_init(&block->page.zip); + + mutex_create(&block->mutex, SYNC_BUF_BLOCK); + + rw_lock_create(&block->lock, SYNC_LEVEL_VARYING); + ut_ad(rw_lock_validate(&(block->lock))); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ +} + +/************************************************************************ +Allocates a chunk of buffer frames. */ +static +buf_chunk_t* +buf_chunk_init( +/*===========*/ + /* out: chunk, or NULL on failure */ + buf_chunk_t* chunk, /* out: chunk of buffers */ + ulint mem_size) /* in: requested size in bytes */ +{ + buf_block_t* block; + byte* frame; + ulint i; + + /* Round down to a multiple of page size, + although it already should be. */ + mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE); + /* Reserve space for the block descriptors. */ + mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block) + + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); + + chunk->mem_size = mem_size; + chunk->mem = os_mem_alloc_large(&chunk->mem_size); + + if (UNIV_UNLIKELY(chunk->mem == NULL)) { + + return(NULL); + } + + /* Allocate the block descriptors from + the start of the memory block. */ + chunk->blocks = chunk->mem; + + /* Align a pointer to the first frame. Note that when + os_large_page_size is smaller than UNIV_PAGE_SIZE, + we may allocate one fewer block than requested. When + it is bigger, we may allocate more blocks than requested. */ + + frame = ut_align(chunk->mem, UNIV_PAGE_SIZE); + chunk->size = chunk->mem_size / UNIV_PAGE_SIZE + - (frame != chunk->mem); + + /* Subtract the space needed for block descriptors. */ + { + ulint size = chunk->size; + + while (frame < (byte*) (chunk->blocks + size)) { + frame += UNIV_PAGE_SIZE; + size--; + } + + chunk->size = size; + } + + /* Init block structs and assign frames for them. Then we + assign the frames to the first blocks (we already mapped the + memory above). */ + + block = chunk->blocks; + + for (i = chunk->size; i--; ) { + + buf_block_init(block, frame); + +#ifdef HAVE_purify + /* Wipe contents of frame to eliminate a Purify warning */ + memset(block->frame, '\0', UNIV_PAGE_SIZE); +#endif + /* Add the block to the free list */ + UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page)); + ut_d(block->page.in_free_list = TRUE); + + block++; + frame += UNIV_PAGE_SIZE; + } + + return(chunk); +} + +#ifdef UNIV_DEBUG +/************************************************************************* +Finds a block in the given buffer chunk that points to a +given compressed page. 
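[buf_chunk_init() first grows the allocation by page-rounded room for one descriptor per frame, then walks candidate frames off the front of the chunk until the first frame clears the descriptor array. A worked sketch of that accounting; the 400-byte descriptor size is an assumption for illustration, since the real sizeof(buf_block_t) depends on the build:

#include <stdio.h>

#define UNIV_PAGE_SZ	16384UL
#define DESC_SIZE	400UL	/* assumed sizeof(buf_block_t) */

int main(void)
{
	unsigned long mem_size = 8UL * 1024 * 1024;	/* 8 MiB of frames */
	unsigned long n_pages, size, frame_off;

	/* Reserve page-rounded space for the descriptors, as
	buf_chunk_init() does before calling os_mem_alloc_large(). */
	mem_size += ((mem_size / UNIV_PAGE_SZ) * DESC_SIZE
		     + UNIV_PAGE_SZ - 1) / UNIV_PAGE_SZ * UNIV_PAGE_SZ;
	n_pages = mem_size / UNIV_PAGE_SZ;

	/* Give up one candidate frame per step until the first frame
	lies past the descriptor array -- the while loop in the patch. */
	size = n_pages;
	frame_off = 0;
	while (frame_off < size * DESC_SIZE) {
		frame_off += UNIV_PAGE_SZ;
		size--;
	}

	printf("chunk holds %lu pages; %lu usable frames\n", n_pages, size);
	return 0;	/* with these numbers: 525 pages, 512 frames */
}]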
*/ +static +buf_block_t* +buf_chunk_contains_zip( +/*===================*/ + /* out: buffer block pointing to + the compressed page, or NULL */ + buf_chunk_t* chunk, /* in: chunk being checked */ + const void* data) /* in: pointer to compressed page */ +{ + buf_block_t* block; + ulint i; + + ut_ad(buf_pool); + ut_ad(buf_pool_mutex_own()); + + block = chunk->blocks; + + for (i = chunk->size; i--; block++) { + if (block->page.zip.data == data) { + + return(block); + } + } + + return(NULL); +} + +/************************************************************************* +Finds a block in the buffer pool that points to a +given compressed page. */ +UNIV_INTERN +buf_block_t* +buf_pool_contains_zip( +/*==================*/ + /* out: buffer block pointing to + the compressed page, or NULL */ + const void* data) /* in: pointer to compressed page */ +{ + ulint n; + buf_chunk_t* chunk = buf_pool->chunks; + + for (n = buf_pool->n_chunks; n--; chunk++) { + buf_block_t* block = buf_chunk_contains_zip(chunk, data); + + if (block) { + return(block); + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Checks that all file pages in the buffer chunk are in a replaceable state. */ +static +const buf_block_t* +buf_chunk_not_freed( +/*================*/ + /* out: address of a non-free block, + or NULL if all freed */ + buf_chunk_t* chunk) /* in: chunk being checked */ +{ + buf_block_t* block; + ulint i; + + ut_ad(buf_pool); + ut_ad(buf_pool_mutex_own()); + + block = chunk->blocks; + + for (i = chunk->size; i--; block++) { + mutex_enter(&block->mutex); + + if (buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE + && !buf_flush_ready_for_replace(&block->page)) { + + mutex_exit(&block->mutex); + return(block); + } + + mutex_exit(&block->mutex); + } + + return(NULL); +} + +/************************************************************************* +Checks that all blocks in the buffer chunk are in BUF_BLOCK_NOT_USED state. */ +static +ibool +buf_chunk_all_free( +/*===============*/ + /* out: TRUE if all freed */ + const buf_chunk_t* chunk) /* in: chunk being checked */ +{ + const buf_block_t* block; + ulint i; + + ut_ad(buf_pool); + ut_ad(buf_pool_mutex_own()); + + block = chunk->blocks; + + for (i = chunk->size; i--; block++) { + + if (buf_block_get_state(block) != BUF_BLOCK_NOT_USED) { + + return(FALSE); + } + } + + return(TRUE); +} + +/************************************************************************ +Frees a chunk of buffer frames. */ +static +void +buf_chunk_free( +/*===========*/ + buf_chunk_t* chunk) /* out: chunk of buffers */ +{ + buf_block_t* block; + const buf_block_t* block_end; + + ut_ad(buf_pool_mutex_own()); + + block_end = chunk->blocks + chunk->size; + + for (block = chunk->blocks; block < block_end; block++) { + ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED); + ut_a(!block->page.zip.data); + + ut_ad(!block->page.in_LRU_list); + ut_ad(!block->in_unzip_LRU_list); + ut_ad(!block->page.in_flush_list); + /* Remove the block from the free list. */ + ut_ad(block->page.in_free_list); + UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + + /* Free the latches. */ + mutex_free(&block->mutex); + rw_lock_free(&block->lock); +#ifdef UNIV_SYNC_DEBUG + rw_lock_free(&block->debug_latch); +#endif /* UNIV_SYNC_DEBUG */ + UNIV_MEM_UNDESC(block); + } + + os_mem_free_large(chunk->mem, chunk->mem_size); +} + +/************************************************************************ +Creates the buffer pool. 
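[The chunk helpers above (buf_chunk_contains_zip(), buf_chunk_not_freed(), buf_chunk_all_free()) all share one shape: a countdown scan over the chunk's descriptor array. A minimal sketch of that scan with simplified, illustrative types:

#include <stddef.h>
#include <stdio.h>

/* Simplified shapes of the structures scanned above. */
typedef struct { const void* zip_data; } block_t;
typedef struct { block_t* blocks; unsigned long size; } chunk_t;

static block_t* chunk_contains_zip(chunk_t* chunk, const void* data)
{
	block_t* block = chunk->blocks;
	unsigned long i;

	/* Same countdown-scan pattern as the patch uses. */
	for (i = chunk->size; i--; block++) {
		if (block->zip_data == data) {
			return block;
		}
	}
	return NULL;
}

int main(void)
{
	static block_t blocks[4];
	chunk_t chunk = { blocks, 4 };
	int payload;

	blocks[2].zip_data = &payload;
	printf("found at index %ld\n",
	       (long) (chunk_contains_zip(&chunk, &payload) - blocks));
	return 0;
}]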
*/ +UNIV_INTERN +buf_pool_t* +buf_pool_init(void) +/*===============*/ + /* out, own: buf_pool object, NULL if not + enough memory or error */ +{ + buf_chunk_t* chunk; + ulint i; + + buf_pool = mem_zalloc(sizeof(buf_pool_t)); + + /* 1. Initialize general fields + ------------------------------- */ + mutex_create(&buf_pool_mutex, SYNC_BUF_POOL); + mutex_create(&buf_pool_zip_mutex, SYNC_BUF_BLOCK); + + buf_pool_mutex_enter(); + + buf_pool->n_chunks = 1; + buf_pool->chunks = chunk = mem_alloc(sizeof *chunk); + + UT_LIST_INIT(buf_pool->free); + + if (!buf_chunk_init(chunk, srv_buf_pool_size)) { + mem_free(chunk); + mem_free(buf_pool); + buf_pool = NULL; + return(NULL); + } + + srv_buf_pool_old_size = srv_buf_pool_size; + buf_pool->curr_size = chunk->size; + srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE; + + buf_pool->page_hash = hash_create(2 * buf_pool->curr_size); + buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size); + + buf_pool->last_printout_time = time(NULL); + + /* 2. Initialize flushing fields + -------------------------------- */ + + for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) { + buf_pool->no_flush[i] = os_event_create(NULL); + } + + buf_pool->ulint_clock = 1; + + /* 3. Initialize LRU fields + --------------------------- */ + /* All fields are initialized by mem_zalloc(). */ + + buf_pool_mutex_exit(); + + btr_search_sys_create(buf_pool->curr_size + * UNIV_PAGE_SIZE / sizeof(void*) / 64); + + /* 4. Initialize the buddy allocator fields */ + /* All fields are initialized by mem_zalloc(). */ + + return(buf_pool); +} + +/************************************************************************ +Frees the buffer pool at shutdown. This must not be invoked before +freeing all mutexes. */ +UNIV_INTERN +void +buf_pool_free(void) +/*===============*/ +{ + buf_chunk_t* chunk; + buf_chunk_t* chunks; + + chunks = buf_pool->chunks; + chunk = chunks + buf_pool->n_chunks; + + while (--chunk >= chunks) { + /* Bypass the checks of buf_chunk_free(), since they + would fail at shutdown. */ + os_mem_free_large(chunk->mem, chunk->mem_size); + } + + buf_pool->n_chunks = 0; +} + + +/************************************************************************ +Drops the adaptive hash index. To prevent a livelock, this function +is only to be called while holding btr_search_latch and while +btr_search_enabled == FALSE. */ +UNIV_INTERN +void +buf_pool_drop_hash_index(void) +/*==========================*/ +{ + ibool released_search_latch; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!btr_search_enabled); + + do { + buf_chunk_t* chunks = buf_pool->chunks; + buf_chunk_t* chunk = chunks + buf_pool->n_chunks; + + released_search_latch = FALSE; + + while (--chunk >= chunks) { + buf_block_t* block = chunk->blocks; + ulint i = chunk->size; + + for (; i--; block++) { + /* block->is_hashed cannot be modified + when we have an x-latch on btr_search_latch; + see the comment in buf0buf.h */ + + if (!block->is_hashed) { + continue; + } + + /* To follow the latching order, we + have to release btr_search_latch + before acquiring block->latch. */ + rw_lock_x_unlock(&btr_search_latch); + /* When we release the search latch, + we must rescan all blocks, because + some may become hashed again. */ + released_search_latch = TRUE; + + rw_lock_x_lock(&block->lock); + + /* This should be guaranteed by the + callers, which will be holding + btr_search_enabled_mutex. 
*/ + ut_ad(!btr_search_enabled); + + /* Because we did not buffer-fix the + block by calling buf_block_get_gen(), + it is possible that the block has been + allocated for some other use after + btr_search_latch was released above. + We do not care which file page the + block is mapped to. All we want to do + is to drop any hash entries referring + to the page. */ + + /* It is possible that + block->page.state != BUF_FILE_PAGE. + Even that does not matter, because + btr_search_drop_page_hash_index() will + check block->is_hashed before doing + anything. block->is_hashed can only + be set on uncompressed file pages. */ + + btr_search_drop_page_hash_index(block); + + rw_lock_x_unlock(&block->lock); + + rw_lock_x_lock(&btr_search_latch); + + ut_ad(!btr_search_enabled); + } + } + } while (released_search_latch); +} + +/************************************************************************ +Relocate a buffer control block. Relocates the block on the LRU list +and in buf_pool->page_hash. Does not relocate bpage->list. +The caller must take care of relocating bpage->list. */ +UNIV_INTERN +void +buf_relocate( +/*=========*/ + buf_page_t* bpage, /* in/out: control block being relocated; + buf_page_get_state(bpage) must be + BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ + buf_page_t* dpage) /* in/out: destination control block */ +{ + buf_page_t* b; + ulint fold; + + ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + ut_a(bpage->buf_fix_count == 0); + ut_ad(bpage->in_LRU_list); + ut_ad(!bpage->in_zip_hash); + ut_ad(bpage->in_page_hash); + ut_ad(bpage == buf_page_hash_get(bpage->space, bpage->offset)); +#ifdef UNIV_DEBUG + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_FILE_PAGE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_ZIP_PAGE: + break; + } +#endif /* UNIV_DEBUG */ + + memcpy(dpage, bpage, sizeof *dpage); + + bpage->in_LRU_list = FALSE; + ut_d(bpage->in_page_hash = FALSE); + + /* relocate buf_pool->LRU */ + b = UT_LIST_GET_PREV(LRU, bpage); + UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage); + + if (b) { + UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage); + } else { + UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage); + } + + if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) { + buf_pool->LRU_old = dpage; +#ifdef UNIV_LRU_DEBUG + /* buf_pool->LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + } + + ut_d(UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU)); + + /* relocate buf_pool->page_hash */ + fold = buf_page_address_fold(bpage->space, bpage->offset); + + HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage); + + UNIV_MEM_INVALID(bpage, sizeof *bpage); +} + +/************************************************************************ +Shrinks the buffer pool. 
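[buf_relocate() moves a descriptor by memcpy()ing it and then splicing the copy into the original's place in the LRU list and re-inserting it in the page hash. A stripped-down sketch of the list splice; the hash re-insertion follows the same delete-then-insert pattern, and all names here are illustrative:

#include <stdio.h>
#include <string.h>

typedef struct node node_t;
struct node {
	int payload;
	node_t* prev;
	node_t* next;
};

/* Copy the descriptor, then re-point the neighbours at the copy,
as buf_relocate() does for the LRU list. */
static void relocate(node_t* bpage, node_t* dpage, node_t** head)
{
	memcpy(dpage, bpage, sizeof *dpage);

	if (dpage->prev) dpage->prev->next = dpage;
	else		 *head = dpage;
	if (dpage->next) dpage->next->prev = dpage;
}

int main(void)
{
	node_t a = { 1, NULL, NULL }, b = { 2, NULL, NULL }, b2;
	node_t* head = &a;

	a.next = &b; b.prev = &a;

	relocate(&b, &b2, &head);
	printf("after relocate: %d -> %d\n",
	       head->payload, head->next->payload);
	printf("second node is the new copy: %s\n",
	       head->next == &b2 ? "yes" : "no");
	return 0;
}]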
*/ +static +void +buf_pool_shrink( +/*============*/ + /* out: TRUE if shrunk */ + ulint chunk_size) /* in: number of pages to remove */ +{ + buf_chunk_t* chunks; + buf_chunk_t* chunk; + ulint max_size; + ulint max_free_size; + buf_chunk_t* max_chunk; + buf_chunk_t* max_free_chunk; + + ut_ad(!buf_pool_mutex_own()); + +try_again: + btr_search_disable(); /* Empty the adaptive hash index again */ + buf_pool_mutex_enter(); + +shrink_again: + if (buf_pool->n_chunks <= 1) { + + /* Cannot shrink if there is only one chunk */ + goto func_done; + } + + /* Search for the largest free chunk + not larger than the size difference */ + chunks = buf_pool->chunks; + chunk = chunks + buf_pool->n_chunks; + max_size = max_free_size = 0; + max_chunk = max_free_chunk = NULL; + + while (--chunk >= chunks) { + if (chunk->size <= chunk_size + && chunk->size > max_free_size) { + if (chunk->size > max_size) { + max_size = chunk->size; + max_chunk = chunk; + } + + if (buf_chunk_all_free(chunk)) { + max_free_size = chunk->size; + max_free_chunk = chunk; + } + } + } + + if (!max_free_size) { + + ulint dirty = 0; + ulint nonfree = 0; + buf_block_t* block; + buf_block_t* bend; + + /* Cannot shrink: try again later + (do not assign srv_buf_pool_old_size) */ + if (!max_chunk) { + + goto func_exit; + } + + block = max_chunk->blocks; + bend = block + max_chunk->size; + + /* Move the blocks of chunk to the end of the + LRU list and try to flush them. */ + for (; block < bend; block++) { + switch (buf_block_get_state(block)) { + case BUF_BLOCK_NOT_USED: + continue; + case BUF_BLOCK_FILE_PAGE: + break; + default: + nonfree++; + continue; + } + + mutex_enter(&block->mutex); + /* The following calls will temporarily + release block->mutex and buf_pool_mutex. + Therefore, we have to always retry, + even if !dirty && !nonfree. */ + + if (!buf_flush_ready_for_replace(&block->page)) { + + buf_LRU_make_block_old(&block->page); + dirty++; + } else if (buf_LRU_free_block(&block->page, TRUE, NULL) + != BUF_LRU_FREED) { + nonfree++; + } + + mutex_exit(&block->mutex); + } + + buf_pool_mutex_exit(); + + /* Request for a flush of the chunk if it helps. + Do not flush if there are non-free blocks, since + flushing will not make the chunk freeable. */ + if (nonfree) { + /* Avoid busy-waiting. */ + os_thread_sleep(100000); + } else if (dirty + && buf_flush_batch(BUF_FLUSH_LRU, dirty, 0) + == ULINT_UNDEFINED) { + + buf_flush_wait_batch_end(BUF_FLUSH_LRU); + } + + goto try_again; + } + + max_size = max_free_size; + max_chunk = max_free_chunk; + + srv_buf_pool_old_size = srv_buf_pool_size; + + /* Rewrite buf_pool->chunks. Copy everything but max_chunk. */ + chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks); + memcpy(chunks, buf_pool->chunks, + (max_chunk - buf_pool->chunks) * sizeof *chunks); + memcpy(chunks + (max_chunk - buf_pool->chunks), + max_chunk + 1, + buf_pool->chunks + buf_pool->n_chunks + - (max_chunk + 1)); + ut_a(buf_pool->curr_size > max_chunk->size); + buf_pool->curr_size -= max_chunk->size; + srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE; + chunk_size -= max_chunk->size; + buf_chunk_free(max_chunk); + mem_free(buf_pool->chunks); + buf_pool->chunks = chunks; + buf_pool->n_chunks--; + + /* Allow a slack of one megabyte. 
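[The selection loop above remembers two candidates among the chunks no larger than the requested reduction: the largest chunk overall, whose pages will be aged and flushed, and the largest completely free chunk, which can be freed at once. A sketch of the same bookkeeping on made-up data:

#include <stdio.h>

struct chunk { unsigned long size; int all_free; };

int main(void)
{
	struct chunk chunks[] = { {64, 0}, {48, 1}, {128, 0}, {32, 1} };
	unsigned long shrink_by = 100;	/* pages we want to give back */
	unsigned long max_size = 0, max_free_size = 0;
	int i, max_chunk = -1, max_free_chunk = -1;

	/* Same gating as the patch: only consider chunks that fit the
	reduction and beat the best fully-free candidate so far. */
	for (i = 0; i < 4; i++) {
		if (chunks[i].size <= shrink_by
		    && chunks[i].size > max_free_size) {
			if (chunks[i].size > max_size) {
				max_size = chunks[i].size;
				max_chunk = i;
			}
			if (chunks[i].all_free) {
				max_free_size = chunks[i].size;
				max_free_chunk = i;
			}
		}
	}

	/* Chunk 1 (48 pages, all free) is freed immediately; chunk 0
	is the flush-and-retry candidate otherwise. */
	printf("free now: %d, flush-and-retry candidate: %d\n",
	       max_free_chunk, max_chunk);
	return 0;
}]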
*/ + if (chunk_size > 1048576 / UNIV_PAGE_SIZE) { + + goto shrink_again; + } + +func_done: + srv_buf_pool_old_size = srv_buf_pool_size; +func_exit: + buf_pool_mutex_exit(); + btr_search_enable(); +} + +/************************************************************************ +Rebuild buf_pool->page_hash. */ +static +void +buf_pool_page_hash_rebuild(void) +/*============================*/ +{ + ulint i; + ulint n_chunks; + buf_chunk_t* chunk; + hash_table_t* page_hash; + hash_table_t* zip_hash; + buf_page_t* b; + + buf_pool_mutex_enter(); + + /* Free, create, and populate the hash table. */ + hash_table_free(buf_pool->page_hash); + buf_pool->page_hash = page_hash = hash_create(2 * buf_pool->curr_size); + zip_hash = hash_create(2 * buf_pool->curr_size); + + HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash, + BUF_POOL_ZIP_FOLD_BPAGE); + + hash_table_free(buf_pool->zip_hash); + buf_pool->zip_hash = zip_hash; + + /* Insert the uncompressed file pages to buf_pool->page_hash. */ + + chunk = buf_pool->chunks; + n_chunks = buf_pool->n_chunks; + + for (i = 0; i < n_chunks; i++, chunk++) { + ulint j; + buf_block_t* block = chunk->blocks; + + for (j = 0; j < chunk->size; j++, block++) { + if (buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE) { + ut_ad(!block->page.in_zip_hash); + ut_ad(block->page.in_page_hash); + + HASH_INSERT(buf_page_t, hash, page_hash, + buf_page_address_fold( + block->page.space, + block->page.offset), + &block->page); + } + } + } + + /* Insert the compressed-only pages to buf_pool->page_hash. + All such blocks are either in buf_pool->zip_clean or + in buf_pool->flush_list. */ + + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); + ut_ad(!b->in_flush_list); + ut_ad(b->in_LRU_list); + ut_ad(b->in_page_hash); + ut_ad(!b->in_zip_hash); + + HASH_INSERT(buf_page_t, hash, page_hash, + buf_page_address_fold(b->space, b->offset), b); + } + + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_flush_list); + ut_ad(b->in_LRU_list); + ut_ad(b->in_page_hash); + ut_ad(!b->in_zip_hash); + + switch (buf_page_get_state(b)) { + case BUF_BLOCK_ZIP_DIRTY: + HASH_INSERT(buf_page_t, hash, page_hash, + buf_page_address_fold(b->space, + b->offset), b); + break; + case BUF_BLOCK_FILE_PAGE: + /* uncompressed page */ + break; + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + } + + buf_pool_mutex_exit(); +} + +/************************************************************************ +Resizes the buffer pool. */ +UNIV_INTERN +void +buf_pool_resize(void) +/*=================*/ +{ + buf_pool_mutex_enter(); + + if (srv_buf_pool_old_size == srv_buf_pool_size) { + + buf_pool_mutex_exit(); + return; + } + + if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) { + + buf_pool_mutex_exit(); + + /* Disable adaptive hash indexes and empty the index + in order to free up memory in the buffer pool chunks. 
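As a worked example of the two branches below, assuming the default 16 KiB page size: with srv_buf_pool_curr_size at 128 MiB and a target srv_buf_pool_size of 64 MiB, the shrink branch asks buf_pool_shrink() to release 64 MiB / 16 KiB = 4096 pages; with a target of 256 MiB, the grow branch instead allocates one new 128 MiB chunk. Growth requests of less than one megabyte fall inside the hysteresis band and are ignored.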
*/ + buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size) + / UNIV_PAGE_SIZE); + } else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) { + + /* Enlarge the buffer pool by at least one megabyte */ + + ulint mem_size + = srv_buf_pool_size - srv_buf_pool_curr_size; + buf_chunk_t* chunks; + buf_chunk_t* chunk; + + chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks); + + memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks + * sizeof *chunks); + + chunk = &chunks[buf_pool->n_chunks]; + + if (!buf_chunk_init(chunk, mem_size)) { + mem_free(chunks); + } else { + buf_pool->curr_size += chunk->size; + srv_buf_pool_curr_size = buf_pool->curr_size + * UNIV_PAGE_SIZE; + mem_free(buf_pool->chunks); + buf_pool->chunks = chunks; + buf_pool->n_chunks++; + } + + srv_buf_pool_old_size = srv_buf_pool_size; + buf_pool_mutex_exit(); + } + + buf_pool_page_hash_rebuild(); +} + +/************************************************************************ +Moves the block to the start of the LRU list if there is a danger +that the block would drift out of the buffer pool. */ +UNIV_INLINE +void +buf_block_make_young( +/*=================*/ + buf_page_t* bpage) /* in: block to make younger */ +{ + ut_ad(!buf_pool_mutex_own()); + + /* Note that we read the freed_page_clock fields without holding + any mutex: this is allowed since the result is used only in + heuristics */ + + if (buf_page_peek_if_too_old(bpage)) { + + buf_pool_mutex_enter(); + /* There has been freeing activity in the LRU list: + best to move to the head of the LRU list */ + + buf_LRU_make_block_young(bpage); + buf_pool_mutex_exit(); + } +} + +/************************************************************************ +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from slipping out of +the buffer pool. */ +UNIV_INTERN +void +buf_page_make_young( +/*================*/ + buf_page_t* bpage) /* in: buffer block of a file page */ +{ + buf_pool_mutex_enter(); + + ut_a(buf_page_in_file(bpage)); + + buf_LRU_make_block_young(bpage); + + buf_pool_mutex_exit(); +} + +/************************************************************************ +Resets the check_index_page_at_flush field of a page if found in the buffer +pool. */ +UNIV_INTERN +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + buf_pool_mutex_enter(); + + block = (buf_block_t*) buf_page_hash_get(space, offset); + + if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) { + block->check_index_page_at_flush = FALSE; + } + + buf_pool_mutex_exit(); +} + +/************************************************************************ +Returns the current state of is_hashed of a page. FALSE if the page is +not in the pool. NOTE that this operation does not fix the page in the +pool if it is found there. 
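Its role is to be a cheap pre-check that saves latching a page at all when the adaptive hash index holds nothing for it; the caller shape in this tree (cf. btr_search_drop_page_hash_when_freed() in btr0sea.c) is roughly:

    if (!buf_page_peek_if_search_hashed(space, page_no)) {
        return; /* nothing hashed for the page; no need to fetch it */
    }
    /* otherwise buffer-fix the block and drop its hash entries */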
*/ +UNIV_INTERN +ibool +buf_page_peek_if_search_hashed( +/*===========================*/ + /* out: TRUE if page hash index is built in search + system */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + ibool is_hashed; + + buf_pool_mutex_enter(); + + block = (buf_block_t*) buf_page_hash_get(space, offset); + + if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + is_hashed = FALSE; + } else { + is_hashed = block->is_hashed; + } + + buf_pool_mutex_exit(); + + return(is_hashed); +} + +#ifdef UNIV_DEBUG_FILE_ACCESSES +/************************************************************************ +Sets file_page_was_freed TRUE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. */ +UNIV_INTERN +buf_page_t* +buf_page_set_file_page_was_freed( +/*=============================*/ + /* out: control block if found in page hash table, + otherwise NULL */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_page_t* bpage; + + buf_pool_mutex_enter(); + + bpage = buf_page_hash_get(space, offset); + + if (bpage) { + bpage->file_page_was_freed = TRUE; + } + + buf_pool_mutex_exit(); + + return(bpage); +} + +/************************************************************************ +Sets file_page_was_freed FALSE if the page is found in the buffer pool. +This function should be called when a previously freed file page is +allocated again, so that the debug version no longer flags accesses to +the page. */ +UNIV_INTERN +buf_page_t* +buf_page_reset_file_page_was_freed( +/*===============================*/ + /* out: control block if found in page hash table, + otherwise NULL */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_page_t* bpage; + + buf_pool_mutex_enter(); + + bpage = buf_page_hash_get(space, offset); + + if (bpage) { + bpage->file_page_was_freed = FALSE; + } + + buf_pool_mutex_exit(); + + return(bpage); +} +#endif /* UNIV_DEBUG_FILE_ACCESSES */ + +/************************************************************************ +Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with buf_page_release_zip(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. */ +UNIV_INTERN +buf_page_t* +buf_page_get_zip( +/*=============*/ + /* out: pointer to the block */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size */ + ulint offset) /* in: page number */ +{ + buf_page_t* bpage; + mutex_t* block_mutex; + ibool must_read; + +#ifndef UNIV_LOG_DEBUG + ut_ad(!ibuf_inside()); +#endif + buf_pool->n_page_gets++; + + for (;;) { + buf_pool_mutex_enter(); +lookup: + bpage = buf_page_hash_get(space, offset); + if (bpage) { + break; + } + + /* Page not in buf_pool: needs to be read from file */ + + buf_pool_mutex_exit(); + + buf_read_page(space, zip_size, offset); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 37 || buf_validate()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + } + + if (UNIV_UNLIKELY(!bpage->zip.data)) { + /* There is no compressed page. 
*/ + buf_pool_mutex_exit(); + return(NULL); + } + + block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + case BUF_BLOCK_ZIP_FREE: + ut_error; + break; + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + bpage->buf_fix_count++; + break; + case BUF_BLOCK_FILE_PAGE: + /* Discard the uncompressed page frame if possible. */ + if (buf_LRU_free_block(bpage, FALSE, NULL) + == BUF_LRU_FREED) { + + mutex_exit(block_mutex); + goto lookup; + } + + buf_block_buf_fix_inc((buf_block_t*) bpage, + __FILE__, __LINE__); + break; + } + + must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; + + buf_pool_mutex_exit(); + + buf_page_set_accessed(bpage, TRUE); + + mutex_exit(block_mutex); + + buf_block_make_young(bpage); + +#ifdef UNIV_DEBUG_FILE_ACCESSES + ut_a(!bpage->file_page_was_freed); +#endif + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(bpage->buf_fix_count > 0); + ut_a(buf_page_in_file(bpage)); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + if (must_read) { + /* Let us wait until the read operation + completes */ + + for (;;) { + enum buf_io_fix io_fix; + + mutex_enter(block_mutex); + io_fix = buf_page_get_io_fix(bpage); + mutex_exit(block_mutex); + + if (io_fix == BUF_IO_READ) { + + os_thread_sleep(WAIT_FOR_READ); + } else { + break; + } + } + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_page_get_space(bpage), + buf_page_get_page_no(bpage)) == 0); +#endif + return(bpage); +} + +/************************************************************************ +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_block_init_low( +/*===============*/ + buf_block_t* block) /* in: block to init */ +{ + block->check_index_page_at_flush = FALSE; + block->index = NULL; + + block->n_hash_helps = 0; + block->is_hashed = FALSE; + block->n_fields = 1; + block->n_bytes = 0; + block->left_side = TRUE; +} + +/************************************************************************ +Decompress a block. */ +static +ibool +buf_zip_decompress( +/*===============*/ + /* out: TRUE if successful */ + buf_block_t* block, /* in/out: block */ + ibool check) /* in: TRUE=verify the page checksum */ +{ + const byte* frame = block->page.zip.data; + + ut_ad(buf_block_get_zip_size(block)); + ut_a(buf_block_get_space(block) != 0); + + if (UNIV_LIKELY(check)) { + ulint stamp_checksum = mach_read_from_4( + frame + FIL_PAGE_SPACE_OR_CHKSUM); + ulint calc_checksum = page_zip_calc_checksum( + frame, page_zip_get_size(&block->page.zip)); + + if (UNIV_UNLIKELY(stamp_checksum != calc_checksum)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: compressed page checksum mismatch" + " (space %u page %u): %lu != %lu\n", + block->page.space, block->page.offset, + stamp_checksum, calc_checksum); + return(FALSE); + } + } + + switch (fil_page_get_type(frame)) { + case FIL_PAGE_INDEX: + if (page_zip_decompress(&block->page.zip, + block->frame)) { + return(TRUE); + } + + fprintf(stderr, + "InnoDB: unable to decompress space %lu page %lu\n", + (ulong) block->page.space, + (ulong) block->page.offset); + return(FALSE); + + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + /* Copy to uncompressed storage. 
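Only FIL_PAGE_INDEX pages are actually compressed; the page types accepted by the case labels above are stored verbatim in the compressed frame, so the memcpy below reconstructs them completely by copying just buf_block_get_zip_size() bytes. The tail of the UNIV_PAGE_SIZE frame beyond zip_size carries no information for these page types.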
*/ + memcpy(block->frame, frame, + buf_block_get_zip_size(block)); + return(TRUE); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unknown compressed page" + " type %lu\n", + fil_page_get_type(frame)); + return(FALSE); +} + +/*********************************************************************** +Gets the block whose frame the pointer points to. */ +UNIV_INTERN +buf_block_t* +buf_block_align( +/*============*/ + /* out: pointer to block, never NULL */ + const byte* ptr) /* in: pointer to a frame */ +{ + buf_chunk_t* chunk; + ulint i; + + /* TODO: protect buf_pool->chunks with a mutex (it will + currently remain constant after buf_pool_init()) */ + for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) { + lint offs = ptr - chunk->blocks->frame; + + if (UNIV_UNLIKELY(offs < 0)) { + + continue; + } + + offs >>= UNIV_PAGE_SIZE_SHIFT; + + if (UNIV_LIKELY((ulint) offs < chunk->size)) { + buf_block_t* block = &chunk->blocks[offs]; + + /* The function buf_chunk_init() invokes + buf_block_init() so that block[n].frame == + block->frame + n * UNIV_PAGE_SIZE. Check it. */ + ut_ad(block->frame == page_align(ptr)); +#ifdef UNIV_DEBUG + /* A thread that updates these fields must + hold buf_pool_mutex and block->mutex. Acquire + only the latter. */ + mutex_enter(&block->mutex); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + /* These types should only be used in + the compressed buffer pool, whose + memory is allocated from + buf_pool->chunks, in UNIV_PAGE_SIZE + blocks flagged as BUF_BLOCK_MEMORY. */ + ut_error; + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + /* Some data structures contain + "guess" pointers to file pages. The + file pages may have been freed and + reused. Do not complain. */ + break; + case BUF_BLOCK_REMOVE_HASH: + /* buf_LRU_block_remove_hashed_page() + will overwrite the FIL_PAGE_OFFSET and + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with + 0xff and set the state to + BUF_BLOCK_REMOVE_HASH. */ + ut_ad(page_get_space_id(page_align(ptr)) + == 0xffffffff); + ut_ad(page_get_page_no(page_align(ptr)) + == 0xffffffff); + break; + case BUF_BLOCK_FILE_PAGE: + ut_ad(block->page.space + == page_get_space_id(page_align(ptr))); + ut_ad(block->page.offset + == page_get_page_no(page_align(ptr))); + break; + } + + mutex_exit(&block->mutex); +#endif /* UNIV_DEBUG */ + + return(block); + } + } + + /* The block should always be found. */ + ut_error; + return(NULL); +} + +/************************************************************************ +Find out if a buffer block was created by buf_chunk_init(). */ +static +ibool +buf_block_is_uncompressed( +/*======================*/ + /* out: TRUE if "block" has + been added to buf_pool->free + by buf_chunk_init() */ + const buf_block_t* block) /* in: pointer to block, + not dereferenced */ +{ + const buf_chunk_t* chunk = buf_pool->chunks; + const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks; + + ut_ad(buf_pool_mutex_own()); + + if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) { + /* The pointer should be aligned. */ + return(FALSE); + } + + while (chunk < echunk) { + if (block >= chunk->blocks + && block < chunk->blocks + chunk->size) { + + return(TRUE); + } + + chunk++; + } + + return(FALSE); +} + +/************************************************************************ +This is the general function used to get access to a database page. 
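For orientation, a typical call-site shape (a sketch only; real callers usually go through the buf_page_get() convenience macro and supply their own space id and page number):

    mtr_t        mtr;
    buf_block_t* block;

    mtr_start(&mtr);

    block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
                             NULL /* no guess */, BUF_GET,
                             __FILE__, __LINE__, &mtr);
    /* ... read the page through buf_block_get_frame(block) ... */

    mtr_commit(&mtr); /* releases the S-latch and the buffer-fix */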
*/ +UNIV_INTERN +buf_block_t* +buf_page_get_gen( +/*=============*/ + /* out: pointer to the block or NULL */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint offset, /* in: page number */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ + buf_block_t* guess, /* in: guessed block or NULL */ + ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL, + BUF_GET_NO_LATCH */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool accessed; + ulint fix_type; + ibool must_read; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) + || (rw_latch == RW_X_LATCH) + || (rw_latch == RW_NO_LATCH)); + ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH)); + ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL) + || (mode == BUF_GET_NO_LATCH)); + ut_ad(zip_size == fil_space_get_zip_size(space)); +#ifndef UNIV_LOG_DEBUG + ut_ad(!ibuf_inside() || ibuf_page(space, zip_size, offset, NULL)); +#endif + buf_pool->n_page_gets++; +loop: + block = guess; + buf_pool_mutex_enter(); + + if (block) { + /* If the guess is a compressed page descriptor that + has been allocated by buf_buddy_alloc(), it may have + been invalidated by buf_buddy_relocate(). In that + case, block could point to something that happens to + contain the expected bits in block->page. Similarly, + the guess may be pointing to a buffer pool chunk that + has been released when resizing the buffer pool. */ + + if (!buf_block_is_uncompressed(block) + || offset != block->page.offset + || space != block->page.space + || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + + block = guess = NULL; + } else { + ut_ad(!block->page.in_zip_hash); + ut_ad(block->page.in_page_hash); + } + } + + if (block == NULL) { + block = (buf_block_t*) buf_page_hash_get(space, offset); + } + +loop2: + if (block == NULL) { + /* Page not in buf_pool: needs to be read from file */ + + buf_pool_mutex_exit(); + + if (mode == BUF_GET_IF_IN_POOL) { + + return(NULL); + } + + buf_read_page(space, zip_size, offset); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 37 || buf_validate()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + goto loop; + } + + ut_ad(page_zip_get_size(&block->page.zip) == zip_size); + + must_read = buf_block_get_io_fix(block) == BUF_IO_READ; + + if (must_read && mode == BUF_GET_IF_IN_POOL) { + /* The page is only being read to buffer */ + buf_pool_mutex_exit(); + + return(NULL); + } + + switch (buf_block_get_state(block)) { + buf_page_t* bpage; + ibool success; + + case BUF_BLOCK_FILE_PAGE: + break; + + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + bpage = &block->page; + + if (bpage->buf_fix_count + || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + /* This condition often occurs when the buffer + is not buffer-fixed, but I/O-fixed by + buf_page_init_for_read(). */ +wait_until_unfixed: + /* The block is buffer-fixed or I/O-fixed. + Try again later. */ + buf_pool_mutex_exit(); + os_thread_sleep(WAIT_FOR_READ); + + goto loop; + } + + /* Allocate an uncompressed page. */ + buf_pool_mutex_exit(); + + block = buf_LRU_get_free_block(0); + ut_a(block); + + buf_pool_mutex_enter(); + mutex_enter(&block->mutex); + + { + buf_page_t* hash_bpage + = buf_page_hash_get(space, offset); + + if (UNIV_UNLIKELY(bpage != hash_bpage)) { + /* The buf_pool->page_hash was modified + while buf_pool_mutex was released. + Free the block that was allocated. 
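This re-check is the standard optimistic-allocation pattern of this file: buf_pool_mutex had to be released in order to call buf_LRU_get_free_block(), which may flush pages and sleep, so the identity of the descriptor found in the page hash must be verified again once the mutex is reacquired, and the speculatively allocated block handed back if another thread changed the mapping in the meantime, as the code below does.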
*/ + + buf_LRU_block_free_non_file_page(block); + mutex_exit(&block->mutex); + + block = (buf_block_t*) hash_bpage; + goto loop2; + } + } + + if (UNIV_UNLIKELY + (bpage->buf_fix_count + || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) { + + /* The block was buffer-fixed or I/O-fixed + while buf_pool_mutex was not held by this thread. + Free the block that was allocated and try again. + This should be extremely unlikely. */ + + buf_LRU_block_free_non_file_page(block); + mutex_exit(&block->mutex); + + goto wait_until_unfixed; + } + + /* Move the compressed page from bpage to block, + and uncompress it. */ + + mutex_enter(&buf_pool_zip_mutex); + + buf_relocate(bpage, &block->page); + buf_block_init_low(block); + block->lock_hash_val = lock_rec_hash(space, offset); + + UNIV_MEM_DESC(&block->page.zip.data, + page_zip_get_size(&block->page.zip), block); + + if (buf_page_get_state(&block->page) + == BUF_BLOCK_ZIP_PAGE) { + UT_LIST_REMOVE(list, buf_pool->zip_clean, + &block->page); + ut_ad(!block->page.in_flush_list); + } else { + /* Relocate buf_pool->flush_list. */ + buf_page_t* b; + + b = UT_LIST_GET_PREV(list, &block->page); + ut_ad(block->page.in_flush_list); + UT_LIST_REMOVE(list, buf_pool->flush_list, + &block->page); + + if (b) { + UT_LIST_INSERT_AFTER( + list, buf_pool->flush_list, b, + &block->page); + } else { + UT_LIST_ADD_FIRST( + list, buf_pool->flush_list, + &block->page); + } + } + + /* Buffer-fix, I/O-fix, and X-latch the block + for the duration of the decompression. + Also add the block to the unzip_LRU list. */ + block->page.state = BUF_BLOCK_FILE_PAGE; + + /* Insert at the front of unzip_LRU list */ + buf_unzip_LRU_add_block(block, FALSE); + + block->page.buf_fix_count = 1; + buf_block_set_io_fix(block, BUF_IO_READ); + buf_pool->n_pend_unzip++; + rw_lock_x_lock(&block->lock); + mutex_exit(&block->mutex); + mutex_exit(&buf_pool_zip_mutex); + + buf_buddy_free(bpage, sizeof *bpage); + + buf_pool_mutex_exit(); + + /* Decompress the page and apply buffered operations + while not holding buf_pool_mutex or block->mutex. */ + success = buf_zip_decompress(block, srv_use_checksums); + + if (UNIV_LIKELY(success)) { + ibuf_merge_or_delete_for_page(block, space, offset, + zip_size, TRUE); + } + + /* Unfix and unlatch the block. 
*/ + buf_pool_mutex_enter(); + mutex_enter(&block->mutex); + buf_pool->n_pend_unzip--; + block->page.buf_fix_count--; + buf_block_set_io_fix(block, BUF_IO_NONE); + mutex_exit(&block->mutex); + rw_lock_x_unlock(&block->lock); + + if (UNIV_UNLIKELY(!success)) { + + buf_pool_mutex_exit(); + return(NULL); + } + + break; + + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + mutex_enter(&block->mutex); + UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page); + + buf_block_buf_fix_inc(block, file, line); + buf_pool_mutex_exit(); + + /* Check if this is the first access to the page */ + + accessed = buf_page_is_accessed(&block->page); + + buf_page_set_accessed(&block->page, TRUE); + + mutex_exit(&block->mutex); + + buf_block_make_young(&block->page); + +#ifdef UNIV_DEBUG_FILE_ACCESSES + ut_a(!block->page.file_page_was_freed); +#endif + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + switch (rw_latch) { + case RW_NO_LATCH: + if (must_read) { + /* Let us wait until the read operation + completes */ + + for (;;) { + enum buf_io_fix io_fix; + + mutex_enter(&block->mutex); + io_fix = buf_block_get_io_fix(block); + mutex_exit(&block->mutex); + + if (io_fix == BUF_IO_READ) { + + os_thread_sleep(WAIT_FOR_READ); + } else { + break; + } + } + } + + fix_type = MTR_MEMO_BUF_FIX; + break; + + case RW_S_LATCH: + rw_lock_s_lock_func(&(block->lock), 0, file, line); + + fix_type = MTR_MEMO_PAGE_S_FIX; + break; + + default: + ut_ad(rw_latch == RW_X_LATCH); + rw_lock_x_lock_func(&(block->lock), 0, file, line); + + fix_type = MTR_MEMO_PAGE_X_FIX; + break; + } + + mtr_memo_push(mtr, block, fix_type); + + if (!accessed) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(space, zip_size, offset); + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0); +#endif + return(block); +} + +/************************************************************************ +This is the general function used to get optimistic access to a database +page. 
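The intended protocol, sketched with hypothetical variable names (the persistent cursor code in btr0pcur.c is the main real user): remember the block and its modify clock while the page is latched, release everything, and revalidate later:

    /* while the page is latched: */
    ib_uint64_t saved_clock = block->modify_clock;

    /* ... all latches released, time passes ... */

    if (buf_page_optimistic_get_func(RW_S_LATCH, block, saved_clock,
                                     __FILE__, __LINE__, &mtr)) {
        /* the block was neither modified nor evicted in
        between: the guess is valid and the page is latched */
    } else {
        /* fall back to an ordinary buf_page_get() by page number */
    }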
*/ +UNIV_INTERN +ibool +buf_page_optimistic_get_func( +/*=========================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /* in: guessed buffer block */ + ib_uint64_t modify_clock,/* in: modify clock value if mode is + ..._GUESS_ON_CLOCK */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ibool accessed; + ibool success; + ulint fix_type; + + ut_ad(mtr && block); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + mutex_enter(&block->mutex); + + if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) { + + mutex_exit(&block->mutex); + + return(FALSE); + } + + buf_block_buf_fix_inc(block, file, line); + accessed = buf_page_is_accessed(&block->page); + buf_page_set_accessed(&block->page, TRUE); + + mutex_exit(&block->mutex); + + buf_block_make_young(&block->page); + + /* Check if this is the first access to the page */ + + ut_ad(!ibuf_inside() + || ibuf_page(buf_block_get_space(block), + buf_block_get_zip_size(block), + buf_block_get_page_no(block), NULL)); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (UNIV_UNLIKELY(!success)) { + mutex_enter(&block->mutex); + buf_block_buf_fix_dec(block); + mutex_exit(&block->mutex); + + return(FALSE); + } + + if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) { + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&(block->lock)); + } else { + rw_lock_x_unlock(&(block->lock)); + } + + mutex_enter(&block->mutex); + buf_block_buf_fix_dec(block); + mutex_exit(&block->mutex); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#ifdef UNIV_DEBUG_FILE_ACCESSES + ut_a(block->page.file_page_was_freed == FALSE); +#endif + if (UNIV_UNLIKELY(!accessed)) { + /* In the case of a first access, try to apply linear + read-ahead */ + + buf_read_ahead_linear(buf_block_get_space(block), + buf_block_get_zip_size(block), + buf_block_get_page_no(block)); + } + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0); +#endif + buf_pool->n_page_gets++; + + return(TRUE); +} + +/************************************************************************ +This is used to get access to a known database page, when no waiting can be +done. For example, if a search in an adaptive hash index leads us to this +frame. 
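The adaptive hash index stores raw pointers into buffer frames, so by the time a search arrives here the block is already known; all that remains is to pin and latch it without ever sleeping. The caller shape (cf. btr_search_guess_on_hash() in btr0sea.c) is roughly:

    if (!buf_page_get_known_nowait(RW_S_LATCH, block, BUF_MAKE_YOUNG,
                                   __FILE__, __LINE__, &mtr)) {
        /* could not latch without waiting: fall back to
        the ordinary B-tree descent from the root */
    }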
*/ +UNIV_INTERN +ibool +buf_page_get_known_nowait( +/*======================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /* in: the known page */ + ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ibool success; + ulint fix_type; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + mutex_enter(&block->mutex); + + if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) { + /* Another thread is just freeing the block from the LRU list + of the buffer pool: do not try to access this page; this + attempt to access the page can only come through the hash + index because when the buffer block state is ..._REMOVE_HASH, + we have already removed it from the page address hash table + of the buffer pool. */ + + mutex_exit(&block->mutex); + + return(FALSE); + } + + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + buf_block_buf_fix_inc(block, file, line); + + mutex_exit(&block->mutex); + + if (mode == BUF_MAKE_YOUNG) { + buf_block_make_young(&block->page); + } + + ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); + + if (rw_latch == RW_S_LATCH) { + success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { + success = rw_lock_x_lock_func_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_X_FIX; + } + + if (!success) { + mutex_enter(&block->mutex); + buf_block_buf_fix_dec(block); + mutex_exit(&block->mutex); + + return(FALSE); + } + + mtr_memo_push(mtr, block, fix_type); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_DEBUG_FILE_ACCESSES + ut_a(block->page.file_page_was_freed == FALSE); +#endif + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a((mode == BUF_KEEP_OLD) + || (ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0)); +#endif + buf_pool->n_page_gets++; + + return(TRUE); +} + +/*********************************************************************** +Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the kernel mutex. */ +UNIV_INTERN +const buf_block_t* +buf_page_try_get_func( +/*==================*/ + /* out: pointer to a page or NULL */ + ulint space_id,/* in: tablespace id */ + ulint page_no,/* in: page number */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr) /* in: mini-transaction */ +{ + buf_block_t* block; + ibool success; + ulint fix_type; + + buf_pool_mutex_enter(); + block = buf_block_hash_get(space_id, page_no); + + if (!block) { + buf_pool_mutex_exit(); + return(NULL); + } + + mutex_enter(&block->mutex); + buf_pool_mutex_exit(); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_a(buf_block_get_space(block) == space_id); + ut_a(buf_block_get_page_no(block) == page_no); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + + buf_block_buf_fix_inc(block, file, line); + mutex_exit(&block->mutex); + + fix_type = MTR_MEMO_PAGE_S_FIX; + success = rw_lock_s_lock_nowait(&block->lock, file, line); + + if (!success) { + /* Let us try to get an X-latch. 
If the current thread + is holding an X-latch on the page, we cannot get an + S-latch. */ + + fix_type = MTR_MEMO_PAGE_X_FIX; + success = rw_lock_x_lock_func_nowait(&block->lock, + file, line); + } + + if (!success) { + mutex_enter(&block->mutex); + buf_block_buf_fix_dec(block); + mutex_exit(&block->mutex); + + return(NULL); + } + + mtr_memo_push(mtr, block, fix_type); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 5771 || buf_validate()); + ut_a(block->page.buf_fix_count > 0); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_DEBUG_FILE_ACCESSES + ut_a(block->page.file_page_was_freed == FALSE); +#endif /* UNIV_DEBUG_FILE_ACCESSES */ + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + buf_pool->n_page_gets++; + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0); +#endif + + return(block); +} + +/************************************************************************ +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_page_init_low( +/*==============*/ + buf_page_t* bpage) /* in: block to init */ +{ + bpage->flush_type = BUF_FLUSH_LRU; + bpage->accessed = FALSE; + bpage->io_fix = BUF_IO_NONE; + bpage->buf_fix_count = 0; + bpage->freed_page_clock = 0; + bpage->newest_modification = 0; + bpage->oldest_modification = 0; + HASH_INVALIDATE(bpage, hash); +#ifdef UNIV_DEBUG_FILE_ACCESSES + bpage->file_page_was_freed = FALSE; +#endif /* UNIV_DEBUG_FILE_ACCESSES */ +} + +#ifdef UNIV_HOTBACKUP +/************************************************************************ +Inits a page to the buffer buf_pool, for use in ibbackup --restore. */ +UNIV_INTERN +void +buf_page_init_for_backup_restore( +/*=============================*/ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space + in units of a page */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + buf_block_t* block) /* in: block to init */ +{ + buf_block_init_low(block); + + block->lock_hash_val = 0; + + buf_page_init_low(&block->page); + block->page.state = BUF_BLOCK_FILE_PAGE; + block->page.space = space; + block->page.offset = offset; + + page_zip_des_init(&block->page.zip); + + /* We assume that block->page.data has been allocated + with zip_size == UNIV_PAGE_SIZE. */ + ut_ad(zip_size <= UNIV_PAGE_SIZE); + ut_ad(ut_is_2pow(zip_size)); + page_zip_set_size(&block->page.zip, zip_size); +} +#endif /* UNIV_HOTBACKUP */ + +/************************************************************************ +Inits a page to the buffer buf_pool. */ +static +void +buf_page_init( +/*==========*/ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space + in units of a page */ + buf_block_t* block) /* in: block to init */ +{ + buf_page_t* hash_page; + + ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&(block->mutex))); + ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); + + /* Set the state of the block */ + buf_block_set_file_page(block, space, offset); + +#ifdef UNIV_DEBUG_VALGRIND + if (!space) { + /* Silence valid Valgrind warnings about uninitialized + data being written to data files. There are some unused + bytes on some pages that InnoDB does not initialize. 
*/ + UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + buf_block_init_low(block); + + block->lock_hash_val = lock_rec_hash(space, offset); + + /* Insert into the hash table of file pages */ + + hash_page = buf_page_hash_get(space, offset); + + if (UNIV_LIKELY_NULL(hash_page)) { + fprintf(stderr, + "InnoDB: Error: page %lu %lu already found" + " in the hash table: %p, %p\n", + (ulong) space, + (ulong) offset, + (const void*) hash_page, (const void*) block); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + mutex_exit(&block->mutex); + buf_pool_mutex_exit(); + buf_print(); + buf_LRU_print(); + buf_validate(); + buf_LRU_validate(); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_error; + } + + buf_page_init_low(&block->page); + + ut_ad(!block->page.in_zip_hash); + ut_ad(!block->page.in_page_hash); + ut_d(block->page.in_page_hash = TRUE); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, + buf_page_address_fold(space, offset), &block->page); +} + +/************************************************************************ +Function which inits a page for read to the buffer buf_pool. If the page is +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. */ +UNIV_INTERN +buf_page_t* +buf_page_init_for_read( +/*===================*/ + /* out: pointer to the block or NULL */ + ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size, or 0 */ + ibool unzip, /* in: TRUE=request uncompressed page */ + ib_int64_t tablespace_version,/* in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + buf_page_t* bpage; + mtr_t mtr; + ibool lru = FALSE; + void* data; + + ut_ad(buf_pool); + + *err = DB_SUCCESS; + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + /* It is a read-ahead within an ibuf routine */ + + ut_ad(!ibuf_bitmap_page(zip_size, offset)); + ut_ad(ibuf_inside()); + + mtr_start(&mtr); + + if (!recv_no_ibuf_operations + && !ibuf_page(space, zip_size, offset, &mtr)) { + + mtr_commit(&mtr); + + return(NULL); + } + } else { + ut_ad(mode == BUF_READ_ANY_PAGE); + } + + if (zip_size && UNIV_LIKELY(!unzip) + && UNIV_LIKELY(!recv_recovery_is_on())) { + block = NULL; + } else { + block = buf_LRU_get_free_block(0); + ut_ad(block); + } + + buf_pool_mutex_enter(); + + if (buf_page_hash_get(space, offset)) { + /* The page is already in the buffer pool. */ +err_exit: + if (block) { + mutex_enter(&block->mutex); + buf_LRU_block_free_non_file_page(block); + mutex_exit(&block->mutex); + } + + bpage = NULL; + goto func_exit; + } + + if (fil_tablespace_deleted_or_being_deleted_in_mem( + space, tablespace_version)) { + /* The page belongs to a space which has been + deleted or is being deleted. 
*/ + *err = DB_TABLESPACE_DELETED; + + goto err_exit; + } + + if (block) { + bpage = &block->page; + mutex_enter(&block->mutex); + buf_page_init(space, offset, block); + + /* The block must be put to the LRU list, to the old blocks */ + buf_LRU_add_block(bpage, TRUE/* to old blocks */); + + /* We set a pass-type x-lock on the frame because then + the same thread which called for the read operation + (and is running now at this point of code) can wait + for the read to complete by waiting for the x-lock on + the frame; if the x-lock were recursive, the same + thread would illegally get the x-lock before the page + read is completed. The x-lock is cleared by the + io-handler thread. */ + + rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); + buf_page_set_io_fix(bpage, BUF_IO_READ); + + if (UNIV_UNLIKELY(zip_size)) { + page_zip_set_size(&block->page.zip, zip_size); + + /* buf_pool_mutex may be released and + reacquired by buf_buddy_alloc(). Thus, we + must release block->mutex in order not to + break the latching order in the reacquisition + of buf_pool_mutex. We also must defer this + operation until after the block descriptor has + been added to buf_pool->LRU and + buf_pool->page_hash. */ + mutex_exit(&block->mutex); + data = buf_buddy_alloc(zip_size, &lru); + mutex_enter(&block->mutex); + block->page.zip.data = data; + + /* To maintain the invariant + block->in_unzip_LRU_list + == buf_page_belongs_to_unzip_LRU(&block->page) + we have to add this block to unzip_LRU + after block->page.zip.data is set. */ + ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); + buf_unzip_LRU_add_block(block, TRUE); + } + + mutex_exit(&block->mutex); + } else { + /* Defer buf_buddy_alloc() until after the block has + been found not to exist. The buf_buddy_alloc() and + buf_buddy_free() calls may be expensive because of + buf_buddy_relocate(). */ + + /* The compressed page must be allocated before the + control block (bpage), in order to avoid the + invocation of buf_buddy_relocate_block() on + uninitialized data. */ + data = buf_buddy_alloc(zip_size, &lru); + bpage = buf_buddy_alloc(sizeof *bpage, &lru); + + /* If buf_buddy_alloc() allocated storage from the LRU list, + it released and reacquired buf_pool_mutex. Thus, we must + check the page_hash again, as it may have been modified. */ + if (UNIV_UNLIKELY(lru) + && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) { + + /* The block was added by some other thread. 
*/ + buf_buddy_free(bpage, sizeof *bpage); + buf_buddy_free(data, zip_size); + + bpage = NULL; + goto func_exit; + } + + page_zip_des_init(&bpage->zip); + page_zip_set_size(&bpage->zip, zip_size); + bpage->zip.data = data; + + mutex_enter(&buf_pool_zip_mutex); + UNIV_MEM_DESC(bpage->zip.data, + page_zip_get_size(&bpage->zip), bpage); + buf_page_init_low(bpage); + bpage->state = BUF_BLOCK_ZIP_PAGE; + bpage->space = space; + bpage->offset = offset; + +#ifdef UNIV_DEBUG + bpage->in_page_hash = FALSE; + bpage->in_zip_hash = FALSE; + bpage->in_flush_list = FALSE; + bpage->in_free_list = FALSE; +#endif /* UNIV_DEBUG */ + bpage->in_LRU_list = FALSE; + + ut_d(bpage->in_page_hash = TRUE); + HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, + buf_page_address_fold(space, offset), bpage); + + /* The block must be put to the LRU list, to the old blocks */ + buf_LRU_add_block(bpage, TRUE/* to old blocks */); + buf_LRU_insert_zip_clean(bpage); + + buf_page_set_io_fix(bpage, BUF_IO_READ); + + mutex_exit(&buf_pool_zip_mutex); + } + + buf_pool->n_pend_reads++; +func_exit: + buf_pool_mutex_exit(); + + if (mode == BUF_READ_IBUF_PAGES_ONLY) { + + mtr_commit(&mtr); + } + + ut_ad(!bpage || buf_page_in_file(bpage)); + return(bpage); +} + +/************************************************************************ +Initializes a page to the buffer buf_pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform a state transition NOT_USED => FILE_PAGE on +a block (the other is buf_page_get_gen). */ +UNIV_INTERN +buf_block_t* +buf_page_create( +/*============*/ + /* out: pointer to the block, page buffer-fixed */ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space in units of + a page */ + ulint zip_size,/* in: compressed page size, or 0 */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + buf_frame_t* frame; + buf_block_t* block; + buf_block_t* free_block = NULL; + + ut_ad(mtr); + ut_ad(space || !zip_size); + + free_block = buf_LRU_get_free_block(0); + + buf_pool_mutex_enter(); + + block = (buf_block_t*) buf_page_hash_get(space, offset); + + if (block && buf_page_in_file(&block->page)) { +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(space, offset) == 0); +#endif +#ifdef UNIV_DEBUG_FILE_ACCESSES + block->page.file_page_was_freed = FALSE; +#endif /* UNIV_DEBUG_FILE_ACCESSES */ + + /* Page can be found in buf_pool */ + buf_pool_mutex_exit(); + + buf_block_free(free_block); + + return(buf_page_get_with_no_latch(space, zip_size, + offset, mtr)); + } + + /* If we get here, the page was not in buf_pool: init it there */ + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, "Creating space %lu page %lu to buffer\n", + (ulong) space, (ulong) offset); + } +#endif /* UNIV_DEBUG */ + + block = free_block; + + mutex_enter(&block->mutex); + + buf_page_init(space, offset, block); + + /* The block must be put to the LRU list */ + buf_LRU_add_block(&block->page, FALSE); + + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + buf_pool->n_pages_created++; + + if (zip_size) { + void* data; + ibool lru; + + /* Prevent race conditions during buf_buddy_alloc(), + which may release and reacquire buf_pool_mutex, + by IO-fixing and X-latching the block. */ + + buf_page_set_io_fix(&block->page, BUF_IO_READ); + rw_lock_x_lock(&block->lock); + + page_zip_set_size(&block->page.zip, zip_size); + mutex_exit(&block->mutex); + /* buf_pool_mutex may be released and reacquired by + buf_buddy_alloc(). 
Thus, we must release block->mutex + in order not to break the latching order in + the reacquisition of buf_pool_mutex. We also must + defer this operation until after the block descriptor + has been added to buf_pool->LRU and buf_pool->page_hash. */ + data = buf_buddy_alloc(zip_size, &lru); + mutex_enter(&block->mutex); + block->page.zip.data = data; + + /* To maintain the invariant + block->in_unzip_LRU_list + == buf_page_belongs_to_unzip_LRU(&block->page) + we have to add this block to unzip_LRU after + block->page.zip.data is set. */ + ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); + buf_unzip_LRU_add_block(block, FALSE); + + buf_page_set_io_fix(&block->page, BUF_IO_NONE); + rw_lock_x_unlock(&block->lock); + } + + buf_pool_mutex_exit(); + + mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); + + buf_page_set_accessed(&block->page, TRUE); + + mutex_exit(&block->mutex); + + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + + ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(FALSE); + + frame = block->frame; + + memset(frame + FIL_PAGE_PREV, 0xff, 4); + memset(frame + FIL_PAGE_NEXT, 0xff, 4); + mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); + + /* Reset to zero the file flush lsn field in the page; if the first + page of an ibdata file is 'created' in this function into the buffer + pool then we lose the original contents of the file flush lsn stamp. + Then InnoDB could in a crash recovery print a big, false, corruption + warning if the stamp contains an lsn bigger than the ib_logfile lsn. */ + + memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8); + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_a(++buf_dbg_counter % 357 || buf_validate()); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(buf_block_get_space(block), + buf_block_get_page_no(block)) == 0); +#endif + return(block); +} + +/************************************************************************ +Completes an asynchronous read or write request of a file page to or from +the buffer pool. */ +UNIV_INTERN +void +buf_page_io_complete( +/*=================*/ + buf_page_t* bpage) /* in: pointer to the block in question */ +{ + enum buf_io_fix io_type; + const ibool uncompressed = (buf_page_get_state(bpage) + == BUF_BLOCK_FILE_PAGE); + + ut_a(buf_page_in_file(bpage)); + + /* We do not need protect io_fix here by mutex to read + it because this is the only function where we can change the value + from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code + ensures that this is the only thread that handles the i/o for this + block. */ + + io_type = buf_page_get_io_fix(bpage); + ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); + + if (io_type == BUF_IO_READ) { + ulint read_page_no; + ulint read_space_id; + byte* frame; + + if (buf_page_get_zip_size(bpage)) { + frame = bpage->zip.data; + buf_pool->n_pend_unzip++; + if (uncompressed + && !buf_zip_decompress((buf_block_t*) bpage, + FALSE)) { + + buf_pool->n_pend_unzip--; + goto corrupt; + } + buf_pool->n_pend_unzip--; + } else { + ut_a(uncompressed); + frame = ((buf_block_t*) bpage)->frame; + } + + /* If this page is not uninitialized and not in the + doublewrite buffer, then the page number and space id + should be the same as in block. 
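For reference, the fields read next live in the 38-byte FIL header at the start of every page (byte offsets as defined in fil0fil.h):

    offset  0  FIL_PAGE_SPACE_OR_CHKSUM           page checksum
    offset  4  FIL_PAGE_OFFSET                    page number
    offset 34  FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID   space id (since MySQL 4.1.1)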
*/ + read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET); + read_space_id = mach_read_from_4( + frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + if (bpage->space == TRX_SYS_SPACE + && trx_doublewrite_page_inside(bpage->offset)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: reading page %lu\n" + "InnoDB: which is in the" + " doublewrite buffer!\n", + (ulong) bpage->offset); + } else if (!read_space_id && !read_page_no) { + /* This is likely an uninitialized page. */ + } else if ((bpage->space + && bpage->space != read_space_id) + || bpage->offset != read_page_no) { + /* We did not compare space_id to read_space_id + if bpage->space == 0, because the field on the + page may contain garbage in MySQL < 4.1.1, + which only supported bpage->space == 0. */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: space id and page n:o" + " stored in the page\n" + "InnoDB: read in are %lu:%lu," + " should be %lu:%lu!\n", + (ulong) read_space_id, (ulong) read_page_no, + (ulong) bpage->space, + (ulong) bpage->offset); + } + + /* From version 3.23.38 up we store the page checksum + to the 4 first bytes of the page end lsn field */ + + if (buf_page_is_corrupted(frame, + buf_page_get_zip_size(bpage))) { +corrupt: + fprintf(stderr, + "InnoDB: Database page corruption on disk" + " or a failed\n" + "InnoDB: file read of page %lu.\n" + "InnoDB: You may have to recover" + " from a backup.\n", + (ulong) bpage->offset); + buf_page_print(frame, buf_page_get_zip_size(bpage)); + fprintf(stderr, + "InnoDB: Database page corruption on disk" + " or a failed\n" + "InnoDB: file read of page %lu.\n" + "InnoDB: You may have to recover" + " from a backup.\n", + (ulong) bpage->offset); + fputs("InnoDB: It is also possible that" + " your operating\n" + "InnoDB: system has corrupted its" + " own file cache\n" + "InnoDB: and rebooting your computer" + " removes the\n" + "InnoDB: error.\n" + "InnoDB: If the corrupt page is an index page\n" + "InnoDB: you can also try to" + " fix the corruption\n" + "InnoDB: by dumping, dropping," + " and reimporting\n" + "InnoDB: the corrupt table." + " You can use CHECK\n" + "InnoDB: TABLE to scan your" + " table for corruption.\n" + "InnoDB: See also" + " http://dev.mysql.com/doc/refman/5.1/en/" + "forcing-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); + + if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) { + fputs("InnoDB: Ending processing because of" + " a corrupt database page.\n", + stderr); + exit(1); + } + } + + if (recv_recovery_is_on()) { + /* Pages must be uncompressed for crash recovery. */ + ut_a(uncompressed); + recv_recover_page(FALSE, TRUE, (buf_block_t*) bpage); + } + + if (uncompressed && !recv_no_ibuf_operations) { + ibuf_merge_or_delete_for_page( + (buf_block_t*) bpage, bpage->space, + bpage->offset, buf_page_get_zip_size(bpage), + TRUE); + } + } + + buf_pool_mutex_enter(); + mutex_enter(buf_page_get_mutex(bpage)); + +#ifdef UNIV_IBUF_COUNT_DEBUG + if (io_type == BUF_IO_WRITE || uncompressed) { + /* For BUF_IO_READ of compressed-only blocks, the + buffered operations will be merged by buf_page_get_gen() + after the block has been uncompressed. */ + ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0); + } +#endif + /* Because this thread which does the unlocking is not the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread + id. 
*/ + + buf_page_set_io_fix(bpage, BUF_IO_NONE); + + switch (io_type) { + case BUF_IO_READ: + /* NOTE that the call to ibuf may have moved the ownership of + the x-latch to this OS thread: do not let this confuse you in + debugging! */ + + ut_ad(buf_pool->n_pend_reads > 0); + buf_pool->n_pend_reads--; + buf_pool->n_pages_read++; + + if (uncompressed) { + rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, + BUF_IO_READ); + } + + break; + + case BUF_IO_WRITE: + /* Write means a flush operation: call the completion + routine in the flush system */ + + buf_flush_write_complete(bpage); + + if (uncompressed) { + rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock, + BUF_IO_WRITE); + } + + buf_pool->n_pages_written++; + + break; + + default: + ut_error; + } + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, "Has %s page space %lu page no %lu\n", + io_type == BUF_IO_READ ? "read" : "written", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif /* UNIV_DEBUG */ + + mutex_exit(buf_page_get_mutex(bpage)); + buf_pool_mutex_exit(); +} + +/************************************************************************* +Invalidates the file pages in the buffer pool when an archive recovery is +completed. All the file pages buffered must be in a replaceable state when +this function is called: not latched and not modified. */ +UNIV_INTERN +void +buf_pool_invalidate(void) +/*=====================*/ +{ + ibool freed; + + ut_ad(buf_all_freed()); + + freed = TRUE; + + while (freed) { + freed = buf_LRU_search_and_free_block(100); + } + + buf_pool_mutex_enter(); + + ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); + ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0); + + buf_pool_mutex_exit(); +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************* +Validates the buffer buf_pool data structure. */ +UNIV_INTERN +ibool +buf_validate(void) +/*==============*/ +{ + buf_page_t* b; + buf_chunk_t* chunk; + ulint i; + ulint n_single_flush = 0; + ulint n_lru_flush = 0; + ulint n_list_flush = 0; + ulint n_lru = 0; + ulint n_flush = 0; + ulint n_free = 0; + ulint n_zip = 0; + + ut_ad(buf_pool); + + buf_pool_mutex_enter(); + + chunk = buf_pool->chunks; + + /* Check the uncompressed blocks. */ + + for (i = buf_pool->n_chunks; i--; chunk++) { + + ulint j; + buf_block_t* block = chunk->blocks; + + for (j = chunk->size; j--; block++) { + + mutex_enter(&block->mutex); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + /* These should only occur on + zip_clean, zip_free[], or flush_list. 
*/ + ut_error; + break; + + case BUF_BLOCK_FILE_PAGE: + ut_a(buf_page_hash_get(buf_block_get_space( + block), + buf_block_get_page_no( + block)) + == &block->page); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(buf_page_get_io_fix(&block->page) + == BUF_IO_READ + || !ibuf_count_get(buf_block_get_space( + block), + buf_block_get_page_no( + block))); +#endif + switch (buf_page_get_io_fix(&block->page)) { + case BUF_IO_NONE: + break; + + case BUF_IO_WRITE: + switch (buf_page_get_flush_type( + &block->page)) { + case BUF_FLUSH_LRU: + n_lru_flush++; + ut_a(rw_lock_is_locked( + &block->lock, + RW_LOCK_SHARED)); + break; + case BUF_FLUSH_LIST: + n_list_flush++; + break; + case BUF_FLUSH_SINGLE_PAGE: + n_single_flush++; + break; + default: + ut_error; + } + + break; + + case BUF_IO_READ: + + ut_a(rw_lock_is_locked(&block->lock, + RW_LOCK_EX)); + break; + } + + n_lru++; + + if (block->page.oldest_modification > 0) { + n_flush++; + } + + break; + + case BUF_BLOCK_NOT_USED: + n_free++; + break; + + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + /* do nothing */ + break; + } + + mutex_exit(&block->mutex); + } + } + + mutex_enter(&buf_pool_zip_mutex); + + /* Check clean compressed-only blocks. */ + + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); + switch (buf_page_get_io_fix(b)) { + case BUF_IO_NONE: + /* All clean blocks should be I/O-unfixed. */ + break; + case BUF_IO_READ: + /* In buf_LRU_free_block(), we temporarily set + b->io_fix = BUF_IO_READ for a newly allocated + control block in order to prevent + buf_page_get_gen() from decompressing the block. */ + break; + default: + ut_error; + break; + } + ut_a(!b->oldest_modification); + ut_a(buf_page_hash_get(b->space, b->offset) == b); + + n_lru++; + n_zip++; + } + + /* Check dirty compressed-only blocks. 
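Once this second traversal has been accumulated as well, the counters must satisfy

    n_lru + n_free <= buf_pool->curr_size + n_zip

because each of the curr_size blocks in the chunks contributes to at most one of n_lru and n_free (blocks in transient states contribute to neither), while the n_zip compressed-only descriptors counted into n_lru live outside the chunks in buddy-allocated memory and are the only way n_lru can exceed curr_size. The check near the end of the function asserts exactly this inequality.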
*/ + + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_flush_list); + + switch (buf_page_get_state(b)) { + case BUF_BLOCK_ZIP_DIRTY: + ut_a(b->oldest_modification); + n_lru++; + n_flush++; + n_zip++; + switch (buf_page_get_io_fix(b)) { + case BUF_IO_NONE: + case BUF_IO_READ: + break; + + case BUF_IO_WRITE: + switch (buf_page_get_flush_type(b)) { + case BUF_FLUSH_LRU: + n_lru_flush++; + break; + case BUF_FLUSH_LIST: + n_list_flush++; + break; + case BUF_FLUSH_SINGLE_PAGE: + n_single_flush++; + break; + default: + ut_error; + } + break; + } + break; + case BUF_BLOCK_FILE_PAGE: + /* uncompressed page */ + break; + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + ut_a(buf_page_hash_get(b->space, b->offset) == b); + } + + mutex_exit(&buf_pool_zip_mutex); + + if (n_lru + n_free > buf_pool->curr_size + n_zip) { + fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n", + (ulong) n_lru, (ulong) n_free, + (ulong) buf_pool->curr_size, (ulong) n_zip); + ut_error; + } + + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { + fprintf(stderr, "Free list len %lu, free blocks %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) n_free); + ut_error; + } + ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); + + ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); + + buf_pool_mutex_exit(); + + ut_a(buf_LRU_validate()); + ut_a(buf_flush_validate()); + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************* +Prints info of the buffer buf_pool data structure. 
*/ +UNIV_INTERN +void +buf_print(void) +/*===========*/ +{ + dulint* index_ids; + ulint* counts; + ulint size; + ulint i; + ulint j; + dulint id; + ulint n_found; + buf_chunk_t* chunk; + dict_index_t* index; + + ut_ad(buf_pool); + + size = buf_pool->curr_size; + + index_ids = mem_alloc(sizeof(dulint) * size); + counts = mem_alloc(sizeof(ulint) * size); + + buf_pool_mutex_enter(); + + fprintf(stderr, + "buf_pool size %lu\n" + "database pages %lu\n" + "free pages %lu\n" + "modified database pages %lu\n" + "n pending decompressions %lu\n" + "n pending reads %lu\n" + "n pending flush LRU %lu list %lu single page %lu\n" + "pages read %lu, created %lu, written %lu\n", + (ulong) size, + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), + (ulong) buf_pool->n_pend_unzip, + (ulong) buf_pool->n_pend_reads, + (ulong) buf_pool->n_flush[BUF_FLUSH_LRU], + (ulong) buf_pool->n_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE], + (ulong) buf_pool->n_pages_read, buf_pool->n_pages_created, + (ulong) buf_pool->n_pages_written); + + /* Count the number of blocks belonging to each index in the buffer */ + + n_found = 0; + + chunk = buf_pool->chunks; + + for (i = buf_pool->n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + ulint n_blocks = chunk->size; + + for (; n_blocks--; block++) { + const buf_frame_t* frame = block->frame; + + if (fil_page_get_type(frame) == FIL_PAGE_INDEX) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + j = 0; + + while (j < n_found) { + + if (ut_dulint_cmp(index_ids[j], + id) == 0) { + counts[j]++; + + break; + } + j++; + } + + if (j == n_found) { + n_found++; + index_ids[j] = id; + counts[j] = 1; + } + } + } + } + + buf_pool_mutex_exit(); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); + + fprintf(stderr, + "Block count for index %lu in buffer is about %lu", + (ulong) ut_dulint_get_low(index_ids[i]), + (ulong) counts[i]); + + if (index) { + putc(' ', stderr); + dict_index_name_print(stderr, NULL, index); + } + + putc('\n', stderr); + } + + mem_free(index_ids); + mem_free(counts); + + ut_a(buf_validate()); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#ifdef UNIV_DEBUG +/************************************************************************* +Returns the number of latched pages in the buffer pool. */ +UNIV_INTERN +ulint +buf_get_latched_pages_number(void) +/*==============================*/ +{ + buf_chunk_t* chunk; + buf_page_t* b; + ulint i; + ulint fixed_pages_number = 0; + + buf_pool_mutex_enter(); + + chunk = buf_pool->chunks; + + for (i = buf_pool->n_chunks; i--; chunk++) { + buf_block_t* block; + ulint j; + + block = chunk->blocks; + + for (j = chunk->size; j--; block++) { + if (buf_block_get_state(block) + != BUF_BLOCK_FILE_PAGE) { + + continue; + } + + mutex_enter(&block->mutex); + + if (block->page.buf_fix_count != 0 + || buf_page_get_io_fix(&block->page) + != BUF_IO_NONE) { + fixed_pages_number++; + } + + mutex_exit(&block->mutex); + } + } + + mutex_enter(&buf_pool_zip_mutex); + + /* Traverse the lists of clean and dirty compressed-only blocks. 
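+	(A compressed-only block is counted as latched by the same rule as
+	the uncompressed blocks above, i.e. roughly
+
+		b->buf_fix_count != 0
+		|| buf_page_get_io_fix(b) != BUF_IO_NONE
+
+	which is what the two loops below test.)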
*/ + + for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); + ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE); + + if (b->buf_fix_count != 0 + || buf_page_get_io_fix(b) != BUF_IO_NONE) { + fixed_pages_number++; + } + } + + for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_flush_list); + + switch (buf_page_get_state(b)) { + case BUF_BLOCK_ZIP_DIRTY: + if (b->buf_fix_count != 0 + || buf_page_get_io_fix(b) != BUF_IO_NONE) { + fixed_pages_number++; + } + break; + case BUF_BLOCK_FILE_PAGE: + /* uncompressed page */ + break; + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + } + + mutex_exit(&buf_pool_zip_mutex); + buf_pool_mutex_exit(); + + return(fixed_pages_number); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Returns the number of pending buf pool ios. */ +UNIV_INTERN +ulint +buf_get_n_pending_ios(void) +/*=======================*/ +{ + return(buf_pool->n_pend_reads + + buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); +} + +/************************************************************************* +Returns the ratio in percents of modified pages in the buffer pool / +database pages in the buffer pool. */ +UNIV_INTERN +ulint +buf_get_modified_ratio_pct(void) +/*============================*/ +{ + ulint ratio; + + //buf_pool_mutex_enter(); /* optimistic */ + + ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list)) + / (1 + UT_LIST_GET_LEN(buf_pool->LRU) + + UT_LIST_GET_LEN(buf_pool->free)); + + /* 1 + is there to avoid division by zero */ + + //buf_pool_mutex_exit(); /* optimistic */ + + return(ratio); +} + +/************************************************************************* +Prints info of the buffer i/o. 
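+The hit rate line is computed from the counter deltas since the last
+printout.  As a worked example with invented numbers: 1000000 page gets
+and 20000 physical reads in the interval yield
+
+	1000 - (1000 * 20000) / 1000000 = 980
+
+printed as "Buffer pool hit rate 980 / 1000".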
*/ +UNIV_INTERN +void +buf_print_io( +/*=========*/ + FILE* file) /* in/out: buffer where to print */ +{ + time_t current_time; + double time_elapsed; + ulint size; + + ut_ad(buf_pool); + size = buf_pool->curr_size; + + buf_pool_mutex_enter(); + + fprintf(file, + "Buffer pool size %lu\n" + "Buffer pool size, bytes %lu\n" + "Free buffers %lu\n" + "Database pages %lu\n" + "Modified db pages %lu\n" + "Pending reads %lu\n" + "Pending writes: LRU %lu, flush list %lu, single page %lu\n", + (ulong) size, + (ulong) size * UNIV_PAGE_SIZE, + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), + (ulong) buf_pool->n_pend_reads, + (ulong) buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->init_flush[BUF_FLUSH_LRU], + (ulong) buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->init_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, + buf_pool->last_printout_time); + buf_pool->last_printout_time = current_time; + + fprintf(file, + "Pages read %lu, created %lu, written %lu\n" + "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", + (ulong) buf_pool->n_pages_read, + (ulong) buf_pool->n_pages_created, + (ulong) buf_pool->n_pages_written, + (buf_pool->n_pages_read - buf_pool->n_pages_read_old) + / time_elapsed, + (buf_pool->n_pages_created - buf_pool->n_pages_created_old) + / time_elapsed, + (buf_pool->n_pages_written - buf_pool->n_pages_written_old) + / time_elapsed); + + if (buf_pool->n_page_gets > buf_pool->n_page_gets_old) { + fprintf(file, "Buffer pool hit rate %lu / 1000\n", + (ulong) + (1000 - ((1000 * (buf_pool->n_pages_read + - buf_pool->n_pages_read_old)) + / (buf_pool->n_page_gets + - buf_pool->n_page_gets_old)))); + } else { + fputs("No buffer pool page gets since the last printout\n", + file); + } + + buf_pool->n_page_gets_old = buf_pool->n_page_gets; + buf_pool->n_pages_read_old = buf_pool->n_pages_read; + buf_pool->n_pages_created_old = buf_pool->n_pages_created; + buf_pool->n_pages_written_old = buf_pool->n_pages_written; + + /* Print some values to help us with visualizing what is + happening with LRU eviction. */ + fprintf(file, + "LRU len: %lu, unzip_LRU len: %lu\n" + "I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n", + UT_LIST_GET_LEN(buf_pool->LRU), + UT_LIST_GET_LEN(buf_pool->unzip_LRU), + buf_LRU_stat_sum.io, buf_LRU_stat_cur.io, + buf_LRU_stat_sum.unzip, buf_LRU_stat_cur.unzip); + + buf_pool_mutex_exit(); +} + +/************************************************************************** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +buf_refresh_io_stats(void) +/*======================*/ +{ + buf_pool->last_printout_time = time(NULL); + buf_pool->n_page_gets_old = buf_pool->n_page_gets; + buf_pool->n_pages_read_old = buf_pool->n_pages_read; + buf_pool->n_pages_created_old = buf_pool->n_pages_created; + buf_pool->n_pages_written_old = buf_pool->n_pages_written; +} + +/************************************************************************* +Checks that all file pages in the buffer are in a replaceable state. 
*/ +UNIV_INTERN +ibool +buf_all_freed(void) +/*===============*/ +{ + buf_chunk_t* chunk; + ulint i; + + ut_ad(buf_pool); + + buf_pool_mutex_enter(); + + chunk = buf_pool->chunks; + + for (i = buf_pool->n_chunks; i--; chunk++) { + + const buf_block_t* block = buf_chunk_not_freed(chunk); + + if (UNIV_LIKELY_NULL(block)) { + fprintf(stderr, + "Page %lu %lu still fixed or dirty\n", + (ulong) block->page.space, + (ulong) block->page.offset); + ut_error; + } + } + + buf_pool_mutex_exit(); + + return(TRUE); +} + +/************************************************************************* +Checks that there currently are no pending i/o-operations for the buffer +pool. */ +UNIV_INTERN +ibool +buf_pool_check_no_pending_io(void) +/*==============================*/ + /* out: TRUE if there is no pending i/o */ +{ + ibool ret; + + buf_pool_mutex_enter(); + + if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU] + + buf_pool->n_flush[BUF_FLUSH_LIST] + + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) { + ret = FALSE; + } else { + ret = TRUE; + } + + buf_pool_mutex_exit(); + + return(ret); +} + +/************************************************************************* +Gets the current length of the free list of buffer blocks. */ +UNIV_INTERN +ulint +buf_get_free_list_len(void) +/*=======================*/ +{ + ulint len; + + buf_pool_mutex_enter(); + + len = UT_LIST_GET_LEN(buf_pool->free); + + buf_pool_mutex_exit(); + + return(len); +} diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c new file mode 100644 index 00000000000..7cb5bb91994 --- /dev/null +++ b/storage/xtradb/buf/buf0flu.c @@ -0,0 +1,1277 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer buf_pool flush algorithm + +Created 11/11/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0flu.h" + +#ifdef UNIV_NONINL +#include "buf0flu.ic" +#include "trx0sys.h" +#endif + +#include "ut0byte.h" +#include "ut0lst.h" +#include "page0page.h" +#include "page0zip.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" +#include "trx0sys.h" +#include "srv0srv.h" + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/********************************************************************** +Validates the flush list. */ +static +ibool +buf_flush_validate_low(void); +/*========================*/ + /* out: TRUE if ok */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/************************************************************************ +Inserts a modified block into the flush list. 
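+The flush list is kept ordered by oldest_modification, newest first, so
+a freshly modified block can simply be added at the front; the
+assertion below states the invariant, roughly
+
+	UT_LIST_GET_FIRST(flush_list)->oldest_modification
+		<= block->page.oldest_modification
+
+Recovery, which sees modifications out of lsn order, must use the
+sorted variant that follows instead.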
*/
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+	buf_block_t*	block)	/* in/out: block which is modified */
+{
+	ut_ad(buf_pool_mutex_own());
+	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
+	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
+		  <= block->page.oldest_modification));
+
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(block->page.in_LRU_list);
+	ut_ad(block->page.in_page_hash);
+	ut_ad(!block->page.in_zip_hash);
+	ut_ad(!block->page.in_flush_list);
+	ut_d(block->page.in_flush_list = TRUE);
+	UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/************************************************************************
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+	buf_block_t*	block)	/* in/out: block which is modified */
+{
+	buf_page_t*	prev_b;
+	buf_page_t*	b;
+
+	ut_ad(buf_pool_mutex_own());
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+	ut_ad(block->page.in_LRU_list);
+	ut_ad(block->page.in_page_hash);
+	ut_ad(!block->page.in_zip_hash);
+	ut_ad(!block->page.in_flush_list);
+	ut_d(block->page.in_flush_list = TRUE);
+
+	prev_b = NULL;
+	b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+	while (b && b->oldest_modification > block->page.oldest_modification) {
+		ut_ad(b->in_flush_list);
+		prev_b = b;
+		b = UT_LIST_GET_NEXT(list, b);
+	}
+
+	if (prev_b == NULL) {
+		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+	} else {
+		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
+				     prev_b, &block->page);
+	}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/************************************************************************
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED is allowed. */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+				/* out: TRUE if can replace immediately */
+	buf_page_t*	bpage)	/* in: buffer control block, must be
+				buf_page_in_file(bpage) and in the LRU list */
+{
+	//ut_ad(buf_pool_mutex_own());
+	//ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	//ut_ad(bpage->in_LRU_list); /* optimistic use */
+
+	if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) {
+
+		return(bpage->oldest_modification == 0
+		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
+		       && bpage->buf_fix_count == 0);
+	}
+
+	/* permitted not to own LRU_mutex.. */
+/*
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: Error: buffer block state %lu"
+		" in the LRU list!\n",
+		(ulong) buf_page_get_state(bpage));
+	ut_print_buf(stderr, bpage, sizeof(buf_page_t));
+	putc('\n', stderr);
+*/
+
+	return(FALSE);
+}
+
+/************************************************************************
+Returns TRUE if the block is modified and ready for flushing. 
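+Informally, the test below amounts to the predicate
+
+	bpage->oldest_modification != 0
+	&& buf_page_get_io_fix(bpage) == BUF_IO_NONE
+	&& (flush_type != BUF_FLUSH_LRU || bpage->buf_fix_count == 0)
+
+where the extra buf_fix_count condition for LRU flushes avoids having
+to wait for a latch while possibly holding other page latches.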
*/
+UNIV_INLINE
+ibool
+buf_flush_ready_for_flush(
+/*======================*/
+				/* out: TRUE if can flush immediately */
+	buf_page_t*	bpage,	/* in: buffer control block, must be
+				buf_page_in_file(bpage) */
+	enum buf_flush	flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+{
+	ut_a(buf_page_in_file(bpage));
+	ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+	if (bpage->oldest_modification != 0
+	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
+		ut_ad(bpage->in_flush_list);
+
+		if (flush_type != BUF_FLUSH_LRU) {
+
+			return(TRUE);
+
+		} else if (bpage->buf_fix_count == 0) {
+
+			/* If we are flushing the LRU list, to avoid deadlocks
+			we require the block not to be bufferfixed, and hence
+			not latched. */
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/************************************************************************
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+	buf_page_t*	bpage)	/* in: pointer to the block in question */
+{
+	ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(bpage->in_flush_list);
+	ut_d(bpage->in_flush_list = FALSE);
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_PAGE:
+		/* clean compressed pages should not be on the flush list */
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_error;
+		return;
+	case BUF_BLOCK_ZIP_DIRTY:
+		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
+		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+		buf_LRU_insert_zip_clean(bpage);
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+		break;
+	}
+
+	bpage->oldest_modification = 0;
+
+	ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
+}
+
+/************************************************************************
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+	buf_page_t*	bpage)	/* in: pointer to the block in question */
+{
+	enum buf_flush	flush_type;
+
+	ut_ad(bpage);
+
+	buf_flush_remove(bpage);
+
+	flush_type = buf_page_get_flush_type(bpage);
+	buf_pool->n_flush[flush_type]--;
+
+	if (flush_type == BUF_FLUSH_LRU) {
+		/* Put the block to the end of the LRU list to wait to be
+		moved to the free list */
+
+		buf_LRU_make_block_old(bpage);
+
+		buf_pool->LRU_flush_ended++;
+	}
+
+	/* fprintf(stderr, "n pending flush %lu\n",
+	buf_pool->n_flush[flush_type]); */
+
+	if ((buf_pool->n_flush[flush_type] == 0)
+	    && (buf_pool->init_flush[flush_type] == FALSE)) {
+
+		/* The running flush batch has ended */
+
+		os_event_set(buf_pool->no_flush[flush_type]);
+	}
+}
+
+/************************************************************************
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. 
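+In outline, the write ordering implemented below is:
+
+	1. write all buffered pages sequentially into the doublewrite
+	   area of the system tablespace, with synchronous i/o;
+	2. fil_flush(TRX_SYS_SPACE) to make that copy durable;
+	3. post the writes to the pages' real locations;
+	4. fil_flush_file_spaces(FIL_TABLESPACE).
+
+If the server crashes during step 3, recovery finds an intact copy of
+every page in the doublewrite area, so a torn in-place write cannot
+lose data.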
*/ +static +void +buf_flush_buffered_writes(void) +/*===========================*/ +{ + byte* write_buf; + ulint len; + ulint len2; + ulint i; + + if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) { + os_aio_simulated_wake_handler_threads(); + + return; + } + + mutex_enter(&(trx_doublewrite->mutex)); + + /* Write first to doublewrite buffer blocks. We use synchronous + aio and thus know that file write has been completed when the + control returns. */ + + if (trx_doublewrite->first_free == 0) { + + mutex_exit(&(trx_doublewrite->mutex)); + + return; + } + + for (i = 0; i < trx_doublewrite->first_free; i++) { + + const buf_block_t* block; + + block = (buf_block_t*) trx_doublewrite->buf_block_arr[i]; + + if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE + || block->page.zip.data) { + /* No simple validate for compressed pages exists. */ + continue; + } + + if (UNIV_UNLIKELY + (memcmp(block->frame + (FIL_PAGE_LSN + 4), + block->frame + (UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + 4))) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: The page to be written" + " seems corrupt!\n" + "InnoDB: The lsn fields do not match!" + " Noticed in the buffer pool\n" + "InnoDB: before posting to the" + " doublewrite buffer.\n"); + } + + if (!block->check_index_page_at_flush) { + } else if (page_is_comp(block->frame)) { + if (UNIV_UNLIKELY + (!page_simple_validate_new(block->frame))) { +corrupted_page: + buf_page_print(block->frame, 0); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Apparent corruption of an" + " index page n:o %lu in space %lu\n" + "InnoDB: to be written to data file." + " We intentionally crash server\n" + "InnoDB: to prevent corrupt data" + " from ending up in data\n" + "InnoDB: files.\n", + (ulong) buf_block_get_page_no(block), + (ulong) buf_block_get_space(block)); + + ut_error; + } + } else if (UNIV_UNLIKELY + (!page_simple_validate_old(block->frame))) { + + goto corrupted_page; + } + } + + /* increment the doublewrite flushed pages counter */ + srv_dblwr_pages_written+= trx_doublewrite->first_free; + srv_dblwr_writes++; + + len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, + trx_doublewrite->first_free) * UNIV_PAGE_SIZE; + + write_buf = trx_doublewrite->write_buf; + i = 0; + + fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + trx_doublewrite->block1, 0, len, + (void*) write_buf, NULL); + + for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; + len2 += UNIV_PAGE_SIZE, i++) { + const buf_block_t* block = (buf_block_t*) + trx_doublewrite->buf_block_arr[i]; + + if (UNIV_LIKELY(!block->page.zip.data) + && UNIV_LIKELY(buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE) + && UNIV_UNLIKELY + (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4), + write_buf + len2 + + (UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: The page to be written" + " seems corrupt!\n" + "InnoDB: The lsn fields do not match!" 
+ " Noticed in the doublewrite block1.\n"); + } + } + + if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + goto flush; + } + + len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + * UNIV_PAGE_SIZE; + + write_buf = trx_doublewrite->write_buf + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; + ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE); + + fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0, + trx_doublewrite->block2, 0, len, + (void*) write_buf, NULL); + + for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; + len2 += UNIV_PAGE_SIZE, i++) { + const buf_block_t* block = (buf_block_t*) + trx_doublewrite->buf_block_arr[i]; + + if (UNIV_LIKELY(!block->page.zip.data) + && UNIV_LIKELY(buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE) + && UNIV_UNLIKELY + (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4), + write_buf + len2 + + (UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: The page to be" + " written seems corrupt!\n" + "InnoDB: The lsn fields do not match!" + " Noticed in" + " the doublewrite block2.\n"); + } + } + +flush: + /* Now flush the doublewrite buffer data to disk */ + + fil_flush(TRX_SYS_SPACE); + + /* We know that the writes have been flushed to disk now + and in recovery we will find them in the doublewrite buffer + blocks. Next do the writes to the intended positions. */ + + for (i = 0; i < trx_doublewrite->first_free; i++) { + const buf_block_t* block = (buf_block_t*) + trx_doublewrite->buf_block_arr[i]; + + ut_a(buf_page_in_file(&block->page)); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(&block->page), + buf_page_get_zip_size(&block->page), + buf_page_get_page_no(&block->page), 0, + buf_page_get_zip_size(&block->page), + (void*)block->page.zip.data, + (void*)block); + + /* Increment the counter of I/O operations used + for selecting LRU policy. */ + buf_LRU_stat_inc_io(); + + continue; + } + + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4), + block->frame + + (UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + 4))) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: The page to be written" + " seems corrupt!\n" + "InnoDB: The lsn fields do not match!" + " Noticed in the buffer pool\n" + "InnoDB: after posting and flushing" + " the doublewrite buffer.\n" + "InnoDB: Page buf fix count %lu," + " io fix %lu, state %lu\n", + (ulong)block->page.buf_fix_count, + (ulong)buf_block_get_io_fix(block), + (ulong)buf_block_get_state(block)); + } + + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_block_get_space(block), 0, + buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, + (void*)block->frame, (void*)block); + + /* Increment the counter of I/O operations used + for selecting LRU policy. 
*/
+		buf_LRU_stat_inc_io();
+	}
+
+	/* Wake possible simulated aio thread to actually post the
+	writes to the operating system */
+
+	os_aio_simulated_wake_handler_threads();
+
+	/* Wait until all async writes to tablespaces have been posted to
+	the OS */
+
+	os_aio_wait_until_no_pending_writes();
+
+	/* Now we flush the data to disk (for example, with fsync) */
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+	/* We can now reuse the doublewrite memory buffer: */
+
+	trx_doublewrite->first_free = 0;
+
+	mutex_exit(&(trx_doublewrite->mutex));
+}
+
+/************************************************************************
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_flush_buffered_writes and waits for free space to
+appear. */
+static
+void
+buf_flush_post_to_doublewrite_buf(
+/*==============================*/
+	buf_page_t*	bpage)	/* in: buffer block to write */
+{
+	ulint	zip_size;
+try_again:
+	mutex_enter(&(trx_doublewrite->mutex));
+
+	ut_a(buf_page_in_file(bpage));
+
+	if (trx_doublewrite->first_free
+	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		mutex_exit(&(trx_doublewrite->mutex));
+
+		buf_flush_buffered_writes();
+
+		goto try_again;
+	}
+
+	zip_size = buf_page_get_zip_size(bpage);
+
+	if (UNIV_UNLIKELY(zip_size)) {
+		/* Copy the compressed page and clear the rest. */
+		memcpy(trx_doublewrite->write_buf
+		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+		       bpage->zip.data, zip_size);
+		memset(trx_doublewrite->write_buf
+		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
+		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+	} else {
+		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+
+		memcpy(trx_doublewrite->write_buf
+		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+	}
+
+	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
+
+	trx_doublewrite->first_free++;
+
+	if (trx_doublewrite->first_free
+	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		mutex_exit(&(trx_doublewrite->mutex));
+
+		buf_flush_buffered_writes();
+
+		return;
+	}
+
+	mutex_exit(&(trx_doublewrite->mutex));
+}
+
+/************************************************************************
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+	byte*		page,		/* in/out: page */
+	void*		page_zip_,	/* in/out: compressed page, or NULL */
+	ib_uint64_t	newest_lsn)	/* in: newest modification lsn
+					to the page */
+{
+	ut_ad(page);
+
+	if (page_zip_) {
+		page_zip_des_t*	page_zip = page_zip_;
+		ulint		zip_size = page_zip_get_size(page_zip);
+		ut_ad(zip_size);
+		ut_ad(ut_is_2pow(zip_size));
+		ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
+		case FIL_PAGE_TYPE_ALLOCATED:
+		case FIL_PAGE_INODE:
+		case FIL_PAGE_IBUF_BITMAP:
+		case FIL_PAGE_TYPE_FSP_HDR:
+		case FIL_PAGE_TYPE_XDES:
+			/* These are essentially uncompressed pages. */
+			memcpy(page_zip->data, page, zip_size);
+			/* fall through */
+		case FIL_PAGE_TYPE_ZBLOB:
+		case FIL_PAGE_TYPE_ZBLOB2:
+		case FIL_PAGE_INDEX:
+			mach_write_ull(page_zip->data
+				       + FIL_PAGE_LSN, newest_lsn);
+			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+			mach_write_to_4(page_zip->data
+					+ FIL_PAGE_SPACE_OR_CHKSUM,
+					srv_use_checksums
+					? 
page_zip_calc_checksum( + page_zip->data, zip_size) + : BUF_NO_CHECKSUM_MAGIC); + return; + } + + ut_print_timestamp(stderr); + fputs(" InnoDB: ERROR: The compressed page to be written" + " seems corrupt:", stderr); + ut_print_buf(stderr, page, zip_size); + fputs("\nInnoDB: Possibly older version of the page:", stderr); + ut_print_buf(stderr, page_zip->data, zip_size); + putc('\n', stderr); + ut_error; + } + + /* Write the newest modification lsn to the page header and trailer */ + mach_write_ull(page + FIL_PAGE_LSN, newest_lsn); + + mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + newest_lsn); + + /* Store the new formula checksum */ + + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? buf_calc_page_new_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + + /* We overwrite the first 4 bytes of the end lsn field to store + the old formula checksum. Since it depends also on the field + FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the + new formula checksum. */ + + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + srv_use_checksums + ? buf_calc_page_old_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); +} + +/************************************************************************ +Does an asynchronous write of a buffer page. NOTE: in simulated aio and +also when the doublewrite buffer is used, we must call +buf_flush_buffered_writes after we have posted a batch of writes! */ +static +void +buf_flush_write_block_low( +/*======================*/ + buf_page_t* bpage) /* in: buffer block to write */ +{ + ulint zip_size = buf_page_get_zip_size(bpage); + page_t* frame = NULL; +#ifdef UNIV_LOG_DEBUG + static ibool univ_log_debug_warned; +#endif /* UNIV_LOG_DEBUG */ + + ut_ad(buf_page_in_file(bpage)); + + /* We are not holding buf_pool_mutex or block_mutex here. + Nevertheless, it is safe to access bpage, because it is + io_fixed and oldest_modification != 0. Thus, it cannot be + relocated in the buffer pool or removed from flush_list or + LRU_list. */ + ut_ad(!buf_pool_mutex_own()); + ut_ad(!mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); + ut_ad(bpage->oldest_modification != 0); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0); +#endif + ut_ad(bpage->newest_modification != 0); + +#ifdef UNIV_LOG_DEBUG + if (!univ_log_debug_warned) { + univ_log_debug_warned = TRUE; + fputs("Warning: cannot force log to disk if" + " UNIV_LOG_DEBUG is defined!\n" + "Crash recovery will not work!\n", + stderr); + } +#else + /* Force the log to the disk before writing the modified block */ + log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE); +#endif + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */ + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + case BUF_BLOCK_ZIP_DIRTY: + frame = bpage->zip.data; + if (UNIV_LIKELY(srv_use_checksums)) { + ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM) + == page_zip_calc_checksum(frame, zip_size)); + } + mach_write_ull(frame + FIL_PAGE_LSN, + bpage->newest_modification); + memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8); + break; + case BUF_BLOCK_FILE_PAGE: + frame = bpage->zip.data; + if (!frame) { + frame = ((buf_block_t*) bpage)->frame; + } + + buf_flush_init_for_writing(((buf_block_t*) bpage)->frame, + bpage->zip.data + ? 
&bpage->zip : NULL, + bpage->newest_modification); + break; + } + + if (!srv_use_doublewrite_buf || !trx_doublewrite) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage); + } else { + buf_flush_post_to_doublewrite_buf(bpage); + } +} + +/************************************************************************ +Writes a flushable page asynchronously from the buffer pool to a file. +NOTE: in simulated aio we must call +os_aio_simulated_wake_handler_threads after we have posted a batch of +writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be +held upon entering this function, and they will be released by this +function. */ +static +void +buf_flush_page( +/*===========*/ + buf_page_t* bpage, /* in: buffer control block */ + enum buf_flush flush_type) /* in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +{ + mutex_t* block_mutex; + ibool is_uncompressed; + + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + ut_ad(buf_pool_mutex_own()); + ut_ad(buf_page_in_file(bpage)); + + block_mutex = buf_page_get_mutex(bpage); + ut_ad(mutex_own(block_mutex)); + + ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); + + buf_page_set_io_fix(bpage, BUF_IO_WRITE); + + buf_page_set_flush_type(bpage, flush_type); + + if (buf_pool->n_flush[flush_type] == 0) { + + os_event_reset(buf_pool->no_flush[flush_type]); + } + + buf_pool->n_flush[flush_type]++; + + is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex)); + + switch (flush_type) { + ibool is_s_latched; + case BUF_FLUSH_LIST: + /* If the simulated aio thread is not running, we must + not wait for any latch, as we may end up in a deadlock: + if buf_fix_count == 0, then we know we need not wait */ + + is_s_latched = (bpage->buf_fix_count == 0); + if (is_s_latched && is_uncompressed) { + rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock, + BUF_IO_WRITE); + } + + mutex_exit(block_mutex); + buf_pool_mutex_exit(); + + /* Even though bpage is not protected by any mutex at + this point, it is safe to access bpage, because it is + io_fixed and oldest_modification != 0. Thus, it + cannot be relocated in the buffer pool or removed from + flush_list or LRU_list. */ + + if (!is_s_latched) { + buf_flush_buffered_writes(); + + if (is_uncompressed) { + rw_lock_s_lock_gen(&((buf_block_t*) bpage) + ->lock, BUF_IO_WRITE); + } + } + + break; + + case BUF_FLUSH_LRU: + /* VERY IMPORTANT: + Because any thread may call the LRU flush, even when owning + locks on pages, to avoid deadlocks, we must make sure that the + s-lock is acquired on the page without waiting: this is + accomplished because buf_flush_ready_for_flush() must hold, + and that requires the page not to be bufferfixed. */ + + if (is_uncompressed) { + rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock, + BUF_IO_WRITE); + } + + /* Note that the s-latch is acquired before releasing the + buf_pool mutex: this ensures that the latch is acquired + immediately. */ + + mutex_exit(block_mutex); + buf_pool_mutex_exit(); + break; + + default: + ut_error; + } + + /* Even though bpage is not protected by any mutex at this + point, it is safe to access bpage, because it is io_fixed and + oldest_modification != 0. Thus, it cannot be relocated in the + buffer pool or removed from flush_list or LRU_list. 
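+	(In effect the BUF_IO_WRITE fix acts as a pin: both
+	buf_flush_ready_for_replace() and buf_flush_ready_for_flush()
+	require buf_page_get_io_fix() == BUF_IO_NONE, so no other thread
+	will free or re-flush this page before buf_flush_write_complete()
+	runs.)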
*/ + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Flushing %u space %u page %u\n", + flush_type, bpage->space, bpage->offset); + } +#endif /* UNIV_DEBUG */ + buf_flush_write_block_low(bpage); +} + +/*************************************************************** +Flushes to disk all flushable pages within the flush area. */ +static +ulint +buf_flush_try_neighbors( +/*====================*/ + /* out: number of pages flushed */ + ulint space, /* in: space id */ + ulint offset, /* in: page offset */ + enum buf_flush flush_type) /* in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST */ +{ + buf_page_t* bpage; + ulint low, high; + ulint count = 0; + ulint i; + + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { + /* If there is little space, it is better not to flush any + block except from the end of the LRU list */ + + low = offset; + high = offset + 1; + } else { + /* When flushed, dirty blocks are searched in neighborhoods of + this size, and flushed along with the original page. */ + + ulint buf_flush_area = ut_min(BUF_READ_AHEAD_AREA, + buf_pool->curr_size / 16); + + low = (offset / buf_flush_area) * buf_flush_area; + high = (offset / buf_flush_area + 1) * buf_flush_area; + } + + /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */ + + if (high > fil_space_get_size(space)) { + high = fil_space_get_size(space); + } + + buf_pool_mutex_enter(); + + for (i = low; i < high; i++) { + + bpage = buf_page_hash_get(space, i); + + if (!bpage) { + + continue; + } + + ut_a(buf_page_in_file(bpage)); + + /* We avoid flushing 'non-old' blocks in an LRU flush, + because the flushed blocks are soon freed */ + + if (flush_type != BUF_FLUSH_LRU + || i == offset + || buf_page_is_old(bpage)) { + mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_flush_ready_for_flush(bpage, flush_type) + && (i == offset || !bpage->buf_fix_count)) { + /* We only try to flush those + neighbors != offset where the buf fix count is + zero, as we then know that we probably can + latch the page without a semaphore wait. + Semaphore waits are expensive because we must + flush the doublewrite buffer before we start + waiting. */ + + buf_flush_page(bpage, flush_type); + ut_ad(!mutex_own(block_mutex)); + count++; + + buf_pool_mutex_enter(); + } else { + mutex_exit(block_mutex); + } + } + } + + buf_pool_mutex_exit(); + + return(count); +} + +/*********************************************************************** +This utility flushes dirty blocks from the end of the LRU list or flush_list. +NOTE 1: in the case of an LRU flush the calling thread may own latches to +pages: to avoid deadlocks, this function must be written so that it cannot +end up waiting for these latches! NOTE 2: in the case of a flush list flush, +the calling thread is not allowed to own any latches on pages! 
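+A typical (hypothetical) caller that wants everything older than a
+target lsn on disk would do:
+
+	ulint	n = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, target_lsn);
+
+	if (n == ULINT_UNDEFINED) {
+		/* a list flush was already running; wait for it */
+		buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+	}
+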
*/
+UNIV_INTERN
+ulint
+buf_flush_batch(
+/*============*/
+					/* out: number of blocks for which the
+					write request was queued;
+					ULINT_UNDEFINED if there was a flush
+					of the same type already running */
+	enum buf_flush	flush_type,	/* in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+					then the caller must not own any
+					latches on pages */
+	ulint		min_n,		/* in: wished minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though) */
+	ib_uint64_t	lsn_limit)	/* in: in the case BUF_FLUSH_LIST all
+					blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+{
+	buf_page_t*	bpage;
+	ulint		page_count	= 0;
+	ulint		old_page_count;
+	ulint		space;
+	ulint		offset;
+
+	ut_ad((flush_type == BUF_FLUSH_LRU)
+	      || (flush_type == BUF_FLUSH_LIST));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad((flush_type != BUF_FLUSH_LIST)
+	      || sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+	buf_pool_mutex_enter();
+
+	if ((buf_pool->n_flush[flush_type] > 0)
+	    || (buf_pool->init_flush[flush_type] == TRUE)) {
+
+		/* There is already a flush batch of the same type running */
+
+		buf_pool_mutex_exit();
+
+		return(ULINT_UNDEFINED);
+	}
+
+	buf_pool->init_flush[flush_type] = TRUE;
+
+	for (;;) {
+flush_next:
+		/* If we have flushed enough, leave the loop */
+		if (page_count >= min_n) {
+
+			break;
+		}
+
+		/* Start from the end of the list looking for a suitable
+		block to be flushed. */
+
+		if (flush_type == BUF_FLUSH_LRU) {
+			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+		} else {
+			ut_ad(flush_type == BUF_FLUSH_LIST);
+
+			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+			if (!bpage
+			    || bpage->oldest_modification >= lsn_limit) {
+				/* We have flushed enough */
+
+				break;
+			}
+			ut_ad(bpage->in_flush_list);
+		}
+
+		/* Note that after finding a single flushable page, we try to
+		flush also all its neighbors, and after that start from the
+		END of the LRU list or flush list again: the list may change
+		during the flushing and we cannot safely preserve within this
+		function a pointer to a block in the list! 
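+		(A concrete hazard this avoids: while this thread flushes the
+		page and its neighbors with the buf_pool mutex released,
+		another thread may free the very block we would advance to,
+		leaving a dangling pointer; restarting from the list tail
+		under the re-acquired mutex is always safe, at the cost of
+		rescanning.)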
*/
+
+		do {
+			mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+			ibool		ready;
+
+			ut_a(buf_page_in_file(bpage));
+
+			mutex_enter(block_mutex);
+			ready = buf_flush_ready_for_flush(bpage, flush_type);
+			mutex_exit(block_mutex);
+
+			if (ready) {
+				space = buf_page_get_space(bpage);
+				offset = buf_page_get_page_no(bpage);
+
+				buf_pool_mutex_exit();
+
+				old_page_count = page_count;
+
+				if (srv_flush_neighbor_pages) {
+					/* Try to flush also all the
+					neighbors */
+					page_count += buf_flush_try_neighbors(
+						space, offset, flush_type);
+				} else {
+					/* Try to flush the page only */
+					buf_page_t*	bpage_tmp;
+
+					buf_pool_mutex_enter();
+
+					bpage_tmp = buf_page_hash_get(
+						space, offset);
+
+					if (bpage_tmp) {
+						mutex_enter(buf_page_get_mutex(
+								bpage_tmp));
+						buf_flush_page(bpage_tmp,
+							       flush_type);
+						page_count++;
+					} else {
+						/* The page was freed or
+						relocated while we did not
+						hold the buf_pool mutex:
+						release the mutex again, as
+						it is re-acquired below. */
+						buf_pool_mutex_exit();
+					}
+				}
+				/* fprintf(stderr,
+				"Flush type %lu, page no %lu, neighb %lu\n",
+				flush_type, offset,
+				page_count - old_page_count); */
+
+				buf_pool_mutex_enter();
+				goto flush_next;
+
+			} else if (flush_type == BUF_FLUSH_LRU) {
+				bpage = UT_LIST_GET_PREV(LRU, bpage);
+			} else {
+				ut_ad(flush_type == BUF_FLUSH_LIST);
+
+				bpage = UT_LIST_GET_PREV(list, bpage);
+				ut_ad(!bpage || bpage->in_flush_list);
+			}
+		} while (bpage != NULL);
+
+		/* If we could not find anything to flush, leave the loop */
+
+		break;
+	}
+
+	buf_pool->init_flush[flush_type] = FALSE;
+
+	if (buf_pool->n_flush[flush_type] == 0) {
+
+		/* The running flush batch has ended */
+
+		os_event_set(buf_pool->no_flush[flush_type]);
+	}
+
+	buf_pool_mutex_exit();
+
+	buf_flush_buffered_writes();
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints && page_count > 0) {
+		ut_a(flush_type == BUF_FLUSH_LRU
+		     || flush_type == BUF_FLUSH_LIST);
+		fprintf(stderr, flush_type == BUF_FLUSH_LRU
+			? "Flushed %lu pages in LRU flush\n"
+			: "Flushed %lu pages in flush list flush\n",
+			(ulong) page_count);
+	}
+#endif /* UNIV_DEBUG */
+
+	srv_buf_pool_flushed += page_count;
+
+	return(page_count);
+}
+
+/**********************************************************************
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+	enum buf_flush	type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+{
+	ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));
+
+	os_event_wait(buf_pool->no_flush[type]);
+}
+
+/**********************************************************************
+Gives a recommendation of how many blocks should be flushed to establish
+a big enough margin of replaceable blocks near the end of the LRU list
+and in the free list. */
+static
+ulint
+buf_flush_LRU_recommendation(void)
+/*==============================*/
+			/* out: number of blocks which should be flushed
+			from the end of the LRU list */
+{
+	buf_page_t*	bpage;
+	ulint		n_replaceable;
+	ulint		distance = 0;
+	ibool		have_LRU_mutex = FALSE;
+
+	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU))
+		have_LRU_mutex = TRUE;
+
+	//buf_pool_mutex_enter();
+	if (have_LRU_mutex)
+		buf_pool_mutex_enter();
+
+	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
+
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while ((bpage != NULL)
+	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
+		   + BUF_FLUSH_EXTRA_MARGIN)
+	       && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
+
+		if (!bpage->in_LRU_list) {
+			/* restart, 
but it is very optimistic */ + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + continue; + } + + mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_flush_ready_for_replace(bpage)) { + n_replaceable++; + } + + mutex_exit(block_mutex); + + distance++; + + bpage = UT_LIST_GET_PREV(LRU, bpage); + } + + //buf_pool_mutex_exit(); + if (have_LRU_mutex) + buf_pool_mutex_exit(); + + if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { + + return(0); + } + + return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN + - n_replaceable); +} + +/************************************************************************* +Flushes pages from the end of the LRU list if there is too small a margin +of replaceable pages there or in the free list. VERY IMPORTANT: this function +is called also by threads which have locks on pages. To avoid deadlocks, we +flush only pages such that the s-lock required for flushing can be acquired +immediately, without waiting. */ +UNIV_INTERN +void +buf_flush_free_margin( +/*=======================*/ + ibool wait) +{ + ulint n_to_flush; + ulint n_flushed; + + n_to_flush = buf_flush_LRU_recommendation(); + + if (n_to_flush > 0) { + n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0); + if (wait && n_flushed == ULINT_UNDEFINED) { + /* There was an LRU type flush batch already running; + let us wait for it to end */ + + buf_flush_wait_batch_end(BUF_FLUSH_LRU); + } + } +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/********************************************************************** +Validates the flush list. */ +static +ibool +buf_flush_validate_low(void) +/*========================*/ + /* out: TRUE if ok */ +{ + buf_page_t* bpage; + + UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list); + + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (bpage != NULL) { + const ib_uint64_t om = bpage->oldest_modification; + ut_ad(bpage->in_flush_list); + ut_a(buf_page_in_file(bpage)); + ut_a(om > 0); + + bpage = UT_LIST_GET_NEXT(list, bpage); + + ut_a(!bpage || om >= bpage->oldest_modification); + } + + return(TRUE); +} + +/********************************************************************** +Validates the flush list. */ +UNIV_INTERN +ibool +buf_flush_validate(void) +/*====================*/ + /* out: TRUE if ok */ +{ + ibool ret; + + buf_pool_mutex_enter(); + + ret = buf_flush_validate_low(); + + buf_pool_mutex_exit(); + + return(ret); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c new file mode 100644 index 00000000000..dcf4d70c3eb --- /dev/null +++ b/storage/xtradb/buf/buf0lru.c @@ -0,0 +1,2060 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0lru.h" + +#ifdef UNIV_NONINL +#include "buf0lru.ic" +#endif + +#include "ut0byte.h" +#include "ut0lst.h" +#include "ut0rnd.h" +#include "sync0sync.h" +#include "sync0rw.h" +#include "hash0hash.h" +#include "os0sync.h" +#include "fil0fil.h" +#include "btr0btr.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "os0file.h" +#include "page0zip.h" +#include "log0recv.h" +#include "srv0srv.h" + +/* The number of blocks from the LRU_old pointer onward, including the block +pointed to, must be 3/8 of the whole LRU list length, except that the +tolerance defined below is allowed. Note that the tolerance must be small +enough such that for even the BUF_LRU_OLD_MIN_LEN long LRU list, the +LRU_old pointer is not allowed to point to either end of the LRU list. */ + +#define BUF_LRU_OLD_TOLERANCE 20 + +/* The whole LRU list length is divided by this number to determine an +initial segment in buf_LRU_get_recent_limit */ + +#define BUF_LRU_INITIAL_RATIO 8 + +/* When dropping the search hash index entries before deleting an ibd +file, we build a local array of pages belonging to that tablespace +in the buffer pool. Following is the size of that array. */ +#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024 + +/* If we switch on the InnoDB monitor because there are too few available +frames in the buffer pool, we set this to TRUE */ +UNIV_INTERN ibool buf_lru_switched_on_innodb_mon = FALSE; + +/********************************************************************** +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics, +buf_LRU_evict_from_unzip_LRU() decides if we want to evict from +unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the +uncompressed frame (meaning we can evict dirty blocks as well). From +the regular LRU, we will evict the entire block (i.e.: both the +uncompressed and compressed data), which must be clean. */ + +/* Number of intervals for which we keep the history of these stats. +Each interval is 1 second, defined by the rate at which +srv_error_monitor_thread() calls buf_LRU_stat_update(). */ +#define BUF_LRU_STAT_N_INTERVAL 50 + +/* Co-efficient with which we multiply I/O operations to equate them +with page_zip_decompress() operations. */ +#define BUF_LRU_IO_TO_UNZIP_FACTOR 50 + +/* Sampled values buf_LRU_stat_cur. +Protected by buf_pool_mutex. Updated by buf_LRU_stat_update(). */ +static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL]; +/* Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */ +static ulint buf_LRU_stat_arr_ind; + +/* Current operation counters. Not protected by any mutex. Cleared +by buf_LRU_stat_update(). */ +UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_cur; + +/* Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Protected by buf_pool_mutex. 
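+In effect buf_LRU_stat_arr[] is a ring buffer of the last
+BUF_LRU_STAT_N_INTERVAL one-second samples and buf_LRU_stat_sum is
+their running total.  A sketch of the once-per-second update (the real
+code is in buf_LRU_stat_update(), not shown here; the io field is
+shown, unzip is analogous):
+
+	buf_LRU_stat_sum.io += buf_LRU_stat_cur.io
+		- buf_LRU_stat_arr[buf_LRU_stat_arr_ind].io;
+	buf_LRU_stat_arr[buf_LRU_stat_arr_ind] = buf_LRU_stat_cur;
+	buf_LRU_stat_arr_ind = (buf_LRU_stat_arr_ind + 1)
+		% BUF_LRU_STAT_N_INTERVAL;
+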
*/
+UNIV_INTERN buf_LRU_stat_t	buf_LRU_stat_sum;
+
+/**********************************************************************
+Takes a block out of the LRU list and page hash table.
+If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+the object will be freed and buf_pool_zip_mutex will be released.
+
+If a compressed page or a compressed-only block descriptor is freed,
+other compressed pages or compressed-only block descriptors may be
+relocated. */
+static
+enum buf_page_state
+buf_LRU_block_remove_hashed_page(
+/*=============================*/
+				/* out: the new state of the block
+				(BUF_BLOCK_ZIP_FREE if the state was
+				BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH
+				otherwise) */
+	buf_page_t*	bpage,	/* in: block, must contain a file page and
+				be in a state where it can be freed; there
+				may or may not be a hash index to the page */
+	ibool		zip);	/* in: TRUE if should remove also the
+				compressed page of an uncompressed page */
+/**********************************************************************
+Puts a file page which has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+	buf_block_t*	block);	/* in: block, must contain a file page and
+				be in a state where it can be freed */
+
+/**********************************************************************
+Determines if the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list. */
+UNIV_INLINE
+ibool
+buf_LRU_evict_from_unzip_LRU(void)
+/*==============================*/
+				/* out: TRUE if should use unzip_LRU */
+{
+	ulint	io_avg;
+	ulint	unzip_avg;
+
+	ut_ad(buf_pool_mutex_own());
+
+	/* If the unzip_LRU list is empty, we can only use the LRU. */
+	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
+		return(FALSE);
+	}
+
+	/* If unzip_LRU is at most 10% of the size of the LRU list,
+	then use the LRU. This slack allows us to keep hot
+	decompressed pages in the buffer pool. */
+	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
+	    <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+		return(FALSE);
+	}
+
+	/* If eviction hasn't started yet, we assume by default
+	that a workload is disk bound. */
+	if (buf_pool->freed_page_clock == 0) {
+		return(TRUE);
+	}
+
+	/* Calculate the average over past intervals, and add the values
+	of the current interval. */
+	io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL
+		+ buf_LRU_stat_cur.io;
+	unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL
+		+ buf_LRU_stat_cur.unzip;
+
+	/* Decide based on our formula. If the load is I/O bound
+	(unzip_avg is smaller than the weighted io_avg), evict an
+	uncompressed frame from unzip_LRU. Otherwise we assume that
+	the load is CPU bound and evict from the regular LRU. */
+	return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
+}
+
+/**********************************************************************
+Attempts to drop page hash index on a batch of pages belonging to a
+particular space id. 
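+Each call hands at most BUF_LRU_DROP_SEARCH_HASH_SIZE page numbers,
+collected from the LRU list, to btr_search_drop_page_hash_when_freed().
+Batching matters because the collection scan holds the buf_pool mutex,
+which must be released (to respect the latching order) before the
+btr_search call; see the caller below.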
*/ +static +void +buf_LRU_drop_page_hash_batch( +/*=========================*/ + ulint space_id, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + const ulint* arr, /* in: array of page_no */ + ulint count) /* in: number of entries in array */ +{ + ulint i; + + ut_ad(arr != NULL); + ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE); + + for (i = 0; i < count; ++i) { + btr_search_drop_page_hash_when_freed(space_id, zip_size, + arr[i]); + } +} + +/********************************************************************** +When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page +hash index entries belonging to that table. This function tries to +do that in batch. Note that this is a 'best effort' attempt and does +not guarantee that ALL hash entries will be removed. */ +static +void +buf_LRU_drop_page_hash_for_tablespace( +/*==================================*/ + ulint id) /* in: space id */ +{ + buf_page_t* bpage; + ulint* page_arr; + ulint num_entries; + ulint zip_size; + + zip_size = fil_space_get_zip_size(id); + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + /* Somehow, the tablespace does not exist. Nothing to drop. */ + ut_ad(0); + return; + } + + page_arr = ut_malloc(sizeof(ulint) + * BUF_LRU_DROP_SEARCH_HASH_SIZE); + buf_pool_mutex_enter(); + +scan_again: + num_entries = 0; + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + + while (bpage != NULL) { + mutex_t* block_mutex = buf_page_get_mutex(bpage); + buf_page_t* prev_bpage; + + mutex_enter(block_mutex); + prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + + ut_a(buf_page_in_file(bpage)); + + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE + || bpage->space != id + || bpage->buf_fix_count > 0 + || bpage->io_fix != BUF_IO_NONE) { + /* We leave the fixed pages as is in this scan. + To be dealt with later in the final scan. */ + mutex_exit(block_mutex); + goto next_page; + } + + if (((buf_block_t*) bpage)->is_hashed) { + + /* Store the offset(i.e.: page_no) in the array + so that we can drop hash index in a batch + later. */ + page_arr[num_entries] = bpage->offset; + mutex_exit(block_mutex); + ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); + ++num_entries; + + if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { + goto next_page; + } + /* Array full. We release the buf_pool_mutex to + obey the latching order. */ + buf_pool_mutex_exit(); + + buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, + num_entries); + num_entries = 0; + buf_pool_mutex_enter(); + } else { + mutex_exit(block_mutex); + } + +next_page: + /* Note that we may have released the buf_pool mutex + above after reading the prev_bpage during processing + of a page_hash_batch (i.e.: when the array was full). + This means that prev_bpage can change in LRU list. + This is OK because this function is a 'best effort' + to drop as many search hash entries as possible and + it does not guarantee that ALL such entries will be + dropped. */ + bpage = prev_bpage; + + /* If, however, bpage has been removed from LRU list + to the free list then we should restart the scan. + bpage->state is protected by buf_pool mutex. */ + if (bpage && !buf_page_in_file(bpage)) { + ut_a(num_entries == 0); + goto scan_again; + } + } + + buf_pool_mutex_exit(); + + /* Drop any remaining batch of search hashed pages. 
*/ + buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); + ut_free(page_arr); +} + +/********************************************************************** +Invalidates all pages belonging to a given tablespace when we are deleting +the data file(s) of that tablespace. */ +UNIV_INTERN +void +buf_LRU_invalidate_tablespace( +/*==========================*/ + ulint id) /* in: space id */ +{ + buf_page_t* bpage; + ibool all_freed; + + /* Before we attempt to drop pages one by one we first + attempt to drop page hash index entries in batches to make + it more efficient. The batching attempt is a best effort + attempt and does not guarantee that all pages hash entries + will be dropped. We get rid of remaining page hash entries + one by one below. */ + buf_LRU_drop_page_hash_for_tablespace(id); + +scan_again: + buf_pool_mutex_enter(); + + all_freed = TRUE; + + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + + while (bpage != NULL) { + mutex_t* block_mutex = buf_page_get_mutex(bpage); + buf_page_t* prev_bpage; + + ut_a(buf_page_in_file(bpage)); + + mutex_enter(block_mutex); + prev_bpage = UT_LIST_GET_PREV(LRU, bpage); + + if (buf_page_get_space(bpage) == id) { + if (bpage->buf_fix_count > 0 + || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + + /* We cannot remove this page during + this scan yet; maybe the system is + currently reading it in, or flushing + the modifications to the file */ + + all_freed = FALSE; + + goto next_page; + } + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Dropping space %lu page %lu\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif + if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE + && ((buf_block_t*) bpage)->is_hashed) { + ulint page_no; + ulint zip_size; + + buf_pool_mutex_exit(); + + zip_size = buf_page_get_zip_size(bpage); + page_no = buf_page_get_page_no(bpage); + + mutex_exit(block_mutex); + + /* Note that the following call will acquire + an S-latch on the page */ + + btr_search_drop_page_hash_when_freed( + id, zip_size, page_no); + goto scan_again; + } + + if (bpage->oldest_modification != 0) { + + buf_flush_remove(bpage); + } + + /* Remove from the LRU list */ + if (buf_LRU_block_remove_hashed_page(bpage, TRUE) + != BUF_BLOCK_ZIP_FREE) { + buf_LRU_block_free_hashed_page((buf_block_t*) + bpage); + } else { + /* The block_mutex should have been + released by buf_LRU_block_remove_hashed_page() + when it returns BUF_BLOCK_ZIP_FREE. */ + ut_ad(block_mutex == &buf_pool_zip_mutex); + ut_ad(!mutex_own(block_mutex)); + + /* The compressed block descriptor + (bpage) has been deallocated and + block_mutex released. Also, + buf_buddy_free() may have relocated + prev_bpage. Rescan the LRU list. */ + + bpage = UT_LIST_GET_LAST(buf_pool->LRU); + continue; + } + } +next_page: + mutex_exit(block_mutex); + bpage = prev_bpage; + } + + buf_pool_mutex_exit(); + + if (!all_freed) { + os_thread_sleep(20000); + + goto scan_again; + } +} + +/********************************************************************** +Gets the minimum LRU_position field for the blocks in an initial segment +(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not +guaranteed to be precise, because the ulint_clock may wrap around. 
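+As a worked example with invented numbers: for an LRU list of 8000
+blocks and BUF_LRU_INITIAL_RATIO == 8, the function returns
+
+	buf_page_get_LRU_position(first_block) - 8000 / 8
+
+i.e. the LRU_position clock value 1000 ticks behind the list head;
+pages above this limit form the 'recent' initial segment.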
*/
+UNIV_INTERN
+ulint
+buf_LRU_get_recent_limit(void)
+/*==========================*/
+ /* out: the limit; zero if could not determine it */
+{
+ const buf_page_t* bpage;
+ ulint len;
+ ulint limit;
+
+ buf_pool_mutex_enter();
+
+ len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ if (len < BUF_LRU_OLD_MIN_LEN) {
+ /* The LRU list is too short to do read-ahead */
+
+ buf_pool_mutex_exit();
+
+ return(0);
+ }
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+
+ limit = buf_page_get_LRU_position(bpage) - len / BUF_LRU_INITIAL_RATIO;
+
+ buf_pool_mutex_exit();
+
+ return(limit);
+}
+
+/************************************************************************
+Insert a compressed block into buf_pool->zip_clean in the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+ buf_page_t* bpage) /* in: pointer to the block in question */
+{
+ buf_page_t* b;
+
+ ut_ad(buf_pool_mutex_own());
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
+
+ /* Find the first successor of bpage in the LRU list
+ that is in the zip_clean list. */
+ b = bpage;
+ do {
+ b = UT_LIST_GET_NEXT(LRU, b);
+ } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE);
+
+ /* Insert bpage before b, i.e., after the predecessor of b. */
+ if (b) {
+ b = UT_LIST_GET_PREV(list, b);
+ }
+
+ if (b) {
+ UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage);
+ } else {
+ UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage);
+ }
+}
+
+/**********************************************************************
+Try to free an uncompressed page of a compressed block from the unzip
+LRU list. The compressed page is preserved, and it need not be clean. */
+UNIV_INLINE
+ibool
+buf_LRU_free_from_unzip_LRU_list(
+/*=============================*/
+ /* out: TRUE if freed */
+ ulint n_iterations) /* in: how many times this has been called
+ repeatedly without result: a high value means
+ that we should search farther; we will search
+ n_iterations / 5 of the unzip_LRU list,
+ or nothing if n_iterations >= 5 */
+{
+ buf_block_t* block;
+ ulint distance;
+
+ ut_ad(buf_pool_mutex_own());
+
+ /* Theoretically it should be much easier to find a victim
+ from unzip_LRU as we can choose even a dirty block (as we'll
+ be evicting only the uncompressed frame). In the very unlikely
+ eventuality that we are unable to find a victim from
+ unzip_LRU, we fall back to the regular LRU list. We do this
+ if we have done five iterations so far. */
+
+ if (UNIV_UNLIKELY(n_iterations >= 5)
+ || !buf_LRU_evict_from_unzip_LRU()) {
+
+ return(FALSE);
+ }
+
+ distance = 100 + (n_iterations
+ * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5;
+
+ for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+ UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0);
+ block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) {
+
+ enum buf_lru_free_block_status freed;
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ mutex_enter(&block->mutex);
+ freed = buf_LRU_free_block(&block->page, FALSE, NULL);
+ mutex_exit(&block->mutex);
+
+ switch (freed) {
+ case BUF_LRU_FREED:
+ return(TRUE);
+
+ case BUF_LRU_CANNOT_RELOCATE:
+ /* If we failed to relocate, try
+ regular LRU eviction. */
+ return(FALSE);
+
+ case BUF_LRU_NOT_FREED:
+ /* The block was buffer-fixed or I/O-fixed.
+ Keep looking.
*/ + continue; + } + + /* inappropriate return value from + buf_LRU_free_block() */ + ut_error; + } + + return(FALSE); +} + +/********************************************************************** +Try to free a clean page from the common LRU list. */ +UNIV_INLINE +ibool +buf_LRU_free_from_common_LRU_list( +/*==============================*/ + /* out: TRUE if freed */ + ulint n_iterations) /* in: how many times this has been called + repeatedly without result: a high value means + that we should search farther; if + n_iterations < 10, then we search + n_iterations / 10 * buf_pool->curr_size + pages from the end of the LRU list */ +{ + buf_page_t* bpage; + ulint distance; + + ut_ad(buf_pool_mutex_own()); + + distance = 100 + (n_iterations * buf_pool->curr_size) / 10; + + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); + UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0); + bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) { + + enum buf_lru_free_block_status freed; + mutex_t* block_mutex + = buf_page_get_mutex(bpage); + + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + mutex_enter(block_mutex); + freed = buf_LRU_free_block(bpage, TRUE, NULL); + mutex_exit(block_mutex); + + switch (freed) { + case BUF_LRU_FREED: + return(TRUE); + + case BUF_LRU_NOT_FREED: + /* The block was dirty, buffer-fixed, or I/O-fixed. + Keep looking. */ + continue; + + case BUF_LRU_CANNOT_RELOCATE: + /* This should never occur, because we + want to discard the compressed page too. */ + break; + } + + /* inappropriate return value from + buf_LRU_free_block() */ + ut_error; + } + + return(FALSE); +} + +/********************************************************************** +Try to free a replaceable block. */ +UNIV_INTERN +ibool +buf_LRU_search_and_free_block( +/*==========================*/ + /* out: TRUE if found and freed */ + ulint n_iterations) /* in: how many times this has been called + repeatedly without result: a high value means + that we should search farther; if + n_iterations < 10, then we search + n_iterations / 10 * buf_pool->curr_size + pages from the end of the LRU list; if + n_iterations < 5, then we will also search + n_iterations / 5 of the unzip_LRU list. */ +{ + ibool freed = FALSE; + + buf_pool_mutex_enter(); + + freed = buf_LRU_free_from_unzip_LRU_list(n_iterations); + + if (!freed) { + freed = buf_LRU_free_from_common_LRU_list(n_iterations); + } + + if (!freed) { + buf_pool->LRU_flush_ended = 0; + } else if (buf_pool->LRU_flush_ended > 0) { + buf_pool->LRU_flush_ended--; + } + + buf_pool_mutex_exit(); + + return(freed); +} + +/********************************************************************** +Tries to remove LRU flushed blocks from the end of the LRU list and put them +to the free list. This is beneficial for the efficiency of the insert buffer +operation, as flushed pages from non-unique non-clustered indexes are here +taken out of the buffer pool, and their inserts redirected to the insert +buffer. Otherwise, the flushed blocks could get modified again before read +operations need new buffer blocks, and the i/o work done in flushing would be +wasted. 
*/ +UNIV_INTERN +void +buf_LRU_try_free_flushed_blocks(void) +/*=================================*/ +{ + buf_pool_mutex_enter(); + + while (buf_pool->LRU_flush_ended > 0) { + + buf_pool_mutex_exit(); + + buf_LRU_search_and_free_block(1); + + buf_pool_mutex_enter(); + } + + buf_pool_mutex_exit(); +} + +/********************************************************************** +Returns TRUE if less than 25 % of the buffer pool is available. This can be +used in heuristics to prevent huge transactions eating up the whole buffer +pool for their locks. */ +UNIV_INTERN +ibool +buf_LRU_buf_pool_running_out(void) +/*==============================*/ + /* out: TRUE if less than 25 % of buffer pool + left */ +{ + ibool ret = FALSE; + + buf_pool_mutex_enter(); + + if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 4) { + + ret = TRUE; + } + + buf_pool_mutex_exit(); + + return(ret); +} + +/********************************************************************** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, returns NULL. */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_only(void) +/*=======================*/ + /* out: a free control block, or NULL + if the buf_block->free list is empty */ +{ + buf_block_t* block; + + ut_ad(buf_pool_mutex_own()); + + block = (buf_block_t*) UT_LIST_GET_FIRST(buf_pool->free); + + if (block) { + ut_ad(block->page.in_free_list); + ut_d(block->page.in_free_list = FALSE); + ut_ad(!block->page.in_flush_list); + ut_ad(!block->page.in_LRU_list); + ut_a(!buf_page_in_file(&block->page)); + UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + + mutex_enter(&block->mutex); + + buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); + UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); + + mutex_exit(&block->mutex); + } + + return(block); +} + +/********************************************************************** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, blocks are moved from the end of the +LRU list to the free list. */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_block( +/*===================*/ + /* out: the free control block, + in state BUF_BLOCK_READY_FOR_USE */ + ulint zip_size) /* in: compressed page size in bytes, + or 0 if uncompressed tablespace */ +{ + buf_block_t* block = NULL; + ibool freed; + ulint n_iterations = 1; + ibool mon_value_was = FALSE; + ibool started_monitor = FALSE; +loop: + buf_pool_mutex_enter(); + + if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: ERROR: over 95 percent of the buffer pool" + " is occupied by\n" + "InnoDB: lock heaps or the adaptive hash index!" + " Check that your\n" + "InnoDB: transactions do not set too many row locks.\n" + "InnoDB: Your buffer pool size is %lu MB." + " Maybe you should make\n" + "InnoDB: the buffer pool bigger?\n" + "InnoDB: We intentionally generate a seg fault" + " to print a stack trace\n" + "InnoDB: on Linux!\n", + (ulong) (buf_pool->curr_size + / (1024 * 1024 / UNIV_PAGE_SIZE))); + + ut_error; + + } else if (!recv_recovery_on + && (UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU)) + < buf_pool->curr_size / 3) { + + if (!buf_lru_switched_on_innodb_mon) { + + /* Over 67 % of the buffer pool is occupied by lock + heaps or the adaptive hash index. This may be a memory + leak! 
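+
+As a concrete illustration of the two thresholds used in this
+function (numbers only; both follow directly from the checks in the
+code): with buf_pool->curr_size == 8192 pages,
+
+ free + LRU < 8192 / 20 (=  409) aborts with ut_error (over 95 %),
+ free + LRU < 8192 / 3  (= 2730) merely prints this warning.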
*/ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: WARNING: over 67 percent of" + " the buffer pool is occupied by\n" + "InnoDB: lock heaps or the adaptive" + " hash index! Check that your\n" + "InnoDB: transactions do not set too many" + " row locks.\n" + "InnoDB: Your buffer pool size is %lu MB." + " Maybe you should make\n" + "InnoDB: the buffer pool bigger?\n" + "InnoDB: Starting the InnoDB Monitor to print" + " diagnostics, including\n" + "InnoDB: lock heap and hash index sizes.\n", + (ulong) (buf_pool->curr_size + / (1024 * 1024 / UNIV_PAGE_SIZE))); + + buf_lru_switched_on_innodb_mon = TRUE; + srv_print_innodb_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } + } else if (buf_lru_switched_on_innodb_mon) { + + /* Switch off the InnoDB Monitor; this is a simple way + to stop the monitor if the situation becomes less urgent, + but may also surprise users if the user also switched on the + monitor! */ + + buf_lru_switched_on_innodb_mon = FALSE; + srv_print_innodb_monitor = FALSE; + } + + /* If there is a block in the free list, take it */ + block = buf_LRU_get_free_only(); + if (block) { + +#ifdef UNIV_DEBUG + block->page.zip.m_start = +#endif /* UNIV_DEBUG */ + block->page.zip.m_end = + block->page.zip.m_nonempty = + block->page.zip.n_blobs = 0; + + if (UNIV_UNLIKELY(zip_size)) { + ibool lru; + page_zip_set_size(&block->page.zip, zip_size); + block->page.zip.data = buf_buddy_alloc(zip_size, &lru); + UNIV_MEM_DESC(block->page.zip.data, zip_size, block); + } else { + page_zip_set_size(&block->page.zip, 0); + block->page.zip.data = NULL; + } + + buf_pool_mutex_exit(); + + if (started_monitor) { + srv_print_innodb_monitor = mon_value_was; + } + + return(block); + } + + /* If no block was in the free list, search from the end of the LRU + list and try to free a block there */ + + buf_pool_mutex_exit(); + + freed = buf_LRU_search_and_free_block(n_iterations); + + if (freed > 0) { + goto loop; + } + + if (n_iterations > 30) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: difficult to find free blocks in\n" + "InnoDB: the buffer pool (%lu search iterations)!" + " Consider\n" + "InnoDB: increasing the buffer pool size.\n" + "InnoDB: It is also possible that" + " in your Unix version\n" + "InnoDB: fsync is very slow, or" + " completely frozen inside\n" + "InnoDB: the OS kernel. Then upgrading to" + " a newer version\n" + "InnoDB: of your operating system may help." + " Look at the\n" + "InnoDB: number of fsyncs in diagnostic info below.\n" + "InnoDB: Pending flushes (fsync) log: %lu;" + " buffer pool: %lu\n" + "InnoDB: %lu OS file reads, %lu OS file writes," + " %lu OS fsyncs\n" + "InnoDB: Starting InnoDB Monitor to print further\n" + "InnoDB: diagnostics to the standard output.\n", + (ulong) n_iterations, + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); + + mon_value_was = srv_print_innodb_monitor; + started_monitor = TRUE; + srv_print_innodb_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } + + /* No free block was found: try to flush the LRU list */ + + buf_flush_free_margin(TRUE); + ++srv_buf_pool_wait_free; + + os_aio_simulated_wake_handler_threads(); + + buf_pool_mutex_enter(); + + if (buf_pool->LRU_flush_ended > 0) { + /* We have written pages in an LRU flush. To make the insert + buffer more efficient, we try to move these pages to the free + list. 
*/ + + buf_pool_mutex_exit(); + + buf_LRU_try_free_flushed_blocks(); + } else { + buf_pool_mutex_exit(); + } + + if (n_iterations > 10) { + + os_thread_sleep(500000); + } + + n_iterations++; + + goto loop; +} + +/*********************************************************************** +Moves the LRU_old pointer so that the length of the old blocks list +is inside the allowed limits. */ +UNIV_INLINE +void +buf_LRU_old_adjust_len(void) +/*========================*/ +{ + ulint old_len; + ulint new_len; + + ut_a(buf_pool->LRU_old); + ut_ad(buf_pool_mutex_own()); +#if 3 * (BUF_LRU_OLD_MIN_LEN / 8) <= BUF_LRU_OLD_TOLERANCE + 5 +# error "3 * (BUF_LRU_OLD_MIN_LEN / 8) <= BUF_LRU_OLD_TOLERANCE + 5" +#endif +#ifdef UNIV_LRU_DEBUG + /* buf_pool->LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool->LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + + for (;;) { + old_len = buf_pool->LRU_old_len; + new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); + + ut_ad(buf_pool->LRU_old->in_LRU_list); + ut_a(buf_pool->LRU_old); +#ifdef UNIV_LRU_DEBUG + ut_a(buf_pool->LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + + /* Update the LRU_old pointer if necessary */ + + if (old_len < new_len - BUF_LRU_OLD_TOLERANCE) { + + buf_pool->LRU_old = UT_LIST_GET_PREV( + LRU, buf_pool->LRU_old); +#ifdef UNIV_LRU_DEBUG + ut_a(!buf_pool->LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + buf_page_set_old(buf_pool->LRU_old, TRUE); + buf_pool->LRU_old_len++; + + } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { + + buf_page_set_old(buf_pool->LRU_old, FALSE); + buf_pool->LRU_old = UT_LIST_GET_NEXT( + LRU, buf_pool->LRU_old); + buf_pool->LRU_old_len--; + } else { + return; + } + } +} + +/*********************************************************************** +Initializes the old blocks pointer in the LRU list. This function should be +called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */ +static +void +buf_LRU_old_init(void) +/*==================*/ +{ + buf_page_t* bpage; + + ut_ad(buf_pool_mutex_own()); + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); + + /* We first initialize all blocks in the LRU list as old and then use + the adjust function to move the LRU_old pointer to the right + position */ + + bpage = UT_LIST_GET_FIRST(buf_pool->LRU); + + while (bpage != NULL) { + ut_ad(bpage->in_LRU_list); + buf_page_set_old(bpage, TRUE); + bpage = UT_LIST_GET_NEXT(LRU, bpage); + } + + buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU); + buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU); + + buf_LRU_old_adjust_len(); +} + +/********************************************************************** +Remove a block from the unzip_LRU list if it belonged to the list. */ +static +void +buf_unzip_LRU_remove_block_if_needed( +/*=================================*/ + buf_page_t* bpage) /* in/out: control block */ +{ + ut_ad(buf_pool); + ut_ad(bpage); + ut_ad(buf_page_in_file(bpage)); + ut_ad(buf_pool_mutex_own()); + + if (buf_page_belongs_to_unzip_LRU(bpage)) { + buf_block_t* block = (buf_block_t*) bpage; + + ut_ad(block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = FALSE); + + UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block); + } +} + +/********************************************************************** +Removes a block from the LRU list. 
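+
+Removal must keep the LRU_old pointer at roughly 3/8 of the list, the
+invariant maintained by buf_LRU_old_adjust_len() above. As a worked
+example of that target length: with UT_LIST_GET_LEN(buf_pool->LRU)
+== 800,
+
+ new_len = 3 * (800 / 8) = 300;
+
+and buf_pool->LRU_old_len is kept within BUF_LRU_OLD_TOLERANCE of it.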
*/ +UNIV_INLINE +void +buf_LRU_remove_block( +/*=================*/ + buf_page_t* bpage) /* in: control block */ +{ + ut_ad(buf_pool); + ut_ad(bpage); + ut_ad(buf_pool_mutex_own()); + + ut_a(buf_page_in_file(bpage)); + + ut_ad(bpage->in_LRU_list); + + /* If the LRU_old pointer is defined and points to just this block, + move it backward one step */ + + if (UNIV_UNLIKELY(bpage == buf_pool->LRU_old)) { + + /* Below: the previous block is guaranteed to exist, because + the LRU_old pointer is only allowed to differ by the + tolerance value from strict 3/8 of the LRU list length. */ + + buf_pool->LRU_old = UT_LIST_GET_PREV(LRU, bpage); + ut_a(buf_pool->LRU_old); +#ifdef UNIV_LRU_DEBUG + ut_a(!buf_pool->LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + buf_page_set_old(buf_pool->LRU_old, TRUE); + + buf_pool->LRU_old_len++; + } + + /* Remove the block from the LRU list */ + UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage); + bpage->in_LRU_list = FALSE; + + buf_unzip_LRU_remove_block_if_needed(bpage); + + /* If the LRU list is so short that LRU_old not defined, return */ + if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { + + buf_pool->LRU_old = NULL; + + return; + } + + ut_ad(buf_pool->LRU_old); + + /* Update the LRU_old_len field if necessary */ + if (buf_page_is_old(bpage)) { + + buf_pool->LRU_old_len--; + } + + /* Adjust the length of the old block list if necessary */ + buf_LRU_old_adjust_len(); +} + +/********************************************************************** +Adds a block to the LRU list of decompressed zip pages. */ +UNIV_INTERN +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /* in: control block */ + ibool old) /* in: TRUE if should be put to the end + of the list, else put to the start */ +{ + ut_ad(buf_pool); + ut_ad(block); + ut_ad(buf_pool_mutex_own()); + + ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); + + ut_ad(!block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = TRUE); + + if (old) { + UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block); + } else { + UT_LIST_ADD_FIRST(unzip_LRU, buf_pool->unzip_LRU, block); + } +} + +/********************************************************************** +Adds a block to the LRU list end. 
*/ +UNIV_INLINE +void +buf_LRU_add_block_to_end_low( +/*=========================*/ + buf_page_t* bpage) /* in: control block */ +{ + buf_page_t* last_bpage; + + ut_ad(buf_pool); + ut_ad(bpage); + ut_ad(buf_pool_mutex_own()); + + ut_a(buf_page_in_file(bpage)); + + last_bpage = UT_LIST_GET_LAST(buf_pool->LRU); + + if (last_bpage) { + bpage->LRU_position = last_bpage->LRU_position; + } else { + bpage->LRU_position = buf_pool_clock_tic(); + } + + ut_ad(!bpage->in_LRU_list); + UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage); + bpage->in_LRU_list = TRUE; + + buf_page_set_old(bpage, TRUE); + + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { + + buf_pool->LRU_old_len++; + } + + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool->LRU_old); + + /* Adjust the length of the old block list if necessary */ + + buf_LRU_old_adjust_len(); + + } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(); + } + + /* If this is a zipped block with decompressed frame as well + then put it on the unzip_LRU list */ + if (buf_page_belongs_to_unzip_LRU(bpage)) { + buf_unzip_LRU_add_block((buf_block_t*) bpage, TRUE); + } +} + +/********************************************************************** +Adds a block to the LRU list. */ +UNIV_INLINE +void +buf_LRU_add_block_low( +/*==================*/ + buf_page_t* bpage, /* in: control block */ + ibool old) /* in: TRUE if should be put to the old blocks + in the LRU list, else put to the start; if the + LRU list is very short, the block is added to + the start, regardless of this parameter */ +{ + ut_ad(buf_pool); + ut_ad(bpage); + ut_ad(buf_pool_mutex_own()); + + ut_a(buf_page_in_file(bpage)); + ut_ad(!bpage->in_LRU_list); + + if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) { + + UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage); + + bpage->LRU_position = buf_pool_clock_tic(); + bpage->freed_page_clock = buf_pool->freed_page_clock; + } else { +#ifdef UNIV_LRU_DEBUG + /* buf_pool->LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool->LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old, + bpage); + buf_pool->LRU_old_len++; + + /* We copy the LRU position field of the previous block + to the new block */ + + bpage->LRU_position = (buf_pool->LRU_old)->LRU_position; + } + + bpage->in_LRU_list = TRUE; + + buf_page_set_old(bpage, old); + + if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool->LRU_old); + + /* Adjust the length of the old block list if necessary */ + + buf_LRU_old_adjust_len(); + + } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(); + } + + /* If this is a zipped block with decompressed frame as well + then put it on the unzip_LRU list */ + if (buf_page_belongs_to_unzip_LRU(bpage)) { + buf_unzip_LRU_add_block((buf_block_t*) bpage, old); + } +} + +/********************************************************************** +Adds a block to the LRU list. 
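+
+A sketch of typical use (the caller shown is an assumption; see the
+read path in buf0buf.c): a page brought in by a read is first added
+behind the old-blocks pointer, so that a single large scan cannot
+evict the hot young sublist:
+
+ buf_LRU_add_block(bpage, TRUE); (start in the old sublist)
+
+whereas old == FALSE puts the block at the very head of the list.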
*/
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /* in: control block */
+ ibool old) /* in: TRUE if should be put to the old
+ blocks in the LRU list, else put to the start;
+ if the LRU list is very short, the block is
+ added to the start, regardless of this
+ parameter */
+{
+ buf_LRU_add_block_low(bpage, old);
+}
+
+/**********************************************************************
+Moves a block to the start of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+ buf_page_t* bpage) /* in: control block */
+{
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block_low(bpage, FALSE);
+}
+
+/**********************************************************************
+Moves a block to the end of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+ buf_page_t* bpage) /* in: control block */
+{
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block_to_end_low(bpage);
+}
+
+/**********************************************************************
+Try to free a block. If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns BUF_LRU_FREED, it will temporarily
+release buf_pool_mutex. Furthermore, the page frame will no longer be
+accessible via bpage.
+
+The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and
+release these two mutexes after the call. No other
+buf_page_get_mutex() may be held when calling this function. */
+UNIV_INTERN
+enum buf_lru_free_block_status
+buf_LRU_free_block(
+/*===============*/
+ /* out: BUF_LRU_FREED if freed,
+ BUF_LRU_CANNOT_RELOCATE or
+ BUF_LRU_NOT_FREED otherwise. */
+ buf_page_t* bpage, /* in: block to be freed */
+ ibool zip, /* in: TRUE if should remove also the
+ compressed page of an uncompressed page */
+ ibool* buf_pool_mutex_released)
+ /* in: pointer to a variable that will
+ be assigned TRUE if buf_pool_mutex
+ was temporarily released, or NULL */
+{
+ buf_page_t* b = NULL;
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(block_mutex));
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+
+ if (!buf_page_can_relocate(bpage)) {
+
+ /* Do not free buffer-fixed or I/O-fixed blocks. */
+ return(BUF_LRU_NOT_FREED);
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+
+ if (zip || !bpage->zip.data) {
+ /* This would completely free the block. */
+ /* Do not completely free dirty blocks. */
+
+ if (bpage->oldest_modification) {
+ return(BUF_LRU_NOT_FREED);
+ }
+ } else if (bpage->oldest_modification) {
+ /* Do not completely free dirty blocks. */
+
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ ut_ad(buf_page_get_state(bpage)
+ == BUF_BLOCK_ZIP_DIRTY);
+ return(BUF_LRU_NOT_FREED);
+ }
+
+ goto alloc;
+ } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+ /* Allocate the control block for the compressed page.
+ If it cannot be allocated (without freeing a block
+ from the LRU list), refuse to free bpage.
*/ +alloc: + buf_pool_mutex_exit_forbid(); + b = buf_buddy_alloc(sizeof *b, NULL); + buf_pool_mutex_exit_allow(); + + if (UNIV_UNLIKELY(!b)) { + return(BUF_LRU_CANNOT_RELOCATE); + } + + memcpy(b, bpage, sizeof *b); + } + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, "Putting space %lu page %lu to free list\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif /* UNIV_DEBUG */ + + if (buf_LRU_block_remove_hashed_page(bpage, zip) + != BUF_BLOCK_ZIP_FREE) { + ut_a(bpage->buf_fix_count == 0); + + if (b) { + buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); + const ulint fold = buf_page_address_fold( + bpage->space, bpage->offset); + + ut_a(!buf_page_hash_get(bpage->space, bpage->offset)); + + b->state = b->oldest_modification + ? BUF_BLOCK_ZIP_DIRTY + : BUF_BLOCK_ZIP_PAGE; + UNIV_MEM_DESC(b->zip.data, + page_zip_get_size(&b->zip), b); + + /* The fields in_page_hash and in_LRU_list of + the to-be-freed block descriptor should have + been cleared in + buf_LRU_block_remove_hashed_page(), which + invokes buf_LRU_remove_block(). */ + ut_ad(!bpage->in_page_hash); + ut_ad(!bpage->in_LRU_list); + /* bpage->state was BUF_BLOCK_FILE_PAGE because + b != NULL. The type cast below is thus valid. */ + ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list); + + /* The fields of bpage were copied to b before + buf_LRU_block_remove_hashed_page() was invoked. */ + ut_ad(!b->in_zip_hash); + ut_ad(b->in_page_hash); + ut_ad(b->in_LRU_list); + + HASH_INSERT(buf_page_t, hash, + buf_pool->page_hash, fold, b); + + /* Insert b where bpage was in the LRU list. */ + if (UNIV_LIKELY(prev_b != NULL)) { + ulint lru_len; + + ut_ad(prev_b->in_LRU_list); + ut_ad(buf_page_in_file(prev_b)); + UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b); + + UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, + prev_b, b); + + if (buf_page_is_old(b)) { + buf_pool->LRU_old_len++; + if (UNIV_UNLIKELY + (buf_pool->LRU_old + == UT_LIST_GET_NEXT(LRU, b))) { + + buf_pool->LRU_old = b; + } +#ifdef UNIV_LRU_DEBUG + ut_a(prev_b->old + || !UT_LIST_GET_NEXT(LRU, b) + || UT_LIST_GET_NEXT(LRU, b)->old); + } else { + ut_a(!prev_b->old + || !UT_LIST_GET_NEXT(LRU, b) + || !UT_LIST_GET_NEXT(LRU, b)->old); +#endif /* UNIV_LRU_DEBUG */ + } + + lru_len = UT_LIST_GET_LEN(buf_pool->LRU); + + if (lru_len > BUF_LRU_OLD_MIN_LEN) { + ut_ad(buf_pool->LRU_old); + /* Adjust the length of the + old block list if necessary */ + buf_LRU_old_adjust_len(); + } else if (lru_len == BUF_LRU_OLD_MIN_LEN) { + /* The LRU list is now long + enough for LRU_old to become + defined: init it */ + buf_LRU_old_init(); + } + } else { + b->in_LRU_list = FALSE; + buf_LRU_add_block_low(b, buf_page_is_old(b)); + } + + if (b->state == BUF_BLOCK_ZIP_PAGE) { + buf_LRU_insert_zip_clean(b); + } else { + buf_page_t* prev; + + ut_ad(b->in_flush_list); + ut_d(bpage->in_flush_list = FALSE); + + prev = UT_LIST_GET_PREV(list, b); + UT_LIST_REMOVE(list, buf_pool->flush_list, b); + + if (prev) { + ut_ad(prev->in_flush_list); + UT_LIST_INSERT_AFTER( + list, + buf_pool->flush_list, + prev, b); + } else { + UT_LIST_ADD_FIRST( + list, + buf_pool->flush_list, + b); + } + } + + bpage->zip.data = NULL; + page_zip_set_size(&bpage->zip, 0); + + /* Prevent buf_page_get_gen() from + decompressing the block while we release + buf_pool_mutex and block_mutex. 
*/ + b->buf_fix_count++; + b->io_fix = BUF_IO_READ; + } + + if (buf_pool_mutex_released) { + *buf_pool_mutex_released = TRUE; + } + + buf_pool_mutex_exit(); + mutex_exit(block_mutex); + + /* Remove possible adaptive hash index on the page. + The page was declared uninitialized by + buf_LRU_block_remove_hashed_page(). We need to flag + the contents of the page valid (which it still is) in + order to avoid bogus Valgrind warnings.*/ + + UNIV_MEM_VALID(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + btr_search_drop_page_hash_index((buf_block_t*) bpage); + UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + + if (b) { + /* Compute and stamp the compressed page + checksum while not holding any mutex. The + block is already half-freed + (BUF_BLOCK_REMOVE_HASH) and removed from + buf_pool->page_hash, thus inaccessible by any + other thread. */ + + mach_write_to_4( + b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM, + UNIV_LIKELY(srv_use_checksums) + ? page_zip_calc_checksum( + b->zip.data, + page_zip_get_size(&b->zip)) + : BUF_NO_CHECKSUM_MAGIC); + } + + buf_pool_mutex_enter(); + mutex_enter(block_mutex); + + if (b) { + mutex_enter(&buf_pool_zip_mutex); + b->buf_fix_count--; + buf_page_set_io_fix(b, BUF_IO_NONE); + mutex_exit(&buf_pool_zip_mutex); + } + + buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + } else { + /* The block_mutex should have been released by + buf_LRU_block_remove_hashed_page() when it returns + BUF_BLOCK_ZIP_FREE. */ + ut_ad(block_mutex == &buf_pool_zip_mutex); + mutex_enter(block_mutex); + } + + return(BUF_LRU_FREED); +} + +/********************************************************************** +Puts a block back to the free list. */ +UNIV_INTERN +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block) /* in: block, must not contain a file page */ +{ + void* data; + + ut_ad(block); + ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&block->mutex)); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_READY_FOR_USE: + break; + default: + ut_error; + } + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->n_pointers == 0); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + ut_ad(!block->page.in_free_list); + ut_ad(!block->page.in_flush_list); + ut_ad(!block->page.in_LRU_list); + + buf_block_set_state(block, BUF_BLOCK_NOT_USED); + + UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); +#ifdef UNIV_DEBUG + /* Wipe contents of page to reveal possible stale pointers to it */ + memset(block->frame, '\0', UNIV_PAGE_SIZE); +#else + /* Wipe page_no and space_id */ + memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4); + memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4); +#endif + data = block->page.zip.data; + + if (data) { + block->page.zip.data = NULL; + mutex_exit(&block->mutex); + buf_pool_mutex_exit_forbid(); + buf_buddy_free(data, page_zip_get_size(&block->page.zip)); + buf_pool_mutex_exit_allow(); + mutex_enter(&block->mutex); + page_zip_set_size(&block->page.zip, 0); + } + + UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page)); + ut_d(block->page.in_free_list = TRUE); + + UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE); +} + +/********************************************************************** +Takes a block out of the LRU list and page hash table. +If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), +the object will be freed and buf_pool_zip_mutex will be released. 
+ +If a compressed page or a compressed-only block descriptor is freed, +other compressed pages or compressed-only block descriptors may be +relocated. */ +static +enum buf_page_state +buf_LRU_block_remove_hashed_page( +/*=============================*/ + /* out: the new state of the block + (BUF_BLOCK_ZIP_FREE if the state was + BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH + otherwise) */ + buf_page_t* bpage, /* in: block, must contain a file page and + be in a state where it can be freed; there + may or may not be a hash index to the page */ + ibool zip) /* in: TRUE if should remove also the + compressed page of an uncompressed page */ +{ + const buf_page_t* hashed_bpage; + ut_ad(bpage); + ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + ut_a(bpage->buf_fix_count == 0); + + UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage); + + buf_LRU_remove_block(bpage); + + buf_pool->freed_page_clock += 1; + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_FILE_PAGE: + UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t)); + UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + buf_block_modify_clock_inc((buf_block_t*) bpage); + if (bpage->zip.data) { + const page_t* page = ((buf_block_t*) bpage)->frame; + const ulint zip_size + = page_zip_get_size(&bpage->zip); + + ut_a(!zip || bpage->oldest_modification == 0); + + switch (UNIV_EXPECT(fil_page_get_type(page), + FIL_PAGE_INDEX)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + if (!zip) { + /* InnoDB writes the data to the + uncompressed page frame. Copy it + to the compressed page, which will + be preserved. 
*/ + memcpy(bpage->zip.data, page, + zip_size); + } + break; + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + case FIL_PAGE_INDEX: +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(&bpage->zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + break; + default: + ut_print_timestamp(stderr); + fputs(" InnoDB: ERROR: The compressed page" + " to be evicted seems corrupt:", stderr); + ut_print_buf(stderr, page, zip_size); + fputs("\nInnoDB: Possibly older version" + " of the page:", stderr); + ut_print_buf(stderr, bpage->zip.data, + zip_size); + putc('\n', stderr); + ut_error; + } + + break; + } + /* fall through */ + case BUF_BLOCK_ZIP_PAGE: + ut_a(bpage->oldest_modification == 0); + UNIV_MEM_ASSERT_W(bpage->zip.data, + page_zip_get_size(&bpage->zip)); + break; + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + } + + hashed_bpage = buf_page_hash_get(bpage->space, bpage->offset); + + if (UNIV_UNLIKELY(bpage != hashed_bpage)) { + fprintf(stderr, + "InnoDB: Error: page %lu %lu not found" + " in the hash table\n", + (ulong) bpage->space, + (ulong) bpage->offset); + if (hashed_bpage) { + fprintf(stderr, + "InnoDB: In hash table we find block" + " %p of %lu %lu which is not %p\n", + (const void*) hashed_bpage, + (ulong) hashed_bpage->space, + (ulong) hashed_bpage->offset, + (const void*) bpage); + } + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + mutex_exit(buf_page_get_mutex(bpage)); + buf_pool_mutex_exit(); + buf_print(); + buf_LRU_print(); + buf_validate(); + buf_LRU_validate(); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_error; + } + + ut_ad(!bpage->in_zip_hash); + ut_ad(bpage->in_page_hash); + ut_d(bpage->in_page_hash = FALSE); + HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, + buf_page_address_fold(bpage->space, bpage->offset), + bpage); + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_PAGE: + ut_ad(!bpage->in_free_list); + ut_ad(!bpage->in_flush_list); + ut_ad(!bpage->in_LRU_list); + ut_a(bpage->zip.data); + ut_a(buf_page_get_zip_size(bpage)); + + UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage); + + mutex_exit(&buf_pool_zip_mutex); + buf_pool_mutex_exit_forbid(); + buf_buddy_free(bpage->zip.data, + page_zip_get_size(&bpage->zip)); + buf_buddy_free(bpage, sizeof(*bpage)); + buf_pool_mutex_exit_allow(); + UNIV_MEM_UNDESC(bpage); + return(BUF_BLOCK_ZIP_FREE); + + case BUF_BLOCK_FILE_PAGE: + memset(((buf_block_t*) bpage)->frame + + FIL_PAGE_OFFSET, 0xff, 4); + memset(((buf_block_t*) bpage)->frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, + UNIV_PAGE_SIZE); + buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH); + + if (zip && bpage->zip.data) { + /* Free the compressed page. 
*/
+ void* data = bpage->zip.data;
+ bpage->zip.data = NULL;
+
+ mutex_exit(&((buf_block_t*) bpage)->mutex);
+ buf_pool_mutex_exit_forbid();
+ buf_buddy_free(data, page_zip_get_size(&bpage->zip));
+ buf_pool_mutex_exit_allow();
+ mutex_enter(&((buf_block_t*) bpage)->mutex);
+ page_zip_set_size(&bpage->zip, 0);
+ }
+
+ return(BUF_BLOCK_REMOVE_HASH);
+
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+ return(BUF_BLOCK_ZIP_FREE);
+}
+
+/**********************************************************************
+Puts a file page that has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+ buf_block_t* block) /* in: block, must contain a file page and
+ be in a state where it can be freed */
+{
+ ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ buf_LRU_block_free_non_file_page(block);
+}
+
+/************************************************************************
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void)
+/*=====================*/
+{
+ buf_LRU_stat_t* item;
+
+ /* If we haven't started eviction yet then don't update stats. */
+ if (buf_pool->freed_page_clock == 0) {
+ goto func_exit;
+ }
+
+ buf_pool_mutex_enter();
+
+ /* Update the index. */
+ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
+ buf_LRU_stat_arr_ind++;
+ buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
+
+ /* Add the current value and subtract the obsolete entry. */
+ buf_LRU_stat_sum.io += buf_LRU_stat_cur.io - item->io;
+ buf_LRU_stat_sum.unzip += buf_LRU_stat_cur.unzip - item->unzip;
+
+ /* Put current entry in the array. */
+ memcpy(item, &buf_LRU_stat_cur, sizeof *item);
+
+ buf_pool_mutex_exit();
+
+func_exit:
+ /* Clear the current entry. */
+ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**************************************************************************
+Validates the LRU list.
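+
+This is meant for debug builds only; the usual way to invoke it is
+from inside an assertion (sketch):
+
+ ut_a(buf_LRU_validate());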
*/ +UNIV_INTERN +ibool +buf_LRU_validate(void) +/*==================*/ +{ + buf_page_t* bpage; + buf_block_t* block; + ulint old_len; + ulint new_len; + ulint LRU_pos; + + ut_ad(buf_pool); + buf_pool_mutex_enter(); + + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { + + ut_a(buf_pool->LRU_old); + old_len = buf_pool->LRU_old_len; + new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); + ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); + ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); + } + + UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU); + + bpage = UT_LIST_GET_FIRST(buf_pool->LRU); + + old_len = 0; + + while (bpage != NULL) { + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + ut_error; + break; + case BUF_BLOCK_FILE_PAGE: + ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list + == buf_page_belongs_to_unzip_LRU(bpage)); + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + break; + } + + if (buf_page_is_old(bpage)) { + old_len++; + } + + if (buf_pool->LRU_old && (old_len == 1)) { + ut_a(buf_pool->LRU_old == bpage); + } + + LRU_pos = buf_page_get_LRU_position(bpage); + + bpage = UT_LIST_GET_NEXT(LRU, bpage); + + if (bpage) { + /* If the following assert fails, it may + not be an error: just the buf_pool clock + has wrapped around */ + ut_a(LRU_pos >= buf_page_get_LRU_position(bpage)); + } + } + + if (buf_pool->LRU_old) { + ut_a(buf_pool->LRU_old_len == old_len); + } + + UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free); + + for (bpage = UT_LIST_GET_FIRST(buf_pool->free); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED); + } + + UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU); + + for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU); + block; + block = UT_LIST_GET_NEXT(unzip_LRU, block)) { + + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); + } + + buf_pool_mutex_exit(); + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************** +Prints the LRU list. */ +UNIV_INTERN +void +buf_LRU_print(void) +/*===============*/ +{ + const buf_page_t* bpage; + + ut_ad(buf_pool); + buf_pool_mutex_enter(); + + fprintf(stderr, "Pool ulint clock %lu\n", + (ulong) buf_pool->ulint_clock); + + bpage = UT_LIST_GET_FIRST(buf_pool->LRU); + + while (bpage != NULL) { + + fprintf(stderr, "BLOCK space %lu page %lu ", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + + if (buf_page_is_old(bpage)) { + fputs("old ", stderr); + } + + if (bpage->buf_fix_count) { + fprintf(stderr, "buffix count %lu ", + (ulong) bpage->buf_fix_count); + } + + if (buf_page_get_io_fix(bpage)) { + fprintf(stderr, "io_fix %lu ", + (ulong) buf_page_get_io_fix(bpage)); + } + + if (bpage->oldest_modification) { + fputs("modif. 
", stderr); + } + + switch (buf_page_get_state(bpage)) { + const byte* frame; + case BUF_BLOCK_FILE_PAGE: + frame = buf_block_get_frame((buf_block_t*) bpage); + fprintf(stderr, "\nLRU pos %lu type %lu" + " index id %lu\n", + (ulong) buf_page_get_LRU_position(bpage), + (ulong) fil_page_get_type(frame), + (ulong) ut_dulint_get_low( + btr_page_get_index_id(frame))); + break; + case BUF_BLOCK_ZIP_PAGE: + frame = bpage->zip.data; + fprintf(stderr, "\nLRU pos %lu type %lu size %lu" + " index id %lu\n", + (ulong) buf_page_get_LRU_position(bpage), + (ulong) fil_page_get_type(frame), + (ulong) buf_page_get_zip_size(bpage), + (ulong) ut_dulint_get_low( + btr_page_get_index_id(frame))); + break; + + default: + fprintf(stderr, "\nLRU pos %lu !state %lu!\n", + (ulong) buf_page_get_LRU_position(bpage), + (ulong) buf_page_get_state(bpage)); + break; + } + + bpage = UT_LIST_GET_NEXT(LRU, bpage); + } + + buf_pool_mutex_exit(); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.c new file mode 100644 index 00000000000..149be7f3bdd --- /dev/null +++ b/storage/xtradb/buf/buf0rea.c @@ -0,0 +1,818 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0rea.h" + +#include "fil0fil.h" +#include "mtr0mtr.h" + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "ibuf0ibuf.h" +#include "log0recv.h" +#include "trx0sys.h" +#include "os0file.h" +#include "srv0start.h" + +extern ulint srv_read_ahead; +extern ulint srv_read_ahead_rnd; +extern ulint srv_read_ahead_seq; +extern ulint srv_buf_pool_reads; + +/* The size in blocks of the area where the random read-ahead algorithm counts +the accessed pages when deciding whether to read-ahead */ +#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA + +/* There must be at least this many pages in buf_pool in the area to start +a random read-ahead */ +#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + buf_read_ahead_random_area / 8) + +/* The linear read-ahead area size */ +#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA + +/* The linear read-ahead threshold */ +#define LINEAR_AREA_THRESHOLD_COEF 5 / 8 + +/* If there are buf_pool->curr_size per the number below pending reads, then +read-ahead is not done: this is to prevent flooding the buffer pool with +i/o-fixed buffer blocks */ +#define BUF_READ_AHEAD_PEND_LIMIT 2 + +/************************************************************************ +Low-level function which reads a page asynchronously from a file to the +buffer buf_pool 
if it is not already there, in which case it does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread. */
+static
+ulint
+buf_read_page_low(
+/*==============*/
+ /* out: 1 if a read request was queued, 0 if the page
+ already resided in buf_pool, or if the page is in
+ the doublewrite buffer blocks in which case it is never
+ read into the pool, or if the tablespace does not
+ exist or is being dropped */
+ ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+ trying to read from a non-existent tablespace, or a
+ tablespace which is just now being dropped */
+ ibool sync, /* in: TRUE if synchronous aio is desired */
+ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ...,
+ ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
+ at read-ahead functions) */
+ ulint space, /* in: space id */
+ ulint zip_size,/* in: compressed page size, or 0 */
+ ibool unzip, /* in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version, /* in: if the space memory object has
+ this timestamp different from what we are giving here,
+ treat the tablespace as dropped; this is a timestamp we
+ use to stop dangling page reads from a tablespace
+ which we have DISCARDed + IMPORTed back */
+ ulint offset) /* in: page number */
+{
+ buf_page_t* bpage;
+ ulint wake_later;
+
+ *err = DB_SUCCESS;
+
+ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+ mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
+
+ if (trx_doublewrite && space == TRX_SYS_SPACE
+ && ( (offset >= trx_doublewrite->block1
+ && offset < trx_doublewrite->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (offset >= trx_doublewrite->block2
+ && offset < trx_doublewrite->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: trying to read"
+ " doublewrite buffer page %lu\n",
+ (ulong) offset);
+
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* Trx sys header is so low in the latching order that we play
+ safe and do not leave the i/o-completion to an asynchronous
+ i/o-thread. Ibuf bitmap pages must always be read with
+ synchronous i/o, to make sure they do not get involved in
+ thread deadlocks.
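+
+For example (assuming an uncompressed 16 kB page size), the ibuf
+bitmap pages are page 1 of every 16384-page stretch of a tablespace,
+i.e. page numbers 1, 16385, 32769, ..., and the trx sys header is page
+TRX_SYS_PAGE_NO of the system tablespace; all of these force
+sync == TRUE below.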
*/
+
+ sync = TRUE;
+ }
+
+ /* The following call will also check if the tablespace does not exist
+ or is being dropped; if we succeed in initing the page in the buffer
+ pool for read, then DISCARD cannot proceed until the read has
+ completed */
+ bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
+ tablespace_version, offset);
+ if (bpage == NULL) {
+
+ return(0);
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Posting read request for page %lu, sync %lu\n",
+ (ulong) offset,
+ (ulong) sync);
+ }
+#endif
+
+ ut_ad(buf_page_in_file(bpage));
+
+ if (zip_size) {
+ *err = fil_io(OS_FILE_READ | wake_later,
+ sync, space, zip_size, offset, 0, zip_size,
+ bpage->zip.data, bpage);
+ } else {
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+
+ *err = fil_io(OS_FILE_READ | wake_later,
+ sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
+ ((buf_block_t*) bpage)->frame, bpage);
+ }
+ ut_a(*err == DB_SUCCESS);
+
+ if (sync) {
+ /* The i/o is already completed when we arrive from
+ fil_read */
+ buf_page_io_complete(bpage);
+ }
+
+ return(1);
+}
+
+/************************************************************************
+Applies a random read-ahead in buf_pool if there is at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o. */
+static
+ulint
+buf_read_ahead_random(
+/*==================*/
+ /* out: number of page read requests issued; NOTE
+ that if we read ibuf pages, it may happen that
+ the page at the given page number does not get
+ read even if we return a value > 0! */
+ ulint space, /* in: space id */
+ ulint zip_size,/* in: compressed page size in bytes, or 0 */
+ ulint offset) /* in: page number of a page which the current thread
+ wants to access */
+{
+ ib_int64_t tablespace_version;
+ ulint recent_blocks = 0;
+ ulint count;
+ ulint LRU_recent_limit;
+ ulint ibuf_mode;
+ ulint low, high;
+ ulint err;
+ ulint i;
+ ulint buf_read_ahead_random_area;
+
+ if (!(srv_read_ahead & 1)) {
+ return(0);
+ }
+
+ if (srv_startup_is_before_trx_rollback_phase) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* If it is an ibuf bitmap page or trx sys hdr, we do
+ no read-ahead, as that could break the ibuf page access
+ order */
+
+ return(0);
+ }
+
+ /* Remember the tablespace version before we ask the tablespace size
+ below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+ do not try to read outside the bounds of the tablespace!
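+
+The area boundaries computed next round offset down and up to the
+read-ahead area size. E.g. with buf_read_ahead_random_area == 64 and
+offset == 100 (arithmetic only, exactly as in the code below):
+
+ low  = (100 / 64) * 64     =  64;
+ high = (100 / 64 + 1) * 64 = 128;
+
+so pages 64..127 are the candidate area, clamped to the space size.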
*/ + + tablespace_version = fil_space_get_version(space); + + buf_read_ahead_random_area = BUF_READ_AHEAD_RANDOM_AREA; + + low = (offset / buf_read_ahead_random_area) + * buf_read_ahead_random_area; + high = (offset / buf_read_ahead_random_area + 1) + * buf_read_ahead_random_area; + if (high > fil_space_get_size(space)) { + + high = fil_space_get_size(space); + } + + /* Get the minimum LRU_position field value for an initial segment + of the LRU list, to determine which blocks have recently been added + to the start of the list. */ + + LRU_recent_limit = buf_LRU_get_recent_limit(); + + buf_pool_mutex_enter(); + + if (buf_pool->n_pend_reads + > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + buf_pool_mutex_exit(); + + return(0); + } + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + for (i = low; i < high; i++) { + const buf_page_t* bpage = buf_page_hash_get(space, i); + + if (bpage + && buf_page_is_accessed(bpage) + && (buf_page_get_LRU_position(bpage) > LRU_recent_limit)) { + + recent_blocks++; + + if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) { + + buf_pool_mutex_exit(); + goto read_ahead; + } + } + } + + buf_pool_mutex_exit(); + /* Do nothing */ + return(0); + +read_ahead: + /* Read all the suitable blocks within the area */ + + if (ibuf_inside()) { + ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; + } else { + ibuf_mode = BUF_READ_ANY_PAGE; + } + + count = 0; + + for (i = low; i < high; i++) { + /* It is only sensible to do read-ahead in the non-sync aio + mode: hence FALSE as the first parameter */ + + if (!ibuf_bitmap_page(zip_size, i)) { + count += buf_read_page_low( + &err, FALSE, + ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, + space, zip_size, FALSE, + tablespace_version, i); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: in random" + " readahead trying to access\n" + "InnoDB: tablespace %lu page %lu,\n" + "InnoDB: but the tablespace does not" + " exist or is just being dropped.\n", + (ulong) space, (ulong) i); + } + } + } + + /* In simulated aio we wake the aio handler threads only after + queuing all aio requests, in native aio the following call does + nothing: */ + + os_aio_simulated_wake_handler_threads(); + +#ifdef UNIV_DEBUG + if (buf_debug_prints && (count > 0)) { + fprintf(stderr, + "Random read-ahead space %lu offset %lu pages %lu\n", + (ulong) space, (ulong) offset, + (ulong) count); + } +#endif /* UNIV_DEBUG */ + + ++srv_read_ahead_rnd; + return(count); +} + +/************************************************************************ +High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. Does a random read-ahead if it seems +sensible. 
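+
+A sketch of a typical call site (the surrounding fetch logic is an
+assumption, not part of this file): on a buffer pool miss the fetch
+path issues
+
+ count = buf_read_page(space, zip_size, offset);
+
+and retries its page hash lookup afterwards; since the read itself is
+submitted with sync == TRUE, buf_page_io_complete() has already run
+when this function returns.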
*/
+UNIV_INTERN
+ulint
+buf_read_page(
+/*==========*/
+ /* out: number of page read requests issued: this can
+ be > 1 if read-ahead occurred */
+ ulint space, /* in: space id */
+ ulint zip_size,/* in: compressed page size in bytes, or 0 */
+ ulint offset) /* in: page number */
+{
+ ib_int64_t tablespace_version;
+ ulint count;
+ ulint count2;
+ ulint err;
+
+ tablespace_version = fil_space_get_version(space);
+
+ count = buf_read_ahead_random(space, zip_size, offset);
+
+ /* We do the i/o in the synchronous aio mode to save thread
+ switches: hence TRUE */
+
+ count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+ zip_size, FALSE,
+ tablespace_version, offset);
+ srv_buf_pool_reads+= count2;
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: trying to access"
+ " tablespace %lu page no. %lu,\n"
+ "InnoDB: but the tablespace does not exist"
+ " or is just being dropped.\n",
+ (ulong) space, (ulong) offset);
+ }
+
+ /* Flush pages from the end of the LRU list if necessary */
+ buf_flush_free_margin(FALSE);
+
+ /* Increment number of I/O operations used for LRU policy. */
+ buf_LRU_stat_inc_io();
+
+ return(count + count2);
+}
+
+/************************************************************************
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+buffer-fixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous i/o.
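+
+To make the border and failure rules concrete (numbers follow the code
+below): with buf_read_ahead_linear_area == 64, only offsets 64 and 127
+of the area 64..127 are border pages, and with
+LINEAR_AREA_THRESHOLD_COEF == 5 / 8 the read-ahead is abandoned as
+soon as more than 64 * 5 / 8 == 40 pages of the area fail the access
+checks.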
*/ +UNIV_INTERN +ulint +buf_read_ahead_linear( +/*==================*/ + /* out: number of page read requests issued */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint offset) /* in: page number of a page; NOTE: the current thread + must want access to this page (see NOTE 3 above) */ +{ + ib_int64_t tablespace_version; + buf_page_t* bpage; + buf_frame_t* frame; + buf_page_t* pred_bpage = NULL; + ulint pred_offset; + ulint succ_offset; + ulint count; + int asc_or_desc; + ulint new_offset; + ulint fail_count; + ulint ibuf_mode; + ulint low, high; + ulint err; + ulint i; + const ulint buf_read_ahead_linear_area + = BUF_READ_AHEAD_LINEAR_AREA; + + if (!(srv_read_ahead & 2)) { + return(0); + } + + if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) { + /* No read-ahead to avoid thread deadlocks */ + return(0); + } + + low = (offset / buf_read_ahead_linear_area) + * buf_read_ahead_linear_area; + high = (offset / buf_read_ahead_linear_area + 1) + * buf_read_ahead_linear_area; + + if ((offset != low) && (offset != high - 1)) { + /* This is not a border page of the area: return */ + + return(0); + } + + if (ibuf_bitmap_page(zip_size, offset) + || trx_sys_hdr_page(space, offset)) { + + /* If it is an ibuf bitmap page or trx sys hdr, we do + no read-ahead, as that could break the ibuf page access + order */ + + return(0); + } + + /* Remember the tablespace version before we ask the tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! */ + + tablespace_version = fil_space_get_version(space); + + buf_pool_mutex_enter(); + + if (high > fil_space_get_size(space)) { + buf_pool_mutex_exit(); + /* The area is not whole, return */ + + return(0); + } + + if (buf_pool->n_pend_reads + > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + buf_pool_mutex_exit(); + + return(0); + } + + /* Check that almost all pages in the area have been accessed; if + offset == low, the accesses must be in a descending order, otherwise, + in an ascending order. */ + + asc_or_desc = 1; + + if (offset == low) { + asc_or_desc = -1; + } + + fail_count = 0; + + for (i = low; i < high; i++) { + bpage = buf_page_hash_get(space, i); + + if ((bpage == NULL) || !buf_page_is_accessed(bpage)) { + /* Not accessed */ + fail_count++; + + } else if (pred_bpage + && (ut_ulint_cmp( + buf_page_get_LRU_position(bpage), + buf_page_get_LRU_position(pred_bpage)) + != asc_or_desc)) { + /* Accesses not in the right order */ + + fail_count++; + pred_bpage = bpage; + } + } + + if (fail_count > buf_read_ahead_linear_area + * LINEAR_AREA_THRESHOLD_COEF) { + /* Too many failures: return */ + + buf_pool_mutex_exit(); + + return(0); + } + + /* If we got this far, we know that enough pages in the area have + been accessed in the right order: linear read-ahead can be sensible */ + + bpage = buf_page_hash_get(space, offset); + + if (bpage == NULL) { + buf_pool_mutex_exit(); + + return(0); + } + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_PAGE: + frame = bpage->zip.data; + break; + case BUF_BLOCK_FILE_PAGE: + frame = ((buf_block_t*) bpage)->frame; + break; + default: + ut_error; + break; + } + + /* Read the natural predecessor and successor page addresses from + the page; NOTE that because the calling thread may have an x-latch + on the page, we do not acquire an s-latch on the page, this is to + prevent deadlocks. Even if we read values which are nonsense, the + algorithm will work.
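The access-order check above walks the area once, counting pages that were never accessed or whose LRU positions break the expected monotone order (ascending when the trigger page is the high border, descending when it is the low border). A simplified sketch over an array of fake LRU positions; unlike the loop above, which advances pred_bpage only on a failed comparison, this version advances the predecessor on every accessed page:

    #include <stddef.h>
    #include <stdio.h>

    /* Three-way compare, like InnoDB's ut_ulint_cmp(). */
    static int cmp_ulong(unsigned long a, unsigned long b)
    {
        return a < b ? -1 : a > b ? 1 : 0;
    }

    /* Count order violations; asc_or_desc is 1 for ascending accesses,
       -1 for descending. A position of 0 marks a never-accessed page. */
    static size_t count_failures(const unsigned long *pos, size_t n,
                                 int asc_or_desc)
    {
        const unsigned long *pred = NULL;
        size_t fails = 0;
        size_t i;

        for (i = 0; i < n; i++) {
            if (pos[i] == 0) {
                fails++; /* not accessed */
            } else {
                if (pred && cmp_ulong(pos[i], *pred) != asc_or_desc) {
                    fails++; /* accesses not in the right order */
                }
                pred = &pos[i];
            }
        }
        return fails;
    }

    int main(void)
    {
        /* Mostly ascending accesses with one gap and one inversion. */
        unsigned long pos[] = { 10, 20, 30, 0, 25, 60, 70, 80 };

        printf("failures: %zu\n",
               count_failures(pos, sizeof(pos) / sizeof(pos[0]), 1));
        return 0;
    }

This reports 2 failures out of 8 pages; whether that aborts read-ahead depends on the threshold coefficient applied above.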
*/ + + pred_offset = fil_page_get_prev(frame); + succ_offset = fil_page_get_next(frame); + + buf_pool_mutex_exit(); + + if ((offset == low) && (succ_offset == offset + 1)) { + + /* This is ok, we can continue */ + new_offset = pred_offset; + + } else if ((offset == high - 1) && (pred_offset == offset - 1)) { + + /* This is ok, we can continue */ + new_offset = succ_offset; + } else { + /* Successor or predecessor not in the right order */ + + return(0); + } + + low = (new_offset / buf_read_ahead_linear_area) + * buf_read_ahead_linear_area; + high = (new_offset / buf_read_ahead_linear_area + 1) + * buf_read_ahead_linear_area; + + if ((new_offset != low) && (new_offset != high - 1)) { + /* This is not a border page of the area: return */ + + return(0); + } + + if (high > fil_space_get_size(space)) { + /* The area is not whole, return */ + + return(0); + } + + /* If we got this far, read-ahead can be sensible: do it */ + + if (ibuf_inside()) { + ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; + } else { + ibuf_mode = BUF_READ_ANY_PAGE; + } + + count = 0; + + /* Since Windows XP seems to schedule the i/o handler thread + very eagerly, and consequently it does not wait for the + full read batch to be posted, we use special heuristics here */ + + os_aio_simulated_put_read_threads_to_sleep(); + + for (i = low; i < high; i++) { + /* It is only sensible to do read-ahead in the non-sync + aio mode: hence FALSE as the first parameter */ + + if (!ibuf_bitmap_page(zip_size, i)) { + count += buf_read_page_low( + &err, FALSE, + ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, + space, zip_size, FALSE, tablespace_version, i); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: in" + " linear readahead trying to access\n" + "InnoDB: tablespace %lu page %lu,\n" + "InnoDB: but the tablespace does not" + " exist or is just being dropped.\n", + (ulong) space, (ulong) i); + } + } + } + + /* In simulated aio we wake the aio handler threads only after + queuing all aio requests, in native aio the following call does + nothing: */ + + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(FALSE); + +#ifdef UNIV_DEBUG + if (buf_debug_prints && (count > 0)) { + fprintf(stderr, + "LINEAR read-ahead space %lu offset %lu pages %lu\n", + (ulong) space, (ulong) offset, (ulong) count); + } +#endif /* UNIV_DEBUG */ + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + ++srv_read_ahead_seq; + return(count); +} + +/************************************************************************ +Issues read requests for pages which the ibuf module wants to read in, in +order to contract the insert buffer tree. Technically, this function is like +a read-ahead function. 
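Both read-ahead loops above queue their page reads with OS_AIO_SIMULATED_WAKE_LATER and then call os_aio_simulated_wake_handler_threads() once, so the simulated aio handler is signalled per batch rather than per page. A toy illustration of that idiom (the names here are invented stand-ins, not the patch's API):

    #include <stdio.h>

    #define WAKE_LATER 1 /* hypothetical flag: queue without signalling */

    static unsigned n_queued = 0;

    static void submit_read(unsigned long page_no, int mode)
    {
        n_queued++;
        if (!(mode & WAKE_LATER)) {
            printf("wake handler immediately for page %lu\n", page_no);
        }
    }

    static void wake_handler_threads(void)
    {
        printf("one wake-up for %u queued reads\n", n_queued);
        n_queued = 0;
    }

    int main(void)
    {
        unsigned long i;

        for (i = 192; i < 256; i++) {
            submit_read(i, WAKE_LATER); /* queue quietly */
        }
        wake_handler_threads(); /* single signal for the whole batch */
        return 0;
    }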
*/ +UNIV_INTERN +void +buf_read_ibuf_merge_pages( +/*======================*/ + ibool sync, /* in: TRUE if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + const ulint* space_ids, /* in: array of space ids */ + const ib_int64_t* space_versions,/* in: the spaces must have + this version number + (timestamp), otherwise we + discard the read; we use this + to cancel reads if DISCARD + + IMPORT may have changed the + tablespace size */ + const ulint* page_nos, /* in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored) /* in: number of elements + in the arrays */ +{ + ulint i; + + ut_ad(!ibuf_inside()); +#ifdef UNIV_IBUF_DEBUG + ut_a(n_stored < UNIV_PAGE_SIZE); +#endif + while (buf_pool->n_pend_reads + > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { + os_thread_sleep(500000); + } + + for (i = 0; i < n_stored; i++) { + ulint zip_size = fil_space_get_zip_size(space_ids[i]); + ulint err; + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + + goto tablespace_deleted; + } + + buf_read_page_low(&err, sync && (i + 1 == n_stored), + BUF_READ_ANY_PAGE, space_ids[i], + zip_size, TRUE, space_versions[i], + page_nos[i]); + + if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) { +tablespace_deleted: + /* We have deleted or are deleting the single-table + tablespace: remove the entries for that page */ + + ibuf_merge_or_delete_for_page(NULL, space_ids[i], + page_nos[i], + zip_size, FALSE); + } + } + + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(FALSE); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Ibuf merge read-ahead space %lu pages %lu\n", + (ulong) space_ids[0], (ulong) n_stored); + } +#endif /* UNIV_DEBUG */ +} + +/************************************************************************ +Issues read requests for pages which recovery wants to read in. 
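Like the read-ahead functions, buf_read_ibuf_merge_pages above refuses to add work while too many reads are pending, comparing buf_pool->n_pend_reads against buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT (sleeping in a loop rather than returning). The predicate itself, sketched with a hypothetical divisor:

    #include <stdio.h>

    #define PEND_LIMIT 2 /* hypothetical stand-in for BUF_READ_AHEAD_PEND_LIMIT */

    /* Returns 1 when new read-ahead work should be refused or delayed. */
    static int too_many_pending(unsigned long n_pend_reads,
                                unsigned long pool_curr_size)
    {
        return n_pend_reads > pool_curr_size / PEND_LIMIT;
    }

    int main(void)
    {
        printf("%d\n", too_many_pending(10, 100)); /* 0: headroom left */
        printf("%d\n", too_many_pending(60, 100)); /* 1: throttle */
        return 0;
    }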
*/ +UNIV_INTERN +void +buf_read_recv_pages( +/*================*/ + ibool sync, /* in: TRUE if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in + bytes, or 0 */ + const ulint* page_nos, /* in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored) /* in: number of page numbers + in the array */ +{ + ib_int64_t tablespace_version; + ulint count; + ulint err; + ulint i; + + zip_size = fil_space_get_zip_size(space); + tablespace_version = fil_space_get_version(space); + + for (i = 0; i < n_stored; i++) { + + count = 0; + + os_aio_print_debug = FALSE; + + while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { + + os_aio_simulated_wake_handler_threads(); + os_thread_sleep(500000); + + count++; + + if (count > 100) { + fprintf(stderr, + "InnoDB: Error: InnoDB has waited for" + " 50 seconds for pending\n" + "InnoDB: reads to the buffer pool to" + " be finished.\n" + "InnoDB: Number of pending reads %lu," + " pending pread calls %lu\n", + (ulong) buf_pool->n_pend_reads, + (ulong)os_file_n_pending_preads); + + os_aio_print_debug = TRUE; + } + } + + os_aio_print_debug = FALSE; + + if ((i + 1 == n_stored) && sync) { + buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, + zip_size, TRUE, tablespace_version, + page_nos[i]); + } else { + buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE + | OS_AIO_SIMULATED_WAKE_LATER, + space, zip_size, TRUE, + tablespace_version, page_nos[i]); + } + } + + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ + buf_flush_free_margin(FALSE); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Recovery applies read-ahead pages %lu\n", + (ulong) n_stored); + } +#endif /* UNIV_DEBUG */ +} diff --git a/storage/xtradb/data/data0data.c b/storage/xtradb/data/data0data.c new file mode 100644 index 00000000000..1cb3803b187 --- /dev/null +++ b/storage/xtradb/data/data0data.c @@ -0,0 +1,758 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "data0data.h" + +#ifdef UNIV_NONINL +#include "data0data.ic" +#endif + +#include "rem0rec.h" +#include "rem0cmp.h" +#include "page0page.h" +#include "page0zip.h" +#include "dict0dict.h" +#include "btr0cur.h" + +#include <ctype.h> + +#ifdef UNIV_DEBUG +/* data pointers of tuple fields are initialized to point here +for error checking */ +UNIV_INTERN byte data_error; + +# ifndef UNIV_DEBUG_VALGRIND +/* this is used to fool the compiler in dtuple_validate */ +UNIV_INTERN ulint data_dummy; +# endif /* !UNIV_DEBUG_VALGRIND */ +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Tests if dfield data length and content is equal to the given. */ +UNIV_INTERN +ibool +dfield_data_is_binary_equal( +/*========================*/ + /* out: TRUE if equal */ + const dfield_t* field, /* in: field */ + ulint len, /* in: data length or UNIV_SQL_NULL */ + const byte* data) /* in: data */ +{ + if (len != dfield_get_len(field)) { + + return(FALSE); + } + + if (len == UNIV_SQL_NULL) { + + return(TRUE); + } + + if (0 != memcmp(dfield_get_data(field), data, len)) { + + return(FALSE); + } + + return(TRUE); +} + +/**************************************************************** +Compare two data tuples, respecting the collation of character fields. */ +UNIV_INTERN +int +dtuple_coll_cmp( +/*============*/ + /* out: 1, 0, -1 if tuple1 is greater, equal, + less, respectively, than tuple2 */ + const dtuple_t* tuple1, /* in: tuple 1 */ + const dtuple_t* tuple2) /* in: tuple 2 */ +{ + ulint n_fields; + ulint i; + + ut_ad(tuple1 && tuple2); + ut_ad(tuple1->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(tuple2->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(tuple1)); + ut_ad(dtuple_check_typed(tuple2)); + + n_fields = dtuple_get_n_fields(tuple1); + + if (n_fields != dtuple_get_n_fields(tuple2)) { + + return(n_fields < dtuple_get_n_fields(tuple2) ? -1 : 1); + } + + for (i = 0; i < n_fields; i++) { + int cmp; + const dfield_t* field1 = dtuple_get_nth_field(tuple1, i); + const dfield_t* field2 = dtuple_get_nth_field(tuple2, i); + + cmp = cmp_dfield_dfield(field1, field2); + + if (cmp) { + return(cmp); + } + } + + return(0); +} + +/************************************************************************* +Sets number of fields used in a tuple. Normally this is set in +dtuple_create, but if you want later to set it smaller, you can use this. */ +UNIV_INTERN +void +dtuple_set_n_fields( +/*================*/ + dtuple_t* tuple, /* in: tuple */ + ulint n_fields) /* in: number of fields */ +{ + ut_ad(tuple); + + tuple->n_fields = n_fields; + tuple->n_fields_cmp = n_fields; +} + +/************************************************************** +Checks that a data field is typed.
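dtuple_coll_cmp above returns a three-way result: a tuple with fewer fields orders first, and otherwise the first unequal field decides. The same shape over plain int arrays standing in for dtuples:

    #include <stddef.h>
    #include <stdio.h>

    /* Three-way tuple compare mirroring dtuple_coll_cmp(): fewer fields
       sorts first; otherwise the first differing field decides. */
    static int tuple_cmp(const int *a, size_t na, const int *b, size_t nb)
    {
        size_t i;

        if (na != nb) {
            return na < nb ? -1 : 1;
        }
        for (i = 0; i < na; i++) {
            if (a[i] != b[i]) {
                return a[i] < b[i] ? -1 : 1;
            }
        }
        return 0;
    }

    int main(void)
    {
        int t1[] = { 1, 2, 3 };
        int t2[] = { 1, 2, 4 };
        int t3[] = { 1, 2 };

        printf("%d %d %d\n",
               tuple_cmp(t1, 3, t2, 3),  /* -1: third field decides */
               tuple_cmp(t1, 3, t1, 3),  /*  0: equal */
               tuple_cmp(t1, 3, t3, 2)); /*  1: t3 has fewer fields */
        return 0;
    }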
*/ +static +ibool +dfield_check_typed_no_assert( +/*=========================*/ + /* out: TRUE if ok */ + const dfield_t* field) /* in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MYSQL + || dfield_get_type(field)->mtype < DATA_VARCHAR) { + + fprintf(stderr, + "InnoDB: Error: data field type %lu, len %lu\n", + (ulong) dfield_get_type(field)->mtype, + (ulong) dfield_get_len(field)); + return(FALSE); + } + + return(TRUE); +} + +/************************************************************** +Checks that a data tuple is typed. */ +UNIV_INTERN +ibool +dtuple_check_typed_no_assert( +/*=========================*/ + /* out: TRUE if ok */ + const dtuple_t* tuple) /* in: tuple */ +{ + const dfield_t* field; + ulint i; + + if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) { + fprintf(stderr, + "InnoDB: Error: index entry has %lu fields\n", + (ulong) dtuple_get_n_fields(tuple)); +dump: + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, tuple); + putc('\n', stderr); + + return(FALSE); + } + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + if (!dfield_check_typed_no_assert(field)) { + goto dump; + } + } + + return(TRUE); +} + +/************************************************************** +Checks that a data field is typed. Asserts an error if not. */ +UNIV_INTERN +ibool +dfield_check_typed( +/*===============*/ + /* out: TRUE if ok */ + const dfield_t* field) /* in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MYSQL + || dfield_get_type(field)->mtype < DATA_VARCHAR) { + + fprintf(stderr, + "InnoDB: Error: data field type %lu, len %lu\n", + (ulong) dfield_get_type(field)->mtype, + (ulong) dfield_get_len(field)); + + ut_error; + } + + return(TRUE); +} + +/************************************************************** +Checks that a data tuple is typed. Asserts an error if not. */ +UNIV_INTERN +ibool +dtuple_check_typed( +/*===============*/ + /* out: TRUE if ok */ + const dtuple_t* tuple) /* in: tuple */ +{ + const dfield_t* field; + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + ut_a(dfield_check_typed(field)); + } + + return(TRUE); +} + +#ifdef UNIV_DEBUG +/************************************************************** +Validates the consistency of a tuple which must be complete, i.e, +all fields must have been set. */ +UNIV_INTERN +ibool +dtuple_validate( +/*============*/ + /* out: TRUE if ok */ + const dtuple_t* tuple) /* in: tuple */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = dtuple_get_n_fields(tuple); + + /* We dereference all the data of each field to test + for memory traps */ + + for (i = 0; i < n_fields; i++) { + + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (!dfield_is_null(field)) { + + const byte* data = dfield_get_data(field); +#ifndef UNIV_DEBUG_VALGRIND + ulint j; + + for (j = 0; j < len; j++) { + + data_dummy += *data; /* fool the compiler not + to optimize out this + code */ + data++; + } +#endif /* !UNIV_DEBUG_VALGRIND */ + + UNIV_MEM_ASSERT_RW(data, len); + } + } + + ut_a(dtuple_check_typed(tuple)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/***************************************************************** +Pretty prints a dfield value according to its data type. 
*/ +UNIV_INTERN +void +dfield_print( +/*=========*/ + const dfield_t* dfield) /* in: dfield */ +{ + const byte* data; + ulint len; + ulint i; + + len = dfield_get_len(dfield); + data = dfield_get_data(dfield); + + if (dfield_is_null(dfield)) { + fputs("NULL", stderr); + + return; + } + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + case DATA_CHAR: + case DATA_VARCHAR: + for (i = 0; i < len; i++) { + int c = *data++; + putc(isprint(c) ? c : ' ', stderr); + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + break; + case DATA_INT: + ut_a(len == 4); /* only works for 32-bit integers */ + fprintf(stderr, "%d", (int)mach_read_from_4(data)); + break; + default: + ut_error; + } +} + +/***************************************************************** +Pretty prints a dfield value according to its data type. Also the hex string +is printed if a string contains non-printable characters. */ +UNIV_INTERN +void +dfield_print_also_hex( +/*==================*/ + const dfield_t* dfield) /* in: dfield */ +{ + const byte* data; + ulint len; + ulint prtype; + ulint i; + ibool print_also_hex; + + len = dfield_get_len(dfield); + data = dfield_get_data(dfield); + + if (dfield_is_null(dfield)) { + fputs("NULL", stderr); + + return; + } + + prtype = dtype_get_prtype(dfield_get_type(dfield)); + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + dulint id; + case DATA_INT: + switch (len) { + ulint val; + case 1: + val = mach_read_from_1(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x80; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 2: + val = mach_read_from_2(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x8000; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 3: + val = mach_read_from_3(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x800000; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 4: + val = mach_read_from_4(data); + + if (!(prtype & DATA_UNSIGNED)) { + val &= ~0x80000000; + fprintf(stderr, "%ld", (long) val); + } else { + fprintf(stderr, "%lu", (ulong) val); + } + break; + + case 6: + id = mach_read_from_6(data); + fprintf(stderr, "{%lu %lu}", + ut_dulint_get_high(id), + ut_dulint_get_low(id)); + break; + + case 7: + id = mach_read_from_7(data); + fprintf(stderr, "{%lu %lu}", + ut_dulint_get_high(id), + ut_dulint_get_low(id)); + break; + case 8: + id = mach_read_from_8(data); + fprintf(stderr, "{%lu %lu}", + ut_dulint_get_high(id), + ut_dulint_get_low(id)); + break; + default: + goto print_hex; + } + break; + + case DATA_SYS: + switch (prtype & DATA_SYS_PRTYPE_MASK) { + case DATA_TRX_ID: + id = mach_read_from_6(data); + + fprintf(stderr, "trx_id " TRX_ID_FMT, + TRX_ID_PREP_PRINTF(id)); + break; + + case DATA_ROLL_PTR: + id = mach_read_from_7(data); + + fprintf(stderr, "roll_ptr {%lu %lu}", + ut_dulint_get_high(id), ut_dulint_get_low(id)); + break; + + case DATA_ROW_ID: + id = mach_read_from_6(data); + + fprintf(stderr, "row_id {%lu %lu}", + ut_dulint_get_high(id), ut_dulint_get_low(id)); + break; + + default: + id = mach_dulint_read_compressed(data); + + fprintf(stderr, "mix_id {%lu %lu}", + ut_dulint_get_high(id), ut_dulint_get_low(id)); + } + break; + + case DATA_CHAR: + case DATA_VARCHAR: + print_also_hex = FALSE; + + for (i = 0; i < len; i++) { + int c = *data++; + + if (!isprint(c)) { + print_also_hex = TRUE; + + fprintf(stderr, "\\x%02x", (unsigned char) 
c); + } else { + putc(c, stderr); + } + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + + if (!print_also_hex) { + break; + } + + data = dfield_get_data(dfield); + /* fall through */ + + case DATA_BINARY: + default: +print_hex: + fputs(" Hex: ",stderr); + + for (i = 0; i < len; i++) { + fprintf(stderr, "%02lx", (ulint) *data++); + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + } +} + +/***************************************************************** +Print a dfield value using ut_print_buf. */ +static +void +dfield_print_raw( +/*=============*/ + FILE* f, /* in: output stream */ + const dfield_t* dfield) /* in: dfield */ +{ + ulint len = dfield_get_len(dfield); + if (!dfield_is_null(dfield)) { + ulint print_len = ut_min(len, 1000); + ut_print_buf(f, dfield_get_data(dfield), print_len); + if (len != print_len) { + fprintf(f, "(total %lu bytes%s)", + (ulong) len, + dfield_is_ext(dfield) ? ", external" : ""); + } + } else { + fputs(" SQL NULL", f); + } +} + +/************************************************************** +The following function prints the contents of a tuple. */ +UNIV_INTERN +void +dtuple_print( +/*=========*/ + FILE* f, /* in: output stream */ + const dtuple_t* tuple) /* in: tuple */ +{ + ulint n_fields; + ulint i; + + n_fields = dtuple_get_n_fields(tuple); + + fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields); + + for (i = 0; i < n_fields; i++) { + fprintf(f, " %lu:", (ulong) i); + + dfield_print_raw(f, dtuple_get_nth_field(tuple, i)); + + putc(';', f); + putc('\n', f); + } + + ut_ad(dtuple_validate(tuple)); +} + +/****************************************************************** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. 
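The DATA_INT cases of dfield_print_also_hex above strip the high bit before printing signed values because InnoDB stores signed integers with the sign bit inverted, which makes the stored bytes compare correctly as unsigned. A one-byte round trip of that convention (a sketch of the storage format, not of the printing code):

    #include <stdio.h>

    static unsigned char encode_i8(signed char x)
    {
        return (unsigned char) x ^ 0x80; /* flip the sign bit for storage */
    }

    static signed char decode_i8(unsigned char b)
    {
        return (signed char) (b ^ 0x80); /* flip it back when reading */
    }

    int main(void)
    {
        signed char vals[] = { -128, -1, 0, 1, 127 };
        int i;

        for (i = 0; i < 5; i++) {
            unsigned char stored = encode_i8(vals[i]);
            printf("%4d stored as 0x%02x, decoded %4d\n",
                   vals[i], stored, decode_i8(stored));
        }
        return 0;
    }

Here -128 encodes as 0x00 and 127 as 0xFF, so a byte-wise unsigned comparison of the stored form preserves the numeric order.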
*/ +UNIV_INTERN +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + /* out, own: created big record vector, + NULL if we are not able to shorten + the entry enough, i.e., if there are + too many fixed-length or short fields + in entry or the index is clustered */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in/out: index entry */ + ulint* n_ext) /* in/out: number of + externally stored columns */ +{ + mem_heap_t* heap; + big_rec_t* vector; + dfield_t* dfield; + dict_field_t* ifield; + ulint size; + ulint n_fields; + ulint local_len; + ulint local_prefix_len; + + if (UNIV_UNLIKELY(!dict_index_is_clust(index))) { + return(NULL); + } + + if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) { + /* up to MySQL 5.1: store a 768-byte prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE + DICT_MAX_INDEX_COL_LEN; + } else { + /* new-format table: do not store any BLOB prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE; + } + + ut_a(dtuple_check_typed_no_assert(entry)); + + size = rec_get_converted_size(index, entry, *n_ext); + + if (UNIV_UNLIKELY(size > 1000000000)) { + fprintf(stderr, + "InnoDB: Warning: tuple size very big: %lu\n", + (ulong) size); + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, entry); + putc('\n', stderr); + } + + heap = mem_heap_create(size + dtuple_get_n_fields(entry) + * sizeof(big_rec_field_t) + 1000); + + vector = mem_heap_alloc(heap, sizeof(big_rec_t)); + + vector->heap = heap; + vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry) + * sizeof(big_rec_field_t)); + + /* Decide which fields to shorten: the algorithm is to look for + a variable-length field that yields the biggest savings when + stored externally */ + + n_fields = 0; + + while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, + *n_ext), + dict_table_is_comp(index->table), + dict_index_get_n_fields(index), + dict_table_zip_size(index->table))) { + ulint i; + ulint longest = 0; + ulint longest_i = ULINT_MAX; + byte* data; + big_rec_field_t* b; + + for (i = dict_index_get_n_unique_in_tree(index); + i < dtuple_get_n_fields(entry); i++) { + ulint savings; + + dfield = dtuple_get_nth_field(entry, i); + ifield = dict_index_get_nth_field(index, i); + + /* Skip fixed-length, NULL, externally stored, + or short columns */ + + if (ifield->fixed_len + || dfield_is_null(dfield) + || dfield_is_ext(dfield) + || dfield_get_len(dfield) <= local_len + || dfield_get_len(dfield) + <= BTR_EXTERN_FIELD_REF_SIZE * 2) { + goto skip_field; + } + + savings = dfield_get_len(dfield) - local_len; + + /* Check that there would be savings */ + if (longest >= savings) { + goto skip_field; + } + + longest_i = i; + longest = savings; + +skip_field: + continue; + } + + if (!longest) { + /* Cannot shorten more */ + + mem_heap_free(heap); + + return(NULL); + } + + /* Move data from field longest_i to big rec vector. + + We store the first bytes locally to the record. Then + we can calculate all ordering fields in all indexes + from locally stored data. */ + + dfield = dtuple_get_nth_field(entry, longest_i); + ifield = dict_index_get_nth_field(index, longest_i); + local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE; + + b = &vector->fields[n_fields]; + b->field_no = longest_i; + b->len = dfield_get_len(dfield) - local_prefix_len; + b->data = (char*) dfield_get_data(dfield) + local_prefix_len; + + /* Allocate the locally stored part of the column. */ + data = mem_heap_alloc(heap, local_len); + + /* Copy the local prefix. 
*/ + memcpy(data, dfield_get_data(dfield), local_prefix_len); + /* Clear the extern field reference (BLOB pointer). */ + memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE); +#if 0 + /* The following would fail the Valgrind checks in + page_cur_insert_rec_low() and page_cur_insert_rec_zip(). + The BLOB pointers in the record will be initialized after + the record and the BLOBs have been written. */ + UNIV_MEM_ALLOC(data + local_prefix_len, + BTR_EXTERN_FIELD_REF_SIZE); +#endif + + dfield_set_data(dfield, data, local_len); + dfield_set_ext(dfield); + + n_fields++; + (*n_ext)++; + ut_ad(n_fields < dtuple_get_n_fields(entry)); + } + + vector->n_fields = n_fields; + return(vector); +} + +/****************************************************************** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. */ +UNIV_INTERN +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index __attribute__((unused)), /* in: index */ + dtuple_t* entry, /* in: entry whose data was put to vector */ + big_rec_t* vector) /* in, own: big rec vector; it is + freed in this function */ +{ + big_rec_field_t* b = vector->fields; + const big_rec_field_t* const end = b + vector->n_fields; + + for (; b < end; b++) { + dfield_t* dfield; + ulint local_len; + + dfield = dtuple_get_nth_field(entry, b->field_no); + local_len = dfield_get_len(dfield); + + ut_ad(dfield_is_ext(dfield)); + ut_ad(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + ut_ad(local_len <= DICT_MAX_INDEX_COL_LEN); + + dfield_set_data(dfield, + (char*) b->data - local_len, + b->len + local_len); + } + + mem_heap_free(vector->heap); +} diff --git a/storage/xtradb/data/data0type.c b/storage/xtradb/data/data0type.c new file mode 100644 index 00000000000..5df933ef9fd --- /dev/null +++ b/storage/xtradb/data/data0type.c @@ -0,0 +1,300 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" + +#ifdef UNIV_NONINL +#include "data0type.ic" +#endif + +/********************************************************************** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. + +NOTE: the prototype of this function is copied from ha_innodb.cc! 
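dtuple_convert_big_rec above keeps a local prefix of each moved column plus a 20-byte external reference (BTR_EXTERN_FIELD_REF_SIZE), and dtuple_convert_back_big_rec splices the moved tail back by walking the data pointer back over the prefix. The length bookkeeping, sketched with the pre-5.1-format 768-byte prefix and a hypothetical column length:

    #include <stdio.h>

    #define REF_SIZE 20  /* BTR_EXTERN_FIELD_REF_SIZE */
    #define PREFIX   768 /* DICT_MAX_INDEX_COL_LEN, old-format local prefix */

    int main(void)
    {
        unsigned long col_len    = 10000;             /* hypothetical column */
        unsigned long local_len  = PREFIX + REF_SIZE; /* kept in the record */
        unsigned long prefix_len = local_len - REF_SIZE;
        unsigned long extern_len = col_len - prefix_len;

        printf("local: %lu-byte prefix + %d-byte BLOB pointer\n",
               prefix_len, REF_SIZE);
        printf("moved to the big-rec vector: %lu bytes\n", extern_len);

        /* The back-conversion restores the full length, as in
           dtuple_convert_back_big_rec(): data - prefix_len,
           len = extern_len + prefix_len. */
        printf("restored column length: %lu\n", extern_len + prefix_len);
        return 0;
    }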
If you change +this function, you MUST change also the prototype here! */ +UNIV_INTERN +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + /* out: number of bytes occupied by the first + n characters */ + ulint charset_id, /* in: character set id */ + ulint prefix_len, /* in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /* in: length of the string in bytes */ + const char* str); /* in: character string */ + +/* At the database startup we store the default-charset collation number of +this MySQL installation to this global variable. If we have < 4.1.2 format +column definitions, or records in the insert buffer, we use this +charset-collation code for them. */ + +UNIV_INTERN ulint data_mysql_default_charset_coll; + +/************************************************************************* +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. */ +UNIV_INTERN +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + /* out: length of the prefix, + in bytes */ + ulint prtype, /* in: precise type */ + ulint mbminlen, /* in: minimum length of a + multi-byte character */ + ulint mbmaxlen, /* in: maximum length of a + multi-byte character */ + ulint prefix_len, /* in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /* in: length of str (in bytes) */ + const char* str) /* in: the string whose prefix + length is being determined */ +{ +#ifndef UNIV_HOTBACKUP + ut_a(data_len != UNIV_SQL_NULL); + ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen)); + + if (mbminlen != mbmaxlen) { + ut_a(!(prefix_len % mbmaxlen)); + return(innobase_get_at_most_n_mbchars( + dtype_get_charset_coll(prtype), + prefix_len, data_len, str)); + } + + if (prefix_len < data_len) { + + return(prefix_len); + + } + + return(data_len); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ +} + +/************************************************************************* +Checks if a data main type is a string type. Also a BLOB is considered a +string type. */ +UNIV_INTERN +ibool +dtype_is_string_type( +/*=================*/ + /* out: TRUE if string type */ + ulint mtype) /* in: InnoDB main data type code: DATA_CHAR, ... */ +{ + if (mtype <= DATA_BLOB + || mtype == DATA_MYSQL + || mtype == DATA_VARMYSQL) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Checks if a type is a binary string type. Note that for tables created with +< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For +those DATA_BLOB columns this function currently returns FALSE. */ +UNIV_INTERN +ibool +dtype_is_binary_string_type( +/*========================*/ + /* out: TRUE if binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype) /* in: precise type */ +{ + if ((mtype == DATA_FIXBINARY) + || (mtype == DATA_BINARY) + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE))) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Checks if a type is a non-binary string type. 
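For a fixed-width character set (mbminlen == mbmaxlen), dtype_get_at_most_n_mbchars above reduces to capping the requested prefix length by the data length; only variable-width charsets need the MySQL-side scan in innobase_get_at_most_n_mbchars. The fixed-width path as a sketch:

    #include <stdio.h>

    /* Bytes occupied by a column prefix in a fixed-width charset:
       simply the smaller of the requested prefix and the data length. */
    static unsigned long prefix_bytes_fixed(unsigned long prefix_len,
                                            unsigned long data_len)
    {
        return prefix_len < data_len ? prefix_len : data_len;
    }

    int main(void)
    {
        printf("%lu\n", prefix_bytes_fixed(10, 100)); /* 10: prefix caps */
        printf("%lu\n", prefix_bytes_fixed(10, 4));   /* 4: short string */
        return 0;
    }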
That is, dtype_is_string_type is +TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created +with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. +For those DATA_BLOB columns this function currently returns TRUE. */ +UNIV_INTERN +ibool +dtype_is_non_binary_string_type( +/*============================*/ + /* out: TRUE if non-binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype) /* in: precise type */ +{ + if (dtype_is_string_type(mtype) == TRUE + && dtype_is_binary_string_type(mtype, prtype) == FALSE) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Forms a precise type from the < 4.1.2 format precise type plus the +charset-collation code. */ +UNIV_INTERN +ulint +dtype_form_prtype( +/*==============*/ + ulint old_prtype, /* in: the MySQL type code and the flags + DATA_BINARY_TYPE etc. */ + ulint charset_coll) /* in: MySQL charset-collation code */ +{ + ut_a(old_prtype < 256 * 256); + ut_a(charset_coll < 256); + + return(old_prtype + (charset_coll << 16)); +} + +/************************************************************************* +Validates a data type structure. */ +UNIV_INTERN +ibool +dtype_validate( +/*===========*/ + /* out: TRUE if ok */ + const dtype_t* type) /* in: type struct to validate */ +{ + ut_a(type); + ut_a(type->mtype >= DATA_VARCHAR); + ut_a(type->mtype <= DATA_MYSQL); + + if (type->mtype == DATA_SYS) { + ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS); + } + + ut_a(type->mbminlen <= type->mbmaxlen); + + return(TRUE); +} + +/************************************************************************* +Prints a data type structure. */ +UNIV_INTERN +void +dtype_print( +/*========*/ + const dtype_t* type) /* in: type */ +{ + ulint mtype; + ulint prtype; + ulint len; + + ut_a(type); + + mtype = type->mtype; + prtype = type->prtype; + + switch (mtype) { + case DATA_VARCHAR: + fputs("DATA_VARCHAR", stderr); + break; + + case DATA_CHAR: + fputs("DATA_CHAR", stderr); + break; + + case DATA_BINARY: + fputs("DATA_BINARY", stderr); + break; + + case DATA_FIXBINARY: + fputs("DATA_FIXBINARY", stderr); + break; + + case DATA_BLOB: + fputs("DATA_BLOB", stderr); + break; + + case DATA_INT: + fputs("DATA_INT", stderr); + break; + + case DATA_MYSQL: + fputs("DATA_MYSQL", stderr); + break; + + case DATA_SYS: + fputs("DATA_SYS", stderr); + break; + + default: + fprintf(stderr, "type %lu", (ulong) mtype); + break; + } + + len = type->len; + + if ((type->mtype == DATA_SYS) + || (type->mtype == DATA_VARCHAR) + || (type->mtype == DATA_CHAR)) { + putc(' ', stderr); + if (prtype == DATA_ROW_ID) { + fputs("DATA_ROW_ID", stderr); + len = DATA_ROW_ID_LEN; + } else if (prtype == DATA_ROLL_PTR) { + fputs("DATA_ROLL_PTR", stderr); + len = DATA_ROLL_PTR_LEN; + } else if (prtype == DATA_TRX_ID) { + fputs("DATA_TRX_ID", stderr); + len = DATA_TRX_ID_LEN; + } else if (prtype == DATA_ENGLISH) { + fputs("DATA_ENGLISH", stderr); + } else { + fprintf(stderr, "prtype %lu", (ulong) prtype); + } + } else { + if (prtype & DATA_UNSIGNED) { + fputs(" DATA_UNSIGNED", stderr); + } + + if (prtype & DATA_BINARY_TYPE) { + fputs(" DATA_BINARY_TYPE", stderr); + } + + if (prtype & DATA_NOT_NULL) { + fputs(" DATA_NOT_NULL", stderr); + } + } + + fprintf(stderr, " len %lu", (ulong) len); +} diff --git a/storage/xtradb/dict/dict0boot.c b/storage/xtradb/dict/dict0boot.c new file mode 100644 index 00000000000..505ba53ec52 --- /dev/null +++ b/storage/xtradb/dict/dict0boot.c @@ 
-0,0 +1,462 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0boot.h" + +#ifdef UNIV_NONINL +#include "dict0boot.ic" +#endif + +#include "dict0crea.h" +#include "btr0btr.h" +#include "dict0load.h" +#include "dict0load.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "os0file.h" + +/************************************************************************** +Gets a pointer to the dictionary header and x-latches its page. */ +UNIV_INTERN +dict_hdr_t* +dict_hdr_get( +/*=========*/ + /* out: pointer to the dictionary header, + page x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + dict_hdr_t* header; + + block = buf_page_get(DICT_HDR_SPACE, 0, DICT_HDR_PAGE_NO, + RW_X_LATCH, mtr); + header = DICT_HDR + buf_block_get_frame(block); + + buf_block_dbg_add_level(block, SYNC_DICT_HEADER); + + return(header); +} + +/************************************************************************** +Returns a new table, index, or tree id. */ +UNIV_INTERN +dulint +dict_hdr_get_new_id( +/*================*/ + /* out: the new id */ + ulint type) /* in: DICT_HDR_ROW_ID, ... */ +{ + dict_hdr_t* dict_hdr; + dulint id; + mtr_t mtr; + + ut_ad((type == DICT_HDR_TABLE_ID) || (type == DICT_HDR_INDEX_ID)); + + mtr_start(&mtr); + + dict_hdr = dict_hdr_get(&mtr); + + id = mtr_read_dulint(dict_hdr + type, &mtr); + id = ut_dulint_add(id, 1); + + mlog_write_dulint(dict_hdr + type, id, &mtr); + + mtr_commit(&mtr); + + return(id); +} + +/************************************************************************** +Writes the current value of the row id counter to the dictionary header file +page. */ +UNIV_INTERN +void +dict_hdr_flush_row_id(void) +/*=======================*/ +{ + dict_hdr_t* dict_hdr; + dulint id; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + id = dict_sys->row_id; + + mtr_start(&mtr); + + dict_hdr = dict_hdr_get(&mtr); + + mlog_write_dulint(dict_hdr + DICT_HDR_ROW_ID, id, &mtr); + + mtr_commit(&mtr); +} + +/********************************************************************* +Creates the file page for the dictionary header. This function is +called only at the database creation. 
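dict_hdr_get_new_id above hands out table and index ids by reading a persistent counter from the dictionary header, incrementing it, and writing it back inside one mini-transaction. The shape of that pattern, with a plain variable standing in for the header field and the mtr calls indicated only as comments:

    #include <stdio.h>

    static unsigned long long dict_hdr_id = 1000; /* hypothetical stored value */

    static unsigned long long get_new_id(void)
    {
        unsigned long long id;

        /* mtr_start(&mtr); read, increment, write back, then commit. */
        id = dict_hdr_id + 1;
        dict_hdr_id = id;
        /* mtr_commit(&mtr); */

        return id;
    }

    int main(void)
    {
        unsigned long long a = get_new_id();
        unsigned long long b = get_new_id();

        printf("%llu %llu\n", a, b); /* 1001 1002 */
        return 0;
    }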
*/ +static +ibool +dict_hdr_create( +/*============*/ + /* out: TRUE if succeed */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + dict_hdr_t* dict_header; + ulint root_page_no; + + ut_ad(mtr); + + /* Create the dictionary header file block in a new, allocated file + segment in the system tablespace */ + block = fseg_create(DICT_HDR_SPACE, 0, + DICT_HDR + DICT_HDR_FSEG_HEADER, mtr); + + ut_a(DICT_HDR_PAGE_NO == buf_block_get_page_no(block)); + + dict_header = dict_hdr_get(mtr); + + /* Start counting row, table, index, and tree ids from + DICT_HDR_FIRST_ID */ + mlog_write_dulint(dict_header + DICT_HDR_ROW_ID, + ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); + + mlog_write_dulint(dict_header + DICT_HDR_TABLE_ID, + ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); + + mlog_write_dulint(dict_header + DICT_HDR_INDEX_ID, + ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); + + /* Obsolete, but we must initialize it to 0 anyway. */ + mlog_write_dulint(dict_header + DICT_HDR_MIX_ID, + ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); + + /* Create the B-tree roots for the clustered indexes of the basic + system tables */ + + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + DICT_HDR_SPACE, 0, DICT_TABLES_ID, + srv_sys->dummy_ind1, mtr); + if (root_page_no == FIL_NULL) { + + return(FALSE); + } + + mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no, + MLOG_4BYTES, mtr); + /*--------------------------*/ + root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, 0, + DICT_TABLE_IDS_ID, + srv_sys->dummy_ind1, mtr); + if (root_page_no == FIL_NULL) { + + return(FALSE); + } + + mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no, + MLOG_4BYTES, mtr); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + DICT_HDR_SPACE, 0, DICT_COLUMNS_ID, + srv_sys->dummy_ind1, mtr); + if (root_page_no == FIL_NULL) { + + return(FALSE); + } + + mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no, + MLOG_4BYTES, mtr); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + DICT_HDR_SPACE, 0, DICT_INDEXES_ID, + srv_sys->dummy_ind1, mtr); + if (root_page_no == FIL_NULL) { + + return(FALSE); + } + + mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no, + MLOG_4BYTES, mtr); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + DICT_HDR_SPACE, 0, DICT_FIELDS_ID, + srv_sys->dummy_ind1, mtr); + if (root_page_no == FIL_NULL) { + + return(FALSE); + } + + mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no, + MLOG_4BYTES, mtr); + /*--------------------------*/ + + return(TRUE); +} + +/********************************************************************* +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. */ +UNIV_INTERN +void +dict_boot(void) +/*===========*/ +{ + dict_table_t* table; + dict_index_t* index; + dict_hdr_t* dict_hdr; + mem_heap_t* heap; + mtr_t mtr; + ulint error; + + mtr_start(&mtr); + + /* Create the hash tables etc. */ + dict_init(); + + heap = mem_heap_create(450); + + mutex_enter(&(dict_sys->mutex)); + + /* Get the dictionary header */ + dict_hdr = dict_hdr_get(&mtr); + + /* Because we only write new row ids to disk-based data structure + (dictionary header) when it is divisible by + DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover + the latest value of the row id counter. 
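Concretely, the restart adjustment described in this comment amounts to aligning the stored counter up to the write margin and then advancing it by one more margin, so that no row id handed out before a crash can be reused. A sketch with a hypothetical margin standing in for DICT_HDR_ROW_ID_WRITE_MARGIN:

    #include <stdio.h>

    #define MARGIN 256 /* hypothetical stand-in for DICT_HDR_ROW_ID_WRITE_MARGIN */

    static unsigned long long align_up(unsigned long long n,
                                       unsigned long long align)
    {
        return ((n + align - 1) / align) * align;
    }

    int main(void)
    {
        /* Row id value last flushed to the dictionary header. */
        unsigned long long stored = 1024;

        /* Up to a margin's worth of ids beyond 'stored' may have been
           handed out without being flushed, so skip one full margin. */
        unsigned long long next = align_up(stored, MARGIN) + MARGIN;

        printf("row id counter after restart: %llu\n", next); /* 1280 */
        return 0;
    }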
Therefore we advance + the counter at the database startup to avoid overlapping values. + Note that when a user after database startup first time asks for + a new row id, then because the counter is now divisible by + ..._MARGIN, it will immediately be updated to the disk-based + header. */ + + dict_sys->row_id = ut_dulint_add( + ut_dulint_align_up(mtr_read_dulint(dict_hdr + DICT_HDR_ROW_ID, + &mtr), + DICT_HDR_ROW_ID_WRITE_MARGIN), + DICT_HDR_ROW_ID_WRITE_MARGIN); + + /* Insert into the dictionary cache the descriptions of the basic + system tables */ + /*-------------------------*/ + table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0); + + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0); + /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */ + dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4); + /* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT) + and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */ + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + + table->id = DICT_TABLES_ID; + + dict_table_add_to_cache(table, heap); + dict_sys->sys_tables = table; + mem_heap_empty(heap); + + index = dict_mem_index_create("SYS_TABLES", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 1); + + dict_mem_index_add_field(index, "NAME", 0); + + index->id = DICT_TABLES_ID; + + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_TABLES, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + index = dict_mem_index_create("SYS_TABLES", "ID_IND", + DICT_HDR_SPACE, DICT_UNIQUE, 1); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_TABLE_IDS_ID; + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_TABLE_IDS, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0); + + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4); + + table->id = DICT_COLUMNS_ID; + + dict_table_add_to_cache(table, heap); + dict_sys->sys_columns = table; + mem_heap_empty(heap); + + index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_COLUMNS_ID; + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_COLUMNS, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0); + + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "ID", 
DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4); + + /* The '+ 2' below comes from the 2 system fields */ +#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2 +#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2" +#endif +#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2 +#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2" +#endif +#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2 +#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2" +#endif + + table->id = DICT_INDEXES_ID; + dict_table_add_to_cache(table, heap); + dict_sys->sys_indexes = table; + mem_heap_empty(heap); + + index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_INDEXES_ID; + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_INDEXES, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + /*-------------------------*/ + table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0); + + dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0); + + table->id = DICT_FIELDS_ID; + dict_table_add_to_cache(table, heap); + dict_sys->sys_fields = table; + mem_heap_free(heap); + + index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND", + DICT_HDR_SPACE, + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "INDEX_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_FIELDS_ID; + error = dict_index_add_to_cache(table, index, + mtr_read_ulint(dict_hdr + + DICT_HDR_FIELDS, + MLOG_4BYTES, &mtr), + FALSE); + ut_a(error == DB_SUCCESS); + + mtr_commit(&mtr); + /*-------------------------*/ + + /* Initialize the insert buffer table and index for each tablespace */ + + ibuf_init_at_db_start(); + + /* Load definitions of other indexes on system tables */ + + dict_load_sys_table(dict_sys->sys_tables); + dict_load_sys_table(dict_sys->sys_columns); + dict_load_sys_table(dict_sys->sys_indexes); + dict_load_sys_table(dict_sys->sys_fields); + + mutex_exit(&(dict_sys->mutex)); +} + +/********************************************************************* +Inserts the basic system table data into themselves in the database +creation. */ +static +void +dict_insert_initial_data(void) +/*==========================*/ +{ + /* Does nothing yet */ +} + +/********************************************************************* +Creates and initializes the data dictionary at the database creation. */ +UNIV_INTERN +void +dict_create(void) +/*=============*/ +{ + mtr_t mtr; + + mtr_start(&mtr); + + dict_hdr_create(&mtr); + + mtr_commit(&mtr); + + dict_boot(); + + dict_insert_initial_data(); +} diff --git a/storage/xtradb/dict/dict0crea.c b/storage/xtradb/dict/dict0crea.c new file mode 100644 index 00000000000..b9662c9a44c --- /dev/null +++ b/storage/xtradb/dict/dict0crea.c @@ -0,0 +1,1503 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0crea.h" + +#ifdef UNIV_NONINL +#include "dict0crea.ic" +#endif + +#include "btr0pcur.h" +#include "btr0btr.h" +#include "page0page.h" +#include "mach0data.h" +#include "dict0boot.h" +#include "dict0dict.h" +#include "que0que.h" +#include "row0ins.h" +#include "row0mysql.h" +#include "pars0pars.h" +#include "trx0roll.h" +#include "usr0sess.h" +#include "ut0vec.h" + +/********************************************************************* +Based on a table object, this function builds the entry to be inserted +in the SYS_TABLES system table. */ +static +dtuple_t* +dict_create_sys_tables_tuple( +/*=========================*/ + /* out: the tuple which should be inserted */ + dict_table_t* table, /* in: table */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the built tuple is allocated */ +{ + dict_table_t* sys_tables; + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + + ut_ad(table && heap); + + sys_tables = dict_sys->sys_tables; + + entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_tables); + + /* 0: NAME -----------------------------*/ + dfield = dtuple_get_nth_field(entry, 0); + + dfield_set_data(dfield, table->name, ut_strlen(table->name)); + /* 3: ID -------------------------------*/ + dfield = dtuple_get_nth_field(entry, 1); + + ptr = mem_heap_alloc(heap, 8); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + /* 4: N_COLS ---------------------------*/ + dfield = dtuple_get_nth_field(entry, 2); + +#if DICT_TF_COMPACT != 1 +#error +#endif + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, table->n_def + | ((table->flags & DICT_TF_COMPACT) << 31)); + dfield_set_data(dfield, ptr, 4); + /* 5: TYPE -----------------------------*/ + dfield = dtuple_get_nth_field(entry, 3); + + ptr = mem_heap_alloc(heap, 4); + if (table->flags & ~DICT_TF_COMPACT) { + ut_a(table->flags & DICT_TF_COMPACT); + ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP); + ut_a((table->flags & DICT_TF_ZSSIZE_MASK) + <= (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT)); + ut_a(!(table->flags & (~0 << DICT_TF_BITS))); + mach_write_to_4(ptr, table->flags); + } else { + mach_write_to_4(ptr, DICT_TABLE_ORDINARY); + } + + dfield_set_data(dfield, ptr, 4); + /* 6: MIX_ID (obsolete) ---------------------------*/ + dfield = dtuple_get_nth_field(entry, 4); + + ptr = mem_heap_zalloc(heap, 8); + + dfield_set_data(dfield, ptr, 8); + /* 7: MIX_LEN (obsolete) --------------------------*/ + + dfield = dtuple_get_nth_field(entry, 5); + + ptr = mem_heap_zalloc(heap, 4); + + dfield_set_data(dfield, ptr, 4); + /* 8: CLUSTER_NAME ---------------------*/ + 
dfield = dtuple_get_nth_field(entry, 6); + dfield_set_null(dfield); /* not supported */ + + /* 9: SPACE ----------------------------*/ + dfield = dtuple_get_nth_field(entry, 7); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, table->space); + + dfield_set_data(dfield, ptr, 4); + /*----------------------------------*/ + + return(entry); +} + +/********************************************************************* +Based on a table object, this function builds the entry to be inserted +in the SYS_COLUMNS system table. */ +static +dtuple_t* +dict_create_sys_columns_tuple( +/*==========================*/ + /* out: the tuple which should be inserted */ + dict_table_t* table, /* in: table */ + ulint i, /* in: column number */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the built tuple is allocated */ +{ + dict_table_t* sys_columns; + dtuple_t* entry; + const dict_col_t* column; + dfield_t* dfield; + byte* ptr; + const char* col_name; + + ut_ad(table && heap); + + column = dict_table_get_nth_col(table, i); + + sys_columns = dict_sys->sys_columns; + + entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_columns); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, 0); + + ptr = mem_heap_alloc(heap, 8); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + /* 1: POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, 1); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, i); + + dfield_set_data(dfield, ptr, 4); + /* 4: NAME ---------------------------*/ + dfield = dtuple_get_nth_field(entry, 2); + + col_name = dict_table_get_col_name(table, i); + dfield_set_data(dfield, col_name, ut_strlen(col_name)); + /* 5: MTYPE --------------------------*/ + dfield = dtuple_get_nth_field(entry, 3); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, column->mtype); + + dfield_set_data(dfield, ptr, 4); + /* 6: PRTYPE -------------------------*/ + dfield = dtuple_get_nth_field(entry, 4); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, column->prtype); + + dfield_set_data(dfield, ptr, 4); + /* 7: LEN ----------------------------*/ + dfield = dtuple_get_nth_field(entry, 5); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, column->len); + + dfield_set_data(dfield, ptr, 4); + /* 8: PREC ---------------------------*/ + dfield = dtuple_get_nth_field(entry, 6); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, 0/* unused */); + + dfield_set_data(dfield, ptr, 4); + /*---------------------------------*/ + + return(entry); +} + +/******************************************************************* +Builds a table definition to insert. */ +static +ulint +dict_build_table_def_step( +/*======================*/ + /* out: DB_SUCCESS or error code */ + que_thr_t* thr, /* in: query thread */ + tab_node_t* node) /* in: table create node */ +{ + dict_table_t* table; + dtuple_t* row; + ulint error; + const char* path_or_name; + ibool is_path; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = node->table; + + table->id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + thr_get_trx(thr)->table_id = table->id; + + if (srv_file_per_table) { + /* We create a new single-table tablespace for the table. + We initially let it be 4 pages: + - page 0 is the fsp header and an extent descriptor page, + - page 1 is an ibuf bitmap page, + - page 2 is the first inode page, + - page 3 will contain the root of the clustered index of the + table we create here. 
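Earlier in dict_create_sys_tables_tuple, the N_COLS value packs the ROW_FORMAT=COMPACT flag into bit 31 on top of the column count, and dict_boot reads it back the same way (ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT). A sketch of the encode/decode:

    #include <stdio.h>

    /* N_COLS = n_cols | (compact << 31), as written by
       dict_create_sys_tables_tuple(). */
    static unsigned long encode_n_cols(unsigned long n_cols, int compact)
    {
        return n_cols | ((unsigned long) (compact != 0) << 31);
    }

    int main(void)
    {
        unsigned long v = encode_n_cols(8, 1);

        printf("raw 0x%08lx: %lu columns, ROW_FORMAT=%s\n",
               v, v & 0x7fffffffUL, (v >> 31) ? "COMPACT" : "REDUNDANT");
        return 0;
    }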
*/ + + ulint space = 0; /* reset to zero for the call below */ + + if (table->dir_path_of_temp_table) { + /* We place tables created with CREATE TEMPORARY + TABLE in the tmp dir of mysqld server */ + + path_or_name = table->dir_path_of_temp_table; + is_path = TRUE; + } else { + path_or_name = table->name; + is_path = FALSE; + } + + ut_ad(dict_table_get_format(table) <= DICT_TF_FORMAT_MAX); + ut_ad(!dict_table_zip_size(table) + || dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP); + + error = fil_create_new_single_table_tablespace( + &space, path_or_name, is_path, + table->flags == DICT_TF_COMPACT ? 0 : table->flags, + FIL_IBD_FILE_INITIAL_SIZE); + table->space = (unsigned int) space; + + if (error != DB_SUCCESS) { + + return(error); + } + + mtr_start(&mtr); + + fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr); + + mtr_commit(&mtr); + } else { + /* Create in the system tablespace: disallow new features */ + table->flags &= DICT_TF_COMPACT; + } + + row = dict_create_sys_tables_tuple(table, node->heap); + + ins_node_set_new_row(node->tab_def, row); + + return(DB_SUCCESS); +} + +/******************************************************************* +Builds a column definition to insert. */ +static +ulint +dict_build_col_def_step( +/*====================*/ + /* out: DB_SUCCESS */ + tab_node_t* node) /* in: table create node */ +{ + dtuple_t* row; + + row = dict_create_sys_columns_tuple(node->table, node->col_no, + node->heap); + ins_node_set_new_row(node->col_def, row); + + return(DB_SUCCESS); +} + +/********************************************************************* +Based on an index object, this function builds the entry to be inserted +in the SYS_INDEXES system table. */ +static +dtuple_t* +dict_create_sys_indexes_tuple( +/*==========================*/ + /* out: the tuple which should be inserted */ + dict_index_t* index, /* in: index */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the built tuple is allocated */ +{ + dict_table_t* sys_indexes; + dict_table_t* table; + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(index && heap); + + sys_indexes = dict_sys->sys_indexes; + + table = dict_table_get_low(index->table_name); + + entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_indexes); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, 0); + + ptr = mem_heap_alloc(heap, 8); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + /* 1: ID ----------------------------*/ + dfield = dtuple_get_nth_field(entry, 1); + + ptr = mem_heap_alloc(heap, 8); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + /* 4: NAME --------------------------*/ + dfield = dtuple_get_nth_field(entry, 2); + + dfield_set_data(dfield, index->name, ut_strlen(index->name)); + /* 5: N_FIELDS ----------------------*/ + dfield = dtuple_get_nth_field(entry, 3); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, index->n_fields); + + dfield_set_data(dfield, ptr, 4); + /* 6: TYPE --------------------------*/ + dfield = dtuple_get_nth_field(entry, 4); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, index->type); + + dfield_set_data(dfield, ptr, 4); + /* 7: SPACE --------------------------*/ + +#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 7 +#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 7" +#endif + + dfield = dtuple_get_nth_field(entry, 5); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, index->space); + + 
dfield_set_data(dfield, ptr, 4); + /* 8: PAGE_NO --------------------------*/ + +#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 8 +#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 8" +#endif + + dfield = dtuple_get_nth_field(entry, 6); + + ptr = mem_heap_alloc(heap, 4); + mach_write_to_4(ptr, FIL_NULL); + + dfield_set_data(dfield, ptr, 4); + /*--------------------------------*/ + + return(entry); +} + +/********************************************************************* +Based on an index object, this function builds the entry to be inserted +in the SYS_FIELDS system table. */ +static +dtuple_t* +dict_create_sys_fields_tuple( +/*=========================*/ + /* out: the tuple which should be inserted */ + dict_index_t* index, /* in: index */ + ulint i, /* in: field number */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the built tuple is allocated */ +{ + dict_table_t* sys_fields; + dtuple_t* entry; + dict_field_t* field; + dfield_t* dfield; + byte* ptr; + ibool index_contains_column_prefix_field = FALSE; + ulint j; + + ut_ad(index && heap); + + for (j = 0; j < index->n_fields; j++) { + if (dict_index_get_nth_field(index, j)->prefix_len > 0) { + index_contains_column_prefix_field = TRUE; + break; + } + } + + field = dict_index_get_nth_field(index, i); + + sys_fields = dict_sys->sys_fields; + + entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, sys_fields); + + /* 0: INDEX_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, 0); + + ptr = mem_heap_alloc(heap, 8); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + /* 1: POS + PREFIX LENGTH ----------------------------*/ + + dfield = dtuple_get_nth_field(entry, 1); + + ptr = mem_heap_alloc(heap, 4); + + if (index_contains_column_prefix_field) { + /* If there are column prefix fields in the index, then + we store the number of the field to the 2 HIGH bytes + and the prefix length to the 2 low bytes, */ + + mach_write_to_4(ptr, (i << 16) + field->prefix_len); + } else { + /* Else we store the number of the field to the 2 LOW bytes. + This is to keep the storage format compatible with + InnoDB versions < 4.0.14. */ + + mach_write_to_4(ptr, i); + } + + dfield_set_data(dfield, ptr, 4); + /* 4: COL_NAME -------------------------*/ + dfield = dtuple_get_nth_field(entry, 2); + + dfield_set_data(dfield, field->name, + ut_strlen(field->name)); + /*---------------------------------*/ + + return(entry); +} + +/********************************************************************* +Creates the tuple with which the index entry is searched for writing the index +tree root page number, if such a tree is created. */ +static +dtuple_t* +dict_create_search_tuple( +/*=====================*/ + /* out: the tuple for search */ + const dtuple_t* tuple, /* in: the tuple inserted in the SYS_INDEXES + table */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the built tuple is allocated */ +{ + dtuple_t* search_tuple; + const dfield_t* field1; + dfield_t* field2; + + ut_ad(tuple && heap); + + search_tuple = dtuple_create(heap, 2); + + field1 = dtuple_get_nth_field(tuple, 0); + field2 = dtuple_get_nth_field(search_tuple, 0); + + dfield_copy(field2, field1); + + field1 = dtuple_get_nth_field(tuple, 1); + field2 = dtuple_get_nth_field(search_tuple, 1); + + dfield_copy(field2, field1); + + ut_ad(dtuple_validate(search_tuple)); + + return(search_tuple); +} + +/******************************************************************* +Builds an index definition row to insert. 
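+The row is destined for SYS_INDEXES; the root page number is left as
+FIL_NULL here and is filled in later by dict_create_index_tree_step().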
*/ +static +ulint +dict_build_index_def_step( +/*======================*/ + /* out: DB_SUCCESS or error code */ + que_thr_t* thr, /* in: query thread */ + ind_node_t* node) /* in: index create node */ +{ + dict_table_t* table; + dict_index_t* index; + dtuple_t* row; + trx_t* trx; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = thr_get_trx(thr); + + index = node->index; + + table = dict_table_get_low(index->table_name); + + if (table == NULL) { + return(DB_TABLE_NOT_FOUND); + } + + trx->table_id = table->id; + + node->table = table; + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + index->id = dict_hdr_get_new_id(DICT_HDR_INDEX_ID); + + /* Inherit the space id from the table; we store all indexes of a + table in the same tablespace */ + + index->space = table->space; + node->page_no = FIL_NULL; + row = dict_create_sys_indexes_tuple(index, node->heap); + node->ind_row = row; + + ins_node_set_new_row(node->ind_def, row); + +#ifdef ROW_MERGE_IS_INDEX_USABLE + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; +#endif /* ROW_MERGE_IS_INDEX_USABLE */ + + return(DB_SUCCESS); +} + +/******************************************************************* +Builds a field definition row to insert. */ +static +ulint +dict_build_field_def_step( +/*======================*/ + /* out: DB_SUCCESS */ + ind_node_t* node) /* in: index create node */ +{ + dict_index_t* index; + dtuple_t* row; + + index = node->index; + + row = dict_create_sys_fields_tuple(index, node->field_no, node->heap); + + ins_node_set_new_row(node->field_def, row); + + return(DB_SUCCESS); +} + +/******************************************************************* +Creates an index tree for the index if it is not a member of a cluster. */ +static +ulint +dict_create_index_tree_step( +/*========================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + ind_node_t* node) /* in: index create node */ +{ + dict_index_t* index; + dict_table_t* sys_indexes; + dict_table_t* table; + dtuple_t* search_tuple; + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + index = node->index; + table = node->table; + + sys_indexes = dict_sys->sys_indexes; + + /* Run a mini-transaction in which the index tree is allocated for + the index and its root address is written to the index entry in + sys_indexes */ + + mtr_start(&mtr); + + search_tuple = dict_create_search_tuple(node->ind_row, node->heap); + + btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes), + search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF, + &pcur, &mtr); + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + node->page_no = btr_create(index->type, index->space, + dict_table_zip_size(index->table), + index->id, index, &mtr); + /* printf("Created a new index tree in space %lu root page %lu\n", + index->space, index->page_no); */ + + page_rec_write_index_page_no(btr_pcur_get_rec(&pcur), + DICT_SYS_INDEXES_PAGE_NO_FIELD, + node->page_no, &mtr); + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (node->page_no == FIL_NULL) { + + return(DB_OUT_OF_FILE_SPACE); + } + + return(DB_SUCCESS); +} + +/*********************************************************************** +Drops the index tree associated with a row in SYS_INDEXES table. 
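+The pages of the tree are freed in two steps: first all pages except the
+root, possibly spanning several mini-transactions, and then the root page
+in the same mini-transaction that writes FIL_NULL to the PAGE_NO field of
+the SYS_INDEXES record.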
*/ +UNIV_INTERN +void +dict_drop_index_tree( +/*=================*/ + rec_t* rec, /* in/out: record in the clustered index + of SYS_INDEXES table */ + mtr_t* mtr) /* in: mtr having the latch on the record page */ +{ + ulint root_page_no; + ulint space; + ulint zip_size; + const byte* ptr; + ulint len; + + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); + ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); + + ut_ad(len == 4); + + root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (root_page_no == FIL_NULL) { + /* The tree has already been freed */ + + return; + } + + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); + + ut_ad(len == 4); + + space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + zip_size = fil_space_get_zip_size(space); + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + return; + } + + /* We free all the pages but the root page first; this operation + may span several mini-transactions */ + + btr_free_but_not_root(space, zip_size, root_page_no); + + /* Then we free the root page in the same mini-transaction where + we write FIL_NULL to the appropriate field in the SYS_INDEXES + record: this mini-transaction marks the B-tree totally freed */ + + /* printf("Dropping index tree in space %lu root page %lu\n", space, + root_page_no); */ + btr_free_root(space, zip_size, root_page_no, mtr); + + page_rec_write_index_page_no(rec, + DICT_SYS_INDEXES_PAGE_NO_FIELD, + FIL_NULL, mtr); +} + +/*********************************************************************** +Truncates the index tree associated with a row in SYS_INDEXES table. */ +UNIV_INTERN +ulint +dict_truncate_index_tree( +/*=====================*/ + /* out: new root page number, or + FIL_NULL on failure */ + dict_table_t* table, /* in: the table the index belongs to */ + ulint space, /* in: 0=truncate, + nonzero=create the index tree in the + given tablespace */ + btr_pcur_t* pcur, /* in/out: persistent cursor pointing to + record in the clustered index of + SYS_INDEXES table. The cursor may be + repositioned in this call. */ + mtr_t* mtr) /* in: mtr having the latch + on the record page. The mtr may be + committed and restarted in this call. */ +{ + ulint root_page_no; + ibool drop = !space; + ulint zip_size; + ulint type; + dulint index_id; + rec_t* rec; + const byte* ptr; + ulint len; + dict_index_t* index; + + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); + rec = btr_pcur_get_rec(pcur); + ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); + + ut_ad(len == 4); + + root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + + if (drop && root_page_no == FIL_NULL) { + /* The tree has been freed. 
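+		We were nevertheless asked to truncate the index: print a
+		warning and fall through to re-create the index tree below.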
*/ + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Trying to TRUNCATE" + " a missing index of table %s!\n", table->name); + drop = FALSE; + } + + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); + + ut_ad(len == 4); + + if (drop) { + space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); + } + + zip_size = fil_space_get_zip_size(space); + + if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Trying to TRUNCATE" + " a missing .ibd file of table %s!\n", table->name); + return(FIL_NULL); + } + + ptr = rec_get_nth_field_old(rec, + DICT_SYS_INDEXES_TYPE_FIELD, &len); + ut_ad(len == 4); + type = mach_read_from_4(ptr); + + ptr = rec_get_nth_field_old(rec, 1, &len); + ut_ad(len == 8); + index_id = mach_read_from_8(ptr); + + if (!drop) { + + goto create; + } + + /* We free all the pages but the root page first; this operation + may span several mini-transactions */ + + btr_free_but_not_root(space, zip_size, root_page_no); + + /* Then we free the root page in the same mini-transaction where + we create the b-tree and write its new root page number to the + appropriate field in the SYS_INDEXES record: this mini-transaction + marks the B-tree totally truncated */ + + btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, mtr); + + btr_free_root(space, zip_size, root_page_no, mtr); +create: + /* We will temporarily write FIL_NULL to the PAGE_NO field + in SYS_INDEXES, so that the database will not get into an + inconsistent state in case it crashes between the mtr_commit() + below and the following mtr_commit() call. */ + page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, + FIL_NULL, mtr); + + /* We will need to commit the mini-transaction in order to avoid + deadlocks in the btr_create() call, because otherwise we would + be freeing and allocating pages in the same mini-transaction. */ + btr_pcur_store_position(pcur, mtr); + mtr_commit(mtr); + + mtr_start(mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + + /* Find the index corresponding to this SYS_INDEXES record. */ + for (index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + if (!ut_dulint_cmp(index->id, index_id)) { + root_page_no = btr_create(type, space, zip_size, + index_id, index, mtr); + index->page = (unsigned int) root_page_no; + return(root_page_no); + } + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Index %lu %lu of table %s is missing\n" + "InnoDB: from the data dictionary during TRUNCATE!\n", + ut_dulint_get_high(index_id), + ut_dulint_get_low(index_id), + table->name); + + return(FIL_NULL); +} + +/************************************************************************* +Creates a table create graph. 
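+The graph consists of this tab_node_t, insert nodes for the SYS_TABLES
+and SYS_COLUMNS rows, and a commit node; dict_create_table_step() executes
+it state by state.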
*/ +UNIV_INTERN +tab_node_t* +tab_create_graph_create( +/*====================*/ + /* out, own: table create node */ + dict_table_t* table, /* in: table to create, built as a memory data + structure */ + mem_heap_t* heap) /* in: heap where created */ +{ + tab_node_t* node; + + node = mem_heap_alloc(heap, sizeof(tab_node_t)); + + node->common.type = QUE_NODE_CREATE_TABLE; + + node->table = table; + + node->state = TABLE_BUILD_TABLE_DEF; + node->heap = mem_heap_create(256); + + node->tab_def = ins_node_create(INS_DIRECT, dict_sys->sys_tables, + heap); + node->tab_def->common.parent = node; + + node->col_def = ins_node_create(INS_DIRECT, dict_sys->sys_columns, + heap); + node->col_def->common.parent = node; + + node->commit_node = commit_node_create(heap); + node->commit_node->common.parent = node; + + return(node); +} + +/************************************************************************* +Creates an index create graph. */ +UNIV_INTERN +ind_node_t* +ind_create_graph_create( +/*====================*/ + /* out, own: index create node */ + dict_index_t* index, /* in: index to create, built as a memory data + structure */ + mem_heap_t* heap) /* in: heap where created */ +{ + ind_node_t* node; + + node = mem_heap_alloc(heap, sizeof(ind_node_t)); + + node->common.type = QUE_NODE_CREATE_INDEX; + + node->index = index; + + node->state = INDEX_BUILD_INDEX_DEF; + node->page_no = FIL_NULL; + node->heap = mem_heap_create(256); + + node->ind_def = ins_node_create(INS_DIRECT, + dict_sys->sys_indexes, heap); + node->ind_def->common.parent = node; + + node->field_def = ins_node_create(INS_DIRECT, + dict_sys->sys_fields, heap); + node->field_def->common.parent = node; + + node->commit_node = commit_node_create(heap); + node->commit_node->common.parent = node; + + return(node); +} + +/*************************************************************** +Creates a table. This is a high-level function used in SQL execution graphs. 
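+The node state advances TABLE_BUILD_TABLE_DEF -> TABLE_BUILD_COL_DEF (once
+per column) -> TABLE_COMMIT_WORK -> TABLE_ADD_TO_CACHE; on an error or a
+lock wait the function sets trx->error_state and returns NULL.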
*/ +UNIV_INTERN +que_thr_t* +dict_create_table_step( +/*===================*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + tab_node_t* node; + ulint err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = thr_get_trx(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = TABLE_BUILD_TABLE_DEF; + } + + if (node->state == TABLE_BUILD_TABLE_DEF) { + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = dict_build_table_def_step(thr, node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = TABLE_BUILD_COL_DEF; + node->col_no = 0; + + thr->run_node = node->tab_def; + + return(thr); + } + + if (node->state == TABLE_BUILD_COL_DEF) { + + if (node->col_no < (node->table)->n_def) { + + err = dict_build_col_def_step(node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->col_no++; + + thr->run_node = node->col_def; + + return(thr); + } else { + node->state = TABLE_COMMIT_WORK; + } + } + + if (node->state == TABLE_COMMIT_WORK) { + + /* Table was correctly defined: do NOT commit the transaction + (CREATE TABLE does NOT do an implicit commit of the current + transaction) */ + + node->state = TABLE_ADD_TO_CACHE; + + /* thr->run_node = node->commit_node; + + return(thr); */ + } + + if (node->state == TABLE_ADD_TO_CACHE) { + + dict_table_add_to_cache(node->table, node->heap); + + err = DB_SUCCESS; + } + +function_exit: + trx->error_state = err; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + + return(NULL); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/*************************************************************** +Creates an index. This is a high-level function used in SQL execution +graphs. 
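+Analogously to dict_create_table_step(), the node state advances
+INDEX_BUILD_INDEX_DEF -> INDEX_BUILD_FIELD_DEF (once per field) ->
+INDEX_ADD_TO_CACHE -> INDEX_CREATE_INDEX_TREE -> INDEX_COMMIT_WORK.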
*/ +UNIV_INTERN +que_thr_t* +dict_create_index_step( +/*===================*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ind_node_t* node; + ulint err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(mutex_own(&(dict_sys->mutex))); + + trx = thr_get_trx(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = INDEX_BUILD_INDEX_DEF; + } + + if (node->state == INDEX_BUILD_INDEX_DEF) { + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + err = dict_build_index_def_step(thr, node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = INDEX_BUILD_FIELD_DEF; + node->field_no = 0; + + thr->run_node = node->ind_def; + + return(thr); + } + + if (node->state == INDEX_BUILD_FIELD_DEF) { + + if (node->field_no < (node->index)->n_fields) { + + err = dict_build_field_def_step(node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->field_no++; + + thr->run_node = node->field_def; + + return(thr); + } else { + node->state = INDEX_ADD_TO_CACHE; + } + } + + if (node->state == INDEX_ADD_TO_CACHE) { + + dulint index_id = node->index->id; + + err = dict_index_add_to_cache(node->table, node->index, + FIL_NULL, TRUE); + + node->index = dict_index_get_if_in_cache_low(index_id); + ut_a(!node->index == (err != DB_SUCCESS)); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = INDEX_CREATE_INDEX_TREE; + } + + if (node->state == INDEX_CREATE_INDEX_TREE) { + + err = dict_create_index_tree_step(node); + + if (err != DB_SUCCESS) { + dict_index_remove_from_cache(node->table, node->index); + node->index = NULL; + + goto function_exit; + } + + node->index->page = node->page_no; + node->state = INDEX_COMMIT_WORK; + } + + if (node->state == INDEX_COMMIT_WORK) { + + /* Index was correctly defined: do NOT commit the transaction + (CREATE INDEX does NOT currently do an implicit commit of + the current transaction) */ + + node->state = INDEX_CREATE_INDEX_TREE; + + /* thr->run_node = node->commit_node; + + return(thr); */ + } + +function_exit: + trx->error_state = err; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + + return(NULL); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +#ifndef UNIV_HOTBACKUP +/******************************************************************** +Creates the foreign key constraints system tables inside InnoDB +at database creation or database start if they are not found or are +not of the right form. 
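+The right form means that SYS_FOREIGN has three indexes (the clustered
+index on ID plus secondary indexes on FOR_NAME and REF_NAME) and that
+SYS_FOREIGN_COLS has a single clustered index on (ID, POS); otherwise the
+tables are dropped and re-created.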
*/ +UNIV_INTERN +ulint +dict_create_or_check_foreign_constraint_tables(void) +/*================================================*/ + /* out: DB_SUCCESS or error code */ +{ + dict_table_t* table1; + dict_table_t* table2; + ulint error; + trx_t* trx; + + mutex_enter(&(dict_sys->mutex)); + + table1 = dict_table_get_low("SYS_FOREIGN"); + table2 = dict_table_get_low("SYS_FOREIGN_COLS"); + + if (table1 && table2 + && UT_LIST_GET_LEN(table1->indexes) == 3 + && UT_LIST_GET_LEN(table2->indexes) == 1) { + + /* Foreign constraint system tables have already been + created, and they are ok */ + + mutex_exit(&(dict_sys->mutex)); + + return(DB_SUCCESS); + } + + mutex_exit(&(dict_sys->mutex)); + + trx = trx_allocate_for_mysql(); + + trx->op_info = "creating foreign key sys tables"; + + row_mysql_lock_data_dictionary(trx); + + if (table1) { + fprintf(stderr, + "InnoDB: dropping incompletely created" + " SYS_FOREIGN table\n"); + row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); + } + + if (table2) { + fprintf(stderr, + "InnoDB: dropping incompletely created" + " SYS_FOREIGN_COLS table\n"); + row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); + } + + fprintf(stderr, + "InnoDB: Creating foreign key constraint system tables\n"); + + /* NOTE: in dict_load_foreigns we use the fact that + there are 2 secondary indexes on SYS_FOREIGN, and they + are defined just like below */ + + /* NOTE: when designing InnoDB's foreign key support in 2001, we made + an error and made the table names and the foreign key id of type + 'CHAR' (internally, really a VARCHAR). We should have made the type + VARBINARY, like in other InnoDB system tables, to get a clean + design. */ + + error = que_eval_sql(NULL, + "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," + " REF_NAME CHAR, N_COLS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN (ID);\n" + "CREATE INDEX FOR_IND" + " ON SYS_FOREIGN (FOR_NAME);\n" + "CREATE INDEX REF_IND" + " ON SYS_FOREIGN (REF_NAME);\n" + "CREATE TABLE\n" + "SYS_FOREIGN_COLS(ID CHAR, POS INT," + " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN_COLS (ID, POS);\n" + "END;\n" + , FALSE, trx); + + if (error != DB_SUCCESS) { + fprintf(stderr, "InnoDB: error %lu in creation\n", + (ulong) error); + + ut_a(error == DB_OUT_OF_FILE_SPACE + || error == DB_TOO_MANY_CONCURRENT_TRXS); + + fprintf(stderr, + "InnoDB: creation failed\n" + "InnoDB: tablespace is full\n" + "InnoDB: dropping incompletely created" + " SYS_FOREIGN tables\n"); + + row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); + row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); + + error = DB_MUST_GET_MORE_FILE_SPACE; + } + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + if (error == DB_SUCCESS) { + fprintf(stderr, + "InnoDB: Foreign key constraint system tables" + " created\n"); + } + + return(error); +} + +/******************************************************************** +Evaluate the given foreign key SQL statement. 
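+On DB_DUPLICATE_KEY (a constraint of the same name already exists) a
+detailed message is written to dict_foreign_err_file; any other error is
+reported there and on stderr as an internal error.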
*/
+static
+ulint
+dict_foreign_eval_sql(
+/*==================*/
+				/* out: error code or DB_SUCCESS */
+	pars_info_t*	info,	/* in: info struct, or NULL */
+	const char*	sql,	/* in: SQL string to evaluate */
+	dict_table_t*	table,	/* in: table */
+	dict_foreign_t*	foreign,/* in: foreign */
+	trx_t*		trx)	/* in: transaction */
+{
+	ulint	error;
+	FILE*	ef	= dict_foreign_err_file;
+
+	error = que_eval_sql(info, sql, FALSE, trx);
+
+	if (error == DB_DUPLICATE_KEY) {
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+		fputs(" Error in foreign key constraint creation for table ",
+		      ef);
+		ut_print_name(ef, trx, TRUE, table->name);
+		fputs(".\nA foreign key constraint of name ", ef);
+		ut_print_name(ef, trx, TRUE, foreign->id);
+		fputs("\nalready exists."
+		      " (Note that internally InnoDB adds 'databasename'\n"
+		      "in front of the user-defined constraint name.)\n"
+		      "Note that InnoDB's FOREIGN KEY system tables store\n"
+		      "constraint names as case-insensitive, with the\n"
+		      "MySQL standard latin1_swedish_ci collation. If you\n"
+		      "create tables or databases whose names differ only in\n"
+		      "the character case, then collisions in constraint\n"
+		      "names can occur. Workaround: name your constraints\n"
+		      "explicitly with unique names.\n",
+		      ef);
+
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(error);
+	}
+
+	if (error != DB_SUCCESS) {
+		fprintf(stderr,
+			"InnoDB: Foreign key constraint creation failed:\n"
+			"InnoDB: internal error number %lu\n", (ulong) error);
+
+		mutex_enter(&dict_foreign_err_mutex);
+		ut_print_timestamp(ef);
+		fputs(" Internal error in foreign key constraint creation"
+		      " for table ", ef);
+		ut_print_name(ef, trx, TRUE, table->name);
+		fputs(".\n"
+		      "See the MySQL .err log in the datadir"
+		      " for more information.\n", ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(error);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/************************************************************************
+Add a single foreign key field definition to the data dictionary tables in
+the database. */
+static
+ulint
+dict_create_add_foreign_field_to_dictionary(
+/*========================================*/
+					/* out: error code or DB_SUCCESS */
+	ulint		field_nr,	/* in: foreign field number */
+	dict_table_t*	table,		/* in: table */
+	dict_foreign_t*	foreign,	/* in: foreign */
+	trx_t*		trx)		/* in: transaction */
+{
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "id", foreign->id);
+
+	pars_info_add_int4_literal(info, "pos", field_nr);
+
+	pars_info_add_str_literal(info, "for_col_name",
+				  foreign->foreign_col_names[field_nr]);
+
+	pars_info_add_str_literal(info, "ref_col_name",
+				  foreign->referenced_col_names[field_nr]);
+
+	return(dict_foreign_eval_sql(
+		       info,
+		       "PROCEDURE P () IS\n"
+		       "BEGIN\n"
+		       "INSERT INTO SYS_FOREIGN_COLS VALUES"
+		       "(:id, :pos, :for_col_name, :ref_col_name);\n"
+		       "END;\n",
+		       table, foreign, trx));
+}
+
+/************************************************************************
+Add a single foreign key definition to the data dictionary tables in the
+database. We also generate names for constraints that were not named by the
+user. A generated constraint has a name of the format
+databasename/tablename_ibfk_<number>, where the numbers start from 1 and
+are given locally for this table; that is, the number is not global, as it
+was in the old (< 4.0.18) constraint format.
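+For example, the first unnamed constraint on a table named test/mytable
+receives the generated id test/mytable_ibfk_1 (see the sprintf() call
+below).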
*/ +static +ulint +dict_create_add_foreign_to_dictionary( +/*==================================*/ + /* out: error code or DB_SUCCESS */ + ulint* id_nr, /* in/out: number to use in id generation; + incremented if used */ + dict_table_t* table, /* in: table */ + dict_foreign_t* foreign,/* in: foreign */ + trx_t* trx) /* in: transaction */ +{ + ulint error; + ulint i; + + pars_info_t* info = pars_info_create(); + + if (foreign->id == NULL) { + /* Generate a new constraint id */ + ulint namelen = strlen(table->name); + char* id = mem_heap_alloc(foreign->heap, namelen + 20); + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", table->name, (ulong) (*id_nr)++); + foreign->id = id; + } + + pars_info_add_str_literal(info, "id", foreign->id); + + pars_info_add_str_literal(info, "for_name", table->name); + + pars_info_add_str_literal(info, "ref_name", + foreign->referenced_table_name); + + pars_info_add_int4_literal(info, "n_cols", + foreign->n_fields + (foreign->type << 24)); + + error = dict_foreign_eval_sql(info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_FOREIGN VALUES" + "(:id, :for_name, :ref_name, :n_cols);\n" + "END;\n" + , table, foreign, trx); + + if (error != DB_SUCCESS) { + + return(error); + } + + for (i = 0; i < foreign->n_fields; i++) { + error = dict_create_add_foreign_field_to_dictionary( + i, table, foreign, trx); + + if (error != DB_SUCCESS) { + + return(error); + } + } + + error = dict_foreign_eval_sql(NULL, + "PROCEDURE P () IS\n" + "BEGIN\n" + "COMMIT WORK;\n" + "END;\n" + , table, foreign, trx); + + return(error); +} + +/************************************************************************ +Adds foreign key definitions to data dictionary tables in the database. */ +UNIV_INTERN +ulint +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + /* out: error code or DB_SUCCESS */ + ulint start_id,/* in: if we are actually doing ALTER TABLE + ADD CONSTRAINT, we want to generate constraint + numbers which are bigger than in the table so + far; we number the constraints from + start_id + 1 up; start_id should be set to 0 if + we are creating a new table, or if the table + so far has no constraints for which the name + was generated here */ + dict_table_t* table, /* in: table */ + trx_t* trx) /* in: transaction */ +{ + dict_foreign_t* foreign; + ulint number = start_id + 1; + ulint error; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + if (NULL == dict_table_get_low("SYS_FOREIGN")) { + fprintf(stderr, + "InnoDB: table SYS_FOREIGN not found" + " in internal data dictionary\n"); + + return(DB_ERROR); + } + + for (foreign = UT_LIST_GET_FIRST(table->foreign_list); + foreign; + foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { + + error = dict_create_add_foreign_to_dictionary(&number, table, + foreign, trx); + + if (error != DB_SUCCESS) { + + return(error); + } + } + + return(DB_SUCCESS); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/dict/dict0dict.c b/storage/xtradb/dict/dict0dict.c new file mode 100644 index 00000000000..134be1dadba --- /dev/null +++ b/storage/xtradb/dict/dict0dict.c @@ -0,0 +1,4747 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**********************************************************************
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0dict.h"
+
+#ifdef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+#include "buf0buf.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "page0zip.h"
+#include "page0page.h"
+#include "pars0pars.h"
+#include "pars0sym.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0merge.h"
+#ifndef UNIV_HOTBACKUP
+# include "m_ctype.h" /* my_isspace() */
+# include "ha_prototypes.h" /* innobase_strcasecmp() */
+#endif /* !UNIV_HOTBACKUP */
+
+#include <ctype.h>
+
+/* the dictionary system */
+UNIV_INTERN dict_sys_t*	dict_sys	= NULL;
+
+/* table create, drop, etc. reserve this in X-mode; implicit or
+background operations such as purge, rollback, and foreign key checks
+reserve this in S-mode; we cannot trust that MySQL protects implicit or
+background operations from a table drop, since MySQL does not know of
+them; therefore we need this; NOTE: a transaction which reserves this
+must keep book on the mode in trx->dict_operation_lock_mode */
+UNIV_INTERN rw_lock_t	dict_operation_lock;
+
+#define	DICT_HEAP_SIZE		100	/* initial memory heap size when
+					creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512	/* buffer pool max size per table
+					hash table fixed size in bytes */
+#define DICT_POOL_PER_VARYING	4	/* buffer pool max size per data
+					dictionary varying size in bytes */
+
+/* Identifies generated InnoDB foreign key names */
+static char	dict_ibfk[] = "_ibfk_";
+
+/***********************************************************************
+Tries to find column names for the index and sets the col field of the
+index. */
+static
+void
+dict_index_find_cols(
+/*=================*/
+	dict_table_t*	table,	/* in: table */
+	dict_index_t*	index);	/* in: index */
+/***********************************************************************
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user. */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+				/* out, own: the internal
+				representation of the clustered
+				index */
+	const dict_table_t*	table,	/* in: table */
+	dict_index_t*		index);	/* in: user representation of
+					a clustered index */
+/***********************************************************************
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
*/ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + /* out, own: the internal + representation of the non-clustered + index */ + const dict_table_t* table, /* in: table */ + dict_index_t* index); /* in: user representation of + a non-clustered index */ +/************************************************************************** +Removes a foreign constraint struct from the dictionary cache. */ +static +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign); /* in, own: foreign constraint */ +/************************************************************************** +Prints a column data. */ +static +void +dict_col_print_low( +/*===============*/ + const dict_table_t* table, /* in: table */ + const dict_col_t* col); /* in: column */ +/************************************************************************** +Prints an index data. */ +static +void +dict_index_print_low( +/*=================*/ + dict_index_t* index); /* in: index */ +/************************************************************************** +Prints a field data. */ +static +void +dict_field_print_low( +/*=================*/ + dict_field_t* field); /* in: field */ +/************************************************************************* +Frees a foreign key struct. */ +static +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign); /* in, own: foreign key struct */ + +/* Stream for storing detailed information about the latest foreign key +and unique key errors */ +UNIV_INTERN FILE* dict_foreign_err_file = NULL; +/* mutex protecting the foreign and unique error buffers */ +UNIV_INTERN mutex_t dict_foreign_err_mutex; + +#ifndef UNIV_HOTBACKUP +/********************************************************************** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +dict_casedn_str( +/*============*/ + char* a) /* in/out: string to put in lower case */ +{ + innobase_casedn_str(a); +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************************ +Checks if the database name in two table names is the same. */ +UNIV_INTERN +ibool +dict_tables_have_same_db( +/*=====================*/ + /* out: TRUE if same db name */ + const char* name1, /* in: table name in the form + dbname '/' tablename */ + const char* name2) /* in: table name in the form + dbname '/' tablename */ +{ + for (; *name1 == *name2; name1++, name2++) { + if (*name1 == '/') { + return(TRUE); + } + ut_a(*name1); /* the names must contain '/' */ + } + return(FALSE); +} + +/************************************************************************ +Return the end of table name where we have removed dbname and '/'. */ +UNIV_INTERN +const char* +dict_remove_db_name( +/*================*/ + /* out: table name */ + const char* name) /* in: table name in the form + dbname '/' tablename */ +{ + const char* s = strchr(name, '/'); + ut_a(s); + + return(s + 1); +} + +/************************************************************************ +Get the database name length in a table name. */ +UNIV_INTERN +ulint +dict_get_db_name_len( +/*=================*/ + /* out: database name length */ + const char* name) /* in: table name in the form + dbname '/' tablename */ +{ + const char* s; + s = strchr(name, '/'); + ut_a(s); + return(s - name); +} + +/************************************************************************ +Reserves the dictionary system mutex for MySQL. 
*/ +UNIV_INTERN +void +dict_mutex_enter_for_mysql(void) +/*============================*/ +{ + mutex_enter(&(dict_sys->mutex)); +} + +/************************************************************************ +Releases the dictionary system mutex for MySQL. */ +UNIV_INTERN +void +dict_mutex_exit_for_mysql(void) +/*===========================*/ +{ + mutex_exit(&(dict_sys->mutex)); +} + +/************************************************************************ +Decrements the count of open MySQL handles to a table. */ +UNIV_INTERN +void +dict_table_decrement_handle_count( +/*==============================*/ + dict_table_t* table, /* in/out: table */ + ibool dict_locked) /* in: TRUE=data dictionary locked */ +{ + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + + ut_ad(mutex_own(&dict_sys->mutex)); + ut_a(table->n_mysql_handles_opened > 0); + + table->n_mysql_handles_opened--; + + if (!dict_locked) { + mutex_exit(&dict_sys->mutex); + } +} + +/************************************************************************** +Returns a column's name. */ +UNIV_INTERN +const char* +dict_table_get_col_name( +/*====================*/ + /* out: column name. NOTE: not + guaranteed to stay valid if table is + modified in any way (columns added, + etc.). */ + const dict_table_t* table, /* in: table */ + ulint col_nr) /* in: column number */ +{ + ulint i; + const char* s; + + ut_ad(table); + ut_ad(col_nr < table->n_def); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + s = table->col_names; + if (s) { + for (i = 0; i < col_nr; i++) { + s += strlen(s) + 1; + } + } + + return(s); +} + + +/************************************************************************ +Acquire the autoinc lock.*/ +UNIV_INTERN +void +dict_table_autoinc_lock( +/*====================*/ + dict_table_t* table) /* in/out: table */ +{ + mutex_enter(&table->autoinc_mutex); +} + +/************************************************************************ +Unconditionally set the autoinc counter. */ +UNIV_INTERN +void +dict_table_autoinc_initialize( +/*==========================*/ + dict_table_t* table, /* in/out: table */ + ib_uint64_t value) /* in: next value to assign to a row */ +{ + ut_ad(mutex_own(&table->autoinc_mutex)); + + table->autoinc = value; +} + +/************************************************************************ +Reads the next autoinc value (== autoinc counter value), 0 if not yet +initialized. */ +UNIV_INTERN +ib_uint64_t +dict_table_autoinc_read( +/*====================*/ + /* out: value for a new row, or 0 */ + const dict_table_t* table) /* in: table */ +{ + ut_ad(mutex_own(&table->autoinc_mutex)); + + return(table->autoinc); +} + +/************************************************************************ +Updates the autoinc counter if the value supplied is greater than the +current value. */ +UNIV_INTERN +void +dict_table_autoinc_update_if_greater( +/*=================================*/ + + dict_table_t* table, /* in/out: table */ + ib_uint64_t value) /* in: value which was assigned to a row */ +{ + ut_ad(mutex_own(&table->autoinc_mutex)); + + if (value > table->autoinc) { + + table->autoinc = value; + } +} + +/************************************************************************ +Release the autoinc lock.*/ +UNIV_INTERN +void +dict_table_autoinc_unlock( +/*======================*/ + dict_table_t* table) /* in/out: table */ +{ + mutex_exit(&table->autoinc_mutex); +} + +/************************************************************************** +Looks for an index with the given table and index id. 
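+The search is a linear scan of the table's index list.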
+NOTE that we do not reserve the dictionary mutex. */ +UNIV_INTERN +dict_index_t* +dict_index_get_on_id_low( +/*=====================*/ + /* out: index or NULL if not found + from cache */ + dict_table_t* table, /* in: table */ + dulint id) /* in: index id */ +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index) { + if (0 == ut_dulint_cmp(id, index->id)) { + /* Found */ + + return(index); + } + + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/************************************************************************ +Looks for column n in an index. */ +UNIV_INTERN +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + /* out: position in internal + representation of the index; + if not contained, returns + ULINT_UNDEFINED */ + const dict_index_t* index, /* in: index */ + ulint n) /* in: column number */ +{ + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + col = dict_table_get_nth_col(index->table, n); + + if (dict_index_is_clust(index)) { + + return(dict_col_get_clust_pos(col, index)); + } + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col && field->prefix_len == 0) { + + return(pos); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************ +Returns TRUE if the index contains a column or a prefix of that column. */ +UNIV_INTERN +ibool +dict_index_contains_col_or_prefix( +/*==============================*/ + /* out: TRUE if contains the column + or its prefix */ + const dict_index_t* index, /* in: index */ + ulint n) /* in: column number */ +{ + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + if (dict_index_is_clust(index)) { + + return(TRUE); + } + + col = dict_table_get_nth_col(index->table, n); + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + + return(TRUE); + } + } + + return(FALSE); +} + +/************************************************************************ +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. 
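+For example, a prefix of length 10 in index2 is matched by the full column
+(prefix_len == 0) or by any prefix of length >= 10 in index, whereas a full
+column in index2 is matched only by a full column in index.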
*/ +UNIV_INTERN +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + /* out: position in internal + representation of the index; + if not contained, returns + ULINT_UNDEFINED */ + const dict_index_t* index, /* in: index from which to search */ + const dict_index_t* index2, /* in: index */ + ulint n) /* in: field number in index2 */ +{ + const dict_field_t* field; + const dict_field_t* field2; + ulint n_fields; + ulint pos; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + field2 = dict_index_get_nth_field(index2, n); + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (field->col == field2->col + && (field->prefix_len == 0 + || (field->prefix_len >= field2->prefix_len + && field2->prefix_len != 0))) { + + return(pos); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Returns a table object based on table id. */ +UNIV_INTERN +dict_table_t* +dict_table_get_on_id( +/*=================*/ + /* out: table, NULL if does not exist */ + dulint table_id, /* in: table id */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + + if (ut_dulint_cmp(table_id, DICT_FIELDS_ID) <= 0 + || trx->dict_operation_lock_mode == RW_X_LATCH) { + /* It is a system table which will always exist in the table + cache: we avoid acquiring the dictionary mutex, because + if we are doing a rollback to handle an error in TABLE + CREATE, for example, we already have the mutex! */ + + ut_ad(mutex_own(&(dict_sys->mutex)) + || trx->dict_operation_lock_mode == RW_X_LATCH); + + return(dict_table_get_on_id_low(table_id)); + } + + mutex_enter(&(dict_sys->mutex)); + + table = dict_table_get_on_id_low(table_id); + + mutex_exit(&(dict_sys->mutex)); + + return(table); +} + +/************************************************************************ +Looks for column n position in the clustered index. */ +UNIV_INTERN +ulint +dict_table_get_nth_col_pos( +/*=======================*/ + /* out: position in internal + representation of + the clustered index */ + const dict_table_t* table, /* in: table */ + ulint n) /* in: column number */ +{ + return(dict_index_get_nth_col_pos(dict_table_get_first_index(table), + n)); +} + +/************************************************************************ +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. */ +UNIV_INTERN +ibool +dict_table_col_in_clustered_key( +/*============================*/ + /* out: TRUE if the column, or its + prefix, is in the clustered key */ + const dict_table_t* table, /* in: table */ + ulint n) /* in: column number */ +{ + const dict_index_t* index; + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(table); + + col = dict_table_get_nth_col(table, n); + + index = dict_table_get_first_index(table); + + n_fields = dict_index_get_n_unique(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + + return(TRUE); + } + } + + return(FALSE); +} + +/************************************************************************** +Inits the data dictionary module. 
*/ +UNIV_INTERN +void +dict_init(void) +/*===========*/ +{ + dict_sys = mem_alloc(sizeof(dict_sys_t)); + + mutex_create(&dict_sys->mutex, SYNC_DICT); + + dict_sys->table_hash = hash_create(buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH + * UNIV_WORD_SIZE)); + dict_sys->table_id_hash = hash_create(buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH + * UNIV_WORD_SIZE)); + dict_sys->size = 0; + + UT_LIST_INIT(dict_sys->table_LRU); + + rw_lock_create(&dict_operation_lock, SYNC_DICT_OPERATION); + + dict_foreign_err_file = os_file_create_tmpfile(); + ut_a(dict_foreign_err_file); + + mutex_create(&dict_foreign_err_mutex, SYNC_ANY_LATCH); +} + +/************************************************************************** +Returns a table object and optionally increment its MySQL open handle count. +NOTE! This is a high-level function to be used mainly from outside the +'dict' directory. Inside this directory dict_table_get_low is usually the +appropriate function. */ +UNIV_INTERN +dict_table_t* +dict_table_get( +/*===========*/ + /* out: table, NULL if + does not exist */ + const char* table_name, /* in: table name */ + ibool inc_mysql_count) + /* in: whether to increment the open + handle count on the table */ +{ + dict_table_t* table; + + mutex_enter(&(dict_sys->mutex)); + + table = dict_table_get_low(table_name); + + if (inc_mysql_count && table) { + table->n_mysql_handles_opened++; + } + + mutex_exit(&(dict_sys->mutex)); + + if (table != NULL) { + if (!table->stat_initialized) { + /* If table->ibd_file_missing == TRUE, this will + print an error message and return without doing + anything. */ + dict_update_statistics(table); + } + } + + return(table); +} + +/************************************************************************** +Adds system columns to a table object. */ +UNIV_INTERN +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /* in/out: table */ + mem_heap_t* heap) /* in: temporary heap */ +{ + ut_ad(table); + ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->cached); + + /* NOTE: the system columns MUST be added in the following order + (so that they can be indexed by the numerical value of DATA_ROW_ID, + etc.) and as the last columns of the table memory object. + The clustered index will not always physically contain all + system columns. */ + + dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS, + DATA_ROW_ID | DATA_NOT_NULL, + DATA_ROW_ID_LEN); +#if DATA_ROW_ID != 0 +#error "DATA_ROW_ID != 0" +#endif + dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS, + DATA_TRX_ID | DATA_NOT_NULL, + DATA_TRX_ID_LEN); +#if DATA_TRX_ID != 1 +#error "DATA_TRX_ID != 1" +#endif + dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS, + DATA_ROLL_PTR | DATA_NOT_NULL, + DATA_ROLL_PTR_LEN); +#if DATA_ROLL_PTR != 2 +#error "DATA_ROLL_PTR != 2" +#endif + + /* This check reminds that if a new system column is added to + the program, it should be dealt with here */ +#if DATA_N_SYS_COLS != 3 +#error "DATA_N_SYS_COLS != 3" +#endif +} + +/************************************************************************** +Adds a table object to the dictionary cache. 
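+Besides inserting the table into the name and id hash tables and the LRU
+list, this adds the system columns and computes table->big_rows.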
*/ +UNIV_INTERN +void +dict_table_add_to_cache( +/*====================*/ + dict_table_t* table, /* in: table */ + mem_heap_t* heap) /* in: temporary heap */ +{ + ulint fold; + ulint id_fold; + ulint i; + ulint row_len; + + /* The lower limit for what we consider a "big" row */ +#define BIG_ROW_SIZE 1024 + + ut_ad(mutex_own(&(dict_sys->mutex))); + + dict_table_add_system_columns(table, heap); + + table->cached = TRUE; + + fold = ut_fold_string(table->name); + id_fold = ut_fold_dulint(table->id); + + row_len = 0; + for (i = 0; i < table->n_def; i++) { + ulint col_len = dict_col_get_max_size( + dict_table_get_nth_col(table, i)); + + row_len += col_len; + + /* If we have a single unbounded field, or several gigantic + fields, mark the maximum row size as BIG_ROW_SIZE. */ + if (row_len >= BIG_ROW_SIZE || col_len >= BIG_ROW_SIZE) { + row_len = BIG_ROW_SIZE; + + break; + } + } + + table->big_rows = row_len >= BIG_ROW_SIZE; + + /* Look for a table with the same name: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(name_hash, dict_sys->table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + ut_strcmp(table2->name, table->name) == 0); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different name */ + HASH_SEARCH_ALL(name_hash, dict_sys->table_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + } + + /* Look for a table with the same id: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(id_hash, dict_sys->table_id_hash, id_fold, + dict_table_t*, table2, ut_ad(table2->cached), + ut_dulint_cmp(table2->id, table->id) == 0); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different id */ + HASH_SEARCH_ALL(id_hash, dict_sys->table_id_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + } + + /* Add table to hash table of tables */ + HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, + table); + + /* Add table to hash table of tables based on table id */ + HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold, + table); + /* Add table to LRU list of tables */ + UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); + + dict_sys->size += mem_heap_get_size(table->heap); +} + +/************************************************************************** +Looks for an index with the given id. NOTE that we do not reserve +the dictionary mutex: this function is for emergency purposes like +printing info of a corrupt database page! */ +UNIV_INTERN +dict_index_t* +dict_index_find_on_id_low( +/*======================*/ + /* out: index or NULL if not found from cache */ + dulint id) /* in: index id */ +{ + dict_table_t* table; + dict_index_t* index; + + table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + + while (table) { + index = dict_table_get_first_index(table); + + while (index) { + if (0 == ut_dulint_cmp(id, index->id)) { + /* Found */ + + return(index); + } + + index = dict_table_get_next_index(index); + } + + table = UT_LIST_GET_NEXT(table_LRU, table); + } + + return(NULL); +} + +/************************************************************************** +Renames a table object. 
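+If the table resides in a single-table tablespace, the .ibd file is renamed
+as well; renaming a TEMPORARY TABLE is refused with an error.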
*/ +UNIV_INTERN +ibool +dict_table_rename_in_cache( +/*=======================*/ + /* out: TRUE if success */ + dict_table_t* table, /* in/out: table */ + const char* new_name, /* in: new name */ + ibool rename_also_foreigns)/* in: in ALTER TABLE we want + to preserve the original table name + in constraints which reference it */ +{ + dict_foreign_t* foreign; + dict_index_t* index; + ulint fold; + ulint old_size; + const char* old_name; + + ut_ad(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + + old_size = mem_heap_get_size(table->heap); + old_name = table->name; + + fold = ut_fold_string(new_name); + + /* Look for a table with the same name: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(name_hash, dict_sys->table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + (ut_strcmp(table2->name, new_name) == 0)); + if (UNIV_LIKELY_NULL(table2)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: dictionary cache" + " already contains a table ", stderr); + ut_print_name(stderr, NULL, TRUE, new_name); + fputs("\n" + "InnoDB: cannot rename table ", stderr); + ut_print_name(stderr, NULL, TRUE, old_name); + putc('\n', stderr); + return(FALSE); + } + } + + /* If the table is stored in a single-table tablespace, rename the + .ibd file */ + + if (table->space != 0) { + if (table->dir_path_of_temp_table != NULL) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: trying to rename a" + " TEMPORARY TABLE ", stderr); + ut_print_name(stderr, NULL, TRUE, old_name); + fputs(" (", stderr); + ut_print_filename(stderr, + table->dir_path_of_temp_table); + fputs(" )\n", stderr); + return(FALSE); + } else if (!fil_rename_tablespace(old_name, table->space, + new_name)) { + return(FALSE); + } + } + + /* Remove table from the hash tables of tables */ + HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash, + ut_fold_string(old_name), table); + table->name = mem_heap_strdup(table->heap, new_name); + + /* Add table to hash table of tables */ + HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, + table); + dict_sys->size += (mem_heap_get_size(table->heap) - old_size); + + /* Update the table_name field in indexes */ + index = dict_table_get_first_index(table); + + while (index != NULL) { + index->table_name = table->name; + + index = dict_table_get_next_index(index); + } + + if (!rename_also_foreigns) { + /* In ALTER TABLE we think of the rename table operation + in the direction table -> temporary table (#sql...) + as dropping the table with the old name and creating + a new with the new name. Thus we kind of drop the + constraints from the dictionary cache here. The foreign key + constraints will be inherited to the new table from the + system tables through a call of dict_load_foreigns. */ + + /* Remove the foreign constraints from the cache */ + foreign = UT_LIST_GET_LAST(table->foreign_list); + + while (foreign != NULL) { + dict_foreign_remove_from_cache(foreign); + foreign = UT_LIST_GET_LAST(table->foreign_list); + } + + /* Reset table field in referencing constraints */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign != NULL) { + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + /* Make the list of referencing constraints empty */ + + UT_LIST_INIT(table->referenced_list); + + return(TRUE); + } + + /* Update the table name fields in foreign constraints, and update also + the constraint id of new format >= 4.0.18 constraints. 
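+	(A generated id such as olddb/oldtable_ibfk_3 becomes
+	newdb/newtable_ibfk_3; for an id given by the user, only the
+	database name prefix is replaced.)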
Note that at + this point we have already changed table->name to the new name. */ + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + while (foreign != NULL) { + if (ut_strlen(foreign->foreign_table_name) + < ut_strlen(table->name)) { + /* Allocate a longer name buffer; + TODO: store buf len to save memory */ + + foreign->foreign_table_name + = mem_heap_alloc(foreign->heap, + ut_strlen(table->name) + 1); + } + + strcpy(foreign->foreign_table_name, table->name); + + if (strchr(foreign->id, '/')) { + ulint db_len; + char* old_id; + + /* This is a >= 4.0.18 format id */ + + old_id = mem_strdup(foreign->id); + + if (ut_strlen(foreign->id) > ut_strlen(old_name) + + ((sizeof dict_ibfk) - 1) + && !memcmp(foreign->id, old_name, + ut_strlen(old_name)) + && !memcmp(foreign->id + ut_strlen(old_name), + dict_ibfk, (sizeof dict_ibfk) - 1)) { + + /* This is a generated >= 4.0.18 format id */ + + if (strlen(table->name) > strlen(old_name)) { + foreign->id = mem_heap_alloc( + foreign->heap, + strlen(table->name) + + strlen(old_id) + 1); + } + + /* Replace the prefix 'databasename/tablename' + with the new names */ + strcpy(foreign->id, table->name); + strcat(foreign->id, + old_id + ut_strlen(old_name)); + } else { + /* This is a >= 4.0.18 format id where the user + gave the id name */ + db_len = dict_get_db_name_len(table->name) + 1; + + if (dict_get_db_name_len(table->name) + > dict_get_db_name_len(foreign->id)) { + + foreign->id = mem_heap_alloc( + foreign->heap, + db_len + strlen(old_id) + 1); + } + + /* Replace the database prefix in id with the + one from table->name */ + + ut_memcpy(foreign->id, table->name, db_len); + + strcpy(foreign->id + db_len, + dict_remove_db_name(old_id)); + } + + mem_free(old_id); + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign != NULL) { + if (ut_strlen(foreign->referenced_table_name) + < ut_strlen(table->name)) { + /* Allocate a longer name buffer; + TODO: store buf len to save memory */ + + foreign->referenced_table_name = mem_heap_alloc( + foreign->heap, strlen(table->name) + 1); + } + + strcpy(foreign->referenced_table_name, table->name); + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + return(TRUE); +} + +/************************************************************************** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. */ +UNIV_INTERN +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /* in/out: table object already in cache */ + dulint new_id) /* in: new id to set */ +{ + ut_ad(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the table from the hash table of id's */ + + HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_dulint(table->id), table); + table->id = new_id; + + /* Add the table back to the hash table */ + HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_dulint(table->id), table); +} + +/************************************************************************** +Removes a table object from the dictionary cache. 
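+The table object is passed in with ownership: the struct and its heap
+are freed, so the caller must not use the pointer afterwards. A minimal
+usage sketch (assuming the caller holds dict_sys->mutex and the name is
+in the internal databasename/tablename form):
+
+	table = dict_table_check_if_in_cache_low("test/t1");
+	if (table != NULL) {
+		dict_table_remove_from_cache(table);
+	}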
*/ +UNIV_INTERN +void +dict_table_remove_from_cache( +/*=========================*/ + dict_table_t* table) /* in, own: table */ +{ + dict_foreign_t* foreign; + dict_index_t* index; + ulint size; + + ut_ad(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + +#if 0 + fputs("Removing table ", stderr); + ut_print_name(stderr, table->name, ULINT_UNDEFINED); + fputs(" from dictionary cache\n", stderr); +#endif + + /* Remove the foreign constraints from the cache */ + foreign = UT_LIST_GET_LAST(table->foreign_list); + + while (foreign != NULL) { + dict_foreign_remove_from_cache(foreign); + foreign = UT_LIST_GET_LAST(table->foreign_list); + } + + /* Reset table field in referencing constraints */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign != NULL) { + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + /* Remove the indexes from the cache */ + index = UT_LIST_GET_LAST(table->indexes); + + while (index != NULL) { + dict_index_remove_from_cache(table, index); + index = UT_LIST_GET_LAST(table->indexes); + } + + /* Remove table from the hash tables of tables */ + HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash, + ut_fold_string(table->name), table); + HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_dulint(table->id), table); + + /* Remove table from LRU list of tables */ + UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); + + size = mem_heap_get_size(table->heap); + + ut_ad(dict_sys->size >= size); + + dict_sys->size -= size; + + dict_mem_table_free(table); +} + +/******************************************************************** +If the given column name is reserved for InnoDB system columns, return +TRUE. */ +UNIV_INTERN +ibool +dict_col_name_is_reserved( +/*======================*/ + /* out: TRUE if name is reserved */ + const char* name) /* in: column name */ +{ + /* This check reminds that if a new system column is added to + the program, it should be dealt with here. */ +#if DATA_N_SYS_COLS != 3 +#error "DATA_N_SYS_COLS != 3" +#endif + + static const char* reserved_names[] = { + "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR" + }; + + ulint i; + + for (i = 0; i < UT_ARR_SIZE(reserved_names); i++) { + if (strcmp(name, reserved_names[i]) == 0) { + + return(TRUE); + } + } + + return(FALSE); +} + +/******************************************************************** +If an undo log record for this table might not fit on a single page, +return TRUE. */ +static +ibool +dict_index_too_big_for_undo( +/*========================*/ + /* out: TRUE if the undo log + record could become too big */ + const dict_table_t* table, /* in: table */ + const dict_index_t* new_index) /* in: index */ +{ + /* Make sure that all column prefixes will fit in the undo log record + in trx_undo_page_report_modify() right after trx_undo_page_init(). 
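+The check sums a worst-case fixed overhead of the undo record, the
+maximum size of the ordering columns of the clustered index, and the
+maximum size of the old values of all columns, and reports the index
+as too big when the total can reach UNIV_PAGE_SIZE.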
*/ + + ulint i; + const dict_index_t* clust_index + = dict_table_get_first_index(table); + ulint undo_page_len + = TRX_UNDO_PAGE_HDR - TRX_UNDO_PAGE_HDR_SIZE + + 2 /* next record pointer */ + + 1 /* type_cmpl */ + + 11 /* trx->undo_no */ - 11 /* table->id */ + + 1 /* rec_get_info_bits() */ + + 11 /* DB_TRX_ID */ + + 11 /* DB_ROLL_PTR */ + + 10 + FIL_PAGE_DATA_END /* trx_undo_left() */ + + 2/* pointer to previous undo log record */; + + if (UNIV_UNLIKELY(!clust_index)) { + ut_a(dict_index_is_clust(new_index)); + clust_index = new_index; + } + + /* Add the size of the ordering columns in the + clustered index. */ + for (i = 0; i < clust_index->n_uniq; i++) { + const dict_col_t* col + = dict_index_get_nth_col(clust_index, i); + + /* Use the maximum output size of + mach_write_compressed(), although the encoded + length should always fit in 2 bytes. */ + undo_page_len += 5 + dict_col_get_max_size(col); + } + + /* Add the old values of the columns to be updated. + First, the amount and the numbers of the columns. + These are written by mach_write_compressed() whose + maximum output length is 5 bytes. However, given that + the quantities are below REC_MAX_N_FIELDS (10 bits), + the maximum length is 2 bytes per item. */ + undo_page_len += 2 * (dict_table_get_n_cols(table) + 1); + + for (i = 0; i < clust_index->n_def; i++) { + const dict_col_t* col + = dict_index_get_nth_col(clust_index, i); + ulint max_size + = dict_col_get_max_size(col); + ulint fixed_size + = dict_col_get_fixed_size(col); + + if (fixed_size) { + /* Fixed-size columns are stored locally. */ + max_size = fixed_size; + } else if (max_size <= BTR_EXTERN_FIELD_REF_SIZE * 2) { + /* Short columns are stored locally. */ + } else if (!col->ord_part) { + /* See if col->ord_part would be set + because of new_index. */ + ulint j; + + for (j = 0; j < new_index->n_uniq; j++) { + if (dict_index_get_nth_col( + new_index, j) == col) { + + goto is_ord_part; + } + } + + /* This is not an ordering column in any index. + Thus, it can be stored completely externally. */ + max_size = BTR_EXTERN_FIELD_REF_SIZE; + } else { +is_ord_part: + /* This is an ordering column in some index. + A long enough prefix must be written to the + undo log. See trx_undo_page_fetch_ext(). */ + + if (max_size > REC_MAX_INDEX_COL_LEN) { + max_size = REC_MAX_INDEX_COL_LEN; + } + + max_size += BTR_EXTERN_FIELD_REF_SIZE; + } + + undo_page_len += 5 + max_size; + } + + return(undo_page_len >= UNIV_PAGE_SIZE); +} + +/******************************************************************** +If a record of this index might not fit on a single B-tree page, +return TRUE. */ +static +ibool +dict_index_too_big_for_tree( +/*========================*/ + /* out: TRUE if the index + record could become too big */ + const dict_table_t* table, /* in: table */ + const dict_index_t* new_index) /* in: index */ +{ + ulint zip_size; + ulint comp; + ulint i; + /* maximum possible storage size of a record */ + ulint rec_max_size; + /* maximum allowed size of a record on a leaf page */ + ulint page_rec_max; + /* maximum allowed size of a node pointer record */ + ulint page_ptr_max; + + comp = dict_table_is_comp(table); + zip_size = dict_table_zip_size(table); + + if (zip_size && zip_size < UNIV_PAGE_SIZE) { + /* On a compressed page, two records must fit in the + uncompressed page modification log. On compressed + pages with zip_size == UNIV_PAGE_SIZE, this limit will + never be reached. 
*/
+		ut_ad(comp);
+		/* The maximum allowed record size is the size of
+		an empty page, minus a byte for recording the heap
+		number in the page modification log. The maximum
+		allowed node pointer size is half that. */
+		page_rec_max = page_zip_empty_size(new_index->n_fields,
+						   zip_size) - 1;
+		page_ptr_max = page_rec_max / 2;
+		/* On a compressed page, there is a two-byte entry in
+		the dense page directory for every record. But there
+		is no record header. */
+		rec_max_size = 2;
+	} else {
+		/* The maximum allowed record size is half a B-tree
+		page. No additional sparse page directory entry will
+		be generated for the first few user records. */
+		page_rec_max = page_get_free_space_of_empty(comp) / 2;
+		page_ptr_max = page_rec_max;
+		/* Each record has a header. */
+		rec_max_size = comp
+			? REC_N_NEW_EXTRA_BYTES
+			: REC_N_OLD_EXTRA_BYTES;
+	}
+
+	if (comp) {
+		/* Include the "null" flags in the
+		maximum possible record size. */
+		rec_max_size += UT_BITS_IN_BYTES(new_index->n_nullable);
+	} else {
+		/* For each column, include a 2-byte offset and a
+		"null" flag. The 1-byte format is only used in short
+		records that do not contain externally stored columns.
+		Such records could never exceed the page limit, even
+		when using the 2-byte format. */
+		rec_max_size += 2 * new_index->n_fields;
+	}
+
+	/* Compute the maximum possible record size. */
+	for (i = 0; i < new_index->n_fields; i++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(new_index, i);
+		const dict_col_t*	col
+			= dict_field_get_col(field);
+		ulint			field_max_size;
+		ulint			field_ext_max_size;
+
+		/* In dtuple_convert_big_rec(), variable-length columns
+		that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+		may be chosen for external storage.
+
+		Fixed-length columns, and all columns of secondary
+		index records are always stored inline. */
+
+		/* Determine the maximum length of the index field.
+		The field_ext_max_size should be computed as the worst
+		case in rec_get_converted_size_comp() for
+		REC_STATUS_ORDINARY records. */
+
+		field_max_size = dict_col_get_fixed_size(col);
+		if (field_max_size) {
+			/* dict_index_add_col() should guarantee this */
+			ut_ad(!field->prefix_len
+			      || field->fixed_len == field->prefix_len);
+			/* Fixed lengths are not encoded
+			in ROW_FORMAT=COMPACT. */
+			field_ext_max_size = 0;
+			goto add_field_size;
+		}
+
+		field_max_size = dict_col_get_max_size(col);
+		field_ext_max_size = field_max_size < 256 ? 1 : 2;
+
+		if (field->prefix_len) {
+			if (field->prefix_len < field_max_size) {
+				field_max_size = field->prefix_len;
+			}
+		} else if (field_max_size > BTR_EXTERN_FIELD_REF_SIZE * 2
+			   && dict_index_is_clust(new_index)) {
+
+			/* In the worst case, we have a locally stored
+			column of BTR_EXTERN_FIELD_REF_SIZE * 2 bytes.
+			The length can be stored in one byte. If the
+			column were stored externally, the lengths in
+			the clustered index page would be
+			BTR_EXTERN_FIELD_REF_SIZE and 2. */
+			field_max_size = BTR_EXTERN_FIELD_REF_SIZE * 2;
+			field_ext_max_size = 1;
+		}
+
+		if (comp) {
+			/* Add the extra size for ROW_FORMAT=COMPACT.
+			For ROW_FORMAT=REDUNDANT, these bytes were
+			added to rec_max_size before this loop. */
+			rec_max_size += field_ext_max_size;
+		}
+add_field_size:
+		rec_max_size += field_max_size;
+
+		/* Check the size limit on leaf pages. */
+		if (UNIV_UNLIKELY(rec_max_size >= page_rec_max)) {
+
+			return(TRUE);
+		}
+
+		/* Check the size limit on non-leaf pages.
Records
+		stored in non-leaf B-tree pages consist of the unique
+		columns of the record (the key columns of the B-tree)
+		and a node pointer field. When we have processed the
+		unique columns, rec_max_size equals the size of the
+		node pointer record minus the node pointer column. */
+		if (i + 1 == dict_index_get_n_unique_in_tree(new_index)
+		    && rec_max_size + REC_NODE_PTR_SIZE >= page_ptr_max) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**************************************************************************
+Adds an index to the dictionary cache. */
+UNIV_INTERN
+ulint
+dict_index_add_to_cache(
+/*====================*/
+				/* out: DB_SUCCESS or DB_TOO_BIG_RECORD */
+	dict_table_t*	table,	/* in: table on which the index is */
+	dict_index_t*	index,	/* in, own: index; NOTE! The index memory
+				object is freed in this function! */
+	ulint		page_no,/* in: root page number of the index */
+	ibool		strict)	/* in: TRUE=refuse to create the index
+				if records could be too big to fit in
+				a B-tree page */
+{
+	dict_index_t*	new_index;
+	ulint		n_ord;
+	ulint		i;
+
+	ut_ad(index);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(index->n_def == index->n_fields);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	ut_ad(mem_heap_validate(index->heap));
+	ut_a(!dict_index_is_clust(index)
+	     || UT_LIST_GET_LEN(table->indexes) == 0);
+
+	dict_index_find_cols(table, index);
+
+	/* Build the cache internal representation of the index,
+	containing also the added system fields */
+
+	if (dict_index_is_clust(index)) {
+		new_index = dict_index_build_internal_clust(table, index);
+	} else {
+		new_index = dict_index_build_internal_non_clust(table, index);
+	}
+
+	/* Set the n_fields value in new_index to the actual defined
+	number of fields in the cache internal representation */
+
+	new_index->n_fields = new_index->n_def;
+
+	if (strict && dict_index_too_big_for_tree(table, new_index)) {
+too_big:
+		dict_mem_index_free(new_index);
+		dict_mem_index_free(index);
+		return(DB_TOO_BIG_RECORD);
+	}
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		n_ord = new_index->n_fields;
+	} else {
+		n_ord = new_index->n_uniq;
+	}
+
+	switch (dict_table_get_format(table)) {
+	case DICT_TF_FORMAT_51:
+		/* ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT store
+		prefixes of externally stored columns locally within
+		the record. There are no special considerations for
+		the undo log record size. */
+		goto undo_size_ok;
+
+	case DICT_TF_FORMAT_ZIP:
+		/* In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED,
+		column prefix indexes require that prefixes of
+		externally stored columns are written to the undo log.
+		This may make the undo log record bigger than the
+		record on the B-tree page. The maximum size of an
+		undo log record is the page size. That must be
+		checked for below. */
+		break;
+
+#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
+# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
+#endif
+	}
+
+	for (i = 0; i < n_ord; i++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(new_index, i);
+		const dict_col_t*	col
+			= dict_field_get_col(field);
+
+		/* In dtuple_convert_big_rec(), variable-length columns
+		that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+		may be chosen for external storage. If the column appears
+		as an ordering column of an index, a longer prefix of
+		REC_MAX_INDEX_COL_LEN will be copied to the undo log
+		by trx_undo_page_report_modify() and
+		trx_undo_page_fetch_ext().
It suffices to check the
+		capacity of the undo log whenever new_index includes
+		a column prefix on a column that may be stored externally. */
+
+		if (field->prefix_len /* prefix index */
+		    && !col->ord_part /* not yet ordering column */
+		    && !dict_col_get_fixed_size(col) /* variable-length */
+		    && dict_col_get_max_size(col)
+		    > BTR_EXTERN_FIELD_REF_SIZE * 2 /* long enough */) {
+
+			if (dict_index_too_big_for_undo(table, new_index)) {
+				/* An undo log record might not fit in
+				a single page. Refuse to create this index. */
+
+				goto too_big;
+			}
+
+			break;
+		}
+	}
+
+undo_size_ok:
+	/* Flag the ordering columns */
+
+	for (i = 0; i < n_ord; i++) {
+
+		dict_index_get_nth_field(new_index, i)->col->ord_part = 1;
+	}
+
+	/* Add the new index as the last index for the table */
+
+	UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
+	new_index->table = table;
+	new_index->table_name = table->name;
+
+	new_index->search_info = btr_search_info_create(new_index->heap);
+
+	new_index->stat_index_size = 1;
+	new_index->stat_n_leaf_pages = 1;
+
+	new_index->page = page_no;
+	rw_lock_create(&new_index->lock, SYNC_INDEX_TREE);
+
+	if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) {
+
+		new_index->stat_n_diff_key_vals = mem_heap_alloc(
+			new_index->heap,
+			(1 + dict_index_get_n_unique(new_index))
+			* sizeof(ib_int64_t));
+		/* Give some sensible values to stat_n_... in case we do
+		not calculate statistics quickly enough */
+
+		for (i = 0; i <= dict_index_get_n_unique(new_index); i++) {
+
+			new_index->stat_n_diff_key_vals[i] = 100;
+		}
+	}
+
+	dict_sys->size += mem_heap_get_size(new_index->heap);
+
+	dict_mem_index_free(index);
+
+	return(DB_SUCCESS);
+}
+
+/**************************************************************************
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/* in/out: table */
+	dict_index_t*	index)	/* in, own: index */
+{
+	ulint		size;
+	ulint		retries = 0;
+	btr_search_t*	info;
+
+	ut_ad(table && index);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* We always create the search info, whether or not the adaptive
+	hash index is enabled. */
+	info = index->search_info;
+	ut_ad(info);
+
+	/* We are not allowed to free the in-memory index struct
+	dict_index_t until all entries in the adaptive hash index
+	that point to any of the pages belonging to this B-tree index
+	are dropped. This is because dropping these entries requires
+	access to the dict_index_t struct. To avoid such a scenario,
+	we keep a count of the number of such pages in the search_info
+	and only free the dict_index_t struct when this count drops to
+	zero. */
+
+	for (;;) {
+		ulint ref_count = btr_search_info_get_ref_count(info);
+		if (ref_count == 0) {
+			break;
+		}
+
+		/* Sleep for 10ms before trying again. */
+		os_thread_sleep(10000);
+		++retries;
+
+		if (retries % 500 == 0) {
+			/* No luck after 5 seconds of wait. */
+			fprintf(stderr, "InnoDB: Error: Waited for"
+					" %lu secs for hash index"
+					" ref_count (%lu) to drop"
+					" to 0.\n"
+					"index: \"%s\""
+					" table: \"%s\"\n",
+					retries/100,
+					ref_count,
+					index->name,
+					table->name);
+		}
+
+		/* To avoid a hang here we commit suicide if the
+		ref_count doesn't drop to zero in 600 seconds.
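+		(The loop sleeps 10 ms per retry, so 60000 retries
+		correspond to 600 seconds, and the warning above is
+		printed every 500 retries, i.e. every 5 seconds of
+		waiting.)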
*/ + if (retries >= 60000) { + ut_error; + } + } + + rw_lock_free(&index->lock); + + /* Remove the index from the list of indexes of the table */ + UT_LIST_REMOVE(indexes, table->indexes, index); + + size = mem_heap_get_size(index->heap); + + ut_ad(dict_sys->size >= size); + + dict_sys->size -= size; + + dict_mem_index_free(index); +} + +/*********************************************************************** +Tries to find column names for the index and sets the col field of the +index. */ +static +void +dict_index_find_cols( +/*=================*/ + dict_table_t* table, /* in: table */ + dict_index_t* index) /* in: index */ +{ + ulint i; + + ut_ad(table && index); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(mutex_own(&(dict_sys->mutex))); + + for (i = 0; i < index->n_fields; i++) { + ulint j; + dict_field_t* field = dict_index_get_nth_field(index, i); + + for (j = 0; j < table->n_cols; j++) { + if (!strcmp(dict_table_get_col_name(table, j), + field->name)) { + field->col = dict_table_get_nth_col(table, j); + + goto found; + } + } + + /* It is an error not to find a matching column. */ + ut_error; + +found: + ; + } +} + +/*********************************************************************** +Adds a column to index. */ +UNIV_INTERN +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /* in/out: index */ + const dict_table_t* table, /* in: table */ + dict_col_t* col, /* in: column */ + ulint prefix_len) /* in: column prefix length */ +{ + dict_field_t* field; + const char* col_name; + + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + + dict_mem_index_add_field(index, col_name, prefix_len); + + field = dict_index_get_nth_field(index, index->n_def - 1); + + field->col = col; + field->fixed_len = (unsigned int) dict_col_get_fixed_size(col); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = (unsigned int) prefix_len; + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. */ + + if (field->fixed_len > DICT_MAX_INDEX_COL_LEN) { + field->fixed_len = 0; + } +#if DICT_MAX_INDEX_COL_LEN != 768 + /* The comparison limit above must be constant. If it were + changed, the disk format of some fixed-length columns would + change, which would be a disaster. */ +# error "DICT_MAX_INDEX_COL_LEN != 768" +#endif + + if (!(col->prtype & DATA_NOT_NULL)) { + index->n_nullable++; + } +} + +/*********************************************************************** +Copies fields contained in index2 to index1. */ +static +void +dict_index_copy( +/*============*/ + dict_index_t* index1, /* in: index to copy to */ + dict_index_t* index2, /* in: index to copy from */ + const dict_table_t* table, /* in: table */ + ulint start, /* in: first position to copy */ + ulint end) /* in: last position to copy */ +{ + dict_field_t* field; + ulint i; + + /* Copy fields contained in index2 */ + + for (i = start; i < end; i++) { + + field = dict_index_get_nth_field(index2, i); + dict_index_add_col(index1, table, field->col, + field->prefix_len); + } +} + +/*********************************************************************** +Copies types of fields contained in index to tuple. 
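+A typical use is to prepare a search tuple for the index (a sketch,
+assuming a memory heap and the number of fields to copy are at hand;
+compare dict_table_copy_types() below):
+
+	dtuple_t*	entry = dtuple_create(heap, n_fields);
+	dict_index_copy_types(entry, index, n_fields);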
*/ +UNIV_INTERN +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /* in/out: data tuple */ + const dict_index_t* index, /* in: index */ + ulint n_fields) /* in: number of + field types to copy */ +{ + ulint i; + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + dtuple_set_types_binary(tuple, n_fields); + + return; + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* ifield; + dtype_t* dfield_type; + + ifield = dict_index_get_nth_field(index, i); + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dict_col_copy_type(dict_field_get_col(ifield), dfield_type); + } +} + +/*********************************************************************** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +UNIV_INTERN +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /* in/out: data tuple */ + const dict_table_t* table) /* in: table */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + dfield_t* dfield = dtuple_get_nth_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type(dict_table_get_nth_col(table, i), dtype); + } +} + +/*********************************************************************** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + /* out, own: the internal + representation of the clustered + index */ + const dict_table_t* table, /* in: table */ + dict_index_t* index) /* in: user representation of + a clustered index */ +{ + dict_index_t* new_index; + dict_field_t* field; + ulint fixed_size; + ulint trx_id_pos; + ulint i; + ibool* indexed; + + ut_ad(table && index); + ut_ad(dict_index_is_clust(index)); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Create a new index object with certainly enough fields */ + new_index = dict_mem_index_create(table->name, + index->name, table->space, + index->type, + index->n_fields + table->n_cols); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy the fields of index */ + dict_index_copy(new_index, index, table, 0, index->n_fields); + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + /* No fixed number of fields determines an entry uniquely */ + + new_index->n_uniq = REC_MAX_N_FIELDS; + + } else if (dict_index_is_unique(index)) { + /* Only the fields defined so far are needed to identify + the index entry uniquely */ + + new_index->n_uniq = new_index->n_def; + } else { + /* Also the row id is needed to identify the entry */ + new_index->n_uniq = 1 + new_index->n_def; + } + + new_index->trx_id_offset = 0; + + if (!dict_index_is_ibuf(index)) { + /* Add system columns, trx id first */ + + trx_id_pos = new_index->n_def; + +#if DATA_ROW_ID != 0 +# error "DATA_ROW_ID != 0" +#endif +#if DATA_TRX_ID != 1 +# error "DATA_TRX_ID != 1" +#endif +#if DATA_ROLL_PTR != 2 +# error "DATA_ROLL_PTR != 2" +#endif + + if (!dict_index_is_unique(index)) { + dict_index_add_col(new_index, table, + dict_table_get_sys_col( + table, DATA_ROW_ID), + 0); + trx_id_pos++; + } + + dict_index_add_col(new_index, table, + dict_table_get_sys_col(table, 
DATA_TRX_ID), + 0); + + dict_index_add_col(new_index, table, + dict_table_get_sys_col(table, + DATA_ROLL_PTR), + 0); + + for (i = 0; i < trx_id_pos; i++) { + + fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(new_index, i)); + + if (fixed_size == 0) { + new_index->trx_id_offset = 0; + + break; + } + + if (dict_index_get_nth_field(new_index, i)->prefix_len + > 0) { + new_index->trx_id_offset = 0; + + break; + } + + new_index->trx_id_offset += (unsigned int) fixed_size; + } + + } + + /* Remember the table columns already contained in new_index */ + indexed = mem_zalloc(table->n_cols * sizeof *indexed); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index non-system columns of table not yet included + there */ + for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) { + + dict_col_t* col = dict_table_get_nth_col(table, i); + ut_ad(col->mtype != DATA_SYS); + + if (!indexed[col->ind]) { + dict_index_add_col(new_index, table, col, 0); + } + } + + mem_free(indexed); + + ut_ad(dict_index_is_ibuf(index) + || (UT_LIST_GET_LEN(table->indexes) == 0)); + + new_index->cached = TRUE; + + return(new_index); +} + +/*********************************************************************** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + /* out, own: the internal + representation of the non-clustered + index */ + const dict_table_t* table, /* in: table */ + dict_index_t* index) /* in: user representation of + a non-clustered index */ +{ + dict_field_t* field; + dict_index_t* new_index; + dict_index_t* clust_index; + ulint i; + ibool* indexed; + + ut_ad(table && index); + ut_ad(!dict_index_is_clust(index)); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* The clustered index should be the first in the list of indexes */ + clust_index = UT_LIST_GET_FIRST(table->indexes); + + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + ut_ad(!(clust_index->type & DICT_UNIVERSAL)); + + /* Create a new index */ + new_index = dict_mem_index_create( + table->name, index->name, index->space, index->type, + index->n_fields + 1 + clust_index->n_uniq); + + /* Copy other relevant data from the old index + struct to the new struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, table, 0, index->n_fields); + + /* Remember the table columns already contained in new_index */ + indexed = mem_zalloc(table->n_cols * sizeof *indexed); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index the columns necessary to determine the clustered + index entry uniquely */ + + for (i = 0; i < 
clust_index->n_uniq; i++) {
+
+		field = dict_index_get_nth_field(clust_index, i);
+
+		if (!indexed[field->col->ind]) {
+			dict_index_add_col(new_index, table, field->col,
+					   field->prefix_len);
+		}
+	}
+
+	mem_free(indexed);
+
+	if (dict_index_is_unique(index)) {
+		new_index->n_uniq = index->n_fields;
+	} else {
+		new_index->n_uniq = new_index->n_def;
+	}
+
+	/* Set the n_fields value in new_index to the actual defined
+	number of fields */
+
+	new_index->n_fields = new_index->n_def;
+
+	new_index->cached = TRUE;
+
+	return(new_index);
+}
+
+/*====================== FOREIGN KEY PROCESSING ========================*/
+
+/*************************************************************************
+Checks if a table is referenced by foreign keys. */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+				/* out: TRUE if table is referenced
+				by a foreign key */
+	const dict_table_t*	table)	/* in: InnoDB table */
+{
+	return(UT_LIST_GET_LEN(table->referenced_list) > 0);
+}
+
+/*************************************************************************
+Checks if the index is referenced by a foreign key; if so, returns the
+foreign key struct, else returns NULL. */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+			/* out: pointer to foreign key struct if index
+			is defined for foreign key, otherwise NULL */
+	dict_table_t*	table,	/* in: InnoDB table */
+	dict_index_t*	index)	/* in: InnoDB index */
+{
+	dict_foreign_t*	foreign;
+
+	ut_ad(index != NULL);
+	ut_ad(table != NULL);
+
+	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
+	     foreign;
+	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
+
+		if (foreign->referenced_index == index) {
+
+			return(foreign);
+		}
+	}
+
+	return(NULL);
+}
+
+/*************************************************************************
+Checks if an index is defined for a foreign key constraint. An index is
+part of a foreign key constraint if it is referenced by a foreign key,
+or if it is itself a foreign key index. */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+			/* out: pointer to foreign key struct if index
+			is defined for foreign key, otherwise NULL */
+	dict_table_t*	table,	/* in: InnoDB table */
+	dict_index_t*	index)	/* in: InnoDB index */
+{
+	dict_foreign_t*	foreign;
+
+	ut_ad(index != NULL);
+	ut_ad(table != NULL);
+
+	for (foreign = UT_LIST_GET_FIRST(table->foreign_list);
+	     foreign;
+	     foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+
+		if (foreign->foreign_index == index
+		    || foreign->referenced_index == index) {
+
+			return(foreign);
+		}
+	}
+
+	return(NULL);
+}
+
+/*************************************************************************
+Frees a foreign key struct. */
+static
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign)	/* in, own: foreign key struct */
+{
+	mem_heap_free(foreign->heap);
+}
+
+/**************************************************************************
+Removes a foreign constraint struct from the dictionary cache.
*/ +static +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /* in, own: foreign constraint */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_a(foreign); + + if (foreign->referenced_table) { + UT_LIST_REMOVE(referenced_list, + foreign->referenced_table->referenced_list, + foreign); + } + + if (foreign->foreign_table) { + UT_LIST_REMOVE(foreign_list, + foreign->foreign_table->foreign_list, + foreign); + } + + dict_foreign_free(foreign); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************** +Looks for the foreign constraint from the foreign and referenced lists +of a table. */ +static +dict_foreign_t* +dict_foreign_find( +/*==============*/ + /* out: foreign constraint */ + dict_table_t* table, /* in: table object */ + const char* id) /* in: foreign constraint id */ +{ + dict_foreign_t* foreign; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + while (foreign) { + if (ut_strcmp(id, foreign->id) == 0) { + + return(foreign); + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign) { + if (ut_strcmp(id, foreign->id) == 0) { + + return(foreign); + } + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + return(NULL); +} + +/************************************************************************* +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. */ +static +dict_index_t* +dict_foreign_find_index( +/*====================*/ + /* out: matching index, NULL if not found */ + dict_table_t* table, /* in: table */ + const char** columns,/* in: array of column names */ + ulint n_cols, /* in: number of columns */ + dict_index_t* types_idx, /* in: NULL or an index to whose types the + column types must match */ + ibool check_charsets, + /* in: whether to check charsets. + only has an effect if types_idx != NULL */ + ulint check_null) + /* in: nonzero if none of the columns must + be declared NOT NULL */ +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index != NULL) { + /* Ignore matches that refer to the same instance + or the index is to be dropped */ + if (index->to_be_dropped || types_idx == index) { + + goto next_rec; + + } else if (dict_index_get_n_fields(index) >= n_cols) { + ulint i; + + for (i = 0; i < n_cols; i++) { + dict_field_t* field; + const char* col_name; + + field = dict_index_get_nth_field(index, i); + + col_name = dict_table_get_col_name( + table, dict_col_get_no(field->col)); + + if (field->prefix_len != 0) { + /* We do not accept column prefix + indexes here */ + + break; + } + + if (0 != innobase_strcasecmp(columns[i], + col_name)) { + break; + } + + if (check_null + && (field->col->prtype & DATA_NOT_NULL)) { + + return(NULL); + } + + if (types_idx && !cmp_cols_are_equal( + dict_index_get_nth_col(index, i), + dict_index_get_nth_col(types_idx, + i), + check_charsets)) { + + break; + } + } + + if (i == n_cols) { + /* We found a matching index */ + + return(index); + } + } + +next_rec: + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/************************************************************************** +Find an index that is equivalent to the one passed in and is not marked +for deletion. 
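+Here "equivalent" means: the index contains the foreign key columns as
+its first fields, in the same order, with matching column types and
+charsets (see dict_foreign_find_index() above).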
*/ +UNIV_INTERN +dict_index_t* +dict_foreign_find_equiv_index( +/*==========================*/ + /* out: index equivalent to + foreign->foreign_index, or NULL */ + dict_foreign_t* foreign)/* in: foreign key */ +{ + ut_a(foreign != NULL); + + /* Try to find an index which contains the columns as the + first fields and in the right order, and the types are the + same as in foreign->foreign_index */ + + return(dict_foreign_find_index( + foreign->foreign_table, + foreign->foreign_col_names, foreign->n_fields, + foreign->foreign_index, TRUE, /* check types */ + FALSE/* allow columns to be NULL */)); +} + +/************************************************************************** +Returns an index object by matching on the name and column names and +if more than one index matches return the index with the max id */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_by_max_id( +/*===========================*/ + /* out: matching index, NULL if not found */ + dict_table_t* table, /* in: table */ + const char* name, /* in: the index name to find */ + const char** columns,/* in: array of column names */ + ulint n_cols) /* in: number of columns */ +{ + dict_index_t* index; + dict_index_t* found; + + found = NULL; + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (ut_strcmp(index->name, name) == 0 + && dict_index_get_n_ordering_defined_by_user(index) + == n_cols) { + + ulint i; + + for (i = 0; i < n_cols; i++) { + dict_field_t* field; + const char* col_name; + + field = dict_index_get_nth_field(index, i); + + col_name = dict_table_get_col_name( + table, dict_col_get_no(field->col)); + + if (0 != innobase_strcasecmp( + columns[i], col_name)) { + + break; + } + } + + if (i == n_cols) { + /* We found a matching index, select + the index with the higher id*/ + + if (!found + || ut_dulint_cmp(index->id, found->id) > 0) { + + found = index; + } + } + } + + index = dict_table_get_next_index(index); + } + + return(found); +} + +/************************************************************************** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report_low( +/*==========================*/ + FILE* file, /* in: output stream */ + const char* name) /* in: table name */ +{ + rewind(file); + ut_print_timestamp(file); + fprintf(file, " Error in foreign key constraint of table %s:\n", + name); +} + +/************************************************************************** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report( +/*======================*/ + FILE* file, /* in: output stream */ + dict_foreign_t* fk, /* in: foreign key constraint */ + const char* msg) /* in: the error message */ +{ + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(file, fk->foreign_table_name); + fputs(msg, file); + fputs(" Constraint:\n", file); + dict_print_info_on_foreign_key_in_create_format(file, NULL, fk, TRUE); + putc('\n', file); + if (fk->foreign_index) { + fputs("The index in the foreign key in table is ", file); + ut_print_name(file, NULL, FALSE, fk->foreign_index->name); + fputs("\n" + "See http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-foreign-key-constraints.html\n" + "for correct foreign key definition.\n", + file); + } + mutex_exit(&dict_foreign_err_mutex); +} + +/************************************************************************** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in. 
+At least one of the foreign table and the referenced table must already +be in the dictionary cache! */ +UNIV_INTERN +ulint +dict_foreign_add_to_cache( +/*======================*/ + /* out: DB_SUCCESS or error code */ + dict_foreign_t* foreign, /* in, own: foreign key constraint */ + ibool check_charsets) /* in: TRUE=check charset + compatibility */ +{ + dict_table_t* for_table; + dict_table_t* ref_table; + dict_foreign_t* for_in_cache = NULL; + dict_index_t* index; + ibool added_to_referenced_list= FALSE; + FILE* ef = dict_foreign_err_file; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + for_table = dict_table_check_if_in_cache_low( + foreign->foreign_table_name); + + ref_table = dict_table_check_if_in_cache_low( + foreign->referenced_table_name); + ut_a(for_table || ref_table); + + if (for_table) { + for_in_cache = dict_foreign_find(for_table, foreign->id); + } + + if (!for_in_cache && ref_table) { + for_in_cache = dict_foreign_find(ref_table, foreign->id); + } + + if (for_in_cache) { + /* Free the foreign object */ + mem_heap_free(foreign->heap); + } else { + for_in_cache = foreign; + } + + if (for_in_cache->referenced_table == NULL && ref_table) { + index = dict_foreign_find_index( + ref_table, + for_in_cache->referenced_col_names, + for_in_cache->n_fields, for_in_cache->foreign_index, + check_charsets, FALSE); + + if (index == NULL) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in referenced table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "referenced table do not match" + " the ones in table."); + + if (for_in_cache == foreign) { + mem_heap_free(foreign->heap); + } + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->referenced_table = ref_table; + for_in_cache->referenced_index = index; + UT_LIST_ADD_LAST(referenced_list, + ref_table->referenced_list, + for_in_cache); + added_to_referenced_list = TRUE; + } + + if (for_in_cache->foreign_table == NULL && for_table) { + index = dict_foreign_find_index( + for_table, + for_in_cache->foreign_col_names, + for_in_cache->n_fields, + for_in_cache->referenced_index, check_charsets, + for_in_cache->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL)); + + if (index == NULL) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in the table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "table do not match" + " the ones in the referenced table\n" + "or one of the ON ... SET NULL columns" + " is declared NOT NULL."); + + if (for_in_cache == foreign) { + if (added_to_referenced_list) { + UT_LIST_REMOVE( + referenced_list, + ref_table->referenced_list, + for_in_cache); + } + + mem_heap_free(foreign->heap); + } + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->foreign_table = for_table; + for_in_cache->foreign_index = index; + UT_LIST_ADD_LAST(foreign_list, + for_table->foreign_list, + for_in_cache); + } + + return(DB_SUCCESS); +} + +/************************************************************************* +Scans from pointer onwards. Stops if is at the start of a copy of +'string' where characters are compared without case sensitivity, and +only outside `` or "" quotes. Stops also at '\0'. 
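+For example, when scanning for "FOREIGN" in
+	ADD CONSTRAINT `a_FOREIGN_b` FOREIGN KEY ...
+the backquoted identifier is skipped and the scan stops at the keyword
+FOREIGN.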
*/ +static +const char* +dict_scan_to( +/*=========*/ + /* out: scanned up to this */ + const char* ptr, /* in: scan from */ + const char* string) /* in: look for this */ +{ + char quote = '\0'; + + for (; *ptr; ptr++) { + if (*ptr == quote) { + /* Closing quote character: do not look for + starting quote or the keyword. */ + quote = '\0'; + } else if (quote) { + /* Within quotes: do nothing. */ + } else if (*ptr == '`' || *ptr == '"') { + /* Starting quote: remember the quote character. */ + quote = *ptr; + } else { + /* Outside quotes: look for the keyword. */ + ulint i; + for (i = 0; string[i]; i++) { + if (toupper((int)(unsigned char)(ptr[i])) + != toupper((int)(unsigned char) + (string[i]))) { + goto nomatch; + } + } + break; +nomatch: + ; + } + } + + return(ptr); +} + +/************************************************************************* +Accepts a specified string. Comparisons are case-insensitive. */ +static +const char* +dict_accept( +/*========*/ + /* out: if string was accepted, the pointer + is moved after that, else ptr is returned */ + struct charset_info_st* cs,/* in: the character set of ptr */ + const char* ptr, /* in: scan from this */ + const char* string, /* in: accept only this string as the next + non-whitespace string */ + ibool* success)/* out: TRUE if accepted */ +{ + const char* old_ptr = ptr; + const char* old_ptr2; + + *success = FALSE; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + old_ptr2 = ptr; + + ptr = dict_scan_to(ptr, string); + + if (*ptr == '\0' || old_ptr2 != ptr) { + return(old_ptr); + } + + *success = TRUE; + + return(ptr + ut_strlen(string)); +} + +/************************************************************************* +Scans an id. For the lexical definition of an 'id', see the code below. +Strips backquotes or double quotes from around the id. 
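+Within a quoted id, a doubled quote character stands for the quote
+character itself; for example, the quoted id `a``b` is scanned to the
+three-character id a`b.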
*/ +static +const char* +dict_scan_id( +/*=========*/ + /* out: scanned to */ + struct charset_info_st* cs,/* in: the character set of ptr */ + const char* ptr, /* in: scanned to */ + mem_heap_t* heap, /* in: heap where to allocate the id + (NULL=id will not be allocated, but it + will point to string near ptr) */ + const char** id, /* out,own: the id; NULL if no id was + scannable */ + ibool table_id,/* in: TRUE=convert the allocated id + as a table name; FALSE=convert to UTF-8 */ + ibool accept_also_dot) + /* in: TRUE if also a dot can appear in a + non-quoted id; in a quoted id it can appear + always */ +{ + char quote = '\0'; + ulint len = 0; + const char* s; + char* str; + char* dst; + + *id = NULL; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + if (*ptr == '\0') { + + return(ptr); + } + + if (*ptr == '`' || *ptr == '"') { + quote = *ptr++; + } + + s = ptr; + + if (quote) { + for (;;) { + if (!*ptr) { + /* Syntax error */ + return(ptr); + } + if (*ptr == quote) { + ptr++; + if (*ptr != quote) { + break; + } + } + ptr++; + len++; + } + } else { + while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')' + && (accept_also_dot || *ptr != '.') + && *ptr != ',' && *ptr != '\0') { + + ptr++; + } + + len = ptr - s; + } + + if (UNIV_UNLIKELY(!heap)) { + /* no heap given: id will point to source string */ + *id = s; + return(ptr); + } + + if (quote) { + char* d; + str = d = mem_heap_alloc(heap, len + 1); + while (len--) { + if ((*d++ = *s++) == quote) { + s++; + } + } + *d++ = 0; + len = d - str; + ut_ad(*s == quote); + ut_ad(s + 1 == ptr); + } else { + str = mem_heap_strdupl(heap, s, len); + } + + if (!table_id) { +convert_id: + /* Convert the identifier from connection character set + to UTF-8. */ + len = 3 * len + 1; + *id = dst = mem_heap_alloc(heap, len); + + innobase_convert_from_id(cs, dst, str, len); + } else if (!strncmp(str, srv_mysql50_table_name_prefix, + sizeof srv_mysql50_table_name_prefix)) { + /* This is a pre-5.1 table name + containing chars other than [A-Za-z0-9]. + Discard the prefix and use raw UTF-8 encoding. */ + str += sizeof srv_mysql50_table_name_prefix; + len -= sizeof srv_mysql50_table_name_prefix; + goto convert_id; + } else { + /* Encode using filename-safe characters. */ + len = 5 * len + 1; + *id = dst = mem_heap_alloc(heap, len); + + innobase_convert_from_table_id(cs, dst, str, len); + } + + return(ptr); +} + +/************************************************************************* +Tries to scan a column name. 
*/ +static +const char* +dict_scan_col( +/*==========*/ + /* out: scanned to */ + struct charset_info_st* cs, /* in: the character set of ptr */ + const char* ptr, /* in: scanned to */ + ibool* success,/* out: TRUE if success */ + dict_table_t* table, /* in: table in which the column is */ + const dict_col_t** column, /* out: pointer to column if success */ + mem_heap_t* heap, /* in: heap where to allocate */ + const char** name) /* out,own: the column name; + NULL if no name was scannable */ +{ + ulint i; + + *success = FALSE; + + ptr = dict_scan_id(cs, ptr, heap, name, FALSE, TRUE); + + if (*name == NULL) { + + return(ptr); /* Syntax error */ + } + + if (table == NULL) { + *success = TRUE; + *column = NULL; + } else { + for (i = 0; i < dict_table_get_n_cols(table); i++) { + + const char* col_name = dict_table_get_col_name( + table, i); + + if (0 == innobase_strcasecmp(col_name, *name)) { + /* Found */ + + *success = TRUE; + *column = dict_table_get_nth_col(table, i); + strcpy((char*) *name, col_name); + + break; + } + } + } + + return(ptr); +} + +/************************************************************************* +Scans a table name from an SQL string. */ +static +const char* +dict_scan_table_name( +/*=================*/ + /* out: scanned to */ + struct charset_info_st* cs,/* in: the character set of ptr */ + const char* ptr, /* in: scanned to */ + dict_table_t** table, /* out: table object or NULL */ + const char* name, /* in: foreign key table name */ + ibool* success,/* out: TRUE if ok name found */ + mem_heap_t* heap, /* in: heap where to allocate the id */ + const char** ref_name)/* out,own: the table name; + NULL if no name was scannable */ +{ + const char* database_name = NULL; + ulint database_name_len = 0; + const char* table_name = NULL; + ulint table_name_len; + const char* scan_name; + char* ref; + + *success = FALSE; + *table = NULL; + + ptr = dict_scan_id(cs, ptr, heap, &scan_name, TRUE, FALSE); + + if (scan_name == NULL) { + + return(ptr); /* Syntax error */ + } + + if (*ptr == '.') { + /* We scanned the database name; scan also the table name */ + + ptr++; + + database_name = scan_name; + database_name_len = strlen(database_name); + + ptr = dict_scan_id(cs, ptr, heap, &table_name, TRUE, FALSE); + + if (table_name == NULL) { + + return(ptr); /* Syntax error */ + } + } else { + /* To be able to read table dumps made with InnoDB-4.0.17 or + earlier, we must allow the dot separator between the database + name and the table name also to appear within a quoted + identifier! InnoDB used to print a constraint as: + ... REFERENCES `databasename.tablename` ... + starting from 4.0.18 it is + ... REFERENCES `databasename`.`tablename` ... */ + const char* s; + + for (s = scan_name; *s; s++) { + if (*s == '.') { + database_name = scan_name; + database_name_len = s - scan_name; + scan_name = ++s; + break;/* to do: multiple dots? */ + } + } + + table_name = scan_name; + } + + if (database_name == NULL) { + /* Use the database name of the foreign key table */ + + database_name = name; + database_name_len = dict_get_db_name_len(name); + } + + table_name_len = strlen(table_name); + + /* Copy database_name, '/', table_name, '\0' */ + ref = mem_heap_alloc(heap, database_name_len + table_name_len + 2); + memcpy(ref, database_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); +#ifndef __WIN__ + if (srv_lower_case_table_names) { +#endif /* !__WIN__ */ + /* The table name is always put to lower case on Windows. 
*/
+	innobase_casedn_str(ref);
+#ifndef __WIN__
+	}
+#endif /* !__WIN__ */
+
+	*success = TRUE;
+	*ref_name = ref;
+	*table = dict_table_get_low(ref);
+
+	return(ptr);
+}
+
+/*************************************************************************
+Skips one id. The id is allowed to contain also '.'. */
+static
+const char*
+dict_skip_word(
+/*===========*/
+				/* out: scanned to */
+	struct charset_info_st*	cs,/* in: the character set of ptr */
+	const char*	ptr,	/* in: scanned to */
+	ibool*		success)/* out: TRUE if success, FALSE if just spaces
+				left in string or a syntax error */
+{
+	const char*	start;
+
+	*success = FALSE;
+
+	ptr = dict_scan_id(cs, ptr, NULL, &start, FALSE, TRUE);
+
+	if (start) {
+		*success = TRUE;
+	}
+
+	return(ptr);
+}
+
+/*************************************************************************
+Removes MySQL comments from an SQL string. A comment is either
+(a) '#' to the end of the line,
+(b) '--[space]' to the end of the line, or
+(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar
+C comment syntax). */
+static
+char*
+dict_strip_comments(
+/*================*/
+					/* out, own: SQL string stripped from
+					comments; the caller must free this
+					with mem_free()! */
+	const char*	sql_string)	/* in: SQL string */
+{
+	char*		str;
+	const char*	sptr;
+	char*		ptr;
+	/* unclosed quote character (0 if none) */
+	char		quote	= 0;
+
+	str = mem_alloc(strlen(sql_string) + 1);
+
+	sptr = sql_string;
+	ptr = str;
+
+	for (;;) {
+scan_more:
+		if (*sptr == '\0') {
+			*ptr = '\0';
+
+			ut_a(ptr <= str + strlen(sql_string));
+
+			return(str);
+		}
+
+		if (*sptr == quote) {
+			/* Closing quote character: do not look for
+			starting quote or comments. */
+			quote = 0;
+		} else if (quote) {
+			/* Within quotes: do not look for
+			starting quotes or comments. */
+		} else if (*sptr == '"' || *sptr == '`') {
+			/* Starting quote: remember the quote character. */
+			quote = *sptr;
+		} else if (*sptr == '#'
+			   || (sptr[0] == '-' && sptr[1] == '-'
+			       && sptr[2] == ' ')) {
+			for (;;) {
+				/* In Unix a newline is 0x0A while in Windows
+				it is 0x0D followed by 0x0A */
+
+				if (*sptr == (char)0x0A
+				    || *sptr == (char)0x0D
+				    || *sptr == '\0') {
+
+					goto scan_more;
+				}
+
+				sptr++;
+			}
+		} else if (!quote && *sptr == '/' && *(sptr + 1) == '*') {
+			for (;;) {
+				if (*sptr == '*' && *(sptr + 1) == '/') {
+
+					sptr += 2;
+
+					goto scan_more;
+				}
+
+				if (*sptr == '\0') {
+
+					goto scan_more;
+				}
+
+				sptr++;
+			}
+		}
+
+		*ptr = *sptr;
+
+		ptr++;
+		sptr++;
+	}
+}
+
+/*************************************************************************
+Finds the highest <number> for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_<number>.
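+For example, if table test/t1 has the constraints test/t1_ibfk_3 and
+test/t1_ibfk_9, this function returns 9.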
*/ +static +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + /* out: highest number, 0 if table has no new + format foreign key constraints */ + dict_table_t* table) /* in: table in the dictionary memory cache */ +{ + dict_foreign_t* foreign; + char* endp; + ulint biggest_id = 0; + ulint id; + ulint len; + + ut_a(table); + + len = ut_strlen(table->name); + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + while (foreign) { + if (ut_strlen(foreign->id) > ((sizeof dict_ibfk) - 1) + len + && 0 == ut_memcmp(foreign->id, table->name, len) + && 0 == ut_memcmp(foreign->id + len, + dict_ibfk, (sizeof dict_ibfk) - 1) + && foreign->id[len + ((sizeof dict_ibfk) - 1)] != '0') { + /* It is of the >= 4.0.18 format */ + + id = strtoul(foreign->id + len + + ((sizeof dict_ibfk) - 1), + &endp, 10); + if (*endp == '\0') { + ut_a(id != biggest_id); + + if (id > biggest_id) { + biggest_id = id; + } + } + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + return(biggest_id); +} + +/************************************************************************* +Reports a simple foreign key create clause syntax error. */ +static +void +dict_foreign_report_syntax_err( +/*===========================*/ + const char* name, /* in: table name */ + const char* start_of_latest_foreign, + /* in: start of the foreign key clause + in the SQL string */ + const char* ptr) /* in: place of the syntax error */ +{ + FILE* ef = dict_foreign_err_file; + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nSyntax error close to:\n%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); +} + +/************************************************************************* +Scans a table create SQL string and adds to the data dictionary the foreign +key constraints declared in the string. This function should be called after +the indexes for a table have been created. Each foreign key constraint must +be accompanied with indexes in both participating tables. The indexes are +allowed to contain more fields than mentioned in the constraint. */ +static +ulint +dict_create_foreign_constraints_low( +/*================================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap, /* in: memory heap */ + struct charset_info_st* cs,/* in: the character set of sql_string */ + const char* sql_string, + /* in: CREATE TABLE or ALTER TABLE statement + where foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES table2(c, d), + table2 can be written also with the database + name before it: test.table2; the default + database is the database of parameter name */ + const char* name, /* in: table full name in the normalized form + database_name/table_name */ + ibool reject_fks) + /* in: if TRUE, fail with error code + DB_CANNOT_ADD_CONSTRAINT if any foreign + keys are found. 
*/ +{ + dict_table_t* table; + dict_table_t* referenced_table; + dict_table_t* table_to_alter; + ulint highest_id_so_far = 0; + dict_index_t* index; + dict_foreign_t* foreign; + const char* ptr = sql_string; + const char* start_of_latest_foreign = sql_string; + FILE* ef = dict_foreign_err_file; + const char* constraint_name; + ibool success; + ulint error; + const char* ptr1; + const char* ptr2; + ulint i; + ulint j; + ibool is_on_delete; + ulint n_on_deletes; + ulint n_on_updates; + const dict_col_t*columns[500]; + const char* column_names[500]; + const char* referenced_table_name; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = dict_table_get_low(name); + + if (table == NULL) { + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, + "Cannot find the table in the internal" + " data dictionary of InnoDB.\n" + "Create table statement:\n%s\n", sql_string); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_ERROR); + } + + /* First check if we are actually doing an ALTER TABLE, and in that + case look for the table being altered */ + + ptr = dict_accept(cs, ptr, "ALTER", &success); + + if (!success) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "TABLE", &success); + + if (!success) { + + goto loop; + } + + /* We are doing an ALTER TABLE: scan the table name we are altering */ + + ptr = dict_scan_table_name(cs, ptr, &table_to_alter, name, + &success, heap, &referenced_table_name); + if (!success) { + fprintf(stderr, + "InnoDB: Error: could not find" + " the table being ALTERED in:\n%s\n", + sql_string); + + return(DB_ERROR); + } + + /* Starting from 4.0.18 and 4.1.2, we generate foreign key id's in the + format databasename/tablename_ibfk_<number>, where <number> is local + to the table; look for the highest <number> for table_to_alter, so + that we can assign to new constraints higher numbers. */ + + /* If we are altering a temporary table, the table name after ALTER + TABLE does not correspond to the internal table name, and + table_to_alter is NULL. TODO: should we fix this somehow? */ + + if (table_to_alter == NULL) { + highest_id_so_far = 0; + } else { + highest_id_so_far = dict_table_get_highest_foreign_id( + table_to_alter); + } + + /* Scan for foreign key declarations in a loop */ +loop: + /* Scan either to "CONSTRAINT" or "FOREIGN", whichever is closer */ + + ptr1 = dict_scan_to(ptr, "CONSTRAINT"); + ptr2 = dict_scan_to(ptr, "FOREIGN"); + + constraint_name = NULL; + + if (ptr1 < ptr2) { + /* The user may have specified a constraint name. Pick it so + that we can store 'databasename/constraintname' as the id of + the constraint to system tables. */ + ptr = ptr1; + + ptr = dict_accept(cs, ptr, "CONSTRAINT", &success); + + ut_a(success); + + if (!my_isspace(cs, *ptr) && *ptr != '"' && *ptr != '`') { + goto loop; + } + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + /* read constraint name unless got "CONSTRAINT FOREIGN" */ + if (ptr != ptr2) { + ptr = dict_scan_id(cs, ptr, heap, + &constraint_name, FALSE, FALSE); + } + } else { + ptr = ptr2; + } + + if (*ptr == '\0') { + /* The proper way to reject foreign keys for temporary + tables would be to split the lexing and syntactical + analysis of foreign key clauses from the actual adding + of them, so that ha_innodb.cc could first parse the SQL + command, determine if there are any foreign keys, and + if so, immediately reject the command if the table is a + temporary one. For now, this kludge will work.
*/ + if (reject_fks && (UT_LIST_GET_LEN(table->foreign_list) > 0)) { + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /**********************************************************/ + /* The following call adds the foreign key constraints + to the data dictionary system tables on disk */ + + error = dict_create_add_foreigns_to_dictionary( + highest_id_so_far, table, trx); + return(error); + } + + start_of_latest_foreign = ptr; + + ptr = dict_accept(cs, ptr, "FOREIGN", &success); + + if (!success) { + goto loop; + } + + if (!my_isspace(cs, *ptr)) { + goto loop; + } + + ptr = dict_accept(cs, ptr, "KEY", &success); + + if (!success) { + goto loop; + } + + ptr = dict_accept(cs, ptr, "(", &success); + + if (!success) { + /* MySQL allows also an index id before the '('; we + skip it */ + ptr = dict_skip_word(cs, ptr, &success); + + if (!success) { + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, "(", &success); + + if (!success) { + /* We do not flag a syntax error here because in an + ALTER TABLE we may also have DROP FOREIGN KEY abc */ + + goto loop; + } + } + + i = 0; + + /* Scan the columns in the first list */ +col_loop1: + ut_a(i < (sizeof column_names) / sizeof *column_names); + ptr = dict_scan_col(cs, ptr, &success, table, columns + i, + heap, column_names + i); + if (!success) { + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nCannot resolve column name close to:\n%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + i++; + + ptr = dict_accept(cs, ptr, ",", &success); + + if (success) { + goto col_loop1; + } + + ptr = dict_accept(cs, ptr, ")", &success); + + if (!success) { + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Try to find an index which contains the columns + as the first fields and in the right order */ + + index = dict_foreign_find_index(table, column_names, i, + NULL, TRUE, FALSE); + + if (!index) { + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fputs("There is no index in table ", ef); + ut_print_name(ef, NULL, TRUE, name); + fprintf(ef, " where the columns appear\n" + "as the first columns. Constraint:\n%s\n" + "See http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-foreign-key-constraints.html\n" + "for correct foreign key definition.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + ptr = dict_accept(cs, ptr, "REFERENCES", &success); + + if (!success || !my_isspace(cs, *ptr)) { + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Let us create a constraint struct */ + + foreign = dict_mem_foreign_create(); + + if (constraint_name) { + ulint db_len; + + /* Catenate 'databasename/' to the constraint name specified + by the user: we conceive the constraint as belonging to the + same MySQL 'database' as the table itself. We store the name + to foreign->id. 
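+For example (hypothetical names): a constraint declared as CONSTRAINT my_fk +on a table test/t1 is stored under the id 'test/my_fk'.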
*/ + + db_len = dict_get_db_name_len(table->name); + + foreign->id = mem_heap_alloc( + foreign->heap, db_len + strlen(constraint_name) + 2); + + ut_memcpy(foreign->id, table->name, db_len); + foreign->id[db_len] = '/'; + strcpy(foreign->id + db_len + 1, constraint_name); + } + + foreign->foreign_table = table; + foreign->foreign_table_name = mem_heap_strdup(foreign->heap, + table->name); + foreign->foreign_index = index; + foreign->n_fields = (unsigned int) i; + foreign->foreign_col_names = mem_heap_alloc(foreign->heap, + i * sizeof(void*)); + for (i = 0; i < foreign->n_fields; i++) { + foreign->foreign_col_names[i] = mem_heap_strdup( + foreign->heap, + dict_table_get_col_name(table, + dict_col_get_no(columns[i]))); + } + + ptr = dict_scan_table_name(cs, ptr, &referenced_table, name, + &success, heap, &referenced_table_name); + + /* Note that referenced_table can be NULL if the user has suppressed + checking of foreign key constraints! */ + + if (!success || (!referenced_table && trx->check_foreigns)) { + dict_foreign_free(foreign); + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nCannot resolve table name close to:\n" + "%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, "(", &success); + + if (!success) { + dict_foreign_free(foreign); + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Scan the columns in the second list */ + i = 0; + +col_loop2: + ptr = dict_scan_col(cs, ptr, &success, referenced_table, columns + i, + heap, column_names + i); + i++; + + if (!success) { + dict_foreign_free(foreign); + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\nCannot resolve column name close to:\n" + "%s\n", + start_of_latest_foreign, ptr); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, ",", &success); + + if (success) { + goto col_loop2; + } + + ptr = dict_accept(cs, ptr, ")", &success); + + if (!success || foreign->n_fields != i) { + dict_foreign_free(foreign); + + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + n_on_deletes = 0; + n_on_updates = 0; + +scan_on_conditions: + /* Loop here as long as we can find ON ... 
conditions */ + + ptr = dict_accept(cs, ptr, "ON", &success); + + if (!success) { + + goto try_find_index; + } + + ptr = dict_accept(cs, ptr, "DELETE", &success); + + if (!success) { + ptr = dict_accept(cs, ptr, "UPDATE", &success); + + if (!success) { + dict_foreign_free(foreign); + + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + is_on_delete = FALSE; + n_on_updates++; + } else { + is_on_delete = TRUE; + n_on_deletes++; + } + + ptr = dict_accept(cs, ptr, "RESTRICT", &success); + + if (success) { + goto scan_on_conditions; + } + + ptr = dict_accept(cs, ptr, "CASCADE", &success); + + if (success) { + if (is_on_delete) { + foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE; + } else { + foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE; + } + + goto scan_on_conditions; + } + + ptr = dict_accept(cs, ptr, "NO", &success); + + if (success) { + ptr = dict_accept(cs, ptr, "ACTION", &success); + + if (!success) { + dict_foreign_free(foreign); + dict_foreign_report_syntax_err( + name, start_of_latest_foreign, ptr); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + if (is_on_delete) { + foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION; + } else { + foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION; + } + + goto scan_on_conditions; + } + + ptr = dict_accept(cs, ptr, "SET", &success); + + if (!success) { + dict_foreign_free(foreign); + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + ptr = dict_accept(cs, ptr, "NULL", &success); + + if (!success) { + dict_foreign_free(foreign); + dict_foreign_report_syntax_err(name, start_of_latest_foreign, + ptr); + return(DB_CANNOT_ADD_CONSTRAINT); + } + + for (j = 0; j < foreign->n_fields; j++) { + if ((dict_index_get_nth_col(foreign->foreign_index, j)->prtype) + & DATA_NOT_NULL) { + + /* It is not sensible to define SET NULL + if the column is not allowed to be NULL! 
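+For example (hypothetical schema): if column a is declared NOT NULL, then +FOREIGN KEY (a) REFERENCES t2 (b) ON DELETE SET NULL is rejected below with +DB_CANNOT_ADD_CONSTRAINT.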
*/ + + dict_foreign_free(foreign); + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\n" + "You have defined a SET NULL condition" + " though some of the\n" + "columns are defined as NOT NULL.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + } + + if (is_on_delete) { + foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL; + } else { + foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL; + } + + goto scan_on_conditions; + +try_find_index: + if (n_on_deletes > 1 || n_on_updates > 1) { + /* It is an error to define more than 1 action */ + + dict_foreign_free(foreign); + + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\n" + "You have twice an ON DELETE clause" + " or twice an ON UPDATE clause.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + + /* Try to find an index which contains the columns as the first fields + and in the right order, and the types are the same as in + foreign->foreign_index */ + + if (referenced_table) { + index = dict_foreign_find_index(referenced_table, + column_names, i, + foreign->foreign_index, + TRUE, FALSE); + if (!index) { + dict_foreign_free(foreign); + mutex_enter(&dict_foreign_err_mutex); + dict_foreign_error_report_low(ef, name); + fprintf(ef, "%s:\n" + "Cannot find an index in the" + " referenced table where the\n" + "referenced columns appear as the" + " first columns, or column types\n" + "in the table and the referenced table" + " do not match for constraint.\n" + "Note that the internal storage type of" + " ENUM and SET changed in\n" + "tables created with >= InnoDB-4.1.12," + " and such columns in old tables\n" + "cannot be referenced by such columns" + " in new tables.\n" + "See http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-foreign-key-constraints.html\n" + "for correct foreign key definition.\n", + start_of_latest_foreign); + mutex_exit(&dict_foreign_err_mutex); + + return(DB_CANNOT_ADD_CONSTRAINT); + } + } else { + ut_a(trx->check_foreigns == FALSE); + index = NULL; + } + + foreign->referenced_index = index; + foreign->referenced_table = referenced_table; + + foreign->referenced_table_name + = mem_heap_strdup(foreign->heap, referenced_table_name); + + foreign->referenced_col_names = mem_heap_alloc(foreign->heap, + i * sizeof(void*)); + for (i = 0; i < foreign->n_fields; i++) { + foreign->referenced_col_names[i] + = mem_heap_strdup(foreign->heap, column_names[i]); + } + + /* We found an ok constraint definition: add to the lists */ + + UT_LIST_ADD_LAST(foreign_list, table->foreign_list, foreign); + + if (referenced_table) { + UT_LIST_ADD_LAST(referenced_list, + referenced_table->referenced_list, + foreign); + } + + goto loop; +} + +/************************************************************************* +Scans a table create SQL string and adds to the data dictionary the foreign +key constraints declared in the string. This function should be called after +the indexes for a table have been created. Each foreign key constraint must +be accompanied with indexes in both participating tables. The indexes are +allowed to contain more fields than mentioned in the constraint. 
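+A minimal calling sketch (hypothetical names; assumes a started transaction +with trx->mysql_thd set and the dictionary mutex held, as the code asserts): + + err = dict_create_foreign_constraints(trx, create_sql, "test/t1", FALSE); + +err is then DB_SUCCESS, or e.g. DB_CANNOT_ADD_CONSTRAINT on a syntax error.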
*/ +UNIV_INTERN +ulint +dict_create_foreign_constraints( +/*============================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES + table2(c, d), table2 can be written + also with the database + name before it: test.table2; the + default database is the database of + parameter name */ + const char* name, /* in: table full name in the + normalized form + database_name/table_name */ + ibool reject_fks) /* in: if TRUE, fail with error + code DB_CANNOT_ADD_CONSTRAINT if + any foreign keys are found. */ +{ + char* str; + ulint err; + mem_heap_t* heap; + + ut_a(trx); + ut_a(trx->mysql_thd); + + str = dict_strip_comments(sql_string); + heap = mem_heap_create(10000); + + err = dict_create_foreign_constraints_low( + trx, heap, innobase_get_charset(trx->mysql_thd), str, name, + reject_fks); + + mem_heap_free(heap); + mem_free(str); + + return(err); +} + +/************************************************************************** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. */ +UNIV_INTERN +ulint +dict_foreign_parse_drop_constraints( +/*================================*/ + /* out: DB_SUCCESS or + DB_CANNOT_DROP_CONSTRAINT if + syntax error or the constraint + id does not match */ + mem_heap_t* heap, /* in: heap from which we can + allocate memory */ + trx_t* trx, /* in: transaction */ + dict_table_t* table, /* in: table */ + ulint* n, /* out: number of constraints + to drop */ + const char*** constraints_to_drop) /* out: id's of the + constraints to drop */ +{ + dict_foreign_t* foreign; + ibool success; + char* str; + const char* ptr; + const char* id; + FILE* ef = dict_foreign_err_file; + struct charset_info_st* cs; + + ut_a(trx); + ut_a(trx->mysql_thd); + + cs = innobase_get_charset(trx->mysql_thd); + + *n = 0; + + *constraints_to_drop = mem_heap_alloc(heap, 1000 * sizeof(char*)); + + str = dict_strip_comments(*(trx->mysql_query_str)); + ptr = str; + + ut_ad(mutex_own(&(dict_sys->mutex))); +loop: + ptr = dict_scan_to(ptr, "DROP"); + + if (*ptr == '\0') { + mem_free(str); + + return(DB_SUCCESS); + } + + ptr = dict_accept(cs, ptr, "DROP", &success); + + if (!my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "FOREIGN", &success); + + if (!success || !my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "KEY", &success); + + if (!success) { + + goto syntax_error; + } + + ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE); + + if (id == NULL) { + + goto syntax_error; + } + + ut_a(*n < 1000); + (*constraints_to_drop)[*n] = id; + (*n)++; + + /* Look for the given constraint id */ + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + while (foreign != NULL) { + if (0 == strcmp(foreign->id, id) + || (strchr(foreign->id, '/') + && 0 == strcmp(id, + dict_remove_db_name(foreign->id)))) { + /* Found */ + break; + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + if (foreign == NULL) { + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in dropping of a foreign key constraint" + " of table ", ef); + ut_print_name(ef, NULL, TRUE, table->name); + fputs(",\n" + "in SQL command\n", ef); + fputs(str, ef); + fputs("\nCannot find a constraint with the given id ", ef); + ut_print_name(ef, NULL, FALSE, id); + fputs(".\n", ef); + mutex_exit(&dict_foreign_err_mutex); + + mem_free(str); + + return(DB_CANNOT_DROP_CONSTRAINT); + } +
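+ /* A matching constraint was found: scan the rest of the string for further DROP clauses */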
+ goto loop; + +syntax_error: + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Syntax error in dropping of a" + " foreign key constraint of table ", ef); + ut_print_name(ef, NULL, TRUE, table->name); + fprintf(ef, ",\n" + "close to:\n%s\n in SQL command\n%s\n", ptr, str); + mutex_exit(&dict_foreign_err_mutex); + + mem_free(str); + + return(DB_CANNOT_DROP_CONSTRAINT); +} +#endif /* UNIV_HOTBACKUP */ + +/*==================== END OF FOREIGN KEY PROCESSING ====================*/ + +/************************************************************************** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys->mutex is already being held. */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + /* out: index, NULL if not found */ + dulint index_id) /* in: index id */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + + return(dict_index_find_on_id_low(index_id)); +} + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************** +Returns an index object if it is found in the dictionary cache. */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + /* out: index, NULL if not found */ + dulint index_id) /* in: index id */ +{ + dict_index_t* index; + + if (dict_sys == NULL) { + return(NULL); + } + + mutex_enter(&(dict_sys->mutex)); + + index = dict_index_get_if_in_cache_low(index_id); + + mutex_exit(&(dict_sys->mutex)); + + return(index); +} +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +#ifdef UNIV_DEBUG +/************************************************************************** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. */ +UNIV_INTERN +ibool +dict_index_check_search_tuple( +/*==========================*/ + /* out: TRUE if ok */ + const dict_index_t* index, /* in: index tree */ + const dtuple_t* tuple) /* in: tuple used in a search */ +{ + ut_a(index); + ut_a(dtuple_get_n_fields_cmp(tuple) + <= dict_index_get_n_unique_in_tree(index)); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************** +Builds a node pointer out of a physical record and a page number. 
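+As an informal sketch (illustration only), the child page number stored in +the last field of the returned tuple could be read back as: + + field = dtuple_get_nth_field(tuple, n_unique); + child_page_no = mach_read_from_4(dfield_get_data(field));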
*/ +UNIV_INTERN +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + /* out, own: node pointer */ + const dict_index_t* index, /* in: index */ + const rec_t* rec, /* in: record for which to build node + pointer */ + ulint page_no,/* in: page number to put in node + pointer */ + mem_heap_t* heap, /* in: memory heap where pointer + created */ + ulint level) /* in: level of rec in tree: + 0 means leaf level */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + ulint n_unique; + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + /* In a universal index tree, we take the whole record as + the node pointer if the record is on the leaf level, + on non-leaf levels we remove the last field, which + contains the page number of the child page */ + + ut_a(!dict_table_is_comp(index->table)); + n_unique = rec_get_n_fields_old(rec); + + if (level > 0) { + ut_a(n_unique > 1); + n_unique--; + } + } else { + n_unique = dict_index_get_n_unique_in_tree(index); + } + + tuple = dtuple_create(heap, n_unique + 1); + + /* When searching in the tree for the node pointer, we must not do + comparison on the last field, the page number field, as on upper + levels in the tree there may be identical node pointers with a + different page number; therefore, we set the n_fields_cmp to one + less: */ + + dtuple_set_n_fields_cmp(tuple, n_unique); + + dict_index_copy_types(tuple, index, n_unique); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + field = dtuple_get_nth_field(tuple, n_unique); + dfield_set_data(field, buf, 4); + + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); + + rec_copy_prefix_to_dtuple(tuple, rec, index, n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_STATUS_NODE_PTR); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/************************************************************************** +Copies an initial segment of a physical record, long enough to specify an +index entry uniquely. */ +UNIV_INTERN +rec_t* +dict_index_copy_rec_order_prefix( +/*=============================*/ + /* out: pointer to the prefix record */ + const dict_index_t* index, /* in: index */ + const rec_t* rec, /* in: record for which to + copy prefix */ + ulint* n_fields,/* out: number of fields copied */ + byte** buf, /* in/out: memory buffer for the + copied prefix, or NULL */ + ulint* buf_size)/* in/out: buffer size */ +{ + ulint n; + + UNIV_PREFETCH_R(rec); + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + ut_a(!dict_table_is_comp(index->table)); + n = rec_get_n_fields_old(rec); + } else { + n = dict_index_get_n_unique_in_tree(index); + } + + *n_fields = n; + return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size)); +} + +/************************************************************************** +Builds a typed data tuple out of a physical record. 
*/ +UNIV_INTERN +dtuple_t* +dict_index_build_data_tuple( +/*========================*/ + /* out, own: data tuple */ + dict_index_t* index, /* in: index tree */ + rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ + mem_heap_t* heap) /* in: memory heap where tuple created */ +{ + dtuple_t* tuple; + + ut_ad(dict_table_is_comp(index->table) + || n_fields <= rec_get_n_fields_old(rec)); + + tuple = dtuple_create(heap, n_fields); + + dict_index_copy_types(tuple, index, n_fields); + + rec_copy_prefix_to_dtuple(tuple, rec, index, n_fields, heap); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/************************************************************************* +Calculates the minimum record length in an index. */ +UNIV_INTERN +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index) /* in: index */ +{ + ulint sum = 0; + ulint i; + + if (dict_table_is_comp(index->table)) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_col_t* col + = dict_index_get_nth_col(index, i); + ulint size = dict_col_get_fixed_size(col); + sum += size; + if (!size) { + size = col->len; + sum += size < 128 ? 1 : 2; + } + if (!(col->prtype & DATA_NOT_NULL)) { + nullable++; + } + } + + /* round the NULL flags up to full bytes */ + sum += UT_BITS_IN_BYTES(nullable); + + return(sum); + } + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + sum += dict_col_get_fixed_size( + dict_index_get_nth_col(index, i)); + } + + if (sum > 127) { + sum += 2 * dict_index_get_n_fields(index); + } else { + sum += dict_index_get_n_fields(index); + } + + sum += REC_N_OLD_EXTRA_BYTES; + + return(sum); +} + +/************************************************************************* +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. */ +UNIV_INTERN +void +dict_update_statistics_low( +/*=======================*/ + dict_table_t* table, /* in/out: table */ + ibool has_dict_mutex __attribute__((unused))) + /* in: TRUE if the caller has the + dictionary mutex */ +{ + dict_index_t* index; + ulint size; + ulint sum_of_index_sizes = 0; + + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: cannot calculate statistics for table %s\n" + "InnoDB: because the .ibd file is missing. For help," + " please refer to\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n", + table->name); + + return; + } + + /* If we have set a high innodb_force_recovery level, do not calculate + statistics, as a badly corrupted index can cause a crash in it. 
*/ + + if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + + return; + } + + /* Find out the sizes of the indexes and how many different values + for the key they approximately have */ + + index = dict_table_get_first_index(table); + + if (index == NULL) { + /* Table definition is corrupt */ + + return; + } + + while (index) { + size = btr_get_size(index, BTR_TOTAL_SIZE); + + index->stat_index_size = size; + + sum_of_index_sizes += size; + + size = btr_get_size(index, BTR_N_LEAF_PAGES); + + if (size == 0) { + /* The root node of the tree is a leaf */ + size = 1; + } + + index->stat_n_leaf_pages = size; + + btr_estimate_number_of_different_key_vals(index); + + index = dict_table_get_next_index(index); + } + + index = dict_table_get_first_index(table); + + table->stat_n_rows = index->stat_n_diff_key_vals[ + dict_index_get_n_unique(index)]; + + table->stat_clustered_index_size = index->stat_index_size; + + table->stat_sum_of_other_index_sizes = sum_of_index_sizes + - index->stat_index_size; + + table->stat_initialized = TRUE; + + table->stat_modified_counter = 0; +} + +/************************************************************************* +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. */ +UNIV_INTERN +void +dict_update_statistics( +/*===================*/ + dict_table_t* table) /* in/out: table */ +{ + dict_update_statistics_low(table, FALSE); +} + +/************************************************************************** +Prints info of a foreign key constraint. */ +static +void +dict_foreign_print_low( +/*===================*/ + dict_foreign_t* foreign) /* in: foreign key constraint */ +{ + ulint i; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + fprintf(stderr, " FOREIGN KEY CONSTRAINT %s: %s (", + foreign->id, foreign->foreign_table_name); + + for (i = 0; i < foreign->n_fields; i++) { + fprintf(stderr, " %s", foreign->foreign_col_names[i]); + } + + fprintf(stderr, " )\n" + " REFERENCES %s (", + foreign->referenced_table_name); + + for (i = 0; i < foreign->n_fields; i++) { + fprintf(stderr, " %s", foreign->referenced_col_names[i]); + } + + fputs(" )\n", stderr); +} + +/************************************************************************** +Prints a table data. */ +UNIV_INTERN +void +dict_table_print( +/*=============*/ + dict_table_t* table) /* in: table */ +{ + mutex_enter(&(dict_sys->mutex)); + dict_table_print_low(table); + mutex_exit(&(dict_sys->mutex)); +} + +/************************************************************************** +Prints a table data when we know the table name. */ +UNIV_INTERN +void +dict_table_print_by_name( +/*=====================*/ + const char* name) +{ + dict_table_t* table; + + mutex_enter(&(dict_sys->mutex)); + + table = dict_table_get_low(name); + + ut_a(table); + + dict_table_print_low(table); + mutex_exit(&(dict_sys->mutex)); +} + +/************************************************************************** +Prints a table data. 
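+Informally, the first line of output looks like (values invented for +illustration): + + TABLE: name test/t1, id 0 23, flags 0, columns 8, indexes 2, appr.rows 100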
*/ +UNIV_INTERN +void +dict_table_print_low( +/*=================*/ + dict_table_t* table) /* in: table */ +{ + dict_index_t* index; + dict_foreign_t* foreign; + ulint i; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + dict_update_statistics_low(table, TRUE); + + fprintf(stderr, + "--------------------------------------\n" + "TABLE: name %s, id %lu %lu, flags %lx, columns %lu," + " indexes %lu, appr.rows %lu\n" + " COLUMNS: ", + table->name, + (ulong) ut_dulint_get_high(table->id), + (ulong) ut_dulint_get_low(table->id), + (ulong) table->flags, + (ulong) table->n_cols, + (ulong) UT_LIST_GET_LEN(table->indexes), + (ulong) table->stat_n_rows); + + for (i = 0; i + 1 < (ulint) table->n_cols; i++) { + dict_col_print_low(table, dict_table_get_nth_col(table, i)); + fputs("; ", stderr); + } + + putc('\n', stderr); + + index = UT_LIST_GET_FIRST(table->indexes); + + while (index != NULL) { + dict_index_print_low(index); + index = UT_LIST_GET_NEXT(indexes, index); + } + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + while (foreign != NULL) { + dict_foreign_print_low(foreign); + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign != NULL) { + dict_foreign_print_low(foreign); + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } +} + +/************************************************************************** +Prints a column data. */ +static +void +dict_col_print_low( +/*===============*/ + const dict_table_t* table, /* in: table */ + const dict_col_t* col) /* in: column */ +{ + dtype_t type; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + dict_col_copy_type(col, &type); + fprintf(stderr, "%s: ", dict_table_get_col_name(table, + dict_col_get_no(col))); + + dtype_print(&type); +} + +/************************************************************************** +Prints an index data. */ +static +void +dict_index_print_low( +/*=================*/ + dict_index_t* index) /* in: index */ +{ + ib_int64_t n_vals; + ulint i; + const char* type_string; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + if (index->n_user_defined_cols > 0) { + n_vals = index->stat_n_diff_key_vals[ + index->n_user_defined_cols]; + } else { + n_vals = index->stat_n_diff_key_vals[1]; + } + + if (dict_index_is_clust(index)) { + type_string = "clustered index"; + } else if (dict_index_is_unique(index)) { + type_string = "unique index"; + } else { + type_string = "secondary index"; + } + + fprintf(stderr, + " INDEX: name %s, id %lu %lu, fields %lu/%lu," + " uniq %lu, type %lu\n" + " root page %lu, appr.key vals %lu," + " leaf pages %lu, size pages %lu\n" + " FIELDS: ", + index->name, + (ulong) ut_dulint_get_high(index->id), + (ulong) ut_dulint_get_low(index->id), + (ulong) index->n_user_defined_cols, + (ulong) index->n_fields, + (ulong) index->n_uniq, + (ulong) index->type, + (ulong) index->page, + (ulong) n_vals, + (ulong) index->stat_n_leaf_pages, + (ulong) index->stat_index_size); + + for (i = 0; i < index->n_fields; i++) { + dict_field_print_low(dict_index_get_nth_field(index, i)); + } + + putc('\n', stderr); + +#ifdef UNIV_BTR_PRINT + btr_print_size(index); + + btr_print_index(index, 7); +#endif /* UNIV_BTR_PRINT */ +} + +/************************************************************************** +Prints a field data. 
*/ +static +void +dict_field_print_low( +/*=================*/ + dict_field_t* field) /* in: field */ +{ + ut_ad(mutex_own(&(dict_sys->mutex))); + + fprintf(stderr, " %s", field->name); + + if (field->prefix_len != 0) { + fprintf(stderr, "(%lu)", (ulong) field->prefix_len); + } +} + +/************************************************************************** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. */ +UNIV_INTERN +void +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + FILE* file, /* in: file where to print */ + trx_t* trx, /* in: transaction */ + dict_foreign_t* foreign, /* in: foreign key constraint */ + ibool add_newline) /* in: whether to add a newline */ +{ + const char* stripped_id; + ulint i; + + if (strchr(foreign->id, '/')) { + /* Strip the preceding database name from the constraint id */ + stripped_id = foreign->id + 1 + + dict_get_db_name_len(foreign->id); + } else { + stripped_id = foreign->id; + } + + putc(',', file); + + if (add_newline) { + /* SHOW CREATE TABLE wants constraints each printed nicely + on its own line, while error messages want no newlines + inserted. */ + fputs("\n ", file); + } + + fputs(" CONSTRAINT ", file); + ut_print_name(file, trx, FALSE, stripped_id); + fputs(" FOREIGN KEY (", file); + + for (i = 0;;) { + ut_print_name(file, trx, FALSE, foreign->foreign_col_names[i]); + if (++i < foreign->n_fields) { + fputs(", ", file); + } else { + break; + } + } + + fputs(") REFERENCES ", file); + + if (dict_tables_have_same_db(foreign->foreign_table_name, + foreign->referenced_table_name)) { + /* Do not print the database name of the referenced table */ + ut_print_name(file, trx, TRUE, + dict_remove_db_name( + foreign->referenced_table_name)); + } else { + ut_print_name(file, trx, TRUE, + foreign->referenced_table_name); + } + + putc(' ', file); + putc('(', file); + + for (i = 0;;) { + ut_print_name(file, trx, FALSE, + foreign->referenced_col_names[i]); + if (++i < foreign->n_fields) { + fputs(", ", file); + } else { + break; + } + } + + putc(')', file); + + if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) { + fputs(" ON DELETE CASCADE", file); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + fputs(" ON DELETE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + fputs(" ON DELETE NO ACTION", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + fputs(" ON UPDATE CASCADE", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + fputs(" ON UPDATE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + fputs(" ON UPDATE NO ACTION", file); + } +} + +/************************************************************************** +Outputs info on foreign keys of a table. 
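+Informally (hypothetical names): in the CREATE TABLE format a constraint +prints roughly as + + , CONSTRAINT `my_fk` FOREIGN KEY (`a`) REFERENCES `t2` (`b`) ON DELETE CASCADE + +and in the SHOW TABLE STATUS format roughly as + + ; (a) REFER test/t2(b) ON DELETE CASCADE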
*/ +UNIV_INTERN +void +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /* in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + FILE* file, /* in: file where to print */ + trx_t* trx, /* in: transaction */ + dict_table_t* table) /* in: table */ +{ + dict_foreign_t* foreign; + + mutex_enter(&(dict_sys->mutex)); + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + if (foreign == NULL) { + mutex_exit(&(dict_sys->mutex)); + + return; + } + + while (foreign != NULL) { + if (create_table_format) { + dict_print_info_on_foreign_key_in_create_format( + file, trx, foreign, TRUE); + } else { + ulint i; + fputs("; (", file); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + putc(' ', file); + } + + ut_print_name(file, trx, FALSE, + foreign->foreign_col_names[i]); + } + + fputs(") REFER ", file); + ut_print_name(file, trx, TRUE, + foreign->referenced_table_name); + putc('(', file); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + putc(' ', file); + } + ut_print_name( + file, trx, FALSE, + foreign->referenced_col_names[i]); + } + + putc(')', file); + + if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) { + fputs(" ON DELETE CASCADE", file); + } + + if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) { + fputs(" ON DELETE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + fputs(" ON DELETE NO ACTION", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + fputs(" ON UPDATE CASCADE", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + fputs(" ON UPDATE SET NULL", file); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + fputs(" ON UPDATE NO ACTION", file); + } + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + mutex_exit(&(dict_sys->mutex)); +} + +/************************************************************************ +Displays the names of the index and the table. */ +UNIV_INTERN +void +dict_index_name_print( +/*==================*/ + FILE* file, /* in: output stream */ + trx_t* trx, /* in: transaction */ + const dict_index_t* index) /* in: index to print */ +{ + fputs("index ", file); + ut_print_name(file, trx, FALSE, index->name); + fputs(" of table ", file); + ut_print_name(file, trx, TRUE, index->table_name); +} + +/************************************************************************** +Get index by name */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name( +/*=========================*/ + /* out: index, NULL if does not exist */ + dict_table_t* table, /* in: table */ + const char* name) /* in: name of the index to find */ +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (ut_strcmp(index->name, name) == 0) { + + return(index); + } + + index = dict_table_get_next_index(index); + } + + return(NULL); + +} + +/************************************************************************** +Replace the index passed in with another equivalent index in the tables +foreign key list. 
*/ +UNIV_INTERN +void +dict_table_replace_index_in_foreign_list( +/*=====================================*/ + dict_table_t* table, /* in/out: table */ + dict_index_t* index) /* in: index to be replaced */ +{ + dict_foreign_t* foreign; + + for (foreign = UT_LIST_GET_FIRST(table->foreign_list); + foreign; + foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) { + + if (foreign->foreign_index == index) { + dict_index_t* new_index + = dict_foreign_find_equiv_index(foreign); + ut_a(new_index); + + foreign->foreign_index = new_index; + } + } +} + +/************************************************************************** +In case there is more than one index with the same name return the index +with the min(id). */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name_and_min_id( +/*=====================================*/ + /* out: index, NULL if does not exist */ + dict_table_t* table, /* in: table */ + const char* name) /* in: name of the index to find */ +{ + dict_index_t* index; + dict_index_t* min_index; /* Index with matching name and min(id) */ + + min_index = NULL; + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (ut_strcmp(index->name, name) == 0) { + if (!min_index + || ut_dulint_cmp(index->id, min_index->id) < 0) { + + min_index = index; + } + } + + index = dict_table_get_next_index(index); + } + + return(min_index); + +} + +#ifdef UNIV_DEBUG +/************************************************************************** +Check for duplicate index entries in a table [using the index name] */ +UNIV_INTERN +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table) /* in: Check for dup indexes + in this table */ +{ + /* Check for duplicates, ignoring indexes that are marked + as to be dropped */ + + const dict_index_t* index1; + const dict_index_t* index2; + + /* The primary index _must_ exist */ + ut_a(UT_LIST_GET_LEN(table->indexes) > 0); + + index1 = UT_LIST_GET_FIRST(table->indexes); + index2 = UT_LIST_GET_NEXT(indexes, index1); + + while (index1 && index2) { + + while (index2) { + + if (!index2->to_be_dropped) { + ut_ad(ut_strcmp(index1->name, index2->name)); + } + + index2 = UT_LIST_GET_NEXT(indexes, index2); + } + + index1 = UT_LIST_GET_NEXT(indexes, index1); + index2 = UT_LIST_GET_NEXT(indexes, index1); + } +} +#endif /* UNIV_DEBUG */ diff --git a/storage/xtradb/dict/dict0load.c b/storage/xtradb/dict/dict0load.c new file mode 100644 index 00000000000..44590a261a6 --- /dev/null +++ b/storage/xtradb/dict/dict0load.c @@ -0,0 +1,1460 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Loads database object definitions from the dictionary +tables into the memory cache + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0load.h" +#ifndef UNIV_HOTBACKUP +#include "mysql_version.h" +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_NONINL +#include "dict0load.ic" +#endif + +#include "btr0pcur.h" +#include "btr0btr.h" +#include "page0page.h" +#include "mach0data.h" +#include "dict0dict.h" +#include "dict0boot.h" +#include "rem0cmp.h" +#include "srv0start.h" +#include "srv0srv.h" + +/******************************************************************** +Returns TRUE if index's i'th column's name is 'name'. */ +static +ibool +name_of_col_is( +/*===========*/ + /* out: TRUE if the i'th column + is named 'name' */ + dict_table_t* table, /* in: table */ + dict_index_t* index, /* in: index */ + ulint i, /* in: index field position */ + const char* name) /* in: name to compare to */ +{ + ulint tmp = dict_col_get_no(dict_field_get_col( + dict_index_get_nth_field( + index, i))); + + return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0); +} + +/************************************************************************ +Finds the first table name in the given database. */ +UNIV_INTERN +char* +dict_get_first_table_name_in_db( +/*============================*/ + /* out, own: table name, NULL if + does not exist; the caller must + free the memory in the string! */ + const char* name) /* in: database name which ends in '/' */ +{ + dict_table_t* sys_tables; + btr_pcur_t pcur; + dict_index_t* sys_index; + dtuple_t* tuple; + mem_heap_t* heap; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + heap = mem_heap_create(1000); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!dict_table_is_comp(sys_tables)); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, name, ut_strlen(name)); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); +loop: + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + field = rec_get_nth_field_old(rec, 0, &len); + + if (len < strlen(name) + || ut_memcmp(name, field, strlen(name)) != 0) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + if (!rec_get_deleted_flag(rec, 0)) { + + /* We found one */ + + char* table_name = mem_strdupl((char*) field, len); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(table_name); + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + goto loop; +} + +/************************************************************************ +Prints to the standard output information on all tables found in the data +dictionary system table.
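+Because every table is loaded and printed while dict_sys->mutex is held, the +fatal semaphore wait threshold is temporarily raised below, so that a long +printout is not mistaken for a lock-up.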
*/ +UNIV_INTERN +void +dict_print(void) +/*============*/ +{ + dict_table_t* sys_tables; + dict_index_t* sys_index; + dict_table_t* table; + btr_pcur_t pcur; + const rec_t* rec; + const byte* field; + ulint len; + mtr_t mtr; + + /* Enlarge the fatal semaphore wait timeout during the InnoDB table + monitor printout */ + + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + mutex_enter(&(dict_sys->mutex)); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + + btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, + TRUE, &mtr); +loop: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* end of index */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + mutex_exit(&(dict_sys->mutex)); + + /* Restore the fatal semaphore wait timeout */ + + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + return; + } + + field = rec_get_nth_field_old(rec, 0, &len); + + if (!rec_get_deleted_flag(rec, 0)) { + + /* We found one */ + + char* table_name = mem_strdupl((char*) field, len); + + btr_pcur_store_position(&pcur, &mtr); + + mtr_commit(&mtr); + + table = dict_table_get_low(table_name); + mem_free(table_name); + + if (table == NULL) { + fputs("InnoDB: Failed to load table ", stderr); + ut_print_namel(stderr, NULL, TRUE, (char*) field, len); + putc('\n', stderr); + } else { + /* The table definition was corrupt if there + is no index */ + + if (dict_table_get_first_index(table)) { + dict_update_statistics_low(table, TRUE); + } + + dict_table_print_low(table); + } + + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + } + + goto loop; +} + +/************************************************************************ +Determine the flags of a table described in SYS_TABLES. */ +static +ulint +dict_sys_tables_get_flags( +/*======================*/ + /* out: flags of the table (0 for an + ordinary uncompressed table), or + ULINT_UNDEFINED on error */ + const rec_t* rec) /* in: a record of SYS_TABLES */ +{ + const byte* field; + ulint len; + ulint n_cols; + ulint flags; + + field = rec_get_nth_field_old(rec, 5, &len); + ut_a(len == 4); + + flags = mach_read_from_4(field); + + if (UNIV_LIKELY(flags == DICT_TABLE_ORDINARY)) { + return(0); + } + + field = rec_get_nth_field_old(rec, 4, &len); + n_cols = mach_read_from_4(field); + + if (UNIV_UNLIKELY(!(n_cols & 0x80000000UL))) { + /* New file formats require ROW_FORMAT=COMPACT. */ + return(ULINT_UNDEFINED); + } + + switch (flags & (DICT_TF_FORMAT_MASK | DICT_TF_COMPACT)) { + default: + case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT: + case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT: + /* flags should be DICT_TABLE_ORDINARY, + or DICT_TF_FORMAT_MASK should be nonzero. */ + return(ULINT_UNDEFINED); + + case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT: +#if DICT_TF_FORMAT_MAX > DICT_TF_FORMAT_ZIP +# error "missing case labels for DICT_TF_FORMAT_ZIP .. DICT_TF_FORMAT_MAX" +#endif + /* We support this format. */ + break; + } + + if (UNIV_UNLIKELY((flags & DICT_TF_ZSSIZE_MASK) + > (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT))) { + /* Unsupported compressed page size. */ + return(ULINT_UNDEFINED); + } + + if (UNIV_UNLIKELY(flags & (~0 << DICT_TF_BITS))) { + /* Some unused bits are set.
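+Flag bits at or above DICT_TF_BITS are not defined in this version, so the +whole flags field is treated as unknown.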
*/ + return(ULINT_UNDEFINED); + } + + return(flags); +} + +/************************************************************************ +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). + +In a normal startup, we create the tablespace objects for every table in +InnoDB's data dictionary, if the corresponding .ibd file exists. +We also scan the biggest space id, and store it to fil_system. */ +UNIV_INTERN +void +dict_check_tablespaces_and_store_max_id( +/*====================================*/ + ibool in_crash_recovery) /* in: are we doing a crash recovery */ +{ + dict_table_t* sys_tables; + dict_index_t* sys_index; + btr_pcur_t pcur; + const rec_t* rec; + ulint max_space_id = 0; + mtr_t mtr; + + mutex_enter(&(dict_sys->mutex)); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!dict_table_is_comp(sys_tables)); + + btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, + TRUE, &mtr); +loop: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* end of index */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + /* We must make the tablespace cache aware of the biggest + known space id */ + + /* printf("Biggest space id in data dictionary %lu\n", + max_space_id); */ + fil_set_max_space_id_if_bigger(max_space_id); + + mutex_exit(&(dict_sys->mutex)); + + return; + } + + if (!rec_get_deleted_flag(rec, 0)) { + + /* We found one */ + const byte* field; + ulint len; + ulint space_id; + ulint flags; + char* name; + + field = rec_get_nth_field_old(rec, 0, &len); + name = mem_strdupl((char*) field, len); + + flags = dict_sys_tables_get_flags(rec); + if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) { + + field = rec_get_nth_field_old(rec, 5, &len); + flags = mach_read_from_4(field); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary" + " has unknown type %lx.\n", + (ulong) flags); + + goto loop; + } + + field = rec_get_nth_field_old(rec, 9, &len); + ut_a(len == 4); + + space_id = mach_read_from_4(field); + + btr_pcur_store_position(&pcur, &mtr); + + mtr_commit(&mtr); + + if (space_id != 0 && in_crash_recovery) { + /* Check that the tablespace (the .ibd file) really + exists; print a warning to the .err log if not */ + + fil_space_for_table_exists_in_mem(space_id, name, + FALSE, TRUE, TRUE); + } + + if (space_id != 0 && !in_crash_recovery) { + /* It is a normal database startup: create the space + object and check that the .ibd file exists. */ + + fil_open_single_table_tablespace(FALSE, space_id, + flags, name); + } + + mem_free(name); + + if (space_id > max_space_id) { + max_space_id = space_id; + } + + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + } + + goto loop; +} + +/************************************************************************ +Loads definitions for table columns. 
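+Informally, the SYS_COLUMNS record fields read below are, by position: +0 = TABLE_ID, 1 = POS, 4 = NAME, 5 = MTYPE, 6 = PRTYPE and 7 = LEN +(positions 2 and 3 hold the record system columns and are not read here).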
*/ +static +void +dict_load_columns( +/*==============*/ + dict_table_t* table, /* in: table */ + mem_heap_t* heap) /* in: memory heap for temporary storage */ +{ + dict_table_t* sys_columns; + dict_index_t* sys_index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + byte* buf; + char* name; + ulint mtype; + ulint prtype; + ulint col_len; + ulint i; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + mtr_start(&mtr); + + sys_columns = dict_table_get_low("SYS_COLUMNS"); + sys_index = UT_LIST_GET_FIRST(sys_columns->indexes); + ut_a(!dict_table_is_comp(sys_columns)); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 8); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) { + + rec = btr_pcur_get_rec(&pcur); + + ut_a(btr_pcur_is_on_user_rec(&pcur)); + + ut_a(!rec_get_deleted_flag(rec, 0)); + + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + ut_a(ut_dulint_cmp(table->id, mach_read_from_8(field)) == 0); + + field = rec_get_nth_field_old(rec, 1, &len); + ut_ad(len == 4); + ut_a(i == mach_read_from_4(field)); + + ut_a(name_of_col_is(sys_columns, sys_index, 4, "NAME")); + + field = rec_get_nth_field_old(rec, 4, &len); + name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old(rec, 5, &len); + mtype = mach_read_from_4(field); + + field = rec_get_nth_field_old(rec, 6, &len); + prtype = mach_read_from_4(field); + + if (dtype_get_charset_coll(prtype) == 0 + && dtype_is_string_type(mtype)) { + /* The table was created with < 4.1.2. */ + + if (dtype_is_binary_string_type(mtype, prtype)) { + /* Use the binary collation for + string columns of binary type. */ + + prtype = dtype_form_prtype( + prtype, + DATA_MYSQL_BINARY_CHARSET_COLL); + } else { + /* Use the default charset for + other than binary columns. */ + + prtype = dtype_form_prtype( + prtype, + data_mysql_default_charset_coll); + } + } + + field = rec_get_nth_field_old(rec, 7, &len); + col_len = mach_read_from_4(field); + + ut_a(name_of_col_is(sys_columns, sys_index, 8, "PREC")); + + dict_mem_table_add_col(table, heap, name, + mtype, prtype, col_len); + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); +} + +/************************************************************************ +Loads definitions for index fields. 
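+A worked example of the POS encoding handled below (invented values): when +the index contains at least one column prefix, a stored value of 0x00020004 +means field number 2 with a prefix length of 4; when there are no prefix +fields at all, the same field is stored simply as 0x00000002 and the prefix +length is taken as 0.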
*/ +static +void +dict_load_fields( +/*=============*/ + dict_index_t* index, /* in: index whose fields to load */ + mem_heap_t* heap) /* in: memory heap for temporary storage */ +{ + dict_table_t* sys_fields; + dict_index_t* sys_index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + ulint pos_and_prefix_len; + ulint prefix_len; + const rec_t* rec; + const byte* field; + ulint len; + byte* buf; + ulint i; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + mtr_start(&mtr); + + sys_fields = dict_table_get_low("SYS_FIELDS"); + sys_index = UT_LIST_GET_FIRST(sys_fields->indexes); + ut_a(!dict_table_is_comp(sys_fields)); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 8); + mach_write_to_8(buf, index->id); + + dfield_set_data(dfield, buf, 8); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (i = 0; i < index->n_fields; i++) { + + rec = btr_pcur_get_rec(&pcur); + + ut_a(btr_pcur_is_on_user_rec(&pcur)); + + /* There could be delete marked records in SYS_FIELDS + because SYS_FIELDS.INDEX_ID can be updated + by ALTER TABLE ADD INDEX. */ + + if (rec_get_deleted_flag(rec, 0)) { + + goto next_rec; + } + + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + + field = rec_get_nth_field_old(rec, 1, &len); + ut_a(len == 4); + + /* The next field stores the field position in the index + and a possible column prefix length if the index field + does not contain the whole column. The storage format is + like this: if there is at least one prefix field in the index, + then the HIGH 2 bytes contain the field number (== i) and the + low 2 bytes the prefix length for the field. Otherwise the + field number (== i) is contained in the 2 LOW bytes. */ + + pos_and_prefix_len = mach_read_from_4(field); + + ut_a((pos_and_prefix_len & 0xFFFFUL) == i + || (pos_and_prefix_len & 0xFFFF0000UL) == (i << 16)); + + if ((i == 0 && pos_and_prefix_len > 0) + || (pos_and_prefix_len & 0xFFFF0000UL) > 0) { + + prefix_len = pos_and_prefix_len & 0xFFFFUL; + } else { + prefix_len = 0; + } + + ut_a(name_of_col_is(sys_fields, sys_index, 4, "COL_NAME")); + + field = rec_get_nth_field_old(rec, 4, &len); + + dict_mem_index_add_field(index, + mem_heap_strdupl(heap, + (char*) field, len), + prefix_len); + +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); +} + +/************************************************************************ +Loads definitions for table indexes. Adds them to the data dictionary +cache. 
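+Informally, the SYS_INDEXES record fields read below are, by position: +0 = TABLE_ID, 1 = ID, 4 = NAME, 5 = N_FIELDS, 6 = TYPE, 7 = SPACE and +8 = PAGE_NO (positions 2 and 3 again hold the record system columns).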
*/ +static +ulint +dict_load_indexes( +/*==============*/ + /* out: DB_SUCCESS if ok, DB_CORRUPTION + if corruption of dictionary table or + DB_UNSUPPORTED if table has unknown index + type */ + dict_table_t* table, /* in: table */ + mem_heap_t* heap) /* in: memory heap for temporary storage */ +{ + dict_table_t* sys_indexes; + dict_index_t* sys_index; + dict_index_t* index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + ulint name_len; + char* name_buf; + ulint type; + ulint space; + ulint page_no; + ulint n_fields; + byte* buf; + ibool is_sys_table; + dulint id; + mtr_t mtr; + ulint error = DB_SUCCESS; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + if ((ut_dulint_get_high(table->id) == 0) + && (ut_dulint_get_low(table->id) < DICT_HDR_FIRST_ID)) { + is_sys_table = TRUE; + } else { + is_sys_table = FALSE; + } + + mtr_start(&mtr); + + sys_indexes = dict_table_get_low("SYS_INDEXES"); + sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes); + ut_a(!dict_table_is_comp(sys_indexes)); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 8); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (;;) { + if (!btr_pcur_is_on_user_rec(&pcur)) { + + break; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + + if (ut_memcmp(buf, field, len) != 0) { + break; + } else if (rec_get_deleted_flag(rec, 0)) { + /* Skip delete marked records */ + goto next_rec; + } + + field = rec_get_nth_field_old(rec, 1, &len); + ut_ad(len == 8); + id = mach_read_from_8(field); + + ut_a(name_of_col_is(sys_indexes, sys_index, 4, "NAME")); + + field = rec_get_nth_field_old(rec, 4, &name_len); + name_buf = mem_heap_strdupl(heap, (char*) field, name_len); + + field = rec_get_nth_field_old(rec, 5, &len); + n_fields = mach_read_from_4(field); + + field = rec_get_nth_field_old(rec, 6, &len); + type = mach_read_from_4(field); + + field = rec_get_nth_field_old(rec, 7, &len); + space = mach_read_from_4(field); + + ut_a(name_of_col_is(sys_indexes, sys_index, 8, "PAGE_NO")); + + field = rec_get_nth_field_old(rec, 8, &len); + page_no = mach_read_from_4(field); + + /* We check for unsupported types first, so that the + subsequent checks are relevant for the supported types. 
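+Only DICT_CLUSTERED and DICT_UNIQUE are known to this version; any other +type bit fails the load with DB_UNSUPPORTED.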
*/ + if (type & ~(DICT_CLUSTERED | DICT_UNIQUE)) { + + fprintf(stderr, + "InnoDB: Error: unknown type %lu" + " of index %s of table %s\n", + (ulong) type, name_buf, table->name); + + error = DB_UNSUPPORTED; + goto func_exit; + } else if (page_no == FIL_NULL) { + + fprintf(stderr, + "InnoDB: Error: trying to load index %s" + " for table %s\n" + "InnoDB: but the index tree has been freed!\n", + name_buf, table->name); + + error = DB_CORRUPTION; + goto func_exit; + } else if ((type & DICT_CLUSTERED) == 0 + && NULL == dict_table_get_first_index(table)) { + + fputs("InnoDB: Error: trying to load index ", + stderr); + ut_print_name(stderr, NULL, FALSE, name_buf); + fputs(" for table ", stderr); + ut_print_name(stderr, NULL, TRUE, table->name); + fputs("\nInnoDB: but the first index" + " is not clustered!\n", stderr); + + error = DB_CORRUPTION; + goto func_exit; + } else if (is_sys_table + && ((type & DICT_CLUSTERED) + || ((table == dict_sys->sys_tables) + && (name_len == (sizeof "ID_IND") - 1) + && (0 == ut_memcmp(name_buf, + "ID_IND", name_len))))) { + + /* The index was created in memory already at booting + of the database server */ + } else { + index = dict_mem_index_create(table->name, name_buf, + space, type, n_fields); + index->id = id; + + dict_load_fields(index, heap); + error = dict_index_add_to_cache(table, index, page_no, + FALSE); + /* The data dictionary tables should never contain + invalid index definitions. If we ignored this error + and simply did not load this index definition, the + .frm file would disagree with the index definitions + inside InnoDB. */ + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + + goto func_exit; + } + } + +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(error); +} + +/************************************************************************ +Loads a table definition and also all its index definitions, and also +the cluster definition if the table is a member in a cluster. Also loads +all foreign key constraints where the foreign key is in the table or where +a foreign key references columns in this table. Adds all these to the data +dictionary cache. 
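A minimal usage sketch
+follows; the table name is hypothetical and the code is illustrative
+only, hence disabled. */
+#if 0
+static void
+dict_load_table_example(void)
+{
+	dict_table_t*	table;
+
+	/* dict_load_table() expects the caller to hold the
+	dictionary mutex */
+	mutex_enter(&(dict_sys->mutex));
+	table = dict_load_table("test/t1");
+	mutex_exit(&(dict_sys->mutex));
+
+	if (table == NULL) {
+		/* no such table in SYS_TABLES */
+	} else if (table->ibd_file_missing) {
+		/* definition loaded, but the .ibd file was not found */
+	}
+}
+#endif /* 0 */
+/*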
*/ +UNIV_INTERN +dict_table_t* +dict_load_table( +/*============*/ + /* out: table, NULL if does not exist; + if the table is stored in an .ibd file, + but the file does not exist, + then we set the ibd_file_missing flag TRUE + in the table object we return */ + const char* name) /* in: table name in the + databasename/tablename format */ +{ + ibool ibd_file_missing = FALSE; + dict_table_t* table; + dict_table_t* sys_tables; + btr_pcur_t pcur; + dict_index_t* sys_index; + dtuple_t* tuple; + mem_heap_t* heap; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + ulint space; + ulint n_cols; + ulint flags; + ulint err; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + heap = mem_heap_create(32000); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low("SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + ut_a(!dict_table_is_comp(sys_tables)); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, name, ut_strlen(name)); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur) + || rec_get_deleted_flag(rec, 0)) { + /* Not found */ +err_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + field = rec_get_nth_field_old(rec, 0, &len); + + /* Check if the table name in record is the searched one */ + if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) { + + goto err_exit; + } + + ut_a(name_of_col_is(sys_tables, sys_index, 9, "SPACE")); + + field = rec_get_nth_field_old(rec, 9, &len); + space = mach_read_from_4(field); + + /* Check if the tablespace exists and has the right name */ + if (space != 0) { + flags = dict_sys_tables_get_flags(rec); + + if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) { + field = rec_get_nth_field_old(rec, 5, &len); + flags = mach_read_from_4(field); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary" + " has unknown type %lx.\n", + (ulong) flags); + goto err_exit; + } + + if (fil_space_for_table_exists_in_mem(space, name, FALSE, + FALSE, FALSE)) { + /* Ok; (if we did a crash recovery then the tablespace + can already be in the memory cache) */ + } else { + /* In >= 4.1.9, InnoDB scans the data dictionary also + at a normal mysqld startup. It is an error if the + space object does not exist in memory. */ + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: error: space object of table %s,\n" + "InnoDB: space id %lu did not exist in memory." + " Retrying an open.\n", + name, (ulong)space); + /* Try to open the tablespace */ + if (!fil_open_single_table_tablespace( + TRUE, space, flags, name)) { + /* We failed to find a sensible tablespace + file */ + + ibd_file_missing = TRUE; + } + } + } else { + flags = 0; + } + + ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS")); + + field = rec_get_nth_field_old(rec, 4, &len); + n_cols = mach_read_from_4(field); + + /* The high-order bit of N_COLS is the "compact format" flag. 
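For example, a stored
	N_COLS value of 0x80000003 denotes a COMPACT-format table with
	3 user columns, while 0x00000003 denotes a REDUNDANT-format
	(old-style) table with 3 user columns.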
*/ + if (n_cols & 0x80000000UL) { + flags |= DICT_TF_COMPACT; + } + + table = dict_mem_table_create(name, space, n_cols & ~0x80000000UL, + flags); + + table->ibd_file_missing = (unsigned int) ibd_file_missing; + + ut_a(name_of_col_is(sys_tables, sys_index, 3, "ID")); + + field = rec_get_nth_field_old(rec, 3, &len); + table->id = mach_read_from_8(field); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + dict_load_columns(table, heap); + + dict_table_add_to_cache(table, heap); + + mem_heap_empty(heap); + + err = dict_load_indexes(table, heap); +#ifndef UNIV_HOTBACKUP + /* If the force recovery flag is set, we open the table irrespective + of the error condition, since the user may want to dump data from the + clustered index. However we load the foreign key information only if + all indexes were loaded. */ + if (err == DB_SUCCESS) { + err = dict_load_foreigns(table->name, TRUE); + } else if (!srv_force_recovery) { + dict_table_remove_from_cache(table); + table = NULL; + } +# if 0 + if (err != DB_SUCCESS && table != NULL) { + + mutex_enter(&dict_foreign_err_mutex); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: could not make a foreign key" + " definition to match\n" + "InnoDB: the foreign key table" + " or the referenced table!\n" + "InnoDB: The data dictionary of InnoDB is corrupt." + " You may need to drop\n" + "InnoDB: and recreate the foreign key table" + " or the referenced table.\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n" + "InnoDB: Latest foreign key error printout:\n%s\n", + dict_foreign_err_buf); + + mutex_exit(&dict_foreign_err_mutex); + } +# endif /* 0 */ +#endif /* !UNIV_HOTBACKUP */ + mem_heap_free(heap); + + return(table); +} + +/*************************************************************************** +Loads a table object based on the table id. */ +UNIV_INTERN +dict_table_t* +dict_load_table_on_id( +/*==================*/ + /* out: table; NULL if table does not exist */ + dulint table_id) /* in: table id */ +{ + byte id_buf[8]; + btr_pcur_t pcur; + mem_heap_t* heap; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_table_ids; + dict_table_t* sys_tables; + const rec_t* rec; + const byte* field; + ulint len; + dict_table_t* table; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* NOTE that the operation of this function is protected by + the dictionary mutex, and therefore no deadlocks can occur + with other dictionary operations. 
*/ + + mtr_start(&mtr); + /*---------------------------------------------------*/ + /* Get the secondary index based on ID for table SYS_TABLES */ + sys_tables = dict_sys->sys_tables; + sys_table_ids = dict_table_get_next_index( + dict_table_get_first_index(sys_tables)); + ut_a(!dict_table_is_comp(sys_tables)); + heap = mem_heap_create(256); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + /* Write the table id in byte format to id_buf */ + mach_write_to_8(id_buf, table_id); + + dfield_set_data(dfield, id_buf, 8); + dict_index_copy_types(tuple, sys_table_ids, 1); + + btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur) + || rec_get_deleted_flag(rec, 0)) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + /*---------------------------------------------------*/ + /* Now we have the record in the secondary index containing the + table ID and NAME */ + + rec = btr_pcur_get_rec(&pcur); + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + + /* Check if the table id in record is the one searched for */ + if (ut_dulint_cmp(table_id, mach_read_from_8(field)) != 0) { + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(NULL); + } + + /* Now we get the table name from the record */ + field = rec_get_nth_field_old(rec, 1, &len); + /* Load the table definition to memory */ + table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len)); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + return(table); +} + +/************************************************************************ +This function is called when the database is booted. Loads system table +index definitions except for the clustered index which is added to the +dictionary cache at booting before calling this function. */ +UNIV_INTERN +void +dict_load_sys_table( +/*================*/ + dict_table_t* table) /* in: system table */ +{ + mem_heap_t* heap; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + heap = mem_heap_create(1000); + + dict_load_indexes(table, heap); + + mem_heap_free(heap); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************ +Loads foreign key constraint col names (also for the referenced table). 
*/ +static +void +dict_load_foreign_cols( +/*===================*/ + const char* id, /* in: foreign constraint id as a + null-terminated string */ + dict_foreign_t* foreign)/* in: foreign constraint object */ +{ + dict_table_t* sys_foreign_cols; + dict_index_t* sys_index; + btr_pcur_t pcur; + dtuple_t* tuple; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + ulint i; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + foreign->foreign_col_names = mem_heap_alloc( + foreign->heap, foreign->n_fields * sizeof(void*)); + + foreign->referenced_col_names = mem_heap_alloc( + foreign->heap, foreign->n_fields * sizeof(void*)); + mtr_start(&mtr); + + sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS"); + sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes); + ut_a(!dict_table_is_comp(sys_foreign_cols)); + + tuple = dtuple_create(foreign->heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, id, ut_strlen(id)); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + for (i = 0; i < foreign->n_fields; i++) { + + rec = btr_pcur_get_rec(&pcur); + + ut_a(btr_pcur_is_on_user_rec(&pcur)); + ut_a(!rec_get_deleted_flag(rec, 0)); + + field = rec_get_nth_field_old(rec, 0, &len); + ut_a(len == ut_strlen(id)); + ut_a(ut_memcmp(id, field, len) == 0); + + field = rec_get_nth_field_old(rec, 1, &len); + ut_a(len == 4); + ut_a(i == mach_read_from_4(field)); + + field = rec_get_nth_field_old(rec, 4, &len); + foreign->foreign_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + field = rec_get_nth_field_old(rec, 5, &len); + foreign->referenced_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); +} + +/*************************************************************************** +Loads a foreign key constraint to the dictionary cache. 
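SYS_FOREIGN packs the
+constraint type and the column count into its single N_COLS column; a
+decoding sketch (illustrative only, mirroring the code below) is: */
+#if 0
+static void
+sys_foreign_n_cols_decode_example(void)
+{
+	/* example word: type 0x06 in bits 24.., two columns in the
+	low bits */
+	ulint	n_fields_and_type = 0x06000002UL;
+
+	ut_a((n_fields_and_type >> 24) == 0x06);
+	ut_a((n_fields_and_type & 0x3FFUL) == 2);
+}
+#endif /* 0 */
+/*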
*/ +static +ulint +dict_load_foreign( +/*==============*/ + /* out: DB_SUCCESS or error code */ + const char* id, /* in: foreign constraint id as a + null-terminated string */ + ibool check_charsets) + /* in: TRUE=check charset compatibility */ +{ + dict_foreign_t* foreign; + dict_table_t* sys_foreign; + btr_pcur_t pcur; + dict_index_t* sys_index; + dtuple_t* tuple; + mem_heap_t* heap2; + dfield_t* dfield; + const rec_t* rec; + const byte* field; + ulint len; + ulint n_fields_and_type; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + heap2 = mem_heap_create(1000); + + mtr_start(&mtr); + + sys_foreign = dict_table_get_low("SYS_FOREIGN"); + sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes); + ut_a(!dict_table_is_comp(sys_foreign)); + + tuple = dtuple_create(heap2, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, id, ut_strlen(id)); + dict_index_copy_types(tuple, sys_index, 1); + + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur) + || rec_get_deleted_flag(rec, 0)) { + /* Not found */ + + fprintf(stderr, + "InnoDB: Error A: cannot load foreign constraint %s\n", + id); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap2); + + return(DB_ERROR); + } + + field = rec_get_nth_field_old(rec, 0, &len); + + /* Check if the id in record is the searched one */ + if (len != ut_strlen(id) || ut_memcmp(id, field, len) != 0) { + + fprintf(stderr, + "InnoDB: Error B: cannot load foreign constraint %s\n", + id); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap2); + + return(DB_ERROR); + } + + /* Read the table names and the number of columns associated + with the constraint */ + + mem_heap_free(heap2); + + foreign = dict_mem_foreign_create(); + + n_fields_and_type = mach_read_from_4( + rec_get_nth_field_old(rec, 5, &len)); + + ut_a(len == 4); + + /* We store the type in the bits 24..29 of n_fields_and_type. */ + + foreign->type = (unsigned int) (n_fields_and_type >> 24); + foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL); + + foreign->id = mem_heap_strdup(foreign->heap, id); + + field = rec_get_nth_field_old(rec, 3, &len); + foreign->foreign_table_name = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + field = rec_get_nth_field_old(rec, 4, &len); + foreign->referenced_table_name = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + dict_load_foreign_cols(id, foreign); + + /* If the foreign table is not yet in the dictionary cache, we + have to load it so that we are able to make type comparisons + in the next function call. */ + + dict_table_get_low(foreign->foreign_table_name); + + /* Note that there may already be a foreign constraint object in + the dictionary cache for this constraint: then the following + call only sets the pointers in it to point to the appropriate table + and index objects and frees the newly created object foreign. + Adding to the cache should always succeed since we are not creating + a new foreign key constraint but loading one from the data + dictionary. */ + + return(dict_foreign_add_to_cache(foreign, check_charsets)); +} + +/*************************************************************************** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. 
Note that we know that the dictionary +cache already contains all constraints where the other relevant table is +already in the dictionary cache. */ +UNIV_INTERN +ulint +dict_load_foreigns( +/*===============*/ + /* out: DB_SUCCESS or error code */ + const char* table_name, /* in: table name */ + ibool check_charsets) /* in: TRUE=check charset + compatibility */ +{ + btr_pcur_t pcur; + mem_heap_t* heap; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sec_index; + dict_table_t* sys_foreign; + const rec_t* rec; + const byte* field; + ulint len; + char* id ; + ulint err; + mtr_t mtr; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + sys_foreign = dict_table_get_low("SYS_FOREIGN"); + + if (sys_foreign == NULL) { + /* No foreign keys defined yet in this database */ + + fprintf(stderr, + "InnoDB: Error: no foreign key system tables" + " in the database\n"); + + return(DB_ERROR); + } + + ut_a(!dict_table_is_comp(sys_foreign)); + mtr_start(&mtr); + + /* Get the secondary index based on FOR_NAME from table + SYS_FOREIGN */ + + sec_index = dict_table_get_next_index( + dict_table_get_first_index(sys_foreign)); +start_load: + heap = mem_heap_create(256); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(dfield, table_name, ut_strlen(table_name)); + dict_index_copy_types(tuple, sec_index, 1); + + btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); +loop: + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* End of index */ + + goto load_next_index; + } + + /* Now we have the record in the secondary index containing a table + name and a foreign constraint ID */ + + rec = btr_pcur_get_rec(&pcur); + field = rec_get_nth_field_old(rec, 0, &len); + + /* Check if the table name in the record is the one searched for; the + following call does the comparison in the latin1_swedish_ci + charset-collation, in a case-insensitive way. */ + + if (0 != cmp_data_data(dfield_get_type(dfield)->mtype, + dfield_get_type(dfield)->prtype, + dfield_get_data(dfield), dfield_get_len(dfield), + field, len)) { + + goto load_next_index; + } + + /* Since table names in SYS_FOREIGN are stored in a case-insensitive + order, we have to check that the table name matches also in a binary + string comparison. On Unix, MySQL allows table names that only differ + in character case. 
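For example, "t1" and
	"T1" compare equal in latin1_swedish_ci, so the index probe
	above finds the records of both tables; the byte comparison
	below then skips the ones that are not an exact match.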
*/ + + if (0 != ut_memcmp(field, table_name, len)) { + + goto next_rec; + } + + if (rec_get_deleted_flag(rec, 0)) { + + goto next_rec; + } + + /* Now we get a foreign key constraint id */ + field = rec_get_nth_field_old(rec, 1, &len); + id = mem_heap_strdupl(heap, (char*) field, len); + + btr_pcur_store_position(&pcur, &mtr); + + mtr_commit(&mtr); + + /* Load the foreign constraint definition to the dictionary cache */ + + err = dict_load_foreign(id, check_charsets); + + if (err != DB_SUCCESS) { + btr_pcur_close(&pcur); + mem_heap_free(heap); + + return(err); + } + + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + goto loop; + +load_next_index: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(heap); + + sec_index = dict_table_get_next_index(sec_index); + + if (sec_index != NULL) { + + mtr_start(&mtr); + + goto start_load; + } + + return(DB_SUCCESS); +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/dict/dict0mem.c b/storage/xtradb/dict/dict0mem.c new file mode 100644 index 00000000000..31c0c23f450 --- /dev/null +++ b/storage/xtradb/dict/dict0mem.c @@ -0,0 +1,308 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "dict0mem.h" + +#ifdef UNIV_NONINL +#include "dict0mem.ic" +#endif + +#include "rem0rec.h" +#include "data0type.h" +#include "mach0data.h" +#include "dict0dict.h" +#include "lock0lock.h" + +#define DICT_HEAP_SIZE 100 /* initial memory heap size when + creating a table or index object */ + +/************************************************************************** +Creates a table memory object. 
*/ +UNIV_INTERN +dict_table_t* +dict_mem_table_create( +/*==================*/ + /* out, own: table object */ + const char* name, /* in: table name */ + ulint space, /* in: space where the clustered index of + the table is placed; this parameter is + ignored if the table is made a member of + a cluster */ + ulint n_cols, /* in: number of columns */ + ulint flags) /* in: table flags */ +{ + dict_table_t* table; + mem_heap_t* heap; + + ut_ad(name); + ut_a(!(flags & (~0 << DICT_TF_BITS))); + + heap = mem_heap_create(DICT_HEAP_SIZE); + + table = mem_heap_zalloc(heap, sizeof(dict_table_t)); + + table->heap = heap; + + table->flags = (unsigned int) flags; + table->name = mem_heap_strdup(heap, name); + table->space = (unsigned int) space; + table->n_cols = (unsigned int) (n_cols + DATA_N_SYS_COLS); + + table->cols = mem_heap_alloc(heap, (n_cols + DATA_N_SYS_COLS) + * sizeof(dict_col_t)); + + table->autoinc_lock = mem_heap_alloc(heap, lock_get_size()); + + mutex_create(&table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX); + + table->autoinc = 0; + + /* The number of transactions that are either waiting on the + AUTOINC lock or have been granted the lock. */ + table->n_waiting_or_granted_auto_inc_locks = 0; + +#ifdef UNIV_DEBUG + table->magic_n = DICT_TABLE_MAGIC_N; +#endif /* UNIV_DEBUG */ + return(table); +} + +/******************************************************************** +Free a table memory object. */ +UNIV_INTERN +void +dict_mem_table_free( +/*================*/ + dict_table_t* table) /* in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_d(table->cached = FALSE); + + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); +} + +/******************************************************************** +Append 'name' to 'col_names' (@see dict_table_t::col_names). */ +static +const char* +dict_add_col_name( +/*==============*/ + /* out: new column names array */ + const char* col_names, /* in: existing column names, or + NULL */ + ulint cols, /* in: number of existing columns */ + const char* name, /* in: new column name */ + mem_heap_t* heap) /* in: heap */ +{ + ulint old_len; + ulint new_len; + ulint total_len; + char* res; + + ut_ad(!cols == !col_names); + + /* Find out length of existing array. */ + if (col_names) { + const char* s = col_names; + ulint i; + + for (i = 0; i < cols; i++) { + s += strlen(s) + 1; + } + + old_len = s - col_names; + } else { + old_len = 0; + } + + new_len = strlen(name) + 1; + total_len = old_len + new_len; + + res = mem_heap_alloc(heap, total_len); + + if (old_len > 0) { + memcpy(res, col_names, old_len); + } + + memcpy(res + old_len, name, new_len); + + return(res); +} + +/************************************************************************** +Adds a column definition to a table. */ +UNIV_INTERN +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /* in: table */ + mem_heap_t* heap, /* in: temporary memory heap, or NULL */ + const char* name, /* in: column name, or NULL */ + ulint mtype, /* in: main datatype */ + ulint prtype, /* in: precise type */ + ulint len) /* in: precision */ +{ + dict_col_t* col; + ulint mbminlen; + ulint mbmaxlen; + ulint i; + + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + i = table->n_def++; + + if (name) { + if (UNIV_UNLIKELY(table->n_def == table->n_cols)) { + heap = table->heap; + } + if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) { + /* All preceding column names are empty. 
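Each name is stored
			just as its terminating NUL byte, so a
			zero-filled block of table->n_def bytes is a
			valid concatenation of empty names (cf.
			dict_add_col_name() above).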
*/ + char* s = mem_heap_zalloc(heap, table->n_def); + table->col_names = s; + } + + table->col_names = dict_add_col_name(table->col_names, + i, name, heap); + } + + col = dict_table_get_nth_col(table, i); + + col->ind = (unsigned int) i; + col->ord_part = 0; + + col->mtype = (unsigned int) mtype; + col->prtype = (unsigned int) prtype; + col->len = (unsigned int) len; + + dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen); + + col->mbminlen = (unsigned int) mbminlen; + col->mbmaxlen = (unsigned int) mbmaxlen; +} + +/************************************************************************** +Creates an index memory object. */ +UNIV_INTERN +dict_index_t* +dict_mem_index_create( +/*==================*/ + /* out, own: index object */ + const char* table_name, /* in: table name */ + const char* index_name, /* in: index name */ + ulint space, /* in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /* in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields) /* in: number of fields */ +{ + dict_index_t* index; + mem_heap_t* heap; + + ut_ad(table_name && index_name); + + heap = mem_heap_create(DICT_HEAP_SIZE); + index = mem_heap_zalloc(heap, sizeof(dict_index_t)); + + index->heap = heap; + + index->type = type; + index->space = (unsigned int) space; + index->name = mem_heap_strdup(heap, index_name); + index->table_name = table_name; + index->n_fields = (unsigned int) n_fields; + index->fields = mem_heap_alloc(heap, 1 + n_fields + * sizeof(dict_field_t)); + /* The '1 +' above prevents allocation + of an empty mem block */ +#ifdef UNIV_DEBUG + index->magic_n = DICT_INDEX_MAGIC_N; +#endif /* UNIV_DEBUG */ + return(index); +} + +/************************************************************************** +Creates and initializes a foreign constraint memory object. */ +UNIV_INTERN +dict_foreign_t* +dict_mem_foreign_create(void) +/*=========================*/ + /* out, own: foreign constraint struct */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + + heap = mem_heap_create(100); + + foreign = mem_heap_zalloc(heap, sizeof(dict_foreign_t)); + + foreign->heap = heap; + + return(foreign); +} + +/************************************************************************** +Adds a field definition to an index. NOTE: does not take a copy +of the column name if the field is a column. The memory occupied +by the column name may be released only after publishing the index. */ +UNIV_INTERN +void +dict_mem_index_add_field( +/*=====================*/ + dict_index_t* index, /* in: index */ + const char* name, /* in: column name */ + ulint prefix_len) /* in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ +{ + dict_field_t* field; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->n_def++; + + field = dict_index_get_nth_field(index, index->n_def - 1); + + field->name = name; + field->prefix_len = (unsigned int) prefix_len; +} + +/************************************************************************** +Frees an index memory object. 
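A minimal
+create/populate/free sketch (hypothetical names, disabled code): */
+#if 0
+static void
+dict_mem_index_example(void)
+{
+	dict_index_t*	index = dict_mem_index_create(
+		"test/t1", "PRIMARY", 0, DICT_CLUSTERED | DICT_UNIQUE, 1);
+
+	/* NOTE: the field name is not copied; it must stay valid for
+	the lifetime of the index object */
+	dict_mem_index_add_field(index, "id", 0);
+
+	dict_mem_index_free(index);
+}
+#endif /* 0 */
+/*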
*/ +UNIV_INTERN +void +dict_mem_index_free( +/*================*/ + dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + mem_heap_free(index->heap); +} diff --git a/storage/xtradb/dyn/dyn0dyn.c b/storage/xtradb/dyn/dyn0dyn.c new file mode 100644 index 00000000000..16e82eaed66 --- /dev/null +++ b/storage/xtradb/dyn/dyn0dyn.c @@ -0,0 +1,64 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +#include "dyn0dyn.h" +#ifdef UNIV_NONINL +#include "dyn0dyn.ic" +#endif + +/**************************************************************** +Adds a new block to a dyn array. */ +UNIV_INTERN +dyn_block_t* +dyn_array_add_block( +/*================*/ + /* out: created block */ + dyn_array_t* arr) /* in: dyn array */ +{ + mem_heap_t* heap; + dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + if (arr->heap == NULL) { + UT_LIST_INIT(arr->base); + UT_LIST_ADD_FIRST(list, arr->base, arr); + + arr->heap = mem_heap_create(sizeof(dyn_block_t)); + } + + block = dyn_array_get_last_block(arr); + block->used = block->used | DYN_BLOCK_FULL_FLAG; + + heap = arr->heap; + + block = mem_heap_alloc(heap, sizeof(dyn_block_t)); + + block->used = 0; + + UT_LIST_ADD_LAST(list, arr->base, block); + + return(block); +} diff --git a/storage/xtradb/eval/eval0eval.c b/storage/xtradb/eval/eval0eval.c new file mode 100644 index 00000000000..a2590c63c38 --- /dev/null +++ b/storage/xtradb/eval/eval0eval.c @@ -0,0 +1,851 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+
+#ifdef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#include "data0data.h"
+#include "row0sel.h"
+
+/* The RND function seed */
+static ulint	eval_rnd	= 128367121;
+
+/* Dummy address used when we should allocate a buffer of size 0 in
+the function below */
+
+static byte	eval_dummy;
+
+/*********************************************************************
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field. */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+			/* out: pointer to allocated buffer */
+	que_node_t*	node,	/* in: query graph node; sets the val
+				field's data field to point to the new
+				buffer, and the len field equal to size */
+	ulint		size)	/* in: buffer size */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = dfield_get_data(dfield);
+
+	if (data && data != &eval_dummy) {
+		mem_free(data);
+	}
+
+	if (size == 0) {
+		data = &eval_dummy;
+	} else {
+		data = mem_alloc(size);
+	}
+
+	que_node_set_val_buf_size(node, size);
+
+	dfield_set_data(dfield, data, size);
+
+	return(data);
+}
+
+/*********************************************************************
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+	que_node_t*	node)	/* in: query graph node */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = dfield_get_data(dfield);
+
+	if (que_node_get_val_buf_size(node) > 0) {
+		ut_a(data);
+
+		mem_free(data);
+	}
+}
+
+/*********************************************************************
+Evaluates a comparison node. 
*/ +UNIV_INTERN +ibool +eval_cmp( +/*=====*/ + /* out: the result of the comparison */ + func_node_t* cmp_node) /* in: comparison node */ +{ + que_node_t* arg1; + que_node_t* arg2; + int res; + ibool val; + int func; + + ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC); + + arg1 = cmp_node->args; + arg2 = que_node_get_next(arg1); + + res = cmp_dfield_dfield(que_node_get_val(arg1), + que_node_get_val(arg2)); + val = TRUE; + + func = cmp_node->func; + + if (func == '=') { + if (res != 0) { + val = FALSE; + } + } else if (func == '<') { + if (res != -1) { + val = FALSE; + } + } else if (func == PARS_LE_TOKEN) { + if (res == 1) { + val = FALSE; + } + } else if (func == PARS_NE_TOKEN) { + if (res == 0) { + val = FALSE; + } + } else if (func == PARS_GE_TOKEN) { + if (res == -1) { + val = FALSE; + } + } else { + ut_ad(func == '>'); + + if (res != 1) { + val = FALSE; + } + } + + eval_node_set_ibool_val(cmp_node, val); + + return(val); +} + +/********************************************************************* +Evaluates a logical operation node. */ +UNIV_INLINE +void +eval_logical( +/*=========*/ + func_node_t* logical_node) /* in: logical operation node */ +{ + que_node_t* arg1; + que_node_t* arg2; + ibool val1; + ibool val2 = 0; /* remove warning */ + ibool val = 0; /* remove warning */ + int func; + + ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC); + + arg1 = logical_node->args; + arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */ + + val1 = eval_node_get_ibool_val(arg1); + + if (arg2) { + val2 = eval_node_get_ibool_val(arg2); + } + + func = logical_node->func; + + if (func == PARS_AND_TOKEN) { + val = val1 & val2; + } else if (func == PARS_OR_TOKEN) { + val = val1 | val2; + } else if (func == PARS_NOT_TOKEN) { + val = TRUE - val1; + } else { + ut_error; + } + + eval_node_set_ibool_val(logical_node, val); +} + +/********************************************************************* +Evaluates an arithmetic operation node. */ +UNIV_INLINE +void +eval_arith( +/*=======*/ + func_node_t* arith_node) /* in: arithmetic operation node */ +{ + que_node_t* arg1; + que_node_t* arg2; + lint val1; + lint val2 = 0; /* remove warning */ + lint val; + int func; + + ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC); + + arg1 = arith_node->args; + arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */ + + val1 = eval_node_get_int_val(arg1); + + if (arg2) { + val2 = eval_node_get_int_val(arg2); + } + + func = arith_node->func; + + if (func == '+') { + val = val1 + val2; + } else if ((func == '-') && arg2) { + val = val1 - val2; + } else if (func == '-') { + val = -val1; + } else if (func == '*') { + val = val1 * val2; + } else { + ut_ad(func == '/'); + val = val1 / val2; + } + + eval_node_set_int_val(arith_node, val); +} + +/********************************************************************* +Evaluates an aggregate operation node. 
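For reference, the token
+dispatch in eval_cmp() above is equivalent to the following, assuming
+cmp_dfield_dfield() returns exactly -1, 0 or 1 (illustrative sketch,
+disabled): */
+#if 0
+static ibool
+eval_cmp_equivalent(int func, int res)
+{
+	switch (func) {
+	case '=':		return(res == 0);
+	case '<':		return(res < 0);
+	case '>':		return(res > 0);
+	case PARS_LE_TOKEN:	return(res <= 0);
+	case PARS_GE_TOKEN:	return(res >= 0);
+	default:		ut_ad(func == PARS_NE_TOKEN);
+				return(res != 0);
+	}
+}
+#endif /* 0 */
+/*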
*/ +UNIV_INLINE +void +eval_aggregate( +/*===========*/ + func_node_t* node) /* in: aggregate operation node */ +{ + que_node_t* arg; + lint val; + lint arg_val; + int func; + + ut_ad(que_node_get_type(node) == QUE_NODE_FUNC); + + val = eval_node_get_int_val(node); + + func = node->func; + + if (func == PARS_COUNT_TOKEN) { + + val = val + 1; + } else { + ut_ad(func == PARS_SUM_TOKEN); + + arg = node->args; + arg_val = eval_node_get_int_val(arg); + + val = val + arg_val; + } + + eval_node_set_int_val(node, val); +} + +/********************************************************************* +Evaluates a predefined function node where the function is not relevant +in benchmarks. */ +static +void +eval_predefined_2( +/*==============*/ + func_node_t* func_node) /* in: predefined function node */ +{ + que_node_t* arg; + que_node_t* arg1; + que_node_t* arg2 = 0; /* remove warning (??? bug ???) */ + lint int_val; + byte* data; + ulint len1; + ulint len2; + int func; + ulint i; + + ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC); + + arg1 = func_node->args; + + if (arg1) { + arg2 = que_node_get_next(arg1); + } + + func = func_node->func; + + if (func == PARS_PRINTF_TOKEN) { + + arg = arg1; + + while (arg) { + dfield_print(que_node_get_val(arg)); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + } else if (func == PARS_ASSERT_TOKEN) { + + if (!eval_node_get_ibool_val(arg1)) { + fputs("SQL assertion fails in a stored procedure!\n", + stderr); + } + + ut_a(eval_node_get_ibool_val(arg1)); + + /* This function, or more precisely, a debug procedure, + returns no value */ + + } else if (func == PARS_RND_TOKEN) { + + len1 = (ulint)eval_node_get_int_val(arg1); + len2 = (ulint)eval_node_get_int_val(arg2); + + ut_ad(len2 >= len1); + + if (len2 > len1) { + int_val = (lint) (len1 + + (eval_rnd % (len2 - len1 + 1))); + } else { + int_val = (lint) len1; + } + + eval_rnd = ut_rnd_gen_next_ulint(eval_rnd); + + eval_node_set_int_val(func_node, int_val); + + } else if (func == PARS_RND_STR_TOKEN) { + + len1 = (ulint)eval_node_get_int_val(arg1); + + data = eval_node_ensure_val_buf(func_node, len1); + + for (i = 0; i < len1; i++) { + data[i] = (byte)(97 + (eval_rnd % 3)); + + eval_rnd = ut_rnd_gen_next_ulint(eval_rnd); + } + } else { + ut_error; + } +} + +/********************************************************************* +Evaluates a notfound-function node. */ +UNIV_INLINE +void +eval_notfound( +/*==========*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + sym_node_t* cursor; + sel_node_t* sel_node; + ibool ibool_val; + + arg1 = func_node->args; + arg2 = que_node_get_next(arg1); + + ut_ad(func_node->func == PARS_NOTFOUND_TOKEN); + + cursor = arg1; + + ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL); + + if (cursor->token_type == SYM_LIT) { + + ut_ad(ut_memcmp(dfield_get_data(que_node_get_val(cursor)), + "SQL", 3) == 0); + + sel_node = cursor->sym_table->query_graph->last_sel_node; + } else { + sel_node = cursor->alias->cursor_def; + } + + if (sel_node->state == SEL_NODE_NO_MORE_ROWS) { + ibool_val = TRUE; + } else { + ibool_val = FALSE; + } + + eval_node_set_ibool_val(func_node, ibool_val); +} + +/********************************************************************* +Evaluates a substr-function node. 
*/ +UNIV_INLINE +void +eval_substr( +/*========*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + que_node_t* arg3; + dfield_t* dfield; + byte* str1; + ulint len1; + ulint len2; + + arg1 = func_node->args; + arg2 = que_node_get_next(arg1); + + ut_ad(func_node->func == PARS_SUBSTR_TOKEN); + + arg3 = que_node_get_next(arg2); + + str1 = dfield_get_data(que_node_get_val(arg1)); + + len1 = (ulint)eval_node_get_int_val(arg2); + len2 = (ulint)eval_node_get_int_val(arg3); + + dfield = que_node_get_val(func_node); + + dfield_set_data(dfield, str1 + len1, len2); +} + +/********************************************************************* +Evaluates a replstr-procedure node. */ +static +void +eval_replstr( +/*=========*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + que_node_t* arg3; + que_node_t* arg4; + byte* str1; + byte* str2; + ulint len1; + ulint len2; + + arg1 = func_node->args; + arg2 = que_node_get_next(arg1); + + ut_ad(que_node_get_type(arg1) == QUE_NODE_SYMBOL); + + arg3 = que_node_get_next(arg2); + arg4 = que_node_get_next(arg3); + + str1 = dfield_get_data(que_node_get_val(arg1)); + str2 = dfield_get_data(que_node_get_val(arg2)); + + len1 = (ulint)eval_node_get_int_val(arg3); + len2 = (ulint)eval_node_get_int_val(arg4); + + if ((dfield_get_len(que_node_get_val(arg1)) < len1 + len2) + || (dfield_get_len(que_node_get_val(arg2)) < len2)) { + + ut_error; + } + + ut_memcpy(str1 + len1, str2, len2); +} + +/********************************************************************* +Evaluates an instr-function node. */ +static +void +eval_instr( +/*=======*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + dfield_t* dfield1; + dfield_t* dfield2; + lint int_val; + byte* str1; + byte* str2; + byte match_char; + ulint len1; + ulint len2; + ulint i; + ulint j; + + arg1 = func_node->args; + arg2 = que_node_get_next(arg1); + + dfield1 = que_node_get_val(arg1); + dfield2 = que_node_get_val(arg2); + + str1 = dfield_get_data(dfield1); + str2 = dfield_get_data(dfield2); + + len1 = dfield_get_len(dfield1); + len2 = dfield_get_len(dfield2); + + if (len2 == 0) { + ut_error; + } + + match_char = str2[0]; + + for (i = 0; i < len1; i++) { + /* In this outer loop, the number of matched characters is 0 */ + + if (str1[i] == match_char) { + + if (i + len2 > len1) { + + break; + } + + for (j = 1;; j++) { + /* We have already matched j characters */ + + if (j == len2) { + int_val = i + 1; + + goto match_found; + } + + if (str1[i + j] != str2[j]) { + + break; + } + } + } + } + + int_val = 0; + +match_found: + eval_node_set_int_val(func_node, int_val); +} + +/********************************************************************* +Evaluates a predefined function node. */ +UNIV_INLINE +void +eval_binary_to_number( +/*==================*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg1; + dfield_t* dfield; + byte* str1; + byte* str2; + ulint len1; + ulint int_val; + + arg1 = func_node->args; + + dfield = que_node_get_val(arg1); + + str1 = dfield_get_data(dfield); + len1 = dfield_get_len(dfield); + + if (len1 > 4) { + ut_error; + } + + if (len1 == 4) { + str2 = str1; + } else { + int_val = 0; + str2 = (byte*)&int_val; + + ut_memcpy(str2 + (4 - len1), str1, len1); + } + + eval_node_copy_and_alloc_val(func_node, str2, 4); +} + +/********************************************************************* +Evaluates a predefined function node. 
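For reference, the
+semantics of eval_instr() above restated as a standalone sketch
+(illustrative only, disabled): */
+#if 0
+static ulint
+instr_example(const byte* s, ulint s_len, const byte* p, ulint p_len)
+{
+	ulint	i;
+
+	ut_a(p_len > 0);	/* eval_instr() requires this too */
+
+	for (i = 0; i + p_len <= s_len; i++) {
+		if (0 == ut_memcmp(s + i, p, p_len)) {
+
+			return(i + 1);	/* 1-based position of match */
+		}
+	}
+
+	return(0);	/* not found */
+}
+#endif /* 0 */
+/*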
*/ +static +void +eval_concat( +/*========*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg; + dfield_t* dfield; + byte* data; + ulint len; + ulint len1; + + arg = func_node->args; + len = 0; + + while (arg) { + len1 = dfield_get_len(que_node_get_val(arg)); + + len += len1; + + arg = que_node_get_next(arg); + } + + data = eval_node_ensure_val_buf(func_node, len); + + arg = func_node->args; + len = 0; + + while (arg) { + dfield = que_node_get_val(arg); + len1 = dfield_get_len(dfield); + + ut_memcpy(data + len, dfield_get_data(dfield), len1); + + len += len1; + + arg = que_node_get_next(arg); + } +} + +/********************************************************************* +Evaluates a predefined function node. If the first argument is an integer, +this function looks at the second argument which is the integer length in +bytes, and converts the integer to a VARCHAR. +If the first argument is of some other type, this function converts it to +BINARY. */ +UNIV_INLINE +void +eval_to_binary( +/*===========*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg1; + que_node_t* arg2; + dfield_t* dfield; + byte* str1; + ulint len; + ulint len1; + + arg1 = func_node->args; + + str1 = dfield_get_data(que_node_get_val(arg1)); + + if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) { + + len = dfield_get_len(que_node_get_val(arg1)); + + dfield = que_node_get_val(func_node); + + dfield_set_data(dfield, str1, len); + + return; + } + + arg2 = que_node_get_next(arg1); + + len1 = (ulint)eval_node_get_int_val(arg2); + + if (len1 > 4) { + + ut_error; + } + + dfield = que_node_get_val(func_node); + + dfield_set_data(dfield, str1 + (4 - len1), len1); +} + +/********************************************************************* +Evaluates a predefined function node. */ +UNIV_INLINE +void +eval_predefined( +/*============*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg1; + lint int_val; + byte* data; + int func; + + func = func_node->func; + + arg1 = func_node->args; + + if (func == PARS_LENGTH_TOKEN) { + + int_val = (lint)dfield_get_len(que_node_get_val(arg1)); + + } else if (func == PARS_TO_CHAR_TOKEN) { + + /* Convert number to character string as a + signed decimal integer. */ + + ulint uint_val; + int int_len; + + int_val = eval_node_get_int_val(arg1); + + /* Determine the length of the string. 
*/ + + if (int_val == 0) { + int_len = 1; /* the number 0 occupies 1 byte */ + } else { + int_len = 0; + if (int_val < 0) { + uint_val = ((ulint) -int_val - 1) + 1; + int_len++; /* reserve space for minus sign */ + } else { + uint_val = (ulint) int_val; + } + for (; uint_val > 0; int_len++) { + uint_val /= 10; + } + } + + /* allocate the string */ + data = eval_node_ensure_val_buf(func_node, int_len + 1); + + /* add terminating NUL character */ + data[int_len] = 0; + + /* convert the number */ + + if (int_val == 0) { + data[0] = '0'; + } else { + int tmp; + if (int_val < 0) { + data[0] = '-'; /* preceding minus sign */ + uint_val = ((ulint) -int_val - 1) + 1; + } else { + uint_val = (ulint) int_val; + } + for (tmp = int_len; uint_val > 0; uint_val /= 10) { + data[--tmp] = (byte) + ('0' + (byte)(uint_val % 10)); + } + } + + dfield_set_len(que_node_get_val(func_node), int_len); + + return; + + } else if (func == PARS_TO_NUMBER_TOKEN) { + + int_val = atoi((char*) + dfield_get_data(que_node_get_val(arg1))); + + } else if (func == PARS_SYSDATE_TOKEN) { + int_val = (lint)ut_time(); + } else { + eval_predefined_2(func_node); + + return; + } + + eval_node_set_int_val(func_node, int_val); +} + +/********************************************************************* +Evaluates a function node. */ +UNIV_INTERN +void +eval_func( +/*======*/ + func_node_t* func_node) /* in: function node */ +{ + que_node_t* arg; + ulint class; + ulint func; + + ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC); + + class = func_node->class; + func = func_node->func; + + arg = func_node->args; + + /* Evaluate first the argument list */ + while (arg) { + eval_exp(arg); + + /* The functions are not defined for SQL null argument + values, except for eval_cmp and notfound */ + + if (dfield_is_null(que_node_get_val(arg)) + && (class != PARS_FUNC_CMP) + && (func != PARS_NOTFOUND_TOKEN) + && (func != PARS_PRINTF_TOKEN)) { + ut_error; + } + + arg = que_node_get_next(arg); + } + + if (class == PARS_FUNC_CMP) { + eval_cmp(func_node); + } else if (class == PARS_FUNC_ARITH) { + eval_arith(func_node); + } else if (class == PARS_FUNC_AGGREGATE) { + eval_aggregate(func_node); + } else if (class == PARS_FUNC_PREDEFINED) { + + if (func == PARS_NOTFOUND_TOKEN) { + eval_notfound(func_node); + } else if (func == PARS_SUBSTR_TOKEN) { + eval_substr(func_node); + } else if (func == PARS_REPLSTR_TOKEN) { + eval_replstr(func_node); + } else if (func == PARS_INSTR_TOKEN) { + eval_instr(func_node); + } else if (func == PARS_BINARY_TO_NUMBER_TOKEN) { + eval_binary_to_number(func_node); + } else if (func == PARS_CONCAT_TOKEN) { + eval_concat(func_node); + } else if (func == PARS_TO_BINARY_TOKEN) { + eval_to_binary(func_node); + } else { + eval_predefined(func_node); + } + } else { + ut_ad(class == PARS_FUNC_LOGICAL); + + eval_logical(func_node); + } +} diff --git a/storage/xtradb/eval/eval0proc.c b/storage/xtradb/eval/eval0proc.c new file mode 100644 index 00000000000..9c7563e8c7d --- /dev/null +++ b/storage/xtradb/eval/eval0proc.c @@ -0,0 +1,294 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "eval0proc.h" + +#ifdef UNIV_NONINL +#include "eval0proc.ic" +#endif + +/************************************************************************** +Performs an execution step of an if-statement node. */ +UNIV_INTERN +que_thr_t* +if_step( +/*====*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + if_node_t* node; + elsif_node_t* elsif_node; + + ut_ad(thr); + + node = thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_IF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Evaluate the condition */ + + eval_exp(node->cond); + + if (eval_node_get_ibool_val(node->cond)) { + + /* The condition evaluated to TRUE: start execution + from the first statement in the statement list */ + + thr->run_node = node->stat_list; + + } else if (node->else_part) { + thr->run_node = node->else_part; + + } else if (node->elsif_list) { + elsif_node = node->elsif_list; + + for (;;) { + eval_exp(elsif_node->cond); + + if (eval_node_get_ibool_val( + elsif_node->cond)) { + + /* The condition evaluated to TRUE: + start execution from the first + statement in the statement list */ + + thr->run_node = elsif_node->stat_list; + + break; + } + + elsif_node = que_node_get_next(elsif_node); + + if (elsif_node == NULL) { + thr->run_node = NULL; + + break; + } + } + } else { + thr->run_node = NULL; + } + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/************************************************************************** +Performs an execution step of a while-statement node. */ +UNIV_INTERN +que_thr_t* +while_step( +/*=======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + while_node_t* node; + + ut_ad(thr); + + node = thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_WHILE); + + ut_ad((thr->prev_node == que_node_get_parent(node)) + || (que_node_get_next(thr->prev_node) == NULL)); + + /* Evaluate the condition */ + + eval_exp(node->cond); + + if (eval_node_get_ibool_val(node->cond)) { + + /* The condition evaluated to TRUE: start execution + from the first statement in the statement list */ + + thr->run_node = node->stat_list; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/************************************************************************** +Performs an execution step of an assignment statement node. 
*/ +UNIV_INTERN +que_thr_t* +assign_step( +/*========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + assign_node_t* node; + + ut_ad(thr); + + node = thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT); + + /* Evaluate the value to assign */ + + eval_exp(node->val); + + eval_node_copy_val(node->var->alias, node->val); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/************************************************************************** +Performs an execution step of a for-loop node. */ +UNIV_INTERN +que_thr_t* +for_step( +/*=====*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + for_node_t* node; + que_node_t* parent; + lint loop_var_value; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_FOR); + + parent = que_node_get_parent(node); + + if (thr->prev_node != parent) { + + /* Move to the next statement */ + thr->run_node = que_node_get_next(thr->prev_node); + + if (thr->run_node != NULL) { + + return(thr); + } + + /* Increment the value of loop_var */ + + loop_var_value = 1 + eval_node_get_int_val(node->loop_var); + } else { + /* Initialize the loop */ + + eval_exp(node->loop_start_limit); + eval_exp(node->loop_end_limit); + + loop_var_value = eval_node_get_int_val(node->loop_start_limit); + + node->loop_end_value + = (int) eval_node_get_int_val(node->loop_end_limit); + } + + /* Check if we should do another loop */ + + if (loop_var_value > node->loop_end_value) { + + /* Enough loops done */ + + thr->run_node = parent; + } else { + eval_node_set_int_val(node->loop_var, loop_var_value); + + thr->run_node = node->stat_list; + } + + return(thr); +} + +/************************************************************************** +Performs an execution step of an exit statement node. */ +UNIV_INTERN +que_thr_t* +exit_step( +/*======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + exit_node_t* node; + que_node_t* loop_node; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_EXIT); + + /* Loops exit by setting thr->run_node as the loop node's parent, so + find our containing loop node and get its parent. */ + + loop_node = que_node_get_containing_loop_node(node); + + /* If someone uses an EXIT statement outside of a loop, this will + trigger. */ + ut_a(loop_node); + + thr->run_node = que_node_get_parent(loop_node); + + return(thr); +} + +/************************************************************************** +Performs an execution step of a return-statement node. */ +UNIV_INTERN +que_thr_t* +return_step( +/*========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + return_node_t* node; + que_node_t* parent; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_RETURN); + + parent = node; + + while (que_node_get_type(parent) != QUE_NODE_PROC) { + + parent = que_node_get_parent(parent); + } + + ut_a(parent); + + thr->run_node = que_node_get_parent(parent); + + return(thr); +} diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.c new file mode 100644 index 00000000000..dad045fe067 --- /dev/null +++ b/storage/xtradb/fil/fil0fil.c @@ -0,0 +1,4817 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The tablespace memory cache + +Created 10/25/1995 Heikki Tuuri +*******************************************************/ + +#include "fil0fil.h" + +#include "mem0mem.h" +#include "sync0sync.h" +#include "hash0hash.h" +#include "os0file.h" +#include "os0sync.h" +#include "mach0data.h" +#include "ibuf0ibuf.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0zip.h" + + +/* + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= + +The tablespace cache is responsible for providing fast read/write access to +tablespaces and logs of the database. File creation and deletion is done +in other modules which know more of the logic of the operation, however. + +A tablespace consists of a chain of files. The size of the files does not +have to be divisible by the database block size, because we may just leave +the last incomplete block unused. When a new file is appended to the +tablespace, the maximum size of the file is also specified. At the moment, +we think that it is best to extend the file to its maximum size already at +the creation of the file, because then we can avoid dynamically extending +the file when more space is needed for the tablespace. + +A block's position in the tablespace is specified with a 32-bit unsigned +integer. The files in the chain are thought to be catenated, and the block +corresponding to an address n is the nth block in the catenated file (where +the first block is named the 0th block, and the incomplete block fragments +at the end of files are not taken into account). A tablespace can be extended +by appending a new file at the end of the chain. + +Our tablespace concept is similar to the one of Oracle. + +To acquire more speed in disk transfers, a technique called disk striping is +sometimes used. This means that logical block addresses are divided in a +round-robin fashion across several disks. Windows NT supports disk striping, +so there we do not need to support it in the database. Disk striping is +implemented in hardware in RAID disks. We conclude that it is not necessary +to implement it in the database. Oracle 7 does not support disk striping, +either. + +Another trick used at some database sites is replacing tablespace files by +raw disks, that is, the whole physical disk drive, or a partition of it, is +opened as a single file, and it is accessed through byte offsets calculated +from the start of the disk or the partition. This is recommended in some +books on database tuning to achieve more speed in i/o. 
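A sketch of the block
+address resolution described above (illustrative only; fil_node_t and
+its chain are defined later in this file, so the code is disabled): */
+#if 0
+static fil_node_t*
+fil_chain_lookup(fil_space_t* space, ulint n, ulint* page_in_file)
+{
+	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
+
+	/* walk the catenated files, skipping whole files while the
+	remaining block count reaches past them */
+	while (node != NULL && n >= node->size) {
+		n -= node->size;
+		node = UT_LIST_GET_NEXT(chain, node);
+	}
+
+	*page_in_file = n;	/* block offset within the found file */
+
+	return(node);		/* NULL if n is beyond the space size */
+}
+#endif /* 0 */
+/*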
Using a raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+into a hash table. Each tablespace and log file is given a unique 32-bit
+identifier.
+
+Some operating systems do not support many open files at the same time,
+though NT seems to tolerate at least 900 open files. Therefore, we put the
+open files in an LRU-list. If we need to open another file, we may close the
+file at the end of the LRU-list. When an i/o-operation is pending on a file,
+the file cannot be closed. We take the file nodes with pending i/o-operations
+out of the LRU-list and keep a count of pending operations. When an operation
+completes, we decrement the count and return the file node to the LRU-list if
+the count drops to zero. */
+
+/* When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and ibbackup it is not the default
+directory, and we must set the base file path explicitly */
+UNIV_INTERN const char*	fil_path_to_mysql_datadir	= ".";
+
+/* The number of fsyncs done to the log */
+UNIV_INTERN ulint	fil_n_log_flushes		= 0;
+
+UNIV_INTERN ulint	fil_n_pending_log_flushes	= 0;
+UNIV_INTERN ulint	fil_n_pending_tablespace_flushes	= 0;
+
+/* Null file address */
+UNIV_INTERN fil_addr_t	fil_addr_null		= {FIL_NULL, 0};
+
+/* File node of a tablespace or the log data space */
+struct fil_node_struct {
+	fil_space_t*	space;	/* backpointer to the space where this node
+				belongs */
+	char*		name;	/* path to the file */
+	ibool		open;	/* TRUE if file open */
+	os_file_t	handle;	/* OS handle to the file, if file open */
+	ibool		is_raw_disk;/* TRUE if the 'file' is actually a raw
+				device or a raw disk partition */
+	ulint		size;	/* size of the file in database pages, 0 if
+				not known yet; the possible last incomplete
+				megabyte may be ignored if space == 0 */
+	ulint		n_pending;
+				/* count of pending i/o's on this file;
+				closing of the file is not allowed if
+				this is > 0 */
+	ulint		n_pending_flushes;
+				/* count of pending flushes on this file;
+				closing of the file is not allowed if
+				this is > 0 */
+	ib_int64_t	modification_counter;/* when we write to the file we
+				increment this by one */
+	ib_int64_t	flush_counter;/* up to what modification_counter value
+				we have flushed the modifications to disk */
+	UT_LIST_NODE_T(fil_node_t) chain;
+				/* link field for the file chain */
+	UT_LIST_NODE_T(fil_node_t) LRU;
+				/* link field for the LRU list */
+	ulint		magic_n;
+};
+
+#define	FIL_NODE_MAGIC_N	89389
+
+/* Tablespace or log data space: let us call them by a common name space */
+struct fil_space_struct {
+	char*		name;	/* space name = the path to the first file in
+				it */
+	ulint		id;	/* space id */
+	ib_int64_t	tablespace_version;
+				/* in DISCARD/IMPORT this timestamp is used to
+				check if we should ignore an insert buffer
+				merge request for a page because it actually
+				was for the previous incarnation of the
+				space */
+	ibool		mark;	/* this is set to TRUE at database startup if
+				the space corresponds to a table in the InnoDB
+				data dictionary; so we can print a warning of
+				orphaned tablespaces */
+	ibool		stop_ios;/* TRUE if we want to rename the .ibd file of
+				tablespace and want to stop temporarily
+				posting of new i/o requests on the file */
+	ibool		stop_ibuf_merges;
+				/* we set this TRUE when we start deleting a
+				single-table tablespace */
+	ibool		is_being_deleted;
+				/* this is set to TRUE when we start
+				deleting a single-table tablespace and its
+				file; when this flag is set no further i/o
+				or flush requests can be placed on this space,
+				though there may be such requests still being
+				processed on this space */
+	ulint		purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */
+	UT_LIST_BASE_NODE_T(fil_node_t) chain;
+				/* base node for the file chain */
+	ulint		size;	/* space size in pages; 0 if a single-table
+				tablespace whose size we do not know yet;
+				last incomplete megabytes in data files may be
+				ignored if space == 0 */
+	ulint		flags;	/* in: compressed page size
+				and file format, or 0 */
+	ulint		n_reserved_extents;
+				/* number of reserved free extents for
+				ongoing operations like B-tree page split */
+	ulint		n_pending_flushes; /* this is > 0 when flushing
+				the tablespace to disk; dropping of the
+				tablespace is forbidden if this is > 0 */
+	ulint		n_pending_ibuf_merges;/* this is > 0 when merging
+				insert buffer entries to a page so that we
+				may need to access the ibuf bitmap page in the
+				tablespace: dropping of the tablespace is
+				forbidden if this is > 0 */
+	hash_node_t	hash;	/* hash chain node */
+	hash_node_t	name_hash;/* hash chain node in the name_hash table */
+	rw_lock_t	latch;	/* latch protecting the file space storage
+				allocation */
+	UT_LIST_NODE_T(fil_space_t) unflushed_spaces;
+				/* list of spaces with at least one unflushed
+				file we have written to */
+	ibool		is_in_unflushed_spaces; /* TRUE if this space is
+				currently in the list above */
+	UT_LIST_NODE_T(fil_space_t) space_list;
+				/* list of all spaces */
+	ulint		magic_n;
+};
+
+#define	FIL_SPACE_MAGIC_N	89472
+
+/* The tablespace memory cache; also the totality of logs = the log data space,
+is stored here; below we talk about tablespaces, but also the ib_logfiles
+form a 'space' and it is handled here */
+
+typedef	struct fil_system_struct	fil_system_t;
+struct fil_system_struct {
+	mutex_t		mutex;		/* The mutex protecting the cache */
+	hash_table_t*	spaces;		/* The hash table of spaces in the
+					system; they are hashed on the space
+					id */
+	hash_table_t*	name_hash;	/* hash table based on the space
+					name */
+	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
+					/* base node for the LRU list of the
+					most recently used open files with no
+					pending i/o's; if we start an i/o on
+					the file, we first remove it from this
+					list, and return it to the start of
+					the list when the i/o ends;
+					log files and the system tablespace are
+					not put to this list: they are opened
+					after the startup, and kept open until
+					shutdown */
+	UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
+					/* base node for the list of those
+					tablespaces whose files contain
+					unflushed writes; those spaces have
+					at least one file node where
+					modification_counter > flush_counter */
+	ulint		n_open;		/* number of files currently open */
+	ulint		max_n_open;	/* n_open is not allowed to exceed
+					this */
+	ib_int64_t	modification_counter;/* when we write to a file we
+					increment this by one */
+	ulint		max_assigned_id;/* maximum space id in the existing
+					tables, or assigned during the time
+					mysqld has been up; at an InnoDB
+					startup we scan the data dictionary
+					and set here the maximum of the
+					space id's of the tables there */
+	ib_int64_t	tablespace_version;
+					/* a counter which is incremented for
+					every space object memory creation;
+					every space mem object gets a
+					'timestamp' from this; in DISCARD/
+					IMPORT
this is used to check if we + should ignore an insert buffer merge + request */ + UT_LIST_BASE_NODE_T(fil_space_t) space_list; + /* list of all file spaces */ +}; + +/* The tablespace memory cache. This variable is NULL before the module is +initialized. */ +UNIV_INTERN fil_system_t* fil_system = NULL; + + +/************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. */ +static +void +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space); /* in: space */ +/************************************************************************ +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. */ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type); /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ +/*********************************************************************** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. */ +static +ulint +fil_get_space_id_for_table( +/*=======================*/ + /* out: space id, ULINT_UNDEFINED if not + found */ + const char* name); /* in: table name in the standard + 'databasename/tablename' format */ +/************************************************************************ +Reads data from a space to a buffer. Remember that the possible incomplete +blocks at the end of file are ignored: they are not taken into account when +calculating the byte offset within a space. */ +UNIV_INLINE +ulint +fil_read( +/*=====*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ + ibool sync, /* in: TRUE if synchronous aio is desired */ + ulint space_id, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /* in: offset in number of blocks */ + ulint byte_offset, /* in: remainder of offset in bytes; in aio + this must be divisible by the OS block size */ + ulint len, /* in: how many bytes to read; this must not + cross a file boundary; in aio this must be a + block size multiple */ + void* buf, /* in/out: buffer where to store data read; + in aio this must be appropriately aligned */ + void* message) /* in: message for aio handler if non-sync + aio used, else ignored */ +{ + return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, + byte_offset, len, buf, message)); +} + +/************************************************************************ +Writes data to a space from a buffer. Remember that the possible incomplete +blocks at the end of file are ignored: they are not taken into account when +calculating the byte offset within a space. 
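+
+A minimal usage sketch for writing one uncompressed page synchronously
+(a hypothetical caller; space_id, page_no and the suitably aligned buffer
+'page' are assumptions, not values defined in this file):
+
+	ulint	err = fil_write(TRUE, space_id, 0, page_no, 0,
+				UNIV_PAGE_SIZE, page, NULL);
+	ut_a(err == DB_SUCCESS);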
*/ +UNIV_INLINE +ulint +fil_write( +/*======*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ + ibool sync, /* in: TRUE if synchronous aio is desired */ + ulint space_id, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /* in: offset in number of blocks */ + ulint byte_offset, /* in: remainder of offset in bytes; in aio + this must be divisible by the OS block size */ + ulint len, /* in: how many bytes to write; this must + not cross a file boundary; in aio this must + be a block size multiple */ + void* buf, /* in: buffer from which to write; in aio + this must be appropriately aligned */ + void* message) /* in: message for aio handler if non-sync + aio used, else ignored */ +{ + return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, + byte_offset, len, buf, message)); +} + +/*********************************************************************** +Returns the table space by a given id, NULL if not found. */ +UNIV_INLINE +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id) /* in: space id */ +{ + fil_space_t* space; + + ut_ad(mutex_own(&fil_system->mutex)); + + HASH_SEARCH(hash, fil_system->spaces, id, + fil_space_t*, space, + ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), + space->id == id); + + return(space); +} + +/*********************************************************************** +Returns the table space by a given name, NULL if not found. */ +UNIV_INLINE +fil_space_t* +fil_space_get_by_name( +/*==================*/ + const char* name) /* in: space name */ +{ + fil_space_t* space; + ulint fold; + + ut_ad(mutex_own(&fil_system->mutex)); + + fold = ut_fold_string(name); + + HASH_SEARCH(name_hash, fil_system->name_hash, fold, + fil_space_t*, space, + ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), + !strcmp(name, space->name)); + + return(space); +} + +/*********************************************************************** +Returns the version number of a tablespace, -1 if not found. */ +UNIV_INTERN +ib_int64_t +fil_space_get_version( +/*==================*/ + /* out: version number, -1 if the tablespace does not + exist in the memory cache */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ib_int64_t version = -1; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + if (space) { + version = space->tablespace_version; + } + + mutex_exit(&(system->mutex)); + + return(version); +} + +/*********************************************************************** +Returns the latch of a file space. */ +UNIV_INTERN +rw_lock_t* +fil_space_get_latch( +/*================*/ + /* out: latch protecting storage allocation */ + ulint id, /* in: space id */ + ulint* flags) /* out: tablespace flags */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + ut_a(space); + + if (flags) { + *flags = space->flags; + } + + mutex_exit(&(system->mutex)); + + return(&(space->latch)); +} + +/*********************************************************************** +Returns the type of a file space. 
*/
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+			/* out: FIL_TABLESPACE or FIL_LOG */
+	ulint	id)	/* in: space id */
+{
+	fil_system_t*	system		= fil_system;
+	fil_space_t*	space;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	mutex_exit(&(system->mutex));
+
+	return(space->purpose);
+}
+
+/**************************************************************************
+Checks if all the file nodes in a space are flushed. The caller must hold
+the fil_system mutex. */
+static
+ibool
+fil_space_is_flushed(
+/*=================*/
+				/* out: TRUE if all are flushed */
+	fil_space_t*	space)	/* in: space */
+{
+	fil_node_t*	node;
+
+	ut_ad(mutex_own(&(fil_system->mutex)));
+
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	while (node) {
+		if (node->modification_counter > node->flush_counter) {
+
+			return(FALSE);
+		}
+
+		node = UT_LIST_GET_NEXT(chain, node);
+	}
+
+	return(TRUE);
+}
+
+/***********************************************************************
+Appends a new file to the chain of files of a space. File must be closed. */
+UNIV_INTERN
+void
+fil_node_create(
+/*============*/
+	const char*	name,	/* in: file name (file must be closed) */
+	ulint		size,	/* in: file size in database blocks, rounded
+				downwards to an integer */
+	ulint		id,	/* in: space id where to append */
+	ibool		is_raw)	/* in: TRUE if a raw device or
+				a raw disk partition */
+{
+	fil_system_t*	system	= fil_system;
+	fil_node_t*	node;
+	fil_space_t*	space;
+
+	ut_a(system);
+	ut_a(name);
+
+	mutex_enter(&(system->mutex));
+
+	node = mem_alloc(sizeof(fil_node_t));
+
+	node->name = mem_strdup(name);
+	node->open = FALSE;
+
+	ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+	node->is_raw_disk = is_raw;
+	node->size = size;
+	node->magic_n = FIL_NODE_MAGIC_N;
+	node->n_pending = 0;
+	node->n_pending_flushes = 0;
+
+	node->modification_counter = 0;
+	node->flush_counter = 0;
+
+	space = fil_space_get_by_id(id);
+
+	if (!space) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: Could not find tablespace %lu for\n"
+			"InnoDB: file ", (ulong) id);
+		ut_print_filename(stderr, name);
+		fputs(" in the tablespace memory cache.\n", stderr);
+		mem_free(node->name);
+
+		mem_free(node);
+
+		mutex_exit(&(system->mutex));
+
+		return;
+	}
+
+	space->size += size;
+
+	node->space = space;
+
+	UT_LIST_ADD_LAST(chain, space->chain, node);
+
+	mutex_exit(&(system->mutex));
+}
+
+/************************************************************************
+Opens the file of a node of a tablespace. The caller must own the fil_system
+mutex. */
+static
+void
+fil_node_open_file(
+/*===============*/
+	fil_node_t*	node,	/* in: file node */
+	fil_system_t*	system,	/* in: tablespace memory cache */
+	fil_space_t*	space)	/* in: space */
+{
+	ib_int64_t	size_bytes;
+	ulint		size_low;
+	ulint		size_high;
+	ibool		ret;
+	ibool		success;
+#ifndef UNIV_HOTBACKUP
+	byte*		buf2;
+	byte*		page;
+	ulint		space_id;
+	ulint		flags;
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->n_pending == 0);
+	ut_a(node->open == FALSE);
+
+	if (node->size == 0) {
+		/* It must be a single-table tablespace and we do not know the
+		size of the file yet. First we open the file in the normal
+		mode, no async I/O here, for simplicity. Then do some checks,
+		and close the file again.
+		NOTE that we could not use the simple file read function
+		os_file_read() in Windows to read from a file opened for
+		async I/O!
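+
+		The size, space id and flags read from page 0 below are
+		sanity checks: if any of them disagrees with the data
+		dictionary, we deliberately crash with ut_a()/ut_error
+		rather than continue with a wrong file-to-tablespace
+		mapping.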
*/
+
+		node->handle = os_file_create_simple_no_error_handling(
+			node->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+		if (!success) {
+			/* The following call prints an error message */
+			os_file_get_last_error(TRUE);
+
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				" InnoDB: Fatal error: cannot open %s.\n"
+				"InnoDB: Have you deleted .ibd files"
+				" under a running mysqld server?\n",
+				node->name);
+			ut_a(0);
+		}
+
+		os_file_get_size(node->handle, &size_low, &size_high);
+
+		size_bytes = (((ib_int64_t)size_high) << 32)
+			+ (ib_int64_t)size_low;
+#ifdef UNIV_HOTBACKUP
+		node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+		/* TODO: adjust to zip_size, like below? */
+#else
+		ut_a(space->purpose != FIL_LOG);
+		ut_a(space->id != 0);
+
+		if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+			fprintf(stderr,
+				"InnoDB: Error: the size of single-table"
+				" tablespace file %s\n"
+				"InnoDB: is only %lu %lu,"
+				" should be at least %lu!\n",
+				node->name,
+				(ulong) size_high,
+				(ulong) size_low,
+				(ulong) (FIL_IBD_FILE_INITIAL_SIZE
+					 * UNIV_PAGE_SIZE));
+
+			ut_a(0);
+		}
+
+		/* Read the first page of the tablespace */
+
+		buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+		/* Align the memory for file i/o if we might have O_DIRECT
+		set */
+		page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+		success = os_file_read(node->handle, page, 0, 0,
+				       UNIV_PAGE_SIZE);
+		space_id = fsp_header_get_space_id(page);
+		flags = fsp_header_get_flags(page);
+
+		ut_free(buf2);
+
+		/* Close the file now that we have read the space id from it */
+
+		os_file_close(node->handle);
+
+		if (UNIV_UNLIKELY(space_id != space->id)) {
+			fprintf(stderr,
+				"InnoDB: Error: tablespace id is %lu"
+				" in the data dictionary\n"
+				"InnoDB: but in file %s it is %lu!\n",
+				(ulong) space->id, node->name,
+				(ulong) space_id);
+
+			ut_error;
+		}
+
+		if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
+				  || space_id == 0)) {
+			fprintf(stderr,
+				"InnoDB: Error: tablespace id %lu"
+				" in file %s is not sensible\n",
+				(ulong) space_id, node->name);
+
+			ut_error;
+		}
+
+		if (UNIV_UNLIKELY(space->flags != flags)) {
+			fprintf(stderr,
+				"InnoDB: Error: table flags are %lx"
+				" in the data dictionary\n"
+				"InnoDB: but the flags in file %s are %lx!\n",
+				(ulong) space->flags, node->name,
+				(ulong) flags);
+
+			ut_error;
+		}
+
+		if (size_bytes >= 1024 * 1024) {
+			/* Truncate the size to whole megabytes. */
+			size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+		}
+
+		if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+		} else {
+			node->size = (ulint)
+				(size_bytes
+				 / dict_table_flags_to_zip_size(flags));
+		}
+#endif
+		space->size += node->size;
+	}
+
+	/* printf("Opening file %s\n", node->name); */
+
+	/* Open the file for reading and writing, in Windows normally in the
+	unbuffered async I/O mode, though global variables may make
+	os_file_create() fall back to the normal file I/O mode.
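+	Three cases are distinguished below: log files are opened as
+	OS_LOG_FILE, raw devices with OS_FILE_OPEN_RAW, and ordinary data
+	files as OS_DATA_FILE; all of them request the OS_FILE_AIO mode.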
*/
+
+	if (space->purpose == FIL_LOG) {
+		node->handle = os_file_create(node->name, OS_FILE_OPEN,
+					      OS_FILE_AIO, OS_LOG_FILE, &ret);
+	} else if (node->is_raw_disk) {
+		node->handle = os_file_create(node->name,
+					      OS_FILE_OPEN_RAW,
+					      OS_FILE_AIO, OS_DATA_FILE, &ret);
+	} else {
+		node->handle = os_file_create(node->name, OS_FILE_OPEN,
+					      OS_FILE_AIO, OS_DATA_FILE, &ret);
+	}
+
+	ut_a(ret);
+
+	node->open = TRUE;
+
+	system->n_open++;
+
+	if (space->purpose == FIL_TABLESPACE && space->id != 0) {
+		/* Put the node to the LRU list */
+		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+	}
+}
+
+/**************************************************************************
+Closes a file. */
+static
+void
+fil_node_close_file(
+/*================*/
+	fil_node_t*	node,	/* in: file node */
+	fil_system_t*	system)	/* in: tablespace memory cache */
+{
+	ibool	ret;
+
+	ut_ad(node && system);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->open);
+	ut_a(node->n_pending == 0);
+	ut_a(node->n_pending_flushes == 0);
+	ut_a(node->modification_counter == node->flush_counter);
+
+	ret = os_file_close(node->handle);
+	ut_a(ret);
+
+	/* printf("Closing file %s\n", node->name); */
+
+	node->open = FALSE;
+	ut_a(system->n_open > 0);
+	system->n_open--;
+
+	if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) {
+		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+		/* The node is in the LRU list, remove it */
+		UT_LIST_REMOVE(LRU, system->LRU, node);
+	}
+}
+
+/************************************************************************
+Tries to close a file in the LRU list. The caller must hold the fil_sys
+mutex. */
+static
+ibool
+fil_try_to_close_file_in_LRU(
+/*=========================*/
+				/* out: TRUE if success, FALSE if should retry
+				later; since i/o's generally complete in <
+				100 ms, and as InnoDB writes at most 128 pages
+				from the buffer pool in a batch, and then
+				immediately flushes the files, there is a good
+				chance that we will find a suitable node from
+				the LRU list the next time */
+	ibool	print_info)	/* in: if TRUE, prints information why it
+				cannot close a file */
+{
+	fil_system_t*	system	= fil_system;
+	fil_node_t*	node;
+
+	ut_ad(mutex_own(&(system->mutex)));
+
+	node = UT_LIST_GET_LAST(system->LRU);
+
+	if (print_info) {
+		fprintf(stderr,
+			"InnoDB: fil_sys open file LRU len %lu\n",
+			(ulong) UT_LIST_GET_LEN(system->LRU));
+	}
+
+	while (node != NULL) {
+		if (node->modification_counter == node->flush_counter
+		    && node->n_pending_flushes == 0) {
+
+			fil_node_close_file(node, system);
+
+			return(TRUE);
+		}
+
+		if (print_info && node->n_pending_flushes > 0) {
+			fputs("InnoDB: cannot close file ", stderr);
+			ut_print_filename(stderr, node->name);
+			fprintf(stderr, ", because n_pending_flushes %lu\n",
+				(ulong) node->n_pending_flushes);
+		}
+
+		if (print_info
+		    && node->modification_counter != node->flush_counter) {
+			fputs("InnoDB: cannot close file ", stderr);
+			ut_print_filename(stderr, node->name);
+			fprintf(stderr,
+				", because mod_count %ld != fl_count %ld\n",
+				(long) node->modification_counter,
+				(long) node->flush_counter);
+		}
+
+		node = UT_LIST_GET_PREV(LRU, node);
+	}
+
+	return(FALSE);
+}
+
+/***********************************************************************
+Reserves the fil_system mutex and tries to make sure we can open at least one
+file while holding it. This should be called before calling
+fil_node_prepare_for_io(), because that function may need to open a file.
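+
+A minimal sketch of the calling pattern used by the i/o functions in this
+module (hypothetical caller; the lookup of 'node' and 'space' is omitted):
+
+	fil_mutex_enter_and_prepare_for_io(space_id);
+	fil_node_prepare_for_io(node, fil_system, space);
+	mutex_exit(&fil_system->mutex);
+	... perform the read or write ...
+	mutex_enter(&fil_system->mutex);
+	fil_node_complete_io(node, fil_system, OS_FILE_READ);
+	mutex_exit(&fil_system->mutex);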
*/
+static
+void
+fil_mutex_enter_and_prepare_for_io(
+/*===============================*/
+	ulint	space_id)	/* in: space id */
+{
+	fil_system_t*	system		= fil_system;
+	fil_space_t*	space;
+	ibool		success;
+	ibool		print_info	= FALSE;
+	ulint		count		= 0;
+	ulint		count2		= 0;
+
+	ut_ad(!mutex_own(&(system->mutex)));
+retry:
+	mutex_enter(&(system->mutex));
+
+	if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
+		/* We keep log files and system tablespace files always open;
+		this is important in preventing deadlocks in this module, as
+		a page read completion often performs another read from the
+		insert buffer. The insert buffer is in tablespace 0, and we
+		cannot end up waiting in this function. */
+
+		return;
+	}
+
+	if (system->n_open < system->max_n_open) {
+
+		return;
+	}
+
+	space = fil_space_get_by_id(space_id);
+
+	if (space != NULL && space->stop_ios) {
+		/* We are going to do a rename file and want to stop new i/o's
+		for a while */
+
+		if (count2 > 20000) {
+			fputs("InnoDB: Warning: tablespace ", stderr);
+			ut_print_filename(stderr, space->name);
+			fprintf(stderr,
+				" has i/o ops stopped for a long time %lu\n",
+				(ulong) count2);
+		}
+
+		mutex_exit(&(system->mutex));
+
+		os_thread_sleep(20000);
+
+		count2++;
+
+		goto retry;
+	}
+
+	/* If the file is already open, no need to do anything; if the space
+	does not exist, we handle the situation in the function which called
+	this function */
+
+	if (!space || UT_LIST_GET_FIRST(space->chain)->open) {
+
+		return;
+	}
+
+	if (count > 1) {
+		print_info = TRUE;
+	}
+
+	/* Too many files are open, try to close some */
+close_more:
+	success = fil_try_to_close_file_in_LRU(print_info);
+
+	if (success && system->n_open >= system->max_n_open) {
+
+		goto close_more;
+	}
+
+	if (system->n_open < system->max_n_open) {
+		/* Ok */
+
+		return;
+	}
+
+	if (count >= 2) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: too many (%lu) files stay open"
+			" while the maximum\n"
+			"InnoDB: allowed value would be %lu.\n"
+			"InnoDB: You may need to raise the value of"
+			" innodb_open_files in\n"
+			"InnoDB: my.cnf.\n",
+			(ulong) system->n_open, (ulong) system->max_n_open);
+
+		return;
+	}
+
+	mutex_exit(&(system->mutex));
+
+#ifndef UNIV_HOTBACKUP
+	/* Wake the i/o-handler threads to make sure pending i/o's are
+	performed */
+	os_aio_simulated_wake_handler_threads();
+
+	os_thread_sleep(20000);
+#endif
+	/* Flush tablespaces so that we can close modified files in the LRU
+	list */
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+	count++;
+
+	goto retry;
+}
+
+/***********************************************************************
+Frees a file node object from a tablespace memory cache.
*/
+static
+void
+fil_node_free(
+/*==========*/
+	fil_node_t*	node,	/* in, own: file node */
+	fil_system_t*	system,	/* in: tablespace memory cache */
+	fil_space_t*	space)	/* in: space where the file node is chained */
+{
+	ut_ad(node && system && space);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
+	ut_a(node->n_pending == 0);
+
+	if (node->open) {
+		/* We fool the assertion in fil_node_close_file() into
+		thinking there are no unflushed modifications in the file */
+
+		node->modification_counter = node->flush_counter;
+
+		if (space->is_in_unflushed_spaces
+		    && fil_space_is_flushed(space)) {
+
+			space->is_in_unflushed_spaces = FALSE;
+
+			UT_LIST_REMOVE(unflushed_spaces,
+				       system->unflushed_spaces,
+				       space);
+		}
+
+		fil_node_close_file(node, system);
+	}
+
+	space->size -= node->size;
+
+	UT_LIST_REMOVE(chain, space->chain, node);
+
+	mem_free(node->name);
+	mem_free(node);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/********************************************************************
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+	ulint	id,		/* in: space id */
+	ulint	trunc_len)	/* in: truncate by this much; it is an error
+				if this does not equal the combined size of
+				some initial files in the space */
+{
+	fil_system_t*	system	= fil_system;
+	fil_node_t*	node;
+	fil_space_t*	space;
+
+	mutex_enter(&(system->mutex));
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	while (trunc_len > 0) {
+		node = UT_LIST_GET_FIRST(space->chain);
+
+		ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len);
+
+		trunc_len -= node->size * UNIV_PAGE_SIZE;
+
+		fil_node_free(node, system, space);
+	}
+
+	mutex_exit(&(system->mutex));
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/***********************************************************************
+Creates a space memory object and puts it into the tablespace memory cache.
+If there is an error, prints an error message to the .err log. */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+			/* out: TRUE if success */
+	const char*	name,	/* in: space name */
+	ulint		id,	/* in: space id */
+	ulint		flags,	/* in: compressed page size
+				and file format, or 0 */
+	ulint		purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+
+	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+	ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+	ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
+	format, the tablespace flags should equal table->flags.
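+
+	For example, under these rules a ROW_FORMAT=COMPRESSED table stores
+	its compressed page size in the DICT_TF_ZSSIZE_MASK bits of the
+	flags, while DICT_TF_COMPACT itself has no tablespace counterpart;
+	that is why the assertion right below rejects
+	flags == DICT_TF_COMPACT.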
*/ + ut_a(flags != DICT_TF_COMPACT); + +try_again: + /*printf( + "InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name, + purpose);*/ + + ut_a(system); + ut_a(name); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_name(name); + + if (UNIV_LIKELY_NULL(space)) { + ulint namesake_id; + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: trying to init to the" + " tablespace memory cache\n" + "InnoDB: a tablespace %lu of name ", (ulong) id); + ut_print_filename(stderr, name); + fprintf(stderr, ",\n" + "InnoDB: but a tablespace %lu of the same name\n" + "InnoDB: already exists in the" + " tablespace memory cache!\n", + (ulong) space->id); + + if (id == 0 || purpose != FIL_TABLESPACE) { + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + fprintf(stderr, + "InnoDB: We assume that InnoDB did a crash recovery," + " and you had\n" + "InnoDB: an .ibd file for which the table" + " did not exist in the\n" + "InnoDB: InnoDB internal data dictionary in the" + " ibdata files.\n" + "InnoDB: We assume that you later removed the" + " .ibd and .frm files,\n" + "InnoDB: and are now trying to recreate the table." + " We now remove the\n" + "InnoDB: conflicting tablespace object" + " from the memory cache and try\n" + "InnoDB: the init again.\n"); + + namesake_id = space->id; + + mutex_exit(&(system->mutex)); + + fil_space_free(namesake_id); + + goto try_again; + } + + space = fil_space_get_by_id(id); + + if (UNIV_LIKELY_NULL(space)) { + fprintf(stderr, + "InnoDB: Error: trying to add tablespace %lu" + " of name ", (ulong) id); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: to the tablespace memory cache," + " but tablespace\n" + "InnoDB: %lu of name ", (ulong) space->id); + ut_print_filename(stderr, space->name); + fputs(" already exists in the tablespace\n" + "InnoDB: memory cache!\n", stderr); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + space = mem_alloc(sizeof(fil_space_t)); + + space->name = mem_strdup(name); + space->id = id; + + system->tablespace_version++; + space->tablespace_version = system->tablespace_version; + space->mark = FALSE; + + if (purpose == FIL_TABLESPACE && id > system->max_assigned_id) { + system->max_assigned_id = id; + } + + space->stop_ios = FALSE; + space->stop_ibuf_merges = FALSE; + space->is_being_deleted = FALSE; + space->purpose = purpose; + space->size = 0; + space->flags = flags; + + space->n_reserved_extents = 0; + + space->n_pending_flushes = 0; + space->n_pending_ibuf_merges = 0; + + UT_LIST_INIT(space->chain); + space->magic_n = FIL_SPACE_MAGIC_N; + + rw_lock_create(&space->latch, SYNC_FSP); + + HASH_INSERT(fil_space_t, hash, system->spaces, id, space); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(name), space); + space->is_in_unflushed_spaces = FALSE; + + UT_LIST_ADD_LAST(space_list, system->space_list, space); + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. 
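+
+A minimal caller sketch (hypothetical; the real callers are in the
+single-table tablespace creation path):
+
+	ulint	new_id = fil_assign_new_space_id();
+
+	if (new_id == ULINT_UNDEFINED) {
+		... the id counter is exhausted: fail the operation ...
+	}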
*/ +static +ulint +fil_assign_new_space_id(void) +/*=========================*/ + /* out: new tablespace id; ULINT_UNDEFINED if could + not assign an id */ +{ + fil_system_t* system = fil_system; + ulint id; + + mutex_enter(&(system->mutex)); + + system->max_assigned_id++; + + id = system->max_assigned_id; + + if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: Warning: you are running out of new" + " single-table tablespace id's.\n" + "InnoDB: Current counter is %lu and it" + " must not exceed %lu!\n" + "InnoDB: To reset the counter to zero" + " you have to dump all your tables and\n" + "InnoDB: recreate the whole InnoDB installation.\n", + (ulong) id, + (ulong) SRV_LOG_SPACE_FIRST_ID); + } + + if (id >= SRV_LOG_SPACE_FIRST_ID) { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: You have run out of single-table" + " tablespace id's!\n" + "InnoDB: Current counter is %lu.\n" + "InnoDB: To reset the counter to zero you" + " have to dump all your tables and\n" + "InnoDB: recreate the whole InnoDB installation.\n", + (ulong) id); + system->max_assigned_id--; + + id = ULINT_UNDEFINED; + } + + mutex_exit(&(system->mutex)); + + return(id); +} + +/*********************************************************************** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. */ +UNIV_INTERN +ibool +fil_space_free( +/*===========*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_space_t* namespace; + fil_node_t* fil_node; + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: trying to remove tablespace %lu" + " from the cache but\n" + "InnoDB: it is not there.\n", (ulong) id); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, hash, system->spaces, id, space); + + namespace = fil_space_get_by_name(space->name); + ut_a(namespace); + ut_a(space == namespace); + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + + if (space->is_in_unflushed_spaces) { + space->is_in_unflushed_spaces = FALSE; + + UT_LIST_REMOVE(unflushed_spaces, system->unflushed_spaces, + space); + } + + UT_LIST_REMOVE(space_list, system->space_list, space); + + ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + ut_a(0 == space->n_pending_flushes); + + fil_node = UT_LIST_GET_FIRST(space->chain); + + while (fil_node != NULL) { + fil_node_free(fil_node, system, space); + + fil_node = UT_LIST_GET_FIRST(space->chain); + } + + ut_a(0 == UT_LIST_GET_LEN(space->chain)); + + mutex_exit(&(system->mutex)); + + rw_lock_free(&(space->latch)); + + mem_free(space->name); + mem_free(space); + + return(TRUE); +} + +#ifdef UNIV_HOTBACKUP +/*********************************************************************** +Returns the tablespace object for a given id, or NULL if not found from the +tablespace memory cache. */ +static +fil_space_t* +fil_get_space_for_id_low( +/*=====================*/ + /* out: tablespace object or NULL; NOTE that you must + own &(fil_system->mutex) to call this function! 
*/ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + ut_ad(system); + + space = fil_space_get_by_id(id); + + return(space); +} +#endif + +/*********************************************************************** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. */ +UNIV_INTERN +ulint +fil_space_get_size( +/*===============*/ + /* out: space size, 0 if space not found */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + ulint size; + + ut_ad(system); + + fil_mutex_enter_and_prepare_for_io(id); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + mutex_exit(&(system->mutex)); + + return(0); + } + + if (space->size == 0 && space->purpose == FIL_TABLESPACE) { + ut_a(id != 0); + + ut_a(1 == UT_LIST_GET_LEN(space->chain)); + + node = UT_LIST_GET_FIRST(space->chain); + + /* It must be a single-table tablespace and we have not opened + the file yet; the following calls will open it and update the + size fields */ + + fil_node_prepare_for_io(node, system, space); + fil_node_complete_io(node, system, OS_FILE_READ); + } + + size = space->size; + + mutex_exit(&(system->mutex)); + + return(size); +} + +/*********************************************************************** +Returns the flags of the space. The tablespace must be cached +in the memory cache. */ +UNIV_INTERN +ulint +fil_space_get_flags( +/*================*/ + /* out: flags, ULINT_UNDEFINED if space not found */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + ulint flags; + + ut_ad(system); + + if (UNIV_UNLIKELY(!id)) { + return(0); + } + + fil_mutex_enter_and_prepare_for_io(id); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + mutex_exit(&(system->mutex)); + + return(ULINT_UNDEFINED); + } + + if (space->size == 0 && space->purpose == FIL_TABLESPACE) { + ut_a(id != 0); + + ut_a(1 == UT_LIST_GET_LEN(space->chain)); + + node = UT_LIST_GET_FIRST(space->chain); + + /* It must be a single-table tablespace and we have not opened + the file yet; the following calls will open it and update the + size fields */ + + fil_node_prepare_for_io(node, system, space); + fil_node_complete_io(node, system, OS_FILE_READ); + } + + flags = space->flags; + + mutex_exit(&(system->mutex)); + + return(flags); +} + +/*********************************************************************** +Returns the compressed page size of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. */ +UNIV_INTERN +ulint +fil_space_get_zip_size( +/*===================*/ + /* out: compressed page size, ULINT_UNDEFINED + if space not found */ + ulint id) /* in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(dict_table_flags_to_zip_size(flags)); + } + + return(flags); +} + +/*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. 
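+
+A minimal usage sketch (hypothetical values):
+
+	if (!fil_check_adress_in_tablespace(space_id, page_no)) {
+		... page_no is beyond the current size of the space ...
+	}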
*/
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+			/* out: TRUE if the address is meaningful */
+	ulint	id,	/* in: space id */
+	ulint	page_no)/* in: page number */
+{
+	if (fil_space_get_size(id) > page_no) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************
+Creates the tablespace memory cache. */
+static
+fil_system_t*
+fil_system_create(
+/*==============*/
+				/* out, own: tablespace memory cache */
+	ulint	hash_size,	/* in: hash table size */
+	ulint	max_n_open)	/* in: maximum number of open files; must be
+				> 10 */
+{
+	fil_system_t*	system;
+
+	ut_a(hash_size > 0);
+	ut_a(max_n_open > 0);
+
+	system = mem_alloc(sizeof(fil_system_t));
+
+	mutex_create(&system->mutex, SYNC_ANY_LATCH);
+
+	system->spaces = hash_create(hash_size);
+	system->name_hash = hash_create(hash_size);
+
+	UT_LIST_INIT(system->LRU);
+
+	system->n_open = 0;
+	system->max_n_open = max_n_open;
+
+	system->modification_counter = 0;
+	system->max_assigned_id = 0;
+
+	system->tablespace_version = 0;
+
+	UT_LIST_INIT(system->unflushed_spaces);
+	UT_LIST_INIT(system->space_list);
+
+	return(system);
+}
+
+/********************************************************************
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+	ulint	max_n_open)	/* in: max number of open files */
+{
+	ulint	hash_size;
+
+	ut_a(fil_system == NULL);
+
+	if (srv_file_per_table) {
+		hash_size = 50000;
+	} else {
+		hash_size = 5000;
+	}
+
+	fil_system = fil_system_create(hash_size, max_n_open);
+}
+
+/***********************************************************************
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void)
+/*==========================================*/
+{
+	fil_system_t*	system = fil_system;
+	fil_space_t*	space;
+	fil_node_t*	node;
+
+	mutex_enter(&(system->mutex));
+
+	space = UT_LIST_GET_FIRST(system->space_list);
+
+	while (space != NULL) {
+		if (space->purpose != FIL_TABLESPACE || space->id == 0) {
+			node = UT_LIST_GET_FIRST(space->chain);
+
+			while (node != NULL) {
+				if (!node->open) {
+					fil_node_open_file(node, system,
+							   space);
+				}
+				if (system->max_n_open < 10 + system->n_open) {
+					fprintf(stderr,
+						"InnoDB: Warning: you must"
+						" raise the value of"
+						" innodb_open_files in\n"
+						"InnoDB: my.cnf! Remember that"
+						" InnoDB keeps all log files"
+						" and all system\n"
+						"InnoDB: tablespace files open"
+						" for the whole time mysqld is"
+						" running, and\n"
+						"InnoDB: needs to open also"
+						" some .ibd files if the"
+						" file-per-table storage\n"
+						"InnoDB: model is used."
+						" Current open files %lu,"
+						" max allowed"
+						" open files %lu.\n",
+						(ulong) system->n_open,
+						(ulong) system->max_n_open);
+				}
+				node = UT_LIST_GET_NEXT(chain, node);
+			}
+		}
+		space = UT_LIST_GET_NEXT(space_list, space);
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Closes all open files. There must not be any pending i/o's or unflushed
+modifications in the files.
*/ +UNIV_INTERN +void +fil_close_all_files(void) +/*=====================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space != NULL) { + node = UT_LIST_GET_FIRST(space->chain); + + while (node != NULL) { + if (node->open) { + fil_node_close_file(node, system); + } + node = UT_LIST_GET_NEXT(chain, node); + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ +UNIV_INTERN +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id) /* in: maximum known id */ +{ + fil_system_t* system = fil_system; + + if (max_id >= SRV_LOG_SPACE_FIRST_ID) { + fprintf(stderr, + "InnoDB: Fatal error: max tablespace id" + " is too high, %lu\n", (ulong) max_id); + ut_a(0); + } + + mutex_enter(&(system->mutex)); + + if (system->max_assigned_id < max_id) { + + system->max_assigned_id = max_id; + } + + mutex_exit(&(system->mutex)); +} + +/******************************************************************** +Writes the flushed lsn and the latest archived log number to the page header +of the first page of a data file of the system tablespace (space 0), +which is uncompressed. */ +static +ulint +fil_write_lsn_and_arch_no_to_file( +/*==============================*/ + ulint sum_of_sizes, /* in: combined size of previous files + in space, in database pages */ + ib_uint64_t lsn, /* in: lsn to write */ + ulint arch_log_no /* in: archived log number to write */ + __attribute__((unused))) +{ + byte* buf1; + byte* buf; + + buf1 = mem_alloc(2 * UNIV_PAGE_SIZE); + buf = ut_align(buf1, UNIV_PAGE_SIZE); + + fil_read(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); + + mach_write_ull(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); + + fil_write(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); + + mem_free(buf1); + + return(DB_SUCCESS); +} + +/******************************************************************** +Writes the flushed lsn and the latest archived log number to the page +header of the first page of each data file in the system tablespace. */ +UNIV_INTERN +ulint +fil_write_flushed_lsn_to_data_files( +/*================================*/ + /* out: DB_SUCCESS or error number */ + ib_uint64_t lsn, /* in: lsn to write */ + ulint arch_log_no) /* in: latest archived log + file number */ +{ + fil_space_t* space; + fil_node_t* node; + ulint sum_of_sizes; + ulint err; + + mutex_enter(&(fil_system->mutex)); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space) { + /* We only write the lsn to all existing data files which have + been open during the lifetime of the mysqld process; they are + represented by the space objects in the tablespace memory + cache. Note that all data files in the system tablespace 0 are + always open. 
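+		Note also that the loop releases fil_system->mutex around
+		each synchronous page write and re-acquires it afterwards,
+		so the space list must not be assumed to stay unchanged
+		across one iteration.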
*/ + + if (space->purpose == FIL_TABLESPACE + && space->id == 0) { + sum_of_sizes = 0; + + node = UT_LIST_GET_FIRST(space->chain); + while (node) { + mutex_exit(&(fil_system->mutex)); + + err = fil_write_lsn_and_arch_no_to_file( + sum_of_sizes, lsn, arch_log_no); + if (err != DB_SUCCESS) { + + return(err); + } + + mutex_enter(&(fil_system->mutex)); + + sum_of_sizes += node->size; + node = UT_LIST_GET_NEXT(chain, node); + } + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(fil_system->mutex)); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Reads the flushed lsn and arch no fields from a data file at database +startup. */ +UNIV_INTERN +void +fil_read_flushed_lsn_and_arch_log_no( +/*=================================*/ + os_file_t data_file, /* in: open data file */ + ibool one_read_already, /* in: TRUE if min and max + parameters below already + contain sensible data */ +#ifdef UNIV_LOG_ARCHIVE + ulint* min_arch_log_no, /* in/out: */ + ulint* max_arch_log_no, /* in/out: */ +#endif /* UNIV_LOG_ARCHIVE */ + ib_uint64_t* min_flushed_lsn, /* in/out: */ + ib_uint64_t* max_flushed_lsn) /* in/out: */ +{ + byte* buf; + byte* buf2; + ib_uint64_t flushed_lsn; + + buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); + /* Align the memory for a possible read from a raw device */ + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE); + + flushed_lsn = mach_read_ull(buf + FIL_PAGE_FILE_FLUSH_LSN); + + ut_free(buf2); + + if (!one_read_already) { + *min_flushed_lsn = flushed_lsn; + *max_flushed_lsn = flushed_lsn; +#ifdef UNIV_LOG_ARCHIVE + *min_arch_log_no = arch_log_no; + *max_arch_log_no = arch_log_no; +#endif /* UNIV_LOG_ARCHIVE */ + return; + } + + if (*min_flushed_lsn > flushed_lsn) { + *min_flushed_lsn = flushed_lsn; + } + if (*max_flushed_lsn < flushed_lsn) { + *max_flushed_lsn = flushed_lsn; + } +#ifdef UNIV_LOG_ARCHIVE + if (*min_arch_log_no > arch_log_no) { + *min_arch_log_no = arch_log_no; + } + if (*max_arch_log_no < arch_log_no) { + *max_arch_log_no = arch_log_no; + } +#endif /* UNIV_LOG_ARCHIVE */ +} + +/*================ SINGLE-TABLE TABLESPACES ==========================*/ + +/*********************************************************************** +Increments the count of pending insert buffer page merges, if space is not +being deleted. */ +UNIV_INTERN +ibool +fil_inc_pending_ibuf_merges( +/*========================*/ + /* out: TRUE if being deleted, and ibuf merges should + be skipped */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + fprintf(stderr, + "InnoDB: Error: trying to do ibuf merge to a" + " dropped tablespace %lu\n", + (ulong) id); + } + + if (space == NULL || space->stop_ibuf_merges) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + space->n_pending_ibuf_merges++; + + mutex_exit(&(system->mutex)); + + return(FALSE); +} + +/*********************************************************************** +Decrements the count of pending insert buffer page merges. 
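+Every fil_inc_pending_ibuf_merges() call that returned FALSE must
+eventually be paired with exactly one call to this function; otherwise
+fil_delete_tablespace() would wait forever for the counter to drain.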
*/ +UNIV_INTERN +void +fil_decr_pending_ibuf_merges( +/*=========================*/ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + fprintf(stderr, + "InnoDB: Error: decrementing ibuf merge of a" + " dropped tablespace %lu\n", + (ulong) id); + } + + if (space != NULL) { + space->n_pending_ibuf_merges--; + } + + mutex_exit(&(system->mutex)); +} + +/************************************************************ +Creates the database directory for a table if it does not exist yet. */ +static +void +fil_create_directory_for_tablename( +/*===============================*/ + const char* name) /* in: name in the standard + 'databasename/tablename' format */ +{ + const char* namend; + char* path; + ulint len; + + len = strlen(fil_path_to_mysql_datadir); + namend = strchr(name, '/'); + ut_a(namend); + path = mem_alloc(len + (namend - name) + 2); + + memcpy(path, fil_path_to_mysql_datadir, len); + path[len] = '/'; + memcpy(path + len + 1, name, namend - name); + path[len + (namend - name) + 1] = 0; + + srv_normalize_path_for_win(path); + + ut_a(os_file_create_directory(path, FALSE)); + mem_free(path); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************ +Writes a log record about an .ibd file create/rename/delete. */ +static +void +fil_op_write_log( +/*=============*/ + ulint type, /* in: MLOG_FILE_CREATE, + MLOG_FILE_CREATE2, + MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id, /* in: space id */ + ulint flags, /* in: compressed page size + and file format + if type==MLOG_FILE_CREATE2, or 0 */ + const char* name, /* in: table name in the familiar + 'databasename/tablename' format, or + the file path in the case of + MLOG_FILE_DELETE */ + const char* new_name, /* in: if type is MLOG_FILE_RENAME, + the new table name in the + 'databasename/tablename' format */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + byte* log_ptr; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 2 + 1); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_for_file_op(type, space_id, 0, + log_ptr, mtr); + if (type == MLOG_FILE_CREATE2) { + mach_write_to_4(log_ptr, flags); + log_ptr += 4; + } + /* Let us store the strings as null-terminated for easier readability + and handling */ + + len = strlen(name) + 1; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, (byte*) name, len); + + if (type == MLOG_FILE_RENAME) { + len = strlen(new_name) + 1; + log_ptr = mlog_open(mtr, 2 + len); + ut_a(log_ptr); + mach_write_to_2(log_ptr, len); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, (byte*) new_name, len); + } +} +#endif + +/*********************************************************************** +Parses the body of a log record written about an .ibd file operation. That is, +the log record part after the standard (type, space id, page no) header of the +log record. + +If desired, also replays the delete or rename operation if the .ibd file +exists and the space id in it matches. Replays the create operation if a file +at that path does not exist yet. If the database directory for the file to be +created does not exist, then we create the directory, too. 
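+
+The log record body parsed here has the following layout (a summary of the
+parsing code below, not a normative definition):
+
+	4 bytes		flags		(only if type == MLOG_FILE_CREATE2)
+	2 bytes		name length	(includes the terminating NUL)
+	n bytes		name		(NUL-terminated)
+	2 bytes		new name length	(only if type == MLOG_FILE_RENAME)
+	m bytes		new name	(NUL-terminated, MLOG_FILE_RENAME only)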
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations. */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+				/* out: end of log record, or NULL if the
+				record was not completely contained between
+				ptr and end_ptr */
+	byte*	ptr,		/* in: buffer containing the log record body,
+				or an initial segment of it, if the record does
+				not fit completely between ptr and end_ptr */
+	byte*	end_ptr,	/* in: buffer end */
+	ulint	type,		/* in: the type of this log record */
+	ulint	space_id)	/* in: the space id of the tablespace in
+				question, or 0 if the log record should
+				only be parsed but not replayed */
+{
+	ulint		name_len;
+	ulint		new_name_len;
+	const char*	name;
+	const char*	new_name	= NULL;
+	ulint		flags		= 0;
+
+	if (type == MLOG_FILE_CREATE2) {
+		if (end_ptr < ptr + 4) {
+
+			return(NULL);
+		}
+
+		flags = mach_read_from_4(ptr);
+		ptr += 4;
+	}
+
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	name_len = mach_read_from_2(ptr);
+
+	ptr += 2;
+
+	if (end_ptr < ptr + name_len) {
+
+		return(NULL);
+	}
+
+	name = (const char*) ptr;
+
+	ptr += name_len;
+
+	if (type == MLOG_FILE_RENAME) {
+		if (end_ptr < ptr + 2) {
+
+			return(NULL);
+		}
+
+		new_name_len = mach_read_from_2(ptr);
+
+		ptr += 2;
+
+		if (end_ptr < ptr + new_name_len) {
+
+			return(NULL);
+		}
+
+		new_name = (const char*) ptr;
+
+		ptr += new_name_len;
+	}
+
+	/* We managed to parse a full log record body */
+	/*
+	printf("Parsed log rec of type %lu space %lu\n"
+	"name %s\n", type, space_id, name);
+
+	if (type == MLOG_FILE_RENAME) {
+	printf("new name %s\n", new_name);
+	}
+	*/
+	if (!space_id) {
+
+		return(ptr);
+	}
+
+	/* Let us try to perform the file operation, if sensible. Note that
+	ibbackup has at this stage already read in all space id info to the
+	fil0fil.c data structures.
+
+	NOTE that our algorithm is not guaranteed to work correctly if there
+	were renames of tables during the backup. See ibbackup code for more
+	on the problem.
*/
+
+	switch (type) {
+	case MLOG_FILE_DELETE:
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			ut_a(fil_delete_tablespace(space_id));
+		}
+
+		break;
+
+	case MLOG_FILE_RENAME:
+		/* We do the rename based on space id, not old file name;
+		this should guarantee that after the log replay each .ibd file
+		has the correct name for the latest log sequence number; the
+		proof is left as an exercise :) */
+
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			/* Create the database directory for the new name, if
+			it does not exist yet */
+			fil_create_directory_for_tablename(new_name);
+
+			/* Rename the table if there is not yet a tablespace
+			with the same name */
+
+			if (fil_get_space_id_for_table(new_name)
+			    == ULINT_UNDEFINED) {
+				/* We do not care about the old name, that is
+				why we pass NULL as the first argument */
+				if (!fil_rename_tablespace(NULL, space_id,
+							   new_name)) {
+					ut_error;
+				}
+			}
+		}
+
+		break;
+
+	case MLOG_FILE_CREATE:
+	case MLOG_FILE_CREATE2:
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			/* Do nothing */
+		} else if (fil_get_space_id_for_table(name)
+			   != ULINT_UNDEFINED) {
+			/* Do nothing */
+		} else {
+			/* Create the database directory for name, if it does
+			not exist yet */
+			fil_create_directory_for_tablename(name);
+
+			if (fil_create_new_single_table_tablespace(
+				    &space_id, name, FALSE, flags,
+				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+				ut_error;
+			}
+		}
+
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(ptr);
+}
+
+/***********************************************************************
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache. */
+UNIV_INTERN
+ibool
+fil_delete_tablespace(
+/*==================*/
+			/* out: TRUE if success */
+	ulint	id)	/* in: space id */
+{
+	fil_system_t*	system	= fil_system;
+	ibool		success;
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		count	= 0;
+	char*		path;
+
+	ut_a(id != 0);
+stop_ibuf_merges:
+	mutex_enter(&(system->mutex));
+
+	space = fil_space_get_by_id(id);
+
+	if (space != NULL) {
+		space->stop_ibuf_merges = TRUE;
+
+		if (space->n_pending_ibuf_merges == 0) {
+			mutex_exit(&(system->mutex));
+
+			count = 0;
+
+			goto try_again;
+		} else {
+			if (count > 5000) {
+				ut_print_timestamp(stderr);
+				fputs(" InnoDB: Warning: trying to"
+				      " delete tablespace ", stderr);
+				ut_print_filename(stderr, space->name);
+				fprintf(stderr, ",\n"
+					"InnoDB: but there are %lu pending"
+					" ibuf merges on it.\n"
+					"InnoDB: Loop %lu.\n",
+					(ulong) space->n_pending_ibuf_merges,
+					(ulong) count);
+			}
+
+			mutex_exit(&(system->mutex));
+
+			os_thread_sleep(20000);
+			count++;
+
+			goto stop_ibuf_merges;
+		}
+	}
+
+	mutex_exit(&(system->mutex));
+	count = 0;
+
+try_again:
+	mutex_enter(&(system->mutex));
+
+	space = fil_space_get_by_id(id);
+
+	if (space == NULL) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: cannot delete tablespace %lu\n"
+			"InnoDB: because it is not found in the"
+			" tablespace memory cache.\n",
+			(ulong) id);
+
+		mutex_exit(&(system->mutex));
+
+		return(FALSE);
+	}
+
+	ut_a(space);
+	ut_a(space->n_pending_ibuf_merges == 0);
+
+	space->is_being_deleted = TRUE;
+
+	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	if (space->n_pending_flushes > 0 || node->n_pending > 0) {
+		if (count > 1000) {
+			ut_print_timestamp(stderr);
+			fputs(" InnoDB: Warning: trying to"
+			      " delete tablespace ", stderr);
+			ut_print_filename(stderr, space->name);
+			fprintf(stderr, ",\n"
+				"InnoDB: but there are %lu flushes"
+				" and %lu pending i/o's on it\n"
+				"InnoDB: 
Loop %lu.\n", + (ulong) space->n_pending_flushes, + (ulong) node->n_pending, + (ulong) count); + } + mutex_exit(&(system->mutex)); + os_thread_sleep(20000); + + count++; + + goto try_again; + } + + path = mem_strdup(space->name); + + mutex_exit(&(system->mutex)); +#ifndef UNIV_HOTBACKUP + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since we have set space->is_being_deleted = TRUE, readahead + or ibuf merge can no longer read more pages of this tablespace to the + buffer pool. Thus we can clean the tablespace out of the buffer pool + completely and permanently. The flag is_being_deleted also prevents + fil_flush() from being applied to this tablespace. */ + + buf_LRU_invalidate_tablespace(id); +#endif + /* printf("Deleting tablespace %s id %lu\n", space->name, id); */ + + success = fil_space_free(id); + + if (success) { + success = os_file_delete(path); + + if (!success) { + success = os_file_delete_if_exists(path); + } + } + + if (success) { +#ifndef UNIV_HOTBACKUP + /* Write a log record about the deletion of the .ibd + file, so that ibbackup can replay it in the + --apply-log phase. We use a dummy mtr and the familiar + log write mechanism. */ + mtr_t mtr; + + /* When replaying the operation in ibbackup, do not try + to write any log record */ + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_DELETE, id, 0, path, NULL, &mtr); + mtr_commit(&mtr); +#endif + mem_free(path); + + return(TRUE); + } + + mem_free(path); + + return(FALSE); +} + +/*********************************************************************** +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but +1) we do not drop the table from the data dictionary; +2) we remove all insert buffer entries for the tablespace immediately; in DROP +TABLE they are only removed gradually in the background; +3) when the user does IMPORT TABLESPACE, the tablespace will have the same id +as it originally had. */ +UNIV_INTERN +ibool +fil_discard_tablespace( +/*===================*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + ibool success; + + success = fil_delete_tablespace(id); + + if (!success) { + fprintf(stderr, + "InnoDB: Warning: cannot delete tablespace %lu" + " in DISCARD TABLESPACE.\n" + "InnoDB: But let us remove the" + " insert buffer entries for this tablespace.\n", + (ulong) id); + } + + /* Remove all insert buffer entries for the tablespace */ + + ibuf_delete_for_discarded_space(id); + + return(success); +} + +/*********************************************************************** +Renames the memory cache structures of a single-table tablespace. 
*/ +static +ibool +fil_rename_tablespace_in_mem( +/*=========================*/ + /* out: TRUE if success */ + fil_space_t* space, /* in: tablespace memory object */ + fil_node_t* node, /* in: file node of that tablespace */ + const char* path) /* in: new name */ +{ + fil_system_t* system = fil_system; + fil_space_t* space2; + const char* old_name = space->name; + + space2 = fil_space_get_by_name(old_name); + if (space != space2) { + fputs("InnoDB: Error: cannot find ", stderr); + ut_print_filename(stderr, old_name); + fputs(" in tablespace memory cache\n", stderr); + + return(FALSE); + } + + space2 = fil_space_get_by_name(path); + if (space2 != NULL) { + fputs("InnoDB: Error: ", stderr); + ut_print_filename(stderr, path); + fputs(" is already in tablespace memory cache\n", stderr); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + mem_free(space->name); + mem_free(node->name); + + space->name = mem_strdup(path); + node->name = mem_strdup(path); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(path), space); + return(TRUE); +} + +/*********************************************************************** +Allocates a file name for a single-table tablespace. The string must be freed +by caller with mem_free(). */ +static +char* +fil_make_ibd_name( +/*==============*/ + /* out, own: file name */ + const char* name, /* in: table name or a dir path of a + TEMPORARY table */ + ibool is_temp) /* in: TRUE if it is a dir path */ +{ + ulint namelen = strlen(name); + ulint dirlen = strlen(fil_path_to_mysql_datadir); + char* filename = mem_alloc(namelen + dirlen + sizeof "/.ibd"); + + if (is_temp) { + memcpy(filename, name, namelen); + memcpy(filename + namelen, ".ibd", sizeof ".ibd"); + } else { + memcpy(filename, fil_path_to_mysql_datadir, dirlen); + filename[dirlen] = '/'; + + memcpy(filename + dirlen + 1, name, namelen); + memcpy(filename + dirlen + namelen + 1, ".ibd", sizeof ".ibd"); + } + + srv_normalize_path_for_win(filename); + + return(filename); +} + +/*********************************************************************** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. 
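+For example, RENAME TABLE test.t1 TO test.t2 turns the file ./test/t1.ibd
+into ./test/t2.ibd: the cached space and node names are updated first, the
+file on disk is renamed only after all pending i/o on it has drained and
+the file has been closed, and on success an MLOG_FILE_RENAME record is
+logged so that crash recovery can repeat the rename. If the rename on disk
+fails, the change to the memory cache is reverted. (Illustrative table
+names; the steps are those of the function below.)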
*/ +UNIV_INTERN +ibool +fil_rename_tablespace( +/*==================*/ + /* out: TRUE if success */ + const char* old_name, /* in: old table name in the standard + databasename/tablename format of + InnoDB, or NULL if we do the rename + based on the space id only */ + ulint id, /* in: space id */ + const char* new_name) /* in: new table name in the standard + databasename/tablename format + of InnoDB */ +{ + fil_system_t* system = fil_system; + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char* path; + ibool old_name_was_specified = TRUE; + char* old_path; + + ut_a(id != 0); + + if (old_name == NULL) { + old_name = "(name not specified)"; + old_name_was_specified = FALSE; + } +retry: + count++; + + if (count > 1000) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: problems renaming ", stderr); + ut_print_filename(stderr, old_name); + fputs(" to ", stderr); + ut_print_filename(stderr, new_name); + fprintf(stderr, ", %lu iterations\n", (ulong) count); + } + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + fprintf(stderr, + "InnoDB: Error: cannot find space id %lu" + " in the tablespace memory cache\n" + "InnoDB: though the table ", (ulong) id); + ut_print_filename(stderr, old_name); + fputs(" in a rename operation should have that id\n", stderr); + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (count > 25000) { + space->stop_ios = FALSE; + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + /* We temporarily close the .ibd file because we do not trust that + operating systems can rename an open file. For the closing we have to + wait until there are no pending i/o's or flushes on the file. */ + + space->stop_ios = TRUE; + + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (node->n_pending > 0 || node->n_pending_flushes > 0) { + /* There are pending i/o's or flushes, sleep for a while and + retry */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + goto retry; + + } else if (node->modification_counter > node->flush_counter) { + /* Flush the space */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + fil_flush(id); + + goto retry; + + } else if (node->open) { + /* Close the file */ + + fil_node_close_file(node, system); + } + + /* Check that the old name in the space is right */ + + if (old_name_was_specified) { + old_path = fil_make_ibd_name(old_name, FALSE); + + ut_a(strcmp(space->name, old_path) == 0); + ut_a(strcmp(node->name, old_path) == 0); + } else { + old_path = mem_strdup(space->name); + } + + /* Rename the tablespace and the node in the memory cache */ + path = fil_make_ibd_name(new_name, FALSE); + success = fil_rename_tablespace_in_mem(space, node, path); + + if (success) { + success = os_file_rename(old_path, path); + + if (!success) { + /* We have to revert the changes we made + to the tablespace memory cache */ + + ut_a(fil_rename_tablespace_in_mem(space, node, + old_path)); + } + } + + mem_free(path); + mem_free(old_path); + + space->stop_ios = FALSE; + + mutex_exit(&(system->mutex)); + +#ifndef UNIV_HOTBACKUP + if (success) { + mtr_t mtr; + + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_RENAME, id, 0, old_name, new_name, + &mtr); + mtr_commit(&mtr); + } +#endif + return(success); +} + +/*********************************************************************** +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. 
The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp +dir of the mysqld server. */ +UNIV_INTERN +ulint +fil_create_new_single_table_tablespace( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + ulint* space_id, /* in/out: space id; if this is != 0, + then this is an input parameter, + otherwise output */ + const char* tablename, /* in: the table name in the usual + databasename/tablename format + of InnoDB, or a dir path to a temp + table */ + ibool is_temp, /* in: TRUE if a table created with + CREATE TEMPORARY TABLE */ + ulint flags, /* in: tablespace flags */ + ulint size) /* in: the initial size of the + tablespace file in pages, + must be >= FIL_IBD_FILE_INITIAL_SIZE */ +{ + os_file_t file; + ibool ret; + ulint err; + byte* buf2; + byte* page; + ibool success; + char* path; + + ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); + /* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for + ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and + ROW_FORMAT=REDUNDANT (table->flags == 0). For any other + format, the tablespace flags should equal table->flags. */ + ut_a(flags != DICT_TF_COMPACT); + + path = fil_make_ibd_name(tablename, is_temp); + + file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + if (ret == FALSE) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error creating file ", stderr); + ut_print_filename(stderr, path); + fputs(".\n", stderr); + + /* The following call will print an error message */ + + err = os_file_get_last_error(TRUE); + + if (err == OS_FILE_ALREADY_EXISTS) { + fputs("InnoDB: The file already exists though" + " the corresponding table did not\n" + "InnoDB: exist in the InnoDB data dictionary." + " Have you moved InnoDB\n" + "InnoDB: .ibd files around without using the" + " SQL commands\n" + "InnoDB: DISCARD TABLESPACE and" + " IMPORT TABLESPACE, or did\n" + "InnoDB: mysqld crash in the middle of" + " CREATE TABLE? You can\n" + "InnoDB: resolve the problem by" + " removing the file ", stderr); + ut_print_filename(stderr, path); + fputs("\n" + "InnoDB: under the 'datadir' of MySQL.\n", + stderr); + + mem_free(path); + return(DB_TABLESPACE_ALREADY_EXISTS); + } + + if (err == OS_FILE_DISK_FULL) { + + mem_free(path); + return(DB_OUT_OF_FILE_SPACE); + } + + mem_free(path); + return(DB_ERROR); + } + + buf2 = ut_malloc(3 * UNIV_PAGE_SIZE); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = ut_align(buf2, UNIV_PAGE_SIZE); + + ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0); + + if (!ret) { + ut_free(buf2); + os_file_close(file); + os_file_delete(path); + + mem_free(path); + return(DB_OUT_OF_FILE_SPACE); + } + + if (*space_id == 0) { + *space_id = fil_assign_new_space_id(); + } + + /* printf("Creating tablespace %s id %lu\n", path, *space_id); */ + + if (*space_id == ULINT_UNDEFINED) { + ut_free(buf2); +error_exit: + os_file_close(file); +error_exit2: + os_file_delete(path); + + mem_free(path); + return(DB_ERROR); + } + + /* We have to write the space id to the file immediately and flush the + file to disk. This is because in crash recovery we must be aware what + tablespaces exist and what are their space id's, so that we can apply + the log records to the right file. It may take quite a while until + buffer pool flush algorithms write anything to the file and flush it to + disk. 
If we would not write here anything, the file would be filled + with zeros from the call of os_file_set_size(), until a buffer pool + flush would write to it. */ + + memset(page, '\0', UNIV_PAGE_SIZE); + + fsp_header_init_fields(page, *space_id, flags); + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, *space_id); + + if (!(flags & DICT_TF_ZSSIZE_MASK)) { + buf_flush_init_for_writing(page, NULL, 0); + ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE); + } else { + page_zip_des_t page_zip; + ulint zip_size; + + zip_size = ((PAGE_ZIP_MIN_SIZE >> 1) + << ((flags & DICT_TF_ZSSIZE_MASK) + >> DICT_TF_ZSSIZE_SHIFT)); + + page_zip_set_size(&page_zip, zip_size); + page_zip.data = page + UNIV_PAGE_SIZE; +#ifdef UNIV_DEBUG + page_zip.m_start = +#endif /* UNIV_DEBUG */ + page_zip.m_end = page_zip.m_nonempty = + page_zip.n_blobs = 0; + buf_flush_init_for_writing(page, &page_zip, 0); + ret = os_file_write(path, file, page_zip.data, 0, 0, zip_size); + } + + ut_free(buf2); + + if (!ret) { + fputs("InnoDB: Error: could not write the first page" + " to tablespace ", stderr); + ut_print_filename(stderr, path); + putc('\n', stderr); + goto error_exit; + } + + ret = os_file_flush(file); + + if (!ret) { + fputs("InnoDB: Error: file flush of tablespace ", stderr); + ut_print_filename(stderr, path); + fputs(" failed\n", stderr); + goto error_exit; + } + + os_file_close(file); + + if (*space_id == ULINT_UNDEFINED) { + goto error_exit2; + } + + success = fil_space_create(path, *space_id, flags, FIL_TABLESPACE); + + if (!success) { + goto error_exit2; + } + + fil_node_create(path, size, *space_id, FALSE); + +#ifndef UNIV_HOTBACKUP + { + mtr_t mtr; + + mtr_start(&mtr); + + fil_op_write_log(flags + ? MLOG_FILE_CREATE2 + : MLOG_FILE_CREATE, + *space_id, flags, + tablename, NULL, &mtr); + + mtr_commit(&mtr); + } +#endif + mem_free(path); + return(DB_SUCCESS); +} + +/************************************************************************ +It is possible, though very improbable, that the lsn's in the tablespace to be +imported have risen above the current system lsn, if a lengthy purge, ibuf +merge, or rollback was performed on a backup taken with ibbackup. If that is +the case, reset page lsn's in the file. We assume that mysqld was shut down +after it performed these cleanup operations on the .ibd file, so that it at +the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the +first page of the .ibd file, and we can determine whether we need to reset the +lsn's just by looking at that flush lsn. 
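+The check itself is cheap. For example (hypothetical lsn values): if the
+stamp in the first page is flush_lsn = 90000 while the importing server is
+only at current_lsn = 80000, then every page whose FIL_PAGE_LSN exceeds
+80000 is re-stamped to 80000 and its checksum is recomputed with
+buf_flush_init_for_writing(); finally FIL_PAGE_FILE_FLUSH_LSN in the first
+page is set to 80000 as well. If current_lsn >= flush_lsn, nothing is
+rewritten.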
*/ +UNIV_INTERN +ibool +fil_reset_too_high_lsns( +/*====================*/ + /* out: TRUE if success */ + const char* name, /* in: table name in the + databasename/tablename format */ + ib_uint64_t current_lsn) /* in: reset lsn's if the lsn stamped + to FIL_PAGE_FILE_FLUSH_LSN in the + first page is too high */ +{ + os_file_t file; + char* filepath; + byte* page; + byte* buf2; + ib_uint64_t flush_lsn; + ulint space_id; + ib_int64_t file_size; + ib_int64_t offset; + ulint zip_size; + ibool success; + + filepath = fil_make_ibd_name(name, FALSE); + + file = os_file_create_simple_no_error_handling( + filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: trying to open a table," + " but could not\n" + "InnoDB: open the tablespace file ", stderr); + ut_print_filename(stderr, filepath); + fputs("!\n", stderr); + mem_free(filepath); + + return(FALSE); + } + + /* Read the first page of the tablespace */ + + buf2 = ut_malloc(3 * UNIV_PAGE_SIZE); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = ut_align(buf2, UNIV_PAGE_SIZE); + + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + + /* We have to read the file flush lsn from the header of the file */ + + flush_lsn = mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN); + + if (current_lsn >= flush_lsn) { + /* Ok */ + success = TRUE; + + goto func_exit; + } + + space_id = fsp_header_get_space_id(page); + zip_size = fsp_header_get_zip_size(page); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Flush lsn in the tablespace file %lu" + " to be imported\n" + "InnoDB: is %llu, which exceeds current" + " system lsn %llu.\n" + "InnoDB: We reset the lsn's in the file ", + (ulong) space_id, + flush_lsn, current_lsn); + ut_print_filename(stderr, filepath); + fputs(".\n", stderr); + + ut_a(ut_is_2pow(zip_size)); + ut_a(zip_size <= UNIV_PAGE_SIZE); + + /* Loop through all the pages in the tablespace and reset the lsn and + the page checksum if necessary */ + + file_size = os_file_get_size_as_iblonglong(file); + + for (offset = 0; offset < file_size; + offset += zip_size ? zip_size : UNIV_PAGE_SIZE) { + success = os_file_read(file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), + zip_size ? zip_size : UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) { + /* We have to reset the lsn */ + + if (zip_size) { + memcpy(page + UNIV_PAGE_SIZE, page, zip_size); + buf_flush_init_for_writing( + page, page + UNIV_PAGE_SIZE, + current_lsn); + } else { + buf_flush_init_for_writing( + page, NULL, current_lsn); + } + success = os_file_write(filepath, file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), + zip_size + ? zip_size + : UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + } + } + + success = os_file_flush(file); + if (!success) { + + goto func_exit; + } + + /* We now update the flush_lsn stamp at the start of the file */ + success = os_file_read(file, page, 0, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE); + if (!success) { + + goto func_exit; + } + + mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); + + success = os_file_write(filepath, file, page, 0, 0, + zip_size ? 
zip_size : UNIV_PAGE_SIZE);
+	if (!success) {
+
+		goto func_exit;
+	}
+	success = os_file_flush(file);
+func_exit:
+	os_file_close(file);
+	ut_free(buf2);
+	mem_free(filepath);
+
+	return(success);
+}
+
+/************************************************************************
+Tries to open a single-table tablespace and optionally checks that the
+space id in it is correct. If it does not succeed, an error message is
+printed to the .err log. This function is used to open a tablespace when we
+start up mysqld, and also in IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it. */
+UNIV_INTERN
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+					/* out: TRUE if success */
+	ibool		check_space_id,	/* in: should we check that the space
+					id in the file is right; we assume
+					that this function runs much faster
+					if no check is made, since accessing
+					the file inode probably is much
+					faster (the OS caches them) than
+					accessing the first page of the file */
+	ulint		id,		/* in: space id */
+	ulint		flags,		/* in: tablespace flags */
+	const char*	name)		/* in: table name in the
+					databasename/tablename format */
+{
+	os_file_t	file;
+	char*		filepath;
+	ibool		success;
+	byte*		buf2;
+	byte*		page;
+	ulint		space_id;
+	ulint		space_flags;
+	ibool		ret		= TRUE;
+
+	filepath = fil_make_ibd_name(name, FALSE);
+
+	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+	ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+	ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
+	format, the tablespace flags should equal table->flags. */
+	ut_a(flags != DICT_TF_COMPACT);
+
+	file = os_file_create_simple_no_error_handling(
+		filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: trying to open a table,"
+		      " but could not\n"
+		      "InnoDB: open the tablespace file ", stderr);
+		ut_print_filename(stderr, filepath);
+		fputs("!\n"
+		      "InnoDB: Have you moved InnoDB .ibd files around"
+		      " without using the\n"
+		      "InnoDB: commands DISCARD TABLESPACE and"
+		      " IMPORT TABLESPACE?\n"
+		      "InnoDB: It is also possible that this is"
+		      " a temporary table #sql...,\n"
+		      "InnoDB: and MySQL removed the .ibd file for this.\n"
+		      "InnoDB: Please refer to\n"
+		      "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/"
+		      "innodb-troubleshooting.html\n"
+		      "InnoDB: for how to resolve the issue.\n", stderr);
+
+		mem_free(filepath);
+
+		return(FALSE);
+	}
+
+	if (!check_space_id) {
+		space_id = id;
+
+		goto skip_check;
+	}
+
+	/* Read the first page of the tablespace */
+
+	buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+	/* Align the memory for file i/o if we might have O_DIRECT set */
+	page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+
+	/* We have to read the tablespace id and flags from the file.
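+	If the .ibd file was copied in from another server, these values may
+	disagree with the data dictionary; e.g. (hypothetical values) a file
+	stamped with space id 5 while the dictionary expects id 7 makes the
+	check below fail, and the tablespace is not opened.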
*/ + + space_id = fsp_header_get_space_id(page); + space_flags = fsp_header_get_flags(page); + + ut_free(buf2); + + if (UNIV_UNLIKELY(space_id != id || space_flags != flags)) { + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: tablespace id and flags in file ", + stderr); + ut_print_filename(stderr, filepath); + fprintf(stderr, " are %lu and %lu, but in the InnoDB\n" + "InnoDB: data dictionary they are %lu and %lu.\n" + "InnoDB: Have you moved InnoDB .ibd files" + " around without using the\n" + "InnoDB: commands DISCARD TABLESPACE and" + " IMPORT TABLESPACE?\n" + "InnoDB: Please refer to\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n" + "InnoDB: for how to resolve the issue.\n", + (ulong) space_id, (ulong) space_flags, + (ulong) id, (ulong) flags); + + ret = FALSE; + + goto func_exit; + } + +skip_check: + success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE); + + if (!success) { + goto func_exit; + } + + /* We do not measure the size of the file, that is why we pass the 0 + below */ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + mem_free(filepath); + + return(ret); +} + +#ifdef UNIV_HOTBACKUP +/*********************************************************************** +Allocates a file name for an old version of a single-table tablespace. +The string must be freed by caller with mem_free()! */ +static +char* +fil_make_ibbackup_old_name( +/*=======================*/ + /* out, own: file name */ + const char* name) /* in: original file name */ +{ + static const char suffix[] = "_ibbackup_old_vers_"; + ulint len = strlen(name); + char* path = mem_alloc(len + (15 + sizeof suffix)); + + memcpy(path, name, len); + memcpy(path + len, suffix, (sizeof suffix) - 1); + ut_sprintf_timestamp_without_extra_chars(path + len + sizeof suffix); + return(path); +} +#endif /* UNIV_HOTBACKUP */ + +/************************************************************************ +Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.c data structures. */ +static +void +fil_load_single_table_tablespace( +/*=============================*/ + const char* dbname, /* in: database name */ + const char* filename) /* in: file name (not a path), + including the .ibd extension */ +{ + os_file_t file; + char* filepath; + ibool success; + byte* buf2; + byte* page; + ulint space_id; + ulint flags; + ulint size_low; + ulint size_high; + ib_int64_t size; +#ifdef UNIV_HOTBACKUP + fil_space_t* space; +#endif + filepath = mem_alloc(strlen(dbname) + strlen(filename) + + strlen(fil_path_to_mysql_datadir) + 3); + + sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname, + filename); + srv_normalize_path_for_win(filepath); +#ifdef __WIN__ +# ifndef UNIV_HOTBACKUP + /* If lower_case_table_names is 0 or 2, then MySQL allows database + directory names with upper case letters. On Windows, all table and + database names in InnoDB are internally always in lower case. Put the + file path to lower case, so that we are consistent with InnoDB's + internal data dictionary. 
*/ + + dict_casedn_str(filepath); +# endif /* !UNIV_HOTBACKUP */ +#endif + file = os_file_create_simple_no_error_handling( + filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, + "InnoDB: Error: could not open single-table tablespace" + " file\n" + "InnoDB: %s!\n" + "InnoDB: We do not continue the crash recovery," + " because the table may become\n" + "InnoDB: corrupt if we cannot apply the log records" + " in the InnoDB log to it.\n" + "InnoDB: To fix the problem and start mysqld:\n" + "InnoDB: 1) If there is a permission problem" + " in the file and mysqld cannot\n" + "InnoDB: open the file, you should" + " modify the permissions.\n" + "InnoDB: 2) If the table is not needed, or you can" + " restore it from a backup,\n" + "InnoDB: then you can remove the .ibd file," + " and InnoDB will do a normal\n" + "InnoDB: crash recovery and ignore that table.\n" + "InnoDB: 3) If the file system or the" + " disk is broken, and you cannot remove\n" + "InnoDB: the .ibd file, you can set" + " innodb_force_recovery > 0 in my.cnf\n" + "InnoDB: and force InnoDB to continue crash" + " recovery here.\n", filepath); + + mem_free(filepath); + + if (srv_force_recovery > 0) { + fprintf(stderr, + "InnoDB: innodb_force_recovery" + " was set to %lu. Continuing crash recovery\n" + "InnoDB: even though we cannot access" + " the .ibd file of this table.\n", + srv_force_recovery); + return; + } + + exit(1); + } + + success = os_file_get_size(file, &size_low, &size_high); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, + "InnoDB: Error: could not measure the size" + " of single-table tablespace file\n" + "InnoDB: %s!\n" + "InnoDB: We do not continue crash recovery," + " because the table will become\n" + "InnoDB: corrupt if we cannot apply the log records" + " in the InnoDB log to it.\n" + "InnoDB: To fix the problem and start mysqld:\n" + "InnoDB: 1) If there is a permission problem" + " in the file and mysqld cannot\n" + "InnoDB: access the file, you should" + " modify the permissions.\n" + "InnoDB: 2) If the table is not needed," + " or you can restore it from a backup,\n" + "InnoDB: then you can remove the .ibd file," + " and InnoDB will do a normal\n" + "InnoDB: crash recovery and ignore that table.\n" + "InnoDB: 3) If the file system or the disk is broken," + " and you cannot remove\n" + "InnoDB: the .ibd file, you can set" + " innodb_force_recovery > 0 in my.cnf\n" + "InnoDB: and force InnoDB to continue" + " crash recovery here.\n", filepath); + + os_file_close(file); + mem_free(filepath); + + if (srv_force_recovery > 0) { + fprintf(stderr, + "InnoDB: innodb_force_recovery" + " was set to %lu. Continuing crash recovery\n" + "InnoDB: even though we cannot access" + " the .ibd file of this table.\n", + srv_force_recovery); + return; + } + + exit(1); + } + + /* TODO: What to do in other cases where we cannot access an .ibd + file during a crash recovery? */ + + /* Every .ibd file is created >= 4 pages in size. Smaller files + cannot be ok. 
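+	For illustration (hypothetical values): with UNIV_PAGE_SIZE = 16384,
+	os_file_get_size() returning size_high = 1 and size_low = 65536
+	yields
+
+		size = (1 << 32) + 65536 = 4295032832 bytes,
+
+	far above the minimum of 4 * 16384 = 65536 bytes checked below.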
*/ + + size = (((ib_int64_t)size_high) << 32) + (ib_int64_t)size_low; +#ifndef UNIV_HOTBACKUP + if (size < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Error: the size of single-table tablespace" + " file %s\n" + "InnoDB: is only %lu %lu, should be at least %lu!", + filepath, + (ulong) size_high, + (ulong) size_low, (ulong) (4 * UNIV_PAGE_SIZE)); + os_file_close(file); + mem_free(filepath); + + return; + } +#endif + /* Read the first page of the tablespace if the size big enough */ + + buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = ut_align(buf2, UNIV_PAGE_SIZE); + + if (size >= FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + + /* We have to read the tablespace id from the file */ + + space_id = fsp_header_get_space_id(page); + flags = fsp_header_get_flags(page); + } else { + space_id = ULINT_UNDEFINED; + flags = 0; + } + +#ifndef UNIV_HOTBACKUP + if (space_id == ULINT_UNDEFINED || space_id == 0) { + fprintf(stderr, + "InnoDB: Error: tablespace id %lu in file %s" + " is not sensible\n", + (ulong) space_id, + filepath); + goto func_exit; + } +#else + if (space_id == ULINT_UNDEFINED || space_id == 0) { + char* new_path; + + fprintf(stderr, + "InnoDB: Renaming tablespace %s of id %lu,\n" + "InnoDB: to %s_ibbackup_old_vers_\n" + "InnoDB: because its size %lld is too small" + " (< 4 pages 16 kB each),\n" + "InnoDB: or the space id in the file header" + " is not sensible.\n" + "InnoDB: This can happen in an ibbackup run," + " and is not dangerous.\n", + filepath, space_id, filepath, size); + os_file_close(file); + + new_path = fil_make_ibbackup_old_name(filepath); + ut_a(os_file_rename(filepath, new_path)); + + ut_free(buf2); + mem_free(filepath); + mem_free(new_path); + + return; + } + + /* A backup may contain the same space several times, if the space got + renamed at a sensitive time. Since it is enough to have one version of + the space, we rename the file if a space with the same space id + already exists in the tablespace memory cache. We rather rename the + file than delete it, because if there is a bug, we do not want to + destroy valuable data. */ + + mutex_enter(&(fil_system->mutex)); + + space = fil_get_space_for_id_low(space_id); + + if (space) { + char* new_path; + + fprintf(stderr, + "InnoDB: Renaming tablespace %s of id %lu,\n" + "InnoDB: to %s_ibbackup_old_vers_\n" + "InnoDB: because space %s with the same id\n" + "InnoDB: was scanned earlier. This can happen" + " if you have renamed tables\n" + "InnoDB: during an ibbackup run.\n", + filepath, space_id, filepath, + space->name); + os_file_close(file); + + new_path = fil_make_ibbackup_old_name(filepath); + + mutex_exit(&(fil_system->mutex)); + + ut_a(os_file_rename(filepath, new_path)); + + ut_free(buf2); + mem_free(filepath); + mem_free(new_path); + + return; + } + mutex_exit(&(fil_system->mutex)); +#endif + success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE); + + if (!success) { + + goto func_exit; + } + + /* We do not use the size information we have about the file, because + the rounding formula for extents and pages is somewhat complex; we + let fil_node_open() do that task. 
*/ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + ut_free(buf2); + mem_free(filepath); +} + +/*************************************************************************** +A fault-tolerant function that tries to read the next file name in the +directory. We retry 100 times if os_file_readdir_next_file() returns -1. The +idea is to read as much good data as we can and jump over bad data. */ +static +int +fil_file_readdir_next_file( +/*=======================*/ + /* out: 0 if ok, -1 if error even after the + retries, 1 if at the end of the directory */ + ulint* err, /* out: this is set to DB_ERROR if an error + was encountered, otherwise not changed */ + const char* dirname,/* in: directory name or path */ + os_file_dir_t dir, /* in: directory stream */ + os_file_stat_t* info) /* in/out: buffer where the info is returned */ +{ + ulint i; + int ret; + + for (i = 0; i < 100; i++) { + ret = os_file_readdir_next_file(dirname, dir, info); + + if (ret != -1) { + + return(ret); + } + + fprintf(stderr, + "InnoDB: Error: os_file_readdir_next_file()" + " returned -1 in\n" + "InnoDB: directory %s\n" + "InnoDB: Crash recovery may have failed" + " for some .ibd files!\n", dirname); + + *err = DB_ERROR; + } + + return(-1); +} + +/************************************************************************ +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. */ +UNIV_INTERN +ulint +fil_load_single_table_tablespaces(void) +/*===================================*/ + /* out: DB_SUCCESS or error number */ +{ + int ret; + char* dbpath = NULL; + ulint dbpath_len = 100; + os_file_dir_t dir; + os_file_dir_t dbdir; + os_file_stat_t dbinfo; + os_file_stat_t fileinfo; + ulint err = DB_SUCCESS; + + /* The datadir of MySQL is always the default directory of mysqld */ + + dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE); + + if (dir == NULL) { + + return(DB_ERROR); + } + + dbpath = mem_alloc(dbpath_len); + + /* Scan all directories under the datadir. They are the database + directories of MySQL. 
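+	For example, a table test.t1 that lives in its own tablespace
+	appears as
+
+		$datadir/test/t1.ibd
+
+	so scanning one directory level below the datadir finds every .ibd
+	file. (Illustrative path; the scan itself follows.)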
*/ + + ret = fil_file_readdir_next_file(&err, fil_path_to_mysql_datadir, dir, + &dbinfo); + while (ret == 0) { + ulint len; + /* printf("Looking at %s in datadir\n", dbinfo.name); */ + + if (dbinfo.type == OS_FILE_TYPE_FILE + || dbinfo.type == OS_FILE_TYPE_UNKNOWN) { + + goto next_datadir_item; + } + + /* We found a symlink or a directory; try opening it to see + if a symlink is a directory */ + + len = strlen(fil_path_to_mysql_datadir) + + strlen (dbinfo.name) + 2; + if (len > dbpath_len) { + dbpath_len = len; + + if (dbpath) { + mem_free(dbpath); + } + + dbpath = mem_alloc(dbpath_len); + } + sprintf(dbpath, "%s/%s", fil_path_to_mysql_datadir, + dbinfo.name); + srv_normalize_path_for_win(dbpath); + + dbdir = os_file_opendir(dbpath, FALSE); + + if (dbdir != NULL) { + /* printf("Opened dir %s\n", dbinfo.name); */ + + /* We found a database directory; loop through it, + looking for possible .ibd files in it */ + + ret = fil_file_readdir_next_file(&err, dbpath, dbdir, + &fileinfo); + while (ret == 0) { + /* printf( + " Looking at file %s\n", fileinfo.name); */ + + if (fileinfo.type == OS_FILE_TYPE_DIR) { + + goto next_file_item; + } + + /* We found a symlink or a file */ + if (strlen(fileinfo.name) > 4 + && 0 == strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".ibd")) { + /* The name ends in .ibd; try opening + the file */ + fil_load_single_table_tablespace( + dbinfo.name, fileinfo.name); + } +next_file_item: + ret = fil_file_readdir_next_file(&err, + dbpath, dbdir, + &fileinfo); + } + + if (0 != os_file_closedir(dbdir)) { + fputs("InnoDB: Warning: could not" + " close database directory ", stderr); + ut_print_filename(stderr, dbpath); + putc('\n', stderr); + + err = DB_ERROR; + } + } + +next_datadir_item: + ret = fil_file_readdir_next_file(&err, + fil_path_to_mysql_datadir, + dir, &dbinfo); + } + + mem_free(dbpath); + + if (0 != os_file_closedir(dir)) { + fprintf(stderr, + "InnoDB: Error: could not close MySQL datadir\n"); + + return(DB_ERROR); + } + + return(err); +} + +/************************************************************************ +If we need crash recovery, and we have called +fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(), +we can call this function to print an error message of orphaned .ibd files +for which there is not a data dictionary entry with a matching table name +and space id. */ +UNIV_INTERN +void +fil_print_orphaned_tablespaces(void) +/*================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space) { + if (space->purpose == FIL_TABLESPACE && space->id != 0 + && !space->mark) { + fputs("InnoDB: Warning: tablespace ", stderr); + ut_print_filename(stderr, space->name); + fprintf(stderr, " of id %lu has no matching table in\n" + "InnoDB: the InnoDB data dictionary.\n", + (ulong) space->id); + } + + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. 
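+Passing a tablespace_version also lets a caller notice that the space was
+dropped and re-created under the same id in the meantime; a sketch of the
+intended use (hypothetical caller):
+
+	if (fil_tablespace_deleted_or_being_deleted_in_mem(id, version)) {
+		/* discard the queued i/o request: the tablespace is
+		gone, or has been replaced by a newer incarnation */
+	}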
*/ +UNIV_INTERN +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + /* out: TRUE if does not exist or is being\ + deleted */ + ulint id, /* in: space id */ + ib_int64_t version)/* in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + if (space == NULL || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + if (version != ((ib_int64_t)-1) + && space->tablespace_version != version) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + mutex_exit(&(system->mutex)); + + return(FALSE); +} + +/*********************************************************************** +Returns TRUE if a single-table tablespace exists in the memory cache. */ +UNIV_INTERN +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + /* out: TRUE if exists */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. */ +UNIV_INTERN +ibool +fil_space_for_table_exists_in_mem( +/*==============================*/ + /* out: TRUE if a matching tablespace + exists in the memory cache */ + ulint id, /* in: space id */ + const char* name, /* in: table name in the standard + 'databasename/tablename' format or + the dir path to a temp table */ + ibool is_temp, /* in: TRUE if created with CREATE + TEMPORARY TABLE */ + ibool mark_space, /* in: in crash recovery, at database + startup we mark all spaces which have + an associated table in the InnoDB + data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist) + /* in: print detailed error + information to the .err log if a + matching tablespace is not found from + memory */ +{ + fil_system_t* system = fil_system; + fil_space_t* namespace; + fil_space_t* space; + char* path; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + path = fil_make_ibd_name(name, is_temp); + + /* Look if there is a space with the same id */ + + space = fil_space_get_by_id(id); + + /* Look if there is a space with the same name; the name is the + directory path from the datadir to the file */ + + namespace = fil_space_get_by_name(path); + if (space && space == namespace) { + /* Found */ + + if (mark_space) { + space->mark = TRUE; + } + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + if (!print_error_if_does_not_exist) { + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (space == NULL) { + if (namespace == NULL) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary" + " has tablespace id %lu,\n" + "InnoDB: but tablespace with that id" + " or name does not exist. 
Have\n" + "InnoDB: you deleted or moved .ibd files?\n" + "InnoDB: This may also be a table created with" + " CREATE TEMPORARY TABLE\n" + "InnoDB: whose .ibd and .frm files" + " MySQL automatically removed, but the\n" + "InnoDB: table still exists in the" + " InnoDB internal data dictionary.\n", + (ulong) id); + } else { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary has" + " tablespace id %lu,\n" + "InnoDB: but a tablespace with that id" + " does not exist. There is\n" + "InnoDB: a tablespace of name %s and id %lu," + " though. Have\n" + "InnoDB: you deleted or moved .ibd files?\n", + (ulong) id, namespace->name, + (ulong) namespace->id); + } +error_exit: + fputs("InnoDB: Please refer to\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n" + "InnoDB: for how to resolve the issue.\n", stderr); + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (0 != strcmp(space->name, path)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary has" + " tablespace id %lu,\n" + "InnoDB: but the tablespace with that id" + " has name %s.\n" + "InnoDB: Have you deleted or moved .ibd files?\n", + (ulong) id, space->name); + + if (namespace != NULL) { + fputs("InnoDB: There is a tablespace" + " with the right name\n" + "InnoDB: ", stderr); + ut_print_filename(stderr, namespace->name); + fprintf(stderr, ", but its id is %lu.\n", + (ulong) namespace->id); + } + + goto error_exit; + } + + mem_free(path); + mutex_exit(&(system->mutex)); + + return(FALSE); +} + +/*********************************************************************** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. */ +static +ulint +fil_get_space_id_for_table( +/*=======================*/ + /* out: space id, ULINT_UNDEFINED if not + found */ + const char* name) /* in: table name in the standard + 'databasename/tablename' format */ +{ + fil_system_t* system = fil_system; + fil_space_t* namespace; + ulint id = ULINT_UNDEFINED; + char* path; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + path = fil_make_ibd_name(name, FALSE); + + /* Look if there is a space with the same name; the name is the + directory path to the file */ + + namespace = fil_space_get_by_name(path); + + if (namespace) { + id = namespace->id; + } + + mem_free(path); + + mutex_exit(&(system->mutex)); + + return(id); +} + +/************************************************************************** +Tries to extend a data file so that it would accommodate the number of pages +given. The tablespace must be cached in the memory cache. If the space is big +enough already, does nothing. 
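+The extension is written in chunks of at most 64 pages, and the 64-bit file
+offset of each chunk is kept as two 32-bit words. With page_size = 16384,
+4096 * ((1024 * 1024) / page_size) = 262144 pages span exactly 2^32 bytes,
+so the arithmetic used below (page_no standing for
+start_page_no - file_start_page_no) is
+
+	offset_high = page_no / 262144;			/* byte offset >> 32 */
+	offset_low  = (page_no % 262144) * page_size;	/* low 32 bits */
+
+which together equal (ib_int64_t) page_no * page_size.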
*/ +UNIV_INTERN +ibool +fil_extend_space_to_desired_size( +/*=============================*/ + /* out: TRUE if success */ + ulint* actual_size, /* out: size of the space after extension; + if we ran out of disk space this may be lower + than the desired size */ + ulint space_id, /* in: space id */ + ulint size_after_extend)/* in: desired size in pages after the + extension; if the current space size is bigger + than this already, the function does nothing */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ulint buf_size; + ulint start_page_no; + ulint file_start_page_no; + ulint offset_high; + ulint offset_low; + ulint page_size; + ibool success = TRUE; + + fil_mutex_enter_and_prepare_for_io(space_id); + + space = fil_space_get_by_id(space_id); + ut_a(space); + + if (space->size >= size_after_extend) { + /* Space already big enough */ + + *actual_size = space->size; + + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + page_size = dict_table_flags_to_zip_size(space->flags); + if (!page_size) { + page_size = UNIV_PAGE_SIZE; + } + + node = UT_LIST_GET_LAST(space->chain); + + fil_node_prepare_for_io(node, system, space); + + start_page_no = space->size; + file_start_page_no = space->size - node->size; + + /* Extend at most 64 pages at a time */ + buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; + buf2 = mem_alloc(buf_size + page_size); + buf = ut_align(buf2, page_size); + + memset(buf, 0, buf_size); + + while (start_page_no < size_after_extend) { + ulint n_pages = ut_min(buf_size / page_size, + size_after_extend - start_page_no); + + offset_high = (start_page_no - file_start_page_no) + / (4096 * ((1024 * 1024) / page_size)); + offset_low = ((start_page_no - file_start_page_no) + % (4096 * ((1024 * 1024) / page_size))) + * page_size; +#ifdef UNIV_HOTBACKUP + success = os_file_write(node->name, node->handle, buf, + offset_low, offset_high, + page_size * n_pages); +#else + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + offset_low, offset_high, + page_size * n_pages, + NULL, NULL); +#endif + if (success) { + node->size += n_pages; + space->size += n_pages; + + os_has_said_disk_full = FALSE; + } else { + /* Let us measure the size of the file to determine + how much we were able to extend it */ + + n_pages = ((ulint) + (os_file_get_size_as_iblonglong( + node->handle) + / page_size)) - node->size; + + node->size += n_pages; + space->size += n_pages; + + break; + } + + start_page_no += n_pages; + } + + mem_free(buf2); + + fil_node_complete_io(node, system, OS_FILE_WRITE); + + *actual_size = space->size; + +#ifndef UNIV_HOTBACKUP + if (space_id == 0) { + ulint pages_per_mb = (1024 * 1024) / page_size; + + /* Keep the last data file size info up to date, rounded to + full megabytes */ + + srv_data_file_sizes[srv_n_data_files - 1] + = (node->size / pages_per_mb) * pages_per_mb; + } +#endif /* !UNIV_HOTBACKUP */ + + /* + printf("Extended %s to %lu, actual size %lu pages\n", space->name, + size_after_extend, *actual_size); */ + mutex_exit(&(system->mutex)); + + fil_flush(space_id); + + return(success); +} + +#ifdef UNIV_HOTBACKUP +/************************************************************************ +Extends all tablespaces to the size stored in the space header. During the +ibbackup --apply-log phase we extended the spaces on-demand so that log records +could be applied, but that may have left spaces still too small compared to +the size stored in the space header. 
*/ +UNIV_INTERN +void +fil_extend_tablespaces_to_stored_len(void) +/*======================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + byte* buf; + ulint actual_size; + ulint size_in_header; + ulint error; + ibool success; + + buf = mem_alloc(UNIV_PAGE_SIZE); + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space) { + ut_a(space->purpose == FIL_TABLESPACE); + + mutex_exit(&(system->mutex)); /* no need to protect with a + mutex, because this is a + single-threaded operation */ + error = fil_read(TRUE, space->id, space->zip_size, + 0, 0, UNIV_PAGE_SIZE, buf, NULL); + ut_a(error == DB_SUCCESS); + + size_in_header = fsp_get_size_low(buf); + + success = fil_extend_space_to_desired_size( + &actual_size, space->id, size_in_header); + if (!success) { + fprintf(stderr, + "InnoDB: Error: could not extend the" + " tablespace of %s\n" + "InnoDB: to the size stored in header," + " %lu pages;\n" + "InnoDB: size after extension %lu pages\n" + "InnoDB: Check that you have free disk space" + " and retry!\n", + space->name, size_in_header, actual_size); + exit(1); + } + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); + + mem_free(buf); +} +#endif + +/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/ + +/*********************************************************************** +Tries to reserve free extents in a file space. */ +UNIV_INTERN +ibool +fil_space_reserve_free_extents( +/*===========================*/ + /* out: TRUE if succeed */ + ulint id, /* in: space id */ + ulint n_free_now, /* in: number of free extents now */ + ulint n_to_reserve) /* in: how many one wants to reserve */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ibool success; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + ut_a(space); + + if (space->n_reserved_extents + n_to_reserve > n_free_now) { + success = FALSE; + } else { + space->n_reserved_extents += n_to_reserve; + success = TRUE; + } + + mutex_exit(&(system->mutex)); + + return(success); +} + +/*********************************************************************** +Releases free extents in a file space. */ +UNIV_INTERN +void +fil_space_release_free_extents( +/*===========================*/ + ulint id, /* in: space id */ + ulint n_reserved) /* in: how many one reserved */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + ut_a(space); + ut_a(space->n_reserved_extents >= n_reserved); + + space->n_reserved_extents -= n_reserved; + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ +UNIV_INTERN +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ulint n; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(id); + + ut_a(space); + + n = space->n_reserved_extents; + + mutex_exit(&(system->mutex)); + + return(n); +} + +/*============================ FILE I/O ================================*/ + +/************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! 
+ +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. */ +static +void +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space) /* in: space */ +{ + ut_ad(node && system && space); + ut_ad(mutex_own(&(system->mutex))); + + if (system->n_open > system->max_n_open + 5) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: open files %lu" + " exceeds the limit %lu\n", + (ulong) system->n_open, + (ulong) system->max_n_open); + } + + if (node->open == FALSE) { + /* File is closed: open it */ + ut_a(node->n_pending == 0); + + fil_node_open_file(node, system, space); + } + + if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE + && space->id != 0) { + /* The node is in the LRU list, remove it */ + + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); + + UT_LIST_REMOVE(LRU, system->LRU, node); + } + + node->n_pending++; +} + +/************************************************************************ +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. */ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type) /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ +{ + ut_ad(node); + ut_ad(system); + ut_ad(mutex_own(&(system->mutex))); + + ut_a(node->n_pending > 0); + + node->n_pending--; + + if (type == OS_FILE_WRITE) { + system->modification_counter++; + node->modification_counter = system->modification_counter; + + if (!node->space->is_in_unflushed_spaces) { + + node->space->is_in_unflushed_spaces = TRUE; + UT_LIST_ADD_FIRST(unflushed_spaces, + system->unflushed_spaces, + node->space); + } + } + + if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE + && node->space->id != 0) { + /* The node must be put back to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } +} + +/************************************************************************ +Report information about an invalid page access. */ +static +void +fil_report_invalid_page_access( +/*===========================*/ + ulint block_offset, /* in: block offset */ + ulint space_id, /* in: space id */ + const char* space_name, /* in: space name */ + ulint byte_offset, /* in: byte offset */ + ulint len, /* in: I/O length */ + ulint type) /* in: I/O type */ +{ + fprintf(stderr, + "InnoDB: Error: trying to access page number %lu" + " in space %lu,\n" + "InnoDB: space name %s,\n" + "InnoDB: which is outside the tablespace bounds.\n" + "InnoDB: Byte offset %lu, len %lu, i/o type %lu.\n" + "InnoDB: If you get this error at mysqld startup," + " please check that\n" + "InnoDB: your my.cnf matches the ibdata files" + " that you have in the\n" + "InnoDB: MySQL server.\n", + (ulong) block_offset, (ulong) space_id, space_name, + (ulong) byte_offset, (ulong) len, (ulong) type); +} + +/************************************************************************ +Reads or writes data. This operation is asynchronous (aio). 
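+Internally the block offset is turned into a 64-bit file offset split into
+two 32-bit words; for uncompressed pages (UNIV_PAGE_SIZE_SHIFT = 14, i.e.
+16 kB pages) this is
+
+	offset_high = block_offset >> (32 - 14);
+	offset_low  = ((block_offset << 14) & 0xFFFFFFFFUL) + byte_offset;
+
+e.g. (hypothetical values) block_offset = 300000 gives offset_high = 1 and
+offset_low = 620232704 + byte_offset, since 300000 * 16384 =
+2^32 + 620232704.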
*/ +UNIV_INTERN +ulint +fil_io( +/*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ + ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE, + ORed to OS_FILE_LOG, if a log i/o + and ORed to OS_AIO_SIMULATED_WAKE_LATER + if simulated aio and we want to post a + batch of i/os; NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! */ + ibool sync, /* in: TRUE if synchronous aio is desired */ + ulint space_id, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /* in: offset in number of blocks */ + ulint byte_offset, /* in: remainder of offset in bytes; in + aio this must be divisible by the OS block + size */ + ulint len, /* in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ + void* buf, /* in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ + void* message) /* in: message for aio handler if non-sync + aio used, else ignored */ +{ + fil_system_t* system = fil_system; + ulint mode; + fil_space_t* space; + fil_node_t* node; + ulint offset_high; + ulint offset_low; + ibool ret; + ulint is_log; + ulint wake_later; + + is_log = type & OS_FILE_LOG; + type = type & ~OS_FILE_LOG; + + wake_later = type & OS_AIO_SIMULATED_WAKE_LATER; + type = type & ~OS_AIO_SIMULATED_WAKE_LATER; + + ut_ad(byte_offset < UNIV_PAGE_SIZE); + ut_ad(!zip_size || !byte_offset); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(buf); + ut_ad(len > 0); +#if (1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE +# error "(1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE" +#endif + ut_ad(fil_validate()); +#ifndef UNIV_LOG_DEBUG + /* ibuf bitmap pages must be read in the sync aio mode: */ + ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE) + || !ibuf_bitmap_page(zip_size, block_offset) + || sync || is_log); + ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE) + || ibuf_page(space_id, zip_size, block_offset, NULL)); +#endif + if (sync) { + mode = OS_AIO_SYNC; + } else if (is_log) { + mode = OS_AIO_LOG; + } else if (type == OS_FILE_READ + && !recv_no_ibuf_operations + && ibuf_page(space_id, zip_size, block_offset, NULL)) { + mode = OS_AIO_IBUF; + } else { + mode = OS_AIO_NORMAL; + } + + if (type == OS_FILE_READ) { + srv_data_read+= len; + } else if (type == OS_FILE_WRITE) { + srv_data_written+= len; + } + + /* Reserve the fil_system mutex and make sure that we can open at + least one file while holding it, if the file is not already open */ + + fil_mutex_enter_and_prepare_for_io(space_id); + + space = fil_space_get_by_id(space_id); + + if (!space) { + mutex_exit(&(system->mutex)); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: trying to do i/o" + " to a tablespace which does not exist.\n" + "InnoDB: i/o type %lu, space id %lu," + " page no. 
%lu, i/o length %lu bytes\n", + (ulong) type, (ulong) space_id, (ulong) block_offset, + (ulong) len); + + return(DB_TABLESPACE_DELETED); + } + + ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE)); + + node = UT_LIST_GET_FIRST(space->chain); + + for (;;) { + if (UNIV_UNLIKELY(node == NULL)) { + fil_report_invalid_page_access( + block_offset, space_id, space->name, + byte_offset, len, type); + + ut_error; + } + + if (space->id != 0 && node->size == 0) { + /* We do not know the size of a single-table tablespace + before we open the file */ + + break; + } + + if (node->size > block_offset) { + /* Found! */ + break; + } else { + block_offset -= node->size; + node = UT_LIST_GET_NEXT(chain, node); + } + } + + /* Open file if closed */ + fil_node_prepare_for_io(node, system, space); + + /* Check that at least the start offset is within the bounds of a + single-table tablespace */ + if (UNIV_UNLIKELY(node->size <= block_offset) + && space->id != 0 && space->purpose == FIL_TABLESPACE) { + + fil_report_invalid_page_access( + block_offset, space_id, space->name, byte_offset, + len, type); + + ut_error; + } + + /* Now we have made the changes in the data structures of system */ + mutex_exit(&(system->mutex)); + + /* Calculate the low 32 bits and the high 32 bits of the file offset */ + + if (!zip_size) { + offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT)); + offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) + & 0xFFFFFFFFUL) + byte_offset; + + ut_a(node->size - block_offset + >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1)) + / UNIV_PAGE_SIZE)); + } else { + ulint zip_size_shift; + switch (zip_size) { + case 1024: zip_size_shift = 10; break; + case 2048: zip_size_shift = 11; break; + case 4096: zip_size_shift = 12; break; + case 8192: zip_size_shift = 13; break; + case 16384: zip_size_shift = 14; break; + default: ut_error; + } + offset_high = block_offset >> (32 - zip_size_shift); + offset_low = (block_offset << zip_size_shift & 0xFFFFFFFFUL) + + byte_offset; + ut_a(node->size - block_offset + >= (len + (zip_size - 1)) / zip_size); + } + + /* Do aio */ + + ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + +#ifdef UNIV_HOTBACKUP + /* In ibbackup do normal i/o, not aio */ + if (type == OS_FILE_READ) { + ret = os_file_read(node->handle, buf, offset_low, offset_high, + len); + } else { + ret = os_file_write(node->name, node->handle, buf, + offset_low, offset_high, len); + } +#else + /* Queue the aio request */ + ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, + offset_low, offset_high, len, node, message); +#endif + ut_a(ret); + + if (mode == OS_AIO_SYNC) { + /* The i/o operation is already completed when we return from + os_aio: */ + + mutex_enter(&(system->mutex)); + + fil_node_complete_io(node, system, type); + + mutex_exit(&(system->mutex)); + + ut_ad(fil_validate()); + } + + return(DB_SUCCESS); +} + +/************************************************************************** +Waits for an aio operation to complete. This function is used to write the +handler for completed requests. The aio array of pending requests is divided +into segments (see os0file.c for more info). The thread specifies which +segment it wants to wait for. 
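+A dedicated i/o handler thread thus reduces to a loop of roughly this shape
+(sketch of a typical caller; simplified):
+
+	for (;;) {
+		fil_aio_wait(segment);	/* blocks until one request on
+					this segment completes, then runs
+					its completion handler */
+	}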
*/ +UNIV_INTERN +void +fil_aio_wait( +/*=========*/ + ulint segment) /* in: the number of the segment in the aio + array to wait for */ +{ + fil_system_t* system = fil_system; + ibool ret; + fil_node_t* fil_node; + void* message; + ulint type; + + ut_ad(fil_validate()); + + if (os_aio_use_native_aio) { + srv_set_io_thread_op_info(segment, "native aio handle"); +#ifdef WIN_ASYNC_IO + ret = os_aio_windows_handle(segment, 0, &fil_node, + &message, &type); +#else + ret = 0; /* Eliminate compiler warning */ + ut_error; +#endif + } else { + srv_set_io_thread_op_info(segment, "simulated aio handle"); + + ret = os_aio_simulated_handle(segment, &fil_node, + &message, &type); + } + + ut_a(ret); + + srv_set_io_thread_op_info(segment, "complete io for fil node"); + + mutex_enter(&(system->mutex)); + + fil_node_complete_io(fil_node, fil_system, type); + + mutex_exit(&(system->mutex)); + + ut_ad(fil_validate()); + + /* Do the i/o handling */ + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in tablespace 0, you have to be very careful not to introduce + deadlocks in the i/o system. We keep tablespace 0 data files always + open, and use a special i/o thread to serve insert buffer requests. */ + + if (fil_node->space->purpose == FIL_TABLESPACE) { + srv_set_io_thread_op_info(segment, "complete io for buf page"); + buf_page_io_complete(message); + } else { + srv_set_io_thread_op_info(segment, "complete io for log"); + log_io_complete(message); + } +} + +/************************************************************************** +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ +UNIV_INTERN +void +fil_flush( +/*======*/ + ulint space_id) /* in: file space id (this can be a group of + log files or a tablespace of the database) */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + os_file_t file; + ib_int64_t old_mod_counter; + + mutex_enter(&(system->mutex)); + + space = fil_space_get_by_id(space_id); + + if (!space || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return; + } + + space->n_pending_flushes++; /* prevent dropping of the space while + we are flushing */ + node = UT_LIST_GET_FIRST(space->chain); + + while (node) { + if (node->modification_counter > node->flush_counter) { + ut_a(node->open); + + /* We want to flush the changes at least up to + old_mod_counter */ + old_mod_counter = node->modification_counter; + + if (space->purpose == FIL_TABLESPACE) { + fil_n_pending_tablespace_flushes++; + } else { + fil_n_pending_log_flushes++; + fil_n_log_flushes++; + } +#ifdef __WIN__ + if (node->is_raw_disk) { + + goto skip_flush; + } +#endif +retry: + if (node->n_pending_flushes > 0) { + /* We want to avoid calling os_file_flush() on + the file twice at the same time, because we do + not know what bugs OS's may contain in file + i/o; sleep for a while */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + mutex_enter(&(system->mutex)); + + if (node->flush_counter >= old_mod_counter) { + + goto skip_flush; + } + + goto retry; + } + + ut_a(node->open); + file = node->handle; + node->n_pending_flushes++; + + mutex_exit(&(system->mutex)); + + /* fprintf(stderr, "Flushing to file %s\n", + node->name); */ + + os_file_flush(file); + + mutex_enter(&(system->mutex)); + + node->n_pending_flushes--; +skip_flush: + if (node->flush_counter < old_mod_counter) { + node->flush_counter = old_mod_counter; + + if (space->is_in_unflushed_spaces + && 
fil_space_is_flushed(space)) { + + space->is_in_unflushed_spaces = FALSE; + + UT_LIST_REMOVE( + unflushed_spaces, + system->unflushed_spaces, + space); + } + } + + if (space->purpose == FIL_TABLESPACE) { + fil_n_pending_tablespace_flushes--; + } else { + fil_n_pending_log_flushes--; + } + } + + node = UT_LIST_GET_NEXT(chain, node); + } + + space->n_pending_flushes--; + + mutex_exit(&(system->mutex)); +} + +/************************************************************************** +Flushes to disk the writes in file spaces of the given type possibly cached by +the OS. */ +UNIV_INTERN +void +fil_flush_file_spaces( +/*==================*/ + ulint purpose) /* in: FIL_TABLESPACE, FIL_LOG */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ulint* space_ids; + ulint n_space_ids; + ulint i; + + mutex_enter(&(system->mutex)); + + n_space_ids = UT_LIST_GET_LEN(system->unflushed_spaces); + if (n_space_ids == 0) { + + mutex_exit(&system->mutex); + return; + } + + /* Assemble a list of space ids to flush. Previously, we + traversed system->unflushed_spaces and called UT_LIST_GET_NEXT() + on a space that was just removed from the list by fil_flush(). + Thus, the space could be dropped and the memory overwritten. */ + space_ids = mem_alloc(n_space_ids * sizeof *space_ids); + + n_space_ids = 0; + + for (space = UT_LIST_GET_FIRST(system->unflushed_spaces); + space; + space = UT_LIST_GET_NEXT(unflushed_spaces, space)) { + + if (space->purpose == purpose && !space->is_being_deleted) { + + space_ids[n_space_ids++] = space->id; + } + } + + mutex_exit(&system->mutex); + + /* Flush the spaces. It will not hurt to call fil_flush() on + a non-existing space id. */ + for (i = 0; i < n_space_ids; i++) { + + fil_flush(space_ids[i]); + } + + mem_free(space_ids); +} + +/********************************************************************** +Checks the consistency of the tablespace cache. */ +UNIV_INTERN +ibool +fil_validate(void) +/*==============*/ + /* out: TRUE if ok */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* fil_node; + ulint n_open = 0; + ulint i; + + mutex_enter(&(system->mutex)); + + /* Look for spaces in the hash table */ + + for (i = 0; i < hash_get_n_cells(system->spaces); i++) { + + space = HASH_GET_FIRST(system->spaces, i); + + while (space != NULL) { + UT_LIST_VALIDATE(chain, fil_node_t, space->chain); + + fil_node = UT_LIST_GET_FIRST(space->chain); + + while (fil_node != NULL) { + if (fil_node->n_pending > 0) { + ut_a(fil_node->open); + } + + if (fil_node->open) { + n_open++; + } + fil_node = UT_LIST_GET_NEXT(chain, fil_node); + } + space = HASH_GET_NEXT(hash, space); + } + } + + ut_a(system->n_open == n_open); + + UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU); + + fil_node = UT_LIST_GET_FIRST(system->LRU); + + while (fil_node != NULL) { + ut_a(fil_node->n_pending == 0); + ut_a(fil_node->open); + ut_a(fil_node->space->purpose == FIL_TABLESPACE); + ut_a(fil_node->space->id != 0); + + fil_node = UT_LIST_GET_NEXT(LRU, fil_node); + } + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/************************************************************************ +Returns TRUE if file address is undefined. 
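+
+A typical caller pattern (an editorial sketch; it mirrors how the fsp
+routines later in this patch take the head of a file-based list and test
+whether the list was empty):
+
+	fil_addr_t	first = flst_get_first(header + FSP_FREE, mtr);
+
+	if (fil_addr_is_null(first)) {
+		... the FSP_FREE list is empty ...
+	}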
*/
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+			/* out: TRUE if undefined */
+	fil_addr_t	addr)	/* in: address */
+{
+	return(addr.page == FIL_NULL);
+}
+
+/************************************************************************
+Accessor functions for a file page */
+UNIV_INTERN
+ulint
+fil_page_get_prev(const byte*	page)
+{
+	return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+UNIV_INTERN
+ulint
+fil_page_get_next(const byte*	page)
+{
+	return(mach_read_from_4(page + FIL_PAGE_NEXT));
+}
+
+/*************************************************************************
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+	byte*	page,	/* in: file page */
+	ulint	type)	/* in: type */
+{
+	ut_ad(page);
+
+	mach_write_to_2(page + FIL_PAGE_TYPE, type);
+}
+
+/*************************************************************************
+Gets the file page type. */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+			/* out: type; NOTE that if the type
+			has not been written to the page, the return
+			value is not defined */
+	const byte*	page)	/* in: file page */
+{
+	ut_ad(page);
+
+	return(mach_read_from_2(page + FIL_PAGE_TYPE));
+}
+
+/*************************************************************************
+Returns local hash table information. */
+
+ulint
+fil_system_hash_cells(void)
+/*=======================*/
+{
+	if (fil_system) {
+		return (fil_system->spaces->n_cells
+			+ fil_system->name_hash->n_cells);
+	} else {
+		return 0;
+	}
+}
+
+ulint
+fil_system_hash_nodes(void)
+/*=======================*/
+{
+	if (fil_system) {
+		return (UT_LIST_GET_LEN(fil_system->space_list)
+			* (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE));
+	} else {
+		return 0;
+	}
+}
diff --git a/storage/xtradb/fsp/fsp0fsp.c b/storage/xtradb/fsp/fsp0fsp.c
new file mode 100644
index 00000000000..25d260daeea
--- /dev/null
+++ b/storage/xtradb/fsp/fsp0fsp.c
@@ -0,0 +1,4284 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**********************************************************************
+File space management
+
+Created 11/29/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0fsp.h"
+
+#ifdef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "sync0sync.h"
+#include "mtr0log.h"
+#include "fut0fut.h"
+#include "ut0byte.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "ibuf0ibuf.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "dict0boot.h"
+#include "dict0mem.h"
+#include "log0log.h"
+
+
+#define FSP_HEADER_OFFSET	FIL_PAGE_DATA	/* Offset of the space header
+						within a file page */
+
+/* The data structures in files are defined just as byte strings in C */
+typedef	byte	fsp_header_t;
+typedef	byte	xdes_t;
+
+/*			SPACE HEADER
+			============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID		0	/* space id */
+#define FSP_NOT_USED		4	/* this field contained a value up to
+					which we know that the modifications
+					in the database have been flushed to
+					the file space; not used now */
+#define	FSP_SIZE		8	/* Current size of the space in
+					pages */
+#define	FSP_FREE_LIMIT		12	/* Minimum page number for which the
+					free list has not been initialized:
+					the pages >= this limit are, by
+					definition, free; note that in a
+					single-table tablespace where size
+					< 64 pages, this number is 64, i.e.,
+					we have initialized the space
+					about the first extent, but have not
+					physically allocated those pages to
+					the file */
+#define	FSP_SPACE_FLAGS		16	/* table->flags & ~DICT_TF_COMPACT */
+#define	FSP_FRAG_N_USED		20	/* number of used pages in the
+					FSP_FREE_FRAG list */
+#define	FSP_FREE		24	/* list of free extents */
+#define	FSP_FREE_FRAG		(24 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents not
+					belonging to any segment */
+#define	FSP_FULL_FRAG		(24 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents not belonging
+					to any segment */
+#define FSP_SEG_ID		(24 + 3 * FLST_BASE_NODE_SIZE)
+					/* 8 bytes which give the first unused
+					segment id */
+#define FSP_SEG_INODES_FULL	(32 + 3 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where all the segment inode
+					slots are reserved */
+#define FSP_SEG_INODES_FREE	(32 + 4 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where not all the segment
+					header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define	FSP_HEADER_SIZE		(32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define	FSP_FREE_ADD		4	/* this many free extents are added
+					to the free list from above
+					FSP_FREE_LIMIT at a time */
+
+/*			FILE SEGMENT INODE
+			==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages.
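+
+As a worked example (editorial; it assumes the common InnoDB constants:
+16 kB UNIV_PAGE_SIZE, FSP_EXTENT_SIZE = 64, FLST_BASE_NODE_SIZE = 16,
+FLST_NODE_SIZE = 12 and FIL_PAGE_DATA = 38), the inode size defined below
+works out to
+
+	FSEG_INODE_SIZE = 16 + 3 * 16 + (64 / 2) * 4 = 192 bytes,
+
+and since FSEG_ARR_OFFSET = 38 + 12 = 50, one uncompressed inode page holds
+
+	FSP_SEG_INODES_PER_PAGE = (16384 - 50 - 10) / 192 = 85
+
+segment inodes.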
*/ + +typedef byte fseg_inode_t; + +#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA + /* the list node for linking + segment inode pages */ + +#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE) +/*-------------------------------------*/ +#define FSEG_ID 0 /* 8 bytes of segment id: if this is + ut_dulint_zero, it means that the + header is unused */ +#define FSEG_NOT_FULL_N_USED 8 + /* number of used segment pages in + the FSEG_NOT_FULL list */ +#define FSEG_FREE 12 + /* list of free extents of this + segment */ +#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE) + /* list of partially free extents */ +#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents */ +#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE) + /* magic number used in debugging */ +#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE) + /* array of individual pages + belonging to this segment in fsp + fragment extent lists */ +#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2) + /* number of slots in the array for + the fragment pages */ +#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its + page number within space, FIL_NULL + means that the slot is not in use */ +/*-------------------------------------*/ +#define FSEG_INODE_SIZE \ + (16 + 3 * FLST_BASE_NODE_SIZE \ + + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) + +#define FSP_SEG_INODES_PER_PAGE(zip_size) \ + (((zip_size ? zip_size : UNIV_PAGE_SIZE) \ + - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE) + /* Number of segment inodes which fit on a + single page */ + +#define FSEG_MAGIC_N_VALUE 97937874 + +#define FSEG_FILLFACTOR 8 /* If this value is x, then if + the number of unused but reserved + pages in a segment is less than + reserved pages * 1/x, and there are + at least FSEG_FRAG_LIMIT used pages, + then we allow a new empty extent to + be added to the segment in + fseg_alloc_free_page. Otherwise, we + use unused pages of the segment. */ + +#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS + /* If the segment has >= this many + used pages, it may be expanded by + allocating extents to the segment; + until that only individual fragment + pages are allocated from the space */ + +#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment + is at least this many extents, we + allow extents to be put to the free + list of the extent: at most + FSEG_FREE_LIST_MAX_LEN many */ +#define FSEG_FREE_LIST_MAX_LEN 4 + + +/* EXTENT DESCRIPTOR + ================= + +File extent descriptor data structure: contains bits to tell which pages in +the extent are free and which contain old tuple version to clean. */ + +/*-------------------------------------*/ +#define XDES_ID 0 /* The identifier of the segment + to which this extent belongs */ +#define XDES_FLST_NODE 8 /* The list node data structure + for the descriptors */ +#define XDES_STATE (FLST_NODE_SIZE + 8) + /* contains state information + of the extent */ +#define XDES_BITMAP (FLST_NODE_SIZE + 12) + /* Descriptor bitmap of the pages + in the extent */ +/*-------------------------------------*/ + +#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */ +#define XDES_FREE_BIT 0 /* Index of the bit which tells if + the page is free */ +#define XDES_CLEAN_BIT 1 /* NOTE: currently not used! 
+ Index of the bit which tells if + there are old versions of tuples + on the page */ +/* States of a descriptor */ +#define XDES_FREE 1 /* extent is in free list of space */ +#define XDES_FREE_FRAG 2 /* extent is in free fragment list of + space */ +#define XDES_FULL_FRAG 3 /* extent is in full fragment list of + space */ +#define XDES_FSEG 4 /* extent belongs to a segment */ + +/* File extent data structure size in bytes. */ +#define XDES_SIZE \ + (XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE)) + +/* Offset of the descriptor array on a descriptor page */ +#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE) + +/************************************************************************** +Returns an extent to the free list of a space. */ +static +void +fsp_free_extent( +/*============*/ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: page offset in the extent */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +Frees an extent of a segment to the space free list. */ +static +void +fseg_free_extent( +/*=============*/ + fseg_inode_t* seg_inode, /* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: page offset in the extent */ + mtr_t* mtr); /* in: mtr handle */ +/************************************************************************** +Calculates the number of pages reserved by a segment, and how +many pages are currently used. */ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + /* out: number of reserved pages */ + fseg_inode_t* header, /* in: segment inode */ + ulint* used, /* out: number of pages used (<= reserved) */ + mtr_t* mtr); /* in: mtr handle */ +/************************************************************************ +Marks a page used. The page must reside within the extents of the given +segment. */ +static +void +fseg_mark_page_used( +/*================*/ + fseg_inode_t* seg_inode,/* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: page offset */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +Returns the first extent descriptor for a segment. We think of the extent +lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL +-> FSEG_FREE. */ +static +xdes_t* +fseg_get_first_extent( +/*==================*/ + /* out: the first extent descriptor, or NULL if + none */ + fseg_inode_t* inode, /* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +Puts new extents to the free list if +there are free extents above the free limit. If an extent happens +to contain an extent descriptor page, the extent is put to +the FSP_FREE_FRAG list with the page marked as used. 
*/ +static +void +fsp_fill_free_list( +/*===============*/ + ibool init_space, /* in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. */ +static +ulint +fseg_alloc_free_page_low( +/*=====================*/ + /* out: the allocated page number, FIL_NULL + if no page could be allocated */ + ulint space, /* in: space */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + fseg_inode_t* seg_inode, /* in: segment inode */ + ulint hint, /* in: hint of which page would be desirable */ + byte direction, /* in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + mtr_t* mtr); /* in: mtr handle */ + + +/************************************************************************** +Reads the file space size stored in the header page. */ +UNIV_INTERN +ulint +fsp_get_size_low( +/*=============*/ + /* out: tablespace size stored in the space header */ + page_t* page) /* in: header page (page 0 in the tablespace) */ +{ + return(mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE)); +} + +/************************************************************************** +Gets a pointer to the space header and x-locks its page. */ +UNIV_INLINE +fsp_header_t* +fsp_get_space_header( +/*=================*/ + /* out: pointer to the space header, page x-locked */ + ulint id, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + fsp_header_t* header; + + ut_ad(ut_is_2pow(zip_size)); + ut_ad(zip_size <= UNIV_PAGE_SIZE); + ut_ad(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE); + ut_ad(id || !zip_size); + + block = buf_page_get(id, zip_size, 0, RW_X_LATCH, mtr); + header = FSP_HEADER_OFFSET + buf_block_get_frame(block); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + ut_ad(id == mach_read_from_4(FSP_SPACE_ID + header)); + ut_ad(zip_size == dict_table_flags_to_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + header))); + return(header); +} + +/************************************************************************** +Gets a descriptor bit of a page. */ +UNIV_INLINE +ibool +xdes_get_bit( +/*=========*/ + /* out: TRUE if free */ + xdes_t* descr, /* in: descriptor */ + ulint bit, /* in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset, /* in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ + mtr_t* mtr) /* in: mtr */ +{ + ulint index; + ulint byte_index; + ulint bit_index; + + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT)); + ut_ad(offset < FSP_EXTENT_SIZE); + + index = bit + XDES_BITS_PER_PAGE * offset; + + byte_index = index / 8; + bit_index = index % 8; + + return(ut_bit_get_nth(mtr_read_ulint(descr + XDES_BITMAP + byte_index, + MLOG_1BYTE, mtr), + bit_index)); +} + +/************************************************************************** +Sets a descriptor bit of a page. 
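+
+Worked example of the bitmap arithmetic used here and in xdes_get_bit above
+(editorial): for page offset 10 within its extent and bit == XDES_FREE_BIT
+(i.e. 0),
+
+	index      = 0 + XDES_BITS_PER_PAGE * 10 = 20
+	byte_index = 20 / 8 = 2
+	bit_index  = 20 % 8 = 4
+
+so the free bit of the page at offset 10 lives in bit 4 of the third bitmap
+byte of the descriptor.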
*/ +UNIV_INLINE +void +xdes_set_bit( +/*=========*/ + xdes_t* descr, /* in: descriptor */ + ulint bit, /* in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ulint offset, /* in: page offset within extent: + 0 ... FSP_EXTENT_SIZE - 1 */ + ibool val, /* in: bit value */ + mtr_t* mtr) /* in: mtr */ +{ + ulint index; + ulint byte_index; + ulint bit_index; + ulint descr_byte; + + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT)); + ut_ad(offset < FSP_EXTENT_SIZE); + + index = bit + XDES_BITS_PER_PAGE * offset; + + byte_index = index / 8; + bit_index = index % 8; + + descr_byte = mtr_read_ulint(descr + XDES_BITMAP + byte_index, + MLOG_1BYTE, mtr); + descr_byte = ut_bit_set_nth(descr_byte, bit_index, val); + + mlog_write_ulint(descr + XDES_BITMAP + byte_index, descr_byte, + MLOG_1BYTE, mtr); +} + +/************************************************************************** +Looks for a descriptor bit having the desired value. Starts from hint +and scans upward; at the end of the extent the search is wrapped to +the start of the extent. */ +UNIV_INLINE +ulint +xdes_find_bit( +/*==========*/ + /* out: bit index of the bit, ULINT_UNDEFINED if not + found */ + xdes_t* descr, /* in: descriptor */ + ulint bit, /* in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ibool val, /* in: desired bit value */ + ulint hint, /* in: hint of which bit position would be desirable */ + mtr_t* mtr) /* in: mtr */ +{ + ulint i; + + ut_ad(descr && mtr); + ut_ad(val <= TRUE); + ut_ad(hint < FSP_EXTENT_SIZE); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + for (i = hint; i < FSP_EXTENT_SIZE; i++) { + if (val == xdes_get_bit(descr, bit, i, mtr)) { + + return(i); + } + } + + for (i = 0; i < hint; i++) { + if (val == xdes_get_bit(descr, bit, i, mtr)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Looks for a descriptor bit having the desired value. Scans the extent in +a direction opposite to xdes_find_bit. */ +UNIV_INLINE +ulint +xdes_find_bit_downward( +/*===================*/ + /* out: bit index of the bit, ULINT_UNDEFINED if not + found */ + xdes_t* descr, /* in: descriptor */ + ulint bit, /* in: XDES_FREE_BIT or XDES_CLEAN_BIT */ + ibool val, /* in: desired bit value */ + ulint hint, /* in: hint of which bit position would be desirable */ + mtr_t* mtr) /* in: mtr */ +{ + ulint i; + + ut_ad(descr && mtr); + ut_ad(val <= TRUE); + ut_ad(hint < FSP_EXTENT_SIZE); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + for (i = hint + 1; i > 0; i--) { + if (val == xdes_get_bit(descr, bit, i - 1, mtr)) { + + return(i - 1); + } + } + + for (i = FSP_EXTENT_SIZE - 1; i > hint; i--) { + if (val == xdes_get_bit(descr, bit, i, mtr)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Returns the number of used pages in a descriptor. */ +UNIV_INLINE +ulint +xdes_get_n_used( +/*============*/ + /* out: number of pages used */ + xdes_t* descr, /* in: descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + ulint i; + ulint count = 0; + + ut_ad(descr && mtr); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + for (i = 0; i < FSP_EXTENT_SIZE; i++) { + if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) { + count++; + } + } + + return(count); +} + +/************************************************************************** +Returns true if extent contains no used pages. 
*/ +UNIV_INLINE +ibool +xdes_is_free( +/*=========*/ + /* out: TRUE if totally free */ + xdes_t* descr, /* in: descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + if (0 == xdes_get_n_used(descr, mtr)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************** +Returns true if extent contains no free pages. */ +UNIV_INLINE +ibool +xdes_is_full( +/*=========*/ + /* out: TRUE if full */ + xdes_t* descr, /* in: descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************** +Sets the state of an xdes. */ +UNIV_INLINE +void +xdes_set_state( +/*===========*/ + xdes_t* descr, /* in: descriptor */ + ulint state, /* in: state to set */ + mtr_t* mtr) /* in: mtr handle */ +{ + ut_ad(descr && mtr); + ut_ad(state >= XDES_FREE); + ut_ad(state <= XDES_FSEG); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + + mlog_write_ulint(descr + XDES_STATE, state, MLOG_4BYTES, mtr); +} + +/************************************************************************** +Gets the state of an xdes. */ +UNIV_INLINE +ulint +xdes_get_state( +/*===========*/ + /* out: state */ + xdes_t* descr, /* in: descriptor */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint state; + + ut_ad(descr && mtr); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + + state = mtr_read_ulint(descr + XDES_STATE, MLOG_4BYTES, mtr); + ut_ad(state - 1 < XDES_FSEG); + return(state); +} + +/************************************************************************** +Inits an extent descriptor to the free and clean state. */ +UNIV_INLINE +void +xdes_init( +/*======*/ + xdes_t* descr, /* in: descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + ulint i; + + ut_ad(descr && mtr); + ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX)); + ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0); + + for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) { + mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr); + } + + xdes_set_state(descr, XDES_FREE, mtr); +} + +/************************************************************************ +Calculates the page where the descriptor of a page resides. */ +UNIV_INLINE +ulint +xdes_calc_descriptor_page( +/*======================*/ + /* out: descriptor page offset */ + ulint zip_size, /* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint offset) /* in: page offset */ +{ +#if UNIV_PAGE_SIZE <= XDES_ARR_OFFSET \ + + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE +# error +#endif +#if PAGE_ZIP_MIN_SIZE <= XDES_ARR_OFFSET \ + + (PAGE_ZIP_MIN_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE +# error +#endif + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return(ut_2pow_round(offset, UNIV_PAGE_SIZE)); + } else { + ut_ad(zip_size > XDES_ARR_OFFSET + + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE); + return(ut_2pow_round(offset, zip_size)); + } +} + +/************************************************************************ +Calculates the descriptor index within a descriptor page. 
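+
+Worked example (editorial, assuming 16 kB pages and FSP_EXTENT_SIZE = 64):
+for page offset 200000 in an uncompressed space,
+
+	xdes_calc_descriptor_page(0, 200000)  = 200000 rounded down to a
+						multiple of 16384 = 196608
+	xdes_calc_descriptor_index(0, 200000) = (200000 - 196608) / 64 = 53
+
+so the descriptor of that page is entry 53 of the XDES array on descriptor
+page 196608.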
*/
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_index(
+/*=======================*/
+				/* out: descriptor index */
+	ulint	zip_size,	/* in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	offset)		/* in: page offset */
+{
+	ut_ad(ut_is_2pow(zip_size));
+
+	if (!zip_size) {
+		return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE)
+		       / FSP_EXTENT_SIZE);
+	} else {
+		return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE);
+	}
+}
+
+/************************************************************************
+Gets a pointer to the extent descriptor of a page. The page where the extent
+descriptor resides is x-locked. If the page offset is equal to the free limit
+of the space, adds new extents from above the free limit to the space free
+list, unless the free limit equals the space size. This adding is necessary
+to make the descriptor defined, as descriptors are uninitialized above the
+free limit. */
+UNIV_INLINE
+xdes_t*
+xdes_get_descriptor_with_space_hdr(
+/*===============================*/
+				/* out: pointer to the extent descriptor,
+				NULL if the page does not exist in the
+				space or if offset > free limit */
+	fsp_header_t*	sp_header,/* in: space header, x-latched */
+	ulint		space,	/* in: space id */
+	ulint		offset,	/* in: page offset;
+				if equal to the free limit,
+				we try to add new extents to
+				the space free list */
+	mtr_t*		mtr)	/* in: mtr handle */
+{
+	ulint	limit;
+	ulint	size;
+	ulint	zip_size;
+	ulint	descr_page_no;
+	page_t*	descr_page;
+
+	ut_ad(mtr);
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET);
+	/* Read free limit and space size */
+	limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
+	size  = mach_read_from_4(sp_header + FSP_SIZE);
+	zip_size = dict_table_flags_to_zip_size(
+		mach_read_from_4(sp_header + FSP_SPACE_FLAGS));
+
+	/* If offset is >= size or > limit, return NULL */
+
+	if ((offset >= size) || (offset > limit)) {
+
+		return(NULL);
+	}
+
+	/* If offset is == limit, fill free list of the space. */
+
+	if (offset == limit) {
+		fsp_fill_free_list(FALSE, space, sp_header, mtr);
+	}
+
+	descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+
+	if (descr_page_no == 0) {
+		/* It is on the space header page */
+
+		descr_page = page_align(sp_header);
+	} else {
+		buf_block_t*	block;
+
+		block = buf_page_get(space, zip_size, descr_page_no,
+				     RW_X_LATCH, mtr);
+		buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+		descr_page = buf_block_get_frame(block);
+	}
+
+	return(descr_page + XDES_ARR_OFFSET
+	       + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset));
+}
+
+/************************************************************************
+Gets a pointer to the extent descriptor of a page. The page where the
+extent descriptor resides is x-locked. If the page offset is equal to
+the free limit of the space, adds new extents from above the free limit
+to the space free list, unless the free limit equals the space size.
+This adding is necessary to make the descriptor defined, as descriptors
+are uninitialized above the free limit.
*/
+static
+xdes_t*
+xdes_get_descriptor(
+/*================*/
+			/* out: pointer to the extent descriptor, NULL if the
+			page does not exist in the space or if offset > free
+			limit */
+	ulint	space,	/* in: space id */
+	ulint	zip_size,/* in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	offset,	/* in: page offset; if equal to the free limit,
+			we try to add new extents to the space free list */
+	mtr_t*	mtr)	/* in: mtr handle */
+{
+	buf_block_t*	block;
+	fsp_header_t*	sp_header;
+
+	block = buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+	sp_header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
+	return(xdes_get_descriptor_with_space_hdr(sp_header, space, offset,
+						  mtr));
+}
+
+/************************************************************************
+Gets a pointer to the extent descriptor if the file address
+of the descriptor list node is known. The page where the
+extent descriptor resides is x-locked. */
+UNIV_INLINE
+xdes_t*
+xdes_lst_get_descriptor(
+/*====================*/
+				/* out: pointer to the extent descriptor */
+	ulint		space,	/* in: space id */
+	ulint		zip_size,/* in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fil_addr_t	lst_node,/* in: file address of the list node
+				contained in the descriptor */
+	mtr_t*		mtr)	/* in: mtr handle */
+{
+	xdes_t*	descr;
+
+	ut_ad(mtr);
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+				MTR_MEMO_X_LOCK));
+	descr = fut_get_ptr(space, zip_size, lst_node, RW_X_LATCH, mtr)
+		- XDES_FLST_NODE;
+
+	return(descr);
+}
+
+/************************************************************************
+Returns page offset of the first page in extent described by a descriptor. */
+UNIV_INLINE
+ulint
+xdes_get_offset(
+/*============*/
+			/* out: offset of the first page in extent */
+	xdes_t*	descr)	/* in: extent descriptor */
+{
+	ut_ad(descr);
+
+	return(page_get_page_no(page_align(descr))
+	       + ((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE)
+	       * FSP_EXTENT_SIZE);
+}
+
+/***************************************************************
+Inits a file page whose prior contents should be ignored. */
+static
+void
+fsp_init_file_page_low(
+/*===================*/
+	buf_block_t*	block)	/* in: pointer to a page */
+{
+	page_t*		page	= buf_block_get_frame(block);
+	page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
+
+	block->check_index_page_at_flush = FALSE;
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		memset(page, 0, UNIV_PAGE_SIZE);
+		memset(page_zip->data, 0, page_zip_get_size(page_zip));
+		mach_write_to_4(page + FIL_PAGE_OFFSET,
+				buf_block_get_page_no(block));
+		mach_write_to_4(page
+				+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+				buf_block_get_space(block));
+		memcpy(page_zip->data + FIL_PAGE_OFFSET,
+		       page + FIL_PAGE_OFFSET, 4);
+		memcpy(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+		       page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+		return;
+	}
+
+#ifdef UNIV_BASIC_LOG_DEBUG
+	memset(page, 0xff, UNIV_PAGE_SIZE);
+#endif
+	mach_write_to_4(page + FIL_PAGE_OFFSET, buf_block_get_page_no(block));
+	memset(page + FIL_PAGE_LSN, 0, 8);
+	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			buf_block_get_space(block));
+	memset(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, 0, 8);
+}
+
+/***************************************************************
+Inits a file page whose prior contents should be ignored.
*/
+static
+void
+fsp_init_file_page(
+/*===============*/
+	buf_block_t*	block,	/* in: pointer to a page */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	fsp_init_file_page_low(block);
+
+	mlog_write_initial_log_record(buf_block_get_frame(block),
+				      MLOG_INIT_FILE_PAGE, mtr);
+}
+
+/***************************************************************
+Parses a redo log record of a file page init. */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+			/* out: end of log record or NULL */
+	byte*		ptr,	/* in: buffer */
+	byte*		end_ptr __attribute__((unused)), /* in: buffer end */
+	buf_block_t*	block)	/* in: block or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	if (block) {
+		fsp_init_file_page_low(block);
+	}
+
+	return(ptr);
+}
+
+/**************************************************************************
+Initializes the fsp system. */
+UNIV_INTERN
+void
+fsp_init(void)
+/*==========*/
+{
+	/* Does nothing at the moment */
+}
+
+/**************************************************************************
+Writes the space id and compressed page size to a tablespace header.
+This function is used when, in fil0fil.c, we create a new single-table
+tablespace bypassing the buffer pool. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+	page_t*	page,		/* in/out: first page in the space */
+	ulint	space_id,	/* in: space id */
+	ulint	flags)		/* in: tablespace flags (FSP_SPACE_FLAGS):
+				0, or table->flags if newer than COMPACT */
+{
+	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+	ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+	ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
+	format, the tablespace flags should equal table->flags. */
+	ut_a(flags != DICT_TF_COMPACT);
+
+	mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
+			space_id);
+	mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page,
+			flags);
+}
+
+/**************************************************************************
+Initializes the space header of a newly created space and, if space == 0,
+also creates the insert buffer tree root.
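+
+Sketch (editorial) of how a header field written here is read back later:
+the current space size, for instance, is a 4-byte field at a fixed offset
+in page 0, so with the offsets defined above it can be read as
+
+	size = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE);
+
+which is exactly the access pattern fsp_get_size_low() uses.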
*/
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+	ulint	space,	/* in: space id */
+	ulint	size,	/* in: current size in blocks */
+	mtr_t*	mtr)	/* in: mini-transaction handle */
+{
+	fsp_header_t*	header;
+	buf_block_t*	block;
+	page_t*		page;
+	ulint		flags;
+	ulint		zip_size;
+
+	ut_ad(mtr);
+
+	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+	zip_size = dict_table_flags_to_zip_size(flags);
+	block = buf_page_create(space, 0, zip_size, mtr);
+	buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+	/* The prior contents of the file page should be ignored */
+
+	fsp_init_file_page(block, mtr);
+	page = buf_block_get_frame(block);
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR,
+			 MLOG_2BYTES, mtr);
+
+	header = FSP_HEADER_OFFSET + page;
+
+	mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_NOT_USED, 0, MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_SPACE_FLAGS, flags,
+			 MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_FRAG_N_USED, 0, MLOG_4BYTES, mtr);
+
+	flst_init(header + FSP_FREE, mtr);
+	flst_init(header + FSP_FREE_FRAG, mtr);
+	flst_init(header + FSP_FULL_FRAG, mtr);
+	flst_init(header + FSP_SEG_INODES_FULL, mtr);
+	flst_init(header + FSP_SEG_INODES_FREE, mtr);
+
+	mlog_write_dulint(header + FSP_SEG_ID, ut_dulint_create(0, 1), mtr);
+	if (space == 0) {
+		fsp_fill_free_list(FALSE, space, header, mtr);
+		btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
+			   0, 0, ut_dulint_add(DICT_IBUF_ID_MIN, space),
+			   srv_sys->dummy_ind1, mtr);
+	} else {
+		fsp_fill_free_list(TRUE, space, header, mtr);
+	}
+}
+
+/**************************************************************************
+Reads the space id from the first page of a tablespace. */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+				/* out: space id, ULINT_UNDEFINED if error */
+	const page_t*	page)	/* in: first page of a tablespace */
+{
+	ulint	fsp_id;
+	ulint	id;
+
+	fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID);
+
+	id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+	if (id != fsp_id) {
+		fprintf(stderr,
+			"InnoDB: Error: space id in fsp header %lu,"
+			" but in the page header %lu\n",
+			(ulong) fsp_id, (ulong) id);
+
+		return(ULINT_UNDEFINED);
+	}
+
+	return(id);
+}
+
+/**************************************************************************
+Reads the space flags from the first page of a tablespace. */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+				/* out: flags */
+	const page_t*	page)	/* in: first page of a tablespace */
+{
+	ut_ad(!page_offset(page));
+
+	return(mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page));
+}
+
+/**************************************************************************
+Reads the compressed page size from the first page of a tablespace. */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+				/* out: compressed page size in bytes,
+				or 0 if uncompressed */
+	const page_t*	page)	/* in: first page of a tablespace */
+{
+	ulint	flags = fsp_header_get_flags(page);
+
+	return(dict_table_flags_to_zip_size(flags));
+}
+
+/**************************************************************************
+Increases the space size field of a space.
*/
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+	ulint	space,	/* in: space id */
+	ulint	size_inc,/* in: size increment in pages */
+	mtr_t*	mtr)	/* in: mini-transaction handle */
+{
+	fsp_header_t*	header;
+	ulint		size;
+	ulint		flags;
+
+	ut_ad(mtr);
+
+	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+	header = fsp_get_space_header(space,
+				      dict_table_flags_to_zip_size(flags),
+				      mtr);
+
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES,
+			 mtr);
+}
+
+/**************************************************************************
+Gets the current free limit of the system tablespace. The free limit
+means the place of the first page which has never been put to the
+free list for allocation. The space above that address is initialized
+to zero. Also sets the global variable log_fsp_current_free_limit. */
+UNIV_INTERN
+ulint
+fsp_header_get_free_limit(void)
+/*===========================*/
+			/* out: free limit in megabytes */
+{
+	fsp_header_t*	header;
+	ulint		limit;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
+
+	header = fsp_get_space_header(0, 0, &mtr);
+
+	limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, &mtr);
+
+	limit /= ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+	log_fsp_current_free_limit_set_and_checkpoint(limit);
+
+	mtr_commit(&mtr);
+
+	return(limit);
+}
+
+/**************************************************************************
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller. */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void)
+/*================================*/
+			/* out: size in pages */
+{
+	fsp_header_t*	header;
+	ulint		size;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
+
+	header = fsp_get_space_header(0, 0, &mtr);
+
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+	mtr_commit(&mtr);
+
+	return(size);
+}
+
+/***************************************************************************
+Tries to extend a single-table tablespace so that a page would fit in the
+data file. */
+static
+ibool
+fsp_try_extend_data_file_with_pages(
+/*================================*/
+					/* out: TRUE if success */
+	ulint		space,		/* in: space */
+	ulint		page_no,	/* in: page number */
+	fsp_header_t*	header,		/* in: space header */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	ibool	success;
+	ulint	actual_size;
+	ulint	size;
+
+	ut_a(space != 0);
+
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	ut_a(page_no >= size);
+
+	success = fil_extend_space_to_desired_size(&actual_size, space,
+						   page_no + 1);
+	/* actual_size now has the space size in pages; it may be less than
+	we wanted if we ran out of disk space */
+
+	mlog_write_ulint(header + FSP_SIZE, actual_size, MLOG_4BYTES, mtr);
+
+	return(success);
+}
+
+/***************************************************************************
+Tries to extend the last data file of a tablespace if it is auto-extending.
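+
+Numeric illustration of the growth policy implemented below (editorial,
+assuming 16 kB pages, so one 64-page extent is 1 MB): a growing .ibd file
+is first extended to one full extent, then by one extent (1 MB) at a time
+while it is smaller than 32 extents (32 MB), and after that by
+FSP_FREE_ADD = 4 extents (4 MB) at a time.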
*/ +static +ibool +fsp_try_extend_data_file( +/*=====================*/ + /* out: FALSE if not auto-extending */ + ulint* actual_increase,/* out: actual increase in pages, where + we measure the tablespace size from + what the header field says; it may be + the actual file size rounded down to + megabyte */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint size; + ulint zip_size; + ulint new_size; + ulint old_size; + ulint size_increase; + ulint actual_size; + ibool success; + + *actual_increase = 0; + + if (space == 0 && !srv_auto_extend_last_data_file) { + + return(FALSE); + } + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + zip_size = dict_table_flags_to_zip_size( + mach_read_from_4(header + FSP_SPACE_FLAGS)); + + old_size = size; + + if (space == 0) { + if (!srv_last_file_size_max) { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } else { + if (srv_last_file_size_max + < srv_data_file_sizes[srv_n_data_files - 1]) { + + fprintf(stderr, + "InnoDB: Error: Last data file size" + " is %lu, max size allowed %lu\n", + (ulong) srv_data_file_sizes[ + srv_n_data_files - 1], + (ulong) srv_last_file_size_max); + } + + size_increase = srv_last_file_size_max + - srv_data_file_sizes[srv_n_data_files - 1]; + if (size_increase > SRV_AUTO_EXTEND_INCREMENT) { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } + } + } else { + /* We extend single-table tablespaces first one extent + at a time, but for bigger tablespaces more. It is not + enough to extend always by one extent, because some + extents are frag page extents. */ + ulint extent_size; /* one megabyte, in pages */ + + if (!zip_size) { + extent_size = FSP_EXTENT_SIZE; + } else { + extent_size = FSP_EXTENT_SIZE + * UNIV_PAGE_SIZE / zip_size; + } + + if (size < extent_size) { + /* Let us first extend the file to extent_size */ + success = fsp_try_extend_data_file_with_pages( + space, extent_size - 1, header, mtr); + if (!success) { + new_size = mtr_read_ulint(header + FSP_SIZE, + MLOG_4BYTES, mtr); + + *actual_increase = new_size - old_size; + + return(FALSE); + } + + size = extent_size; + } + + if (size < 32 * extent_size) { + size_increase = extent_size; + } else { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + size_increase = FSP_FREE_ADD * extent_size; + } + } + + if (size_increase == 0) { + + return(TRUE); + } + + success = fil_extend_space_to_desired_size(&actual_size, space, + size + size_increase); + /* We ignore any fragments of a full megabyte when storing the size + to the space header */ + + if (!zip_size) { + new_size = ut_calc_align_down(actual_size, + (1024 * 1024) / UNIV_PAGE_SIZE); + } else { + new_size = ut_calc_align_down(actual_size, + (1024 * 1024) / zip_size); + } + mlog_write_ulint(header + FSP_SIZE, new_size, MLOG_4BYTES, mtr); + + *actual_increase = new_size - old_size; + + return(TRUE); +} + +/************************************************************************** +Puts new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. 
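+
+Editorial example of the special extents mentioned above (assuming 16 kB
+uncompressed pages): one descriptor (XDES) page covers UNIV_PAGE_SIZE pages,
+so a new one is needed at i = 0, 16384, 32768, ... (every 256 MB). For such
+an i the extent [i, i + 63] keeps page i as an extent descriptor page and
+page i + FSP_IBUF_BITMAP_OFFSET as an insert buffer bitmap page; the extent
+goes to FSP_FREE_FRAG with those two pages pre-marked as used, while
+ordinary extents go to FSP_FREE.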
*/ +static +void +fsp_fill_free_list( +/*===============*/ + ibool init_space, /* in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint limit; + ulint size; + ulint zip_size; + xdes_t* descr; + ulint count = 0; + ulint frag_n_used; + ulint actual_increase; + ulint i; + mtr_t ibuf_mtr; + + ut_ad(header && mtr); + ut_ad(page_offset(header) == FSP_HEADER_OFFSET); + + /* Check if we can fill free list from above the free list limit */ + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr); + + zip_size = dict_table_flags_to_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + header)); + ut_a(ut_is_2pow(zip_size)); + ut_a(zip_size <= UNIV_PAGE_SIZE); + ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE); + + if (space == 0 && srv_auto_extend_last_data_file + && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { + + /* Try to increase the last data file size */ + fsp_try_extend_data_file(&actual_increase, space, header, mtr); + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + } + + if (space != 0 && !init_space + && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { + + /* Try to increase the .ibd file size */ + fsp_try_extend_data_file(&actual_increase, space, header, mtr); + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + } + + i = limit; + + while ((init_space && i < 1) + || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) { + + ibool init_xdes; + if (zip_size) { + init_xdes = ut_2pow_remainder(i, zip_size) == 0; + } else { + init_xdes = ut_2pow_remainder(i, UNIV_PAGE_SIZE) == 0; + } + + mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE, + MLOG_4BYTES, mtr); + + /* Update the free limit info in the log system and make + a checkpoint */ + if (space == 0) { + ut_a(!zip_size); + log_fsp_current_free_limit_set_and_checkpoint( + (i + FSP_EXTENT_SIZE) + / ((1024 * 1024) / UNIV_PAGE_SIZE)); + } + + if (UNIV_UNLIKELY(init_xdes)) { + + buf_block_t* block; + + /* We are going to initialize a new descriptor page + and a new ibuf bitmap page: the prior contents of the + pages should be ignored. 
*/ + + if (i > 0) { + block = buf_page_create( + space, i, zip_size, mtr); + buf_page_get(space, zip_size, i, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, + SYNC_FSP_PAGE); + + fsp_init_file_page(block, mtr); + mlog_write_ulint(buf_block_get_frame(block) + + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_XDES, + MLOG_2BYTES, mtr); + } + + /* Initialize the ibuf bitmap page in a separate + mini-transaction because it is low in the latching + order, and we must be able to release its latch + before returning from the fsp routine */ + + mtr_start(&ibuf_mtr); + + block = buf_page_create(space, + i + FSP_IBUF_BITMAP_OFFSET, + zip_size, &ibuf_mtr); + buf_page_get(space, zip_size, + i + FSP_IBUF_BITMAP_OFFSET, + RW_X_LATCH, &ibuf_mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + fsp_init_file_page(block, &ibuf_mtr); + + ibuf_bitmap_page_init(block, &ibuf_mtr); + + mtr_commit(&ibuf_mtr); + } + + descr = xdes_get_descriptor_with_space_hdr(header, space, i, + mtr); + xdes_init(descr, mtr); + +#if UNIV_PAGE_SIZE % FSP_EXTENT_SIZE +# error "UNIV_PAGE_SIZE % FSP_EXTENT_SIZE != 0" +#endif +#if PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE +# error "PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE != 0" +#endif + + if (UNIV_UNLIKELY(init_xdes)) { + + /* The first page in the extent is a descriptor page + and the second is an ibuf bitmap page: mark them + used */ + + xdes_set_bit(descr, XDES_FREE_BIT, 0, FALSE, mtr); + xdes_set_bit(descr, XDES_FREE_BIT, + FSP_IBUF_BITMAP_OFFSET, FALSE, mtr); + xdes_set_state(descr, XDES_FREE_FRAG, mtr); + + flst_add_last(header + FSP_FREE_FRAG, + descr + XDES_FLST_NODE, mtr); + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, + MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used + 2, MLOG_4BYTES, mtr); + } else { + flst_add_last(header + FSP_FREE, + descr + XDES_FLST_NODE, mtr); + count++; + } + + i += FSP_EXTENT_SIZE; + } +} + +/************************************************************************** +Allocates a new free extent. */ +static +xdes_t* +fsp_alloc_free_extent( +/*==================*/ + /* out: extent descriptor, NULL if cannot be + allocated */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint hint, /* in: hint of which extent would be desirable: any + page offset in the extent goes; the hint must not + be > FSP_FREE_LIMIT */ + mtr_t* mtr) /* in: mtr */ +{ + fsp_header_t* header; + fil_addr_t first; + xdes_t* descr; + + ut_ad(mtr); + + header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr); + + if (descr && (xdes_get_state(descr, mtr) == XDES_FREE)) { + /* Ok, we can take this extent */ + } else { + /* Take the first extent in the free list */ + first = flst_get_first(header + FSP_FREE, mtr); + + if (fil_addr_is_null(first)) { + fsp_fill_free_list(FALSE, space, header, mtr); + + first = flst_get_first(header + FSP_FREE, mtr); + } + + if (fil_addr_is_null(first)) { + + return(NULL); /* No free extents left */ + } + + descr = xdes_lst_get_descriptor(space, zip_size, first, mtr); + } + + flst_remove(header + FSP_FREE, descr + XDES_FLST_NODE, mtr); + + return(descr); +} + +/************************************************************************** +Allocates a single free page from a space. The page is marked as used. 
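+
+Usage sketch (editorial, simplified): a caller inside this file that wants
+a page near page 1000 of uncompressed space 5, within mini-transaction mtr,
+would do roughly
+
+	page_no = fsp_alloc_free_page(5, 0, 1000, &mtr);
+
+	if (page_no == FIL_NULL) {
+		... out of space: no page could be allocated ...
+	}
+
+The hint only steers the search; any free page of the space may be returned.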
*/
+static
+ulint
+fsp_alloc_free_page(
+/*================*/
+			/* out: the page offset, FIL_NULL if no page could
+			be allocated */
+	ulint	space,	/* in: space id */
+	ulint	zip_size,/* in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	hint,	/* in: hint of which page would be desirable */
+	mtr_t*	mtr)	/* in: mtr handle */
+{
+	fsp_header_t*	header;
+	fil_addr_t	first;
+	xdes_t*		descr;
+	buf_block_t*	block;
+	ulint		free;
+	ulint		frag_n_used;
+	ulint		page_no;
+	ulint		space_size;
+	ibool		success;
+
+	ut_ad(mtr);
+
+	header = fsp_get_space_header(space, zip_size, mtr);
+
+	/* Get the hinted descriptor */
+	descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+
+	if (descr && (xdes_get_state(descr, mtr) == XDES_FREE_FRAG)) {
+		/* Ok, we can take this extent */
+	} else {
+		/* Else take the first extent in free_frag list */
+		first = flst_get_first(header + FSP_FREE_FRAG, mtr);
+
+		if (fil_addr_is_null(first)) {
+			/* There are no partially full fragments: allocate
+			a free extent and add it to the FREE_FRAG list. NOTE
+			that the allocation may have as a side-effect that an
+			extent containing a descriptor page is added to the
+			FREE_FRAG list. But we will allocate our page from the
+			free extent anyway. */
+
+			descr = fsp_alloc_free_extent(space, zip_size,
+						      hint, mtr);
+
+			if (descr == NULL) {
+				/* No free space left */
+
+				return(FIL_NULL);
+			}
+
+			xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+			flst_add_last(header + FSP_FREE_FRAG,
+				      descr + XDES_FLST_NODE, mtr);
+		} else {
+			descr = xdes_lst_get_descriptor(space, zip_size,
+							first, mtr);
+		}
+
+		/* Reset the hint */
+		hint = 0;
+	}
+
+	/* Now we have in descr an extent with at least one free page. Look
+	for a free page in the extent. */
+
+	free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
+			     hint % FSP_EXTENT_SIZE, mtr);
+	if (free == ULINT_UNDEFINED) {
+
+		ut_print_buf(stderr, ((byte*)descr) - 500, 1000);
+		putc('\n', stderr);
+
+		ut_error;
+	}
+
+	page_no = xdes_get_offset(descr) + free;
+
+	space_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	if (space_size <= page_no) {
+		/* It must be that we are extending a single-table tablespace
+		whose size is still < 64 pages */
+
+		ut_a(space != 0);
+		if (page_no >= FSP_EXTENT_SIZE) {
+			fprintf(stderr,
+				"InnoDB: Error: trying to extend a"
+				" single-table tablespace %lu\n"
+				"InnoDB: by single page(s) though the"
+				" space size %lu. Page no %lu.\n",
+				(ulong) space, (ulong) space_size,
+				(ulong) page_no);
+			return(FIL_NULL);
+		}
+		success = fsp_try_extend_data_file_with_pages(space, page_no,
+							      header, mtr);
+		if (!success) {
+			/* No disk space left */
+			return(FIL_NULL);
+		}
+	}
+
+	xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
+
+	/* Update the FRAG_N_USED field */
+	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+				     mtr);
+	frag_n_used++;
+	mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
+			 mtr);
+	if (xdes_is_full(descr, mtr)) {
+		/* The fragment is full: move it to another list */
+		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+			    mtr);
+		xdes_set_state(descr, XDES_FULL_FRAG, mtr);
+
+		flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+			      mtr);
+		mlog_write_ulint(header + FSP_FRAG_N_USED,
+				 frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
+				 mtr);
+	}
+
+	/* Initialize the allocated page to the buffer pool, so that it can
+	be obtained immediately with buf_page_get without need for a disk
+	read.
*/ + + buf_page_create(space, page_no, zip_size, mtr); + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + /* Prior contents of the page should be ignored */ + fsp_init_file_page(block, mtr); + + return(page_no); +} + +/************************************************************************** +Frees a single page of a space. The page is marked as free and clean. */ +static +void +fsp_free_page( +/*==========*/ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: page offset */ + mtr_t* mtr) /* in: mtr handle */ +{ + fsp_header_t* header; + xdes_t* descr; + ulint state; + ulint frag_n_used; + + ut_ad(mtr); + + /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */ + + header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr); + + state = xdes_get_state(descr, mtr); + + if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) { + fprintf(stderr, + "InnoDB: Error: File space extent descriptor" + " of page %lu has state %lu\n", + (ulong) page, + (ulong) state); + fputs("InnoDB: Dump of descriptor: ", stderr); + ut_print_buf(stderr, ((byte*)descr) - 50, 200); + putc('\n', stderr); + + if (state == XDES_FREE) { + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } + + ut_error; + } + + if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) { + fprintf(stderr, + "InnoDB: Error: File space extent descriptor" + " of page %lu says it is free\n" + "InnoDB: Dump of descriptor: ", (ulong) page); + ut_print_buf(stderr, ((byte*)descr) - 50, 200); + putc('\n', stderr); + + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } + + xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + mtr); + if (state == XDES_FULL_FRAG) { + /* The fragment was full: move it to another list */ + flst_remove(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE, + mtr); + xdes_set_state(descr, XDES_FREE_FRAG, mtr); + flst_add_last(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + mlog_write_ulint(header + FSP_FRAG_N_USED, + frag_n_used + FSP_EXTENT_SIZE - 1, + MLOG_4BYTES, mtr); + } else { + ut_a(frag_n_used > 0); + mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used - 1, + MLOG_4BYTES, mtr); + } + + if (xdes_is_free(descr, mtr)) { + /* The extent has become free: move it to another list */ + flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE, + mtr); + fsp_free_extent(space, zip_size, page, mtr); + } +} + +/************************************************************************** +Returns an extent to the free list of a space. 
*/ +static +void +fsp_free_extent( +/*============*/ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: page offset in the extent */ + mtr_t* mtr) /* in: mtr */ +{ + fsp_header_t* header; + xdes_t* descr; + + ut_ad(mtr); + + header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr); + + if (xdes_get_state(descr, mtr) == XDES_FREE) { + + ut_print_buf(stderr, (byte*)descr - 500, 1000); + putc('\n', stderr); + + ut_error; + } + + xdes_init(descr, mtr); + + flst_add_last(header + FSP_FREE, descr + XDES_FLST_NODE, mtr); +} + +/************************************************************************** +Returns the nth inode slot on an inode page. */ +UNIV_INLINE +fseg_inode_t* +fsp_seg_inode_page_get_nth_inode( +/*=============================*/ + /* out: segment inode */ + page_t* page, /* in: segment inode page */ + ulint i, /* in: inode index on page */ + ulint zip_size __attribute__((unused)), + /* in: compressed page size, or 0 */ + mtr_t* mtr __attribute__((unused))) + /* in: mini-transaction handle */ +{ + ut_ad(i < FSP_SEG_INODES_PER_PAGE(zip_size)); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + + return(page + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i); +} + +/************************************************************************** +Looks for a used segment inode on a segment inode page. */ +static +ulint +fsp_seg_inode_page_find_used( +/*=========================*/ + /* out: segment inode index, or ULINT_UNDEFINED + if not found */ + page_t* page, /* in: segment inode page */ + ulint zip_size,/* in: compressed page size, or 0 */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint i; + fseg_inode_t* inode; + + for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + + inode = fsp_seg_inode_page_get_nth_inode( + page, i, zip_size, mtr); + + if (!ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))) { + /* This is used */ + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Looks for an unused segment inode on a segment inode page. */ +static +ulint +fsp_seg_inode_page_find_free( +/*=========================*/ + /* out: segment inode index, or ULINT_UNDEFINED + if not found */ + page_t* page, /* in: segment inode page */ + ulint i, /* in: search forward starting from this index */ + ulint zip_size,/* in: compressed page size, or 0 */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + fseg_inode_t* inode; + + for (; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + + inode = fsp_seg_inode_page_get_nth_inode( + page, i, zip_size, mtr); + + if (ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))) { + /* This is unused */ + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Allocates a new file segment inode page. 
*/ +static +ibool +fsp_alloc_seg_inode_page( +/*=====================*/ + /* out: TRUE if could be allocated */ + fsp_header_t* space_header, /* in: space header */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + fseg_inode_t* inode; + buf_block_t* block; + page_t* page; + ulint page_no; + ulint space; + ulint zip_size; + ulint i; + + ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET); + + space = page_get_space_id(page_align(space_header)); + zip_size = dict_table_flags_to_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + space_header)); + + page_no = fsp_alloc_free_page(space, zip_size, 0, mtr); + + if (page_no == FIL_NULL) { + + return(FALSE); + } + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + block->check_index_page_at_flush = FALSE; + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_INODE, + MLOG_2BYTES, mtr); + + for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) { + + inode = fsp_seg_inode_page_get_nth_inode(page, i, + zip_size, mtr); + + mlog_write_dulint(inode + FSEG_ID, ut_dulint_zero, mtr); + } + + flst_add_last(space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + return(TRUE); +} + +/************************************************************************** +Allocates a new file segment inode. */ +static +fseg_inode_t* +fsp_alloc_seg_inode( +/*================*/ + /* out: segment inode, or NULL if + not enough space */ + fsp_header_t* space_header, /* in: space header */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint page_no; + buf_block_t* block; + page_t* page; + fseg_inode_t* inode; + ibool success; + ulint zip_size; + ulint n; + + ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET); + + if (flst_get_len(space_header + FSP_SEG_INODES_FREE, mtr) == 0) { + /* Allocate a new segment inode page */ + + success = fsp_alloc_seg_inode_page(space_header, mtr); + + if (!success) { + + return(NULL); + } + } + + page_no = flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page; + + zip_size = dict_table_flags_to_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + space_header)); + block = buf_page_get(page_get_space_id(page_align(space_header)), + zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + page = buf_block_get_frame(block); + + n = fsp_seg_inode_page_find_free(page, 0, zip_size, mtr); + + ut_a(n != ULINT_UNDEFINED); + + inode = fsp_seg_inode_page_get_nth_inode(page, n, zip_size, mtr); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, + zip_size, mtr)) { + /* There are no other unused headers left on the page: move it + to another list */ + + flst_remove(space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + + flst_add_last(space_header + FSP_SEG_INODES_FULL, + page + FSEG_INODE_PAGE_NODE, mtr); + } + + return(inode); +} + +/************************************************************************** +Frees a file segment inode. 
*/ +static +void +fsp_free_seg_inode( +/*===============*/ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + fseg_inode_t* inode, /* in: segment inode */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + page_t* page; + fsp_header_t* space_header; + + page = page_align(inode); + + space_header = fsp_get_space_header(space, zip_size, mtr); + + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + if (ULINT_UNDEFINED + == fsp_seg_inode_page_find_free(page, 0, zip_size, mtr)) { + + /* Move the page to another list */ + + flst_remove(space_header + FSP_SEG_INODES_FULL, + page + FSEG_INODE_PAGE_NODE, mtr); + + flst_add_last(space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + } + + mlog_write_dulint(inode + FSEG_ID, ut_dulint_zero, mtr); + mlog_write_ulint(inode + FSEG_MAGIC_N, 0, MLOG_4BYTES, mtr); + + if (ULINT_UNDEFINED + == fsp_seg_inode_page_find_used(page, zip_size, mtr)) { + + /* There are no other used headers left on the page: free it */ + + flst_remove(space_header + FSP_SEG_INODES_FREE, + page + FSEG_INODE_PAGE_NODE, mtr); + + fsp_free_page(space, zip_size, page_get_page_no(page), mtr); + } +} + +/************************************************************************** +Returns the file segment inode, page x-latched. */ +static +fseg_inode_t* +fseg_inode_get( +/*===========*/ + /* out: segment inode, page x-latched */ + fseg_header_t* header, /* in: segment header */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /* in: mtr handle */ +{ + fil_addr_t inode_addr; + fseg_inode_t* inode; + + inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO); + inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET); + ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE)); + + inode = fut_get_ptr(space, zip_size, inode_addr, RW_X_LATCH, mtr); + + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + return(inode); +} + +/************************************************************************** +Gets the page number from the nth fragment page slot. */ +UNIV_INLINE +ulint +fseg_get_nth_frag_page_no( +/*======================*/ + /* out: page number, FIL_NULL if not in use */ + fseg_inode_t* inode, /* in: segment inode */ + ulint n, /* in: slot index */ + mtr_t* mtr __attribute__((unused))) /* in: mtr handle */ +{ + ut_ad(inode && mtr); + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + return(mach_read_from_4(inode + FSEG_FRAG_ARR + + n * FSEG_FRAG_SLOT_SIZE)); +} + +/************************************************************************** +Sets the page number in the nth fragment page slot. */ +UNIV_INLINE +void +fseg_set_nth_frag_page_no( +/*======================*/ + fseg_inode_t* inode, /* in: segment inode */ + ulint n, /* in: slot index */ + ulint page_no,/* in: page number to set */ + mtr_t* mtr) /* in: mtr handle */ +{ + ut_ad(inode && mtr); + ut_ad(n < FSEG_FRAG_ARR_N_SLOTS); + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + + mlog_write_ulint(inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE, + page_no, MLOG_4BYTES, mtr); +} + +/************************************************************************** +Finds a fragment page slot which is free. 
*/ +static +ulint +fseg_find_free_frag_page_slot( +/*==========================*/ + /* out: slot index; ULINT_UNDEFINED if none + found */ + fseg_inode_t* inode, /* in: segment inode */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint i; + ulint page_no; + + ut_ad(inode && mtr); + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no(inode, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Finds a fragment page slot which is used and last in the array. */ +static +ulint +fseg_find_last_used_frag_page_slot( +/*===============================*/ + /* out: slot index; ULINT_UNDEFINED if none + found */ + fseg_inode_t* inode, /* in: segment inode */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint i; + ulint page_no; + + ut_ad(inode && mtr); + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + page_no = fseg_get_nth_frag_page_no( + inode, FSEG_FRAG_ARR_N_SLOTS - i - 1, mtr); + + if (page_no != FIL_NULL) { + + return(FSEG_FRAG_ARR_N_SLOTS - i - 1); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Calculates reserved fragment page slots. */ +static +ulint +fseg_get_n_frag_pages( +/*==================*/ + /* out: number of fragment pages */ + fseg_inode_t* inode, /* in: segment inode */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint i; + ulint count = 0; + + ut_ad(inode && mtr); + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i, mtr)) { + count++; + } + } + + return(count); +} + +/************************************************************************** +Creates a new segment. */ +UNIV_INTERN +buf_block_t* +fseg_create_general( +/*================*/ + /* out: the block where the segment header is placed, + x-latched, NULL if could not create segment + because of lack of space */ + ulint space, /* in: space id */ + ulint page, /* in: page where the segment header is placed: if + this is != 0, the page must belong to another segment, + if this is 0, a new page will be allocated and it + will belong to the created segment */ + ulint byte_offset, /* in: byte offset of the created segment header + on the page */ + ibool has_done_reservation, /* in: TRUE if the caller has already + done the reservation for the pages with + fsp_reserve_free_extents (at least 2 extents: one for + the inode and the other for the segment) then there is + no need to do the check for this individual + operation */ + mtr_t* mtr) /* in: mtr */ +{ + ulint flags; + ulint zip_size; + fsp_header_t* space_header; + fseg_inode_t* inode; + dulint seg_id; + buf_block_t* block = 0; /* remove warning */ + fseg_header_t* header = 0; /* remove warning */ + rw_lock_t* latch; + ibool success; + ulint n_reserved; + ulint i; + + ut_ad(mtr); + ut_ad(byte_offset + FSEG_HEADER_SIZE + <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END); + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + + if (page != 0) { + block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr); + header = byte_offset + buf_block_get_frame(block); + } + + ut_ad(!mutex_own(&kernel_mutex) + || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK)); + + mtr_x_lock(latch, mtr); + + if (rw_lock_get_x_lock_count(latch) == 1) { + /* This thread did not own the latch before this call: free + excess pages from the insert buffer free list */ + + if (space == IBUF_SPACE_ID) { + 
ibuf_free_excess_pages(); + } + } + + if (!has_done_reservation) { + success = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (!success) { + return(NULL); + } + } + + space_header = fsp_get_space_header(space, zip_size, mtr); + + inode = fsp_alloc_seg_inode(space_header, mtr); + + if (inode == NULL) { + + goto funct_exit; + } + + /* Read the next segment id from space header and increment the + value in space header */ + + seg_id = mtr_read_dulint(space_header + FSP_SEG_ID, mtr); + + mlog_write_dulint(space_header + FSP_SEG_ID, ut_dulint_add(seg_id, 1), + mtr); + + mlog_write_dulint(inode + FSEG_ID, seg_id, mtr); + mlog_write_ulint(inode + FSEG_NOT_FULL_N_USED, 0, MLOG_4BYTES, mtr); + + flst_init(inode + FSEG_FREE, mtr); + flst_init(inode + FSEG_NOT_FULL, mtr); + flst_init(inode + FSEG_FULL, mtr); + + mlog_write_ulint(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE, + MLOG_4BYTES, mtr); + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + fseg_set_nth_frag_page_no(inode, i, FIL_NULL, mtr); + } + + if (page == 0) { + page = fseg_alloc_free_page_low(space, zip_size, + inode, 0, FSP_UP, mtr); + + if (page == FIL_NULL) { + + fsp_free_seg_inode(space, zip_size, inode, mtr); + + goto funct_exit; + } + + block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr); + header = byte_offset + buf_block_get_frame(block); + mlog_write_ulint(header - byte_offset + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_SYS, MLOG_2BYTES, mtr); + } + + mlog_write_ulint(header + FSEG_HDR_OFFSET, + page_offset(inode), MLOG_2BYTES, mtr); + + mlog_write_ulint(header + FSEG_HDR_PAGE_NO, + page_get_page_no(page_align(inode)), + MLOG_4BYTES, mtr); + + mlog_write_ulint(header + FSEG_HDR_SPACE, space, MLOG_4BYTES, mtr); + +funct_exit: + if (!has_done_reservation) { + + fil_space_release_free_extents(space, n_reserved); + } + + return(block); +} + +/************************************************************************** +Creates a new segment. */ +UNIV_INTERN +buf_block_t* +fseg_create( +/*========*/ + /* out: the block where the segment header is placed, + x-latched, NULL if could not create segment + because of lack of space */ + ulint space, /* in: space id */ + ulint page, /* in: page where the segment header is placed: if + this is != 0, the page must belong to another segment, + if this is 0, a new page will be allocated and it + will belong to the created segment */ + ulint byte_offset, /* in: byte offset of the created segment header + on the page */ + mtr_t* mtr) /* in: mtr */ +{ + return(fseg_create_general(space, page, byte_offset, FALSE, mtr)); +} + +/************************************************************************** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. 
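+In terms of the inode fields, the two figures computed below are
+
+	used     = FSEG_NOT_FULL_N_USED
+		   + FSP_EXTENT_SIZE * len(FSEG_FULL list)
+		   + number of used fragment page slots
+
+	reserved = number of used fragment page slots
+		   + FSP_EXTENT_SIZE * (len(FSEG_FREE list)
+					+ len(FSEG_NOT_FULL list)
+					+ len(FSEG_FULL list))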
*/ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + /* out: number of reserved pages */ + fseg_inode_t* inode, /* in: segment inode */ + ulint* used, /* out: number of pages used (<= reserved) */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint ret; + + ut_ad(inode && used && mtr); + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + + *used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr) + + fseg_get_n_frag_pages(inode, mtr); + + ret = fseg_get_n_frag_pages(inode, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL, mtr) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr); + + return(ret); +} + +/************************************************************************** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. */ +UNIV_INTERN +ulint +fseg_n_reserved_pages( +/*==================*/ + /* out: number of reserved pages */ + fseg_header_t* header, /* in: segment header */ + ulint* used, /* out: number of pages used (<= reserved) */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint ret; + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + rw_lock_t* latch; + + space = page_get_space_id(page_align(header)); + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + + ut_ad(!mutex_own(&kernel_mutex) + || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK)); + + mtr_x_lock(latch, mtr); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + ret = fseg_n_reserved_pages_low(inode, used, mtr); + + return(ret); +} + +/************************************************************************* +Tries to fill the free list of a segment with consecutive free extents. +This happens if the segment is big enough to allow extents in the free list, +the free list is empty, and the extents can be allocated consecutively from +the hint onward. 
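+For example, with FSP_EXTENT_SIZE == 64 and hint == 128, the loop below
+inspects the extents starting at pages 128, 192, 256, ... and stops at
+the first one that is not completely free, adding at most
+FSEG_FREE_LIST_MAX_LEN extents to FSEG_FREE.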
*/ +static +void +fseg_fill_free_list( +/*================*/ + fseg_inode_t* inode, /* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint hint, /* in: hint which extent would be good as + the first extent */ + mtr_t* mtr) /* in: mtr */ +{ + xdes_t* descr; + ulint i; + dulint seg_id; + ulint reserved; + ulint used; + + ut_ad(inode && mtr); + ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + + reserved = fseg_n_reserved_pages_low(inode, &used, mtr); + + if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) { + + /* The segment is too small to allow extents in free list */ + + return; + } + + if (flst_get_len(inode + FSEG_FREE, mtr) > 0) { + /* Free list is not empty */ + + return; + } + + for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) { + descr = xdes_get_descriptor(space, zip_size, hint, mtr); + + if ((descr == NULL) + || (XDES_FREE != xdes_get_state(descr, mtr))) { + + /* We cannot allocate the desired extent: stop */ + + return; + } + + descr = fsp_alloc_free_extent(space, zip_size, hint, mtr); + + xdes_set_state(descr, XDES_FSEG, mtr); + + seg_id = mtr_read_dulint(inode + FSEG_ID, mtr); + mlog_write_dulint(descr + XDES_ID, seg_id, mtr); + + flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr); + hint += FSP_EXTENT_SIZE; + } +} + +/************************************************************************* +Allocates a free extent for the segment: looks first in the free list of the +segment, then tries to allocate from the space free list. NOTE that the extent +returned still resides in the segment free list, it is not yet taken off it! */ +static +xdes_t* +fseg_alloc_free_extent( +/*===================*/ + /* out: allocated extent, still placed in the + segment free list, NULL if could + not be allocated */ + fseg_inode_t* inode, /* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /* in: mtr */ +{ + xdes_t* descr; + dulint seg_id; + fil_addr_t first; + + ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + + if (flst_get_len(inode + FSEG_FREE, mtr) > 0) { + /* Segment free list is not empty, allocate from it */ + + first = flst_get_first(inode + FSEG_FREE, mtr); + + descr = xdes_lst_get_descriptor(space, zip_size, first, mtr); + } else { + /* Segment free list was empty, allocate from space */ + descr = fsp_alloc_free_extent(space, zip_size, 0, mtr); + + if (descr == NULL) { + + return(NULL); + } + + seg_id = mtr_read_dulint(inode + FSEG_ID, mtr); + + xdes_set_state(descr, XDES_FSEG, mtr); + mlog_write_dulint(descr + XDES_ID, seg_id, mtr); + flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr); + + /* Try to fill the segment free list */ + fseg_fill_free_list(inode, space, zip_size, + xdes_get_offset(descr) + FSP_EXTENT_SIZE, + mtr); + } + + return(descr); +} + +/************************************************************************** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. 
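+In outline, the cases tried below, in this order, are:
+
+1. the hinted page, if it is free in an extent that the segment owns;
+2. the hinted page, if its extent is free and the segment is entitled
+   to claim a whole new extent;
+3. the lowest or highest page (depending on the insert direction) of a
+   free extent newly taken over by the segment;
+4. some free page in the hinted page's extent, if that extent already
+   belongs to the segment;
+5. any unused page that the segment has already reserved;
+6. an individual page allocated directly from the space, while the
+   segment is still below its fragment page limit;
+7. the first page of a newly allocated extent.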
*/ +static +ulint +fseg_alloc_free_page_low( +/*=====================*/ + /* out: the allocated page number, FIL_NULL + if no page could be allocated */ + ulint space, /* in: space */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + fseg_inode_t* seg_inode, /* in: segment inode */ + ulint hint, /* in: hint of which page would be desirable */ + byte direction, /* in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + mtr_t* mtr) /* in: mtr handle */ +{ + fsp_header_t* space_header; + ulint space_size; + dulint seg_id; + ulint used; + ulint reserved; + xdes_t* descr; /* extent of the hinted page */ + ulint ret_page; /* the allocated page offset, FIL_NULL + if could not be allocated */ + xdes_t* ret_descr; /* the extent of the allocated page */ + ibool frag_page_allocated = FALSE; + ibool success; + ulint n; + + ut_ad(mtr); + ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR)); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + seg_id = mtr_read_dulint(seg_inode + FSEG_ID, mtr); + + ut_ad(!ut_dulint_is_zero(seg_id)); + + reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr); + + space_header = fsp_get_space_header(space, zip_size, mtr); + + descr = xdes_get_descriptor_with_space_hdr(space_header, space, + hint, mtr); + if (descr == NULL) { + /* Hint outside space or too high above free limit: reset + hint */ + hint = 0; + descr = xdes_get_descriptor(space, zip_size, hint, mtr); + } + + /* In the big if-else below we look for ret_page and ret_descr */ + /*-------------------------------------------------------------*/ + if ((xdes_get_state(descr, mtr) == XDES_FSEG) + && (0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, + mtr), seg_id)) + && (xdes_get_bit(descr, XDES_FREE_BIT, + hint % FSP_EXTENT_SIZE, mtr) == TRUE)) { + + /* 1. We can take the hinted page + =================================*/ + ret_descr = descr; + ret_page = hint; + /*-----------------------------------------------------------*/ + } else if ((xdes_get_state(descr, mtr) == XDES_FREE) + && ((reserved - used) < reserved / FSEG_FILLFACTOR) + && (used >= FSEG_FRAG_LIMIT)) { + + /* 2. We allocate the free extent from space and can take + ========================================================= + the hinted page + ===============*/ + ret_descr = fsp_alloc_free_extent(space, zip_size, hint, mtr); + + ut_a(ret_descr == descr); + + xdes_set_state(ret_descr, XDES_FSEG, mtr); + mlog_write_dulint(ret_descr + XDES_ID, seg_id, mtr); + flst_add_last(seg_inode + FSEG_FREE, + ret_descr + XDES_FLST_NODE, mtr); + + /* Try to fill the segment free list */ + fseg_fill_free_list(seg_inode, space, zip_size, + hint + FSP_EXTENT_SIZE, mtr); + ret_page = hint; + /*-----------------------------------------------------------*/ + } else if ((direction != FSP_NO_DIR) + && ((reserved - used) < reserved / FSEG_FILLFACTOR) + && (used >= FSEG_FRAG_LIMIT) + && (!!(ret_descr + = fseg_alloc_free_extent(seg_inode, + space, zip_size, mtr)))) { + + /* 3. 
We take any free extent (which was already assigned above
+		===============================================================
+		in the if-condition to ret_descr) and take the lowest or
+		========================================================
+		highest page in it, depending on the direction
+		==============================================*/
+		ret_page = xdes_get_offset(ret_descr);
+
+		if (direction == FSP_DOWN) {
+			ret_page += FSP_EXTENT_SIZE - 1;
+		}
+		/*-----------------------------------------------------------*/
+	} else if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+		   && (0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID,
+							  mtr), seg_id))
+		   && (!xdes_is_full(descr, mtr))) {
+
+		/* 4. We can take the page from the same extent as the
+		======================================================
+		hinted page (and the extent already belongs to the
+		==================================================
+		segment)
+		========*/
+		ret_descr = descr;
+		ret_page = xdes_get_offset(ret_descr)
+			+ xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+					hint % FSP_EXTENT_SIZE, mtr);
+		/*-----------------------------------------------------------*/
+	} else if (reserved - used > 0) {
+		/* 5. We take any unused page from the segment
+		==============================================*/
+		fil_addr_t	first;
+
+		if (flst_get_len(seg_inode + FSEG_NOT_FULL, mtr) > 0) {
+			first = flst_get_first(seg_inode + FSEG_NOT_FULL,
+					       mtr);
+		} else if (flst_get_len(seg_inode + FSEG_FREE, mtr) > 0) {
+			first = flst_get_first(seg_inode + FSEG_FREE, mtr);
+		} else {
+			ut_error;
+			return(FIL_NULL);
+		}
+
+		ret_descr = xdes_lst_get_descriptor(space, zip_size,
+						    first, mtr);
+		ret_page = xdes_get_offset(ret_descr)
+			+ xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+					0, mtr);
+		/*-----------------------------------------------------------*/
+	} else if (used < FSEG_FRAG_LIMIT) {
+		/* 6. We allocate an individual page from the space
+		===================================================*/
+		ret_page = fsp_alloc_free_page(space, zip_size, hint, mtr);
+		ret_descr = NULL;
+
+		frag_page_allocated = TRUE;
+
+		if (ret_page != FIL_NULL) {
+			/* Put the page in the fragment page array of the
+			segment; note that the slot search returns
+			ULINT_UNDEFINED, not FIL_NULL, when no free slot
+			exists */
+			n = fseg_find_free_frag_page_slot(seg_inode, mtr);
+			ut_a(n != ULINT_UNDEFINED);
+
+			fseg_set_nth_frag_page_no(seg_inode, n, ret_page,
+						  mtr);
+		}
+		/*-----------------------------------------------------------*/
+	} else {
+		/* 7. We allocate a new extent and take its first page
+		======================================================*/
+		ret_descr = fseg_alloc_free_extent(seg_inode,
+						   space, zip_size, mtr);
+
+		if (ret_descr == NULL) {
+			ret_page = FIL_NULL;
+		} else {
+			ret_page = xdes_get_offset(ret_descr);
+		}
+	}
+
+	if (ret_page == FIL_NULL) {
+		/* Page could not be allocated */
+
+		return(FIL_NULL);
+	}
+
+	if (space != 0) {
+		space_size = fil_space_get_size(space);
+
+		if (space_size <= ret_page) {
+			/* It must be that we are extending a single-table
+			tablespace whose size is still < 64 pages */
+
+			if (ret_page >= FSP_EXTENT_SIZE) {
+				fprintf(stderr,
+					"InnoDB: Error (2): trying to extend"
+					" a single-table tablespace %lu\n"
+					"InnoDB: by single page(s) though"
+					" the space size %lu.
Page no %lu.\n", + (ulong) space, (ulong) space_size, + (ulong) ret_page); + return(FIL_NULL); + } + + success = fsp_try_extend_data_file_with_pages( + space, ret_page, space_header, mtr); + if (!success) { + /* No disk space left */ + return(FIL_NULL); + } + } + } + + if (!frag_page_allocated) { + /* Initialize the allocated page to buffer pool, so that it + can be obtained immediately with buf_page_get without need + for a disk read */ + buf_block_t* block; + ulint zip_size = dict_table_flags_to_zip_size( + mach_read_from_4(FSP_SPACE_FLAGS + space_header)); + + block = buf_page_create(space, ret_page, zip_size, mtr); + buf_block_dbg_add_level(block, SYNC_FSP_PAGE); + + if (UNIV_UNLIKELY(block != buf_page_get(space, zip_size, + ret_page, RW_X_LATCH, + mtr))) { + ut_error; + } + + /* The prior contents of the page should be ignored */ + fsp_init_file_page(block, mtr); + + /* At this point we know the extent and the page offset. + The extent is still in the appropriate list (FSEG_NOT_FULL + or FSEG_FREE), and the page is not yet marked as used. */ + + ut_ad(xdes_get_descriptor(space, zip_size, ret_page, mtr) + == ret_descr); + ut_ad(xdes_get_bit(ret_descr, XDES_FREE_BIT, + ret_page % FSP_EXTENT_SIZE, mtr) == TRUE); + + fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr); + } + + buf_reset_check_index_page_at_flush(space, ret_page); + + return(ret_page); +} + +/************************************************************************** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. */ +UNIV_INTERN +ulint +fseg_alloc_free_page_general( +/*=========================*/ + /* out: allocated page offset, FIL_NULL if no + page could be allocated */ + fseg_header_t* seg_header,/* in: segment header */ + ulint hint, /* in: hint of which page would be desirable */ + byte direction,/* in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + ibool has_done_reservation, /* in: TRUE if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr) /* in: mtr handle */ +{ + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + rw_lock_t* latch; + ibool success; + ulint page_no; + ulint n_reserved; + + space = page_get_space_id(page_align(seg_header)); + + latch = fil_space_get_latch(space, &flags); + + zip_size = dict_table_flags_to_zip_size(flags); + + ut_ad(!mutex_own(&kernel_mutex) + || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK)); + + mtr_x_lock(latch, mtr); + + if (rw_lock_get_x_lock_count(latch) == 1) { + /* This thread did not own the latch before this call: free + excess pages from the insert buffer free list */ + + if (space == IBUF_SPACE_ID) { + ibuf_free_excess_pages(); + } + } + + inode = fseg_inode_get(seg_header, space, zip_size, mtr); + + if (!has_done_reservation) { + success = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (!success) { + return(FIL_NULL); + } + } + + page_no = fseg_alloc_free_page_low(space, zip_size, + inode, hint, direction, mtr); + if (!has_done_reservation) { + fil_space_release_free_extents(space, n_reserved); + } + + return(page_no); +} + +/************************************************************************** +Allocates a single free page from a segment. 
This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation. */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page(
+/*=================*/
+			/* out: allocated page offset, FIL_NULL if no
+			page could be allocated */
+	fseg_header_t*	seg_header,/* in: segment header */
+	ulint	hint,	/* in: hint of which page would be desirable */
+	byte	direction,/* in: if the new page is needed because
+			of an index page split, and records are
+			inserted there in order, into which
+			direction they go alphabetically: FSP_DOWN,
+			FSP_UP, FSP_NO_DIR */
+	mtr_t*	mtr)	/* in: mtr handle */
+{
+	return(fseg_alloc_free_page_general(seg_header, hint, direction,
+					    FALSE, mtr));
+}
+
+/**************************************************************************
+Checks that we have at least 2 frag pages free in the first extent of a
+single-table tablespace, and they are also physically initialized to the data
+file. That is, we have already extended the data file so that those pages are
+inside the data file. If not, this function extends the tablespace with
+pages. */
+static
+ibool
+fsp_reserve_free_pages(
+/*===================*/
+					/* out: TRUE if there were >= 3 free
+					pages, or we were able to extend */
+	ulint		space,		/* in: space id, must be != 0 */
+	fsp_header_t*	space_header,	/* in: header of that space,
+					x-latched */
+	ulint		size,		/* in: size of the tablespace in pages,
+					must be < FSP_EXTENT_SIZE / 2 */
+	mtr_t*		mtr)		/* in: mtr */
+{
+	xdes_t*	descr;
+	ulint	n_used;
+
+	ut_a(space != 0);
+	ut_a(size < FSP_EXTENT_SIZE / 2);
+
+	descr = xdes_get_descriptor_with_space_hdr(space_header, space, 0,
+						   mtr);
+	n_used = xdes_get_n_used(descr, mtr);
+
+	ut_a(n_used <= size);
+
+	if (size >= n_used + 2) {
+
+		return(TRUE);
+	}
+
+	return(fsp_try_extend_data_file_with_pages(space, n_used + 1,
+						   space_header, mtr));
+}
+
+/**************************************************************************
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64-page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available.
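+
+A typical reserve/release pair looks like this (sketch; fseg_create_general
+above follows exactly this pattern):
+
+	ulint	n_reserved;
+
+	if (fsp_reserve_free_extents(&n_reserved, space, 2,
+				     FSP_NORMAL, mtr)) {
+		... operation that may allocate several pages,
+		e.g. a B-tree page split ...
+		fil_space_release_free_extents(space, n_reserved);
+	}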
 */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+			/* out: TRUE if we were able to make the reservation */
+	ulint*	n_reserved,/* out: number of extents actually reserved; if we
+			return TRUE and the tablespace size is < 64 pages,
+			then this can be 0, otherwise it is n_ext */
+	ulint	space,	/* in: space id */
+	ulint	n_ext,	/* in: number of extents to reserve */
+	ulint	alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+	mtr_t*	mtr)	/* in: mtr */
+{
+	fsp_header_t*	space_header;
+	rw_lock_t*	latch;
+	ulint		n_free_list_ext;
+	ulint		free_limit;
+	ulint		size;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		n_free;
+	ulint		n_free_up;
+	ulint		reserve;
+	ibool		success;
+	ulint		n_pages_added;
+
+	ut_ad(mtr);
+	*n_reserved = n_ext;
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	space_header = fsp_get_space_header(space, zip_size, mtr);
+try_again:
+	size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	if (size < FSP_EXTENT_SIZE / 2) {
+		/* Use different rules for small single-table tablespaces */
+		*n_reserved = 0;
+		return(fsp_reserve_free_pages(space, space_header, size, mtr));
+	}
+
+	n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr);
+
+	free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+				    MLOG_4BYTES, mtr);
+
+	/* Below we play safe when counting free extents above the free limit:
+	some of them will contain extent descriptor pages, and therefore
+	will not be free extents */
+
+	n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+	if (n_free_up > 0) {
+		n_free_up--;
+		if (!zip_size) {
+			n_free_up -= n_free_up
+				/ (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+		} else {
+			n_free_up -= n_free_up
+				/ (zip_size / FSP_EXTENT_SIZE);
+		}
+	}
+
+	n_free = n_free_list_ext + n_free_up;
+
+	if (alloc_type == FSP_NORMAL) {
+		/* We reserve 1 extent + 0.5 % of the space size to undo logs
+		and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+		code is duplicated in the function below! */
+
+		reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+
+		if (n_free <= reserve + n_ext) {
+
+			goto try_to_extend;
+		}
+	} else if (alloc_type == FSP_UNDO) {
+		/* We reserve 0.5 % of the space size to cleaning operations */
+
+		reserve = 1 + ((size / FSP_EXTENT_SIZE) * 1) / 200;
+
+		if (n_free <= reserve + n_ext) {
+
+			goto try_to_extend;
+		}
+	} else {
+		ut_a(alloc_type == FSP_CLEANING);
+	}
+
+	success = fil_space_reserve_free_extents(space, n_free, n_ext);
+
+	if (success) {
+		return(TRUE);
+	}
+try_to_extend:
+	success = fsp_try_extend_data_file(&n_pages_added, space,
+					   space_header, mtr);
+	if (success && n_pages_added > 0) {
+
+		goto try_again;
+	}
+
+	return(FALSE);
+}
+
+/**************************************************************************
+This function should be used to get information on how much we still
+will be able to insert new data into the database without running out of
+the tablespace. Only free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents.
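+
+In outline, the value returned below is
+
+	(n_free - reserve) * FSP_EXTENT_SIZE * (page size in kB)
+
+where reserve is the same 2 extents + 1 % of the space size that
+FSP_NORMAL reservations must leave untouched.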
*/ +UNIV_INTERN +ullint +fsp_get_available_space_in_free_extents( +/*====================================*/ + /* out: available space in kB */ + ulint space) /* in: space id */ +{ + fsp_header_t* space_header; + ulint n_free_list_ext; + ulint free_limit; + ulint size; + ulint flags; + ulint zip_size; + ulint n_free; + ulint n_free_up; + ulint reserve; + rw_lock_t* latch; + mtr_t mtr; + + ut_ad(!mutex_own(&kernel_mutex)); + + mtr_start(&mtr); + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + + mtr_x_lock(latch, &mtr); + + space_header = fsp_get_space_header(space, zip_size, &mtr); + + size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, &mtr); + + n_free_list_ext = flst_get_len(space_header + FSP_FREE, &mtr); + + free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + if (size < FSP_EXTENT_SIZE) { + ut_a(space != 0); /* This must be a single-table + tablespace */ + + return(0); /* TODO: count free frag pages and + return a value based on that */ + } + + /* Below we play safe when counting free extents above the free limit: + some of them will contain extent descriptor pages, and therefore + will not be free extents */ + + n_free_up = (size - free_limit) / FSP_EXTENT_SIZE; + + if (n_free_up > 0) { + n_free_up--; + if (!zip_size) { + n_free_up -= n_free_up + / (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE); + } else { + n_free_up -= n_free_up + / (zip_size / FSP_EXTENT_SIZE); + } + } + + n_free = n_free_list_ext + n_free_up; + + /* We reserve 1 extent + 0.5 % of the space size to undo logs + and 1 extent + 0.5 % to cleaning operations; NOTE: this source + code is duplicated in the function above! */ + + reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200; + + if (reserve > n_free) { + return(0); + } + + if (!zip_size) { + return((ullint) (n_free - reserve) + * FSP_EXTENT_SIZE + * (UNIV_PAGE_SIZE / 1024)); + } else { + return((ullint) (n_free - reserve) + * FSP_EXTENT_SIZE + * (zip_size / 1024)); + } +} + +/************************************************************************ +Marks a page used. The page must reside within the extents of the given +segment. 
*/ +static +void +fseg_mark_page_used( +/*================*/ + fseg_inode_t* seg_inode,/* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: page offset */ + mtr_t* mtr) /* in: mtr */ +{ + xdes_t* descr; + ulint not_full_n_used; + + ut_ad(seg_inode && mtr); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + + descr = xdes_get_descriptor(space, zip_size, page, mtr); + + ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr) + == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr)); + + if (xdes_is_free(descr, mtr)) { + /* We move the extent from the free list to the + NOT_FULL list */ + flst_remove(seg_inode + FSEG_FREE, descr + XDES_FLST_NODE, + mtr); + flst_add_last(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + } + + ut_ad(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr) + == TRUE); + /* We mark the page as used */ + xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr); + + not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr); + not_full_n_used++; + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, not_full_n_used, + MLOG_4BYTES, mtr); + if (xdes_is_full(descr, mtr)) { + /* We move the extent from the NOT_FULL list to the + FULL list */ + flst_remove(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + flst_add_last(seg_inode + FSEG_FULL, + descr + XDES_FLST_NODE, mtr); + + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - FSP_EXTENT_SIZE, + MLOG_4BYTES, mtr); + } +} + +/************************************************************************** +Frees a single page of a segment. */ +static +void +fseg_free_page_low( +/*===============*/ + fseg_inode_t* seg_inode, /* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: page offset */ + mtr_t* mtr) /* in: mtr handle */ +{ + xdes_t* descr; + ulint not_full_n_used; + ulint state; + dulint descr_id; + dulint seg_id; + ulint i; + + ut_ad(seg_inode && mtr); + ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) + == FSEG_MAGIC_N_VALUE); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + + /* Drop search system page hash index if the page is found in + the pool and is hashed */ + + btr_search_drop_page_hash_when_freed(space, zip_size, page); + + descr = xdes_get_descriptor(space, zip_size, page, mtr); + + ut_a(descr); + if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) { + fputs("InnoDB: Dump of the tablespace extent descriptor: ", + stderr); + ut_print_buf(stderr, descr, 40); + + fprintf(stderr, "\n" + "InnoDB: Serious error! 
InnoDB is trying to" + " free page %lu\n" + "InnoDB: though it is already marked as free" + " in the tablespace!\n" + "InnoDB: The tablespace free space info is corrupt.\n" + "InnoDB: You may need to dump your" + " InnoDB tables and recreate the whole\n" + "InnoDB: database!\n", (ulong) page); +crash: + fputs("InnoDB: Please refer to\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "forcing-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); + ut_error; + } + + state = xdes_get_state(descr, mtr); + + if (state != XDES_FSEG) { + /* The page is in the fragment pages of the segment */ + + for (i = 0;; i++) { + if (fseg_get_nth_frag_page_no(seg_inode, i, mtr) + == page) { + + fseg_set_nth_frag_page_no(seg_inode, i, + FIL_NULL, mtr); + break; + } + } + + fsp_free_page(space, zip_size, page, mtr); + + return; + } + + /* If we get here, the page is in some extent of the segment */ + + descr_id = mtr_read_dulint(descr + XDES_ID, mtr); + seg_id = mtr_read_dulint(seg_inode + FSEG_ID, mtr); +#if 0 + fprintf(stderr, + "InnoDB: InnoDB is freeing space %lu page %lu,\n" + "InnoDB: which belongs to descr seg %lu %lu\n" + "InnoDB: segment %lu %lu.\n", + (ulong) space, (ulong) page, + (ulong) ut_dulint_get_high(descr_id), + (ulong) ut_dulint_get_low(descr_id), + (ulong) ut_dulint_get_high(seg_id), + (ulong) ut_dulint_get_low(seg_id)); +#endif /* 0 */ + if (0 != ut_dulint_cmp(descr_id, seg_id)) { + fputs("InnoDB: Dump of the tablespace extent descriptor: ", + stderr); + ut_print_buf(stderr, descr, 40); + fputs("\nInnoDB: Dump of the segment inode: ", stderr); + ut_print_buf(stderr, seg_inode, 40); + putc('\n', stderr); + + fprintf(stderr, + "InnoDB: Serious error: InnoDB is trying to" + " free space %lu page %lu,\n" + "InnoDB: which does not belong to" + " segment %lu %lu but belongs\n" + "InnoDB: to segment %lu %lu.\n", + (ulong) space, (ulong) page, + (ulong) ut_dulint_get_high(descr_id), + (ulong) ut_dulint_get_low(descr_id), + (ulong) ut_dulint_get_high(seg_id), + (ulong) ut_dulint_get_low(seg_id)); + goto crash; + } + + not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr); + if (xdes_is_full(descr, mtr)) { + /* The fragment is full: move it to another list */ + flst_remove(seg_inode + FSEG_FULL, + descr + XDES_FLST_NODE, mtr); + flst_add_last(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used + FSP_EXTENT_SIZE - 1, + MLOG_4BYTES, mtr); + } else { + ut_a(not_full_n_used > 0); + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - 1, MLOG_4BYTES, mtr); + } + + xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); + + if (xdes_is_free(descr, mtr)) { + /* The extent has become free: free it to space */ + flst_remove(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + fsp_free_extent(space, zip_size, page, mtr); + } +} + +/************************************************************************** +Frees a single page of a segment. 
*/ +UNIV_INTERN +void +fseg_free_page( +/*===========*/ + fseg_header_t* seg_header, /* in: segment header */ + ulint space, /* in: space id */ + ulint page, /* in: page offset */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint flags; + ulint zip_size; + fseg_inode_t* seg_inode; + rw_lock_t* latch; + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + + ut_ad(!mutex_own(&kernel_mutex) + || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK)); + + mtr_x_lock(latch, mtr); + + seg_inode = fseg_inode_get(seg_header, space, zip_size, mtr); + + fseg_free_page_low(seg_inode, space, zip_size, page, mtr); + +#ifdef UNIV_DEBUG_FILE_ACCESSES + buf_page_set_file_page_was_freed(space, page); +#endif +} + +/************************************************************************** +Frees an extent of a segment to the space free list. */ +static +void +fseg_free_extent( +/*=============*/ + fseg_inode_t* seg_inode, /* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page, /* in: a page in the extent */ + mtr_t* mtr) /* in: mtr handle */ +{ + ulint first_page_in_extent; + xdes_t* descr; + ulint not_full_n_used; + ulint descr_n_used; + ulint i; + + ut_ad(seg_inode && mtr); + + descr = xdes_get_descriptor(space, zip_size, page, mtr); + + ut_a(xdes_get_state(descr, mtr) == XDES_FSEG); + ut_a(0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, mtr), + mtr_read_dulint(seg_inode + FSEG_ID, mtr))); + + first_page_in_extent = page - (page % FSP_EXTENT_SIZE); + + for (i = 0; i < FSP_EXTENT_SIZE; i++) { + if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) { + + /* Drop search system page hash index if the page is + found in the pool and is hashed */ + + btr_search_drop_page_hash_when_freed( + space, zip_size, first_page_in_extent + i); + } + } + + if (xdes_is_full(descr, mtr)) { + flst_remove(seg_inode + FSEG_FULL, + descr + XDES_FLST_NODE, mtr); + } else if (xdes_is_free(descr, mtr)) { + flst_remove(seg_inode + FSEG_FREE, + descr + XDES_FLST_NODE, mtr); + } else { + flst_remove(seg_inode + FSEG_NOT_FULL, + descr + XDES_FLST_NODE, mtr); + + not_full_n_used = mtr_read_ulint( + seg_inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr); + + descr_n_used = xdes_get_n_used(descr, mtr); + ut_a(not_full_n_used >= descr_n_used); + mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, + not_full_n_used - descr_n_used, + MLOG_4BYTES, mtr); + } + + fsp_free_extent(space, zip_size, page, mtr); + +#ifdef UNIV_DEBUG_FILE_ACCESSES + for (i = 0; i < FSP_EXTENT_SIZE; i++) { + + buf_page_set_file_page_was_freed(space, + first_page_in_extent + i); + } +#endif +} + +/************************************************************************** +Frees part of a segment. This function can be used to free a segment by +repeatedly calling this function in different mini-transactions. Doing +the freeing in a single mini-transaction might result in too big a +mini-transaction. 
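+
+The canonical driver loop is the one in fseg_free below (sketch):
+
+	do {
+		mtr_start(&mtr);
+		header = fut_get_ptr(space, zip_size, addr, RW_X_LATCH, &mtr);
+		finished = fseg_free_step(header, &mtr);
+		mtr_commit(&mtr);
+	} while (!finished);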
*/ +UNIV_INTERN +ibool +fseg_free_step( +/*===========*/ + /* out: TRUE if freeing completed */ + fseg_header_t* header, /* in, own: segment header; NOTE: if the header + resides on the first page of the frag list + of the segment, this pointer becomes obsolete + after the last freeing step */ + mtr_t* mtr) /* in: mtr */ +{ + ulint n; + ulint page; + xdes_t* descr; + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + ulint header_page; + rw_lock_t* latch; + + space = page_get_space_id(page_align(header)); + header_page = page_get_page_no(page_align(header)); + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + + ut_ad(!mutex_own(&kernel_mutex) + || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK)); + + mtr_x_lock(latch, mtr); + + descr = xdes_get_descriptor(space, zip_size, header_page, mtr); + + /* Check that the header resides on a page which has not been + freed yet */ + + ut_a(descr); + ut_a(xdes_get_bit(descr, XDES_FREE_BIT, + header_page % FSP_EXTENT_SIZE, mtr) == FALSE); + inode = fseg_inode_get(header, space, zip_size, mtr); + + descr = fseg_get_first_extent(inode, space, zip_size, mtr); + + if (descr != NULL) { + /* Free the extent held by the segment */ + page = xdes_get_offset(descr); + + fseg_free_extent(inode, space, zip_size, page, mtr); + + return(FALSE); + } + + /* Free a frag page */ + n = fseg_find_last_used_frag_page_slot(inode, mtr); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, zip_size, inode, mtr); + + return(TRUE); + } + + fseg_free_page_low(inode, space, zip_size, + fseg_get_nth_frag_page_no(inode, n, mtr), mtr); + + n = fseg_find_last_used_frag_page_slot(inode, mtr); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, zip_size, inode, mtr); + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************** +Frees part of a segment. Differs from fseg_free_step because this function +leaves the header page unfreed. 
*/ +UNIV_INTERN +ibool +fseg_free_step_not_header( +/*======================*/ + /* out: TRUE if freeing completed, except the + header page */ + fseg_header_t* header, /* in: segment header which must reside on + the first fragment page of the segment */ + mtr_t* mtr) /* in: mtr */ +{ + ulint n; + ulint page; + xdes_t* descr; + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + ulint page_no; + rw_lock_t* latch; + + space = page_get_space_id(page_align(header)); + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + + ut_ad(!mutex_own(&kernel_mutex) + || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK)); + + mtr_x_lock(latch, mtr); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + descr = fseg_get_first_extent(inode, space, zip_size, mtr); + + if (descr != NULL) { + /* Free the extent held by the segment */ + page = xdes_get_offset(descr); + + fseg_free_extent(inode, space, zip_size, page, mtr); + + return(FALSE); + } + + /* Free a frag page */ + + n = fseg_find_last_used_frag_page_slot(inode, mtr); + + if (n == ULINT_UNDEFINED) { + ut_error; + } + + page_no = fseg_get_nth_frag_page_no(inode, n, mtr); + + if (page_no == page_get_page_no(page_align(header))) { + + return(TRUE); + } + + fseg_free_page_low(inode, space, zip_size, page_no, mtr); + + return(FALSE); +} + +/*********************************************************************** +Frees a segment. The freeing is performed in several mini-transactions, +so that there is no danger of bufferfixing too many buffer pages. */ +UNIV_INTERN +void +fseg_free( +/*======*/ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no,/* in: page number where the segment header is + placed */ + ulint offset) /* in: byte offset of the segment header on that + page */ +{ + mtr_t mtr; + ibool finished; + fseg_header_t* header; + fil_addr_t addr; + + addr.page = page_no; + addr.boffset = offset; + + for (;;) { + mtr_start(&mtr); + + header = fut_get_ptr(space, zip_size, addr, RW_X_LATCH, &mtr); + + finished = fseg_free_step(header, &mtr); + + mtr_commit(&mtr); + + if (finished) { + + return; + } + } +} + +/************************************************************************** +Returns the first extent descriptor for a segment. We think of the extent +lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL +-> FSEG_FREE. */ +static +xdes_t* +fseg_get_first_extent( +/*==================*/ + /* out: the first extent descriptor, or NULL if + none */ + fseg_inode_t* inode, /* in: segment inode */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + mtr_t* mtr) /* in: mtr */ +{ + fil_addr_t first; + xdes_t* descr; + + ut_ad(inode && mtr); + + ut_ad(space == page_get_space_id(page_align(inode))); + + first = fil_addr_null; + + if (flst_get_len(inode + FSEG_FULL, mtr) > 0) { + + first = flst_get_first(inode + FSEG_FULL, mtr); + + } else if (flst_get_len(inode + FSEG_NOT_FULL, mtr) > 0) { + + first = flst_get_first(inode + FSEG_NOT_FULL, mtr); + + } else if (flst_get_len(inode + FSEG_FREE, mtr) > 0) { + + first = flst_get_first(inode + FSEG_FREE, mtr); + } + + if (first.page == FIL_NULL) { + + return(NULL); + } + descr = xdes_lst_get_descriptor(space, zip_size, first, mtr); + + return(descr); +} + +/*********************************************************************** +Validates a segment. 
*/ +static +ibool +fseg_validate_low( +/*==============*/ + /* out: TRUE if ok */ + fseg_inode_t* inode, /* in: segment inode */ + mtr_t* mtr2) /* in: mtr */ +{ + ulint space; + dulint seg_id; + mtr_t mtr; + xdes_t* descr; + fil_addr_t node_addr; + ulint n_used = 0; + ulint n_used2 = 0; + + ut_ad(mtr_memo_contains_page(mtr2, inode, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE); + + space = page_get_space_id(page_align(inode)); + + seg_id = mtr_read_dulint(inode + FSEG_ID, mtr2); + n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr2); + flst_validate(inode + FSEG_FREE, mtr2); + flst_validate(inode + FSEG_NOT_FULL, mtr2); + flst_validate(inode + FSEG_FULL, mtr2); + + /* Validate FSEG_FREE list */ + node_addr = flst_get_first(inode + FSEG_FREE, mtr2); + + while (!fil_addr_is_null(node_addr)) { + ulint flags; + ulint zip_size; + + mtr_start(&mtr); + mtr_x_lock(fil_space_get_latch(space, &flags), &mtr); + zip_size = dict_table_flags_to_zip_size(flags); + + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == 0); + ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG); + ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr), + seg_id)); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate FSEG_NOT_FULL list */ + + node_addr = flst_get_first(inode + FSEG_NOT_FULL, mtr2); + + while (!fil_addr_is_null(node_addr)) { + ulint flags; + ulint zip_size; + + mtr_start(&mtr); + mtr_x_lock(fil_space_get_latch(space, &flags), &mtr); + zip_size = dict_table_flags_to_zip_size(flags); + + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) > 0); + ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG); + ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr), + seg_id)); + + n_used2 += xdes_get_n_used(descr, &mtr); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate FSEG_FULL list */ + + node_addr = flst_get_first(inode + FSEG_FULL, mtr2); + + while (!fil_addr_is_null(node_addr)) { + ulint flags; + ulint zip_size; + + mtr_start(&mtr); + mtr_x_lock(fil_space_get_latch(space, &flags), &mtr); + zip_size = dict_table_flags_to_zip_size(flags); + + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG); + ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr), + seg_id)); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + ut_a(n_used == n_used2); + + return(TRUE); +} + +/*********************************************************************** +Validates a segment. */ +UNIV_INTERN +ibool +fseg_validate( +/*==========*/ + /* out: TRUE if ok */ + fseg_header_t* header, /* in: segment header */ + mtr_t* mtr) /* in: mtr */ +{ + fseg_inode_t* inode; + ibool ret; + ulint space; + ulint flags; + ulint zip_size; + + space = page_get_space_id(page_align(header)); + + mtr_x_lock(fil_space_get_latch(space, &flags), mtr); + zip_size = dict_table_flags_to_zip_size(flags); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + ret = fseg_validate_low(inode, mtr); + + return(ret); +} + +/*********************************************************************** +Writes info of a segment. 
*/ +static +void +fseg_print_low( +/*===========*/ + fseg_inode_t* inode, /* in: segment inode */ + mtr_t* mtr) /* in: mtr */ +{ + ulint space; + ulint seg_id_low; + ulint seg_id_high; + ulint n_used; + ulint n_frag; + ulint n_free; + ulint n_not_full; + ulint n_full; + ulint reserved; + ulint used; + ulint page_no; + dulint d_var; + + ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX)); + space = page_get_space_id(page_align(inode)); + page_no = page_get_page_no(page_align(inode)); + + reserved = fseg_n_reserved_pages_low(inode, &used, mtr); + + d_var = mtr_read_dulint(inode + FSEG_ID, mtr); + + seg_id_low = ut_dulint_get_low(d_var); + seg_id_high = ut_dulint_get_high(d_var); + + n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, + MLOG_4BYTES, mtr); + n_frag = fseg_get_n_frag_pages(inode, mtr); + n_free = flst_get_len(inode + FSEG_FREE, mtr); + n_not_full = flst_get_len(inode + FSEG_NOT_FULL, mtr); + n_full = flst_get_len(inode + FSEG_FULL, mtr); + + fprintf(stderr, + "SEGMENT id %lu %lu space %lu; page %lu;" + " res %lu used %lu; full ext %lu\n" + "fragm pages %lu; free extents %lu;" + " not full extents %lu: pages %lu\n", + (ulong) seg_id_high, (ulong) seg_id_low, + (ulong) space, (ulong) page_no, + (ulong) reserved, (ulong) used, (ulong) n_full, + (ulong) n_frag, (ulong) n_free, (ulong) n_not_full, + (ulong) n_used); +} + +#ifdef UNIV_BTR_PRINT +/*********************************************************************** +Writes info of a segment. */ +UNIV_INTERN +void +fseg_print( +/*=======*/ + fseg_header_t* header, /* in: segment header */ + mtr_t* mtr) /* in: mtr */ +{ + fseg_inode_t* inode; + ulint space; + ulint flags; + ulint zip_size; + + space = page_get_space_id(page_align(header)); + + mtr_x_lock(fil_space_get_latch(space, &flags), mtr); + zip_size = dict_table_flags_to_zip_size(flags); + + inode = fseg_inode_get(header, space, zip_size, mtr); + + fseg_print_low(inode, mtr); +} +#endif /* UNIV_BTR_PRINT */ + +/*********************************************************************** +Validates the file space system and its segments. 
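+Beyond running fseg_validate_low() on every segment inode, this +checks the space-level accounting: the extent descriptors reachable +from FSP_FREE, FSP_FREE_FRAG, FSP_FULL_FRAG and the segment extent +lists must together describe exactly FSP_FREE_LIMIT pages, and the +FSP_FRAG_N_USED counter in the space header must equal the used-page +count summed over the FSP_FREE_FRAG extents.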
*/ +UNIV_INTERN +ibool +fsp_validate( +/*=========*/ + /* out: TRUE if ok */ + ulint space) /* in: space id */ +{ + fsp_header_t* header; + fseg_inode_t* seg_inode; + page_t* seg_inode_page; + rw_lock_t* latch; + ulint size; + ulint flags; + ulint zip_size; + ulint free_limit; + ulint frag_n_used; + mtr_t mtr; + mtr_t mtr2; + xdes_t* descr; + fil_addr_t node_addr; + fil_addr_t next_node_addr; + ulint descr_count = 0; + ulint n_used = 0; + ulint n_used2 = 0; + ulint n_full_frag_pages; + ulint n; + ulint seg_inode_len_free; + ulint seg_inode_len_full; + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + ut_a(ut_is_2pow(zip_size)); + ut_a(zip_size <= UNIV_PAGE_SIZE); + ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE); + + /* Start first a mini-transaction mtr2 to lock out all other threads + from the fsp system */ + mtr_start(&mtr2); + mtr_x_lock(latch, &mtr2); + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr); + free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT, + MLOG_4BYTES, &mtr); + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, + MLOG_4BYTES, &mtr); + + n_full_frag_pages = FSP_EXTENT_SIZE + * flst_get_len(header + FSP_FULL_FRAG, &mtr); + + if (UNIV_UNLIKELY(free_limit > size)) { + + ut_a(space != 0); + ut_a(size < FSP_EXTENT_SIZE); + } + + flst_validate(header + FSP_FREE, &mtr); + flst_validate(header + FSP_FREE_FRAG, &mtr); + flst_validate(header + FSP_FULL_FRAG, &mtr); + + mtr_commit(&mtr); + + /* Validate FSP_FREE list */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + node_addr = flst_get_first(header + FSP_FREE, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + descr_count++; + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == 0); + ut_a(xdes_get_state(descr, &mtr) == XDES_FREE); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate FSP_FREE_FRAG list */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + node_addr = flst_get_first(header + FSP_FREE_FRAG, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + descr_count++; + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) > 0); + ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FREE_FRAG); + + n_used += xdes_get_n_used(descr, &mtr); + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + + mtr_commit(&mtr); + } + + /* Validate FSP_FULL_FRAG list */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + node_addr = flst_get_first(header + FSP_FULL_FRAG, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + descr_count++; + descr = xdes_lst_get_descriptor(space, zip_size, + node_addr, &mtr); + + ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE); + ut_a(xdes_get_state(descr, &mtr) == XDES_FULL_FRAG); + + node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr); + mtr_commit(&mtr); + } + + /* Validate segments */ + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + 
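/* Walk the pages in the FSP_SEG_INODES_FULL list: every segment + inode on them must be in use, and its extent lists and fragment + pages are accumulated into descr_count and n_used2 for the final + consistency checks below. */ + + 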
header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr); + + seg_inode_len_full = flst_get_len(header + FSP_SEG_INODES_FULL, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + do { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + ut_a(!ut_dulint_is_zero( + mach_read_from_8(seg_inode + FSEG_ID))); + fseg_validate_low(seg_inode, &mtr); + + descr_count += flst_get_len(seg_inode + FSEG_FREE, + &mtr); + descr_count += flst_get_len(seg_inode + FSEG_FULL, + &mtr); + descr_count += flst_get_len(seg_inode + FSEG_NOT_FULL, + &mtr); + + n_used2 += fseg_get_n_frag_pages(seg_inode, &mtr); + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr); + + seg_inode_len_free = flst_get_len(header + FSP_SEG_INODES_FREE, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + + do { + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + if (!ut_dulint_is_zero( + mach_read_from_8(seg_inode + FSEG_ID))) { + fseg_validate_low(seg_inode, &mtr); + + descr_count += flst_get_len( + seg_inode + FSEG_FREE, &mtr); + descr_count += flst_get_len( + seg_inode + FSEG_FULL, &mtr); + descr_count += flst_get_len( + seg_inode + FSEG_NOT_FULL, &mtr); + n_used2 += fseg_get_n_frag_pages( + seg_inode, &mtr); + } + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + ut_a(descr_count * FSP_EXTENT_SIZE == free_limit); + if (!zip_size) { + ut_a(n_used + n_full_frag_pages + == n_used2 + 2 * ((free_limit + (UNIV_PAGE_SIZE - 1)) + / UNIV_PAGE_SIZE) + + seg_inode_len_full + seg_inode_len_free); + } else { + ut_a(n_used + n_full_frag_pages + == n_used2 + 2 * ((free_limit + (zip_size - 1)) + / zip_size) + + seg_inode_len_full + seg_inode_len_free); + } + ut_a(frag_n_used == n_used); + + mtr_commit(&mtr2); + + return(TRUE); +} + +/*********************************************************************** +Prints info of a file space. 
*/ +UNIV_INTERN +void +fsp_print( +/*======*/ + ulint space) /* in: space id */ +{ + fsp_header_t* header; + fseg_inode_t* seg_inode; + page_t* seg_inode_page; + rw_lock_t* latch; + ulint flags; + ulint zip_size; + ulint size; + ulint free_limit; + ulint frag_n_used; + fil_addr_t node_addr; + fil_addr_t next_node_addr; + ulint n_free; + ulint n_free_frag; + ulint n_full_frag; + ulint seg_id_low; + ulint seg_id_high; + ulint n; + ulint n_segs = 0; + dulint d_var; + mtr_t mtr; + mtr_t mtr2; + + latch = fil_space_get_latch(space, &flags); + zip_size = dict_table_flags_to_zip_size(flags); + + /* Start first a mini-transaction mtr2 to lock out all other threads + from the fsp system */ + + mtr_start(&mtr2); + + mtr_x_lock(latch, &mtr2); + + mtr_start(&mtr); + + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr); + + free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, + &mtr); + frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES, + &mtr); + n_free = flst_get_len(header + FSP_FREE, &mtr); + n_free_frag = flst_get_len(header + FSP_FREE_FRAG, &mtr); + n_full_frag = flst_get_len(header + FSP_FULL_FRAG, &mtr); + + d_var = mtr_read_dulint(header + FSP_SEG_ID, &mtr); + + seg_id_low = ut_dulint_get_low(d_var); + seg_id_high = ut_dulint_get_high(d_var); + + fprintf(stderr, + "FILE SPACE INFO: id %lu\n" + "size %lu, free limit %lu, free extents %lu\n" + "not full frag extents %lu: used pages %lu," + " full frag extents %lu\n" + "first seg id not used %lu %lu\n", + (ulong) space, + (ulong) size, (ulong) free_limit, (ulong) n_free, + (ulong) n_free_frag, (ulong) frag_n_used, (ulong) n_full_frag, + (ulong) seg_id_high, (ulong) seg_id_low); + + mtr_commit(&mtr); + + /* Print segments */ + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + + do { + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + ut_a(!ut_dulint_is_zero( + mach_read_from_8(seg_inode + FSEG_ID))); + fseg_print_low(seg_inode, &mtr); + + n_segs++; + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + header = fsp_get_space_header(space, zip_size, &mtr); + + node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr); + + mtr_commit(&mtr); + + while (!fil_addr_is_null(node_addr)) { + + n = 0; + + do { + + mtr_start(&mtr); + mtr_x_lock(latch, &mtr); + + seg_inode_page = fut_get_ptr( + space, zip_size, node_addr, RW_X_LATCH, &mtr) + - FSEG_INODE_PAGE_NODE; + + seg_inode = fsp_seg_inode_page_get_nth_inode( + seg_inode_page, n, zip_size, &mtr); + if (!ut_dulint_is_zero( + mach_read_from_8(seg_inode + FSEG_ID))) { + + fseg_print_low(seg_inode, &mtr); + n_segs++; + } + + next_node_addr = flst_get_next_addr( + seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr); + mtr_commit(&mtr); + } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size)); + + node_addr = next_node_addr; + } + + mtr_commit(&mtr2); + + fprintf(stderr, "NUMBER of file segments: %lu\n", (ulong) 
n_segs); +} diff --git a/storage/xtradb/fut/fut0fut.c b/storage/xtradb/fut/fut0fut.c new file mode 100644 index 00000000000..41ee0cb6715 --- /dev/null +++ b/storage/xtradb/fut/fut0fut.c @@ -0,0 +1,30 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0fut.h" + +#ifdef UNIV_NONINL +#include "fut0fut.ic" +#endif + diff --git a/storage/xtradb/fut/fut0lst.c b/storage/xtradb/fut/fut0lst.c new file mode 100644 index 00000000000..bea27ab70d1 --- /dev/null +++ b/storage/xtradb/fut/fut0lst.c @@ -0,0 +1,529 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0lst.h" + +#ifdef UNIV_NONINL +#include "fut0lst.ic" +#endif + +#include "buf0buf.h" +#include "page0page.h" + +/************************************************************************ +Adds a node to an empty list. 
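+Background: a file-based list lives entirely inside tablespace pages. +The base node stores the list length FLST_LEN (4 bytes) and the +addresses FLST_FIRST and FLST_LAST; each list node stores FLST_PREV +and FLST_NEXT. An address is a fil_addr_t pair (page number, byte +offset within the page), and fil_addr_null plays the role of a NULL +pointer: + + base: [ FLST_LEN | FLST_FIRST (page, boffset) | FLST_LAST (page, boffset) ] + node: [ FLST_PREV (page, boffset) | FLST_NEXT (page, boffset) ]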
*/ +static +void +flst_add_to_empty( +/*==============*/ + flst_base_node_t* base, /* in: pointer to base node of + empty list */ + flst_node_t* node, /* in: node to add */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node_addr; + ulint len; + + ut_ad(mtr && base && node); + ut_ad(base != node); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX)); + len = flst_get_len(base, mtr); + ut_a(len == 0); + + buf_ptr_get_fsp_addr(node, &space, &node_addr); + + /* Update first and last fields of base node */ + flst_write_addr(base + FLST_FIRST, node_addr, mtr); + flst_write_addr(base + FLST_LAST, node_addr, mtr); + + /* Set prev and next fields of node to add */ + flst_write_addr(node + FLST_PREV, fil_addr_null, mtr); + flst_write_addr(node + FLST_NEXT, fil_addr_null, mtr); + + /* Update len of base node */ + mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr); +} + +/************************************************************************ +Adds a node as the last node in a list. */ +UNIV_INTERN +void +flst_add_last( +/*==========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node, /* in: node to add */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node_addr; + ulint len; + fil_addr_t last_addr; + flst_node_t* last_node; + + ut_ad(mtr && base && node); + ut_ad(base != node); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX)); + len = flst_get_len(base, mtr); + last_addr = flst_get_last(base, mtr); + + buf_ptr_get_fsp_addr(node, &space, &node_addr); + + /* If the list is not empty, call flst_insert_after */ + if (len != 0) { + if (last_addr.page == node_addr.page) { + last_node = page_align(node) + last_addr.boffset; + } else { + ulint zip_size = fil_space_get_zip_size(space); + + last_node = fut_get_ptr(space, zip_size, last_addr, + RW_X_LATCH, mtr); + } + + flst_insert_after(base, last_node, node, mtr); + } else { + /* else call flst_add_to_empty */ + flst_add_to_empty(base, node, mtr); + } +} + +/************************************************************************ +Adds a node as the first node in a list. */ +UNIV_INTERN +void +flst_add_first( +/*===========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node, /* in: node to add */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node_addr; + ulint len; + fil_addr_t first_addr; + flst_node_t* first_node; + + ut_ad(mtr && base && node); + ut_ad(base != node); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX)); + len = flst_get_len(base, mtr); + first_addr = flst_get_first(base, mtr); + + buf_ptr_get_fsp_addr(node, &space, &node_addr); + + /* If the list is not empty, call flst_insert_before */ + if (len != 0) { + if (first_addr.page == node_addr.page) { + first_node = page_align(node) + first_addr.boffset; + } else { + ulint zip_size = fil_space_get_zip_size(space); + + first_node = fut_get_ptr(space, zip_size, first_addr, + RW_X_LATCH, mtr); + } + + flst_insert_before(base, node, first_node, mtr); + } else { + /* else call flst_add_to_empty */ + flst_add_to_empty(base, node, mtr); + } +} + +/************************************************************************ +Inserts a node after another in a list. 
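+node2 is spliced in between node1 and node1's current successor: +node2's own PREV and NEXT pointers are written first, then the +successor's PREV pointer (or, when node1 was the list tail, the base +node's FLST_LAST), then node1's NEXT pointer, and finally the length +stored in the base node is incremented.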
*/ +UNIV_INTERN +void +flst_insert_after( +/*==============*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node1, /* in: node to insert after */ + flst_node_t* node2, /* in: node to add */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint space; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + flst_node_t* node3; + fil_addr_t node3_addr; + ulint len; + + ut_ad(mtr && node1 && node2 && base); + ut_ad(base != node1); + ut_ad(base != node2); + ut_ad(node2 != node1); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node1, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + + buf_ptr_get_fsp_addr(node1, &space, &node1_addr); + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + + node3_addr = flst_get_next_addr(node1, mtr); + + /* Set prev and next fields of node2 */ + flst_write_addr(node2 + FLST_PREV, node1_addr, mtr); + flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr); + + if (!fil_addr_is_null(node3_addr)) { + /* Update prev field of node3 */ + ulint zip_size = fil_space_get_zip_size(space); + + node3 = fut_get_ptr(space, zip_size, + node3_addr, RW_X_LATCH, mtr); + flst_write_addr(node3 + FLST_PREV, node2_addr, mtr); + } else { + /* node1 was last in list: update last field in base */ + flst_write_addr(base + FLST_LAST, node2_addr, mtr); + } + + /* Set next field of node1 */ + flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr); +} + +/************************************************************************ +Inserts a node before another in a list. */ +UNIV_INTERN +void +flst_insert_before( +/*===============*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: node to insert */ + flst_node_t* node3, /* in: node to insert before */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint space; + flst_node_t* node1; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + fil_addr_t node3_addr; + ulint len; + + ut_ad(mtr && node2 && node3 && base); + ut_ad(base != node2); + ut_ad(base != node3); + ut_ad(node2 != node3); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node3, MTR_MEMO_PAGE_X_FIX)); + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + buf_ptr_get_fsp_addr(node3, &space, &node3_addr); + + node1_addr = flst_get_prev_addr(node3, mtr); + + /* Set prev and next fields of node2 */ + flst_write_addr(node2 + FLST_PREV, node1_addr, mtr); + flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr); + + if (!fil_addr_is_null(node1_addr)) { + ulint zip_size = fil_space_get_zip_size(space); + /* Update next field of node1 */ + node1 = fut_get_ptr(space, zip_size, node1_addr, + RW_X_LATCH, mtr); + flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr); + } else { + /* node3 was first in list: update first field in base */ + flst_write_addr(base + FLST_FIRST, node2_addr, mtr); + } + + /* Set prev field of node3 */ + flst_write_addr(node3 + FLST_PREV, node2_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr); +} + +/************************************************************************ +Removes a node. 
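+When a neighbour of node2 resides on the same page as node2, it is +located with page_align() on the already latched page; otherwise it +is read with fut_get_ptr(), which latches the neighbour's page within +the same mini-transaction. A null neighbour address means that node2 +was the first (or last) node, in which case the base node's +FLST_FIRST (or FLST_LAST) is updated instead.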
*/ +UNIV_INTERN +void +flst_remove( +/*========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: node to remove */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint space; + ulint zip_size; + flst_node_t* node1; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + flst_node_t* node3; + fil_addr_t node3_addr; + ulint len; + + ut_ad(mtr && node2 && base); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + zip_size = fil_space_get_zip_size(space); + + node1_addr = flst_get_prev_addr(node2, mtr); + node3_addr = flst_get_next_addr(node2, mtr); + + if (!fil_addr_is_null(node1_addr)) { + + /* Update next field of node1 */ + + if (node1_addr.page == node2_addr.page) { + + node1 = page_align(node2) + node1_addr.boffset; + } else { + node1 = fut_get_ptr(space, zip_size, + node1_addr, RW_X_LATCH, mtr); + } + + ut_ad(node1 != node2); + + flst_write_addr(node1 + FLST_NEXT, node3_addr, mtr); + } else { + /* node2 was first in list: update first field in base */ + flst_write_addr(base + FLST_FIRST, node3_addr, mtr); + } + + if (!fil_addr_is_null(node3_addr)) { + /* Update prev field of node3 */ + + if (node3_addr.page == node2_addr.page) { + + node3 = page_align(node2) + node3_addr.boffset; + } else { + node3 = fut_get_ptr(space, zip_size, + node3_addr, RW_X_LATCH, mtr); + } + + ut_ad(node2 != node3); + + flst_write_addr(node3 + FLST_PREV, node1_addr, mtr); + } else { + /* node2 was last in list: update last field in base */ + flst_write_addr(base + FLST_LAST, node1_addr, mtr); + } + + /* Update len of base node */ + len = flst_get_len(base, mtr); + ut_ad(len > 0); + + mlog_write_ulint(base + FLST_LEN, len - 1, MLOG_4BYTES, mtr); +} + +/************************************************************************ +Cuts off the tail of the list, including the node given. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. 
*/ +UNIV_INTERN +void +flst_cut_end( +/*=========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: first node to remove */ + ulint n_nodes,/* in: number of nodes to remove, + must be >= 1 */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint space; + flst_node_t* node1; + fil_addr_t node1_addr; + fil_addr_t node2_addr; + ulint len; + + ut_ad(mtr && node2 && base); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + ut_ad(n_nodes > 0); + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + + node1_addr = flst_get_prev_addr(node2, mtr); + + if (!fil_addr_is_null(node1_addr)) { + + /* Update next field of node1 */ + + if (node1_addr.page == node2_addr.page) { + + node1 = page_align(node2) + node1_addr.boffset; + } else { + node1 = fut_get_ptr(space, + fil_space_get_zip_size(space), + node1_addr, RW_X_LATCH, mtr); + } + + flst_write_addr(node1 + FLST_NEXT, fil_addr_null, mtr); + } else { + /* node2 was first in list: update the field in base */ + flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); + } + + flst_write_addr(base + FLST_LAST, node1_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + ut_ad(len >= n_nodes); + + mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr); +} + +/************************************************************************ +Cuts off the tail of the list, not including the given node. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_truncate_end( +/*==============*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: first node not to remove */ + ulint n_nodes,/* in: number of nodes to remove */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + fil_addr_t node2_addr; + ulint len; + ulint space; + + ut_ad(mtr && node2 && base); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); + if (n_nodes == 0) { + + ut_ad(fil_addr_is_null(flst_get_next_addr(node2, mtr))); + + return; + } + + buf_ptr_get_fsp_addr(node2, &space, &node2_addr); + + /* Update next field of node2 */ + flst_write_addr(node2 + FLST_NEXT, fil_addr_null, mtr); + + flst_write_addr(base + FLST_LAST, node2_addr, mtr); + + /* Update len of base node */ + len = flst_get_len(base, mtr); + ut_ad(len >= n_nodes); + + mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr); +} + +/************************************************************************ +Validates a file-based list. */ +UNIV_INTERN +ibool +flst_validate( +/*==========*/ + /* out: TRUE if ok */ + const flst_base_node_t* base, /* in: pointer to base node of list */ + mtr_t* mtr1) /* in: mtr */ +{ + ulint space; + ulint zip_size; + const flst_node_t* node; + fil_addr_t node_addr; + fil_addr_t base_addr; + ulint len; + ulint i; + mtr_t mtr2; + + ut_ad(base); + ut_ad(mtr_memo_contains_page(mtr1, base, MTR_MEMO_PAGE_X_FIX)); + + /* We use two mini-transaction handles: the first is used to + lock the base node, and prevent other threads from modifying the + list. The second is used to traverse the list. We cannot run the + second mtr without committing it at times, because if the list + is long, then the x-locked pages could fill the buffer resulting + in a deadlock. 
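+The traversal below therefore latches one node page at a time under +mtr2 and commits it before moving to the next address. The list is +walked once forwards along FLST_NEXT and once backwards along +FLST_PREV, and both walks must land on fil_addr_null after exactly +FLST_LEN steps.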
*/ + + /* Find out the space id */ + buf_ptr_get_fsp_addr(base, &space, &base_addr); + zip_size = fil_space_get_zip_size(space); + + len = flst_get_len(base, mtr1); + node_addr = flst_get_first(base, mtr1); + + for (i = 0; i < len; i++) { + mtr_start(&mtr2); + + node = fut_get_ptr(space, zip_size, + node_addr, RW_X_LATCH, &mtr2); + node_addr = flst_get_next_addr(node, &mtr2); + + mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer + becoming full */ + } + + ut_a(fil_addr_is_null(node_addr)); + + node_addr = flst_get_last(base, mtr1); + + for (i = 0; i < len; i++) { + mtr_start(&mtr2); + + node = fut_get_ptr(space, zip_size, + node_addr, RW_X_LATCH, &mtr2); + node_addr = flst_get_prev_addr(node, &mtr2); + + mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer + becoming full */ + } + + ut_a(fil_addr_is_null(node_addr)); + + return(TRUE); +} + +/************************************************************************ +Prints info of a file-based list. */ +UNIV_INTERN +void +flst_print( +/*=======*/ + const flst_base_node_t* base, /* in: pointer to base node of list */ + mtr_t* mtr) /* in: mtr */ +{ + const buf_frame_t* frame; + ulint len; + + ut_ad(base && mtr); + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + frame = page_align((byte*) base); + + len = flst_get_len(base, mtr); + + fprintf(stderr, + "FILE-BASED LIST:\n" + "Base node in space %lu page %lu byte offset %lu; len %lu\n", + (ulong) page_get_space_id(frame), + (ulong) page_get_page_no(frame), + (ulong) page_offset(base), (ulong) len); +} diff --git a/storage/xtradb/ha/ha0ha.c b/storage/xtradb/ha/ha0ha.c new file mode 100644 index 00000000000..1ecba3df663 --- /dev/null +++ b/storage/xtradb/ha/ha0ha.c @@ -0,0 +1,442 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The hash table with external chains + +Created 8/22/1994 Heikki Tuuri +*************************************************************************/ + +#include "ha0ha.h" +#ifdef UNIV_NONINL +#include "ha0ha.ic" +#endif + +#ifdef UNIV_DEBUG +# include "buf0buf.h" +#endif /* UNIV_DEBUG */ +#ifdef UNIV_SYNC_DEBUG +# include "btr0sea.h" +#endif /* UNIV_SYNC_DEBUG */ +#include "page0page.h" + +/***************************************************************** +Creates a hash table with >= n array cells. The actual number of cells is +chosen to be a prime number slightly bigger than n. 
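+When n_mutexes is zero the table is given a single +MEM_HEAP_BTR_SEARCH type heap; otherwise one such heap is created for +each mutex, so that the nodes of a chain are always allocated from +memory protected by the same mutex as the chain itself.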
*/ +UNIV_INTERN +hash_table_t* +ha_create_func( +/*===========*/ + /* out, own: created table */ + ulint n, /* in: number of array cells */ +#ifdef UNIV_SYNC_DEBUG + ulint mutex_level, /* in: level of the mutexes in the latching + order: this is used in the debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_mutexes) /* in: number of mutexes to protect the + hash table: must be a power of 2, or 0 */ +{ + hash_table_t* table; + ulint i; + + table = hash_create(n); + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + table->adaptive = TRUE; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + /* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail, + but in practice it never should in this case, hence the asserts. */ + + if (n_mutexes == 0) { + table->heap = mem_heap_create_in_btr_search( + ut_min(4096, MEM_MAX_ALLOC_IN_BUF)); + ut_a(table->heap); + + return(table); + } + + hash_create_mutexes(table, n_mutexes, mutex_level); + + table->heaps = mem_alloc(n_mutexes * sizeof(void*)); + + for (i = 0; i < n_mutexes; i++) { + table->heaps[i] = mem_heap_create_in_btr_search(4096); + ut_a(table->heaps[i]); + } + + return(table); +} + +/***************************************************************** +Empties a hash table and frees the memory heaps. */ +UNIV_INTERN +void +ha_clear( +/*=====*/ + hash_table_t* table) /* in, own: hash table */ +{ + ulint i; + ulint n; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Free the memory heaps. */ + n = table->n_mutexes; + + for (i = 0; i < n; i++) { + mem_heap_free(table->heaps[i]); + } + + /* Clear the hash table. */ + n = hash_get_n_cells(table); + + for (i = 0; i < n; i++) { + hash_get_nth_cell(table, i)->node = NULL; + } +} + +/***************************************************************** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. */ +UNIV_INTERN +ibool +ha_insert_for_fold_func( +/*====================*/ + /* out: TRUE if succeeded, FALSE if no more + memory could be allocated */ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of data; if a node with + the same fold value already exists, it is + updated to point to the same data, and no new + node is created!
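+If no node with this fold value exists yet, the new node is +appended to the end of the fold chain.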
*/ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /* in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + void* data) /* in: data, must not be NULL */ +{ + hash_cell_t* cell; + ha_node_t* node; + ha_node_t* prev_node; + ulint hash; + + ut_ad(table && data); +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(block->frame == page_align(data)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); + + hash = hash_calc_hash(fold, table); + + cell = hash_get_nth_cell(table, hash); + + prev_node = cell->node; + + while (prev_node != NULL) { + if (prev_node->fold == fold) { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (table->adaptive) { + buf_block_t* prev_block = prev_node->block; + ut_a(prev_block->frame + == page_align(prev_node->data)); + ut_a(prev_block->n_pointers > 0); + prev_block->n_pointers--; + block->n_pointers++; + } + + prev_node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + prev_node->data = data; + + return(TRUE); + } + + prev_node = prev_node->next; + } + + /* We have to allocate a new chain node */ + + node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t)); + + if (node == NULL) { + /* It was a btr search type memory heap and at the moment + no more memory could be allocated: return */ + + ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH); + + return(FALSE); + } + + ha_node_set_data(node, block, data); + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (table->adaptive) { + block->n_pointers++; + } +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->fold = fold; + + node->next = NULL; + + prev_node = cell->node; + + if (prev_node == NULL) { + + cell->node = node; + + return(TRUE); + } + + while (prev_node->next != NULL) { + + prev_node = prev_node->next; + } + + prev_node->next = node; + + return(TRUE); +} + +/*************************************************************** +Deletes a hash node. */ +UNIV_INTERN +void +ha_delete_hash_node( +/*================*/ + hash_table_t* table, /* in: hash table */ + ha_node_t* del_node) /* in: node to be deleted */ +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (table->adaptive) { + ut_a(del_node->block->frame == page_align(del_node->data)); + ut_a(del_node->block->n_pointers > 0); + del_node->block->n_pointers--; + } +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node); +} + +/***************************************************************** +Deletes an entry from a hash table. */ +UNIV_INTERN +void +ha_delete( +/*======*/ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of data */ + void* data) /* in: data, must not be NULL and must exist + in the hash table */ +{ + ha_node_t* node; + + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); + + node = ha_search_with_data(table, fold, data); + + ut_a(node); + + ha_delete_hash_node(table, node); +} + +/************************************************************* +Looks for an element when we know the pointer to the data, and updates +the pointer to data, if found.
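+This repoints an existing hash node in place, for example when the +record that a fold value refers to has been copied to a new location, +and avoids a separate delete and insert of the node.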
*/ +UNIV_INTERN +void +ha_search_and_update_if_found_func( +/*===============================*/ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of the searched data */ + void* data, /* in: pointer to the data */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* new_block,/* in: block containing new_data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + void* new_data)/* in: new pointer to the data */ +{ + ha_node_t* node; + + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(new_block->frame == page_align(new_data)); +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + + node = ha_search_with_data(table, fold, data); + + if (node) { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + if (table->adaptive) { + ut_a(node->block->n_pointers > 0); + node->block->n_pointers--; + new_block->n_pointers++; + } + + node->block = new_block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data = new_data; + } +} + +/********************************************************************* +Removes from the chain determined by fold all nodes whose data pointer +points to the page given. */ +UNIV_INTERN +void +ha_remove_all_nodes_to_page( +/*========================*/ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: fold value */ + const page_t* page) /* in: buffer page */ +{ + ha_node_t* node; + + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (page_align(ha_node_get_data(node)) == page) { + + /* Remove the hash node */ + + ha_delete_hash_node(table, node); + + /* Start again from the first node in the chain + because the deletion may compact the heap of + nodes and move other nodes! */ + + node = ha_chain_get_first(table, fold); + } else { + node = ha_chain_get_next(node); + } + } +#ifdef UNIV_DEBUG + /* Check that all nodes really got deleted */ + + node = ha_chain_get_first(table, fold); + + while (node) { + ut_a(page_align(ha_node_get_data(node)) != page); + + node = ha_chain_get_next(node); + } +#endif +} + +/***************************************************************** +Validates a given range of the cells in hash table. */ +UNIV_INTERN +ibool +ha_validate( +/*========*/ + /* out: TRUE if ok */ + hash_table_t* table, /* in: hash table */ + ulint start_index, /* in: start index */ + ulint end_index) /* in: end index */ +{ + hash_cell_t* cell; + ha_node_t* node; + ibool ok = TRUE; + ulint i; + + ut_a(start_index <= end_index); + ut_a(start_index < hash_get_n_cells(table)); + ut_a(end_index < hash_get_n_cells(table)); + + for (i = start_index; i <= end_index; i++) { + + cell = hash_get_nth_cell(table, i); + + node = cell->node; + + while (node) { + if (hash_calc_hash(node->fold, table) != i) { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: Error: hash table node" + " fold value %lu does not\n" + "InnoDB: match the cell number %lu.\n", + (ulong) node->fold, (ulong) i); + + ok = FALSE; + } + + node = node->next; + } + } + + return(ok); +} + +/***************************************************************** +Prints info of a hash table. 
*/ +UNIV_INTERN +void +ha_print_info( +/*==========*/ + FILE* file, /* in: file where to print */ + hash_table_t* table) /* in: hash table */ +{ +#ifdef UNIV_DEBUG +/* Some of the code here is disabled for performance reasons in production +builds, see http://bugs.mysql.com/36941 */ +#define PRINT_USED_CELLS +#endif /* UNIV_DEBUG */ + +#ifdef PRINT_USED_CELLS + hash_cell_t* cell; + ulint cells = 0; + ulint i; +#endif /* PRINT_USED_CELLS */ + ulint n_bufs; + +#ifdef PRINT_USED_CELLS + for (i = 0; i < hash_get_n_cells(table); i++) { + + cell = hash_get_nth_cell(table, i); + + if (cell->node) { + + cells++; + } + } +#endif /* PRINT_USED_CELLS */ + + fprintf(file, "Hash table size %lu", + (ulong) hash_get_n_cells(table)); + +#ifdef PRINT_USED_CELLS + fprintf(file, ", used cells %lu", (ulong) cells); +#endif /* PRINT_USED_CELLS */ + + if (table->heaps == NULL && table->heap != NULL) { + + /* This calculation is intended for the adaptive hash + index: how many buffer frames have we reserved? */ + + n_bufs = UT_LIST_GET_LEN(table->heap->base) - 1; + + if (table->heap->free_block) { + n_bufs++; + } + + fprintf(file, ", node heap has %lu buffer(s)\n", + (ulong) n_bufs); + } +} diff --git a/storage/xtradb/ha/ha0storage.c b/storage/xtradb/ha/ha0storage.c new file mode 100644 index 00000000000..e7e09591193 --- /dev/null +++ b/storage/xtradb/ha/ha0storage.c @@ -0,0 +1,183 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include "ha0storage.h" +#include "hash0hash.h" +#include "mem0mem.h" +#include "ut0rnd.h" + +#ifdef UNIV_NONINL +#include "ha0storage.ic" +#endif + +/*********************************************************************** +Retrieves data from the storage. If it is present, a pointer to the +stored copy of data is returned, otherwise NULL is returned.
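+Two chunks match when their lengths are equal and memcmp() over the +bytes returns 0. For illustration, using the ha_storage_put() wrapper +that also appears in the test code at the end of this file (sketch): + + p1 = ha_storage_put(storage, "abc", 3); + p2 = ha_storage_put(storage, "abc", 3); + +after which p1 == p2 holds and the three bytes are stored only once.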
*/ +static +const void* +ha_storage_get( +/*===========*/ + ha_storage_t* storage, /* in: hash storage */ + const void* data, /* in: data to check for */ + ulint data_len) /* in: data length */ +{ + ha_storage_node_t* node; + ulint fold; + + /* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH + macro */ + fold = ut_fold_binary(data, data_len); + +#define IS_FOUND \ + node->data_len == data_len && memcmp(node->data, data, data_len) == 0 + + HASH_SEARCH( + next, /* node->"next" */ + storage->hash, /* the hash table */ + fold, /* key */ + ha_storage_node_t*, /* type of node->next */ + node, /* auxiliary variable */ + , /* assertion */ + IS_FOUND); /* search criteria */ + + if (node == NULL) { + + return(NULL); + } + /* else */ + + return(node->data); +} + +/*********************************************************************** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". */ +UNIV_INTERN +const void* +ha_storage_put_memlim( +/*==================*/ + ha_storage_t* storage, /* in/out: hash storage */ + const void* data, /* in: data to store */ + ulint data_len, /* in: data length */ + ulint memlim) /* in: memory limit to obey */ +{ + void* raw; + ha_storage_node_t* node; + const void* data_copy; + ulint fold; + + /* check if data chunk is already present */ + data_copy = ha_storage_get(storage, data, data_len); + if (data_copy != NULL) { + + return(data_copy); + } + + /* not present */ + + /* check if we are allowed to allocate data_len bytes */ + if (memlim > 0 + && ha_storage_get_size(storage) + data_len > memlim) { + + return(NULL); + } + + /* we put the auxiliary node struct and the data itself in one + continuous block */ + raw = mem_heap_alloc(storage->heap, + sizeof(ha_storage_node_t) + data_len); + + node = (ha_storage_node_t*) raw; + data_copy = (byte*) raw + sizeof(*node); + + memcpy((byte*) raw + sizeof(*node), data, data_len); + + node->data_len = data_len; + node->data = data_copy; + + /* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT + macro */ + fold = ut_fold_binary(data, data_len); + + HASH_INSERT( + ha_storage_node_t, /* type used in the hash chain */ + next, /* node->"next" */ + storage->hash, /* the hash table */ + fold, /* key */ + node); /* add this data to the hash */ + + /* the output should not be changed because it will spoil the + hash table */ + return(data_copy); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +void +test_ha_storage() +{ + ha_storage_t* storage; + char buf[1024]; + int i; + const void* stored[256]; + const void* p; + + storage = ha_storage_create(0, 0); + + for (i = 0; i < 256; i++) { + + memset(buf, i, sizeof(buf)); + stored[i] = ha_storage_put(storage, buf, sizeof(buf)); + } + + //ha_storage_empty(&storage); + + for (i = 255; i >= 0; i--) { + + memset(buf, i, sizeof(buf)); + p = ha_storage_put(storage, buf, sizeof(buf)); + + if (p != stored[i]) { + + fprintf(stderr, "ha_storage_put() returned %p " + "instead of %p, i=%d\n", p, stored[i], i); + return; + } + } + + fprintf(stderr, "all ok\n"); + + ha_storage_free(storage); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git 
a/storage/xtradb/ha/hash0hash.c b/storage/xtradb/ha/hash0hash.c new file mode 100644 index 00000000000..9694a288c99 --- /dev/null +++ b/storage/xtradb/ha/hash0hash.c @@ -0,0 +1,165 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#include "hash0hash.h" +#ifdef UNIV_NONINL +#include "hash0hash.ic" +#endif + +#include "mem0mem.h" + +/**************************************************************** +Reserves the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_enter( +/*=============*/ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: fold */ +{ + mutex_enter(hash_get_mutex(table, fold)); +} + +/**************************************************************** +Releases the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_exit( +/*============*/ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: fold */ +{ + mutex_exit(hash_get_mutex(table, fold)); +} + +/**************************************************************** +Reserves all the mutexes of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_mutex_enter_all( +/*=================*/ + hash_table_t* table) /* in: hash table */ +{ + ulint i; + + for (i = 0; i < table->n_mutexes; i++) { + + mutex_enter(table->mutexes + i); + } +} + +/**************************************************************** +Releases all the mutexes of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all( +/*================*/ + hash_table_t* table) /* in: hash table */ +{ + ulint i; + + for (i = 0; i < table->n_mutexes; i++) { + + mutex_exit(table->mutexes + i); + } +} + +/***************************************************************** +Creates a hash table with >= n array cells. The actual number of cells is +chosen to be a prime number slightly bigger than n. 
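+For illustration, a minimal stand-alone version of the sizing rule +(the real ut_find_prime() is also careful to avoid primes that lie +close to powers of two; this simplified sketch merely takes the next +prime at or above n): + + static ulint next_prime(ulint n) + { + ulint i; + + for (;; n++) { + if (n < 2) continue; + for (i = 2; i * i <= n; i++) { + if (n % i == 0) break; + } + if (i * i > n) return(n); + } + } + +Under this simplified rule hash_create(1000) would size the array at +next_prime(1000) = 1009 cells.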
*/ +UNIV_INTERN +hash_table_t* +hash_create( +/*========*/ + /* out, own: created table */ + ulint n) /* in: number of array cells */ +{ + hash_cell_t* array; + ulint prime; + hash_table_t* table; + + prime = ut_find_prime(n); + + table = mem_alloc(sizeof(hash_table_t)); + + array = ut_malloc(sizeof(hash_cell_t) * prime); + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + table->adaptive = FALSE; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + table->array = array; + table->n_cells = prime; + table->n_mutexes = 0; + table->mutexes = NULL; + table->heaps = NULL; + table->heap = NULL; + table->magic_n = HASH_TABLE_MAGIC_N; + + /* Initialize the cell array */ + hash_table_clear(table); + + return(table); +} + +/***************************************************************** +Frees a hash table. */ +UNIV_INTERN +void +hash_table_free( +/*============*/ + hash_table_t* table) /* in, own: hash table */ +{ + ut_a(table->mutexes == NULL); + + ut_free(table->array); + mem_free(table); +} + +/***************************************************************** +Creates a mutex array to protect a hash table. */ +UNIV_INTERN +void +hash_create_mutexes_func( +/*=====================*/ + hash_table_t* table, /* in: hash table */ +#ifdef UNIV_SYNC_DEBUG + ulint sync_level, /* in: latching order level of the + mutexes: used in the debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_mutexes) /* in: number of mutexes, must be a + power of 2 */ +{ + ulint i; + + ut_a(n_mutexes > 0); + ut_a(ut_is_2pow(n_mutexes)); + + table->mutexes = mem_alloc(n_mutexes * sizeof(mutex_t)); + + for (i = 0; i < n_mutexes; i++) { + mutex_create(table->mutexes + i, sync_level); + } + + table->n_mutexes = n_mutexes; +} diff --git a/storage/xtradb/ha_innodb.def b/storage/xtradb/ha_innodb.def new file mode 100644 index 00000000000..e0faa62deb1 --- /dev/null +++ b/storage/xtradb/ha_innodb.def @@ -0,0 +1,4 @@ +EXPORTS + _mysql_plugin_interface_version_ + _mysql_sizeof_struct_st_plugin_ + _mysql_plugin_declarations_ diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc new file mode 100644 index 00000000000..88ac35c1789 --- /dev/null +++ b/storage/xtradb/handler/ha_innodb.cc @@ -0,0 +1,10394 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, MySQL AB & Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/* TODO list for the InnoDB handler in 5.0: + - Remove the flag trx->active_trans and look at trx->conc_state + - fix savepoint functions to use savepoint storage area + - Find out what kind of problems the OS X case-insensitivity causes to + table and database names; should we 'normalize' the names like we do + in Windows? +*/ + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#define MYSQL_SERVER + +#include <mysql_priv.h> +#ifdef MYSQL_SERVER +#include <rpl_mi.h> +#endif /* MYSQL_SERVER */ + +#include <m_ctype.h> +#include <mysys_err.h> +#include <mysql/plugin.h> + +/* Include necessary InnoDB headers */ +extern "C" { +#include "../storage/xtradb/include/univ.i" +#include "../storage/xtradb/include/btr0sea.h" +#include "../storage/xtradb/include/os0file.h" +#include "../storage/xtradb/include/os0thread.h" +#include "../storage/xtradb/include/srv0start.h" +#include "../storage/xtradb/include/srv0srv.h" +#include "../storage/xtradb/include/trx0roll.h" +#include "../storage/xtradb/include/trx0trx.h" +#include "../storage/xtradb/include/trx0sys.h" +#include "../storage/xtradb/include/mtr0mtr.h" +#include "../storage/xtradb/include/row0ins.h" +#include "../storage/xtradb/include/row0mysql.h" +#include "../storage/xtradb/include/row0sel.h" +#include "../storage/xtradb/include/row0upd.h" +#include "../storage/xtradb/include/log0log.h" +#include "../storage/xtradb/include/lock0lock.h" +#include "../storage/xtradb/include/dict0crea.h" +#include "../storage/xtradb/include/btr0cur.h" +#include "../storage/xtradb/include/btr0btr.h" +#include "../storage/xtradb/include/fsp0fsp.h" +#include "../storage/xtradb/include/sync0sync.h" +#include "../storage/xtradb/include/fil0fil.h" +#include "../storage/xtradb/include/trx0xa.h" +#include "../storage/xtradb/include/row0merge.h" +#include "../storage/xtradb/include/thr0loc.h" +#include "../storage/xtradb/include/dict0boot.h" +#include "../storage/xtradb/include/ha_prototypes.h" +#include "../storage/xtradb/include/ut0mem.h" +#include "../storage/xtradb/include/ibuf0ibuf.h" +} + +#include "ha_innodb.h" +#include "i_s.h" +#include "handler0vars.h" + +#ifdef MYSQL_SERVER +// Defined in trx0sys.c +extern char trx_sys_mysql_master_log_name[]; +extern ib_int64_t trx_sys_mysql_master_log_pos; +extern char trx_sys_mysql_relay_log_name[]; +extern ib_int64_t trx_sys_mysql_relay_log_pos; +#endif /* MYSQL_SERVER */ + +#ifndef MYSQL_SERVER +/* This is needed because of Bug #3596. Let us hope that pthread_mutex_t +is defined the same in both builds: the MySQL server and the InnoDB plugin.
*/ +extern pthread_mutex_t LOCK_thread_count; + +#if MYSQL_VERSION_ID < 50124 +/* this is defined in mysql_priv.h inside #ifdef MYSQL_SERVER +but we need it here */ +bool check_global_access(THD *thd, ulong want_access); +#endif /* MYSQL_VERSION_ID < 50124 */ +#endif /* MYSQL_SERVER */ + +/** to protect innobase_open_files */ +static pthread_mutex_t innobase_share_mutex; +/** to force correct commit order in binlog */ +static pthread_mutex_t prepare_commit_mutex; +static ulong commit_threads = 0; +static pthread_mutex_t commit_threads_m; +static pthread_cond_t commit_cond; +static pthread_mutex_t commit_cond_m; +static bool innodb_inited = 0; + +#define INSIDE_HA_INNOBASE_CC + +/* In the Windows plugin, the return value of current_thd is +undefined. Map it to NULL. */ +#if defined MYSQL_DYNAMIC_PLUGIN && defined __WIN__ +# undef current_thd +# define current_thd NULL +# define EQ_CURRENT_THD(thd) TRUE +#else /* MYSQL_DYNAMIC_PLUGIN && __WIN__ */ +# define EQ_CURRENT_THD(thd) ((thd) == current_thd) +#endif /* MYSQL_DYNAMIC_PLUGIN && __WIN__ */ + +#ifdef MYSQL_DYNAMIC_PLUGIN +/* These must be weak global variables in the dynamic plugin. */ +struct handlerton* innodb_hton_ptr; +#ifdef __WIN__ +struct st_mysql_plugin* builtin_innobase_plugin_ptr; +#else +int builtin_innobase_plugin; +#endif /* __WIN__ */ +/******************************************************************** +Copy InnoDB system variables from the static InnoDB to the dynamic +plugin. */ +static +bool +innodb_plugin_init(void); +/*====================*/ + /* out: TRUE if the dynamic InnoDB plugin should start */ +#else /* MYSQL_DYNAMIC_PLUGIN */ +/* This must be a global variable in the statically linked InnoDB. */ +struct handlerton* innodb_hton_ptr = NULL; +#endif /* MYSQL_DYNAMIC_PLUGIN */ + +static const long AUTOINC_OLD_STYLE_LOCKING = 0; +static const long AUTOINC_NEW_STYLE_LOCKING = 1; +static const long AUTOINC_NO_LOCKING = 2; + +static long innobase_mirrored_log_groups, innobase_log_files_in_group, + innobase_log_buffer_size, + innobase_additional_mem_pool_size, innobase_file_io_threads, + innobase_force_recovery, innobase_open_files, + innobase_autoinc_lock_mode; + +static unsigned long innobase_read_io_threads, innobase_write_io_threads; +static long long innobase_buffer_pool_size, innobase_log_file_size; + +/* The default values for the following char* start-up parameters +are determined in innobase_init below: */ + +static char* innobase_data_home_dir = NULL; +static char* innobase_data_file_path = NULL; +static char* innobase_log_group_home_dir = NULL; +static char* innobase_file_format_name = NULL; +static char* innobase_change_buffering = NULL; + +/* Note: This variable can be set to on/off and any of the supported +file formats in the configuration file, but can only be set to any +of the supported file formats during runtime. 
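+
+For illustration (editorial note; the exact accepted values are defined by
+the validator functions declared further below, not here): in my.cnf both
+
+	innodb_file_format_check = on
+	innodb_file_format_check = Barracuda
+
+are accepted forms, while at runtime only a file format name is valid, e.g.
+
+	SET GLOBAL innodb_file_format_check = 'Barracuda';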
*/ +static char* innobase_file_format_check = NULL; + +/* The following has a misleading name: starting from 4.0.5, this also +affects Windows: */ +static char* innobase_unix_file_flush_method = NULL; + +/* Below we have boolean-valued start-up parameters, and their default +values */ + +static ulong innobase_fast_shutdown = 1; +#ifdef UNIV_LOG_ARCHIVE +static my_bool innobase_log_archive = FALSE; +static char* innobase_log_arch_dir = NULL; +#endif /* UNIV_LOG_ARCHIVE */ +static my_bool innobase_use_doublewrite = TRUE; +static my_bool innobase_use_checksums = TRUE; +static my_bool innobase_extra_undoslots = FALSE; +static my_bool innobase_locks_unsafe_for_binlog = FALSE; +static my_bool innobase_overwrite_relay_log_info = FALSE; +static my_bool innobase_rollback_on_timeout = FALSE; +static my_bool innobase_create_status_file = FALSE; +static my_bool innobase_stats_on_metadata = TRUE; + +static char* internal_innobase_data_file_path = NULL; + +static char* innodb_version_str = (char*) INNODB_VERSION_STR; + +/* The following counter is used to convey information to InnoDB +about server activity: in selects it is not sensible to call +srv_active_wake_master_thread after each fetch or search, we only do +it every INNOBASE_WAKE_INTERVAL'th step. */ + +#define INNOBASE_WAKE_INTERVAL 32 +static ulong innobase_active_counter = 0; + +static hash_table_t* innobase_open_tables; + +#ifdef __NETWARE__ /* some special cleanup for NetWare */ +bool nw_panic = FALSE; +#endif + +/** Allowed values of innodb_change_buffering */ +static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = { + "none", /* IBUF_USE_NONE */ + "inserts" /* IBUF_USE_INSERT */ +}; + +static INNOBASE_SHARE *get_share(const char *table_name); +static void free_share(INNOBASE_SHARE *share); +static int innobase_close_connection(handlerton *hton, THD* thd); +static int innobase_commit(handlerton *hton, THD* thd, bool all); +static int innobase_rollback(handlerton *hton, THD* thd, bool all); +static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd, + void *savepoint); +static int innobase_savepoint(handlerton *hton, THD* thd, void *savepoint); +static int innobase_release_savepoint(handlerton *hton, THD* thd, + void *savepoint); +static handler *innobase_create_handler(handlerton *hton, + TABLE_SHARE *table, + MEM_ROOT *mem_root); + +/**************************************************************** +Validate the file format name and return its corresponding id. */ +static +uint +innobase_file_format_name_lookup( +/*=============================*/ + /* out: valid file format id */ + const char* format_name); /* in: pointer to file format + name */ +/**************************************************************** +Validate the file format check config parameters, as a side effect it +sets the srv_check_file_format_at_startup variable. */ +static +bool +innobase_file_format_check_on_off( +/*==============================*/ + /* out: true if one of + "on" or "off" */ + const char* format_check); /* in: parameter value */ +/**************************************************************** +Validate the file format check config parameters, as a side effect it +sets the srv_check_file_format_at_startup variable. 
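+
+Editorial note on the division of labor between the two validators declared
+here, inferred from their comments: innobase_file_format_check_on_off()
+accepts only "on"/"off", while innobase_file_format_check_validate()
+accepts file format names (e.g. "Antelope" or "Barracuda"; example names,
+not an exhaustive list from this patch) and records the result in
+srv_check_file_format_at_startup.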
*/ +static +bool +innobase_file_format_check_validate( +/*================================*/ + /* out: true if valid + config value */ + const char* format_check); /* in: parameter value */ +/******************************************************************** +Return alter table flags supported in an InnoDB database. */ +static +uint +innobase_alter_table_flags( +/*=======================*/ + uint flags); + +static const char innobase_hton_name[]= "InnoDB"; + +static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG, + "Enable InnoDB support for the XA two-phase commit", + /* check_func */ NULL, /* update_func */ NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG, + "Enable InnoDB locking in LOCK TABLES", + /* check_func */ NULL, /* update_func */ NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG, + "Use strict mode when evaluating create options.", + NULL, NULL, FALSE); + +static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, + "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.", + NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0); + + +static handler *innobase_create_handler(handlerton *hton, + TABLE_SHARE *table, + MEM_ROOT *mem_root) +{ + return new (mem_root) ha_innobase(hton, table); +} + +/*********************************************************************** +This function is used to prepare X/Open XA distributed transaction */ +static +int +innobase_xa_prepare( +/*================*/ + /* out: 0 or error number */ + handlerton* hton, + THD* thd, /* in: handle to the MySQL thread of the user + whose XA transaction should be prepared */ + bool all); /* in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +/*********************************************************************** +This function is used to recover X/Open XA distributed transactions */ +static +int +innobase_xa_recover( +/*================*/ + /* out: number of prepared transactions + stored in xid_list */ + handlerton* hton, + XID* xid_list, /* in/out: prepared transactions */ + uint len); /* in: number of slots in xid_list */ +/*********************************************************************** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state */ +static +int +innobase_commit_by_xid( +/*===================*/ + /* out: 0 or error number */ + handlerton* hton, + XID* xid); /* in: X/Open XA transaction identification */ +/*********************************************************************** +This function is used to rollback one X/Open XA distributed transaction +which is in the prepared state */ +static +int +innobase_rollback_by_xid( +/*=====================*/ + /* out: 0 or error number */ + handlerton* hton, + XID *xid); /* in: X/Open XA transaction identification */ +/*********************************************************************** +Create a consistent view for a cursor based on current transaction +which is created if the corresponding MySQL thread still lacks one. +This consistent view is then used inside of MySQL when accessing records +using a cursor. 
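+
+Editorial sketch (not part of the original patch) of the expected pairing
+of the three cursor view hooks declared below; the actual call sites live
+in the server, not in this file:
+
+	void* view = innobase_create_cursor_view(hton, thd);
+	innobase_set_cursor_view(hton, thd, view);
+	/* ... fetch rows through the server-side cursor ... */
+	innobase_close_cursor_view(hton, thd, view);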
*/
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+ /* out: pointer to cursor view or NULL */
+ handlerton* hton, /* in: innobase hton */
+ THD* thd); /* in: user thread handle */
+/***********************************************************************
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL, the global read view of the transaction
+is restored to the transaction's read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+ handlerton* hton,
+ THD* thd, /* in: user thread handle */
+ void* curview);/* in: Consistent cursor view to be set */
+/***********************************************************************
+Close the given consistent cursor view of a transaction and restore the
+global read view to the transaction's read view. The transaction is
+created if the corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+ handlerton* hton,
+ THD* thd, /* in: user thread handle */
+ void* curview);/* in: Consistent read view to be closed */
+/*********************************************************************
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+ /* out: error number */
+ handlerton* hton, /* in: handlerton of InnoDB */
+ char* path); /* in: database path; inside InnoDB the name
+ of the last directory in the path is used as
+ the database name: for example, in 'mysql/data/test'
+ the database name is 'test' */
+/***********************************************************************
+Closes an InnoDB database. */
+static
+int
+innobase_end(handlerton *hton, ha_panic_function type);
+
+/*********************************************************************
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started, and
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one. */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ /* out: 0 */
+ handlerton* hton, /* in: InnoDB handlerton */
+ THD* thd); /* in: MySQL thread handle of the user for whom
+ the transaction should be committed */
+/********************************************************************
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint. */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ /* out: TRUE if error */
+ handlerton* hton); /* in: InnoDB handlerton */
+
+/****************************************************************************
+Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
+Monitor to the client. */
+static
+bool
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /* in: the innodb handlerton */
+ THD* thd, /* in: the MySQL query thread of the caller */
+ stat_print_fn *stat_print);
+static
+bool innobase_show_status(handlerton *hton, THD* thd,
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type);
+
+/*********************************************************************
+Commits a transaction in an InnoDB database.
*/ +static +void +innobase_commit_low( +/*================*/ + trx_t* trx); /* in: transaction handle */ + +static SHOW_VAR innodb_status_variables[]= { + {"buffer_pool_pages_data", + (char*) &export_vars.innodb_buffer_pool_pages_data, SHOW_LONG}, + {"buffer_pool_pages_dirty", + (char*) &export_vars.innodb_buffer_pool_pages_dirty, SHOW_LONG}, + {"buffer_pool_pages_flushed", + (char*) &export_vars.innodb_buffer_pool_pages_flushed, SHOW_LONG}, + {"buffer_pool_pages_free", + (char*) &export_vars.innodb_buffer_pool_pages_free, SHOW_LONG}, +#ifdef UNIV_DEBUG + {"buffer_pool_pages_latched", + (char*) &export_vars.innodb_buffer_pool_pages_latched, SHOW_LONG}, +#endif /* UNIV_DEBUG */ + {"buffer_pool_pages_misc", + (char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG}, + {"buffer_pool_pages_total", + (char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG}, + {"buffer_pool_read_ahead_rnd", + (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG}, + {"buffer_pool_read_ahead_seq", + (char*) &export_vars.innodb_buffer_pool_read_ahead_seq, SHOW_LONG}, + {"buffer_pool_read_requests", + (char*) &export_vars.innodb_buffer_pool_read_requests, SHOW_LONG}, + {"buffer_pool_reads", + (char*) &export_vars.innodb_buffer_pool_reads, SHOW_LONG}, + {"buffer_pool_wait_free", + (char*) &export_vars.innodb_buffer_pool_wait_free, SHOW_LONG}, + {"buffer_pool_write_requests", + (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG}, + {"data_fsyncs", + (char*) &export_vars.innodb_data_fsyncs, SHOW_LONG}, + {"data_pending_fsyncs", + (char*) &export_vars.innodb_data_pending_fsyncs, SHOW_LONG}, + {"data_pending_reads", + (char*) &export_vars.innodb_data_pending_reads, SHOW_LONG}, + {"data_pending_writes", + (char*) &export_vars.innodb_data_pending_writes, SHOW_LONG}, + {"data_read", + (char*) &export_vars.innodb_data_read, SHOW_LONG}, + {"data_reads", + (char*) &export_vars.innodb_data_reads, SHOW_LONG}, + {"data_writes", + (char*) &export_vars.innodb_data_writes, SHOW_LONG}, + {"data_written", + (char*) &export_vars.innodb_data_written, SHOW_LONG}, + {"dblwr_pages_written", + (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG}, + {"dblwr_writes", + (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG}, + {"have_atomic_builtins", + (char*) &export_vars.innodb_have_atomic_builtins, SHOW_BOOL}, + {"log_waits", + (char*) &export_vars.innodb_log_waits, SHOW_LONG}, + {"log_write_requests", + (char*) &export_vars.innodb_log_write_requests, SHOW_LONG}, + {"log_writes", + (char*) &export_vars.innodb_log_writes, SHOW_LONG}, + {"os_log_fsyncs", + (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG}, + {"os_log_pending_fsyncs", + (char*) &export_vars.innodb_os_log_pending_fsyncs, SHOW_LONG}, + {"os_log_pending_writes", + (char*) &export_vars.innodb_os_log_pending_writes, SHOW_LONG}, + {"os_log_written", + (char*) &export_vars.innodb_os_log_written, SHOW_LONG}, + {"page_size", + (char*) &export_vars.innodb_page_size, SHOW_LONG}, + {"pages_created", + (char*) &export_vars.innodb_pages_created, SHOW_LONG}, + {"pages_read", + (char*) &export_vars.innodb_pages_read, SHOW_LONG}, + {"pages_written", + (char*) &export_vars.innodb_pages_written, SHOW_LONG}, + {"row_lock_current_waits", + (char*) &export_vars.innodb_row_lock_current_waits, SHOW_LONG}, + {"row_lock_time", + (char*) &export_vars.innodb_row_lock_time, SHOW_LONGLONG}, + {"row_lock_time_avg", + (char*) &export_vars.innodb_row_lock_time_avg, SHOW_LONG}, + {"row_lock_time_max", + (char*) &export_vars.innodb_row_lock_time_max, 
SHOW_LONG}, + {"row_lock_waits", + (char*) &export_vars.innodb_row_lock_waits, SHOW_LONG}, + {"rows_deleted", + (char*) &export_vars.innodb_rows_deleted, SHOW_LONG}, + {"rows_inserted", + (char*) &export_vars.innodb_rows_inserted, SHOW_LONG}, + {"rows_read", + (char*) &export_vars.innodb_rows_read, SHOW_LONG}, + {"rows_updated", + (char*) &export_vars.innodb_rows_updated, SHOW_LONG}, + {NullS, NullS, SHOW_LONG} +}; + +/* General functions */ + +/********************************************************************** +Returns true if the thread is the replication thread on the slave +server. Used in srv_conc_enter_innodb() to determine if the thread +should be allowed to enter InnoDB - the replication thread is treated +differently than other threads. Also used in +srv_conc_force_exit_innodb(). */ +extern "C" UNIV_INTERN +ibool +thd_is_replication_slave_thread( +/*============================*/ + /* out: true if thd is the replication thread */ + void* thd) /* in: thread handle (THD*) */ +{ + return((ibool) thd_slave_thread((THD*) thd)); +} + +/********************************************************************** +Save some CPU by testing the value of srv_thread_concurrency in inline +functions. */ +static inline +void +innodb_srv_conc_enter_innodb( +/*=========================*/ + trx_t* trx) /* in: transaction handle */ +{ + if (UNIV_LIKELY(!srv_thread_concurrency)) { + + return; + } + + srv_conc_enter_innodb(trx); +} + +/********************************************************************** +Save some CPU by testing the value of srv_thread_concurrency in inline +functions. */ +static inline +void +innodb_srv_conc_exit_innodb( +/*========================*/ + trx_t* trx) /* in: transaction handle */ +{ + if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) { + + return; + } + + srv_conc_exit_innodb(trx); +} + +/********************************************************************** +Releases possible search latch and InnoDB thread FIFO ticket. These should +be released at each SQL statement end, and also when mysqld passes the +control to the client. It does no harm to release these also in the middle +of an SQL statement. */ +static inline +void +innobase_release_stat_resources( +/*============================*/ + trx_t* trx) /* in: transaction object */ +{ + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + if (trx->declared_to_be_inside_innodb) { + /* Release our possible ticket in the FIFO */ + + srv_conc_force_exit_innodb(trx); + } +} + +/********************************************************************** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. */ +extern "C" UNIV_INTERN +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + /* out: true if non-transactional tables have + been edited */ + void* thd) /* in: thread handle (THD*) */ +{ + return((ibool) thd_non_transactional_update((THD*) thd)); +} + +/********************************************************************** +Returns true if the thread is executing a SELECT statement. 
*/ +extern "C" UNIV_INTERN +ibool +thd_is_select( +/*==========*/ + /* out: true if thd is executing SELECT */ + const void* thd) /* in: thread handle (THD*) */ +{ + return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT); +} + +/********************************************************************** +Returns true if the thread supports XA, +global value of innodb_supports_xa if thd is NULL. */ +extern "C" UNIV_INTERN +ibool +thd_supports_xa( +/*============*/ + /* out: true if thd has XA support */ + void* thd) /* in: thread handle (THD*), or NULL to query + the global innodb_supports_xa */ +{ + return(THDVAR((THD*) thd, support_xa)); +} + +/********************************************************************** +Returns the lock wait timeout for the current connection. */ +extern "C" UNIV_INTERN +ulong +thd_lock_wait_timeout( +/*==================*/ + /* out: the lock wait timeout, in seconds */ + void* thd) /* in: thread handle (THD*), or NULL to query + the global innodb_lock_wait_timeout */ +{ + /* According to , passing thd == NULL + returns the global value of the session variable. */ + return(THDVAR((THD*) thd, lock_wait_timeout)); +} + +/************************************************************************ +Obtain the InnoDB transaction of a MySQL thread. */ +static inline +trx_t*& +thd_to_trx( +/*=======*/ + /* out: reference to transaction pointer */ + THD* thd) /* in: MySQL thread */ +{ + return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr)); +} + +/************************************************************************ +Call this function when mysqld passes control to the client. That is to +avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more +documentation, see handler.cc. */ +static +int +innobase_release_temporary_latches( +/*===============================*/ + /* out: 0 */ + handlerton* hton, /* in: handlerton */ + THD* thd) /* in: MySQL thread */ +{ + trx_t* trx; + + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (!innodb_inited) { + + return(0); + } + + trx = thd_to_trx(thd); + + if (trx) { + innobase_release_stat_resources(trx); + } + return(0); +} + +/************************************************************************ +Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth +time calls srv_active_wake_master_thread. This function should be used +when a single database operation may introduce a small need for +server utility activity, like checkpointing. */ +static inline +void +innobase_active_small(void) +/*=======================*/ +{ + innobase_active_counter++; + + if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) { + srv_active_wake_master_thread(); + } +} + +/************************************************************************ +Converts an InnoDB error code to a MySQL error code and also tells to MySQL +about a possible transaction rollback inside InnoDB caused by a lock wait +timeout or a deadlock. 
*/ +extern "C" UNIV_INTERN +int +convert_error_code_to_mysql( +/*========================*/ + /* out: MySQL error code */ + int error, /* in: InnoDB error code */ + ulint flags, /* in: InnoDB table flags, or 0 */ + THD* thd) /* in: user thread handle or NULL */ +{ + switch (error) { + case DB_SUCCESS: + return(0); + + case DB_ERROR: + default: + return(-1); /* unspecified error */ + + case DB_DUPLICATE_KEY: + return(HA_ERR_FOUND_DUPP_KEY); + + case DB_FOREIGN_DUPLICATE_KEY: + return(HA_ERR_FOREIGN_DUPLICATE_KEY); + + case DB_RECORD_NOT_FOUND: + return(HA_ERR_NO_ACTIVE_RECORD); + + case DB_DEADLOCK: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_DEADLOCK); + + case DB_LOCK_WAIT_TIMEOUT: + /* Starting from 5.0.13, we let MySQL just roll back the + latest SQL statement in a lock wait timeout. Previously, we + rolled back the whole transaction. */ + + if (thd) { + thd_mark_transaction_to_rollback( + thd, (bool)row_rollback_on_timeout); + } + + return(HA_ERR_LOCK_WAIT_TIMEOUT); + + case DB_NO_REFERENCED_ROW: + return(HA_ERR_NO_REFERENCED_ROW); + + case DB_ROW_IS_REFERENCED: + return(HA_ERR_ROW_IS_REFERENCED); + + case DB_CANNOT_ADD_CONSTRAINT: + return(HA_ERR_CANNOT_ADD_FOREIGN); + + case DB_CANNOT_DROP_CONSTRAINT: + + return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit + misleading, a new MySQL error + code should be introduced */ + + case DB_COL_APPEARS_TWICE_IN_INDEX: + case DB_CORRUPTION: + return(HA_ERR_CRASHED); + + case DB_OUT_OF_FILE_SPACE: + return(HA_ERR_RECORD_FILE_FULL); + + case DB_TABLE_IS_BEING_USED: + return(HA_ERR_WRONG_COMMAND); + + case DB_TABLE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_TOO_BIG_RECORD: + my_error(ER_TOO_BIG_ROWSIZE, MYF(0), + page_get_free_space_of_empty(flags + & DICT_TF_COMPACT) / 2); + return(HA_ERR_TO_BIG_ROW); + + case DB_NO_SAVEPOINT: + return(HA_ERR_NO_SAVEPOINT); + + case DB_LOCK_TABLE_FULL: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_TABLE_FULL); + + case DB_PRIMARY_KEY_IS_NULL: + return(ER_PRIMARY_CANT_HAVE_NULL); + + case DB_TOO_MANY_CONCURRENT_TRXS: + /* Once MySQL add the appropriate code to errmsg.txt then + we can get rid of this #ifdef. NOTE: The code checked by + the #ifdef is the suggested name for the error condition + and the actual error code name could very well be different. + This will require some monitoring, ie. the status + of this request on our part.*/ +#ifdef ER_TOO_MANY_CONCURRENT_TRXS + return(ER_TOO_MANY_CONCURRENT_TRXS); +#else + return(HA_ERR_RECORD_FILE_FULL); +#endif + case DB_UNSUPPORTED: + return(HA_ERR_UNSUPPORTED); + } +} + +/***************************************************************** +If you want to print a thd that is not associated with the current thread, +you must call this function before reserving the InnoDB kernel_mutex, to +protect MySQL from setting thd->query NULL. If you print a thd of the current +thread, we know that MySQL cannot modify thd->query, and it is not necessary +to call this. Call innobase_mysql_end_print_arbitrary_thd() after you release +the kernel_mutex. 
*/ +extern "C" UNIV_INTERN +void +innobase_mysql_prepare_print_arbitrary_thd(void) +/*============================================*/ +{ + ut_ad(!mutex_own(&kernel_mutex)); + VOID(pthread_mutex_lock(&LOCK_thread_count)); +} + +/***************************************************************** +Releases the mutex reserved by innobase_mysql_prepare_print_arbitrary_thd(). +In the InnoDB latching order, the mutex sits right above the +kernel_mutex. In debug builds, we assert that the kernel_mutex is +released before this function is invoked. */ +extern "C" UNIV_INTERN +void +innobase_mysql_end_print_arbitrary_thd(void) +/*========================================*/ +{ + ut_ad(!mutex_own(&kernel_mutex)); + VOID(pthread_mutex_unlock(&LOCK_thread_count)); +} + +/***************************************************************** +Prints info of a THD object (== user session thread) to the given file. */ +extern "C" UNIV_INTERN +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /* in: output stream */ + void* thd, /* in: pointer to a MySQL THD object */ + uint max_query_len) /* in: max query length to print, or 0 to + use the default max length */ +{ + char buffer[1024]; + + fputs(thd_security_context((THD*) thd, buffer, sizeof buffer, + max_query_len), f); + putc('\n', f); +} + +/********************************************************************** +Get the variable length bounds of the given character set. */ +extern "C" UNIV_INTERN +void +innobase_get_cset_width( +/*====================*/ + ulint cset, /* in: MySQL charset-collation code */ + ulint* mbminlen, /* out: minimum length of a char (in bytes) */ + ulint* mbmaxlen) /* out: maximum length of a char (in bytes) */ +{ + CHARSET_INFO* cs; + ut_ad(cset < 256); + ut_ad(mbminlen); + ut_ad(mbmaxlen); + + cs = all_charsets[cset]; + if (cs) { + *mbminlen = cs->mbminlen; + *mbmaxlen = cs->mbmaxlen; + } else { + ut_a(cset == 0); + *mbminlen = *mbmaxlen = 0; + } +} + +/********************************************************************** +Converts an identifier to a table name. */ +extern "C" UNIV_INTERN +void +innobase_convert_from_table_id( +/*===========================*/ + struct charset_info_st* cs, /* in: the 'from' character set */ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len) /* in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors); +} + +/********************************************************************** +Converts an identifier to UTF-8. */ +extern "C" UNIV_INTERN +void +innobase_convert_from_id( +/*=====================*/ + struct charset_info_st* cs, /* in: the 'from' character set */ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len) /* in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, system_charset_info, to, (uint) len, &errors); +} + +/********************************************************************** +Compares NUL-terminated UTF-8 strings case insensitively. */ +extern "C" UNIV_INTERN +int +innobase_strcasecmp( +/*================*/ + /* out: 0 if a=b, <0 if a1 if a>b */ + const char* a, /* in: first string to compare */ + const char* b) /* in: second string to compare */ +{ + return(my_strcasecmp(system_charset_info, a, b)); +} + +/********************************************************************** +Makes all characters in a NUL-terminated UTF-8 string lower case. 
*/ +extern "C" UNIV_INTERN +void +innobase_casedn_str( +/*================*/ + char* a) /* in/out: string to put in lower case */ +{ + my_casedn_str(system_charset_info, a); +} + +/************************************************************************** +Determines the connection character set. */ +extern "C" UNIV_INTERN +struct charset_info_st* +innobase_get_charset( +/*=================*/ + /* out: connection character set */ + void* mysql_thd) /* in: MySQL thread handle */ +{ + return(thd_charset((THD*) mysql_thd)); +} + +#if defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) +/*********************************************************************** +Map an OS error to an errno value. The OS error number is stored in +_doserrno and the mapped value is stored in errno) */ +extern "C" +void __cdecl +_dosmaperr( + unsigned long); /* in: OS error value */ + +/************************************************************************* +Creates a temporary file. */ +extern "C" UNIV_INTERN +int +innobase_mysql_tmpfile(void) +/*========================*/ + /* out: temporary file descriptor, or < 0 on error */ +{ + int fd; /* handle of opened file */ + HANDLE osfh; /* OS handle of opened file */ + char* tmpdir; /* point to the directory + where to create file */ + TCHAR path_buf[MAX_PATH - 14]; /* buffer for tmp file path. + The length cannot be longer + than MAX_PATH - 14, or + GetTempFileName will fail. */ + char filename[MAX_PATH]; /* name of the tmpfile */ + DWORD fileaccess = GENERIC_READ /* OS file access */ + | GENERIC_WRITE + | DELETE; + DWORD fileshare = FILE_SHARE_READ /* OS file sharing mode */ + | FILE_SHARE_WRITE + | FILE_SHARE_DELETE; + DWORD filecreate = CREATE_ALWAYS; /* OS method of open/create */ + DWORD fileattrib = /* OS file attribute flags */ + FILE_ATTRIBUTE_NORMAL + | FILE_FLAG_DELETE_ON_CLOSE + | FILE_ATTRIBUTE_TEMPORARY + | FILE_FLAG_SEQUENTIAL_SCAN; + + DBUG_ENTER("innobase_mysql_tmpfile"); + + tmpdir = my_tmpdir(&mysql_tmpdir_list); + + /* The tmpdir parameter can not be NULL for GetTempFileName. */ + if (!tmpdir) { + uint ret; + + /* Use GetTempPath to determine path for temporary files. */ + ret = GetTempPath(sizeof(path_buf), path_buf); + if (ret > sizeof(path_buf) || (ret == 0)) { + + _dosmaperr(GetLastError()); /* map error */ + DBUG_RETURN(-1); + } + + tmpdir = path_buf; + } + + /* Use GetTempFileName to generate a unique filename. */ + if (!GetTempFileName(tmpdir, "ib", 0, filename)) { + + _dosmaperr(GetLastError()); /* map error */ + DBUG_RETURN(-1); + } + + DBUG_PRINT("info", ("filename: %s", filename)); + + /* Open/Create the file. */ + osfh = CreateFile(filename, fileaccess, fileshare, NULL, + filecreate, fileattrib, NULL); + if (osfh == INVALID_HANDLE_VALUE) { + + /* open/create file failed! */ + _dosmaperr(GetLastError()); /* map error */ + DBUG_RETURN(-1); + } + + do { + /* Associates a CRT file descriptor with the OS file handle. */ + fd = _open_osfhandle((intptr_t) osfh, 0); + } while (fd == -1 && errno == EINTR); + + if (fd == -1) { + /* Open failed, close the file handle. */ + + _dosmaperr(GetLastError()); /* map error */ + CloseHandle(osfh); /* no need to check if + CloseHandle fails */ + } + + DBUG_RETURN(fd); +} +#else +/************************************************************************* +Creates a temporary file. 
*/ +extern "C" UNIV_INTERN +int +innobase_mysql_tmpfile(void) +/*========================*/ + /* out: temporary file descriptor, or < 0 on error */ +{ + int fd2 = -1; + File fd = mysql_tmpfile("ib"); + if (fd >= 0) { + /* Copy the file descriptor, so that the additional resources + allocated by create_temp_file() can be freed by invoking + my_close(). + + Because the file descriptor returned by this function + will be passed to fdopen(), it will be closed by invoking + fclose(), which in turn will invoke close() instead of + my_close(). */ + fd2 = dup(fd); + if (fd2 < 0) { + DBUG_PRINT("error",("Got error %d on dup",fd2)); + my_errno=errno; + my_error(EE_OUT_OF_FILERESOURCES, + MYF(ME_BELL+ME_WAITTANG), + "ib*", my_errno); + } + my_close(fd, MYF(MY_WME)); + } + return(fd2); +} +#endif /* defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) */ + +/************************************************************************* +Wrapper around MySQL's copy_and_convert function, see it for +documentation. */ +extern "C" UNIV_INTERN +ulint +innobase_convert_string( +/*====================*/ + void* to, + ulint to_length, + CHARSET_INFO* to_cs, + const void* from, + ulint from_length, + CHARSET_INFO* from_cs, + uint* errors) +{ + return(copy_and_convert((char*)to, (uint32) to_length, to_cs, + (const char*)from, (uint32) from_length, from_cs, + errors)); +} + +/*********************************************************************** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). */ +extern "C" UNIV_INTERN +ulint +innobase_raw_format( +/*================*/ + /* out: number of bytes + that were written */ + const char* data, /* in: raw data */ + ulint data_len, /* in: raw data length + in bytes */ + ulint charset_coll, /* in: charset collation */ + char* buf, /* out: output buffer */ + ulint buf_size) /* in: output buffer size + in bytes */ +{ + /* XXX we use a hard limit instead of allocating + but_size bytes from the heap */ + CHARSET_INFO* data_cs; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + + data_cs = all_charsets[charset_coll]; + + buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp), + system_charset_info, + data, data_len, data_cs, + &num_errors); + + return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size)); +} + +/************************************************************************* +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to +n * 3 where autoinc_lock_mode != TRADITIONAL because we want +to reserve 3 values for the multi-value INSERT above. 
*/ +static +ulonglong +innobase_next_autoinc( +/*==================*/ + /* out: the next value */ + ulonglong current, /* in: Current value */ + ulonglong increment, /* in: increment current by */ + ulonglong offset, /* in: AUTOINC offset */ + ulonglong max_value) /* in: max value for type */ +{ + ulonglong next_value; + + /* Should never be 0. */ + ut_a(increment > 0); + + /* According to MySQL documentation, if the offset is greater than + the increment then the offset is ignored. */ + if (offset > increment) { + offset = 0; + } + + if (max_value <= current) { + next_value = max_value; + } else if (offset <= 1) { + /* Offset 0 and 1 are the same, because there must be at + least one node in the system. */ + if (max_value - current <= increment) { + next_value = max_value; + } else { + next_value = current + increment; + } + } else if (max_value > current) { + if (current > offset) { + next_value = ((current - offset) / increment) + 1; + } else { + next_value = ((offset - current) / increment) + 1; + } + + ut_a(increment > 0); + ut_a(next_value > 0); + + /* Check for multiplication overflow. */ + if (increment > (max_value / next_value)) { + + next_value = max_value; + } else { + next_value *= increment; + + ut_a(max_value >= next_value); + + /* Check for overflow. */ + if (max_value - next_value <= offset) { + next_value = max_value; + } else { + next_value += offset; + } + } + } else { + next_value = max_value; + } + + ut_a(next_value <= max_value); + + return(next_value); +} + +/************************************************************************* +Initializes some fields in an InnoDB transaction object. */ +static +void +innobase_trx_init( +/*==============*/ + THD* thd, /* in: user thread handle */ + trx_t* trx) /* in/out: InnoDB transaction handle */ +{ + DBUG_ENTER("innobase_trx_init"); + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + DBUG_ASSERT(thd == trx->mysql_thd); + + trx->check_foreigns = !thd_test_options( + thd, OPTION_NO_FOREIGN_KEY_CHECKS); + + trx->check_unique_secondary = !thd_test_options( + thd, OPTION_RELAXED_UNIQUE_CHECKS); + + DBUG_VOID_RETURN; +} + +/************************************************************************* +Allocates an InnoDB transaction for a MySQL handler object. */ +extern "C" UNIV_INTERN +trx_t* +innobase_trx_allocate( +/*==================*/ + /* out: InnoDB transaction handle */ + THD* thd) /* in: user thread handle */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_trx_allocate"); + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + + trx = trx_allocate_for_mysql(); + + trx->mysql_thd = thd; + trx->mysql_query_str = thd_query(thd); + + innobase_trx_init(thd, trx); + + DBUG_RETURN(trx); +} + +/************************************************************************* +Gets the InnoDB transaction handle for a MySQL handler object, creates +an InnoDB transaction struct if the corresponding MySQL thread struct still +lacks one. */ +static +trx_t* +check_trx_exists( +/*=============*/ + /* out: InnoDB transaction handle */ + THD* thd) /* in: user thread handle */ +{ + trx_t*& trx = thd_to_trx(thd); + + ut_ad(EQ_CURRENT_THD(thd)); + + if (trx == NULL) { + trx = innobase_trx_allocate(thd); + } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) { + mem_analyze_corruption(trx); + ut_error; + } + + innobase_trx_init(thd, trx); + + return(trx); +} + + +/************************************************************************* +Construct ha_innobase handler. 
*/ +UNIV_INTERN +ha_innobase::ha_innobase(handlerton *hton, TABLE_SHARE *table_arg) + :handler(hton, table_arg), + int_table_flags(HA_REC_NOT_IN_SEQ | + HA_NULL_IN_KEY | + HA_CAN_INDEX_BLOBS | + HA_CAN_SQL_HANDLER | + HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | + HA_PRIMARY_KEY_IN_READ_INDEX | + HA_BINLOG_ROW_CAPABLE | + HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ | + HA_TABLE_SCAN_ON_INDEX), + start_of_scan(0), + num_write_row(0) +{} + +/************************************************************************* +Destruct ha_innobase handler. */ +UNIV_INTERN +ha_innobase::~ha_innobase() +{ +} + +/************************************************************************* +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. */ +UNIV_INTERN inline +void +ha_innobase::update_thd( +/*====================*/ + THD* thd) /* in: thd to use the handle */ +{ + trx_t* trx; + + trx = check_trx_exists(thd); + + if (prebuilt->trx != trx) { + + row_update_prebuilt_trx(prebuilt, trx); + } + + user_thd = thd; +} + +/************************************************************************* +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. */ +UNIV_INTERN +void +ha_innobase::update_thd() +/*=====================*/ +{ + THD* thd = ha_thd(); + ut_ad(EQ_CURRENT_THD(thd)); + update_thd(thd); +} + +/************************************************************************* +Registers that InnoDB takes part in an SQL statement, so that MySQL knows to +roll back the statement if the statement results in an error. This MUST be +called for every SQL statement that may be rolled back by MySQL. Calling this +several times to register the same statement is allowed, too. */ +static inline +void +innobase_register_stmt( +/*===================*/ + handlerton* hton, /* in: Innobase hton */ + THD* thd) /* in: MySQL thd (connection) object */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + /* Register the statement */ + trans_register_ha(thd, FALSE, hton); +} + +/************************************************************************* +Registers an InnoDB transaction in MySQL, so that the MySQL XA code knows +to call the InnoDB prepare and commit, or rollback for the transaction. This +MUST be called for every transaction for which the user may call commit or +rollback. Calling this several times to register the same transaction is +allowed, too. +This function also registers the current SQL statement. */ +static inline +void +innobase_register_trx_and_stmt( +/*===========================*/ + handlerton *hton, /* in: Innobase handlerton */ + THD* thd) /* in: MySQL thd (connection) object */ +{ + /* NOTE that actually innobase_register_stmt() registers also + the transaction in the AUTOCOMMIT=1 mode. */ + + innobase_register_stmt(hton, thd); + + if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + /* No autocommit mode, register for a transaction */ + trans_register_ha(thd, TRUE, hton); + } +} + +/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB + ------------------------------------------------------------ + +1) The use of the query cache for TBL is disabled when there is an +uncommitted change to TBL. 
+
+2) When a change to TBL commits, InnoDB stores the current value of
+its global trx id counter, let us denote it by INV_TRX_ID, to the table object
+in the InnoDB data dictionary, and allows only transactions whose
+id <= INV_TRX_ID to use the query cache.
+
+3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or makes an
+implicit modification because of an ON DELETE CASCADE, we invalidate the
+MySQL query cache of TBL immediately.
+
+How this is implemented inside InnoDB:
+
+1) Since every modification always sets an IX type table lock on the InnoDB
+table, it is easy to check if there can be uncommitted modifications for a
+table: just check if there are locks in the lock list of the table.
+
+2) When a transaction inside InnoDB commits, it reads the global trx id
+counter and stores the value INV_TRX_ID to the tables on which it had a lock.
+
+3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
+InnoDB calls an invalidate method for the MySQL query cache for that table.
+
+How this is implemented inside sql_cache.cc:
+
+1) The query cache for an InnoDB table TBL is invalidated immediately at an
+INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
+invalidation to the transaction commit.
+
+2) To store or retrieve a value from the query cache of an InnoDB table TBL,
+any query must first ask InnoDB's permission. We must pass the thd as a
+parameter because InnoDB will look at the trx id, if any, associated with
+that thd.
+
+3) Use of the query cache for InnoDB tables is now allowed also when
+AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/**********************************************************************
+The MySQL query cache uses this to check from InnoDB if the query cache at
+the moment is allowed to operate on an InnoDB table. The SQL query must
+be a non-locking SELECT.
+
+The query cache is allowed to operate on a certain query only if this
+function returns TRUE for all tables in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of processing a SELECT. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB kernel mutex.
+Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
+the InnoDB kernel mutex. */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+ /* out: TRUE if permitted, FALSE if not;
+ note that the value FALSE does not mean
+ we should invalidate the query cache:
+ invalidation is called explicitly */
+ THD* thd, /* in: thd of the user who is trying to
+ store a result to the query cache or
+ retrieve it */
+ char* full_name, /* in: concatenation of database name,
+ the null character '\0', and the table
+ name */
+ uint full_name_len, /* in: length of the full name, i.e.
+ len(dbname) + len(tablename) + 1 */
+ ulonglong *unused) /* unused for this engine */
+{
+ ibool is_autocommit;
+ trx_t* trx;
+ char norm_name[1000];
+
+ ut_a(full_name_len < 999);
+
+ trx = check_trx_exists(thd);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+ plain SELECT if AUTOCOMMIT is not on. */
+
+ return((my_bool)FALSE);
+ }
+
+ if (trx->has_search_latch) {
+ sql_print_error("The calling thread is holding the adaptive "
+ "search latch, though calling "
+ "innobase_query_caching_of_table_permitted.");
+
+ mutex_enter(&kernel_mutex);
+ trx_print(stderr, trx, 1024);
+ mutex_exit(&kernel_mutex);
+ }
+
+ innobase_release_stat_resources(trx);
+
+ if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ is_autocommit = TRUE;
+ } else {
+ is_autocommit = FALSE;
+ }
+
+ if (is_autocommit && trx->n_mysql_tables_in_use == 0) {
+ /* We are going to retrieve the query result from the query
+ cache. This cannot be a store operation to the query cache
+ because then MySQL would have locks on tables already.
+
+ TODO: if the user has used LOCK TABLES to lock the table,
+ then we open a transaction in the call of row_.. below.
+ That trx can stay open until UNLOCK TABLES. The same problem
+ exists even if we do not use the query cache. MySQL should be
+ modified so that it ALWAYS calls some cleanup function when
+ the processing of a query ends!
+
+ We can imagine we instantaneously serialize this consistent
+ read trx to the current trx id counter. If trx2 would have
+ changed the tables of a query result stored in the cache, and
+ trx2 would have already committed, making the result obsolete,
+ then trx2 would have already invalidated the cache. Thus we
+ can trust the result in the cache is ok for this query. */
+
+ return((my_bool)TRUE);
+ }
+
+ /* Normalize the table name to InnoDB format */
+
+ memcpy(norm_name, full_name, full_name_len);
+
+ norm_name[strlen(norm_name)] = '/'; /* InnoDB uses '/' as the
+ separator between db and table */
+ norm_name[full_name_len] = '\0';
+#ifdef __WIN__
+ innobase_casedn_str(norm_name);
+#endif
+ /* The call of row_search_.. will start a new transaction if it is
+ not yet started */
+
+ if (trx->active_trans == 0) {
+
+ innobase_register_trx_and_stmt(innodb_hton_ptr, thd);
+ trx->active_trans = 1;
+ }
+
+ if (row_search_check_if_query_cache_permitted(trx, norm_name)) {
+
+ /* printf("Query cache for %s permitted\n", norm_name); */
+
+ return((my_bool)TRUE);
+ }
+
+ /* printf("Query cache for %s NOT permitted\n", norm_name); */
+
+ return((my_bool)FALSE);
+}
+
+/*********************************************************************
+Invalidates the MySQL query cache for the table.
+NOTE that the exact prototype of this function has to be in
+/xtradb/row/row0ins.c! */
+extern "C" UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /* in: transaction which modifies the table */
+ char* full_name, /* in: concatenation of database name, null
+ char '\0', table name, null char '\0';
+ NOTE that in Windows this is always
+ in LOWER CASE! */
+ ulint full_name_len) /* in: full name length where also the null
+ chars count */
+{
+ /* Note that the sync0sync.h rank of the query cache mutex is just
+ above the InnoDB kernel mutex. The caller of this function must not
+ have latches of a lower rank.
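+
+ An editorial example of the full_name layout documented above: for
+ database "test" and table "t1" the buffer contains
+
+	"test\0t1\0"
+
+ and full_name_len is 8, i.e. 4 + 1 + 2 + 1, counting both NUL bytes.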
*/ + + /* Argument TRUE below means we are using transactions */ +#ifdef HAVE_QUERY_CACHE + mysql_query_cache_invalidate4((THD*) trx->mysql_thd, + (const char*) full_name, + (uint32) full_name_len, + TRUE); +#endif +} + +/********************************************************************* +Convert an SQL identifier to the MySQL system_charset_info (UTF-8) +and quote it if needed. */ +static +char* +innobase_convert_identifier( +/*========================*/ + /* out: pointer to the end of buf */ + char* buf, /* out: buffer for converted identifier */ + ulint buflen, /* in: length of buf, in bytes */ + const char* id, /* in: identifier to convert */ + ulint idlen, /* in: length of id, in bytes */ + void* thd, /* in: MySQL connection thread, or NULL */ + ibool file_id)/* in: TRUE=id is a table or database name; + FALSE=id is an UTF-8 string */ +{ + char nz[NAME_LEN + 1]; + char nz2[NAME_LEN + 1 + sizeof srv_mysql50_table_name_prefix]; + + const char* s = id; + int q; + + if (file_id) { + /* Decode the table name. The filename_to_tablename() + function expects a NUL-terminated string. The input and + output strings buffers must not be shared. */ + + if (UNIV_UNLIKELY(idlen > (sizeof nz) - 1)) { + idlen = (sizeof nz) - 1; + } + + memcpy(nz, id, idlen); + nz[idlen] = 0; + + s = nz2; + idlen = filename_to_tablename(nz, nz2, sizeof nz2); + } + + /* See if the identifier needs to be quoted. */ + if (UNIV_UNLIKELY(!thd)) { + q = '"'; + } else { + q = get_quote_char_for_identifier((THD*) thd, s, (int) idlen); + } + + if (q == EOF) { + if (UNIV_UNLIKELY(idlen > buflen)) { + idlen = buflen; + } + memcpy(buf, s, idlen); + return(buf + idlen); + } + + /* Quote the identifier. */ + if (buflen < 2) { + return(buf); + } + + *buf++ = q; + buflen--; + + for (; idlen; idlen--) { + int c = *s++; + if (UNIV_UNLIKELY(c == q)) { + if (UNIV_UNLIKELY(buflen < 3)) { + break; + } + + *buf++ = c; + *buf++ = c; + buflen -= 2; + } else { + if (UNIV_UNLIKELY(buflen < 2)) { + break; + } + + *buf++ = c; + buflen--; + } + } + + *buf++ = q; + return(buf); +} + +/********************************************************************* +Convert a table or index name to the MySQL system_charset_info (UTF-8) +and quote it if needed. */ +extern "C" UNIV_INTERN +char* +innobase_convert_name( +/*==================*/ + /* out: pointer to the end of buf */ + char* buf, /* out: buffer for converted identifier */ + ulint buflen, /* in: length of buf, in bytes */ + const char* id, /* in: identifier to convert */ + ulint idlen, /* in: length of id, in bytes */ + void* thd, /* in: MySQL connection thread, or NULL */ + ibool table_id)/* in: TRUE=id is a table or database name; + FALSE=id is an index name */ +{ + char* s = buf; + const char* bufend = buf + buflen; + + if (table_id) { + const char* slash = (const char*) memchr(id, '/', idlen); + if (!slash) { + + goto no_db_name; + } + + /* Print the database name and table name separately. 
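+ For example (editorial): with the default quote character, id "test/t1"
+ is emitted as `test`.`t1`.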
*/
+ s = innobase_convert_identifier(s, bufend - s, id, slash - id,
+ thd, TRUE);
+ if (UNIV_LIKELY(s < bufend)) {
+ *s++ = '.';
+ s = innobase_convert_identifier(s, bufend - s,
+ slash + 1, idlen
+ - (slash - id) - 1,
+ thd, TRUE);
+ }
+ } else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
+ /* Temporary index name (smart ALTER TABLE) */
+ const char temp_index_suffix[]= "--temporary--";
+
+ s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1,
+ thd, FALSE);
+ if (s - buf + (sizeof temp_index_suffix - 1) < buflen) {
+ memcpy(s, temp_index_suffix,
+ sizeof temp_index_suffix - 1);
+ s += sizeof temp_index_suffix - 1;
+ }
+ } else {
+no_db_name:
+ s = innobase_convert_identifier(buf, buflen, id, idlen,
+ thd, table_id);
+ }
+
+ return(s);
+
+}
+
+/**************************************************************************
+Determines if the currently running transaction has been interrupted. */
+extern "C" UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+ /* out: TRUE if interrupted */
+ trx_t* trx) /* in: transaction */
+{
+ return(trx && trx->mysql_thd && thd_killed((THD*) trx->mysql_thd));
+}
+
+/******************************************************************
+Resets some fields of a prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+static
+void
+reset_template(
+/*===========*/
+ row_prebuilt_t* prebuilt) /* in/out: prebuilt struct */
+{
+ prebuilt->keep_other_fields_on_keyread = 0;
+ prebuilt->read_just_key = 0;
+}
+
+/*********************************************************************
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_idx() etc. Actually, we can let the cursor stay open even
+over a transaction commit! Then you should call this before every operation,
+fetch next etc. This function initializes the necessary things even after a
+transaction commit. */
+UNIV_INTERN
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+ /* If current thd does not yet have a trx struct, create one.
+ If the current handle does not yet have a prebuilt struct, create
+ one. Update the trx pointers in the prebuilt struct. Normally
+ this operation is done in external_lock. */
+
+ update_thd(ha_thd());
+
+ /* Initialize the prebuilt struct much like it would be inited in
+ external_lock */
+
+ innobase_release_stat_resources(prebuilt->trx);
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started(prebuilt->trx);
+
+ /* Assign a read view if the transaction does not have it yet */
+
+ trx_assign_read_view(prebuilt->trx);
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+
+ if (prebuilt->trx->active_trans == 0) {
+
+ innobase_register_trx_and_stmt(ht, user_thd);
+
+ prebuilt->trx->active_trans = 1;
+ }
+
+ /* We did the necessary inits in this function, no need to repeat them
+ in row_search_for_mysql */
+
+ prebuilt->sql_stat_start = FALSE;
+
+ /* We always let HANDLER do the reads as consistent reads, even
+ if the trx isolation level would have been specified as SERIALIZABLE */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+
+ /* Always fetch all columns in the index record */
+
+ prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+ /* Do we always want to fetch all columns in the whole row? Or do
+ we????
*/
+
+ prebuilt->used_in_HANDLER = TRUE;
+ reset_template(prebuilt);
+}
+
+/*************************************************************************
+Opens an InnoDB database. */
+static
+int
+innobase_init(
+/*==========*/
+ /* out: 0 on success, error code on failure */
+ void *p) /* in: InnoDB handlerton */
+{
+ static char current_dir[3]; /* Set if using current lib */
+ int err;
+ bool ret;
+ char *default_path;
+ uint format_id;
+
+ DBUG_ENTER("innobase_init");
+ handlerton *innobase_hton= (handlerton *)p;
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+ if (!innodb_plugin_init()) {
+ sql_print_error("InnoDB plugin init failed.");
+ DBUG_RETURN(-1);
+ }
+
+ if (innodb_hton_ptr) {
+ /* Patch the statically linked handlerton and variables */
+ innobase_hton = innodb_hton_ptr;
+ }
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+ innodb_hton_ptr = innobase_hton;
+
+ innobase_hton->state = SHOW_OPTION_YES;
+ innobase_hton->db_type= DB_TYPE_INNODB;
+ innobase_hton->savepoint_offset=sizeof(trx_named_savept_t);
+ innobase_hton->close_connection=innobase_close_connection;
+ innobase_hton->savepoint_set=innobase_savepoint;
+ innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
+ innobase_hton->savepoint_release=innobase_release_savepoint;
+ innobase_hton->commit=innobase_commit;
+ innobase_hton->rollback=innobase_rollback;
+ innobase_hton->prepare=innobase_xa_prepare;
+ innobase_hton->recover=innobase_xa_recover;
+ innobase_hton->commit_by_xid=innobase_commit_by_xid;
+ innobase_hton->rollback_by_xid=innobase_rollback_by_xid;
+ innobase_hton->create_cursor_read_view=innobase_create_cursor_view;
+ innobase_hton->set_cursor_read_view=innobase_set_cursor_view;
+ innobase_hton->close_cursor_read_view=innobase_close_cursor_view;
+ innobase_hton->create=innobase_create_handler;
+ innobase_hton->drop_database=innobase_drop_database;
+ innobase_hton->panic=innobase_end;
+ innobase_hton->start_consistent_snapshot=innobase_start_trx_and_assign_read_view;
+ innobase_hton->flush_logs=innobase_flush_logs;
+ innobase_hton->show_status=innobase_show_status;
+ innobase_hton->flags=HTON_NO_FLAGS;
+ innobase_hton->release_temporary_latches=innobase_release_temporary_latches;
+ innobase_hton->alter_table_flags = innobase_alter_table_flags;
+
+ ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
+
+#ifdef UNIV_DEBUG
+ static const char test_filename[] = "-@";
+ char test_tablename[sizeof test_filename
+ + sizeof srv_mysql50_table_name_prefix];
+ if ((sizeof test_tablename) - 1
+ != filename_to_tablename(test_filename, test_tablename,
+ sizeof test_tablename)
+ || strncmp(test_tablename,
+ srv_mysql50_table_name_prefix,
+ sizeof srv_mysql50_table_name_prefix)
+ || strcmp(test_tablename
+ + sizeof srv_mysql50_table_name_prefix,
+ test_filename)) {
+ sql_print_error("tablename encoding has been changed");
+ goto error;
+ }
+#endif /* UNIV_DEBUG */
+
+#ifndef MYSQL_SERVER
+ innobase_overwrite_relay_log_info = FALSE;
+#endif
+
+#ifdef HAVE_REPLICATION
+#ifdef MYSQL_SERVER
+ /* read master log position from relay-log.info if it exists */
+ char fname[FN_REFLEN+128];
+ int pos;
+ int info_fd;
+ IO_CACHE info_file;
+
+ fname[0] = '\0';
+
+ if(innobase_overwrite_relay_log_info) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: innodb_overwrite_relay_log_info is enabled."
+ " Updates in other storage engines may have problem with consistency.\n"); + + bzero((char*) &info_file, sizeof(info_file)); + fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32); + + int error=0; + + if (!access(fname,F_OK)) { + /* exist */ + if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) { + error=1; + } else if (init_io_cache(&info_file, info_fd, IO_SIZE*2, + READ_CACHE, 0L, 0, MYF(MY_WME))) { + error=1; + } + + if (error) { +relay_info_error: + if (info_fd >= 0) + my_close(info_fd, MYF(0)); + fname[0] = '\0'; + goto skip_relay; + } + } else { + fname[0] = '\0'; + goto skip_relay; + } + + if (init_strvar_from_file(fname, sizeof(fname), &info_file, "") || /* dummy (it is relay-log) */ + init_intvar_from_file(&pos, &info_file, BIN_LOG_HEADER_SIZE)) { + end_io_cache(&info_file); + error=1; + goto relay_info_error; + } + + fprintf(stderr, + "InnoDB: relay-log.info is detected.\n" + "InnoDB: relay log: position %u, file name %s\n", + pos, fname); + + strncpy(trx_sys_mysql_relay_log_name, fname, TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN); + trx_sys_mysql_relay_log_pos = (ib_int64_t) pos; + + if (init_strvar_from_file(fname, sizeof(fname), &info_file, "") || + init_intvar_from_file(&pos, &info_file, 0)) { + end_io_cache(&info_file); + error=1; + goto relay_info_error; + } + + fprintf(stderr, + "InnoDB: master log: position %u, file name %s\n", + pos, fname); + + strncpy(trx_sys_mysql_master_log_name, fname, TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN); + trx_sys_mysql_master_log_pos = (ib_int64_t) pos; + + end_io_cache(&info_file); + if (info_fd >= 0) + my_close(info_fd, MYF(0)); + } +skip_relay: +#endif /* MYSQL_SERVER */ +#endif /* HAVE_REPLICATION */ + + /* Check that values don't overflow on 32-bit systems. */ + if (sizeof(ulint) == 4) { + if (innobase_buffer_pool_size > UINT_MAX32) { + sql_print_error( + "innobase_buffer_pool_size can't be over 4GB" + " on 32-bit systems"); + + goto error; + } + + if (innobase_log_file_size > UINT_MAX32) { + sql_print_error( + "innobase_log_file_size can't be over 4GB" + " on 32-bit systems"); + + goto error; + } + } + + os_innodb_umask = (ulint)my_umask; + + /* First calculate the default path for innodb_data_home_dir etc., + in case the user has not given any value. + + Note that when using the embedded server, the datadirectory is not + necessarily the current directory of this program. */ + + if (mysqld_embedded) { + default_path = mysql_real_data_home; + fil_path_to_mysql_datadir = mysql_real_data_home; + } else { + /* It's better to use current lib, to keep paths short */ + current_dir[0] = FN_CURLIB; + current_dir[1] = FN_LIBCHAR; + current_dir[2] = 0; + default_path = current_dir; + } + + ut_a(default_path); + + if (specialflag & SPECIAL_NO_PRIOR) { + srv_set_thread_priorities = FALSE; + } else { + srv_set_thread_priorities = TRUE; + srv_query_thread_priority = QUERY_PRIOR; + } + + /* Set InnoDB initialization parameters according to the values + read from MySQL .cnf file */ + + /*--------------- Data files -------------------------*/ + + /* The default dir for data files is the datadir of MySQL */ + + srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir : + default_path); + + /* Set default InnoDB data file size to 10 MB and let it be + auto-extending. Thus users can use InnoDB in >= 4.0 without having + to specify any startup options. 
*/ + + if (!innobase_data_file_path) { + innobase_data_file_path = (char*) "ibdata1:10M:autoextend"; + } + + /* Since InnoDB edits the argument in the next call, we make another + copy of it: */ + + internal_innobase_data_file_path = my_strdup(innobase_data_file_path, + MYF(MY_FAE)); + + ret = (bool) srv_parse_data_file_paths_and_sizes( + internal_innobase_data_file_path); + if (ret == FALSE) { + sql_print_error( + "InnoDB: syntax error in innodb_data_file_path"); +mem_free_and_error: + srv_free_paths_and_sizes(); + my_free(internal_innobase_data_file_path, + MYF(MY_ALLOW_ZERO_PTR)); + goto error; + } + +#ifdef HAVE_REPLICATION +#ifdef MYSQL_SERVER + if(innobase_overwrite_relay_log_info) { + /* If InnoDB progressed from relay-log.info, overwrite it */ + if (fname[0] == '\0') { + fprintf(stderr, + "InnoDB: something is wrong with relay-log.info. InnoDB will not overwrite it.\n"); + } else if (0 != strcmp(fname, trx_sys_mysql_master_log_name) + || pos != trx_sys_mysql_master_log_pos) { + /* Overwrite relay-log.info */ + bzero((char*) &info_file, sizeof(info_file)); + fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32); + + int error = 0; + + if (!access(fname,F_OK)) { + /* exists */ + if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) { + error = 1; + } else if (init_io_cache(&info_file, info_fd, IO_SIZE*2, + WRITE_CACHE, 0L, 0, MYF(MY_WME))) { + error = 1; + } + + if (error) { + if (info_fd >= 0) + my_close(info_fd, MYF(0)); + goto skip_overwrite; + } + } else { + error = 1; + goto skip_overwrite; + } + + char buff[FN_REFLEN*2+22*2+4], *pos; + + my_b_seek(&info_file, 0L); + pos=strmov(buff, trx_sys_mysql_relay_log_name); + *pos++='\n'; + pos=longlong2str(trx_sys_mysql_relay_log_pos, pos, 10); + *pos++='\n'; + pos=strmov(pos, trx_sys_mysql_master_log_name); + *pos++='\n'; + pos=longlong2str(trx_sys_mysql_master_log_pos, pos, 10); + *pos='\n'; + + if (my_b_write(&info_file, (uchar*) buff, (size_t) (pos-buff)+1)) + error = 1; + if (flush_io_cache(&info_file)) + error = 1; + + end_io_cache(&info_file); + if (info_fd >= 0) + my_close(info_fd, MYF(0)); +skip_overwrite: + if (error) { + fprintf(stderr, + "InnoDB: ERROR: an error occurred while overwriting relay-log.info.\n"); + } else { + fprintf(stderr, + "InnoDB: relay-log.info was overwritten.\n"); + } + } else { + fprintf(stderr, + "InnoDB: InnoDB and relay-log.info are synchronized.
 InnoDB will not overwrite it.\n"); + } + } +#endif /* MYSQL_SERVER */ +#endif /* HAVE_REPLICATION */ + + + srv_extra_undoslots = (ibool) innobase_extra_undoslots; + + /* -------------- Log files ---------------------------*/ + + /* The default dir for log files is the datadir of MySQL */ + + if (!innobase_log_group_home_dir) { + innobase_log_group_home_dir = default_path; + } + +#ifdef UNIV_LOG_ARCHIVE + /* Since innodb_log_arch_dir has no relevance under MySQL, + starting from 4.0.6 we always set it the same as + innodb_log_group_home_dir: */ + + innobase_log_arch_dir = innobase_log_group_home_dir; + + srv_arch_dir = innobase_log_arch_dir; +#endif /* UNIV_LOG_ARCHIVE */ + + ret = (bool) + srv_parse_log_group_home_dirs(innobase_log_group_home_dir); + + if (ret == FALSE || innobase_mirrored_log_groups != 1) { + sql_print_error("syntax error in innodb_log_group_home_dir, or a " + "wrong number of mirrored log groups"); + + goto mem_free_and_error; + } + + /* Validate the file format by animal name */ + if (innobase_file_format_name != NULL) { + + format_id = innobase_file_format_name_lookup( + innobase_file_format_name); + + if (format_id > DICT_TF_FORMAT_MAX) { + + sql_print_error("InnoDB: wrong innodb_file_format."); + + goto mem_free_and_error; + } + } else { + /* Set it to the default file format id. Though this + should never happen. */ + format_id = 0; + } + + srv_file_format = format_id; + + /* Given the type of innobase_file_format_name we have little + choice but to cast away the constness from the returned name. + innobase_file_format_name is used in the MySQL set variable + interface and so can't be const. */ + + innobase_file_format_name = + (char*) trx_sys_file_format_id_to_name(format_id); + + /* Process innobase_file_format_check variable */ + ut_a(innobase_file_format_check != NULL); + + /* As a side effect it will set srv_check_file_format_at_startup + on valid input. First we check for "on"/"off". */ + if (!innobase_file_format_check_on_off(innobase_file_format_check)) { + + /* Did the user specify a format name that we support ?
+ As a side effect it will update the variable + srv_check_file_format_at_startup */ + if (!innobase_file_format_check_validate( + innobase_file_format_check)) { + + sql_print_error("InnoDB: invalid " + "innodb_file_format_check value: " + "should be either 'on' or 'off' or " + "any value up to %s or its " + "equivalent numeric id", + trx_sys_file_format_id_to_name( + DICT_TF_FORMAT_MAX)); + + goto mem_free_and_error; + } + } + + ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values)); + innobase_change_buffering = (char*) + innobase_change_buffering_values[ibuf_use]; + + /* --------------------------------------------------*/ + + srv_file_flush_method_str = innobase_unix_file_flush_method; + + srv_n_log_groups = (ulint) innobase_mirrored_log_groups; + srv_n_log_files = (ulint) innobase_log_files_in_group; + srv_log_file_size = (ulint) innobase_log_file_size; + +#ifdef UNIV_LOG_ARCHIVE + srv_log_archive_on = (ulint) innobase_log_archive; +#endif /* UNIV_LOG_ARCHIVE */ + srv_log_buffer_size = (ulint) innobase_log_buffer_size; + + srv_buf_pool_size = (ulint) innobase_buffer_pool_size; + + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; + + srv_n_file_io_threads = (ulint) innobase_file_io_threads; + srv_n_read_io_threads = (ulint) innobase_read_io_threads; + srv_n_write_io_threads = (ulint) innobase_write_io_threads; + + srv_read_ahead &= 3; + + srv_force_recovery = (ulint) innobase_force_recovery; + + srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; + srv_use_checksums = (ibool) innobase_use_checksums; + +#ifdef HAVE_LARGE_PAGES + if ((os_use_large_pages = (ibool) my_use_large_pages)) + os_large_page_size = (ulint) opt_large_page_size; +#endif + + row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout; + + srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog; + + srv_max_n_open_files = (ulint) innobase_open_files; + srv_innodb_status = (ibool) innobase_create_status_file; + + srv_print_verbose_log = mysqld_embedded ? 0 : 1; + + /* Store the default charset-collation number of this MySQL + installation */ + + data_mysql_default_charset_coll = (ulint)default_charset_info->number; + + ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL == + my_charset_latin1.number); + ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number); + + /* Store the latin1_swedish_ci character ordering table to InnoDB. For + non-latin1_swedish_ci charsets we use the MySQL comparison functions, + and consequently we do not need to know the ordering internally in + InnoDB. */ + + ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci")); + srv_latin1_ordering = my_charset_latin1.sort_order; + + /* Since we in this module access directly the fields of a trx + struct, and due to different headers and flags it might happen that + mutex_t has a different size in this module and in InnoDB + modules, we check at run time that the size is the same in + these compilation modules. 
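(Illustrative aside, not part of this patch: the run-time check described above can be reduced to comparing the size each compilation unit reports for the shared struct. The two constants below are hypothetical stand-ins for the values the two modules would report.)

// Sketch only: abort startup if two modules disagree on mutex_t's size.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

static const size_t handler_sizeof_mutex = 40;	// as seen by this module
static const size_t innodb_sizeof_mutex  = 40;	// as seen by InnoDB proper

int main()
{
	if (handler_sizeof_mutex != innodb_sizeof_mutex) {
		fputs("mutex_t size mismatch between modules\n", stderr);
		exit(1);
	}
	puts("mutex_t layout is consistent");
	return 0;
}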
*/ + + err = innobase_start_or_create_for_mysql(); + + if (err != DB_SUCCESS) { + goto mem_free_and_error; + } + + innobase_open_tables = hash_create(200); + pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST); + pthread_cond_init(&commit_cond, NULL); + innodb_inited= 1; +#ifdef MYSQL_DYNAMIC_PLUGIN + if (innobase_hton != p) { + innobase_hton = reinterpret_cast<handlerton*>(p); + *innobase_hton = *innodb_hton_ptr; + } +#endif /* MYSQL_DYNAMIC_PLUGIN */ + + /* Get the current high water mark format. */ + innobase_file_format_check = (char*) trx_sys_file_format_max_get(); + + DBUG_RETURN(FALSE); +error: + DBUG_RETURN(TRUE); +} + +/*********************************************************************** +Closes an InnoDB database. */ +static +int +innobase_end(handlerton *hton, ha_panic_function type) +/*==============*/ + /* out: TRUE if error */ +{ + int err= 0; + + DBUG_ENTER("innobase_end"); + DBUG_ASSERT(hton == innodb_hton_ptr); + +#ifdef __NETWARE__ /* some special cleanup for NetWare */ + if (nw_panic) { + set_panic_flag_for_netware(); + } +#endif + if (innodb_inited) { + + srv_fast_shutdown = (ulint) innobase_fast_shutdown; + innodb_inited = 0; + hash_table_free(innobase_open_tables); + innobase_open_tables = NULL; + if (innobase_shutdown_for_mysql() != DB_SUCCESS) { + err = 1; + } + srv_free_paths_and_sizes(); + my_free(internal_innobase_data_file_path, + MYF(MY_ALLOW_ZERO_PTR)); + pthread_mutex_destroy(&innobase_share_mutex); + pthread_mutex_destroy(&prepare_commit_mutex); + pthread_mutex_destroy(&commit_threads_m); + pthread_mutex_destroy(&commit_cond_m); + pthread_cond_destroy(&commit_cond); + } + + DBUG_RETURN(err); +} + +/******************************************************************** +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes +the logs, and the name of this function should be innobase_checkpoint. */ +static +bool +innobase_flush_logs(handlerton *hton) +/*=====================*/ + /* out: TRUE if error */ +{ + bool result = 0; + + DBUG_ENTER("innobase_flush_logs"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + log_buffer_flush_to_disk(); + + DBUG_RETURN(result); +} + +/******************************************************************** +Return alter table flags supported in an InnoDB database. */ +static +uint +innobase_alter_table_flags( +/*=======================*/ + uint flags) +{ + return(HA_ONLINE_ADD_INDEX_NO_WRITES + | HA_ONLINE_DROP_INDEX_NO_WRITES + | HA_ONLINE_ADD_UNIQUE_INDEX_NO_WRITES + | HA_ONLINE_DROP_UNIQUE_INDEX_NO_WRITES + | HA_ONLINE_ADD_PK_INDEX_NO_WRITES); +} + +/********************************************************************* +Commits a transaction in an InnoDB database. */ +static +void +innobase_commit_low( +/*================*/ + trx_t* trx) /* in: transaction handle */ +{ + if (trx->conc_state == TRX_NOT_STARTED) { + + return; + } + +#ifdef HAVE_REPLICATION +#ifdef MYSQL_SERVER + THD *thd=current_thd; + + if (thd && thd->slave_thread) { + /* Update the replication position info inside InnoDB. + In embedded server, does nothing.
*/ + const char *log_file_name, *group_relay_log_name; + ulonglong log_pos, relay_log_pos; + bool res = rpl_get_position_info(&log_file_name, &log_pos, + &group_relay_log_name, + &relay_log_pos); + if (res) { + trx->mysql_master_log_file_name = log_file_name; + trx->mysql_master_log_pos = (ib_int64_t)log_pos; + trx->mysql_relay_log_file_name = group_relay_log_name; + trx->mysql_relay_log_pos = (ib_int64_t)relay_log_pos; + } + } +#endif /* MYSQL_SERVER */ +#endif /* HAVE_REPLICATION */ + + trx_commit_for_mysql(trx); +} + +/********************************************************************* +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. */ +static +int +innobase_start_trx_and_assign_read_view( +/*====================================*/ + /* out: 0 */ + handlerton *hton, /* in: Innodb handlerton */ + THD* thd) /* in: MySQL thread handle of the user for whom + the transaction should be committed */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_start_trx_and_assign_read_view"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* Create a new trx struct for thd, if it does not yet have one */ + + trx = check_trx_exists(thd); + + /* This is just to play safe: release a possible FIFO ticket and + search latch. Since we will reserve the kernel mutex, we have to + release the search system latch first to obey the latching order. */ + + innobase_release_stat_resources(trx); + + /* If the transaction is not started yet, start it */ + + trx_start_if_not_started(trx); + + /* Assign a read view if the transaction does not have it yet */ + + trx_assign_read_view(trx); + + /* Set the MySQL flag to mark that there is an active transaction */ + + if (trx->active_trans == 0) { + innobase_register_trx_and_stmt(hton, thd); + trx->active_trans = 1; + } + + DBUG_RETURN(0); +} + +/********************************************************************* +Commits a transaction in an InnoDB database or marks an SQL statement +ended. */ +static +int +innobase_commit( +/*============*/ + /* out: 0 */ + handlerton *hton, /* in: Innodb handlerton */ + THD* thd, /* in: MySQL thread handle of the user for whom + the transaction should be committed */ + bool all) /* in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_commit"); + DBUG_ASSERT(hton == innodb_hton_ptr); + DBUG_PRINT("trans", ("ending transaction")); + + trx = check_trx_exists(thd); + + /* Since we will reserve the kernel mutex, we have to release + the search system latch first to obey the latching order. */ + + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + /* The flag trx->active_trans is set to 1 in + + 1. ::external_lock(), + 2. ::start_stmt(), + 3. innobase_query_caching_of_table_permitted(), + 4. innobase_savepoint(), + 5. ::init_table_handle_for_HANDLER(), + 6. innobase_start_trx_and_assign_read_view(), + 7. ::transactional_table_lock() + + and it is only set to 0 in a commit or a rollback. If it is 0 we know + there cannot be resources to be freed and we could return immediately. + For the time being, we play safe and do the cleanup though there should + be nothing to clean up. 
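(Illustrative aside, not part of this patch: a little further down, innobase_commit() throttles committing threads with commit_cond/commit_cond_m whenever srv_commit_concurrency > 0. The same idea in standalone form, using a conventional while-wait loop instead of the goto-retry pattern the real code uses:)

// Sketch only: at most max_commit_threads may be in the commit section.
#include <pthread.h>
#include <cstdio>

static pthread_mutex_t commit_cond_m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  commit_cond   = PTHREAD_COND_INITIALIZER;
static int commit_threads = 0;
static const int max_commit_threads = 2;  // stands in for srv_commit_concurrency

static void enter_commit_section()
{
	pthread_mutex_lock(&commit_cond_m);
	while (commit_threads >= max_commit_threads) {
		pthread_cond_wait(&commit_cond, &commit_cond_m);  // wait for a slot
	}
	commit_threads++;
	pthread_mutex_unlock(&commit_cond_m);
}

static void leave_commit_section()
{
	pthread_mutex_lock(&commit_cond_m);
	commit_threads--;
	pthread_cond_signal(&commit_cond);  // wake one waiting committer
	pthread_mutex_unlock(&commit_cond_m);
}

int main()
{
	enter_commit_section();
	puts("inside the commit section");
	leave_commit_section();
	return 0;
}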
*/ + + if (trx->active_trans == 0 + && trx->conc_state != TRX_NOT_STARTED) { + + sql_print_error("trx->active_trans == 0, but" + " trx->conc_state != TRX_NOT_STARTED"); + } + if (all + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* We were instructed to commit the whole transaction, or + this is an SQL statement end and autocommit is on */ + + /* We need current binlog position for ibbackup to work. + Note, the position is current because of + prepare_commit_mutex */ +retry: + if (srv_commit_concurrency > 0) { + pthread_mutex_lock(&commit_cond_m); + commit_threads++; + + if (commit_threads > srv_commit_concurrency) { + commit_threads--; + pthread_cond_wait(&commit_cond, + &commit_cond_m); + pthread_mutex_unlock(&commit_cond_m); + goto retry; + } + else { + pthread_mutex_unlock(&commit_cond_m); + } + } + + trx->mysql_log_file_name = mysql_bin_log_file_name(); + trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos(); + + innobase_commit_low(trx); + + if (srv_commit_concurrency > 0) { + pthread_mutex_lock(&commit_cond_m); + commit_threads--; + pthread_cond_signal(&commit_cond); + pthread_mutex_unlock(&commit_cond_m); + } + + if (trx->active_trans == 2) { + + pthread_mutex_unlock(&prepare_commit_mutex); + } + + trx->active_trans = 0; + + } else { + /* We just mark the SQL statement ended and do not do a + transaction commit */ + + /* If we had reserved the auto-inc lock for some + table in this SQL statement we release it now */ + + row_unlock_table_autoinc_for_mysql(trx); + + /* Store the current undo_no of the transaction so that we + know where to roll back if we have to roll back the next + SQL statement */ + + trx_mark_sql_stat_end(trx); + } + + trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */ + + if (trx->declared_to_be_inside_innodb) { + /* Release our possible ticket in the FIFO */ + + srv_conc_force_exit_innodb(trx); + } + + /* Tell the InnoDB server that there might be work for utility + threads: */ + srv_active_wake_master_thread(); + + DBUG_RETURN(0); +} + +/********************************************************************* +Rolls back a transaction or the latest SQL statement. */ +static +int +innobase_rollback( +/*==============*/ + /* out: 0 or error number */ + handlerton *hton, /* in: Innodb handlerton */ + THD* thd, /* in: handle to the MySQL thread of the user + whose transaction should be rolled back */ + bool all) /* in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + int error = 0; + trx_t* trx; + + DBUG_ENTER("innobase_rollback"); + DBUG_ASSERT(hton == innodb_hton_ptr); + DBUG_PRINT("trans", ("aborting transaction")); + + trx = check_trx_exists(thd); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the kernel mutex, we have to release the search system latch + first to obey the latching order. 
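(Illustrative aside, not part of this patch: the latching-order rule stated above, reduced to two pthread mutexes -- a thread about to take the higher-order lock first gives up the lower-order latch, so the two locks are never requested in the reverse order by different threads.)

// Sketch only.
#include <pthread.h>
#include <cstdio>

static pthread_mutex_t search_latch = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t kernel_mutex = PTHREAD_MUTEX_INITIALIZER;

static void rollback_sketch(bool holds_search_latch)
{
	if (holds_search_latch) {
		pthread_mutex_unlock(&search_latch);  // release first...
	}
	pthread_mutex_lock(&kernel_mutex);            // ...then acquire
	// ... the rollback work proper happens under the kernel mutex ...
	pthread_mutex_unlock(&kernel_mutex);
}

int main()
{
	pthread_mutex_lock(&search_latch);  // pretend we hold the latch
	rollback_sketch(true);
	puts("rolled back without violating the latching order");
	return 0;
}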
*/ + + innobase_release_stat_resources(trx); + + /* If we had reserved the auto-inc lock for some table (if + we come here to roll back the latest SQL statement) we + release it now before a possibly lengthy rollback */ + + row_unlock_table_autoinc_for_mysql(trx); + + if (all + || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + error = trx_rollback_for_mysql(trx); + trx->active_trans = 0; + } else { + error = trx_rollback_last_sql_stat_for_mysql(trx); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/********************************************************************* +Rolls back a transaction */ +static +int +innobase_rollback_trx( +/*==================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: transaction */ +{ + int error = 0; + + DBUG_ENTER("innobase_rollback_trx"); + DBUG_PRINT("trans", ("aborting transaction")); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the kernel mutex, we have to release the search system latch + first to obey the latching order. */ + + innobase_release_stat_resources(trx); + + /* If we had reserved the auto-inc lock for some table (if + we come here to roll back the latest SQL statement) we + release it now before a possibly lengthy rollback */ + + row_unlock_table_autoinc_for_mysql(trx); + + error = trx_rollback_for_mysql(trx); + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/********************************************************************* +Rolls back a transaction to a savepoint. */ +static +int +innobase_rollback_to_savepoint( +/*===========================*/ + /* out: 0 if success, HA_ERR_NO_SAVEPOINT if + no savepoint with the given name */ + handlerton *hton, /* in: Innodb handlerton */ + THD* thd, /* in: handle to the MySQL thread of the user + whose transaction should be rolled back */ + void* savepoint) /* in: savepoint data */ +{ + ib_int64_t mysql_binlog_cache_pos; + int error = 0; + trx_t* trx; + char name[64]; + + DBUG_ENTER("innobase_rollback_to_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the kernel mutex, we have to release the search system latch + first to obey the latching order. */ + + innobase_release_stat_resources(trx); + + /* TODO: use provided savepoint data area to store savepoint data */ + + longlong2str((ulint)savepoint, name, 36); + + error = (int) trx_rollback_to_savepoint_for_mysql(trx, name, + &mysql_binlog_cache_pos); + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/********************************************************************* +Release transaction savepoint name. 
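(Illustrative aside, not part of this patch: the savepoint functions here derive the InnoDB savepoint name from the MySQL savepoint data pointer itself, rendered in base 36 via longlong2str((ulint)savepoint, name, 36). A standalone sketch of that encoding; to_base36() is a hypothetical stand-in and assumes pointers fit in unsigned long:)

// Sketch only.
#include <cstdio>

static void to_base36(unsigned long v, char* buf)
{
	static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
	char tmp[2 * sizeof(v) + 1];
	int  i = 0;
	do { tmp[i++] = digits[v % 36]; v /= 36; } while (v);
	while (i--) *buf++ = tmp[i];  // emit most significant digit first
	*buf = '\0';
}

int main()
{
	char name[64];
	int  dummy;  // stands in for the savepoint object
	to_base36((unsigned long)(const void*)&dummy, name);
	printf("savepoint name: %s\n", name);
	return 0;
}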
*/ +static +int +innobase_release_savepoint( +/*=======================*/ + /* out: 0 if success, HA_ERR_NO_SAVEPOINT if + no savepoint with the given name */ + handlerton* hton, /* in: handlerton for Innodb */ + THD* thd, /* in: handle to the MySQL thread of the user + whose transaction should be rolled back */ + void* savepoint) /* in: savepoint data */ +{ + int error = 0; + trx_t* trx; + char name[64]; + + DBUG_ENTER("innobase_release_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + /* TODO: use provided savepoint data area to store savepoint data */ + + longlong2str((ulint)savepoint, name, 36); + + error = (int) trx_release_savepoint_for_mysql(trx, name); + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/********************************************************************* +Sets a transaction savepoint. */ +static +int +innobase_savepoint( +/*===============*/ + /* out: always 0, that is, always succeeds */ + handlerton* hton, /* in: handle to the Innodb handlerton */ + THD* thd, /* in: handle to the MySQL thread */ + void* savepoint) /* in: savepoint data */ +{ + int error = 0; + trx_t* trx; + + DBUG_ENTER("innobase_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* + In the autocommit mode there is no sense to set a savepoint + (unless we are in sub-statement), so SQL layer ensures that + this method is never called in such situation. + */ +#ifdef MYSQL_SERVER /* plugins cannot access thd->in_sub_stmt */ + DBUG_ASSERT(thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) || + thd->in_sub_stmt); +#endif /* MYSQL_SERVER */ + + trx = check_trx_exists(thd); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the kernel mutex, we have to release the search system latch + first to obey the latching order. */ + + innobase_release_stat_resources(trx); + + /* cannot happen outside of transaction */ + DBUG_ASSERT(trx->active_trans); + + /* TODO: use provided savepoint data area to store savepoint data */ + char name[64]; + longlong2str((ulint)savepoint,name,36); + + error = (int) trx_savepoint_for_mysql(trx, name, (ib_int64_t)0); + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} + +/********************************************************************* +Frees a possible InnoDB trx object associated with the current THD. */ +static +int +innobase_close_connection( +/*======================*/ + /* out: 0 or error number */ + handlerton* hton, /* in: innobase handlerton */ + THD* thd) /* in: handle to the MySQL thread of the user + whose resources should be free'd */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_close_connection"); + DBUG_ASSERT(hton == innodb_hton_ptr); + trx = thd_to_trx(thd); + + ut_a(trx); + + if (trx->active_trans == 0 + && trx->conc_state != TRX_NOT_STARTED) { + + sql_print_error("trx->active_trans == 0, but" + " trx->conc_state != TRX_NOT_STARTED"); + } + + + if (trx->conc_state != TRX_NOT_STARTED && + global_system_variables.log_warnings) { + sql_print_warning( + "MySQL is closing a connection that has an active " + "InnoDB transaction. 
%lu row modifications will " + "roll back.", + (ulong) trx->undo_no.low); + } + + innobase_rollback_trx(trx); + + thr_local_free(trx->mysql_thread_id); + trx_free_for_mysql(trx); + + DBUG_RETURN(0); +} + + +/***************************************************************************** +** InnoDB database tables +*****************************************************************************/ + +/******************************************************************** +Get the record format from the data dictionary. */ +UNIV_INTERN +enum row_type +ha_innobase::get_row_type() const +/*=============================*/ + /* out: one of + ROW_TYPE_REDUNDANT, + ROW_TYPE_COMPACT, + ROW_TYPE_COMPRESSED, + ROW_TYPE_DYNAMIC */ +{ + if (prebuilt && prebuilt->table) { + const ulint flags = prebuilt->table->flags; + + if (UNIV_UNLIKELY(!flags)) { + return(ROW_TYPE_REDUNDANT); + } + + ut_ad(flags & DICT_TF_COMPACT); + + switch (flags & DICT_TF_FORMAT_MASK) { + case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT: + return(ROW_TYPE_COMPACT); + case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT: + if (flags & DICT_TF_ZSSIZE_MASK) { + return(ROW_TYPE_COMPRESSED); + } else { + return(ROW_TYPE_DYNAMIC); + } +#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX +# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX" +#endif + } + } + ut_ad(0); + return(ROW_TYPE_NOT_USED); +} + + + +/******************************************************************** +Get the table flags to use for the statement. */ +UNIV_INTERN +handler::Table_flags +ha_innobase::table_flags() const +{ + /* Need to use tx_isolation here since table flags is (also) + called before prebuilt is inited. */ + ulong const tx_isolation = thd_tx_isolation(ha_thd()); + if (tx_isolation <= ISO_READ_COMMITTED) + return int_table_flags; + return int_table_flags | HA_BINLOG_STMT_CAPABLE; +} + +/******************************************************************** +Gives the file extension of an InnoDB single-table tablespace. */ +static const char* ha_innobase_exts[] = { + ".ibd", + NullS +}; + +UNIV_INTERN +const char* +ha_innobase::table_type() const +/*===========================*/ + /* out: table type */ +{ + return(innobase_hton_name); +} + +UNIV_INTERN +const char* +ha_innobase::index_type(uint) +/*=========================*/ + /* out: index type */ +{ + return("BTREE"); +} + +UNIV_INTERN +const char** +ha_innobase::bas_ext() const +/*========================*/ + /* out: file extension string */ +{ + return(ha_innobase_exts); +} + +UNIV_INTERN +ulong +ha_innobase::index_flags(uint, uint, bool) const +{ + return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER + | HA_READ_RANGE | HA_KEYREAD_ONLY); +} + +UNIV_INTERN +uint +ha_innobase::max_supported_keys() const +{ + return(MAX_KEY); +} + +UNIV_INTERN +uint +ha_innobase::max_supported_key_length() const +{ + /* An InnoDB page must store >= 2 keys; a secondary key record + must also contain the primary key value: max key length is + therefore set to slightly less than 1 / 4 of page size which + is 16 kB; but currently MySQL does not work with keys whose + size is > MAX_KEY_LENGTH */ + return(3500); +} + +UNIV_INTERN +const key_map* +ha_innobase::keys_to_use_for_scanning() +{ + return(&key_map_full); +} + +UNIV_INTERN +uint8 +ha_innobase::table_cache_type() +{ + return(HA_CACHE_TBL_ASKTRANSACT); +} + +UNIV_INTERN +bool +ha_innobase::primary_key_is_clustered() +{ + return(true); +} + +/********************************************************************* +Normalizes a table name string. 
A normalized name consists of the +database name catenated to '/' and table name. An example: +test/mytable. On Windows normalization puts both the database name and the +table name always to lower case. */ +static +void +normalize_table_name( +/*=================*/ + char* norm_name, /* out: normalized name as a + null-terminated string */ + const char* name) /* in: table name string */ +{ + char* name_ptr; + char* db_ptr; + char* ptr; + + /* Scan name from the end */ + + ptr = strend(name)-1; + + while (ptr >= name && *ptr != '\\' && *ptr != '/') { + ptr--; + } + + name_ptr = ptr + 1; + + DBUG_ASSERT(ptr > name); + + ptr--; + + while (ptr >= name && *ptr != '\\' && *ptr != '/') { + ptr--; + } + + db_ptr = ptr + 1; + + memcpy(norm_name, db_ptr, strlen(name) + 1 - (db_ptr - name)); + + norm_name[name_ptr - db_ptr - 1] = '/'; + +#ifdef __WIN__ + innobase_casedn_str(norm_name); +#endif +} + +/************************************************************************ +Set the autoinc column max value. This should only be called once from +ha_innobase::open(). Therefore there's no need for a covering lock. */ +UNIV_INTERN +ulint +ha_innobase::innobase_initialize_autoinc() +/*======================================*/ +{ + dict_index_t* index; + ulonglong auto_inc; + const char* col_name; + ulint error = DB_SUCCESS; + dict_table_t* innodb_table = prebuilt->table; + + col_name = table->found_next_number_field->field_name; + index = innobase_get_index(table->s->next_number_index); + + /* Execute SELECT MAX(col_name) FROM TABLE; */ + error = row_search_max_autoinc(index, col_name, &auto_inc); + + if (error == DB_SUCCESS) { + + /* At this stage we don't know the increment + or the offset, so use a default increment of 1. */ + ++auto_inc; + + dict_table_autoinc_initialize(innodb_table, auto_inc); + + } else { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: (%lu) Couldn't read " + "the MAX(%s) autoinc value from the " + "index (%s).\n", error, col_name, index->name); + } + + return(error); +} + +/********************************************************************* +Creates and opens a handle to a table which already exists in an InnoDB +database. */ +UNIV_INTERN +int +ha_innobase::open( +/*==============*/ + /* out: 1 if error, 0 if success */ + const char* name, /* in: table name */ + int mode, /* in: not used */ + uint test_if_locked) /* in: not used */ +{ + dict_table_t* ib_table; + char norm_name[1000]; + THD* thd; + ulint retries = 0; + char* is_part = NULL; + + DBUG_ENTER("ha_innobase::open"); + + UT_NOT_USED(mode); + UT_NOT_USED(test_if_locked); + + thd = ha_thd(); + + /* Under some cases MySQL seems to call this function while + holding btr_search_latch. This breaks the latching order as + we acquire dict_sys->mutex below and leads to a deadlock. */ + if (thd != NULL) { + innobase_release_temporary_latches(ht, thd); + } + + normalize_table_name(norm_name, name); + + user_thd = NULL; + + if (!(share=get_share(name))) { + + DBUG_RETURN(1); + } + + /* Create buffers for packing the fields of a record. Why + table->reclength did not work here? Obviously, because char + fields when packed actually became 1 byte longer, when we also + stored the string length as the first byte.
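(Illustrative aside, not part of this patch: the "1 byte longer" effect described above -- a CHAR value packed with its length stored in the first byte needs one extra byte, which is why the buffers allocated below are sized from reclength plus key lengths rather than reclength alone.)

// Sketch only.
#include <cstdio>
#include <cstring>

int main()
{
	const char    val[] = "abc";             // logical CHAR(8) value
	unsigned char packed[1 + 8];             // length byte + data bytes
	packed[0] = (unsigned char) strlen(val); // length goes first
	memcpy(packed + 1, val, packed[0]);      // then the characters
	printf("%u data bytes + 1 length byte = %u bytes packed\n",
	       packed[0], packed[0] + 1u);
	return 0;
}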
*/ + + upd_and_key_val_buff_len = + table->s->reclength + table->s->max_key_length + + MAX_REF_PARTS * 3; + if (!(uchar*) my_multi_malloc(MYF(MY_WME), + &upd_buff, upd_and_key_val_buff_len, + &key_val_buff, upd_and_key_val_buff_len, + NullS)) { + free_share(share); + + DBUG_RETURN(1); + } + + /* We look for pattern #P# to see if the table is a partitioned + MySQL table. The retry logic for partitioned tables is a + workaround for http://bugs.mysql.com/bug.php?id=33349. Look + at support issue https://support.mysql.com/view.php?id=21080 + for more details. */ + is_part = strstr(norm_name, "#P#"); +retry: + /* Get pointer to a table object in InnoDB dictionary cache */ + ib_table = dict_table_get(norm_name, TRUE); + + if (NULL == ib_table) { + if (is_part && retries < 10) { + ++retries; + os_thread_sleep(100000); + goto retry; + } + + if (is_part) { + sql_print_error("Failed to open table %s after " + "%lu attempts.\n", norm_name, + retries); + } + + sql_print_error("Cannot find or open table %s from\n" + "the internal data dictionary of InnoDB " + "though the .frm file for the\n" + "table exists. Maybe you have deleted and " + "recreated InnoDB data\n" + "files but have forgotten to delete the " + "corresponding .frm files\n" + "of InnoDB tables, or you have moved .frm " + "files to another database?\n" + "or, the table contains indexes that this " + "version of the engine\n" + "doesn't support.\n" + "See http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html\n" + "how you can resolve the problem.\n", + norm_name); + free_share(share); + my_free(upd_buff, MYF(0)); + my_errno = ENOENT; + + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) { + sql_print_error("MySQL is trying to open a table handle but " + "the .ibd file for\ntable %s does not exist.\n" + "Have you deleted the .ibd file from the " + "database directory under\nthe MySQL datadir, " + "or have you used DISCARD TABLESPACE?\n" + "See http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html\n" + "how you can resolve the problem.\n", + norm_name); + free_share(share); + my_free(upd_buff, MYF(0)); + my_errno = ENOENT; + + dict_table_decrement_handle_count(ib_table, FALSE); + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + prebuilt = row_create_prebuilt(ib_table); + + prebuilt->mysql_row_len = table->s->reclength; + prebuilt->default_rec = table->s->default_values; + ut_ad(prebuilt->default_rec); + + /* Looks like MySQL-3.23 sometimes has primary key number != 0 */ + + primary_key = table->s->primary_key; + key_used_on_scan = primary_key; + + /* Allocate a buffer for a 'row reference'. A row reference is + a string of bytes of length ref_length which uniquely specifies + a row in our table. Note that MySQL may also compare two row + references for equality by doing a simple memcmp on the strings + of length ref_length! */ + + if (!row_table_got_default_clust_index(ib_table)) { + if (primary_key >= MAX_KEY) { + sql_print_error("Table %s has a primary key in InnoDB data " + "dictionary, but not in MySQL!", name); + } + + prebuilt->clust_index_was_generated = FALSE; + + /* MySQL allocates the buffer for ref. key_info->key_length + includes space for all key columns + one byte for each column + that may be NULL. ref_length must be as exact as possible to + save space, because all row reference buffers are allocated + based on ref_length.
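(Illustrative aside, not part of this patch: as the comment above says, MySQL may compare two row references with a plain memcmp over ref_length bytes -- which is why the reference must be exact-length and fully initialized.)

// Sketch only.
#include <cstdio>
#include <cstring>

int main()
{
	unsigned char ref_a[8], ref_b[8];
	memset(ref_a, 0, sizeof ref_a);  // zero-fill, so the unused tail
	memset(ref_b, 0, sizeof ref_b);  // bytes also compare as equal
	ref_a[0] = ref_b[0] = 0x2A;      // the same key value in both
	printf("references equal: %s\n",
	       0 == memcmp(ref_a, ref_b, sizeof ref_a) ? "yes" : "no");
	return 0;
}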
*/ + + ref_length = table->key_info[primary_key].key_length; + } else { + if (primary_key != MAX_KEY) { + sql_print_error("Table %s has no primary key in InnoDB data " + "dictionary, but has one in MySQL! If you " + "created the table with a MySQL version < " + "3.23.54 and did not define a primary key, " + "but defined a unique key with all non-NULL " + "columns, then MySQL internally treats that " + "key as the primary key. You can fix this " + "error by dump + DROP + CREATE + reimport " + "of the table.", name); + } + + prebuilt->clust_index_was_generated = TRUE; + + ref_length = DATA_ROW_ID_LEN; + + /* If we automatically created the clustered index, then + MySQL does not know about it, and MySQL must NOT be aware + of the index used on scan, to make it avoid checking if we + update the column of the index. That is why we assert below + that key_used_on_scan is the undefined value MAX_KEY. + The column is the row id in the automatical generation case, + and it will never be updated anyway. */ + + if (key_used_on_scan != MAX_KEY) { + sql_print_warning( + "Table %s key_used_on_scan is %lu even " + "though there is no primary key inside " + "InnoDB.", name, (ulong) key_used_on_scan); + } + } + + /* Index block size in InnoDB: used by MySQL in query optimization */ + stats.block_size = 16 * 1024; + + /* Init table lock structure */ + thr_lock_data_init(&share->lock,&lock,(void*) 0); + + if (prebuilt->table) { + /* We update the highest file format in the system table + space, if this table has higher file format setting. */ + + trx_sys_file_format_max_upgrade( + (const char**) &innobase_file_format_check, + dict_table_get_format(prebuilt->table)); + } + + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + + /* Only if the table has an AUTOINC column. */ + if (prebuilt->table != NULL && table->found_next_number_field != NULL) { + ulint error; + + dict_table_autoinc_lock(prebuilt->table); + + /* Since a table can already be "open" in InnoDB's internal + data dictionary, we only init the autoinc counter once, the + first time the table is loaded. We can safely reuse the + autoinc value from a previous MySQL open. */ + if (dict_table_autoinc_read(prebuilt->table) == 0) { + + error = innobase_initialize_autoinc(); + /* Should always succeed! */ + ut_a(error == DB_SUCCESS); + } + + dict_table_autoinc_unlock(prebuilt->table); + } + + DBUG_RETURN(0); +} + +UNIV_INTERN +uint +ha_innobase::max_supported_key_part_length() const +{ + return(DICT_MAX_INDEX_COL_LEN - 1); +} + +/********************************************************************** +Closes a handle to an InnoDB table. */ +UNIV_INTERN +int +ha_innobase::close(void) +/*====================*/ + /* out: 0 */ +{ + THD* thd; + + DBUG_ENTER("ha_innobase::close"); + + thd = ha_thd(); + if (thd != NULL) { + innobase_release_temporary_latches(ht, thd); + } + + row_prebuilt_free(prebuilt, FALSE); + + my_free(upd_buff, MYF(0)); + free_share(share); + + /* Tell InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + DBUG_RETURN(0); +} + +/* The following accessor functions should really be inside MySQL code! */ + +/****************************************************************** +Gets field offset for a field in a table. 
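(Illustrative aside, not part of this patch: the helper defined below is plain pointer arithmetic -- a field's offset is its address minus the start of the record buffer. The same idea with an ordinary struct standing in for MySQL's TABLE/Field objects:)

// Sketch only.
#include <cstddef>
#include <cstdio>

struct row_sketch {
	char id[4];
	char name[10];
};

int main()
{
	row_sketch rec;
	// offset of a "field" = its address minus the start of the record
	size_t off = (size_t) ((char*) &rec.name - (char*) &rec);
	printf("name offset: %zu (offsetof agrees: %zu)\n",
	       off, offsetof(row_sketch, name));
	return 0;
}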
*/ +static inline +uint +get_field_offset( +/*=============*/ + /* out: offset */ + TABLE* table, /* in: MySQL table object */ + Field* field) /* in: MySQL field object */ +{ + return((uint) (field->ptr - table->record[0])); +} + +/****************************************************************** +Checks if a field in a record is SQL NULL. Uses the record format +information in table to track the null bit in record. */ +static inline +uint +field_in_record_is_null( +/*====================*/ + /* out: 1 if NULL, 0 otherwise */ + TABLE* table, /* in: MySQL table object */ + Field* field, /* in: MySQL field object */ + char* record) /* in: a row in MySQL format */ +{ + int null_offset; + + if (!field->null_ptr) { + + return(0); + } + + null_offset = (uint) ((char*) field->null_ptr + - (char*) table->record[0]); + + if (record[null_offset] & field->null_bit) { + + return(1); + } + + return(0); +} + +/****************************************************************** +Sets a field in a record to SQL NULL. Uses the record format +information in table to track the null bit in record. */ +static inline +void +set_field_in_record_to_null( +/*========================*/ + TABLE* table, /* in: MySQL table object */ + Field* field, /* in: MySQL field object */ + char* record) /* in: a row in MySQL format */ +{ + int null_offset; + + null_offset = (uint) ((char*) field->null_ptr + - (char*) table->record[0]); + + record[null_offset] = record[null_offset] | field->null_bit; +} + +/***************************************************************** +InnoDB uses this function to compare two data fields for which the data type +is such that we must use MySQL code to compare them. NOTE that the prototype +of this function is in rem0cmp.c in InnoDB source code! If you change this +function, remember to update the prototype there! */ +extern "C" UNIV_INTERN +int +innobase_mysql_cmp( +/*===============*/ + /* out: 1, 0, -1, if a is greater, + equal, less than b, respectively */ + int mysql_type, /* in: MySQL type */ + uint charset_number, /* in: number of the charset */ + const unsigned char* a, /* in: data field */ + unsigned int a_length, /* in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /* in: data field */ + unsigned int b_length) /* in: data field length, + not UNIV_SQL_NULL */ +{ + CHARSET_INFO* charset; + enum_field_types mysql_tp; + int ret; + + DBUG_ASSERT(a_length != UNIV_SQL_NULL); + DBUG_ASSERT(b_length != UNIV_SQL_NULL); + + mysql_tp = (enum_field_types) mysql_type; + + switch (mysql_tp) { + + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + /* Use the charset number to pick the right charset struct for + the comparison. Since the MySQL function get_charset may be + slow before Bar removes the mutex operation there, we first + look at 2 common charsets directly. */ + + if (charset_number == default_charset_info->number) { + charset = default_charset_info; + } else if (charset_number == my_charset_latin1.number) { + charset = &my_charset_latin1; + } else { + charset = get_charset(charset_number, MYF(MY_WME)); + + if (charset == NULL) { + sql_print_error("InnoDB needs charset %lu for doing " + "a comparison, but MySQL cannot " + "find that charset.", + (ulong) charset_number); + ut_a(0); + } + } + + /* Starting from 4.1.3, we use strnncollsp() in comparisons of + non-latin1_swedish_ci strings. 
NOTE that the collation order + changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users + having indexes on such data need to rebuild their tables! */ + + ret = charset->coll->strnncollsp(charset, + a, a_length, + b, b_length, 0); + if (ret < 0) { + return(-1); + } else if (ret > 0) { + return(1); + } else { + return(0); + } + default: + ut_error; + } + + return(0); +} + +/****************************************************************** +Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. */ +extern "C" UNIV_INTERN +ulint +get_innobase_type_from_mysql_type( +/*==============================*/ + /* out: DATA_BINARY, + DATA_VARCHAR, ... */ + ulint* unsigned_flag, /* out: DATA_UNSIGNED if an + 'unsigned type'; + at least ENUM and SET, + and unsigned integer + types are 'unsigned types' */ + const void* f) /* in: MySQL Field */ +{ + const class Field* field = reinterpret_cast<const class Field*>(f); + + /* The following asserts try to check that the MySQL type code fits in + 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to + the type */ + + DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256); + DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256); + + if (field->flags & UNSIGNED_FLAG) { + + *unsigned_flag = DATA_UNSIGNED; + } else { + *unsigned_flag = 0; + } + + if (field->real_type() == MYSQL_TYPE_ENUM + || field->real_type() == MYSQL_TYPE_SET) { + + /* MySQL has field->type() a string type for these, but the + data is actually internally stored as an unsigned integer + code! */ + + *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned + flag set to zero, even though + internally this is an unsigned + integer type */ + return(DATA_INT); + } + + switch (field->type()) { + /* NOTE that we only allow string types in DATA_MYSQL and + DATA_VARMYSQL */ + case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */ + case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */ + if (field->binary()) { + return(DATA_BINARY); + } else if (strcmp( + field->charset()->name, + "latin1_swedish_ci") == 0) { + return(DATA_VARCHAR); + } else { + return(DATA_VARMYSQL); + } + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: if (field->binary()) { + + return(DATA_FIXBINARY); + } else if (strcmp( + field->charset()->name, + "latin1_swedish_ci") == 0) { + return(DATA_CHAR); + } else { + return(DATA_MYSQL); + } + case MYSQL_TYPE_NEWDECIMAL: + return(DATA_FIXBINARY); + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_LONGLONG: + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDATE: + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_TIMESTAMP: + return(DATA_INT); + case MYSQL_TYPE_FLOAT: + return(DATA_FLOAT); + case MYSQL_TYPE_DOUBLE: + return(DATA_DOUBLE); + case MYSQL_TYPE_DECIMAL: + return(DATA_DECIMAL); + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + return(DATA_BLOB); + default: + ut_error; + } + + return(0); +} + +/*********************************************************************** +Writes an unsigned integer value < 64k to 2 bytes, in the little-endian +storage format.
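(Illustrative aside, not part of this patch: a round trip through the 2-byte little-endian encoding that the two helpers below implement.)

// Sketch only.
#include <cstdio>

int main()
{
	unsigned char buf[2];
	unsigned      val = 0x1234;  // must be < 64k

	buf[0] = (unsigned char) (val & 0xFF);  // low byte first
	buf[1] = (unsigned char) (val / 256);   // then the high byte

	unsigned back = buf[0] + 256u * buf[1];
	printf("0x%x -> %02x %02x -> 0x%x\n", val, buf[0], buf[1], back);
	return 0;
}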
*/ +static inline +void +innobase_write_to_2_little_endian( +/*==============================*/ + byte* buf, /* in: where to store */ + ulint val) /* in: value to write, must be < 64k */ +{ + ut_a(val < 256 * 256); + + buf[0] = (byte)(val & 0xFF); + buf[1] = (byte)(val / 256); +} + +/*********************************************************************** +Reads an unsigned integer value < 64k from 2 bytes, in the little-endian +storage format. */ +static inline +uint +innobase_read_from_2_little_endian( +/*===============================*/ + /* out: value */ + const uchar* buf) /* in: from where to read */ +{ + return (uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1]))); +} + +/*********************************************************************** +Stores a key value for a row to a buffer. */ +UNIV_INTERN +uint +ha_innobase::store_key_val_for_row( +/*===============================*/ + /* out: key value length as stored in buff */ + uint keynr, /* in: key number */ + char* buff, /* in/out: buffer for the key value (in MySQL + format) */ + uint buff_len,/* in: buffer length */ + const uchar* record)/* in: row in MySQL format */ +{ + KEY* key_info = table->key_info + keynr; + KEY_PART_INFO* key_part = key_info->key_part; + KEY_PART_INFO* end = key_part + key_info->key_parts; + char* buff_start = buff; + enum_field_types mysql_type; + Field* field; + ibool is_null; + + DBUG_ENTER("store_key_val_for_row"); + + /* The format for storing a key field in MySQL is the following: + + 1. If the column can be NULL, then in the first byte we put 1 if the + field value is NULL, 0 otherwise. + + 2. If the column is of a BLOB type (it must be a column prefix field + in this case), then we put the length of the data in the field to the + next 2 bytes, in the little-endian format. If the field is SQL NULL, + then these 2 bytes are set to 0. Note that the length of data in the + field is <= column prefix length. + + 3. In a column prefix field, prefix_len next bytes are reserved for + data. In a normal field the max field length next bytes are reserved + for data. For a VARCHAR(n) the max field length is n. If the stored + value is the SQL NULL then these data bytes are set to 0. + + 4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that + in the MySQL row format, the length is stored in 1 or 2 bytes, + depending on the maximum allowed length. But in the MySQL key value + format, the length always takes 2 bytes. + + We have to zero-fill the buffer so that MySQL is able to use a + simple memcmp to compare two key values to determine if they are + equal. MySQL does this to compare contents of two 'ref' values. 
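(Illustrative aside, not part of this patch: the layout rules above, applied to a nullable true VARCHAR(5) holding "ab" -- one NULL-indicator byte, a 2-byte little-endian length, then the value zero-padded out to the column length.)

// Sketch only.
#include <cstdio>
#include <cstring>

int main()
{
	unsigned char buff[1 + 2 + 5];
	memset(buff, 0, sizeof buff);  // zero-fill, as the code below does
	buff[0] = 0;                   // 0 = value is not NULL
	buff[1] = 2;                   // length 2 ...
	buff[2] = 0;                   // ... stored little-endian
	memcpy(buff + 3, "ab", 2);     // data; the trailing bytes stay 0
	for (unsigned i = 0; i < sizeof buff; i++) {
		printf("%02x ", (unsigned) buff[i]);
	}
	putchar('\n');  // prints: 00 02 00 61 62 00 00 00
	return 0;
}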
*/ + + bzero(buff, buff_len); + + for (; key_part != end; key_part++) { + is_null = FALSE; + + if (key_part->null_bit) { + if (record[key_part->null_offset] + & key_part->null_bit) { + *buff = 1; + is_null = TRUE; + } else { + *buff = 0; + } + buff++; + } + + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_VARCHAR) { + /* >= 5.0.3 true VARCHAR */ + ulint lenlen; + ulint len; + const byte* data; + ulint key_len; + ulint true_len; + CHARSET_INFO* cs; + int error=0; + + key_len = key_part->length; + + if (is_null) { + buff += key_len + 2; + + continue; + } + cs = field->charset(); + + lenlen = (ulint) + (((Field_varstring*)field)->length_bytes); + + data = row_mysql_read_true_varchar(&len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + lenlen); + + true_len = len; + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char *) data, + (const char *) data + len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* In a column prefix index, we may need to truncate + the stored value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + /* The length in a key value is always stored in 2 + bytes */ + + row_mysql_store_true_var_len((byte*)buff, true_len, 2); + buff += 2; + + memcpy(buff, data, true_len); + + /* Note that we always reserve the maximum possible + length of the true VARCHAR in the key value, though + only len first bytes after the 2 length bytes contain + actual data. The rest of the space was reset to zero + in the bzero() call above. */ + + buff += key_len; + + } else if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB) { + + CHARSET_INFO* cs; + ulint key_len; + ulint true_len; + int error=0; + ulint blob_len; + const byte* blob_data; + + ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + + key_len = key_part->length; + + if (is_null) { + buff += key_len + 2; + + continue; + } + + cs = field->charset(); + + blob_data = row_mysql_read_blob_ref(&blob_len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + (ulint) field->pack_length()); + + true_len = blob_len; + + ut_a(get_field_offset(table, field) + == key_part->offset); + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (blob_len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char *) blob_data, + (const char *) blob_data + + blob_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* All indexes on BLOB and TEXT are column prefix + indexes, and we may need to truncate the data to be + stored in the key value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + /* MySQL reserves 2 bytes for the length and the + storage of the number is little-endian */ + + innobase_write_to_2_little_endian( + (byte*)buff, true_len); + buff += 2; + + memcpy(buff, blob_data, true_len); + + /* Note that we always reserve the maximum possible + length of the BLOB prefix in the key value. */ + + buff += key_len; + } else { + /* Here we handle all other data types except the + true VARCHAR, BLOB and TEXT. Note that the column + value we store may be also in a column prefix + index. 
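(Illustrative aside, not part of this patch: why the branches here cut key prefixes with cs->cset->well_formed_len() rather than at a raw byte count -- a byte-level cut can split a multi-byte character. A simplified, UTF-8-only version of that boundary trim:)

// Sketch only.
#include <cstdio>

// Back the cut position off any UTF-8 continuation bytes (10xxxxxx),
// so a partially included character is dropped entirely.
static unsigned utf8_trim(const unsigned char* s, unsigned len)
{
	while (len > 0 && (s[len] & 0xC0) == 0x80) {
		len--;
	}
	return len;
}

int main()
{
	// "h\xC3\xA9llo" is UTF-8 for "hello" with a 2-byte e-acute.
	const unsigned char s[] = { 'h', 0xC3, 0xA9, 'l', 'l', 'o', 0 };
	printf("cut at 2 bytes -> keep %u byte(s)\n", utf8_trim(s, 2));  // keeps 1
	return 0;
}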
*/ + + CHARSET_INFO* cs; + ulint true_len; + ulint key_len; + const uchar* src_start; + int error=0; + enum_field_types real_type; + + key_len = key_part->length; + + if (is_null) { + buff += key_len; + + continue; + } + + src_start = record + key_part->offset; + real_type = field->real_type(); + true_len = key_len; + + /* Character set for the field is defined only + to fields whose type is string and real field + type is not enum or set. For these fields check + if character set is multi byte. */ + + if (real_type != MYSQL_TYPE_ENUM + && real_type != MYSQL_TYPE_SET + && ( mysql_type == MYSQL_TYPE_VAR_STRING + || mysql_type == MYSQL_TYPE_STRING)) { + + cs = field->charset(); + + /* For multi byte character sets we need to + calculate the true length of the key */ + + if (key_len > 0 && cs->mbmaxlen > 1) { + + true_len = (ulint) + cs->cset->well_formed_len(cs, + (const char *)src_start, + (const char *)src_start + + key_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + } + + memcpy(buff, src_start, true_len); + buff += true_len; + + /* Pad the unused space with spaces. Note that no + padding is ever needed for UCS-2 because in MySQL, + all UCS2 characters are 2 bytes, as MySQL does not + support surrogate pairs, which are needed to represent + characters in the range U+10000 to U+10FFFF. */ + + if (true_len < key_len) { + ulint pad_len = key_len - true_len; + memset(buff, ' ', pad_len); + buff += pad_len; + } + } + } + + ut_a(buff <= buff_start + buff_len); + + DBUG_RETURN((uint)(buff - buff_start)); +} + +/****************************************************************** +Builds a 'template' to the prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. */ +static +void +build_template( +/*===========*/ + row_prebuilt_t* prebuilt, /* in/out: prebuilt struct */ + THD* thd, /* in: current user thread, used + only if templ_type is + ROW_MYSQL_REC_FIELDS */ + TABLE* table, /* in: MySQL table */ + uint templ_type) /* in: ROW_MYSQL_WHOLE_ROW or + ROW_MYSQL_REC_FIELDS */ +{ + dict_index_t* index; + dict_index_t* clust_index; + mysql_row_templ_t* templ; + Field* field; + ulint n_fields; + ulint n_requested_fields = 0; + ibool fetch_all_in_key = FALSE; + ibool fetch_primary_key_cols = FALSE; + ulint i; + /* byte offset of the end of last requested column */ + ulint mysql_prefix_len = 0; + + if (prebuilt->select_lock_type == LOCK_X) { + /* We always retrieve the whole clustered index record if we + use exclusive row level locks, for example, if the read is + done in an UPDATE statement. */ + + templ_type = ROW_MYSQL_WHOLE_ROW; + } + + if (templ_type == ROW_MYSQL_REC_FIELDS) { + if (prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_ALL_COLS) { + + /* We know we must at least fetch all columns in the + key, or all columns in the table */ + + if (prebuilt->read_just_key) { + /* MySQL has instructed us that it is enough + to fetch the columns in the key; looks like + MySQL can set this flag also when there is + only a prefix of the column in the key: in + that case we retrieve the whole column from + the clustered index */ + + fetch_all_in_key = TRUE; + } else { + templ_type = ROW_MYSQL_WHOLE_ROW; + } + } else if (prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_PRIMARY_KEY) { + /* We must at least fetch all primary key cols. 
Note + that if the clustered index was internally generated + by InnoDB on the row id (no primary key was + defined), then row_search_for_mysql() will always + retrieve the row id to a special buffer in the + prebuilt struct. */ + + fetch_primary_key_cols = TRUE; + } + } + + clust_index = dict_table_get_first_index(prebuilt->table); + + if (templ_type == ROW_MYSQL_REC_FIELDS) { + index = prebuilt->index; + } else { + index = clust_index; + } + + if (index == clust_index) { + prebuilt->need_to_access_clustered = TRUE; + } else { + prebuilt->need_to_access_clustered = FALSE; + /* Below we check column by column if we need to access + the clustered index */ + } + + n_fields = (ulint)table->s->fields; /* number of columns */ + + if (!prebuilt->mysql_template) { + prebuilt->mysql_template = (mysql_row_templ_t*) + mem_alloc(n_fields * sizeof(mysql_row_templ_t)); + } + + prebuilt->template_type = templ_type; + prebuilt->null_bitmap_len = table->s->null_bytes; + + prebuilt->templ_contains_blob = FALSE; + + /* Note that in InnoDB, i is the column number. MySQL calls columns + 'fields'. */ + for (i = 0; i < n_fields; i++) { + templ = prebuilt->mysql_template + n_requested_fields; + field = table->field[i]; + + if (UNIV_LIKELY(templ_type == ROW_MYSQL_REC_FIELDS)) { + /* Decide which columns we should fetch + and which we can skip. */ + register const ibool index_contains_field = + dict_index_contains_col_or_prefix(index, i); + + if (!index_contains_field && prebuilt->read_just_key) { + /* If this is a 'key read', we do not need + columns that are not in the key */ + + goto skip_field; + } + + if (index_contains_field && fetch_all_in_key) { + /* This field is needed in the query */ + + goto include_field; + } + + if (bitmap_is_set(table->read_set, i) || + bitmap_is_set(table->write_set, i)) { + /* This field is needed in the query */ + + goto include_field; + } + + if (fetch_primary_key_cols + && dict_table_col_in_clustered_key( + index->table, i)) { + /* This field is needed in the query */ + + goto include_field; + } + + /* This field is not needed in the query, skip it */ + + goto skip_field; + } +include_field: + n_requested_fields++; + + templ->col_no = i; + + if (index == clust_index) { + templ->rec_field_no = dict_col_get_clust_pos( + &index->table->cols[i], index); + } else { + templ->rec_field_no = dict_index_get_nth_col_pos( + index, i); + } + + if (templ->rec_field_no == ULINT_UNDEFINED) { + prebuilt->need_to_access_clustered = TRUE; + } + + if (field->null_ptr) { + templ->mysql_null_byte_offset = + (ulint) ((char*) field->null_ptr + - (char*) table->record[0]); + + templ->mysql_null_bit_mask = (ulint) field->null_bit; + } else { + templ->mysql_null_bit_mask = 0; + } + + templ->mysql_col_offset = (ulint) + get_field_offset(table, field); + + templ->mysql_col_len = (ulint) field->pack_length(); + if (mysql_prefix_len < templ->mysql_col_offset + + templ->mysql_col_len) { + mysql_prefix_len = templ->mysql_col_offset + + templ->mysql_col_len; + } + templ->type = index->table->cols[i].mtype; + templ->mysql_type = (ulint)field->type(); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + templ->mysql_length_bytes = (ulint) + (((Field_varstring*)field)->length_bytes); + } + + templ->charset = dtype_get_charset_coll( + index->table->cols[i].prtype); + templ->mbminlen = index->table->cols[i].mbminlen; + templ->mbmaxlen = index->table->cols[i].mbmaxlen; + templ->is_unsigned = index->table->cols[i].prtype + & DATA_UNSIGNED; + if (templ->type == DATA_BLOB) { + prebuilt->templ_contains_blob = TRUE; 
+ } +skip_field: + ; + } + + prebuilt->n_template = n_requested_fields; + prebuilt->mysql_prefix_len = mysql_prefix_len; + + if (index != clust_index && prebuilt->need_to_access_clustered) { + /* Change rec_field_no's to correspond to the clustered index + record */ + for (i = 0; i < n_requested_fields; i++) { + templ = prebuilt->mysql_template + i; + + templ->rec_field_no = dict_col_get_clust_pos( + &index->table->cols[templ->col_no], + clust_index); + } + } +} + +/************************************************************************ +Get the upper limit of the MySQL integral and floating-point type. */ +UNIV_INTERN +ulonglong +ha_innobase::innobase_get_int_col_max_value( +/*========================================*/ + const Field* field) +{ + ulonglong max_value = 0; + + switch(field->key_type()) { + /* TINY */ + case HA_KEYTYPE_BINARY: + max_value = 0xFFULL; + break; + case HA_KEYTYPE_INT8: + max_value = 0x7FULL; + break; + /* SHORT */ + case HA_KEYTYPE_USHORT_INT: + max_value = 0xFFFFULL; + break; + case HA_KEYTYPE_SHORT_INT: + max_value = 0x7FFFULL; + break; + /* MEDIUM */ + case HA_KEYTYPE_UINT24: + max_value = 0xFFFFFFULL; + break; + case HA_KEYTYPE_INT24: + max_value = 0x7FFFFFULL; + break; + /* LONG */ + case HA_KEYTYPE_ULONG_INT: + max_value = 0xFFFFFFFFULL; + break; + case HA_KEYTYPE_LONG_INT: + max_value = 0x7FFFFFFFULL; + break; + /* BIG */ + case HA_KEYTYPE_ULONGLONG: + max_value = 0xFFFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_LONGLONG: + max_value = 0x7FFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_FLOAT: + /* We use the maximum as per IEEE754-2008 standard, 2^24 */ + max_value = 0x1000000ULL; + break; + case HA_KEYTYPE_DOUBLE: + /* We use the maximum as per IEEE754-2008 standard, 2^53 */ + max_value = 0x20000000000000ULL; + break; + default: + ut_error; + } + + return(max_value); +} + +/************************************************************************ +This special handling is really to overcome the limitations of MySQL's +binlogging. We need to eliminate the non-determinism that will arise in +INSERT ... SELECT type of statements, since MySQL binlog only stores the +min value of the autoinc interval. Once that is fixed we can get rid of +the special lock handling.*/ +UNIV_INTERN +ulint +ha_innobase::innobase_lock_autoinc(void) +/*====================================*/ + /* out: DB_SUCCESS if all OK else + error code */ +{ + ulint error = DB_SUCCESS; + + switch (innobase_autoinc_lock_mode) { + case AUTOINC_NO_LOCKING: + /* Acquire only the AUTOINC mutex. */ + dict_table_autoinc_lock(prebuilt->table); + break; + + case AUTOINC_NEW_STYLE_LOCKING: + /* For simple (single/multi) row INSERTs, we fallback to the + old style only if another transaction has already acquired + the AUTOINC lock on behalf of a LOAD FILE or INSERT ... SELECT + etc. type of statement. */ + if (thd_sql_command(user_thd) == SQLCOM_INSERT + || thd_sql_command(user_thd) == SQLCOM_REPLACE) { + dict_table_t* table = prebuilt->table; + + /* Acquire the AUTOINC mutex. */ + dict_table_autoinc_lock(table); + + /* We need to check that another transaction isn't + already holding the AUTOINC lock on the table. */ + if (table->n_waiting_or_granted_auto_inc_locks) { + /* Release the mutex to avoid deadlocks. */ + dict_table_autoinc_unlock(table); + } else { + break; + } + } + /* Fall through to old style locking. */ + + case AUTOINC_OLD_STYLE_LOCKING: + error = row_lock_table_autoinc_for_mysql(prebuilt); + + if (error == DB_SUCCESS) { + + /* Acquire the AUTOINC mutex. 
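
A standalone sketch of where the constants in innobase_get_int_col_max_value() above come from: for an n-byte integer column the unsigned maximum is 2^(8n) - 1 and the signed maximum is 2^(8n-1) - 1, while FLOAT and DOUBLE use 2^24 and 2^53, the largest contiguous ranges of exactly representable integers in IEEE 754 single and double precision. The helper names below are illustrative, not part of the patch:

	#include <cstdint>
	#include <cstdio>

	/* The limits in the switch above are the usual n-byte integer
	maxima (n = 1, 2, 3, 4, 8). */
	static uint64_t uint_max(unsigned n_bytes)
	{
		return(n_bytes >= 8
		       ? UINT64_MAX : (1ULL << (8 * n_bytes)) - 1);
	}

	static uint64_t int_max(unsigned n_bytes)
	{
		return((1ULL << (8 * n_bytes - 1)) - 1);
	}

	int main()
	{
		/* MEDIUMINT (3 bytes): matches 0xFFFFFF / 0x7FFFFF. */
		printf("%llx %llx\n",
		       (unsigned long long) uint_max(3),
		       (unsigned long long) int_max(3));
		return(0);
	}
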
*/
+			dict_table_autoinc_lock(prebuilt->table);
+		}
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(ulong(error));
+}
+
+/************************************************************************
+Reset the autoinc value in the table.*/
+UNIV_INTERN
+ulint
+ha_innobase::innobase_reset_autoinc(
+/*================================*/
+				/* out: DB_SUCCESS if all went well
+				else error code */
+	ulonglong	autoinc)	/* in: value to store */
+{
+	ulint		error;
+
+	error = innobase_lock_autoinc();
+
+	if (error == DB_SUCCESS) {
+
+		dict_table_autoinc_initialize(prebuilt->table, autoinc);
+
+		dict_table_autoinc_unlock(prebuilt->table);
+	}
+
+	return(ulong(error));
+}
+
+/************************************************************************
+Store the autoinc value in the table. The autoinc value is only set if
+it's greater than the existing autoinc value in the table.*/
+UNIV_INTERN
+ulint
+ha_innobase::innobase_set_max_autoinc(
+/*==================================*/
+				/* out: DB_SUCCESS if all went well
+				else error code */
+	ulonglong	auto_inc)	/* in: value to store */
+{
+	ulint		error;
+
+	error = innobase_lock_autoinc();
+
+	if (error == DB_SUCCESS) {
+
+		dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
+
+		dict_table_autoinc_unlock(prebuilt->table);
+	}
+
+	return(ulong(error));
+}
+
+/************************************************************************
+Stores a row in an InnoDB database, to the table specified in this
+handle. */
+UNIV_INTERN
+int
+ha_innobase::write_row(
+/*===================*/
+			/* out: error code */
+	uchar*	record)	/* in: a row in MySQL format */
+{
+	ulint		error = 0;
+	int		error_result= 0;
+	ibool		auto_inc_used= FALSE;
+	ulint		sql_command;
+	trx_t*		trx = thd_to_trx(user_thd);
+
+	DBUG_ENTER("ha_innobase::write_row");
+
+	if (prebuilt->trx != trx) {
+		sql_print_error("The transaction object for the table handle is at "
+				"%p, but for the current thread it is at %p",
+				(const void*) prebuilt->trx, (const void*) trx);
+
+		fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
+		ut_print_buf(stderr, ((const byte*)prebuilt) - 100, 200);
+		fputs("\n"
+			"InnoDB: Dump of 200 bytes around ha_data: ",
+			stderr);
+		ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
+		putc('\n', stderr);
+		ut_error;
+	}
+
+	ha_statistic_increment(&SSV::ha_write_count);
+
+	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+		table->timestamp_field->set_time();
+
+	sql_command = thd_sql_command(user_thd);
+
+	if ((sql_command == SQLCOM_ALTER_TABLE
+	     || sql_command == SQLCOM_OPTIMIZE
+	     || sql_command == SQLCOM_CREATE_INDEX
+	     || sql_command == SQLCOM_DROP_INDEX)
+	    && num_write_row >= 10000) {
+		/* ALTER TABLE is COMMITted at every 10000 copied rows.
+		The IX table lock for the original table has to be re-issued.
+		As this method will be called on a temporary table where the
+		contents of the original table are being copied to, it is
+		a bit tricky to determine the source table. The cursor
+		position in the source table need not be adjusted after the
+		intermediate COMMIT, since writes by other transactions are
+		being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
+
+		dict_table_t*	src_table;
+		enum lock_mode	mode;
+
+		num_write_row = 0;
+
+		/* Commit the transaction. This will release the table
+		locks, so they have to be acquired again. */
+
+		/* Altering an InnoDB table */
+		/* Get the source table.
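
The autoinc helpers above (innobase_lock_autoinc(), innobase_reset_autoinc(), innobase_set_max_autoinc()) share one discipline: take the table's AUTOINC mutex, mutate the counter, release the mutex, and in the set_max case make the mutation monotonic. A minimal standalone sketch of that pattern with std::mutex (toy types, not InnoDB's dict_table_autoinc_* API):

	#include <mutex>
	#include <cstdint>
	#include <cstdio>

	/* Toy counter mirroring innobase_set_max_autoinc(): the value
	only moves forward, and only while the mutex is held. */
	struct autoinc_sketch {
		std::mutex	mtx;
		uint64_t	value = 0;

		void update_if_greater(uint64_t candidate)
		{
			std::lock_guard<std::mutex> guard(mtx);
			if (candidate > value) {
				value = candidate;
			}
		}
	};

	int main()
	{
		autoinc_sketch	a;

		a.update_if_greater(42);
		a.update_if_greater(7);	/* ignored: never goes back */
		printf("%llu\n", (unsigned long long) a.value);	/* 42 */
		return(0);
	}
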
*/
+		src_table = lock_get_src_table(
+			prebuilt->trx, prebuilt->table, &mode);
+		if (!src_table) {
+no_commit:
+			/* Unknown situation: do not commit */
+			/*
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: ALTER TABLE is holding lock"
+				" on %lu tables!\n",
+				prebuilt->trx->mysql_n_tables_locked);
+			*/
+			;
+		} else if (src_table == prebuilt->table) {
+			/* Source table is not in InnoDB format:
+			no need to re-acquire locks on it. */
+
+			/* Altering to InnoDB format */
+			innobase_commit(ht, user_thd, 1);
+			/* Note that this transaction is still active. */
+			prebuilt->trx->active_trans = 1;
+			/* We will need an IX lock on the destination table. */
+			prebuilt->sql_stat_start = TRUE;
+		} else {
+			/* Ensure that there are no other table locks than
+			LOCK_IX and LOCK_AUTO_INC on the destination table. */
+
+			if (!lock_is_table_exclusive(prebuilt->table,
+						     prebuilt->trx)) {
+				goto no_commit;
+			}
+
+			/* Commit the transaction. This will release the table
+			locks, so they have to be acquired again. */
+			innobase_commit(ht, user_thd, 1);
+			/* Note that this transaction is still active. */
+			prebuilt->trx->active_trans = 1;
+			/* Re-acquire the table lock on the source table. */
+			row_lock_table_for_mysql(prebuilt, src_table, mode);
+			/* We will need an IX lock on the destination table. */
+			prebuilt->sql_stat_start = TRUE;
+		}
+	}
+
+	num_write_row++;
+
+	/* This is the case where the table has an auto-increment column */
+	if (table->next_number_field && record == table->record[0]) {
+
+		/* Reset the error code before calling
+		innobase_get_auto_increment(). */
+		prebuilt->autoinc_error = DB_SUCCESS;
+
+		if ((error = update_auto_increment())) {
+
+			/* We don't want to mask autoinc overflow errors. */
+			if (prebuilt->autoinc_error != DB_SUCCESS) {
+				error = (int) prebuilt->autoinc_error;
+
+				goto report_error;
+			}
+
+			/* MySQL errors are passed straight back. */
+			error_result = (int) error;
+			goto func_exit;
+		}
+
+		auto_inc_used = TRUE;
+	}
+
+	if (prebuilt->mysql_template == NULL
+	    || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
+
+		/* Build the template used in converting quickly between
+		the two database formats */
+
+		build_template(prebuilt, NULL, table, ROW_MYSQL_WHOLE_ROW);
+	}
+
+	innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+	error = row_insert_for_mysql((byte*) record, prebuilt);
+
+	/* Handle duplicate key errors */
+	if (auto_inc_used) {
+		ulint		err;
+		ulonglong	auto_inc;
+		ulonglong	col_max_value;
+
+		/* Note the number of rows processed for this statement, used
+		by get_auto_increment() to determine the number of AUTO-INC
+		values to reserve. This is only useful for a multi-value INSERT
+		and is a statement level counter.*/
+		if (trx->n_autoinc_rows > 0) {
+			--trx->n_autoinc_rows;
+		}
+
+		/* We need the upper limit of the col type to check for
+		whether we update the table autoinc counter or not. */
+		col_max_value = innobase_get_int_col_max_value(
+			table->next_number_field);
+
+		/* Get the value that MySQL attempted to store in the table.*/
+		auto_inc = table->next_number_field->val_int();
+
+		switch (error) {
+		case DB_DUPLICATE_KEY:
+
+			/* A REPLACE command and LOAD DATA INFILE REPLACE
+			handle a duplicate key error themselves, but we
+			must update the autoinc counter if we are performing
+			those statements.
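
The DB_SUCCESS branch below rounds the value MySQL actually inserted up to the next point of the increment/offset progression via innobase_next_autoinc() (defined earlier in this file) before storing it with innobase_set_max_autoinc(). A rough standalone sketch of that rounding, illustrative only; the real function may treat the corner cases differently:

	#include <cstdint>
	#include <cstdio>

	/* Round "current" up to the next value of the progression
	offset, offset + need, offset + 2 * need, ... saturating at
	max_value instead of wrapping. Assumes need > 0 and
	current >= offset. */
	static uint64_t
	next_autoinc_sketch(uint64_t current, uint64_t need,
			    uint64_t offset, uint64_t max_value)
	{
		uint64_t	steps = (current - offset) / need + 1;

		if (steps > (max_value - offset) / need) {
			return(max_value);	/* column type exhausted */
		}

		return(offset + steps * need);
	}

	int main()
	{
		/* auto_increment_increment = 10, offset = 5: after
		inserting 23 the next reserved value is 25. */
		printf("%llu\n", (unsigned long long)
		       next_autoinc_sketch(23, 10, 5, UINT64_MAX));
		return(0);
	}
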
*/
+
+			switch (sql_command) {
+			case SQLCOM_LOAD:
+				if ((trx->duplicates
+				     & (TRX_DUP_IGNORE | TRX_DUP_REPLACE))) {
+
+					goto set_max_autoinc;
+				}
+				break;
+
+			case SQLCOM_REPLACE:
+			case SQLCOM_INSERT_SELECT:
+			case SQLCOM_REPLACE_SELECT:
+				goto set_max_autoinc;
+
+			default:
+				break;
+			}
+
+			break;
+
+		case DB_SUCCESS:
+			/* If the actual value inserted is greater than
+			the upper limit of the interval, then we try to
+			update the table upper limit. Note: last_value
+			will be 0 if get_auto_increment() was not called.*/
+
+			if (auto_inc <= col_max_value
+			    && auto_inc >= prebuilt->autoinc_last_value) {
+set_max_autoinc:
+				ut_a(prebuilt->autoinc_increment > 0);
+
+				ulonglong	need;
+				ulonglong	offset;
+
+				offset = prebuilt->autoinc_offset;
+				need = prebuilt->autoinc_increment;
+
+				auto_inc = innobase_next_autoinc(
+					auto_inc, need, offset, col_max_value);
+
+				err = innobase_set_max_autoinc(auto_inc);
+
+				if (err != DB_SUCCESS) {
+					error = err;
+				}
+			}
+			break;
+		}
+	}
+
+	innodb_srv_conc_exit_innodb(prebuilt->trx);
+
+report_error:
+	error_result = convert_error_code_to_mysql((int) error,
+						   prebuilt->table->flags,
+						   user_thd);
+
+func_exit:
+	innobase_active_small();
+
+	DBUG_RETURN(error_result);
+}
+
+/**************************************************************************
+Checks which fields have changed in a row and stores information
+about them in an update vector. */
+static
+int
+calc_row_difference(
+/*================*/
+					/* out: error number or 0 */
+	upd_t*		uvect,		/* in/out: update vector */
+	uchar*		old_row,	/* in: old row in MySQL format */
+	uchar*		new_row,	/* in: new row in MySQL format */
+	struct st_table* table,		/* in: table in MySQL data
+					dictionary */
+	uchar*		upd_buff,	/* in: buffer to use */
+	ulint		buff_len,	/* in: buffer length */
+	row_prebuilt_t*	prebuilt,	/* in: InnoDB prebuilt struct */
+	THD*		thd)		/* in: user thread */
+{
+	uchar*		original_upd_buff = upd_buff;
+	Field*		field;
+	enum_field_types field_mysql_type;
+	uint		n_fields;
+	ulint		o_len;
+	ulint		n_len;
+	ulint		col_pack_len;
+	const byte*	new_mysql_row_col;
+	const byte*	o_ptr;
+	const byte*	n_ptr;
+	byte*		buf;
+	upd_field_t*	ufield;
+	ulint		col_type;
+	ulint		n_changed = 0;
+	dfield_t	dfield;
+	dict_index_t*	clust_index;
+	uint		i;
+
+	n_fields = table->s->fields;
+	clust_index = dict_table_get_first_index(prebuilt->table);
+
+	/* We use upd_buff to convert changed fields */
+	buf = (byte*) upd_buff;
+
+	for (i = 0; i < n_fields; i++) {
+		field = table->field[i];
+
+		o_ptr = (const byte*) old_row + get_field_offset(table, field);
+		n_ptr = (const byte*) new_row + get_field_offset(table, field);
+
+		/* Use new_mysql_row_col and col_pack_len to save the values */
+
+		new_mysql_row_col = n_ptr;
+		col_pack_len = field->pack_length();
+
+		o_len = col_pack_len;
+		n_len = col_pack_len;
+
+		/* We use o_ptr and n_ptr to dig up the actual data for
+		comparison.
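
calc_row_difference(), begun above, walks both row images column by column and, in the loop continuing below, records only the columns whose bytes (or NULL flags) actually differ; the update vector is built from exactly those columns. A self-contained sketch of that core idea, restricted to fixed-width NOT NULL columns (the real code also decodes VARCHAR length bytes, BLOB references, and NULL bits):

	#include <cstring>
	#include <cstdio>
	#include <vector>

	struct col_desc {
		size_t	offset;	/* byte offset of the column in the row */
		size_t	len;	/* fixed byte length of the column */
	};

	/* Return the ordinals of the columns that differ between the
	two row images. */
	static std::vector<size_t>
	changed_columns(const unsigned char* old_row,
			const unsigned char* new_row,
			const std::vector<col_desc>& cols)
	{
		std::vector<size_t>	changed;

		for (size_t i = 0; i < cols.size(); i++) {
			if (memcmp(old_row + cols[i].offset,
				   new_row + cols[i].offset,
				   cols[i].len) != 0) {
				changed.push_back(i);
			}
		}

		return(changed);
	}

	int main()
	{
		unsigned char		old_row[8] = {1, 2, 3, 4, 5, 6, 7, 8};
		unsigned char		new_row[8] = {1, 2, 9, 9, 5, 6, 7, 8};
		std::vector<col_desc>	cols = {{0, 2}, {2, 2}, {4, 4}};

		for (size_t i : changed_columns(old_row, new_row, cols)) {
			printf("column %zu changed\n", i);	/* column 1 */
		}

		return(0);
	}
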
*/ + + field_mysql_type = field->type(); + + col_type = prebuilt->table->cols[i].mtype; + + switch (col_type) { + + case DATA_BLOB: + o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len); + n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + o_ptr = row_mysql_read_true_varchar( + &o_len, o_ptr, + (ulint) + (((Field_varstring*)field)->length_bytes)); + + n_ptr = row_mysql_read_true_varchar( + &n_len, n_ptr, + (ulint) + (((Field_varstring*)field)->length_bytes)); + } + + break; + default: + ; + } + + if (field->null_ptr) { + if (field_in_record_is_null(table, field, + (char*) old_row)) { + o_len = UNIV_SQL_NULL; + } + + if (field_in_record_is_null(table, field, + (char*) new_row)) { + n_len = UNIV_SQL_NULL; + } + } + + if (o_len != n_len || (o_len != UNIV_SQL_NULL && + 0 != memcmp(o_ptr, n_ptr, o_len))) { + /* The field has changed */ + + ufield = uvect->fields + n_changed; + + /* Let us use a dummy dfield to make the conversion + from the MySQL column format to the InnoDB format */ + + dict_col_copy_type(prebuilt->table->cols + i, + dfield_get_type(&dfield)); + + if (n_len != UNIV_SQL_NULL) { + buf = row_mysql_store_col_in_innobase_format( + &dfield, + (byte*)buf, + TRUE, + new_mysql_row_col, + col_pack_len, + dict_table_is_comp(prebuilt->table)); + dfield_copy_data(&ufield->new_val, &dfield); + } else { + dfield_set_null(&ufield->new_val); + } + + ufield->exp = NULL; + ufield->orig_len = 0; + ufield->field_no = dict_col_get_clust_pos( + &prebuilt->table->cols[i], clust_index); + n_changed++; + } + } + + uvect->n_fields = n_changed; + uvect->info_bits = 0; + + ut_a(buf <= (byte*)original_upd_buff + buff_len); + + return(0); +} + +/************************************************************************** +Updates a row given as a parameter to a new value. Note that we are given +whole rows, not just the fields which are updated: this incurs some +overhead for CPU when we check which fields are actually updated. +TODO: currently InnoDB does not prevent the 'Halloween problem': +in a searched update a single row can get updated several times +if its index columns are updated! 
*/ +UNIV_INTERN +int +ha_innobase::update_row( +/*====================*/ + /* out: error number or 0 */ + const uchar* old_row, /* in: old row in MySQL format */ + uchar* new_row) /* in: new row in MySQL format */ +{ + upd_t* uvect; + int error = 0; + trx_t* trx = thd_to_trx(user_thd); + + DBUG_ENTER("ha_innobase::update_row"); + + ut_a(prebuilt->trx == trx); + + ha_statistic_increment(&SSV::ha_update_count); + + if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) + table->timestamp_field->set_time(); + + if (prebuilt->upd_node) { + uvect = prebuilt->upd_node->update; + } else { + uvect = row_get_prebuilt_update_vector(prebuilt); + } + + /* Build an update vector from the modified fields in the rows + (uses upd_buff of the handle) */ + + calc_row_difference(uvect, (uchar*) old_row, new_row, table, + upd_buff, (ulint)upd_and_key_val_buff_len, + prebuilt, user_thd); + + /* This is not a delete */ + prebuilt->upd_node->is_delete = FALSE; + + ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + + innodb_srv_conc_enter_innodb(trx); + + error = row_update_for_mysql((byte*) old_row, prebuilt); + + /* We need to do some special AUTOINC handling for the following case: + + INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ... + + We need to use the AUTOINC counter that was actually used by + MySQL in the UPDATE statement, which can be different from the + value used in the INSERT statement.*/ + + if (error == DB_SUCCESS + && table->next_number_field + && new_row == table->record[0] + && thd_sql_command(user_thd) == SQLCOM_INSERT + && (trx->duplicates & (TRX_DUP_IGNORE | TRX_DUP_REPLACE)) + == TRX_DUP_IGNORE) { + + ulonglong auto_inc; + ulonglong col_max_value; + + auto_inc = table->next_number_field->val_int(); + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); + + if (auto_inc <= col_max_value && auto_inc != 0) { + + ulonglong need; + ulonglong offset; + + offset = prebuilt->autoinc_offset; + need = prebuilt->autoinc_increment; + + auto_inc = innobase_next_autoinc( + auto_inc, need, offset, col_max_value); + + error = innobase_set_max_autoinc(auto_inc); + } + } + + innodb_srv_conc_exit_innodb(trx); + + error = convert_error_code_to_mysql(error, + prebuilt->table->flags, user_thd); + + if (error == 0 /* success */ + && uvect->n_fields == 0 /* no columns were updated */) { + + /* This is the same as success, but instructs + MySQL that the row is not really updated and it + should not increase the count of updated rows. + This is fix for http://bugs.mysql.com/29157 */ + error = HA_ERR_RECORD_IS_THE_SAME; + } + + /* Tell InnoDB server that there might be work for + utility threads: */ + + innobase_active_small(); + + DBUG_RETURN(error); +} + +/************************************************************************** +Deletes a row given as the parameter. 
*/ +UNIV_INTERN +int +ha_innobase::delete_row( +/*====================*/ + /* out: error number or 0 */ + const uchar* record) /* in: a row in MySQL format */ +{ + int error = 0; + trx_t* trx = thd_to_trx(user_thd); + + DBUG_ENTER("ha_innobase::delete_row"); + + ut_a(prebuilt->trx == trx); + + ha_statistic_increment(&SSV::ha_delete_count); + + if (!prebuilt->upd_node) { + row_get_prebuilt_update_vector(prebuilt); + } + + /* This is a delete */ + + prebuilt->upd_node->is_delete = TRUE; + + innodb_srv_conc_enter_innodb(trx); + + error = row_update_for_mysql((byte*) record, prebuilt); + + innodb_srv_conc_exit_innodb(trx); + + error = convert_error_code_to_mysql( + error, prebuilt->table->flags, user_thd); + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + innobase_active_small(); + + DBUG_RETURN(error); +} + +/************************************************************************** +Removes a new lock set on a row, if it was not read optimistically. This can +be called after a row has been read in the processing of an UPDATE or a DELETE +query, if the option innodb_locks_unsafe_for_binlog is set. */ +UNIV_INTERN +void +ha_innobase::unlock_row(void) +/*=========================*/ +{ + DBUG_ENTER("ha_innobase::unlock_row"); + + /* Consistent read does not take any locks, thus there is + nothing to unlock. */ + + if (prebuilt->select_lock_type == LOCK_NONE) { + DBUG_VOID_RETURN; + } + + switch (prebuilt->row_read_type) { + case ROW_READ_WITH_LOCKS: + if (!srv_locks_unsafe_for_binlog + && prebuilt->trx->isolation_level + != TRX_ISO_READ_COMMITTED) { + break; + } + /* fall through */ + case ROW_READ_TRY_SEMI_CONSISTENT: + row_unlock_for_mysql(prebuilt, FALSE); + break; + case ROW_READ_DID_SEMI_CONSISTENT: + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + break; + } + + DBUG_VOID_RETURN; +} + +/* See handler.h and row0mysql.h for docs on this function. */ +UNIV_INTERN +bool +ha_innobase::was_semi_consistent_read(void) +/*=======================================*/ +{ + return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT); +} + +/* See handler.h and row0mysql.h for docs on this function. */ +UNIV_INTERN +void +ha_innobase::try_semi_consistent_read(bool yes) +/*===========================================*/ +{ + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + /* Row read type is set to semi consistent read if this was + requested by the MySQL and either innodb_locks_unsafe_for_binlog + option is used or this session is using READ COMMITTED isolation + level. */ + + if (yes + && (srv_locks_unsafe_for_binlog + || prebuilt->trx->isolation_level == TRX_ISO_READ_COMMITTED)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_WITH_LOCKS; + } +} + +#ifdef ROW_MERGE_IS_INDEX_USABLE +/********************************************************************** +Check if an index can be used by the optimizer. 
*/ +UNIV_INTERN +bool +ha_innobase::is_index_available( +/*============================*/ + /* out: true if available else false*/ + uint keynr) /* in: index number to check */ +{ + DBUG_ENTER("ha_innobase::is_index_available"); + + if (table && keynr != MAX_KEY && table->s->keys > 0) { + const dict_index_t* index; + const KEY* key = table->key_info + keynr; + + ut_ad(user_thd == ha_thd()); + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + + index = dict_table_get_index_on_name( + prebuilt->table, key->name); + + if (!row_merge_is_index_usable(prebuilt->trx, index)) { + + DBUG_RETURN(false); + } + } + + DBUG_RETURN(true); +} +#endif /* ROW_MERGE_IS_INDEX_USABLE */ + +/********************************************************************** +Initializes a handle to use an index. */ +UNIV_INTERN +int +ha_innobase::index_init( +/*====================*/ + /* out: 0 or error number */ + uint keynr, /* in: key (index) number */ + bool sorted) /* in: 1 if result MUST be sorted according to index */ +{ + DBUG_ENTER("index_init"); + + DBUG_RETURN(change_active_index(keynr)); +} + +/********************************************************************** +Currently does nothing. */ +UNIV_INTERN +int +ha_innobase::index_end(void) +/*========================*/ +{ + int error = 0; + DBUG_ENTER("index_end"); + active_index=MAX_KEY; + DBUG_RETURN(error); +} + +/************************************************************************* +Converts a search mode flag understood by MySQL to a flag understood +by InnoDB. */ +static inline +ulint +convert_search_mode_to_innobase( +/*============================*/ + enum ha_rkey_function find_flag) +{ + switch (find_flag) { + case HA_READ_KEY_EXACT: + /* this does not require the index to be UNIQUE */ + return(PAGE_CUR_GE); + case HA_READ_KEY_OR_NEXT: + return(PAGE_CUR_GE); + case HA_READ_KEY_OR_PREV: + return(PAGE_CUR_LE); + case HA_READ_AFTER_KEY: + return(PAGE_CUR_G); + case HA_READ_BEFORE_KEY: + return(PAGE_CUR_L); + case HA_READ_PREFIX: + return(PAGE_CUR_GE); + case HA_READ_PREFIX_LAST: + return(PAGE_CUR_LE); + case HA_READ_PREFIX_LAST_OR_PREV: + return(PAGE_CUR_LE); + /* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always + pass a complete-field prefix of a key value as the search + tuple. I.e., it is not allowed that the last field would + just contain n first bytes of the full field value. + MySQL uses a 'padding' trick to convert LIKE 'abc%' + type queries so that it can use as a search tuple + a complete-field-prefix of a key value. Thus, the InnoDB + search mode PAGE_CUR_LE_OR_EXTENDS is never used. + TODO: when/if MySQL starts to use also partial-field + prefixes, we have to deal with stripping of spaces + and comparison of non-latin1 char type fields in + innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to + work correctly. */ + case HA_READ_MBR_CONTAIN: + case HA_READ_MBR_INTERSECT: + case HA_READ_MBR_WITHIN: + case HA_READ_MBR_DISJOINT: + case HA_READ_MBR_EQUAL: + my_error(ER_TABLE_CANT_HANDLE_SPKEYS, MYF(0)); + return(PAGE_CUR_UNSUPP); + /* do not use "default:" in order to produce a gcc warning: + enumeration value '...' not handled in switch + (if -Wswitch or -Wall is used) */ + } + + my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality"); + + return(PAGE_CUR_UNSUPP); +} + +/* + BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED + --------------------------------------------------- +The following does not cover all the details, but explains how we determine +the start of a new SQL statement, and what is associated with it. 
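
convert_search_mode_to_innobase() above can map MySQL's prefix-style search flags onto plain page-cursor modes because MySQL always hands InnoDB complete-field prefixes, as the comment in that function explains. One common way to turn a LIKE 'abc%' prefix into such a closed-open range scan, sketched standalone for single-byte charsets (an illustration of the idea, not MySQL's actual rewriting code):

	#include <string>
	#include <cstdio>

	/* Lower bound of the range is the prefix itself (PAGE_CUR_GE);
	the upper bound (PAGE_CUR_L) is the prefix with its last byte
	incremented, carrying past 0xFF bytes. */
	static std::string prefix_upper_bound(std::string prefix)
	{
		while (!prefix.empty()) {
			if ((unsigned char) prefix.back() != 0xFF) {
				prefix.back()++;
				return(prefix);
			}
			prefix.pop_back();	/* carry */
		}
		return(prefix);	/* empty: scan to the end of the index */
	}

	int main()
	{
		printf("%s\n", prefix_upper_bound("abc").c_str()); /* abd */
		return(0);
	}
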
+
+For each table in the database the MySQL interpreter may have several
+table handle instances in use, also in a single SQL query. For each table
+handle instance there is an InnoDB 'prebuilt' struct which contains most
+of the InnoDB data associated with this table handle instance.
+
+ A) if the user has not explicitly set any MySQL table level locks:
+
+ 1) MySQL calls ::external_lock to set an 'intention' table level lock on
+the table of the handle instance. There we set
+prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set
+true if we are taking this table handle instance to use in a new SQL
+statement issued by the user. We also increment trx->n_mysql_tables_in_use.
+
+ 2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search
+instructions to prebuilt->template of the table handle instance in
+::index_read. The template is used to save CPU time in large joins.
+
+ 3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we
+allocate a new consistent read view for the trx if it does not yet have one,
+or in the case of a locking read, set an InnoDB 'intention' table level
+lock on the table.
+
+ 4) We do the SELECT. MySQL may repeatedly call ::index_read for the
+same table handle instance, if it is a join.
+
+ 5) When the SELECT ends, MySQL removes its intention table level locks
+in ::external_lock. When trx->n_mysql_tables_in_use drops to zero,
+ (a) we execute a COMMIT there if the autocommit is on,
+ (b) we also release possible 'SQL statement level resources' InnoDB may
+have for this SQL statement. The MySQL interpreter does NOT execute
+autocommit for pure read transactions, though it should. That is why the
+table handler in that case has to execute the COMMIT in ::external_lock.
+
+ B) If the user has explicitly set MySQL table level locks, then MySQL
+does NOT call ::external_lock at the start of the statement. To determine
+when we are at the start of a new SQL statement, we also compare, at the
+start of ::index_read, the query id to the latest query id for which the
+table handle instance was used. If it has changed, we know we are at the
+start of a new SQL statement. Since the query id can theoretically wrap
+around, we use this test only as a secondary way of determining the
+start of a new SQL statement. */
+
+
+/**************************************************************************
+Positions an index cursor to the index specified in the handle. Fetches the
+row if any.
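
A compact sketch of the two statement-start detection paths just described: in case A, ::external_lock marks the start explicitly; in case B, a changed query id serves as the secondary signal. Toy state only; the field and method names are illustrative:

	#include <cstdio>

	struct handle_state_sketch {
		bool		sql_stat_start;
		unsigned long	last_query_id;

		/* Case A: ::external_lock announces a new statement. */
		void external_lock_start()
		{
			sql_stat_start = true;
		}

		/* Case B: at the start of ::index_read, a changed
		query id also reveals a new statement. */
		bool is_new_statement(unsigned long query_id)
		{
			bool	fresh = sql_stat_start
				|| query_id != last_query_id;

			last_query_id = query_id;
			sql_stat_start = false;

			return(fresh);
		}
	};

	int main()
	{
		handle_state_sketch	h = {false, 1};

		printf("%d\n", h.is_new_statement(1)); /* 0: same stmt */
		printf("%d\n", h.is_new_statement(2)); /* 1: id changed */
		return(0);
	}
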
*/
+UNIV_INTERN
+int
+ha_innobase::index_read(
+/*====================*/
+					/* out: 0, HA_ERR_KEY_NOT_FOUND,
+					or error number */
+	uchar*		buf,		/* in/out: buffer for the returned
+					row */
+	const uchar*	key_ptr,	/* in: key value; if this is NULL
+					we position the cursor at the
+					start or end of index; this can
+					also contain an InnoDB row id, in
+					which case key_len is the InnoDB
+					row id length; the key value can
+					also be a prefix of a full key value,
+					and the last column can be a prefix
+					of a full column */
+	uint		key_len,/* in: key value length */
+	enum ha_rkey_function find_flag)/* in: search flags from my_base.h */
+{
+	ulint		mode;
+	dict_index_t*	index;
+	ulint		match_mode = 0;
+	int		error;
+	ulint		ret;
+
+	DBUG_ENTER("index_read");
+
+	ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+	ha_statistic_increment(&SSV::ha_read_key_count);
+
+	index = prebuilt->index;
+
+	/* Note that the index for which the search template is built is not
+	necessarily prebuilt->index; it can also be the clustered index */
+
+	if (prebuilt->sql_stat_start) {
+		build_template(prebuilt, user_thd, table, ROW_MYSQL_REC_FIELDS);
+	}
+
+	if (key_ptr) {
+		/* Convert the search key value to InnoDB format into
+		prebuilt->search_tuple */
+
+		row_sel_convert_mysql_key_to_innobase(
+			prebuilt->search_tuple,
+			(byte*) key_val_buff,
+			(ulint)upd_and_key_val_buff_len,
+			index,
+			(byte*) key_ptr,
+			(ulint) key_len,
+			prebuilt->trx);
+	} else {
+		/* We position the cursor to the last or the first entry
+		in the index */
+
+		dtuple_set_n_fields(prebuilt->search_tuple, 0);
+	}
+
+	mode = convert_search_mode_to_innobase(find_flag);
+
+	match_mode = 0;
+
+	if (find_flag == HA_READ_KEY_EXACT) {
+
+		match_mode = ROW_SEL_EXACT;
+
+	} else if (find_flag == HA_READ_PREFIX
+		   || find_flag == HA_READ_PREFIX_LAST) {
+
+		match_mode = ROW_SEL_EXACT_PREFIX;
+	}
+
+	last_match_mode = (uint) match_mode;
+
+	if (mode != PAGE_CUR_UNSUPP) {
+
+		innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+		ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
+					   match_mode, 0);
+
+		innodb_srv_conc_exit_innodb(prebuilt->trx);
+	} else {
+
+		ret = DB_UNSUPPORTED;
+	}
+
+	switch (ret) {
+	case DB_SUCCESS:
+		error = 0;
+		table->status = 0;
+		break;
+	case DB_RECORD_NOT_FOUND:
+		error = HA_ERR_KEY_NOT_FOUND;
+		table->status = STATUS_NOT_FOUND;
+		break;
+	case DB_END_OF_INDEX:
+		error = HA_ERR_KEY_NOT_FOUND;
+		table->status = STATUS_NOT_FOUND;
+		break;
+	default:
+		error = convert_error_code_to_mysql((int) ret,
+						    prebuilt->table->flags,
+						    user_thd);
+		table->status = STATUS_NOT_FOUND;
+		break;
+	}
+
+	DBUG_RETURN(error);
+}
+
+/***********************************************************************
+The following function works like index_read, but it finds the last
+row with the current key value or prefix. */
+UNIV_INTERN
+int
+ha_innobase::index_read_last(
+/*=========================*/
+				/* out: 0, HA_ERR_KEY_NOT_FOUND, or an
+				error code */
+	uchar*		buf,	/* out: fetched row */
+	const uchar*	key_ptr,/* in: key value, or a prefix of a full
+				key value */
+	uint		key_len)/* in: length of the key val or prefix
+				in bytes */
+{
+	return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
+}
+
+/************************************************************************
+Get the index for a handle. Does not change active index.*/
+UNIV_INTERN
+dict_index_t*
+ha_innobase::innobase_get_index(
+/*============================*/
+			/* out: NULL or index instance.
*/ + uint keynr) /* in: use this index; MAX_KEY means always + clustered index, even if it was internally + generated by InnoDB */ +{ + KEY* key = 0; + dict_index_t* index = 0; + + DBUG_ENTER("innobase_get_index"); + ha_statistic_increment(&SSV::ha_read_key_count); + + ut_ad(user_thd == ha_thd()); + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + + if (keynr != MAX_KEY && table->s->keys > 0) { + key = table->key_info + keynr; + + index = dict_table_get_index_on_name(prebuilt->table, + key->name); + } else { + index = dict_table_get_first_index(prebuilt->table); + } + + if (!index) { + sql_print_error( + "Innodb could not find key n:o %u with name %s " + "from dict cache for table %s", + keynr, key ? key->name : "NULL", + prebuilt->table->name); + } + + DBUG_RETURN(index); +} + +/************************************************************************ +Changes the active index of a handle. */ +UNIV_INTERN +int +ha_innobase::change_active_index( +/*=============================*/ + /* out: 0 or error code */ + uint keynr) /* in: use this index; MAX_KEY means always clustered + index, even if it was internally generated by + InnoDB */ +{ + DBUG_ENTER("change_active_index"); + + ut_ad(user_thd == ha_thd()); + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + + active_index = keynr; + + prebuilt->index = innobase_get_index(keynr); + + if (UNIV_UNLIKELY(!prebuilt->index)) { + sql_print_warning("InnoDB: change_active_index(%u) failed", + keynr); + DBUG_RETURN(1); + } + + ut_a(prebuilt->search_tuple != 0); + + dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields); + + dict_index_copy_types(prebuilt->search_tuple, prebuilt->index, + prebuilt->index->n_fields); + + /* MySQL changes the active index for a handle also during some + queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX() + and then calculates the sum. Previously we played safe and used + the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary + copying. Starting from MySQL-4.1 we use a more efficient flag here. */ + + build_template(prebuilt, user_thd, table, ROW_MYSQL_REC_FIELDS); + + DBUG_RETURN(0); +} + +/************************************************************************** +Positions an index cursor to the index specified in keynr. Fetches the +row if any. */ +/* ??? This is only used to read whole keys ??? */ +UNIV_INTERN +int +ha_innobase::index_read_idx( +/*========================*/ + /* out: error number or 0 */ + uchar* buf, /* in/out: buffer for the returned + row */ + uint keynr, /* in: use this index */ + const uchar* key, /* in: key value; if this is NULL + we position the cursor at the + start or end of index */ + uint key_len, /* in: key value length */ + enum ha_rkey_function find_flag)/* in: search flags from my_base.h */ +{ + if (change_active_index(keynr)) { + + return(1); + } + + return(index_read(buf, key, key_len, find_flag)); +} + +/*************************************************************************** +Reads the next or previous row from a cursor, which must have previously been +positioned using index_read. 
*/ +UNIV_INTERN +int +ha_innobase::general_fetch( +/*=======================*/ + /* out: 0, HA_ERR_END_OF_FILE, or error + number */ + uchar* buf, /* in/out: buffer for next row in MySQL + format */ + uint direction, /* in: ROW_SEL_NEXT or ROW_SEL_PREV */ + uint match_mode) /* in: 0, ROW_SEL_EXACT, or + ROW_SEL_EXACT_PREFIX */ +{ + ulint ret; + int error = 0; + + DBUG_ENTER("general_fetch"); + + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + + innodb_srv_conc_enter_innodb(prebuilt->trx); + + ret = row_search_for_mysql( + (byte*)buf, 0, prebuilt, match_mode, direction); + + innodb_srv_conc_exit_innodb(prebuilt->trx); + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + case DB_RECORD_NOT_FOUND: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + default: + error = convert_error_code_to_mysql( + (int) ret, prebuilt->table->flags, user_thd); + table->status = STATUS_NOT_FOUND; + break; + } + + DBUG_RETURN(error); +} + +/*************************************************************************** +Reads the next row from a cursor, which must have previously been +positioned using index_read. */ +UNIV_INTERN +int +ha_innobase::index_next( +/*====================*/ + /* out: 0, HA_ERR_END_OF_FILE, or error + number */ + uchar* buf) /* in/out: buffer for next row in MySQL + format */ +{ + ha_statistic_increment(&SSV::ha_read_next_count); + + return(general_fetch(buf, ROW_SEL_NEXT, 0)); +} + +/*********************************************************************** +Reads the next row matching to the key value given as the parameter. */ +UNIV_INTERN +int +ha_innobase::index_next_same( +/*=========================*/ + /* out: 0, HA_ERR_END_OF_FILE, or error + number */ + uchar* buf, /* in/out: buffer for the row */ + const uchar* key, /* in: key value */ + uint keylen) /* in: key value length */ +{ + ha_statistic_increment(&SSV::ha_read_next_count); + + return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode)); +} + +/*************************************************************************** +Reads the previous row from a cursor, which must have previously been +positioned using index_read. */ +UNIV_INTERN +int +ha_innobase::index_prev( +/*====================*/ + /* out: 0, HA_ERR_END_OF_FILE, or error number */ + uchar* buf) /* in/out: buffer for previous row in MySQL format */ +{ + ha_statistic_increment(&SSV::ha_read_prev_count); + + return(general_fetch(buf, ROW_SEL_PREV, 0)); +} + +/************************************************************************ +Positions a cursor on the first record in an index and reads the +corresponding row to buf. */ +UNIV_INTERN +int +ha_innobase::index_first( +/*=====================*/ + /* out: 0, HA_ERR_END_OF_FILE, or error code */ + uchar* buf) /* in/out: buffer for the row */ +{ + int error; + + DBUG_ENTER("index_first"); + ha_statistic_increment(&SSV::ha_read_first_count); + + error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/************************************************************************ +Positions a cursor on the last record in an index and reads the +corresponding row to buf. 
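
index_first()/index_next() above implement the usual handler cursor protocol: a return of 0 means a row was placed in buf, and HA_ERR_END_OF_FILE means the scan is done. A self-contained toy showing how a caller drives that protocol (the stand-in error code is illustrative; the real value comes from my_base.h):

	#include <vector>
	#include <cstdio>

	static const int EOF_ERR = 1;	/* stands in for
					HA_ERR_END_OF_FILE */

	struct toy_cursor {
		std::vector<int>	rows{10, 20, 30};
		size_t			pos = 0;

		int index_first(int* buf) { pos = 0; return index_next(buf); }
		int index_next(int* buf)
		{
			if (pos == rows.size()) {
				return(EOF_ERR);
			}
			*buf = rows[pos++];
			return(0);
		}
	};

	int main()
	{
		toy_cursor	c;
		int		row;

		for (int err = c.index_first(&row); err == 0;
		     err = c.index_next(&row)) {
			printf("%d\n", row);	/* 10 20 30 */
		}

		return(0);
	}
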
*/ +UNIV_INTERN +int +ha_innobase::index_last( +/*====================*/ + /* out: 0, HA_ERR_END_OF_FILE, or error code */ + uchar* buf) /* in/out: buffer for the row */ +{ + int error; + + DBUG_ENTER("index_last"); + ha_statistic_increment(&SSV::ha_read_last_count); + + error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/******************************************************************** +Initialize a table scan. */ +UNIV_INTERN +int +ha_innobase::rnd_init( +/*==================*/ + /* out: 0 or error number */ + bool scan) /* in: TRUE if table/index scan FALSE otherwise */ +{ + int err; + + /* Store the active index value so that we can restore the original + value after a scan */ + + if (prebuilt->clust_index_was_generated) { + err = change_active_index(MAX_KEY); + } else { + err = change_active_index(primary_key); + } + + /* Don't use semi-consistent read in random row reads (by position). + This means we must disable semi_consistent_read if scan is false */ + + if (!scan) { + try_semi_consistent_read(0); + } + + start_of_scan = 1; + + return(err); +} + +/********************************************************************* +Ends a table scan. */ +UNIV_INTERN +int +ha_innobase::rnd_end(void) +/*======================*/ + /* out: 0 or error number */ +{ + return(index_end()); +} + +/********************************************************************* +Reads the next row in a table scan (also used to read the FIRST row +in a table scan). */ +UNIV_INTERN +int +ha_innobase::rnd_next( +/*==================*/ + /* out: 0, HA_ERR_END_OF_FILE, or error number */ + uchar* buf) /* in/out: returns the row in this buffer, + in MySQL format */ +{ + int error; + + DBUG_ENTER("rnd_next"); + ha_statistic_increment(&SSV::ha_read_rnd_next_count); + + if (start_of_scan) { + error = index_first(buf); + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + start_of_scan = 0; + } else { + error = general_fetch(buf, ROW_SEL_NEXT, 0); + } + + DBUG_RETURN(error); +} + +/************************************************************************** +Fetches a row from the table based on a row reference. 
*/ +UNIV_INTERN +int +ha_innobase::rnd_pos( +/*=================*/ + /* out: 0, HA_ERR_KEY_NOT_FOUND, or error code */ + uchar* buf, /* in/out: buffer for the row */ + uchar* pos) /* in: primary key value of the row in the + MySQL format, or the row id if the clustered + index was internally generated by InnoDB; the + length of data in pos has to be ref_length */ +{ + int error; + uint keynr = active_index; + DBUG_ENTER("rnd_pos"); + DBUG_DUMP("key", pos, ref_length); + + ha_statistic_increment(&SSV::ha_read_rnd_count); + + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + if (prebuilt->clust_index_was_generated) { + /* No primary key was defined for the table and we + generated the clustered index from the row id: the + row reference is the row id, not any key value + that MySQL knows of */ + + error = change_active_index(MAX_KEY); + } else { + error = change_active_index(primary_key); + } + + if (error) { + DBUG_PRINT("error", ("Got error: %d", error)); + DBUG_RETURN(error); + } + + /* Note that we assume the length of the row reference is fixed + for the table, and it is == ref_length */ + + error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT); + + if (error) { + DBUG_PRINT("error", ("Got error: %d", error)); + } + + change_active_index(keynr); + + DBUG_RETURN(error); +} + +/************************************************************************* +Stores a reference to the current row to 'ref' field of the handle. Note +that in the case where we have generated the clustered index for the +table, the function parameter is illogical: we MUST ASSUME that 'record' +is the current 'position' of the handle, because if row ref is actually +the row id internally generated in InnoDB, then 'record' does not contain +it. We just guess that the row id must be for the record where the handle +was positioned the last time. */ +UNIV_INTERN +void +ha_innobase::position( +/*==================*/ + const uchar* record) /* in: row in MySQL format */ +{ + uint len; + + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + if (prebuilt->clust_index_was_generated) { + /* No primary key was defined for the table and we + generated the clustered index from row id: the + row reference will be the row id, not any key value + that MySQL knows of */ + + len = DATA_ROW_ID_LEN; + + memcpy(ref, prebuilt->row_id, len); + } else { + len = store_key_val_for_row(primary_key, (char*)ref, + ref_length, record); + } + + /* We assume that the 'ref' value len is always fixed for the same + table. */ + + if (len != ref_length) { + sql_print_error("Stored ref len is %lu, but table ref len is %lu", + (ulong) len, (ulong) ref_length); + } +} + +/* limit innodb monitor access to users with PROCESS privilege. +See http://bugs.mysql.com/32710 for expl. why we choose PROCESS. */ +#define IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name, thd) \ + (row_is_magic_monitor_table(table_name) \ + && check_global_access(thd, PROCESS_ACL)) + +/********************************************************************* +Creates a table definition to an InnoDB database. 
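
position() and rnd_pos() above form a save/restore pair: whatever fixed-length reference position() writes into 'ref' (primary key bytes, or the InnoDB row id when the clustered index was generated internally) must let rnd_pos() fetch the same row again later. A toy round trip with a fixed-length "row id" reference (names illustrative):

	#include <vector>
	#include <cstring>
	#include <cstdio>

	struct toy_table {
		std::vector<int> rows{11, 22, 33};

		/* Save a reference to the current row into ref. */
		void position(size_t cur, unsigned char* ref) const
		{
			memcpy(ref, &cur, sizeof(cur));
		}

		/* Fetch the row a saved reference points at. */
		int rnd_pos(int* buf, const unsigned char* ref) const
		{
			size_t	cur;

			memcpy(&cur, ref, sizeof(cur));
			if (cur >= rows.size()) {
				return(1);	/* not found */
			}
			*buf = rows[cur];
			return(0);
		}
	};

	int main()
	{
		toy_table	t;
		unsigned char	ref[sizeof(size_t)];
		int		row;

		t.position(2, ref);	/* remember the current row */
		if (t.rnd_pos(&row, ref) == 0) {
			printf("%d\n", row);	/* 33 */
		}
		return(0);
	}
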
*/ +static +int +create_table_def( +/*=============*/ + trx_t* trx, /* in: InnoDB transaction handle */ + TABLE* form, /* in: information on table + columns and indexes */ + const char* table_name, /* in: table name */ + const char* path_of_temp_table,/* in: if this is a table explicitly + created by the user with the + TEMPORARY keyword, then this + parameter is the dir path where the + table should be placed if we create + an .ibd file for it (no .ibd extension + in the path, though); otherwise this + is NULL */ + ulint flags) /* in: table flags */ +{ + Field* field; + dict_table_t* table; + ulint n_cols; + int error; + ulint col_type; + ulint col_len; + ulint nulls_allowed; + ulint unsigned_type; + ulint binary_type; + ulint long_true_varchar; + ulint charset_no; + ulint i; + + DBUG_ENTER("create_table_def"); + DBUG_PRINT("enter", ("table_name: %s", table_name)); + + ut_a(trx->mysql_thd != NULL); + if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name, + (THD*) trx->mysql_thd)) { + DBUG_RETURN(HA_ERR_GENERIC); + } + + n_cols = form->s->fields; + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + + table = dict_mem_table_create(table_name, 0, n_cols, flags); + + if (path_of_temp_table) { + table->dir_path_of_temp_table = + mem_heap_strdup(table->heap, path_of_temp_table); + } + + for (i = 0; i < n_cols; i++) { + field = form->field[i]; + + col_type = get_innobase_type_from_mysql_type(&unsigned_type, + field); + if (field->null_ptr) { + nulls_allowed = 0; + } else { + nulls_allowed = DATA_NOT_NULL; + } + + if (field->binary()) { + binary_type = DATA_BINARY_TYPE; + } else { + binary_type = 0; + } + + charset_no = 0; + + if (dtype_is_string_type(col_type)) { + + charset_no = (ulint)field->charset()->number; + + if (UNIV_UNLIKELY(charset_no >= 256)) { + /* in data0type.h we assume that the + number fits in one byte in prtype */ + push_warning_printf( + (THD*) trx->mysql_thd, + MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_CANT_CREATE_TABLE, + "In InnoDB, charset-collation codes" + " must be below 256." + " Unsupported code %lu.", + (ulong) charset_no); + DBUG_RETURN(ER_CANT_CREATE_TABLE); + } + } + + ut_a(field->type() < 256); /* we assume in dtype_form_prtype() + that this fits in one byte */ + col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes length field + for a true VARCHAR. Let us subtract that, so that the InnoDB + column length in the InnoDB data dictionary is the real + maximum byte length of the actual data. */ + + long_true_varchar = 0; + + if (field->type() == MYSQL_TYPE_VARCHAR) { + col_len -= ((Field_varstring*)field)->length_bytes; + + if (((Field_varstring*)field)->length_bytes == 2) { + long_true_varchar = DATA_LONG_TRUE_VARCHAR; + } + } + + dict_mem_table_add_col(table, table->heap, + (char*) field->field_name, + col_type, + dtype_form_prtype( + (ulint)field->type() + | nulls_allowed | unsigned_type + | binary_type | long_true_varchar, + charset_no), + col_len); + } + + error = row_create_table_for_mysql(table, trx); + + error = convert_error_code_to_mysql(error, flags, NULL); + + DBUG_RETURN(error); +} + +/********************************************************************* +Creates an index in an InnoDB database. 
*/ +static +int +create_index( +/*=========*/ + trx_t* trx, /* in: InnoDB transaction handle */ + TABLE* form, /* in: information on table + columns and indexes */ + ulint flags, /* in: InnoDB table flags */ + const char* table_name, /* in: table name */ + uint key_num) /* in: index number */ +{ + Field* field; + dict_index_t* index; + int error; + ulint n_fields; + KEY* key; + KEY_PART_INFO* key_part; + ulint ind_type; + ulint col_type; + ulint prefix_len; + ulint is_unsigned; + ulint i; + ulint j; + ulint* field_lengths; + + DBUG_ENTER("create_index"); + + key = form->key_info + key_num; + + n_fields = key->key_parts; + + ind_type = 0; + + if (key_num == form->s->primary_key) { + ind_type = ind_type | DICT_CLUSTERED; + } + + if (key->flags & HA_NOSAME ) { + ind_type = ind_type | DICT_UNIQUE; + } + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + + index = dict_mem_index_create(table_name, key->name, 0, + ind_type, n_fields); + + field_lengths = (ulint*) my_malloc(sizeof(ulint) * n_fields, + MYF(MY_FAE)); + + for (i = 0; i < n_fields; i++) { + key_part = key->key_part + i; + + /* (The flag HA_PART_KEY_SEG denotes in MySQL a column prefix + field in an index: we only store a specified number of first + bytes of the column to the index field.) The flag does not + seem to be properly set by MySQL. Let us fall back on testing + the length of the key part versus the column. */ + + field = NULL; + for (j = 0; j < form->s->fields; j++) { + + field = form->field[j]; + + if (0 == innobase_strcasecmp( + field->field_name, + key_part->field->field_name)) { + /* Found the corresponding column */ + + break; + } + } + + ut_a(j < form->s->fields); + + col_type = get_innobase_type_from_mysql_type( + &is_unsigned, key_part->field); + + if (DATA_BLOB == col_type + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*)field)->length_bytes)) { + + prefix_len = key_part->length; + + if (col_type == DATA_INT + || col_type == DATA_FLOAT + || col_type == DATA_DOUBLE + || col_type == DATA_DECIMAL) { + sql_print_error( + "MySQL is trying to create a column " + "prefix index field, on an " + "inappropriate data type. Table " + "name %s, column name %s.", + table_name, + key_part->field->field_name); + + prefix_len = 0; + } + } else { + prefix_len = 0; + } + + field_lengths[i] = key_part->length; + + dict_mem_index_add_field(index, + (char*) key_part->field->field_name, prefix_len); + } + + /* Even though we've defined max_supported_key_part_length, we + still do our own checking using field_lengths to be absolutely + sure we don't create too long indexes. */ + error = row_create_index_for_mysql(index, trx, field_lengths); + + error = convert_error_code_to_mysql(error, flags, NULL); + + my_free(field_lengths, MYF(0)); + + DBUG_RETURN(error); +} + +/********************************************************************* +Creates an index to an InnoDB table when the user has defined no +primary index. 
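
create_index() above decides that a key part is a column prefix by comparing the key part length against the column's full byte length, subtracting the 1- or 2-byte length header for true VARCHARs. A simplified standalone form of that test (it folds the BLOB and fixed-width branches into one; toy field description, not MySQL's Field class):

	#include <cstdio>

	struct field_sketch {
		unsigned pack_length;	/* on-disk bytes, incl. header */
		unsigned length_bytes;	/* 0 fixed, 1 or 2 for VARCHAR */
	};

	static bool is_prefix_key_part(const field_sketch& f,
				       unsigned key_part_len)
	{
		return(key_part_len < f.pack_length - f.length_bytes);
	}

	int main()
	{
		field_sketch	varchar100 = {101, 1};	/* VARCHAR(100) */

		printf("%d %d\n",
		       is_prefix_key_part(varchar100, 10),   /* 1: col(10) */
		       is_prefix_key_part(varchar100, 100)); /* 0: full */
		return(0);
	}
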
*/ +static +int +create_clustered_index_when_no_primary( +/*===================================*/ + trx_t* trx, /* in: InnoDB transaction handle */ + ulint flags, /* in: InnoDB table flags */ + const char* table_name) /* in: table name */ +{ + dict_index_t* index; + int error; + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + + index = dict_mem_index_create(table_name, "GEN_CLUST_INDEX", + 0, DICT_CLUSTERED, 0); + + error = row_create_index_for_mysql(index, trx, NULL); + + error = convert_error_code_to_mysql(error, flags, NULL); + + return(error); +} + +/********************************************************************* +Validates the create options. We may build on this function +in future. For now, it checks two specifiers: +KEY_BLOCK_SIZE and ROW_FORMAT +If innodb_strict_mode is not set then this function is a no-op */ +static +ibool +create_options_are_valid( +/*=====================*/ + /* out: TRUE if valid. */ + THD* thd, /* in: connection thread. */ + TABLE* form, /* in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info) /* in: create info. */ +{ + ibool kbs_specified = FALSE; + ibool ret = TRUE; + + + ut_ad(thd != NULL); + + /* If innodb_strict_mode is not set don't do any validation. */ + if (!(THDVAR(thd, strict_mode))) { + return(TRUE); + } + + ut_ad(form != NULL); + ut_ad(create_info != NULL); + + /* First check if KEY_BLOCK_SIZE was specified. */ + if (create_info->key_block_size + || (create_info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE)) { + + kbs_specified = TRUE; + switch (create_info->key_block_size) { + case 1: + case 2: + case 4: + case 8: + case 16: + /* Valid value. */ + break; + default: + push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: invalid" + " KEY_BLOCK_SIZE = %lu." + " Valid values are" + " [1, 2, 4, 8, 16]", + create_info->key_block_size); + ret = FALSE; + } + } + + /* If KEY_BLOCK_SIZE was specified, check for its + dependencies. */ + if (kbs_specified && !srv_file_per_table) { + push_warning(thd, MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE" + " requires innodb_file_per_table."); + ret = FALSE; + } + + if (kbs_specified && srv_file_format < DICT_TF_FORMAT_ZIP) { + push_warning(thd, MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE" + " requires innodb_file_format >" + " Antelope."); + ret = FALSE; + } + + /* Now check for ROW_FORMAT specifier. */ + if (create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) { + switch (form->s->row_type) { + const char* row_format_name; + case ROW_TYPE_COMPRESSED: + case ROW_TYPE_DYNAMIC: + row_format_name + = form->s->row_type == ROW_TYPE_COMPRESSED + ? "COMPRESSED" + : "DYNAMIC"; + + /* These two ROW_FORMATs require + srv_file_per_table and srv_file_format */ + if (!srv_file_per_table) { + push_warning_printf( + thd, + MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s" + " requires innodb_file_per_table.", + row_format_name); + ret = FALSE; + + } + + if (srv_file_format < DICT_TF_FORMAT_ZIP) { + push_warning_printf( + thd, + MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s" + " requires innodb_file_format >" + " Antelope.", + row_format_name); + ret = FALSE; + } + + /* Cannot specify KEY_BLOCK_SIZE with + ROW_FORMAT = DYNAMIC. + However, we do allow COMPRESSED to be + specified with KEY_BLOCK_SIZE. 
*/ + if (kbs_specified + && form->s->row_type == ROW_TYPE_DYNAMIC) { + push_warning_printf( + thd, + MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: cannot specify" + " ROW_FORMAT = DYNAMIC with" + " KEY_BLOCK_SIZE."); + ret = FALSE; + } + + break; + + case ROW_TYPE_REDUNDANT: + case ROW_TYPE_COMPACT: + case ROW_TYPE_DEFAULT: + /* Default is COMPACT. */ + row_format_name + = form->s->row_type == ROW_TYPE_REDUNDANT + ? "REDUNDANT" + : "COMPACT"; + + /* Cannot specify KEY_BLOCK_SIZE with these + format specifiers. */ + if (kbs_specified) { + push_warning_printf( + thd, + MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: cannot specify" + " ROW_FORMAT = %s with" + " KEY_BLOCK_SIZE.", + row_format_name); + ret = FALSE; + } + + break; + + default: + push_warning(thd, + MYSQL_ERROR::WARN_LEVEL_ERROR, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: invalid ROW_FORMAT specifier."); + ret = FALSE; + + } + } + + return(ret); +} + +/********************************************************************* +Update create_info. Used in SHOW CREATE TABLE et al. */ +UNIV_INTERN +void +ha_innobase::update_create_info( +/*============================*/ + HA_CREATE_INFO* create_info) /* in/out: create info */ +{ + if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) { + ha_innobase::info(HA_STATUS_AUTO); + create_info->auto_increment_value = stats.auto_increment_value; + } +} + +/********************************************************************* +Creates a new table to an InnoDB database. */ +UNIV_INTERN +int +ha_innobase::create( +/*================*/ + /* out: error number */ + const char* name, /* in: table name */ + TABLE* form, /* in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info) /* in: more information of the + created table, contains also the + create statement string */ +{ + int error; + dict_table_t* innobase_table; + trx_t* parent_trx; + trx_t* trx; + int primary_key_no; + uint i; + char name2[FN_REFLEN]; + char norm_name[FN_REFLEN]; + THD* thd = ha_thd(); + ib_int64_t auto_inc_value; + ulint flags; + /* Cache the value of innodb_file_format, in case it is + modified by another thread while the table is being created. */ + const ulint file_format = srv_file_format; + + DBUG_ENTER("ha_innobase::create"); + + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(create_info != NULL); + +#ifdef __WIN__ + /* Names passed in from server are in two formats: + 1. /: for normal table creation + 2. full path: for temp table creation, or sym link + + When srv_file_per_table is on, check for full path pattern, i.e. + X:\dir\..., X is a driver letter, or + \\dir1\dir2\..., UNC path + returns error if it is in full path format, but not creating a temp. + table. Currently InnoDB does not support symbolic link on Windows. 
*/
+
+	if (srv_file_per_table
+	    && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) {
+
+		if ((name[1] == ':')
+		    || (name[0] == '\\' && name[1] == '\\')) {
+			sql_print_error("Cannot create table %s\n", name);
+			DBUG_RETURN(HA_ERR_GENERIC);
+		}
+	}
+#endif
+
+	if (form->s->fields > 1000) {
+		/* The limit probably should be REC_MAX_N_FIELDS - 3 = 1020,
+		but we play safe here */
+
+		DBUG_RETURN(HA_ERR_TO_BIG_ROW);
+	}
+
+	/* Get the transaction associated with the current thd, or create one
+	if not yet created */
+
+	parent_trx = check_trx_exists(thd);
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(parent_trx);
+
+	trx = innobase_trx_allocate(thd);
+
+	if (lower_case_table_names) {
+		srv_lower_case_table_names = TRUE;
+	} else {
+		srv_lower_case_table_names = FALSE;
+	}
+
+	strcpy(name2, name);
+
+	normalize_table_name(norm_name, name2);
+
+	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
+	or lock waits can happen in it during a table create operation.
+	Drop table etc. do this latching in row0mysql.c. */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Create the table definition in InnoDB */
+
+	flags = 0;
+
+	/* Validate create options if innodb_strict_mode is set. */
+	if (!create_options_are_valid(thd, form, create_info)) {
+		error = ER_ILLEGAL_HA_CREATE_OPTION;
+		goto cleanup;
+	}
+
+	if (create_info->key_block_size
+	    || (create_info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+		/* Determine the page_zip.ssize corresponding to the
+		requested page size (key_block_size) in kilobytes. */
+
+		ulint	ssize, ksize;
+		ulint	key_block_size = create_info->key_block_size;
+
+		for (ssize = ksize = 1; ssize <= DICT_TF_ZSSIZE_MAX;
+		     ssize++, ksize <<= 1) {
+			if (key_block_size == ksize) {
+				flags = ssize << DICT_TF_ZSSIZE_SHIFT
+					| DICT_TF_COMPACT
+					| DICT_TF_FORMAT_ZIP
+					<< DICT_TF_FORMAT_SHIFT;
+				break;
+			}
+		}
+
+		if (!srv_file_per_table) {
+			push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				     ER_ILLEGAL_HA_CREATE_OPTION,
+				     "InnoDB: KEY_BLOCK_SIZE"
+				     " requires innodb_file_per_table.");
+			flags = 0;
+		}
+
+		if (file_format < DICT_TF_FORMAT_ZIP) {
+			push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				     ER_ILLEGAL_HA_CREATE_OPTION,
+				     "InnoDB: KEY_BLOCK_SIZE"
+				     " requires innodb_file_format >"
+				     " Antelope.");
+			flags = 0;
+		}
+
+		if (!flags) {
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_ILLEGAL_HA_CREATE_OPTION,
+					    "InnoDB: ignoring"
+					    " KEY_BLOCK_SIZE=%lu.",
+					    create_info->key_block_size);
+		}
+	}
+
+	if (create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) {
+		if (flags) {
+			/* KEY_BLOCK_SIZE was specified. */
+			if (form->s->row_type != ROW_TYPE_COMPRESSED) {
+				/* ROW_FORMAT other than COMPRESSED
+				ignores KEY_BLOCK_SIZE. It does not
+				make sense to reject conflicting
+				KEY_BLOCK_SIZE and ROW_FORMAT, because
+				such combinations can be obtained
+				with ALTER TABLE anyway. */
+				push_warning_printf(
+					thd,
+					MYSQL_ERROR::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: ignoring KEY_BLOCK_SIZE=%lu"
+					" unless ROW_FORMAT=COMPRESSED.",
+					create_info->key_block_size);
+				flags = 0;
+			}
+		} else {
+			/* No KEY_BLOCK_SIZE */
+			if (form->s->row_type == ROW_TYPE_COMPRESSED) {
+				/* ROW_FORMAT=COMPRESSED without
+				KEY_BLOCK_SIZE implies half the
+				maximum KEY_BLOCK_SIZE.
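
The ssize/ksize loop above encodes the requested KEY_BLOCK_SIZE into the compressed-page size exponent stored in the table flags: ssize n stands for 2^(n-1) KB, so the valid sizes 1, 2, 4, 8 and 16 KB map to ssize 1 through 5, and anything else leaves flags at 0. The same loop, extracted into a runnable sketch (DICT_TF_ZSSIZE_MAX_SKETCH stands in for the real constant):

	#include <cstdio>

	static const unsigned DICT_TF_ZSSIZE_MAX_SKETCH = 5;

	static unsigned ssize_for_kbs(unsigned key_block_size)
	{
		unsigned	ssize, ksize;

		for (ssize = ksize = 1; ssize <= DICT_TF_ZSSIZE_MAX_SKETCH;
		     ssize++, ksize <<= 1) {
			if (key_block_size == ksize) {
				return(ssize);
			}
		}

		return(0);	/* invalid KEY_BLOCK_SIZE: flags stay 0 */
	}

	int main()
	{
		printf("%u %u %u\n", ssize_for_kbs(1), ssize_for_kbs(8),
		       ssize_for_kbs(3));	/* 1 4 0 */
		return(0);
	}
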
*/ + flags = (DICT_TF_ZSSIZE_MAX - 1) + << DICT_TF_ZSSIZE_SHIFT + | DICT_TF_COMPACT + | DICT_TF_FORMAT_ZIP + << DICT_TF_FORMAT_SHIFT; +#if DICT_TF_ZSSIZE_MAX < 1 +# error "DICT_TF_ZSSIZE_MAX < 1" +#endif + } + } + + switch (form->s->row_type) { + const char* row_format_name; + case ROW_TYPE_REDUNDANT: + break; + case ROW_TYPE_COMPRESSED: + case ROW_TYPE_DYNAMIC: + row_format_name + = form->s->row_type == ROW_TYPE_COMPRESSED + ? "COMPRESSED" + : "DYNAMIC"; + + if (!srv_file_per_table) { + push_warning_printf( + thd, + MYSQL_ERROR::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s" + " requires innodb_file_per_table.", + row_format_name); + } else if (file_format < DICT_TF_FORMAT_ZIP) { + push_warning_printf( + thd, + MYSQL_ERROR::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s" + " requires innodb_file_format >" + " Antelope.", + row_format_name); + } else { + flags |= DICT_TF_COMPACT + | (DICT_TF_FORMAT_ZIP + << DICT_TF_FORMAT_SHIFT); + break; + } + + /* fall through */ + case ROW_TYPE_NOT_USED: + case ROW_TYPE_FIXED: + default: + push_warning(thd, + MYSQL_ERROR::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: assuming ROW_FORMAT=COMPACT."); + case ROW_TYPE_DEFAULT: + case ROW_TYPE_COMPACT: + flags = DICT_TF_COMPACT; + break; + } + } else if (!flags) { + /* No KEY_BLOCK_SIZE or ROW_FORMAT specified: + use ROW_FORMAT=COMPACT by default. */ + flags = DICT_TF_COMPACT; + } + + error = create_table_def(trx, form, norm_name, + create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL, + flags); + + if (error) { + goto cleanup; + } + + /* Look for a primary key */ + + primary_key_no= (form->s->primary_key != MAX_KEY ? + (int) form->s->primary_key : + -1); + + /* Our function row_get_mysql_key_number_for_index assumes + the primary key is always number 0, if it exists */ + + DBUG_ASSERT(primary_key_no == -1 || primary_key_no == 0); + + /* Create the keys */ + + if (form->s->keys == 0 || primary_key_no == -1) { + /* Create an index which is used as the clustered index; + order the rows by their row id which is internally generated + by InnoDB */ + + error = create_clustered_index_when_no_primary( + trx, flags, norm_name); + if (error) { + goto cleanup; + } + } + + if (primary_key_no != -1) { + /* In InnoDB the clustered index must always be created + first */ + if ((error = create_index(trx, form, flags, norm_name, + (uint) primary_key_no))) { + goto cleanup; + } + } + + for (i = 0; i < form->s->keys; i++) { + + if (i != (uint) primary_key_no) { + + if ((error = create_index(trx, form, flags, norm_name, + i))) { + goto cleanup; + } + } + } + + if (*trx->mysql_query_str) { + error = row_table_add_foreign_constraints(trx, + *trx->mysql_query_str, norm_name, + create_info->options & HA_LEX_CREATE_TMP_TABLE); + + error = convert_error_code_to_mysql(error, flags, NULL); + + if (error) { + goto cleanup; + } + } + + innobase_commit_low(trx); + + row_mysql_unlock_data_dictionary(trx); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + innobase_table = dict_table_get(norm_name, FALSE); + + DBUG_ASSERT(innobase_table != 0); + + if (innobase_table) { + /* We update the highest file format in the system table + space, if this table has higher file format setting. 
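+ For example, creating the first COMPRESSED or DYNAMIC table on a server whose system tablespace is still tagged Antelope upgrades the tag to Barracuda.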
*/ + + trx_sys_file_format_max_upgrade( + (const char**) &innobase_file_format_check, + dict_table_get_format(innobase_table)); + } + + /* Note: We can't call update_thd() as prebuilt will not be + setup at this stage and so we use thd. */ + + /* We need to copy the AUTOINC value from the old table if + this is an ALTER TABLE. */ + + if (((create_info->used_fields & HA_CREATE_USED_AUTO) + || thd_sql_command(thd) == SQLCOM_ALTER_TABLE) + && create_info->auto_increment_value != 0) { + + /* Query was ALTER TABLE...AUTO_INCREMENT = x; or + CREATE TABLE ...AUTO_INCREMENT = x; Find out a table + definition from the dictionary and get the current value + of the auto increment field. Set a new value to the + auto increment field if the value is greater than the + maximum value in the column. */ + + auto_inc_value = create_info->auto_increment_value; + + dict_table_autoinc_lock(innobase_table); + dict_table_autoinc_initialize(innobase_table, auto_inc_value); + dict_table_autoinc_unlock(innobase_table); + } + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + trx_free_for_mysql(trx); + + DBUG_RETURN(0); + +cleanup: + innobase_commit_low(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + DBUG_RETURN(error); +} + +/********************************************************************* +Discards or imports an InnoDB tablespace. */ +UNIV_INTERN +int +ha_innobase::discard_or_import_tablespace( +/*======================================*/ + /* out: 0 == success, -1 == error */ + my_bool discard) /* in: TRUE if discard, else import */ +{ + dict_table_t* dict_table; + trx_t* trx; + int err; + + DBUG_ENTER("ha_innobase::discard_or_import_tablespace"); + + ut_a(prebuilt->trx); + ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + dict_table = prebuilt->table; + trx = prebuilt->trx; + + if (discard) { + err = row_discard_tablespace_for_mysql(dict_table->name, trx); + } else { + err = row_import_tablespace_for_mysql(dict_table->name, trx); + } + + err = convert_error_code_to_mysql(err, dict_table->flags, NULL); + + DBUG_RETURN(err); +} + +/********************************************************************* +Deletes all rows of an InnoDB table. */ +UNIV_INTERN +int +ha_innobase::delete_all_rows(void) +/*==============================*/ + /* out: error number */ +{ + int error; + + DBUG_ENTER("ha_innobase::delete_all_rows"); + + /* Get the transaction associated with the current thd, or create one + if not yet created, and update prebuilt->trx */ + + update_thd(ha_thd()); + + if (thd_sql_command(user_thd) != SQLCOM_TRUNCATE) { + fallback: + /* We only handle TRUNCATE TABLE t as a special case. + DELETE FROM t will have to use ha_innobase::delete_row(), + because DELETE is transactional while TRUNCATE is not. */ + DBUG_RETURN(my_errno=HA_ERR_WRONG_COMMAND); + } + + /* Truncate the table in InnoDB */ + + error = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx); + if (error == DB_ERROR) { + /* Cannot truncate; resort to ha_innobase::delete_row() */ + goto fallback; + } + + error = convert_error_code_to_mysql(error, prebuilt->table->flags, + NULL); + + DBUG_RETURN(error); +} + +/********************************************************************* +Drops a table from an InnoDB database. Before calling this function, +MySQL calls innobase_commit to commit the transaction of the current user. +Then the current user cannot have locks set on the table. 
Drop table +operation inside InnoDB will remove all locks any user has on the table +inside InnoDB. */ +UNIV_INTERN +int +ha_innobase::delete_table( +/*======================*/ + /* out: error number */ + const char* name) /* in: table name */ +{ + ulint name_len; + int error; + trx_t* parent_trx; + trx_t* trx; + THD *thd = ha_thd(); + char norm_name[1000]; + + DBUG_ENTER("ha_innobase::delete_table"); + + /* Strangely, MySQL passes the table name without the '.frm' + extension, in contrast to ::create */ + normalize_table_name(norm_name, name); + + if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) { + DBUG_RETURN(HA_ERR_GENERIC); + } + + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + parent_trx = check_trx_exists(thd); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(parent_trx); + + trx = innobase_trx_allocate(thd); + + if (lower_case_table_names) { + srv_lower_case_table_names = TRUE; + } else { + srv_lower_case_table_names = FALSE; + } + + name_len = strlen(name); + + ut_a(name_len < 1000); + + /* Drop the table in InnoDB */ + + error = row_drop_table_for_mysql(norm_name, trx, + thd_sql_command(thd) + == SQLCOM_DROP_DB); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + innobase_commit_low(trx); + + trx_free_for_mysql(trx); + + error = convert_error_code_to_mysql(error, 0, NULL); + + DBUG_RETURN(error); +} + +/********************************************************************* +Removes all tables in the named database inside InnoDB. 
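+ For example (illustrative), for the path './test/' the loop below extracts 'test' and hands 'test/' to row_drop_database_for_mysql().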
*/ +static +void +innobase_drop_database( +/*===================*/ + /* out: error number */ + handlerton *hton, /* in: handlerton of Innodb */ + char* path) /* in: database path; inside InnoDB the name + of the last directory in the path is used as + the database name: for example, in 'mysql/data/test' + the database name is 'test' */ +{ + ulint len = 0; + trx_t* trx; + char* ptr; + int error; + char* namebuf; + THD* thd = current_thd; + + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* In the Windows plugin, thd = current_thd is always NULL */ + if (thd) { + trx_t* parent_trx = check_trx_exists(thd); + + /* In case MySQL calls this in the middle of a SELECT + query, release possible adaptive hash latch to avoid + deadlocks of threads */ + + trx_search_latch_release_if_reserved(parent_trx); + } + + ptr = strend(path) - 2; + + while (ptr >= path && *ptr != '\\' && *ptr != '/') { + ptr--; + len++; + } + + ptr++; + namebuf = (char*) my_malloc((uint) len + 2, MYF(0)); + + memcpy(namebuf, ptr, len); + namebuf[len] = '/'; + namebuf[len + 1] = '\0'; +#ifdef __WIN__ + innobase_casedn_str(namebuf); +#endif +#if defined __WIN__ && !defined MYSQL_SERVER + /* In the Windows plugin, thd = current_thd is always NULL */ + trx = trx_allocate_for_mysql(); + trx->mysql_thd = NULL; + trx->mysql_query_str = NULL; +#else + trx = innobase_trx_allocate(thd); +#endif + error = row_drop_database_for_mysql(namebuf, trx); + my_free(namebuf, MYF(0)); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + innobase_commit_low(trx); + trx_free_for_mysql(trx); +} +/************************************************************************* +Renames an InnoDB table. 
*/ +static +int +innobase_rename_table( +/*==================*/ + /* out: 0 or error code */ + trx_t* trx, /* in: transaction */ + const char* from, /* in: old name of the table */ + const char* to, /* in: new name of the table */ + ibool lock_and_commit) + /* in: TRUE=lock data dictionary and commit */ +{ + int error; + char* norm_to; + char* norm_from; + + if (lower_case_table_names) { + srv_lower_case_table_names = TRUE; + } else { + srv_lower_case_table_names = FALSE; + } + + // Magic number 64 arbitrary + norm_to = (char*) my_malloc(strlen(to) + 64, MYF(0)); + norm_from = (char*) my_malloc(strlen(from) + 64, MYF(0)); + + normalize_table_name(norm_to, to); + normalize_table_name(norm_from, from); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + if (lock_and_commit) { + row_mysql_lock_data_dictionary(trx); + } + + error = row_rename_table_for_mysql( + norm_from, norm_to, trx, lock_and_commit); + + if (error != DB_SUCCESS) { + FILE* ef = dict_foreign_err_file; + + fputs("InnoDB: Renaming table ", ef); + ut_print_name(ef, trx, TRUE, norm_from); + fputs(" to ", ef); + ut_print_name(ef, trx, TRUE, norm_to); + fputs(" failed!\n", ef); + } + + if (lock_and_commit) { + row_mysql_unlock_data_dictionary(trx); + + /* Flush the log to reduce probability that the .frm + files and the InnoDB data dictionary get out-of-sync + if the user runs with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + } + + my_free(norm_to, MYF(0)); + my_free(norm_from, MYF(0)); + + return error; +} +/************************************************************************* +Renames an InnoDB table. */ +UNIV_INTERN +int +ha_innobase::rename_table( +/*======================*/ + /* out: 0 or error code */ + const char* from, /* in: old name of the table */ + const char* to) /* in: new name of the table */ +{ + trx_t* trx; + int error; + trx_t* parent_trx; + THD* thd = ha_thd(); + + DBUG_ENTER("ha_innobase::rename_table"); + + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + parent_trx = check_trx_exists(thd); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(parent_trx); + + trx = innobase_trx_allocate(thd); + + error = innobase_rename_table(trx, from, to, TRUE); + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + innobase_commit_low(trx); + trx_free_for_mysql(trx); + + error = convert_error_code_to_mysql(error, 0, NULL); + + DBUG_RETURN(error); +} + +/************************************************************************* +Estimates the number of index records in a range. 
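+ For example (illustrative), WHERE a BETWEEN 10 AND 20 arrives as a min_key/max_key pair, while WHERE a < 10 arrives with min_key == NULL; a NULL bound means the range is open at that end.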
*/ +UNIV_INTERN +ha_rows +ha_innobase::records_in_range( +/*==========================*/ + /* out: estimated number of + rows */ + uint keynr, /* in: index number */ + key_range *min_key, /* in: start key value of the + range, may also be 0 */ + key_range *max_key) /* in: range end key val, may + also be 0 */ +{ + KEY* key; + dict_index_t* index; + uchar* key_val_buff2 = (uchar*) my_malloc( + table->s->reclength + + table->s->max_key_length + 100, + MYF(MY_FAE)); + ulint buff2_len = table->s->reclength + + table->s->max_key_length + 100; + dtuple_t* range_start; + dtuple_t* range_end; + ib_int64_t n_rows; + ulint mode1; + ulint mode2; + mem_heap_t* heap; + + DBUG_ENTER("records_in_range"); + + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + prebuilt->trx->op_info = (char*)"estimating records in index range"; + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + active_index = keynr; + + key = table->key_info + active_index; + + index = dict_table_get_index_on_name(prebuilt->table, key->name); + + /* MySQL knows about this index and so we must be able to find it.*/ + ut_a(index); + + heap = mem_heap_create(2 * (key->key_parts * sizeof(dfield_t) + + sizeof(dtuple_t))); + + range_start = dtuple_create(heap, key->key_parts); + dict_index_copy_types(range_start, index, key->key_parts); + + range_end = dtuple_create(heap, key->key_parts); + dict_index_copy_types(range_end, index, key->key_parts); + + row_sel_convert_mysql_key_to_innobase( + range_start, (byte*) key_val_buff, + (ulint)upd_and_key_val_buff_len, + index, + (byte*) (min_key ? min_key->key : + (const uchar*) 0), + (ulint) (min_key ? min_key->length : 0), + prebuilt->trx); + + row_sel_convert_mysql_key_to_innobase( + range_end, (byte*) key_val_buff2, + buff2_len, index, + (byte*) (max_key ? max_key->key : + (const uchar*) 0), + (ulint) (max_key ? max_key->length : 0), + prebuilt->trx); + + mode1 = convert_search_mode_to_innobase(min_key ? min_key->flag : + HA_READ_KEY_EXACT); + mode2 = convert_search_mode_to_innobase(max_key ? max_key->flag : + HA_READ_KEY_EXACT); + + if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) { + + n_rows = btr_estimate_n_rows_in_range(index, range_start, + mode1, range_end, + mode2); + } else { + + n_rows = 0; + } + + mem_heap_free(heap); + + my_free(key_val_buff2, MYF(0)); + + prebuilt->trx->op_info = (char*)""; + + /* The MySQL optimizer seems to believe an estimate of 0 rows is + always accurate and may return the result 'Empty set' based on that. + The accuracy is not guaranteed, and even if it were, for a locking + read we should anyway perform the search to set the next-key lock. + Add 1 to the value to make sure MySQL does not make the assumption! */ + + if (n_rows == 0) { + n_rows = 1; + } + + DBUG_RETURN((ha_rows) n_rows); +} + +/************************************************************************* +Gives an UPPER BOUND to the number of rows in a table. This is used in +filesort.cc. */ +UNIV_INTERN +ha_rows +ha_innobase::estimate_rows_upper_bound(void) +/*======================================*/ + /* out: upper bound of rows */ +{ + dict_index_t* index; + ulonglong estimate; + ulonglong local_data_file_length; + + DBUG_ENTER("estimate_rows_upper_bound"); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. 
*/ + + update_thd(ha_thd()); + + prebuilt->trx->op_info = (char*) + "calculating upper bound for table rows"; + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + index = dict_table_get_first_index(prebuilt->table); + + ut_a(index->stat_n_leaf_pages > 0); + + local_data_file_length = + ((ulonglong) index->stat_n_leaf_pages) * UNIV_PAGE_SIZE; + + + /* Calculate a minimum length for a clustered index record and from + that an upper bound for the number of rows. Since we only calculate + new statistics in row0mysql.c when a table has grown by a threshold + factor, we must add a safety factor 2 in front of the formula below. */ + + estimate = 2 * local_data_file_length / + dict_index_calc_min_rec_len(index); + + prebuilt->trx->op_info = (char*)""; + + DBUG_RETURN((ha_rows) estimate); +} + +/************************************************************************* +How many seeks it will take to read through the table. This is to be +comparable to the number returned by records_in_range so that we can +decide if we should scan the table or use keys. */ +UNIV_INTERN +double +ha_innobase::scan_time() +/*====================*/ + /* out: estimated time measured in disk seeks */ +{ + /* Since MySQL seems to favor table scans too much over index + searches, we pretend that a sequential read takes the same time + as a random disk read, that is, we do not divide the following + by 10, which would be physically realistic. */ + + return((double) (prebuilt->table->stat_clustered_index_size)); +} + +/********************************************************************** +Calculate the time it takes to read a set of ranges through an index +This enables us to optimise reads for clustered indexes. */ +UNIV_INTERN +double +ha_innobase::read_time( +/*===================*/ + /* out: estimated time measured in disk seeks */ + uint index, /* in: key number */ + uint ranges, /* in: how many ranges */ + ha_rows rows) /* in: estimated number of rows in the ranges */ +{ + ha_rows total_rows; + double time_for_scan; + + if (index != table->s->primary_key) { + /* Not clustered */ + return(handler::read_time(index, ranges, rows)); + } + + if (rows <= 2) { + + return((double) rows); + } + + /* Assume that the read time is proportional to the scan time for all + rows + at most one seek per range. */ + + time_for_scan = scan_time(); + + if ((total_rows = estimate_rows_upper_bound()) < rows) { + + return(time_for_scan); + } + + return(ranges + (double) rows / (double) total_rows * time_for_scan); +} + +/************************************************************************* +Returns statistics information of the table to the MySQL interpreter, +in various fields of the handle object. */ +UNIV_INTERN +int +ha_innobase::info( +/*==============*/ + uint flag) /* in: what information MySQL requests */ +{ + dict_table_t* ib_table; + dict_index_t* index; + ha_rows rec_per_key; + ib_int64_t n_rows; + ulong j; + ulong i; + char path[FN_REFLEN]; + os_file_stat_t stat_info; + + DBUG_ENTER("info"); + + /* If we are forcing recovery at a high level, we will suppress + statistics calculation on tables, because that may crash the + server if an index is badly corrupted. 
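+ ("High level" means innodb_force_recovery >= 4, i.e. SRV_FORCE_NO_IBUF_MERGE, as checked below.)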
*/ + + if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + + /* We return success (0) instead of HA_ERR_CRASHED, + because we want MySQL to process this query and not + stop, like it would do if it received the error code + HA_ERR_CRASHED. */ + + DBUG_RETURN(0); + } + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. */ + + update_thd(ha_thd()); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + prebuilt->trx->op_info = (char*)"returning various info to MySQL"; + + trx_search_latch_release_if_reserved(prebuilt->trx); + + ib_table = prebuilt->table; + + if (flag & HA_STATUS_TIME) { + if (innobase_stats_on_metadata) { + /* In sql_show we call with this flag: update + then statistics so that they are up-to-date */ + + prebuilt->trx->op_info = "updating table statistics"; + + dict_update_statistics(ib_table); + + prebuilt->trx->op_info = "returning various info to MySQL"; + } + + my_snprintf(path, sizeof(path), "%s/%s%s", + mysql_data_home, ib_table->name, reg_ext); + + unpack_filename(path,path); + + /* Note that we do not know the access time of the table, + nor the CHECK TABLE time, nor the UPDATE or INSERT time. */ + + if (os_file_get_status(path,&stat_info)) { + stats.create_time = stat_info.ctime; + } + } + + if (flag & HA_STATUS_VARIABLE) { + n_rows = ib_table->stat_n_rows; + + /* Because we do not protect stat_n_rows by any mutex in a + delete, it is theoretically possible that the value can be + smaller than zero! TODO: fix this race. + + The MySQL optimizer seems to assume in a left join that n_rows + is an accurate estimate if it is zero. Of course, it is not, + since we do not have any locks on the rows yet at this phase. + Since SHOW TABLE STATUS seems to call this function with the + HA_STATUS_TIME flag set, while the left join optimizer does not + set that flag, we add one to a zero value if the flag is not + set. That way SHOW TABLE STATUS will show the best estimate, + while the optimizer never sees the table empty. */ + + if (n_rows < 0) { + n_rows = 0; + } + + if (n_rows == 0 && !(flag & HA_STATUS_TIME)) { + n_rows++; + } + + /* Fix bug#40386: Not flushing query cache after truncate. + n_rows can not be 0 unless the table is empty, set to 1 + instead. The original problem of bug#29507 is actually + fixed in the server code. */ + if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) { + + n_rows = 1; + + /* We need to reset the prebuilt value too, otherwise + checks for values greater than the last value written + to the table will fail and the autoinc counter will + not be updated. This will force write_row() into + attempting an update of the table's AUTOINC counter. */ + + prebuilt->autoinc_last_value = 0; + } + + stats.records = (ha_rows)n_rows; + stats.deleted = 0; + stats.data_file_length = ((ulonglong) + ib_table->stat_clustered_index_size) + * UNIV_PAGE_SIZE; + stats.index_file_length = ((ulonglong) + ib_table->stat_sum_of_other_index_sizes) + * UNIV_PAGE_SIZE; + + /* Since fsp_get_available_space_in_free_extents() is + acquiring latches inside InnoDB, we do not call it if we + are asked by MySQL to avoid locking. Another reason to + avoid the call is that it uses quite a lot of CPU. + See Bug#38185. + We do not update delete_length if no locking is requested + so the "old" value can remain. delete_length is initialized + to 0 in the ha_statistics' constructor. 
*/ + if (!(flag & HA_STATUS_NO_LOCK)) { + + /* lock the data dictionary to avoid races with + ibd_file_missing and tablespace_discarded */ + row_mysql_lock_data_dictionary(prebuilt->trx); + + /* ib_table->space must be an existent tablespace */ + if (!ib_table->ibd_file_missing + && !ib_table->tablespace_discarded) { + + stats.delete_length = + fsp_get_available_space_in_free_extents( + ib_table->space) * 1024; + } else { + + THD* thd; + + thd = ha_thd(); + + push_warning_printf( + thd, + MYSQL_ERROR::WARN_LEVEL_WARN, + ER_CANT_GET_STAT, + "InnoDB: Trying to get the free " + "space for table %s but its " + "tablespace has been discarded or " + "the .ibd file is missing. Setting " + "the free space to zero.", + ib_table->name); + + stats.delete_length = 0; + } + + row_mysql_unlock_data_dictionary(prebuilt->trx); + } + + stats.check_time = 0; + + if (stats.records == 0) { + stats.mean_rec_length = 0; + } else { + stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records); + } + } + + if (flag & HA_STATUS_CONST) { + index = dict_table_get_first_index(ib_table); + + if (prebuilt->clust_index_was_generated) { + index = dict_table_get_next_index(index); + } + + for (i = 0; i < table->s->keys; i++) { + if (index == NULL) { + sql_print_error("Table %s contains fewer " + "indexes inside InnoDB than " + "are defined in the MySQL " + ".frm file. Have you mixed up " + ".frm files from different " + "installations? See " +"http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html\n", + + ib_table->name); + break; + } + + for (j = 0; j < table->key_info[i].key_parts; j++) { + + if (j + 1 > index->n_uniq) { + sql_print_error( +"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking " +"statistics for %lu columns. Have you mixed up .frm files from different " +"installations? " +"See http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html\n", + index->name, + ib_table->name, + (unsigned long) + index->n_uniq, j + 1); + break; + } + + if (index->stat_n_diff_key_vals[j + 1] == 0) { + + rec_per_key = stats.records; + } else { + rec_per_key = (ha_rows)(stats.records / + index->stat_n_diff_key_vals[j + 1]); + } + + /* Since MySQL seems to favor table scans + too much over index searches, we pretend + index selectivity is 2 times better than + our estimate: */ + + rec_per_key = rec_per_key / 2; + + if (rec_per_key == 0) { + rec_per_key = 1; + } + + table->key_info[i].rec_per_key[j]= + rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 : + (ulong) rec_per_key; + } + + index = dict_table_get_next_index(index); + } + } + + if (flag & HA_STATUS_ERRKEY) { + const dict_index_t* err_index; + + ut_a(prebuilt->trx); + ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); + + err_index = trx_get_error_info(prebuilt->trx); + + if (err_index) { + errkey = (unsigned int) + row_get_mysql_key_number_for_index(err_index); + } else { + errkey = (unsigned int) prebuilt->trx->error_key_num; + } + } + + if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) { + stats.auto_increment_value = innobase_peek_autoinc(); + } + + prebuilt->trx->op_info = (char*)""; + + DBUG_RETURN(0); +} + +/************************************************************************** +Updates index cardinalities of the table, based on 8 random dives into +each index tree. This does NOT calculate exact statistics on the table. 
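+ For example (illustrative), if the dives estimate 100 distinct values for the first column of an index in a table of 1000 rows, ::info() reports rec_per_key = 1000 / 100 / 2 = 5, the halving being the deliberate bias towards index reads noted there.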
*/ +UNIV_INTERN +int +ha_innobase::analyze( +/*=================*/ + /* out: returns always 0 (success) */ + THD* thd, /* in: connection thread handle */ + HA_CHECK_OPT* check_opt) /* in: currently ignored */ +{ + /* Simply call ::info() with all the flags */ + info(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE); + + return(0); +} + +/************************************************************************** +This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds +the table in MySQL. */ +UNIV_INTERN +int +ha_innobase::optimize( +/*==================*/ + THD* thd, /* in: connection thread handle */ + HA_CHECK_OPT* check_opt) /* in: currently ignored */ +{ + return(HA_ADMIN_TRY_ALTER); +} + +/*********************************************************************** +Tries to check that an InnoDB table is not corrupted. If corruption is +noticed, prints to stderr information about it. In case of corruption +may also assert a failure and crash the server. */ +UNIV_INTERN +int +ha_innobase::check( +/*===============*/ + /* out: HA_ADMIN_CORRUPT or + HA_ADMIN_OK */ + THD* thd, /* in: user thread handle */ + HA_CHECK_OPT* check_opt) /* in: check options, currently + ignored */ +{ + ulint ret; + + DBUG_ASSERT(thd == ha_thd()); + ut_a(prebuilt->trx); + ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(prebuilt->trx == thd_to_trx(thd)); + + if (prebuilt->mysql_template == NULL) { + /* Build the template; we will use a dummy template + in index scans done in checking */ + + build_template(prebuilt, NULL, table, ROW_MYSQL_WHOLE_ROW); + } + + ret = row_check_table_for_mysql(prebuilt); + + if (ret == DB_SUCCESS) { + return(HA_ADMIN_OK); + } + + return(HA_ADMIN_CORRUPT); +} + +/***************************************************************** +Adds information about free space in the InnoDB tablespace to a table comment +which is printed out when a user calls SHOW TABLE STATUS. Adds also info on +foreign keys. */ +UNIV_INTERN +char* +ha_innobase::update_table_comment( +/*==============================*/ + /* out: table comment + InnoDB free space + + info on foreign keys */ + const char* comment)/* in: table comment defined by user */ +{ + uint length = (uint) strlen(comment); + char* str; + long flen; + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. 
*/ + + if (length > 64000 - 3) { + return((char*)comment); /* string too long */ + } + + update_thd(ha_thd()); + + prebuilt->trx->op_info = (char*)"returning table comment"; + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + str = NULL; + + /* output the data to a temporary file */ + + mutex_enter(&srv_dict_tmpfile_mutex); + rewind(srv_dict_tmpfile); + + fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB", + fsp_get_available_space_in_free_extents( + prebuilt->table->space)); + + dict_print_info_on_foreign_keys(FALSE, srv_dict_tmpfile, + prebuilt->trx, prebuilt->table); + flen = ftell(srv_dict_tmpfile); + if (flen < 0) { + flen = 0; + } else if (length + flen + 3 > 64000) { + flen = 64000 - 3 - length; + } + + /* allocate buffer for the full string, and + read the contents of the temporary file */ + + str = (char*) my_malloc(length + flen + 3, MYF(0)); + + if (str) { + char* pos = str + length; + if (length) { + memcpy(str, comment, length); + *pos++ = ';'; + *pos++ = ' '; + } + rewind(srv_dict_tmpfile); + flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile); + pos[flen] = 0; + } + + mutex_exit(&srv_dict_tmpfile_mutex); + + prebuilt->trx->op_info = (char*)""; + + return(str ? str : (char*) comment); +} + +/*********************************************************************** +Gets the foreign key create info for a table stored in InnoDB. */ +UNIV_INTERN +char* +ha_innobase::get_foreign_key_create_info(void) +/*==========================================*/ + /* out, own: character string in the form which + can be inserted to the CREATE TABLE statement, + MUST be freed with ::free_foreign_key_create_info */ +{ + char* str = 0; + long flen; + + ut_a(prebuilt != NULL); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. 
*/ + + update_thd(ha_thd()); + + prebuilt->trx->op_info = (char*)"getting info on foreign keys"; + + /* In case MySQL calls this in the middle of a SELECT query, + release possible adaptive hash latch to avoid + deadlocks of threads */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + mutex_enter(&srv_dict_tmpfile_mutex); + rewind(srv_dict_tmpfile); + + /* output the data to a temporary file */ + dict_print_info_on_foreign_keys(TRUE, srv_dict_tmpfile, + prebuilt->trx, prebuilt->table); + prebuilt->trx->op_info = (char*)""; + + flen = ftell(srv_dict_tmpfile); + if (flen < 0) { + flen = 0; + } else if (flen > 64000 - 1) { + flen = 64000 - 1; + } + + /* allocate buffer for the string, and + read the contents of the temporary file */ + + str = (char*) my_malloc(flen + 1, MYF(0)); + + if (str) { + rewind(srv_dict_tmpfile); + flen = (uint) fread(str, 1, flen, srv_dict_tmpfile); + str[flen] = 0; + } + + mutex_exit(&srv_dict_tmpfile_mutex); + + return(str); +} + + +UNIV_INTERN +int +ha_innobase::get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list) +{ + dict_foreign_t* foreign; + + DBUG_ENTER("get_foreign_key_list"); + ut_a(prebuilt != NULL); + update_thd(ha_thd()); + prebuilt->trx->op_info = (char*)"getting list of foreign keys"; + trx_search_latch_release_if_reserved(prebuilt->trx); + mutex_enter(&(dict_sys->mutex)); + foreign = UT_LIST_GET_FIRST(prebuilt->table->foreign_list); + + while (foreign != NULL) { + uint i; + FOREIGN_KEY_INFO f_key_info; + LEX_STRING *name= 0; + uint ulen; + char uname[NAME_LEN+1]; /* Unencoded name */ + char db_name[NAME_LEN+1]; + const char *tmp_buff; + + tmp_buff= foreign->id; + i= 0; + while (tmp_buff[i] != '/') + i++; + tmp_buff+= i + 1; + f_key_info.forein_id = thd_make_lex_string(thd, 0, + tmp_buff, (uint) strlen(tmp_buff), 1); + tmp_buff= foreign->referenced_table_name; + + /* Database name */ + i= 0; + while (tmp_buff[i] != '/') + { + db_name[i]= tmp_buff[i]; + i++; + } + db_name[i]= 0; + ulen= filename_to_tablename(db_name, uname, sizeof(uname)); + f_key_info.referenced_db = thd_make_lex_string(thd, 0, + uname, ulen, 1); + + /* Table name */ + tmp_buff+= i + 1; + ulen= filename_to_tablename(tmp_buff, uname, sizeof(uname)); + f_key_info.referenced_table = thd_make_lex_string(thd, 0, + uname, ulen, 1); + + for (i= 0;;) { + tmp_buff= foreign->foreign_col_names[i]; + name = thd_make_lex_string(thd, name, + tmp_buff, (uint) strlen(tmp_buff), 1); + f_key_info.foreign_fields.push_back(name); + tmp_buff= foreign->referenced_col_names[i]; + name = thd_make_lex_string(thd, name, + tmp_buff, (uint) strlen(tmp_buff), 1); + f_key_info.referenced_fields.push_back(name); + if (++i >= foreign->n_fields) + break; + } + + ulong length; + if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) + { + length=7; + tmp_buff= "CASCADE"; + } + else if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) + { + length=8; + tmp_buff= "SET NULL"; + } + else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) + { + length=9; + tmp_buff= "NO ACTION"; + } + else + { + length=8; + tmp_buff= "RESTRICT"; + } + f_key_info.delete_method = thd_make_lex_string( + thd, f_key_info.delete_method, tmp_buff, length, 1); + + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) + { + length=7; + tmp_buff= "CASCADE"; + } + else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) + { + length=8; + tmp_buff= "SET NULL"; + } + else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) + { + length=9; + tmp_buff= "NO ACTION"; + } + else + { + length=8; + tmp_buff= "RESTRICT"; + } + f_key_info.update_method
= thd_make_lex_string( + thd, f_key_info.update_method, tmp_buff, length, 1); + if (foreign->referenced_index && + foreign->referenced_index->name) + { + f_key_info.referenced_key_name = thd_make_lex_string( + thd, f_key_info.referenced_key_name, + foreign->referenced_index->name, + strlen(foreign->referenced_index->name), 1); + } + else + f_key_info.referenced_key_name= 0; + + FOREIGN_KEY_INFO *pf_key_info = (FOREIGN_KEY_INFO *) + thd_memdup(thd, &f_key_info, sizeof(FOREIGN_KEY_INFO)); + f_key_list->push_back(pf_key_info); + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + mutex_exit(&(dict_sys->mutex)); + prebuilt->trx->op_info = (char*)""; + + DBUG_RETURN(0); +} + +/********************************************************************* +Checks if ALTER TABLE may change the storage engine of the table. +Changing storage engines is not allowed for tables for which there +are foreign key constraints (parent or child tables). */ +UNIV_INTERN +bool +ha_innobase::can_switch_engines(void) +/*=================================*/ +{ + bool can_switch; + + DBUG_ENTER("ha_innobase::can_switch_engines"); + + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); + + prebuilt->trx->op_info = + "determining if there are foreign key constraints"; + row_mysql_lock_data_dictionary(prebuilt->trx); + + can_switch = !UT_LIST_GET_FIRST(prebuilt->table->referenced_list) + && !UT_LIST_GET_FIRST(prebuilt->table->foreign_list); + + row_mysql_unlock_data_dictionary(prebuilt->trx); + prebuilt->trx->op_info = ""; + + DBUG_RETURN(can_switch); +} + +/*********************************************************************** +Checks if a table is referenced by a foreign key. The MySQL manual states that +a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a +delete is then allowed internally to resolve a duplicate key conflict in +REPLACE, not an update. */ +UNIV_INTERN +uint +ha_innobase::referenced_by_foreign_key(void) +/*========================================*/ + /* out: > 0 if referenced by a FOREIGN KEY */ +{ + if (dict_table_is_referenced_by_foreign_key(prebuilt->table)) { + + return(1); + } + + return(0); +} + +/*********************************************************************** +Frees the foreign key create info for a table stored in InnoDB, if it is +non-NULL. */ +UNIV_INTERN +void +ha_innobase::free_foreign_key_create_info( +/*======================================*/ + char* str) /* in, own: create info string to free */ +{ + if (str) { + my_free(str, MYF(0)); + } +} + +/*********************************************************************** +Tells something additional to the handler about how to do things. */ +UNIV_INTERN +int +ha_innobase::extra( +/*===============*/ + /* out: 0 or error number */ + enum ha_extra_function operation) + /* in: HA_EXTRA_FLUSH or some other flag */ +{ + /* Warning: since it is not sure that MySQL calls external_lock + before calling this function, the trx field in prebuilt can be + obsolete! 
*/ + + switch (operation) { + case HA_EXTRA_FLUSH: + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + break; + case HA_EXTRA_RESET_STATE: + reset_template(prebuilt); + break; + case HA_EXTRA_NO_KEYREAD: + prebuilt->read_just_key = 0; + break; + case HA_EXTRA_KEYREAD: + prebuilt->read_just_key = 1; + break; + case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + prebuilt->keep_other_fields_on_keyread = 1; + break; + + /* IMPORTANT: prebuilt->trx can be obsolete in + this method, because it is not sure that MySQL + calls external_lock before this method with the + parameters below. We must not invoke update_thd() + either, because the calling threads may change. + CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */ + case HA_EXTRA_IGNORE_DUP_KEY: + thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE; + break; + case HA_EXTRA_WRITE_CAN_REPLACE: + thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE; + break; + case HA_EXTRA_WRITE_CANNOT_REPLACE: + thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE; + break; + case HA_EXTRA_NO_IGNORE_DUP_KEY: + thd_to_trx(ha_thd())->duplicates &= + ~(TRX_DUP_IGNORE | TRX_DUP_REPLACE); + break; + default:/* Do nothing */ + ; + } + + return(0); +} + +UNIV_INTERN +int +ha_innobase::reset() +{ + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + + reset_template(prebuilt); + + /* TODO: This should really be reset in reset_template() but for now + it's safer to do it explicitly here. */ + + /* This is a statement level counter. */ + prebuilt->autoinc_last_value = 0; + + return(0); +} + +/********************************************************************** +MySQL calls this function at the start of each SQL statement inside LOCK +TABLES. Inside LOCK TABLES the ::external_lock method does not work to +mark SQL statement borders. Note also a special case: if a temporary table +is created inside LOCK TABLES, MySQL has not called external_lock() at all +on that table. +MySQL-5.0 also calls this before each statement in an execution of a stored +procedure. To make the execution more deterministic for binlogging, MySQL-5.0 +locks all tables involved in a stored procedure with full explicit table +locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the +procedure. */ +UNIV_INTERN +int +ha_innobase::start_stmt( +/*====================*/ + /* out: 0 or error code */ + THD* thd, /* in: handle to the user thread */ + thr_lock_type lock_type) +{ + trx_t* trx; + + update_thd(thd); + + trx = prebuilt->trx; + + /* Here we release the search latch and the InnoDB thread FIFO ticket + if they were reserved. They should have been released already at the + end of the previous statement, but because inside LOCK TABLES the + lock count method does not work to mark the end of a SELECT statement, + that may not be the case. We MUST release the search latch before an + INSERT, for example. */ + + innobase_release_stat_resources(trx); + + /* Reset the AUTOINC statement level counter for multi-row INSERTs. 
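+ (Roughly speaking, the counter records how many AUTOINC values a multi-row INSERT still needs, so that a whole range can be reserved at once; a stale value left over from the previous statement would mis-size that reservation.)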
*/ + trx->n_autoinc_rows = 0; + + prebuilt->sql_stat_start = TRUE; + prebuilt->hint_need_to_fetch_extra_cols = 0; + reset_template(prebuilt); + + if (!prebuilt->mysql_has_locked) { + /* This handle is for a temporary table created inside + this same LOCK TABLES; since MySQL does NOT call external_lock + in this case, we must use x-row locks inside InnoDB to be + prepared for an update of a row */ + + prebuilt->select_lock_type = LOCK_X; + } else { + if (trx->isolation_level != TRX_ISO_SERIALIZABLE + && thd_sql_command(thd) == SQLCOM_SELECT + && lock_type == TL_READ) { + + /* For other than temporary tables, we obtain + no lock for consistent read (plain SELECT). */ + + prebuilt->select_lock_type = LOCK_NONE; + } else { + /* Not a consistent read: restore the + select_lock_type value. The value of + stored_select_lock_type was decided in: + 1) ::store_lock(), + 2) ::external_lock(), + 3) ::init_table_handle_for_HANDLER(), and + 4) ::transactional_table_lock(). */ + + prebuilt->select_lock_type = + prebuilt->stored_select_lock_type; + } + } + + trx->detailed_error[0] = '\0'; + + /* Set the MySQL flag to mark that there is an active transaction */ + if (trx->active_trans == 0) { + + innobase_register_trx_and_stmt(ht, thd); + trx->active_trans = 1; + } else { + innobase_register_stmt(ht, thd); + } + + return(0); +} + +/********************************************************************** +Maps a MySQL trx isolation level code to the InnoDB isolation level code */ +static inline +ulint +innobase_map_isolation_level( +/*=========================*/ + /* out: InnoDB isolation level */ + enum_tx_isolation iso) /* in: MySQL isolation level code */ +{ + switch(iso) { + case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ); + case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED); + case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE); + case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED); + default: ut_a(0); return(0); + } +} + +/********************************************************************** +As MySQL will execute an external lock for every new table it uses when it +starts to process an SQL statement (an exception is when MySQL calls +start_stmt for the handle) we can use this function to store the pointer to +the THD in the handle. We will also use this function to communicate +to InnoDB that a new SQL statement has started and that we must store a +savepoint to our transaction handle, so that we are able to roll back +the SQL statement in case of an error. */ +UNIV_INTERN +int +ha_innobase::external_lock( +/*=======================*/ + /* out: 0 */ + THD* thd, /* in: handle to the user thread */ + int lock_type) /* in: lock type */ +{ + trx_t* trx; + + DBUG_ENTER("ha_innobase::external_lock"); + DBUG_PRINT("enter",("lock_type: %d", lock_type)); + + update_thd(thd); + + /* Statement based binlogging does not work in isolation level + READ UNCOMMITTED and READ COMMITTED since the necessary + locks cannot be taken. In this case, we print an + informative error message and return with an error. 
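+ An illustrative session that would trip this check:
+ SET SESSION tx_isolation = 'READ-COMMITTED';
+ SET SESSION binlog_format = 'STATEMENT';
+ UPDATE t SET a = a + 1; -- fails with ER_BINLOG_LOGGING_IMPOSSIBLE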
*/ + if (lock_type == F_WRLCK) + { + ulong const binlog_format= thd_binlog_format(thd); + ulong const tx_isolation = thd_tx_isolation(ha_thd()); + if (tx_isolation <= ISO_READ_COMMITTED && + binlog_format == BINLOG_FORMAT_STMT) + { + char buf[256]; + my_snprintf(buf, sizeof(buf), + "Transaction level '%s' in" + " InnoDB is not safe for binlog mode '%s'", + tx_isolation_names[tx_isolation], + binlog_format_names[binlog_format]); + my_error(ER_BINLOG_LOGGING_IMPOSSIBLE, MYF(0), buf); + DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE); + } + } + + + trx = prebuilt->trx; + + prebuilt->sql_stat_start = TRUE; + prebuilt->hint_need_to_fetch_extra_cols = 0; + + reset_template(prebuilt); + + if (lock_type == F_WRLCK) { + + /* If this is a SELECT, then it is in UPDATE TABLE ... + or SELECT ... FOR UPDATE */ + prebuilt->select_lock_type = LOCK_X; + prebuilt->stored_select_lock_type = LOCK_X; + } + + if (lock_type != F_UNLCK) { + /* MySQL is setting a new table lock */ + + trx->detailed_error[0] = '\0'; + + /* Set the MySQL flag to mark that there is an active + transaction */ + if (trx->active_trans == 0) { + + innobase_register_trx_and_stmt(ht, thd); + trx->active_trans = 1; + } else if (trx->n_mysql_tables_in_use == 0) { + innobase_register_stmt(ht, thd); + } + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE + && prebuilt->select_lock_type == LOCK_NONE + && thd_test_options(thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + /* To get serializable execution, we let InnoDB + conceptually add 'LOCK IN SHARE MODE' to all SELECTs + which otherwise would have been consistent reads. An + exception is consistent reads in the AUTOCOMMIT=1 mode: + we know that they are read-only transactions, and they + can be serialized also if performed as consistent + reads. */ + + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } + + /* Starting from 4.1.9, no InnoDB table lock is taken in LOCK + TABLES if AUTOCOMMIT=1. It does not make much sense to acquire + an InnoDB table lock if it is released immediately at the end + of LOCK TABLES, and InnoDB's table locks in that case cause + VERY easily deadlocks. + + We do not set InnoDB table locks if user has not explicitly + requested a table lock. Note that thd_in_lock_tables(thd) + can hold in some cases, e.g., at the start of a stored + procedure call (SQLCOM_CALL). */ + + if (prebuilt->select_lock_type != LOCK_NONE) { + + if (thd_sql_command(thd) == SQLCOM_LOCK_TABLES + && THDVAR(thd, table_locks) + && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT) + && thd_in_lock_tables(thd)) { + + ulint error = row_lock_table_for_mysql( + prebuilt, NULL, 0); + + if (error != DB_SUCCESS) { + error = convert_error_code_to_mysql( + (int) error, 0, thd); + DBUG_RETURN((int) error); + } + } + + trx->mysql_n_tables_locked++; + } + + trx->n_mysql_tables_in_use++; + prebuilt->mysql_has_locked = TRUE; + + DBUG_RETURN(0); + } + + /* MySQL is releasing a table lock */ + + trx->n_mysql_tables_in_use--; + prebuilt->mysql_has_locked = FALSE; + + /* Release a possible FIFO ticket and search latch. Since we + may reserve the kernel mutex, we have to release the search + system latch first to obey the latching order. 
*/ + + innobase_release_stat_resources(trx); + + /* If the MySQL lock count drops to zero we know that the current SQL + statement has ended */ + + if (trx->n_mysql_tables_in_use == 0) { + + trx->mysql_n_tables_locked = 0; + prebuilt->used_in_HANDLER = FALSE; + + if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + if (trx->active_trans != 0) { + innobase_commit(ht, thd, TRUE); + } + } else { + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && trx->global_read_view) { + + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + + read_view_close_for_mysql(trx); + } + } + } + + DBUG_RETURN(0); +} + +/********************************************************************** +With this function MySQL request a transactional lock to a table when +user issued query LOCK TABLES..WHERE ENGINE = InnoDB. */ +UNIV_INTERN +int +ha_innobase::transactional_table_lock( +/*==================================*/ + /* out: error code */ + THD* thd, /* in: handle to the user thread */ + int lock_type) /* in: lock type */ +{ + trx_t* trx; + + DBUG_ENTER("ha_innobase::transactional_table_lock"); + DBUG_PRINT("enter",("lock_type: %d", lock_type)); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. */ + + update_thd(thd); + + if (prebuilt->table->ibd_file_missing && !thd_tablespace_op(thd)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir?" + "InnoDB: See" + " http://dev.mysql.com/doc/refman/5.1/en/innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + DBUG_RETURN(HA_ERR_CRASHED); + } + + trx = prebuilt->trx; + + prebuilt->sql_stat_start = TRUE; + prebuilt->hint_need_to_fetch_extra_cols = 0; + + reset_template(prebuilt); + + if (lock_type == F_WRLCK) { + prebuilt->select_lock_type = LOCK_X; + prebuilt->stored_select_lock_type = LOCK_X; + } else if (lock_type == F_RDLCK) { + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } else { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB error:\n" +"MySQL is trying to set transactional table lock with corrupted lock type\n" +"to table %s, lock type %d does not exist.\n", + prebuilt->table->name, lock_type); + DBUG_RETURN(HA_ERR_CRASHED); + } + + /* MySQL is setting a new transactional table lock */ + + /* Set the MySQL flag to mark that there is an active transaction */ + if (trx->active_trans == 0) { + + innobase_register_trx_and_stmt(ht, thd); + trx->active_trans = 1; + } + + if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) { + ulint error = DB_SUCCESS; + + error = row_lock_table_for_mysql(prebuilt, NULL, 0); + + if (error != DB_SUCCESS) { + error = convert_error_code_to_mysql( + (int) error, prebuilt->table->flags, thd); + DBUG_RETURN((int) error); + } + + if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + /* Store the current undo_no of the transaction + so that we know where to roll back if we have + to roll back the next SQL statement */ + + trx_mark_sql_stat_end(trx); + } + } + + DBUG_RETURN(0); +} + +/**************************************************************************** +Here we export InnoDB status variables to MySQL. 
*/ +static +int +innodb_export_status() +/*==================*/ +{ + if (innodb_inited) { + srv_export_innodb_status(); + } + + return(0); +} + +/**************************************************************************** +Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB +Monitor to the client. */ +static +bool +innodb_show_status( +/*===============*/ + handlerton* hton, /* in: the innodb handlerton */ + THD* thd, /* in: the MySQL query thread of the caller */ + stat_print_fn *stat_print) +{ + trx_t* trx; + static const char truncated_msg[] = "... truncated...\n"; + const long MAX_STATUS_SIZE = 64000; + ulint trx_list_start = ULINT_UNDEFINED; + ulint trx_list_end = ULINT_UNDEFINED; + + DBUG_ENTER("innodb_show_status"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + innobase_release_stat_resources(trx); + + /* We let the InnoDB Monitor output at most MAX_STATUS_SIZE + bytes of text. */ + + long flen, usable_len; + char* str; + + mutex_enter(&srv_monitor_file_mutex); + rewind(srv_monitor_file); + srv_printf_innodb_monitor(srv_monitor_file, + &trx_list_start, &trx_list_end); + flen = ftell(srv_monitor_file); + os_file_set_eof(srv_monitor_file); + + if (flen < 0) { + flen = 0; + } + + if (flen > MAX_STATUS_SIZE) { + usable_len = MAX_STATUS_SIZE; + } else { + usable_len = flen; + } + + /* allocate buffer for the string, and + read the contents of the temporary file */ + + if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) { + mutex_exit(&srv_monitor_file_mutex); + DBUG_RETURN(TRUE); + } + + rewind(srv_monitor_file); + if (flen < MAX_STATUS_SIZE) { + /* Display the entire output. */ + flen = (long) fread(str, 1, flen, srv_monitor_file); + } else if (trx_list_end < (ulint) flen + && trx_list_start < trx_list_end + && trx_list_start + (flen - trx_list_end) + < MAX_STATUS_SIZE - sizeof truncated_msg - 1) { + /* Omit the beginning of the list of active transactions. */ + long len = (long) fread(str, 1, trx_list_start, srv_monitor_file); + memcpy(str + len, truncated_msg, sizeof truncated_msg - 1); + len += sizeof truncated_msg - 1; + usable_len = (MAX_STATUS_SIZE - 1) - len; + fseek(srv_monitor_file, flen - usable_len, SEEK_SET); + len += (long) fread(str + len, 1, usable_len, srv_monitor_file); + flen = len; + } else { + /* Omit the end of the output. */ + flen = (long) fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file); + } + + mutex_exit(&srv_monitor_file_mutex); + + bool result = FALSE; + + if (stat_print(thd, innobase_hton_name, strlen(innobase_hton_name), + STRING_WITH_LEN(""), str, flen)) { + result= TRUE; + } + my_free(str, MYF(0)); + + DBUG_RETURN(result); +} + +/**************************************************************************** +Implements the SHOW MUTEX STATUS command.
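+ (Reached via SHOW ENGINE INNODB MUTEX; UNIV_DEBUG builds print a per-mutex breakdown below, while non-debug builds report only file:line and os_waits.)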
*/ +static +bool +innodb_mutex_show_status( +/*=====================*/ + handlerton* hton, /* in: the innodb handlerton */ + THD* thd, /* in: the MySQL query thread of the + caller */ + stat_print_fn* stat_print) +{ + char buf1[IO_SIZE], buf2[IO_SIZE]; + mutex_t* mutex; + rw_lock_t* lock; +#ifdef UNIV_DEBUG + ulint rw_lock_count= 0; + ulint rw_lock_count_spin_loop= 0; + ulint rw_lock_count_spin_rounds= 0; + ulint rw_lock_count_os_wait= 0; + ulint rw_lock_count_os_yield= 0; + ulonglong rw_lock_wait_time= 0; +#endif /* UNIV_DEBUG */ + uint hton_name_len= strlen(innobase_hton_name), buf1len, buf2len; + DBUG_ENTER("innodb_mutex_show_status"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + mutex_enter(&mutex_list_mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex != NULL) { +#ifdef UNIV_DEBUG + if (mutex->mutex_type != 1) { + if (mutex->count_using > 0) { + buf1len= my_snprintf(buf1, sizeof(buf1), + "%s:%s", + mutex->cmutex_name, mutex->cfile_name); + buf2len= my_snprintf(buf2, sizeof(buf2), + "count=%lu, spin_waits=%lu," + " spin_rounds=%lu, " + "os_waits=%lu, os_yields=%lu," + " os_wait_times=%lu", + mutex->count_using, + mutex->count_spin_loop, + mutex->count_spin_rounds, + mutex->count_os_wait, + mutex->count_os_yield, + (ulong) (mutex->lspent_time/1000)); + + if (stat_print(thd, innobase_hton_name, + hton_name_len, buf1, buf1len, + buf2, buf2len)) { + mutex_exit(&mutex_list_mutex); + DBUG_RETURN(1); + } + } + } + else { + rw_lock_count += mutex->count_using; + rw_lock_count_spin_loop += mutex->count_spin_loop; + rw_lock_count_spin_rounds += mutex->count_spin_rounds; + rw_lock_count_os_wait += mutex->count_os_wait; + rw_lock_count_os_yield += mutex->count_os_yield; + rw_lock_wait_time += mutex->lspent_time; + } +#else /* UNIV_DEBUG */ + buf1len= my_snprintf(buf1, sizeof(buf1), "%s:%lu", + mutex->cfile_name, (ulong) mutex->cline); + buf2len= my_snprintf(buf2, sizeof(buf2), "os_waits=%lu", + mutex->count_os_wait); + + if (stat_print(thd, innobase_hton_name, + hton_name_len, buf1, buf1len, + buf2, buf2len)) { + mutex_exit(&mutex_list_mutex); + DBUG_RETURN(1); + } +#endif /* UNIV_DEBUG */ + + mutex = UT_LIST_GET_NEXT(list, mutex); + } + + mutex_exit(&mutex_list_mutex); + + mutex_enter(&rw_lock_list_mutex); + + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { + if (lock->count_os_wait) { + buf1len= my_snprintf(buf1, sizeof(buf1), "%s:%lu", + lock->cfile_name, (ulong) lock->cline); + buf2len= my_snprintf(buf2, sizeof(buf2), + "os_waits=%lu", lock->count_os_wait); + + if (stat_print(thd, innobase_hton_name, + hton_name_len, buf1, buf1len, + buf2, buf2len)) { + mutex_exit(&rw_lock_list_mutex); + DBUG_RETURN(1); + } + } + lock = UT_LIST_GET_NEXT(list, lock); + } + + mutex_exit(&rw_lock_list_mutex); + +#ifdef UNIV_DEBUG + buf2len= my_snprintf(buf2, sizeof(buf2), + "count=%lu, spin_waits=%lu, spin_rounds=%lu, " + "os_waits=%lu, os_yields=%lu, os_wait_times=%lu", + rw_lock_count, rw_lock_count_spin_loop, + rw_lock_count_spin_rounds, + rw_lock_count_os_wait, rw_lock_count_os_yield, + (ulong) (rw_lock_wait_time/1000)); + + if (stat_print(thd, innobase_hton_name, hton_name_len, + STRING_WITH_LEN("rw_lock_mutexes"), buf2, buf2len)) { + DBUG_RETURN(1); + } +#endif /* UNIV_DEBUG */ + + DBUG_RETURN(FALSE); +} + +static +bool innobase_show_status(handlerton *hton, THD* thd, + stat_print_fn* stat_print, + enum ha_stat_type stat_type) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + switch (stat_type) { + case HA_ENGINE_STATUS: + return innodb_show_status(hton, thd, stat_print); + 
case HA_ENGINE_MUTEX: + return innodb_mutex_show_status(hton, thd, stat_print); + default: + return(FALSE); + } +} + +/**************************************************************************** + Handling the shared INNOBASE_SHARE structure that is needed to provide table + locking. +****************************************************************************/ + +static INNOBASE_SHARE* get_share(const char* table_name) +{ + INNOBASE_SHARE *share; + pthread_mutex_lock(&innobase_share_mutex); + + ulint fold = ut_fold_string(table_name); + + HASH_SEARCH(table_name_hash, innobase_open_tables, fold, + INNOBASE_SHARE*, share, + ut_ad(share->use_count > 0), + !strcmp(share->table_name, table_name)); + + if (!share) { + + uint length = (uint) strlen(table_name); + + /* TODO: invoke HASH_MIGRATE if innobase_open_tables + grows too big */ + + share = (INNOBASE_SHARE *) my_malloc(sizeof(*share)+length+1, + MYF(MY_FAE | MY_ZEROFILL)); + + share->table_name = (char*) memcpy(share + 1, + table_name, length + 1); + + HASH_INSERT(INNOBASE_SHARE, table_name_hash, + innobase_open_tables, fold, share); + + thr_lock_init(&share->lock); + pthread_mutex_init(&share->mutex,MY_MUTEX_INIT_FAST); + } + + share->use_count++; + pthread_mutex_unlock(&innobase_share_mutex); + + return(share); +} + +static void free_share(INNOBASE_SHARE* share) +{ + pthread_mutex_lock(&innobase_share_mutex); + +#ifdef UNIV_DEBUG + INNOBASE_SHARE* share2; + ulint fold = ut_fold_string(share->table_name); + + HASH_SEARCH(table_name_hash, innobase_open_tables, fold, + INNOBASE_SHARE*, share2, + ut_ad(share->use_count > 0), + !strcmp(share->table_name, share2->table_name)); + + ut_a(share2 == share); +#endif /* UNIV_DEBUG */ + + if (!--share->use_count) { + ulint fold = ut_fold_string(share->table_name); + + HASH_DELETE(INNOBASE_SHARE, table_name_hash, + innobase_open_tables, fold, share); + thr_lock_delete(&share->lock); + pthread_mutex_destroy(&share->mutex); + my_free(share, MYF(0)); + + /* TODO: invoke HASH_MIGRATE if innobase_open_tables + shrinks too much */ + } + + pthread_mutex_unlock(&innobase_share_mutex); +} + +/********************************************************************* +Converts a MySQL table lock stored in the 'lock' field of the handle to +a proper type before storing pointer to the lock into an array of pointers. +MySQL also calls this if it wants to reset some table locks to a not-locked +state during the processing of an SQL query. An example is that during a +SELECT the read lock is released early on the 'const' tables where we only +fetch one row. MySQL does not call this when it releases all locks at the +end of an SQL statement. */ +UNIV_INTERN +THR_LOCK_DATA** +ha_innobase::store_lock( +/*====================*/ + /* out: pointer to the next + element in the 'to' array */ + THD* thd, /* in: user thread handle */ + THR_LOCK_DATA** to, /* in: pointer to an array + of pointers to lock structs; + pointer to the 'lock' field + of current handle is stored + next to this array */ + enum thr_lock_type lock_type) /* in: lock type to store in + 'lock'; this may also be + TL_IGNORE */ +{ + trx_t* trx; + + /* Note that trx in this function is NOT necessarily prebuilt->trx + because we call update_thd() later, in ::external_lock()! Failure to + understand this caused a serious memory corruption bug in 5.1.11. */ + + trx = check_trx_exists(thd); + + /* NOTE: MySQL can call this function with lock 'type' TL_IGNORE! + Be careful to ignore TL_IGNORE if we are going to do something with + only 'real' locks! 
*/ + + /* If no MySQL table is in use, we need to set the isolation level + of the transaction. */ + + if (lock_type != TL_IGNORE + && trx->n_mysql_tables_in_use == 0) { + trx->isolation_level = innobase_map_isolation_level( + (enum_tx_isolation) thd_tx_isolation(thd)); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && trx->global_read_view) { + + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + + read_view_close_for_mysql(trx); + } + } + + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + const bool in_lock_tables = thd_in_lock_tables(thd); + const uint sql_command = thd_sql_command(thd); + + if (sql_command == SQLCOM_DROP_TABLE) { + + /* MySQL calls this function in DROP TABLE though this table + handle may belong to another thd that is running a query. Let + us in that case skip any changes to the prebuilt struct. */ + + } else if ((lock_type == TL_READ && in_lock_tables) + || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) + || lock_type == TL_READ_WITH_SHARED_LOCKS + || lock_type == TL_READ_NO_INSERT + || (lock_type != TL_IGNORE + && sql_command != SQLCOM_SELECT)) { + + /* The OR cases above are in this order: + 1) MySQL is doing LOCK TABLES ... READ LOCAL, or we + are processing a stored procedure or function, or + 2) (we do not know when TL_READ_HIGH_PRIORITY is used), or + 3) this is a SELECT ... IN SHARE MODE, or + 4) we are doing a complex SQL statement like + INSERT INTO ... SELECT ... and the logical logging (MySQL + binlog) requires the use of a locking read, or + MySQL is doing LOCK TABLES ... READ. + 5) we let InnoDB do locking reads for all SQL statements that + are not simple SELECTs; note that select_lock_type in this + case may get strengthened in ::external_lock() to LOCK_X. + Note that we MUST use a locking read in all data modifying + SQL statements, because otherwise the execution would not be + serializable, and also the results from the update could be + unexpected if an obsolete consistent read view would be + used. */ + + ulint isolation_level; + + isolation_level = trx->isolation_level; + + if ((srv_locks_unsafe_for_binlog + || isolation_level == TRX_ISO_READ_COMMITTED) + && isolation_level != TRX_ISO_SERIALIZABLE + && (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) + && (sql_command == SQLCOM_INSERT_SELECT + || sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_CREATE_TABLE)) { + + /* If we either have innobase_locks_unsafe_for_binlog + option set or this session is using READ COMMITTED + isolation level and isolation level of the transaction + is not set to serializable and MySQL is doing + INSERT INTO...SELECT or UPDATE ... = (SELECT ...) or + CREATE ... SELECT... without FOR UPDATE or + IN SHARE MODE in select, then we use consistent + read for select. */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + } else if (sql_command == SQLCOM_CHECKSUM) { + /* Use consistent read for checksum table */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + } else { + prebuilt->select_lock_type = LOCK_S; + prebuilt->stored_select_lock_type = LOCK_S; + } + + } else if (lock_type != TL_IGNORE) { + + /* We set possible LOCK_X value in external_lock, not yet + here even if this would be SELECT ... 
FOR UPDATE */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + } + + if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) { + + /* Starting from 5.0.7, we weaken also the table locks + set at the start of a MySQL stored procedure call, just like + we weaken the locks set at the start of an SQL statement. + MySQL does set in_lock_tables TRUE there, but in reality + we do not need table locks to make the execution of a + single transaction stored procedure call deterministic + (if it does not use a consistent read). */ + + if (lock_type == TL_READ + && sql_command == SQLCOM_LOCK_TABLES) { + /* We come here if MySQL is processing LOCK TABLES + ... READ LOCAL. MyISAM under that table lock type + reads the table as it was at the time the lock was + granted (new inserts are allowed, but not seen by the + reader). To get a similar effect on an InnoDB table, + we must use LOCK TABLES ... READ. We convert the lock + type here, so that for InnoDB, READ LOCAL is + equivalent to READ. This will change the InnoDB + behavior in mysqldump, so that dumps of InnoDB tables + are consistent with dumps of MyISAM tables. */ + + lock_type = TL_READ_NO_INSERT; + } + + /* If we are not doing a LOCK TABLE, DISCARD/IMPORT + TABLESPACE or TRUNCATE TABLE then allow multiple + writers. Note that ALTER TABLE uses a TL_WRITE_ALLOW_READ + < TL_WRITE_CONCURRENT_INSERT. + + We especially allow multiple writers if MySQL is at the + start of a stored procedure call (SQLCOM_CALL) or a + stored function call (MySQL does have in_lock_tables + TRUE there). */ + + if ((lock_type >= TL_WRITE_CONCURRENT_INSERT + && lock_type <= TL_WRITE) + && !(in_lock_tables + && sql_command == SQLCOM_LOCK_TABLES) + && !thd_tablespace_op(thd) + && sql_command != SQLCOM_TRUNCATE + && sql_command != SQLCOM_OPTIMIZE + && sql_command != SQLCOM_CREATE_TABLE) { + + lock_type = TL_WRITE_ALLOW_WRITE; + } + + /* In queries of type INSERT INTO t1 SELECT ... FROM t2 ... + MySQL would use the lock TL_READ_NO_INSERT on t2, and that + would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts + to t2. Convert the lock to a normal read lock to allow + concurrent inserts to t2. + + We especially allow concurrent inserts if MySQL is at the + start of a stored procedure call (SQLCOM_CALL) + (MySQL does have thd_in_lock_tables() TRUE there). */ + + if (lock_type == TL_READ_NO_INSERT + && sql_command != SQLCOM_LOCK_TABLES) { + + lock_type = TL_READ; + } + + lock.type = lock_type; + } + + *to++= &lock; + + return(to); +} + +/******************************************************************************* +Read the next autoinc value. Acquire the relevant locks before reading +the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked +on return and all relevant locks acquired. */ +UNIV_INTERN +ulint +ha_innobase::innobase_get_autoinc( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + ulonglong* value) /* out: autoinc value */ +{ + *value = 0; + + prebuilt->autoinc_error = innobase_lock_autoinc(); + + if (prebuilt->autoinc_error == DB_SUCCESS) { + + /* Determine the first value of the interval */ + *value = dict_table_autoinc_read(prebuilt->table); + + /* It should have been initialized during open. */ + ut_a(*value != 0); + } + + return(prebuilt->autoinc_error); +} + +/*********************************************************************** +This function reads the global auto-inc counter. It doesn't use the +AUTOINC lock even if the lock mode is set to TRADITIONAL. 
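+For orientation, innodb_autoinc_lock_mode selects between: 0, the
+TRADITIONAL behaviour, where a table-level AUTO-INC lock is held until the
+end of the statement; 1, where a brief per-table mutex suffices for
+inserts whose row count is known in advance; and 2, no AUTO-INC locking at
+all, which is unsafe for statement-based replication (see the
+autoinc_lock_mode system variable further below). Peeking only ever takes
+the short dict_table_autoinc_lock() mutex that protects ordinary counter
+reads, never the table-level AUTO-INC lock.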
*/ +UNIV_INTERN +ulonglong +ha_innobase::innobase_peek_autoinc(void) +/*====================================*/ + /* out: the autoinc value */ +{ + ulonglong auto_inc; + dict_table_t* innodb_table; + + ut_a(prebuilt != NULL); + ut_a(prebuilt->table != NULL); + + innodb_table = prebuilt->table; + + dict_table_autoinc_lock(innodb_table); + + auto_inc = dict_table_autoinc_read(innodb_table); + + ut_a(auto_inc > 0); + + dict_table_autoinc_unlock(innodb_table); + + return(auto_inc); +} + +/******************************************************************************* +This function initializes the auto-inc counter if it has not been +initialized yet. This function does not change the value of the auto-inc +counter if it already has been initialized. Returns the value of the +auto-inc counter in *first_value, and ULONGLONG_MAX in *nb_reserved_values (as +we have a table-level lock). offset, increment, nb_desired_values are ignored. +*first_value is set to -1 if error (deadlock or lock wait timeout) */ +UNIV_INTERN +void +ha_innobase::get_auto_increment( +/*============================*/ + ulonglong offset, /* in: table autoinc offset */ + ulonglong increment, /* in: table autoinc increment */ + ulonglong nb_desired_values, /* in: number of values reqd */ + ulonglong *first_value, /* out: the autoinc value */ + ulonglong *nb_reserved_values) /* out: count of reserved values */ +{ + trx_t* trx; + ulint error; + ulonglong autoinc = 0; + + /* Prepare prebuilt->trx in the table handle */ + update_thd(ha_thd()); + + error = innobase_get_autoinc(&autoinc); + + if (error != DB_SUCCESS) { + *first_value = (~(ulonglong) 0); + return; + } + + /* This is a hack, since nb_desired_values seems to be accurate only + for the first call to get_auto_increment() for multi-row INSERT and + meaningless for other statements, e.g., LOAD, etc. Subsequent calls to + this method for the same statement result in different values which + don't make sense. Therefore we store the value the first time we are + called and count down from that as rows are written (see write_row()). + */ + + trx = prebuilt->trx; + + /* Note: We can't rely on *first_value since some MySQL engines, + in particular the partition engine, don't initialize it to 0 when + invoking this method. So we are not sure if it's guaranteed to + be 0 or not. */ + + /* Called for the first time? */ + if (trx->n_autoinc_rows == 0) { + + trx->n_autoinc_rows = (ulint) nb_desired_values; + + /* It's possible for nb_desired_values to be 0: + e.g., INSERT INTO T1(C) SELECT C FROM T2; */ + if (nb_desired_values == 0) { + + trx->n_autoinc_rows = 1; + } + + set_if_bigger(*first_value, autoinc); + /* Not in the middle of a multi-row INSERT. */ + } else if (prebuilt->autoinc_last_value == 0) { + set_if_bigger(*first_value, autoinc); + } + + *nb_reserved_values = trx->n_autoinc_rows; + + /* With old style AUTOINC locking we only update the table's + AUTOINC counter after attempting to insert the row. */ + if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) { + ulonglong need; + ulonglong next_value; + ulonglong col_max_value; + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. 
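+	A worked example (the values are illustrative): an INSERT known
+	to write 5 rows with increment = 1, offset = 1 and a current
+	counter of 101 sets need = 5 * 1 = 5 and reserves the interval
+	101..105; innobase_next_autoinc() then yields the end of that
+	interval, which is stored in prebuilt->autoinc_last_value and
+	pushed into the table counter so that concurrent statements
+	start above it. If the interval would pass the column maximum,
+	the computed value wraps below *first_value and the error is
+	signalled by returning ~0 as the first value.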
*/ + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); + + need = *nb_reserved_values * increment; + + /* Compute the last value in the interval */ + next_value = innobase_next_autoinc( + *first_value, need, offset, col_max_value); + + prebuilt->autoinc_last_value = next_value; + + if (prebuilt->autoinc_last_value < *first_value) { + *first_value = (~(ulonglong) 0); + } else { + /* Update the table autoinc variable */ + dict_table_autoinc_update_if_greater( + prebuilt->table, prebuilt->autoinc_last_value); + } + } else { + /* This will force write_row() into attempting an update + of the table's AUTOINC counter. */ + prebuilt->autoinc_last_value = 0; + } + + /* The increment to be used to increase the AUTOINC value, we use + this in write_row() and update_row() to increase the autoinc counter + for columns that are filled by the user. We need the offset and + the increment. */ + prebuilt->autoinc_offset = offset; + prebuilt->autoinc_increment = increment; + + dict_table_autoinc_unlock(prebuilt->table); +} + +/* See comment in handler.h */ +UNIV_INTERN +int +ha_innobase::reset_auto_increment( +/*==============================*/ + ulonglong value) /* in: new value for table autoinc */ +{ + DBUG_ENTER("ha_innobase::reset_auto_increment"); + + int error; + + update_thd(ha_thd()); + + error = row_lock_table_autoinc_for_mysql(prebuilt); + + if (error != DB_SUCCESS) { + error = convert_error_code_to_mysql(error, + prebuilt->table->flags, + user_thd); + + DBUG_RETURN(error); + } + + /* The next value can never be 0. */ + if (value == 0) { + value = 1; + } + + innobase_reset_autoinc(value); + + DBUG_RETURN(0); +} + +/* See comment in handler.cc */ +UNIV_INTERN +bool +ha_innobase::get_error_message(int error, String *buf) +{ + trx_t* trx = check_trx_exists(ha_thd()); + + buf->copy(trx->detailed_error, strlen(trx->detailed_error), + system_charset_info); + + return(FALSE); +} + +/*********************************************************************** +Compares two 'refs'. A 'ref' is the (internal) primary key value of the row. +If there is no explicitly declared non-null unique key or a primary key, then +InnoDB internally uses the row id as the primary key. */ +UNIV_INTERN +int +ha_innobase::cmp_ref( +/*=================*/ + /* out: < 0 if ref1 < ref2, 0 if equal, else + > 0 */ + const uchar* ref1, /* in: an (internal) primary key value in the + MySQL key value format */ + const uchar* ref2) /* in: an (internal) primary key value in the + MySQL key value format */ +{ + enum_field_types mysql_type; + Field* field; + KEY_PART_INFO* key_part; + KEY_PART_INFO* key_part_end; + uint len1; + uint len2; + int result; + + if (prebuilt->clust_index_was_generated) { + /* The 'ref' is an InnoDB row id */ + + return(memcmp(ref1, ref2, DATA_ROW_ID_LEN)); + } + + /* Do a type-aware comparison of primary key fields. PK fields + are always NOT NULL, so no checks for NULL are performed. 
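+	For BLOB key parts the MySQL key format stores a 2-byte
+	little-endian length in front of the column prefix; decoding it
+	(what innobase_read_from_2_little_endian() does) is simply
+
+		len = ((uint) ref[0]) | (((uint) ref[1]) << 8);
+
+	after which ref is advanced by 2 and len bytes are handed to
+	Field_blob::cmp().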
*/ + + key_part = table->key_info[table->s->primary_key].key_part; + + key_part_end = key_part + + table->key_info[table->s->primary_key].key_parts; + + for (; key_part != key_part_end; ++key_part) { + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB) { + + /* In the MySQL key value format, a column prefix of + a BLOB is preceded by a 2-byte length field */ + + len1 = innobase_read_from_2_little_endian(ref1); + len2 = innobase_read_from_2_little_endian(ref2); + + ref1 += 2; + ref2 += 2; + result = ((Field_blob*)field)->cmp( ref1, len1, + ref2, len2); + } else { + result = field->key_cmp(ref1, ref2); + } + + if (result) { + + return(result); + } + + ref1 += key_part->store_length; + ref2 += key_part->store_length; + } + + return(0); +} + +/*********************************************************************** +Ask InnoDB if a query to a table can be cached. */ +UNIV_INTERN +my_bool +ha_innobase::register_query_cache_table( +/*====================================*/ + /* out: TRUE if query caching + of the table is permitted */ + THD* thd, /* in: user thread handle */ + char* table_key, /* in: concatenation of database name, + the null character '\0', + and the table name */ + uint key_length, /* in: length of the full name, i.e. + len(dbname) + len(tablename) + 1 */ + qc_engine_callback* + call_back, /* out: pointer to function for + checking if query caching + is permitted */ + ulonglong *engine_data) /* in/out: data to call_back */ +{ + *call_back = innobase_query_caching_of_table_permitted; + *engine_data = 0; + return(innobase_query_caching_of_table_permitted(thd, table_key, + key_length, + engine_data)); +} + +UNIV_INTERN +char* +ha_innobase::get_mysql_bin_log_name() +{ + return(trx_sys_mysql_bin_log_name); +} + +UNIV_INTERN +ulonglong +ha_innobase::get_mysql_bin_log_pos() +{ + /* trx... is ib_int64_t, which is a typedef for a 64-bit integer + (__int64 or longlong) so it's ok to cast it to ulonglong. */ + + return(trx_sys_mysql_bin_log_pos); +} + +/********************************************************************** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. + +NOTE: the prototype of this function is copied to data0type.c! If you change +this function, you MUST change also data0type.c! 
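+Worked example (UTF-8, mbmaxlen = 3): a 12-byte index prefix holds at
+most n = 12 / 3 = 4 characters. For the 6-byte, 5-character value
+"h\xC3\xA9llo" my_charpos() reports that the first 4 characters occupy
+5 bytes, so 5 is returned; for a value of only 3 characters my_charpos()
+returns a length past the end of the string and the result is clamped
+to data_len, i.e. the whole value is stored in the prefix.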
*/ +extern "C" UNIV_INTERN +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + /* out: number of bytes occupied by the first + n characters */ + ulint charset_id, /* in: character set id */ + ulint prefix_len, /* in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /* in: length of the string in bytes */ + const char* str) /* in: character string */ +{ + ulint char_length; /* character length in bytes */ + ulint n_chars; /* number of characters in prefix */ + CHARSET_INFO* charset; /* charset used in the field */ + + charset = get_charset((uint) charset_id, MYF(MY_WME)); + + ut_ad(charset); + ut_ad(charset->mbmaxlen); + + /* Calculate how many characters at most the prefix index contains */ + + n_chars = prefix_len / charset->mbmaxlen; + + /* If the charset is multi-byte, then we must find the length of the + first at most n chars in the string. If the string contains less + characters than n, then we return the length to the end of the last + character. */ + + if (charset->mbmaxlen > 1) { + /* my_charpos() returns the byte length of the first n_chars + characters, or a value bigger than the length of str, if + there were not enough full characters in str. + + Why does the code below work: + Suppose that we are looking for n UTF-8 characters. + + 1) If the string is long enough, then the prefix contains at + least n complete UTF-8 characters + maybe some extra + characters + an incomplete UTF-8 character. No problem in + this case. The function returns the pointer to the + end of the nth character. + + 2) If the string is not long enough, then the string contains + the complete value of a column, that is, only complete UTF-8 + characters, and we can store in the column prefix index the + whole string. */ + + char_length = my_charpos(charset, str, + str + data_len, (int) n_chars); + if (char_length > data_len) { + char_length = data_len; + } + } else { + if (data_len < prefix_len) { + char_length = data_len; + } else { + char_length = prefix_len; + } + } + + return(char_length); +} + +/*********************************************************************** +This function is used to prepare X/Open XA distributed transaction */ +static +int +innobase_xa_prepare( +/*================*/ + /* out: 0 or error number */ + handlerton *hton, + THD* thd, /* in: handle to the MySQL thread of the user + whose XA transaction should be prepared */ + bool all) /* in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + int error = 0; + trx_t* trx = check_trx_exists(thd); + + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (thd_sql_command(thd) != SQLCOM_XA_PREPARE && + (all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) + { + if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) { + /* choose group commit rather than binlog order */ + return(0); + } + + /* For ibbackup to work the order of transactions in binlog + and InnoDB must be the same. Consider the situation + + thread1> prepare; write to binlog; ... + + thread2> prepare; write to binlog; commit + thread1> ... commit + + To ensure this will not happen we're taking the mutex on + prepare, and releasing it on commit. + + Note: only do it for normal commits, done via ha_commit_trans. + If 2pc protocol is executed by external transaction + coordinator, it will be just a regular MySQL client + executing XA PREPARE and XA COMMIT commands. 
+ In this case we cannot know how many minutes or hours + will be between XA PREPARE and XA COMMIT, and we don't want + to block for undefined period of time. + */ + pthread_mutex_lock(&prepare_commit_mutex); + trx->active_trans = 2; + } + + /* we use support_xa value as it was seen at transaction start + time, not the current session variable value. Any possible changes + to the session variable take effect only in the next transaction */ + if (!trx->support_xa) { + + return(0); + } + + thd_get_xid(thd, (MYSQL_XID*) &trx->xid); + + /* Release a possible FIFO ticket and search latch. Since we will + reserve the kernel mutex, we have to release the search system latch + first to obey the latching order. */ + + innobase_release_stat_resources(trx); + + if (trx->active_trans == 0 && trx->conc_state != TRX_NOT_STARTED) { + + sql_print_error("trx->active_trans == 0, but trx->conc_state != " + "TRX_NOT_STARTED"); + } + + if (all + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* We were instructed to prepare the whole transaction, or + this is an SQL statement end and autocommit is on */ + + ut_ad(trx->active_trans); + + error = (int) trx_prepare_for_mysql(trx); + } else { + /* We just mark the SQL statement ended and do not do a + transaction prepare */ + + /* If we had reserved the auto-inc lock for some + table in this SQL statement we release it now */ + + row_unlock_table_autoinc_for_mysql(trx); + + /* Store the current undo_no of the transaction so that we + know where to roll back if we have to roll back the next + SQL statement */ + + trx_mark_sql_stat_end(trx); + } + + /* Tell the InnoDB server that there might be work for utility + threads: */ + + srv_active_wake_master_thread(); + + return(error); +} + +/*********************************************************************** +This function is used to recover X/Open XA distributed transactions */ +static +int +innobase_xa_recover( +/*================*/ + /* out: number of prepared transactions + stored in xid_list */ + handlerton *hton, + XID* xid_list, /* in/out: prepared transactions */ + uint len) /* in: number of slots in xid_list */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (len == 0 || xid_list == NULL) { + + return(0); + } + + return(trx_recover_for_mysql(xid_list, len)); +} + +/*********************************************************************** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state */ +static +int +innobase_commit_by_xid( +/*===================*/ + /* out: 0 or error number */ + handlerton *hton, + XID* xid) /* in: X/Open XA transaction identification */ +{ + trx_t* trx; + + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = trx_get_trx_by_xid(xid); + + if (trx) { + innobase_commit_low(trx); + + return(XA_OK); + } else { + return(XAER_NOTA); + } +} + +/*********************************************************************** +This function is used to rollback one X/Open XA distributed transaction +which is in the prepared state */ +static +int +innobase_rollback_by_xid( +/*=====================*/ + /* out: 0 or error number */ + handlerton *hton, + XID *xid) /* in: X/Open XA transaction identification */ +{ + trx_t* trx; + + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = trx_get_trx_by_xid(xid); + + if (trx) { + return(innobase_rollback_trx(trx)); + } else { + return(XAER_NOTA); + } +} + +/*********************************************************************** +Create a consistent view for a cursor based on current transaction 
+which is created if the corresponding MySQL thread still lacks one. +This consistent view is then used inside of MySQL when accessing records +using a cursor. */ +static +void* +innobase_create_cursor_view( +/*========================*/ + /* out: pointer to cursor view or NULL */ + handlerton *hton, /* in: innobase hton */ + THD* thd) /* in: user thread handle */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + return(read_cursor_view_create_for_mysql(check_trx_exists(thd))); +} + +/*********************************************************************** +Close the given consistent cursor view of a transaction and restore +global read view to a transaction read view. Transaction is created if the +corresponding MySQL thread still lacks one. */ +static +void +innobase_close_cursor_view( +/*=======================*/ + handlerton *hton, + THD* thd, /* in: user thread handle */ + void* curview)/* in: Consistent read view to be closed */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + read_cursor_view_close_for_mysql(check_trx_exists(thd), + (cursor_view_t*) curview); +} + +/*********************************************************************** +Set the given consistent cursor view to a transaction which is created +if the corresponding MySQL thread still lacks one. If the given +consistent cursor view is NULL global read view of a transaction is +restored to a transaction read view. */ +static +void +innobase_set_cursor_view( +/*=====================*/ + handlerton *hton, + THD* thd, /* in: user thread handle */ + void* curview)/* in: Consistent cursor view to be set */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + read_cursor_set_for_mysql(check_trx_exists(thd), + (cursor_view_t*) curview); +} + + +UNIV_INTERN +bool +ha_innobase::check_if_incompatible_data( + HA_CREATE_INFO* info, + uint table_changes) +{ + if (table_changes != IS_EQUAL_YES) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that auto_increment value was not changed */ + if ((info->used_fields & HA_CREATE_USED_AUTO) && + info->auto_increment_value != 0) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that row format didn't change */ + if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) && + get_row_type() != info->row_type) { + + return(COMPATIBLE_DATA_NO); + } + + /* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */ + if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) { + return(COMPATIBLE_DATA_NO); + } + + return(COMPATIBLE_DATA_YES); +} + +/**************************************************************** +Validate the file format name and return its corresponding id. */ +static +uint +innobase_file_format_name_lookup( +/*=============================*/ + /* out: valid file format id*/ + const char* format_name) /* in: pointer to file format name */ +{ + char* endp; + uint format_id; + + ut_a(format_name != NULL); + + /* The format name can contain the format id itself instead of + the name and we check for that. */ + format_id = (uint) strtoul(format_name, &endp, 10); + + /* Check for valid parse. 
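+	E.g., assuming the usual id-to-name mapping (0 = "Antelope",
+	1 = "Barracuda"): the input "1" parses completely, so the
+	numeric branch accepts it as long as 1 <= DICT_TF_FORMAT_MAX;
+	the input "Barracuda" stops strtoul() at the first character,
+	so the loop below matches it case-insensitively and returns 1;
+	an unknown string falls through and returns
+	DICT_TF_FORMAT_MAX + 1, which callers treat as "not a valid
+	format".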
*/ + if (*endp == '\0' && *format_name != '\0') { + + if (format_id <= DICT_TF_FORMAT_MAX) { + + return(format_id); + } + } else { + + for (format_id = 0; format_id <= DICT_TF_FORMAT_MAX; + format_id++) { + const char* name; + + name = trx_sys_file_format_id_to_name(format_id); + + if (!innobase_strcasecmp(format_name, name)) { + + return(format_id); + } + } + } + + return(DICT_TF_FORMAT_MAX + 1); +} + +/**************************************************************** +Validate the file format check value, is it one of "on" or "off", +as a side effect it sets the srv_check_file_format_at_startup variable. */ +static +bool +innobase_file_format_check_on_off( +/*==============================*/ + /* out: true if config value one + of "on" or "off" */ + const char* format_check) /* in: parameter value */ +{ + bool ret = true; + + if (!innobase_strcasecmp(format_check, "off")) { + + /* Set the value to disable checking. */ + srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX + 1; + + } else if (!innobase_strcasecmp(format_check, "on")) { + + /* Set the value to the lowest supported format. */ + srv_check_file_format_at_startup = DICT_TF_FORMAT_51; + } else { + ret = FALSE; + } + + return(ret); +} + +/**************************************************************** +Validate the file format check config parameters, as a side effect it +sets the srv_check_file_format_at_startup variable. */ +static +bool +innobase_file_format_check_validate( +/*================================*/ + /* out: true if valid config value */ + const char* format_check) /* in: parameter value */ +{ + uint format_id; + bool ret = true; + + format_id = innobase_file_format_name_lookup(format_check); + + if (format_id < DICT_TF_FORMAT_MAX + 1) { + srv_check_file_format_at_startup = format_id; + } else { + ret = false; + } + + return(ret); +} + +/***************************************************************** +Check if it is a valid file format. This function is registered as +a callback with MySQL. */ +static +int +innodb_file_format_name_validate( +/*=============================*/ + /* out: 0 for valid file + format */ + THD* thd, /* in: thread handle */ + struct st_mysql_sys_var* var, /* in: pointer to system + variable */ + void* save, /* out: immediate result + for update function */ + struct st_mysql_value* value) /* in: incoming string */ +{ + const char* file_format_input; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + ut_a(save != NULL); + ut_a(value != NULL); + + file_format_input = value->val_str(value, buff, &len); + + if (file_format_input != NULL) { + uint format_id; + + format_id = innobase_file_format_name_lookup( + file_format_input); + + if (format_id <= DICT_TF_FORMAT_MAX) { + + *(uint*) save = format_id; + return(0); + } + } + + return(1); +} + +/******************************************************************** +Update the system variable innodb_file_format using the "saved" +value. This function is registered as a callback with MySQL. 
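+For example, SET GLOBAL innodb_file_format = 'Barracuda' first runs
+innodb_file_format_name_validate() above, which resolves the name (or a
+numeric id) and stores the id in the "save" slot, returning nonzero to
+reject the statement if the lookup fails; only then does MySQL call this
+function, which publishes the id in srv_file_format and writes the
+canonical name back through var_ptr, so SHOW VARIABLES always reports
+the name rather than a raw number.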
*/ +static +void +innodb_file_format_name_update( +/*===========================*/ + THD* thd, /* in: thread handle */ + struct st_mysql_sys_var* var, /* in: pointer to + system variable */ + void* var_ptr, /* out: where the + formal string goes */ + const void* save) /* in: immediate result + from check function */ +{ + ut_a(var_ptr != NULL); + ut_a(save != NULL); + ut_a((*(const uint*) save) <= DICT_TF_FORMAT_MAX); + + srv_file_format = *(const uint*) save; + + *(const char**) var_ptr + = trx_sys_file_format_id_to_name(srv_file_format); +} + +/***************************************************************** +Check if valid argument to innodb_file_format_check. This +function is registered as a callback with MySQL. */ +static +int +innodb_file_format_check_validate( +/*==============================*/ + /* out: 0 for valid file + format */ + THD* thd, /* in: thread handle */ + struct st_mysql_sys_var* var, /* in: pointer to system + variable */ + void* save, /* out: immediate result + for update function */ + struct st_mysql_value* value) /* in: incoming string */ +{ + const char* file_format_input; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + ut_a(save != NULL); + ut_a(value != NULL); + + file_format_input = value->val_str(value, buff, &len); + + if (file_format_input != NULL) { + + /* Check if user set on/off, we want to print a suitable + message if they did so. */ + + if (innobase_file_format_check_on_off(file_format_input)) { + sql_print_warning( + "InnoDB: invalid innodb_file_format_check " + "value; on/off can only be set at startup or " + "in the configuration file"); + } else if (innobase_file_format_check_validate( + file_format_input)) { + + uint format_id; + + format_id = innobase_file_format_name_lookup( + file_format_input); + + ut_a(format_id <= DICT_TF_FORMAT_MAX); + + *(uint*) save = format_id; + + return(0); + + } else { + sql_print_warning( + "InnoDB: invalid innodb_file_format_check " + "value; can be any format up to %s " + "or its equivalent numeric id", + trx_sys_file_format_id_to_name( + DICT_TF_FORMAT_MAX)); + } + } + + return(1); +} + +/******************************************************************** +Update the system variable innodb_file_format_check using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_file_format_check_update( +/*============================*/ + THD* thd, /* in: thread handle */ + struct st_mysql_sys_var* var, /* in: pointer to + system variable */ + void* var_ptr, /* out: where the + formal string goes */ + const void* save) /* in: immediate result + from check function */ +{ + uint format_id; + + ut_a(save != NULL); + ut_a(var_ptr != NULL); + + format_id = *(const uint*) save; + + /* Update the max format id in the system tablespace. */ + if (trx_sys_file_format_max_set(format_id, (const char**) var_ptr)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " [Info] InnoDB: the file format in the system " + "tablespace is now set to %s.\n", *(char**) var_ptr); + } +} + +/******************************************************************** +Update the system variable innodb_adaptive_hash_index using the "saved" +value. This function is registered as a callback with MySQL. 
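+E.g., SET GLOBAL innodb_adaptive_hash_index = OFF arrives here with
+*save == FALSE and calls btr_search_disable(); setting the variable back
+to ON calls btr_search_enable() again. The plain boolean needs no
+separate validate callback, which is why the check slot of the
+adaptive_hash_index sysvar declaration further below is NULL.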
*/ +static +void +innodb_adaptive_hash_index_update( +/*==============================*/ + THD* thd, /* in: thread handle */ + struct st_mysql_sys_var* var, /* in: pointer to + system variable */ + void* var_ptr, /* out: where the + formal string goes */ + const void* save) /* in: immediate result + from check function */ +{ + if (*(my_bool*) save) { + btr_search_enable(); + } else { + btr_search_disable(); + } +} + +/***************************************************************** +Check if it is a valid value of innodb_change_buffering. This function is +registered as a callback with MySQL. */ +static +int +innodb_change_buffering_validate( +/*=============================*/ + /* out: 0 for valid + innodb_change_buffering */ + THD* thd, /* in: thread handle */ + struct st_mysql_sys_var* var, /* in: pointer to system + variable */ + void* save, /* out: immediate result + for update function */ + struct st_mysql_value* value) /* in: incoming string */ +{ + const char* change_buffering_input; + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + ut_a(save != NULL); + ut_a(value != NULL); + + change_buffering_input = value->val_str(value, buff, &len); + + if (change_buffering_input != NULL) { + ulint use; + + for (use = 0; use < UT_ARR_SIZE(innobase_change_buffering_values); + use++) { + if (!innobase_strcasecmp( + change_buffering_input, + innobase_change_buffering_values[use])) { + *(ibuf_use_t*) save = (ibuf_use_t) use; + return(0); + } + } + } + + return(1); +} + +/******************************************************************** +Update the system variable innodb_change_buffering using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_change_buffering_update( +/*===========================*/ + THD* thd, /* in: thread handle */ + struct st_mysql_sys_var* var, /* in: pointer to + system variable */ + void* var_ptr, /* out: where the + formal string goes */ + const void* save) /* in: immediate result + from check function */ +{ + ut_a(var_ptr != NULL); + ut_a(save != NULL); + ut_a((*(ibuf_use_t*) save) < IBUF_USE_COUNT); + + ibuf_use = *(const ibuf_use_t*) save; + + *(const char**) var_ptr = innobase_change_buffering_values[ibuf_use]; +} + +static int show_innodb_vars(THD *thd, SHOW_VAR *var, char *buff) +{ + innodb_export_status(); + var->type= SHOW_ARRAY; + var->value= (char *) &innodb_status_variables; + return 0; +} + +static SHOW_VAR innodb_status_variables_export[]= { + {"Innodb", (char*) &show_innodb_vars, SHOW_FUNC}, + {NullS, NullS, SHOW_LONG} +}; + +static struct st_mysql_storage_engine innobase_storage_engine= +{ MYSQL_HANDLERTON_INTERFACE_VERSION }; + +/* plugin options */ +static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable InnoDB checksums validation (enabled by default). " + "Disable with --skip-innodb-checksums.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir, + PLUGIN_VAR_READONLY, + "The common part for InnoDB table spaces.", + NULL, NULL, NULL); + +static MYSQL_SYSVAR_BOOL(extra_undoslots, innobase_extra_undoslots, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable to use about 4000 undo slots instead of default 1024. " + "#### Attention: Once you enable this parameter, " + "don't use the datafile for normal mysqld or ibbackup! 
####", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(overwrite_relay_log_info, innobase_overwrite_relay_log_info, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "During InnoDB crash recovery on slave overwrite relay-log.info " + "to align master log file position if information in InnoDB and relay-log.info is different.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable InnoDB doublewrite buffer (enabled by default). " + "Disable with --skip-innodb-doublewrite.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown, + PLUGIN_VAR_OPCMDARG, + "Speeds up the shutdown process of the InnoDB storage engine. Possible " + "values are 0, 1 (faster)" + /* + NetWare can't close unclosed files, can't automatically kill remaining + threads, etc, so on this OS we disable the crash-like InnoDB shutdown. + */ + IF_NETWARE("", " or 2 (fastest - crash-like)") + ".", + NULL, NULL, 1, 0, IF_NETWARE(1,2), 0); + +static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table, + PLUGIN_VAR_NOCMDARG, + "Stores each InnoDB table to an .ibd file in the database dir.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name, + PLUGIN_VAR_RQCMDARG, + "File format to use for new tables in .ibd files.", + innodb_file_format_name_validate, + innodb_file_format_name_update, "Antelope"); + +static MYSQL_SYSVAR_STR(file_format_check, innobase_file_format_check, + PLUGIN_VAR_OPCMDARG, + "The highest file format in the tablespace.", + innodb_file_format_check_validate, + innodb_file_format_check_update, + "on"); + +static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, + PLUGIN_VAR_OPCMDARG, + "Set to 0 (write and flush once per second)," + " 1 (write and flush at each commit)" + " or 2 (write at commit, flush once per second).", + NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_STR(flush_method, innobase_unix_file_flush_method, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "With which method to flush data.", NULL, NULL, NULL); + +static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Force InnoDB to not use next-key locking, to use only row-level locking.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(show_verbose_locks, srv_show_verbose_locks, + PLUGIN_VAR_OPCMDARG, + "Whether to show records locked in SHOW INNODB STATUS.", + NULL, NULL, 0, 0, 1, 0); + +static MYSQL_SYSVAR_ULONG(show_locks_held, srv_show_locks_held, + PLUGIN_VAR_RQCMDARG, + "Number of locks held to print for each InnoDB transaction in SHOW INNODB STATUS.", + NULL, NULL, 10, 0, 1000, 0); + +#ifdef UNIV_LOG_ARCHIVE +static MYSQL_SYSVAR_STR(log_arch_dir, innobase_log_arch_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Where full logs should be archived.", NULL, NULL, NULL); + +static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Set to 1 if you want to have logs archived.", NULL, NULL, FALSE); +#endif /* UNIV_LOG_ARCHIVE */ + +static MYSQL_SYSVAR_STR(log_group_home_dir, innobase_log_group_home_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to InnoDB log files.", NULL, NULL, NULL); + +static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages allowed in bufferpool.", + NULL, NULL, 90, 0, 100, 0); + +static MYSQL_SYSVAR_ULONG(max_purge_lag, 
srv_max_purge_lag, + PLUGIN_VAR_RQCMDARG, + "Desired maximum length of the purge queue (0 = no limit)", + NULL, NULL, 0, 0, ~0L, 0); + +static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR, + "Enable SHOW INNODB STATUS output in the innodb_status.<pid> file", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata, + PLUGIN_VAR_OPCMDARG, + "Enable statistics gathering for metadata commands such as SHOW TABLE STATUS (on by default)", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_sample_pages, + PLUGIN_VAR_RQCMDARG, + "The number of index pages to sample when calculating statistics (default 8)", + NULL, NULL, 8, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled, + PLUGIN_VAR_OPCMDARG, + "Enable InnoDB adaptive hash index (enabled by default). " + "Disable with --skip-innodb-adaptive-hash-index.", + NULL, innodb_adaptive_hash_index_update, TRUE); + +static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, + PLUGIN_VAR_RQCMDARG, + "Replication thread delay (ms) on the slave server if " + "innodb_thread_concurrency is reached (0 by default)", + NULL, NULL, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Size of a memory pool InnoDB uses to store data dictionary information and other internal data structures.", + NULL, NULL, 1*1024*1024L, 512*1024L, LONG_MAX, 1024); + +static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment, + PLUGIN_VAR_RQCMDARG, + "Data file autoextend increment in megabytes", + NULL, NULL, 8L, 1L, 1000L, 0); + +static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", + NULL, NULL, 8*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L); + +static MYSQL_SYSVAR_ULONG(commit_concurrency, srv_commit_concurrency, + PLUGIN_VAR_RQCMDARG, + "Helps in performance tuning in heavily concurrent environments.", + NULL, NULL, 0, 0, 1000, 0); + +static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter, + PLUGIN_VAR_RQCMDARG, + "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket", + NULL, NULL, 500L, 1L, ~0L, 0); + +static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of file I/O threads in InnoDB.", + NULL, NULL, 4, 4, 64, 0); + +static MYSQL_SYSVAR_LONG(force_recovery, innobase_force_recovery, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Helps to save your data in case the disk image of the database becomes corrupt.", + NULL, NULL, 0, 0, 6, 0); + +static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The size of the buffer which InnoDB uses to write log to the log files on disk.", + NULL, NULL, 1024*1024L, 256*1024L, LONG_MAX, 1024); + +static MYSQL_SYSVAR_LONGLONG(log_file_size, innobase_log_file_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Size of each log file in a log group.", + NULL, 
NULL, 5*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L); + +static MYSQL_SYSVAR_LONG(log_files_in_group, innobase_log_files_in_group, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of log files in the log group. InnoDB writes to the files in a circular fashion. Value 3 is recommended here.", + NULL, NULL, 2, 2, 100, 0); + +static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of identical copies of log groups we keep for the database. Currently this should be set to 1.", + NULL, NULL, 1, 1, 10, 0); + +static MYSQL_SYSVAR_LONG(open_files, innobase_open_files, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "How many files at the maximum InnoDB keeps open at the same time.", + NULL, NULL, 300L, 10L, LONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds, + PLUGIN_VAR_RQCMDARG, + "Count of spin-loop rounds in InnoDB mutexes", + NULL, NULL, 20L, 0L, ~0L, 0); + +static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency, + PLUGIN_VAR_RQCMDARG, + "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.", + NULL, NULL, 0, 0, 1000, 0); + +static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay, + PLUGIN_VAR_RQCMDARG, + "Time (in usec) an InnoDB thread sleeps before joining the InnoDB queue. Value 0 disables the sleep.", + NULL, NULL, 10000L, 0L, ~0L, 0); + +static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to individual files and their sizes.", + NULL, NULL, NULL); + +static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The AUTOINC lock modes supported by InnoDB: " + "0 => Old style AUTOINC locking (for backward" + " compatibility) " + "1 => New style AUTOINC locking " + "2 => No AUTOINC locking (unsafe for SBR)", + NULL, NULL, + AUTOINC_NEW_STYLE_LOCKING, /* Default setting */ + AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */ + AUTOINC_NO_LOCKING, 0); /* Maximum value */ + +static MYSQL_SYSVAR_STR(version, innodb_version_str, + PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY, + "Percona-InnoDB-plugin version", NULL, NULL, INNODB_VERSION_STR); + +static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use OS memory allocator instead of InnoDB's internal memory allocator", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering, + PLUGIN_VAR_RQCMDARG, + "Buffer changes to reduce random access: " + "OFF, ON, inserting, deleting, changing, or purging.", + innodb_change_buffering_validate, + innodb_change_buffering_update, NULL); + +static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, + PLUGIN_VAR_RQCMDARG, + "Number of IO operations per second the server can do. Tunes background IO rate.", + NULL, NULL, 100, 100, 999999999, 0); + +static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The maximum size of the insert buffer (in bytes).", + NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable active_contract of insert buffer. 
0:disable 1:enable", + NULL, NULL, 0, 0, 1, 0); + +static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate, + PLUGIN_VAR_RQCMDARG, + "Tunes the amount of background insert buffer processing, in addition to innodb_io_capacity (in percent).", + NULL, NULL, 100, 100, 999999999, 0); + +static MYSQL_SYSVAR_ULONG(flush_neighbor_pages, srv_flush_neighbor_pages, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable flushing also neighbor pages. 0:disable 1:enable", + NULL, NULL, 1, 0, 1, 0); + +static +void +innodb_read_ahead_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + *(long *)var_ptr= (*(long *)save) & 3; +} +const char *read_ahead_names[]= +{ + "none", /* 0 */ + "random", + "linear", + "both", /* 3 */ + /* For compatibility of the older patch */ + "0", /* 4 ("none" + 4) */ + "1", + "2", + "3", /* 7 ("both" + 4) */ + NullS +}; +TYPELIB read_ahead_typelib= +{ + array_elements(read_ahead_names) - 1, "read_ahead_typelib", + read_ahead_names, NULL +}; +static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead, + PLUGIN_VAR_RQCMDARG, + "Control read ahead activity. (none, random, linear, [both])", + NULL, innodb_read_ahead_update, 3, &read_ahead_typelib); + +static MYSQL_SYSVAR_ULONG(adaptive_checkpoint, srv_adaptive_checkpoint, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable flushing along modified age. 0:disable 1:enable", + NULL, NULL, 0, 0, 1, 0); + +static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable unsafe group commit when support_xa=OFF; use with the binlog or another XA storage engine.", + NULL, NULL, 0, 0, 1, 0); + +static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of background read I/O threads in InnoDB.", + NULL, NULL, 1, 1, 64, 0); + +static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of background write I/O threads in InnoDB.", + NULL, NULL, 1, 1, 64, 0); + +static MYSQL_SYSVAR_ULONG(extra_rsegments, srv_extra_rsegments, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of extra user rollback segments allocated when a new database is created.", + NULL, NULL, 0, 0, 127, 0); + +static struct st_mysql_sys_var* innobase_system_variables[]= { + MYSQL_SYSVAR(additional_mem_pool_size), + MYSQL_SYSVAR(autoextend_increment), + MYSQL_SYSVAR(buffer_pool_size), + MYSQL_SYSVAR(checksums), + MYSQL_SYSVAR(commit_concurrency), + MYSQL_SYSVAR(concurrency_tickets), + MYSQL_SYSVAR(data_file_path), + MYSQL_SYSVAR(data_home_dir), + MYSQL_SYSVAR(doublewrite), + MYSQL_SYSVAR(extra_undoslots), + MYSQL_SYSVAR(fast_shutdown), + MYSQL_SYSVAR(file_io_threads), + MYSQL_SYSVAR(file_per_table), + MYSQL_SYSVAR(file_format), + MYSQL_SYSVAR(file_format_check), + MYSQL_SYSVAR(flush_log_at_trx_commit), + MYSQL_SYSVAR(flush_method), + MYSQL_SYSVAR(force_recovery), + MYSQL_SYSVAR(locks_unsafe_for_binlog), + MYSQL_SYSVAR(lock_wait_timeout), +#ifdef UNIV_LOG_ARCHIVE + MYSQL_SYSVAR(log_arch_dir), + MYSQL_SYSVAR(log_archive), +#endif /* UNIV_LOG_ARCHIVE */ + MYSQL_SYSVAR(log_buffer_size), + MYSQL_SYSVAR(log_file_size), + MYSQL_SYSVAR(log_files_in_group), + MYSQL_SYSVAR(log_group_home_dir), + MYSQL_SYSVAR(max_dirty_pages_pct), + MYSQL_SYSVAR(max_purge_lag), + MYSQL_SYSVAR(mirrored_log_groups), + MYSQL_SYSVAR(open_files), + MYSQL_SYSVAR(overwrite_relay_log_info), + MYSQL_SYSVAR(rollback_on_timeout), + MYSQL_SYSVAR(stats_on_metadata), + MYSQL_SYSVAR(stats_sample_pages), + 
MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(replication_delay), + MYSQL_SYSVAR(status_file), + MYSQL_SYSVAR(strict_mode), + MYSQL_SYSVAR(support_xa), + MYSQL_SYSVAR(sync_spin_loops), + MYSQL_SYSVAR(table_locks), + MYSQL_SYSVAR(thread_concurrency), + MYSQL_SYSVAR(thread_sleep_delay), + MYSQL_SYSVAR(autoinc_lock_mode), + MYSQL_SYSVAR(show_verbose_locks), + MYSQL_SYSVAR(show_locks_held), + MYSQL_SYSVAR(version), + MYSQL_SYSVAR(io_capacity), + MYSQL_SYSVAR(ibuf_max_size), + MYSQL_SYSVAR(ibuf_active_contract), + MYSQL_SYSVAR(ibuf_accel_rate), + MYSQL_SYSVAR(flush_neighbor_pages), + MYSQL_SYSVAR(read_ahead), + MYSQL_SYSVAR(adaptive_checkpoint), + MYSQL_SYSVAR(enable_unsafe_group_commit), + MYSQL_SYSVAR(read_io_threads), + MYSQL_SYSVAR(write_io_threads), + MYSQL_SYSVAR(extra_rsegments), + MYSQL_SYSVAR(use_sys_malloc), + MYSQL_SYSVAR(change_buffering), + NULL +}; + +#ifdef MYSQL_DYNAMIC_PLUGIN +struct st_mysql_sys_var +{ + MYSQL_PLUGIN_VAR_HEADER; + void* value; +}; + +struct param_mapping +{ + const char* server; /* Parameter name in the server. */ + const char* plugin; /* Parameter name in the plugin. */ +}; + +/******************************************************************** +Match the parameters from the static and dynamic versions. */ +static +bool +innobase_match_parameter( +/*=====================*/ + /* out: true if names match */ + const char* from_server, /* in: variable name from server */ + const char* from_plugin) /* in: variable name from plugin */ +{ + static const param_mapping param_map[] = { + {"use_adaptive_hash_indexes", "adaptive_hash_index"} + }; + + if (strcmp(from_server, from_plugin) == 0) { + return(true); + } + + const param_mapping* param = param_map; + int n_elems = sizeof(param_map) / sizeof(param_map[0]); + + for (int i = 0; i < n_elems; ++i, ++param) { + + if (strcmp(param->server, from_server) == 0 + && strcmp(param->plugin, from_plugin) == 0) { + + return(true); + } + } + + return(false); +} + +/******************************************************************** +Copy InnoDB system variables from the static InnoDB to the dynamic +plugin. */ +static +bool +innodb_plugin_init(void) +/*====================*/ + /* out: TRUE if the dynamic InnoDB plugin should start */ +{ +#if !MYSQL_STORAGE_ENGINE_PLUGIN +#error "MYSQL_STORAGE_ENGINE_PLUGIN must be nonzero." +#endif + + /* Copy the system variables. */ + + struct st_mysql_plugin* builtin; + struct st_mysql_sys_var** sta; /* static parameters */ + struct st_mysql_sys_var** dyn; /* dynamic parameters */ + +#ifdef __WIN__ + if (!builtin_innobase_plugin_ptr) { + + return(true); + } + + builtin = builtin_innobase_plugin_ptr; +#else + switch (builtin_innobase_plugin) { + case 0: + return(true); + case MYSQL_STORAGE_ENGINE_PLUGIN: + break; + default: + return(false); + } + + builtin = (struct st_mysql_plugin*) &builtin_innobase_plugin; +#endif + + for (sta = builtin->system_vars; *sta != NULL; sta++) { + + for (dyn = innobase_system_variables; *dyn != NULL; dyn++) { + + /* do not copy session variables */ + if (((*sta)->flags | (*dyn)->flags) + & PLUGIN_VAR_THDLOCAL) { + continue; + } + + if (innobase_match_parameter((*sta)->name, + (*dyn)->name)) { + + /* found the corresponding parameter */ + + /* check if the flags are the same, + ignoring differences in the READONLY or + NOSYSVAR flags; + e.g. 
we are not copying string variable to + an integer one, but we do not care if it is + readonly in the static and not in the + dynamic */ + if (((*sta)->flags ^ (*dyn)->flags) + & ~(PLUGIN_VAR_READONLY + | PLUGIN_VAR_NOSYSVAR)) { + + fprintf(stderr, + "InnoDB: %s in static InnoDB " + "(flags=0x%x) differs from " + "%s in dynamic InnoDB " + "(flags=0x%x)\n", + (*sta)->name, (*sta)->flags, + (*dyn)->name, (*dyn)->flags); + + /* we could break; here leaving this + parameter uncopied */ + return(false); + } + + /* assign the value of the static parameter + to the dynamic one, according to their type */ + +#define COPY_VAR(label, type) \ + case label: \ + *(type*)(*dyn)->value = *(type*)(*sta)->value; \ + break; + + switch ((*sta)->flags + & ~(PLUGIN_VAR_MASK + | PLUGIN_VAR_UNSIGNED)) { + + COPY_VAR(PLUGIN_VAR_BOOL, char); + COPY_VAR(PLUGIN_VAR_INT, int); + COPY_VAR(PLUGIN_VAR_LONG, long); + COPY_VAR(PLUGIN_VAR_LONGLONG, long long); + COPY_VAR(PLUGIN_VAR_STR, char*); + + default: + fprintf(stderr, + "InnoDB: unknown flags " + "0x%x for %s\n", + (*sta)->flags, (*sta)->name); + } + + /* Make the static InnoDB variable point to + the dynamic one */ + (*sta)->value = (*dyn)->value; + + break; + } + } + } + + return(true); +} +#endif /* MYSQL_DYNAMIC_PLUGIN */ + +mysql_declare_plugin(innobase) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &innobase_storage_engine, + innobase_hton_name, + "Innobase Oy", + "Supports transactions, row-level locking, and foreign keys", + PLUGIN_LICENSE_GPL, + innobase_init, /* Plugin Init */ + NULL, /* Plugin Deinit */ + INNODB_VERSION_SHORT, + innodb_status_variables_export,/* status variables */ + innobase_system_variables, /* system variables */ + NULL /* reserved */ +}, +i_s_innodb_rseg, +i_s_innodb_buffer_pool_pages, +i_s_innodb_buffer_pool_pages_index, +i_s_innodb_buffer_pool_pages_blob, +i_s_innodb_trx, +i_s_innodb_locks, +i_s_innodb_lock_waits, +i_s_innodb_cmp, +i_s_innodb_cmp_reset, +i_s_innodb_cmpmem, +i_s_innodb_cmpmem_reset, +i_s_innodb_patches +mysql_declare_plugin_end; + +#ifdef UNIV_COMPILE_TEST_FUNCS + +typedef struct innobase_convert_name_test_struct { + char* buf; + ulint buflen; + const char* id; + ulint idlen; + void* thd; + ibool file_id; + + const char* expected; +} innobase_convert_name_test_t; + +void +test_innobase_convert_name() +{ + char buf[1024]; + ulint i; + + innobase_convert_name_test_t test_input[] = { + {buf, sizeof(buf), "abcd", 4, NULL, TRUE, "\"abcd\""}, + {buf, 7, "abcd", 4, NULL, TRUE, "\"abcd\""}, + {buf, 6, "abcd", 4, NULL, TRUE, "\"abcd\""}, + {buf, 5, "abcd", 4, NULL, TRUE, "\"abc\""}, + {buf, 4, "abcd", 4, NULL, TRUE, "\"ab\""}, + + {buf, sizeof(buf), "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 9, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 8, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 7, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""}, + {buf, 6, "ab@0060cd", 9, NULL, TRUE, "\"ab`c\""}, + {buf, 5, "ab@0060cd", 9, NULL, TRUE, "\"ab`\""}, + {buf, 4, "ab@0060cd", 9, NULL, TRUE, "\"ab\""}, + + {buf, sizeof(buf), "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"cd\""}, + {buf, 17, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"cd\""}, + {buf, 16, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"c\""}, + {buf, 15, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\"\"\""}, + {buf, 14, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\""}, + {buf, 13, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#ab\""}, + {buf, 12, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#a\""}, + {buf, 11, "ab\"cd", 5, NULL, TRUE, + "\"#mysql50#\""}, + {buf, 10, "ab\"cd", 5, NULL, TRUE, + 
"\"#mysql50\""}, + + {buf, sizeof(buf), "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""}, + {buf, 9, "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""}, + {buf, 8, "ab/cd", 5, NULL, TRUE, "\"ab\".\"c\""}, + {buf, 7, "ab/cd", 5, NULL, TRUE, "\"ab\".\"\""}, + {buf, 6, "ab/cd", 5, NULL, TRUE, "\"ab\"."}, + {buf, 5, "ab/cd", 5, NULL, TRUE, "\"ab\"."}, + {buf, 4, "ab/cd", 5, NULL, TRUE, "\"ab\""}, + {buf, 3, "ab/cd", 5, NULL, TRUE, "\"a\""}, + {buf, 2, "ab/cd", 5, NULL, TRUE, "\"\""}, + /* XXX probably "" is a better result in this case + {buf, 1, "ab/cd", 5, NULL, TRUE, "."}, + */ + {buf, 0, "ab/cd", 5, NULL, TRUE, ""}, + }; + + for (i = 0; i < sizeof(test_input) / sizeof(test_input[0]); i++) { + + char* end; + ibool ok = TRUE; + size_t res_len; + + fprintf(stderr, "TESTING %lu, %s, %lu, %s\n", + test_input[i].buflen, + test_input[i].id, + test_input[i].idlen, + test_input[i].expected); + + end = innobase_convert_name( + test_input[i].buf, + test_input[i].buflen, + test_input[i].id, + test_input[i].idlen, + test_input[i].thd, + test_input[i].file_id); + + res_len = (size_t) (end - test_input[i].buf); + + if (res_len != strlen(test_input[i].expected)) { + + fprintf(stderr, "unexpected len of the result: %u, " + "expected: %u\n", (unsigned) res_len, + (unsigned) strlen(test_input[i].expected)); + ok = FALSE; + } + + if (memcmp(test_input[i].buf, + test_input[i].expected, + strlen(test_input[i].expected)) != 0 + || !ok) { + + fprintf(stderr, "unexpected result: %.*s, " + "expected: %s\n", (int) res_len, + test_input[i].buf, + test_input[i].expected); + ok = FALSE; + } + + if (ok) { + fprintf(stderr, "OK: res: %.*s\n\n", (int) res_len, + buf); + } else { + fprintf(stderr, "FAILED\n\n"); + return; + } + } +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h new file mode 100644 index 00000000000..c08dd3ed173 --- /dev/null +++ b/storage/xtradb/handler/ha_innodb.h @@ -0,0 +1,283 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, MySQL AB & Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/* + This file is based on ha_berkeley.h of MySQL distribution + + This file defines the Innodb handler: the interface between MySQL and + Innodb +*/ + +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif + +typedef struct st_innobase_share { + THR_LOCK lock; + pthread_mutex_t mutex; + const char* table_name; + uint use_count; + void* table_name_hash; +} INNOBASE_SHARE; + + +struct dict_index_struct; +struct row_prebuilt_struct; + +typedef struct dict_index_struct dict_index_t; +typedef struct row_prebuilt_struct row_prebuilt_t; + +/* The class defining a handle to an Innodb table */ +class ha_innobase: public handler +{ + row_prebuilt_t* prebuilt; /* prebuilt struct in InnoDB, used + to save CPU time with prebuilt data + structures*/ + THD* user_thd; /* the thread handle of the user + currently using the handle; this is + set in external_lock function */ + THR_LOCK_DATA lock; + INNOBASE_SHARE *share; + + uchar* upd_buff; /* buffer used in updates */ + uchar* key_val_buff; /* buffer used in converting + search key values from MySQL format + to Innodb format */ + ulong upd_and_key_val_buff_len; + /* the length of each of the previous + two buffers */ + Table_flags int_table_flags; + uint primary_key; + ulong start_of_scan; /* this is set to 1 when we are + starting a table scan but have not + yet fetched any row, else 0 */ + uint last_match_mode;/* match mode of the latest search: + ROW_SEL_EXACT, ROW_SEL_EXACT_PREFIX, + or undefined */ + uint num_write_row; /* number of write_row() calls */ + + uint store_key_val_for_row(uint keynr, char* buff, uint buff_len, + const uchar* record); + inline void update_thd(THD* thd); + void update_thd(); + int change_active_index(uint keynr); + int general_fetch(uchar* buf, uint direction, uint match_mode); + ulint innobase_lock_autoinc(); + ulonglong innobase_peek_autoinc(); + ulint innobase_set_max_autoinc(ulonglong auto_inc); + ulint innobase_reset_autoinc(ulonglong auto_inc); + ulint innobase_get_autoinc(ulonglong* value); + ulint innobase_update_autoinc(ulonglong auto_inc); + ulint innobase_initialize_autoinc(); + dict_index_t* innobase_get_index(uint keynr); + ulonglong innobase_get_int_col_max_value(const Field* field); + + /* Init values for the class: */ + public: + ha_innobase(handlerton *hton, TABLE_SHARE *table_arg); + ~ha_innobase(); + /* + Get the row type from the storage engine. If this method returns + ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used. 
+ */ + enum row_type get_row_type() const; + + const char* table_type() const; + const char* index_type(uint key_number); + const char** bas_ext() const; + Table_flags table_flags() const; + ulong index_flags(uint idx, uint part, bool all_parts) const; + uint max_supported_keys() const; + uint max_supported_key_length() const; + uint max_supported_key_part_length() const; + const key_map* keys_to_use_for_scanning(); + + int open(const char *name, int mode, uint test_if_locked); + int close(void); + double scan_time(); + double read_time(uint index, uint ranges, ha_rows rows); + + int write_row(uchar * buf); + int update_row(const uchar * old_data, uchar * new_data); + int delete_row(const uchar * buf); + bool was_semi_consistent_read(); + void try_semi_consistent_read(bool yes); + void unlock_row(); + +#ifdef ROW_MERGE_IS_INDEX_USABLE + /** Check if an index can be used by this transaction. + * @param keynr key number to check + * @return true if available, false if the index + * does not contain old records that exist + * in the read view of this transaction */ + bool is_index_available(uint keynr); +#endif /* ROW_MERGE_IS_INDEX_USABLE */ + int index_init(uint index, bool sorted); + int index_end(); + int index_read(uchar * buf, const uchar * key, + uint key_len, enum ha_rkey_function find_flag); + int index_read_idx(uchar * buf, uint index, const uchar * key, + uint key_len, enum ha_rkey_function find_flag); + int index_read_last(uchar * buf, const uchar * key, uint key_len); + int index_next(uchar * buf); + int index_next_same(uchar * buf, const uchar *key, uint keylen); + int index_prev(uchar * buf); + int index_first(uchar * buf); + int index_last(uchar * buf); + + int rnd_init(bool scan); + int rnd_end(); + int rnd_next(uchar *buf); + int rnd_pos(uchar * buf, uchar *pos); + + void position(const uchar *record); + int info(uint); + int analyze(THD* thd,HA_CHECK_OPT* check_opt); + int optimize(THD* thd,HA_CHECK_OPT* check_opt); + int discard_or_import_tablespace(my_bool discard); + int extra(enum ha_extra_function operation); + int reset(); + int external_lock(THD *thd, int lock_type); + int transactional_table_lock(THD *thd, int lock_type); + int start_stmt(THD *thd, thr_lock_type lock_type); + void position(uchar *record); + ha_rows records_in_range(uint inx, key_range *min_key, key_range + *max_key); + ha_rows estimate_rows_upper_bound(); + + void update_create_info(HA_CREATE_INFO* create_info); + int create(const char *name, register TABLE *form, + HA_CREATE_INFO *create_info); + int delete_all_rows(); + int delete_table(const char *name); + int rename_table(const char* from, const char* to); + int check(THD* thd, HA_CHECK_OPT* check_opt); + char* update_table_comment(const char* comment); + char* get_foreign_key_create_info(); + int get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list); + bool can_switch_engines(); + uint referenced_by_foreign_key(); + void free_foreign_key_create_info(char* str); + THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, + enum thr_lock_type lock_type); + void init_table_handle_for_HANDLER(); + virtual void get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values); + int reset_auto_increment(ulonglong value); + + virtual bool get_error_message(int error, String *buf); + + uint8 table_cache_type(); + /* + ask handler about permission to cache table during query registration + */ + my_bool register_query_cache_table(THD *thd, char *table_key, + uint key_length, +
qc_engine_callback *call_back, + ulonglong *engine_data); + static char *get_mysql_bin_log_name(); + static ulonglong get_mysql_bin_log_pos(); + bool primary_key_is_clustered(); + int cmp_ref(const uchar *ref1, const uchar *ref2); + /** Fast index creation (smart ALTER TABLE) @see handler0alter.cc @{ */ + int add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys); + int prepare_drop_index(TABLE *table_arg, uint *key_num, + uint num_of_keys); + int final_drop_index(TABLE *table_arg); + /** @} */ + bool check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes); +}; + +/* Some accessor functions which the InnoDB plugin needs, but which +can not be added to mysql/plugin.h as part of the public interface; +the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */ + +#ifndef INNODB_COMPATIBILITY_HOOKS +#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS +#endif + +extern "C" { +struct charset_info_st *thd_charset(MYSQL_THD thd); +char **thd_query(MYSQL_THD thd); + +/** Get the file name of the MySQL binlog. + * @return the name of the binlog file + */ +const char* mysql_bin_log_file_name(void); + +/** Get the current position of the MySQL binlog. + * @return byte offset from the beginning of the binlog + */ +ulonglong mysql_bin_log_file_pos(void); + +/** + Check if a user thread is a replication slave thread + @param thd user thread + @retval 0 the user thread is not a replication slave thread + @retval 1 the user thread is a replication slave thread +*/ +int thd_slave_thread(const MYSQL_THD thd); + +/** + Check if a user thread is running a non-transactional update + @param thd user thread + @retval 0 the user thread is not running a non-transactional update + @retval 1 the user thread is running a non-transactional update +*/ +int thd_non_transactional_update(const MYSQL_THD thd); + +/** + Get the user thread's binary logging format + @param thd user thread + @return Value to be used as index into the binlog_format_names array +*/ +int thd_binlog_format(const MYSQL_THD thd); + +/** + Mark transaction to rollback and mark error as fatal to a sub-statement. + @param thd Thread handle + @param all TRUE <=> rollback main transaction. +*/ +void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all); +} + +typedef struct trx_struct trx_t; +/************************************************************************ +Converts an InnoDB error code to a MySQL error code and also tells to MySQL +about a possible transaction rollback inside InnoDB caused by a lock wait +timeout or a deadlock. */ +extern "C" +int +convert_error_code_to_mysql( +/*========================*/ + /* out: MySQL error code */ + int error, /* in: InnoDB error code */ + ulint flags, /* in: InnoDB table flags, or 0 */ + MYSQL_THD thd); /* in: user thread handle or NULL */ + +/************************************************************************* +Allocates an InnoDB transaction for a MySQL handler object. */ +extern "C" +trx_t* +innobase_trx_allocate( +/*==================*/ + /* out: InnoDB transaction handle */ + MYSQL_THD thd); /* in: user thread handle */ diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc new file mode 100644 index 00000000000..1b5466e66eb --- /dev/null +++ b/storage/xtradb/handler/handler0alter.cc @@ -0,0 +1,1214 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Smart ALTER TABLE +*******************************************************/ + +#include +#include + +extern "C" { +#include "log0log.h" +#include "row0merge.h" +#include "srv0srv.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "ha_prototypes.h" +#include "handler0alter.h" +} + +#include "ha_innodb.h" +#include "handler0vars.h" + +/***************************************************************** +Copies an InnoDB column to a MySQL field. This function is +adapted from row_sel_field_store_in_mysql_format(). */ +static +void +innobase_col_to_mysql( +/*==================*/ + const dict_col_t* col, /* in: InnoDB column */ + const uchar* data, /* in: InnoDB column data */ + ulint len, /* in: length of data, in bytes */ + Field* field) /* in/out: MySQL field */ +{ + uchar* ptr; + uchar* dest = field->ptr; + ulint flen = field->pack_length(); + + switch (col->mtype) { + case DATA_INT: + ut_ad(len == flen); + + /* Convert integer data from Innobase to little-endian + format, sign bit restored to normal */ + + for (ptr = dest + len; ptr != dest; ) { + *--ptr = *data++; + } + + if (!(field->flags & UNSIGNED_FLAG)) { + ((byte*) dest)[len - 1] ^= 0x80; + } + + break; + + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field->reset(); + + if (field->type() == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, flen - field->key_length()); + } + + /* Copy the actual data */ + memcpy(dest, data, len); + break; + + case DATA_BLOB: + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, flen, data, len); + break; + +#ifdef UNIV_DEBUG + case DATA_MYSQL: + ut_ad(flen >= len); + ut_ad(col->mbmaxlen >= col->mbminlen); + ut_ad(col->mbmaxlen > col->mbminlen || flen == len); + memcpy(dest, data, len); + break; + + default: + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Above are the valid column types for MySQL data. */ + ut_ad(flen == len); +#else /* UNIV_DEBUG */ + default: +#endif /* UNIV_DEBUG */ + memcpy(dest, data, len); + } +} + +/***************************************************************** +Copies an InnoDB record to table->record[0]. 
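A minimal caller-side sketch (illustrative only: rec, index and table stand for whatever the caller already holds, and the locals follow the usual rec_get_offsets() idiom):

	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;

	rec_offs_init(offsets_);
	offsets = rec_get_offsets(rec, index, offsets,
				  ULINT_UNDEFINED, &heap);
	innobase_rec_to_mysql(table, rec, index, offsets);

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}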
*/ +extern "C" UNIV_INTERN +void +innobase_rec_to_mysql( +/*==================*/ + TABLE* table, /* in/out: MySQL table */ + const rec_t* rec, /* in: record */ + const dict_index_t* index, /* in: index */ + const ulint* offsets) /* in: rec_get_offsets( + rec, index, ...) */ +{ + uint n_fields = table->s->fields; + uint i; + + ut_ad(n_fields == dict_table_get_n_user_cols(index->table)); + + for (i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + ulint ipos; + ulint ilen; + const uchar* ifield; + + field->reset(); + + ipos = dict_index_get_nth_col_pos(index, i); + + if (UNIV_UNLIKELY(ipos == ULINT_UNDEFINED)) { +null_field: + field->set_null(); + continue; + } + + ifield = rec_get_nth_field(rec, offsets, ipos, &ilen); + + /* Assign the NULL flag */ + if (ilen == UNIV_SQL_NULL) { + ut_ad(field->real_maybe_null()); + goto null_field; + } + + field->set_notnull(); + + innobase_col_to_mysql( + dict_field_get_col( + dict_index_get_nth_field(index, ipos)), + ifield, ilen, field); + } +} + +/***************************************************************** +Resets table->record[0]. */ +extern "C" UNIV_INTERN +void +innobase_rec_reset( +/*===============*/ + TABLE* table) /* in/out: MySQL table */ +{ + uint n_fields = table->s->fields; + uint i; + + for (i = 0; i < n_fields; i++) { + table->field[i]->set_default(); + } +} + +/********************************************************************** +Removes the filename encoding of a database and table name. */ +static +void +innobase_convert_tablename( +/*=======================*/ + char* s) /* in: identifier; out: decoded identifier */ +{ + uint errors; + + char* slash = strchr(s, '/'); + + if (slash) { + char* t; + /* Temporarily replace the '/' with NUL. */ + *slash = 0; + /* Convert the database name. */ + strconvert(&my_charset_filename, s, system_charset_info, + s, slash - s + 1, &errors); + + t = s + strlen(s); + ut_ad(slash >= t); + /* Append a '.' after the database name. */ + *t++ = '.'; + slash++; + /* Convert the table name. */ + strconvert(&my_charset_filename, slash, system_charset_info, + t, slash - t + strlen(slash), &errors); + } else { + strconvert(&my_charset_filename, s, + system_charset_info, s, strlen(s), &errors); + } +} + +/*********************************************************************** +This function checks that index keys are sensible. */ +static +int +innobase_check_index_keys( +/*======================*/ + /* out: 0 or error number */ + const KEY* key_info, /* in: Indexes to be created */ + ulint num_of_keys) /* in: Number of indexes to + be created */ +{ + ulint key_num; + + ut_ad(key_info); + ut_ad(num_of_keys); + + for (key_num = 0; key_num < num_of_keys; key_num++) { + const KEY& key = key_info[key_num]; + + /* Check that the same index name does not appear + twice in indexes to be created. */ + + for (ulint i = 0; i < key_num; i++) { + const KEY& key2 = key_info[i]; + + if (0 == strcmp(key.name, key2.name)) { + sql_print_error("InnoDB: key name `%s` appears" + " twice in CREATE INDEX\n", + key.name); + + return(ER_WRONG_NAME_FOR_INDEX); + } + } + + /* Check that MySQL does not try to create a column + prefix index field on an inappropriate data type and + that the same column does not appear twice in the index.
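For instance (hypothetical statements against a table t(a INT NOT NULL, b TEXT)), both CREATE INDEX i1 ON t (a, a), where column a occurs twice, and a column prefix key such as INDEX (a(2)) on the integer column a are refused here with ER_WRONG_KEY_COLUMN; prefixes are only sensible on string and BLOB types.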
*/ + + for (ulint i = 0; i < key.key_parts; i++) { + const KEY_PART_INFO& key_part1 + = key.key_part[i]; + const Field* field + = key_part1.field; + ibool is_unsigned; + + switch (get_innobase_type_from_mysql_type( + &is_unsigned, field)) { + default: + break; + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + if (field->type() == MYSQL_TYPE_VARCHAR) { + if (key_part1.length + >= field->pack_length() + - ((Field_varstring*) field) + ->length_bytes) { + break; + } + } else { + if (key_part1.length + >= field->pack_length()) { + break; + } + } + + sql_print_error("InnoDB: MySQL is trying to" + " create a column prefix" + " index field on an" + " inappropriate data type." + " column `%s`," + " index `%s`.\n", + field->field_name, + key.name); + return(ER_WRONG_KEY_COLUMN); + } + + for (ulint j = 0; j < i; j++) { + const KEY_PART_INFO& key_part2 + = key.key_part[j]; + + if (strcmp(key_part1.field->field_name, + key_part2.field->field_name)) { + continue; + } + + sql_print_error("InnoDB: column `%s`" + " is not allowed to occur" + " twice in index `%s`.\n", + key_part1.field->field_name, + key.name); + return(ER_WRONG_KEY_COLUMN); + } + } + } + + return(0); +} + +/*********************************************************************** +Create index field definition for key part */ +static +void +innobase_create_index_field_def( +/*============================*/ + KEY_PART_INFO* key_part, /* in: MySQL key definition */ + mem_heap_t* heap, /* in: memory heap */ + merge_index_field_t* index_field) /* out: index field + definition for key_part */ +{ + Field* field; + ibool is_unsigned; + ulint col_type; + + DBUG_ENTER("innobase_create_index_field_def"); + + ut_ad(key_part); + ut_ad(index_field); + + field = key_part->field; + ut_a(field); + + col_type = get_innobase_type_from_mysql_type(&is_unsigned, field); + + if (DATA_BLOB == col_type + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*)field)->length_bytes)) { + + index_field->prefix_len = key_part->length; + } else { + index_field->prefix_len = 0; + } + + index_field->field_name = mem_heap_strdup(heap, field->field_name); + + DBUG_VOID_RETURN; +} + +/*********************************************************************** +Create index definition for key */ +static +void +innobase_create_index_def( +/*======================*/ + KEY* key, /* in: key definition */ + bool new_primary, /* in: TRUE=generating + a new primary key + on the table */ + bool key_primary, /* in: TRUE if this key + is a primary key */ + merge_index_def_t* index, /* out: index definition */ + mem_heap_t* heap) /* in: heap where memory + is allocated */ +{ + ulint i; + ulint len; + ulint n_fields = key->key_parts; + char* index_name; + + DBUG_ENTER("innobase_create_index_def"); + + index->fields = (merge_index_field_t*) mem_heap_alloc( + heap, n_fields * sizeof *index->fields); + + index->ind_type = 0; + index->n_fields = n_fields; + len = strlen(key->name) + 1; + index->name = index_name = (char*) mem_heap_alloc(heap, + len + !new_primary); + + if (UNIV_LIKELY(!new_primary)) { + *index_name++ = TEMP_INDEX_PREFIX; + } + + memcpy(index_name, key->name, len); + + if (key->flags & HA_NOSAME) { + index->ind_type |= DICT_UNIQUE; + } + + if (key_primary) { + index->ind_type |= DICT_CLUSTERED; + } + + for (i = 0; i < n_fields; i++) { + innobase_create_index_field_def(&key->key_part[i], heap, + 
&index->fields[i]); + } + + DBUG_VOID_RETURN; +} + +/*********************************************************************** +Copy index field definition */ +static +void +innobase_copy_index_field_def( +/*==========================*/ + const dict_field_t* field, /* in: definition to copy */ + merge_index_field_t* index_field) /* out: copied definition */ +{ + DBUG_ENTER("innobase_copy_index_field_def"); + DBUG_ASSERT(field != NULL); + DBUG_ASSERT(index_field != NULL); + + index_field->field_name = field->name; + index_field->prefix_len = field->prefix_len; + + DBUG_VOID_RETURN; +} + +/*********************************************************************** +Copy index definition for the index */ +static +void +innobase_copy_index_def( +/*====================*/ + const dict_index_t* index, /* in: index definition to copy */ + merge_index_def_t* new_index,/* out: Index definition */ + mem_heap_t* heap) /* in: heap where allocated */ +{ + ulint n_fields; + ulint i; + + DBUG_ENTER("innobase_copy_index_def"); + + /* Note that we take only those fields that user defined to be + in the index. In the internal representation more columns were + added and those columns are not copied. */ + + n_fields = index->n_user_defined_cols; + + new_index->fields = (merge_index_field_t*) mem_heap_alloc( + heap, n_fields * sizeof *new_index->fields); + + /* When adding a PRIMARY KEY, we may convert a previous + clustered index to a secondary index (UNIQUE NOT NULL). */ + new_index->ind_type = index->type & ~DICT_CLUSTERED; + new_index->n_fields = n_fields; + new_index->name = index->name; + + for (i = 0; i < n_fields; i++) { + innobase_copy_index_field_def(&index->fields[i], + &new_index->fields[i]); + } + + DBUG_VOID_RETURN; +} + +/*********************************************************************** +Create an index table where indexes are ordered as follows: + +IF a new primary key is defined for the table THEN + + 1) New primary key + 2) Original secondary indexes + 3) New secondary indexes + +ELSE + + 1) All new indexes in the order they arrive from MySQL + +ENDIF + +*/ +static +merge_index_def_t* +innobase_create_key_def( +/*====================*/ + /* out: key definitions or NULL */ + trx_t* trx, /* in: trx */ + const dict_table_t*table, /* in: table definition */ + mem_heap_t* heap, /* in: heap where space for key + definitions are allocated */ + KEY* key_info, /* in: Indexes to be created */ + ulint& n_keys) /* in/out: Number of indexes to + be created */ +{ + ulint i = 0; + merge_index_def_t* indexdef; + merge_index_def_t* indexdefs; + bool new_primary; + + DBUG_ENTER("innobase_create_key_def"); + + indexdef = indexdefs = (merge_index_def_t*) + mem_heap_alloc(heap, sizeof *indexdef + * (n_keys + UT_LIST_GET_LEN(table->indexes))); + + /* If there is a primary key, it is always the first index + defined for the table. */ + + new_primary = !my_strcasecmp(system_charset_info, + key_info->name, "PRIMARY"); + + /* If there is a UNIQUE INDEX consisting entirely of NOT NULL + columns, MySQL will treat it as a PRIMARY KEY unless the + table already has one.
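For example (illustrative): on a table that still uses its generated clustered index, ALTER TABLE t ADD UNIQUE KEY (a) with column a declared NOT NULL arrives here with a key name other than "PRIMARY", yet the check below promotes it to the new primary key; a single nullable column in the key cancels the promotion.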
*/ + + if (!new_primary && (key_info->flags & HA_NOSAME) + && row_table_got_default_clust_index(table)) { + uint key_part = key_info->key_parts; + + new_primary = TRUE; + + while (key_part--) { + if (key_info->key_part[key_part].key_type + & FIELDFLAG_MAYBE_NULL) { + new_primary = FALSE; + break; + } + } + } + + if (new_primary) { + const dict_index_t* index; + + /* Create the PRIMARY key index definition */ + innobase_create_index_def(&key_info[i++], TRUE, TRUE, + indexdef++, heap); + + row_mysql_lock_data_dictionary(trx); + + index = dict_table_get_first_index(table); + + /* Copy the index definitions of the old table. Skip + the old clustered index if it is a generated clustered + index or a PRIMARY KEY. If the clustered index is a + UNIQUE INDEX, it must be converted to a secondary index. */ + + if (dict_index_get_nth_col(index, 0)->mtype == DATA_SYS + || !my_strcasecmp(system_charset_info, + index->name, "PRIMARY")) { + index = dict_table_get_next_index(index); + } + + while (index) { + innobase_copy_index_def(index, indexdef++, heap); + index = dict_table_get_next_index(index); + } + + row_mysql_unlock_data_dictionary(trx); + } + + /* Create definitions for added secondary indexes. */ + + while (i < n_keys) { + innobase_create_index_def(&key_info[i++], new_primary, FALSE, + indexdef++, heap); + } + + n_keys = indexdef - indexdefs; + + DBUG_RETURN(indexdefs); +} + +/*********************************************************************** +Create a temporary tablename using query id, thread id, and id */ +static +char* +innobase_create_temporary_tablename( +/*================================*/ + /* out: temporary tablename */ + mem_heap_t* heap, /* in: memory heap */ + char id, /* in: identifier [0-9a-zA-Z] */ + const char* table_name) /* in: table name */ +{ + char* name; + ulint len; + static const char suffix[] = "@0023 "; /* "# " */ + + len = strlen(table_name); + + name = (char*) mem_heap_alloc(heap, len + sizeof suffix); + memcpy(name, table_name, len); + memcpy(name + len, suffix, sizeof suffix); + name[len + (sizeof suffix - 2)] = id; + + return(name); +} + +/*********************************************************************** +Create indexes. */ +UNIV_INTERN +int +ha_innobase::add_index( +/*===================*/ + /* out: 0 or error number */ + TABLE* table, /* in: Table where indexes are created */ + KEY* key_info, /* in: Indexes to be created */ + uint num_of_keys) /* in: Number of indexes to be created */ +{ + dict_index_t** index; /* Index to be created */ + dict_table_t* innodb_table; /* InnoDB table in dictionary */ + dict_table_t* indexed_table; /* Table where indexes are created */ + merge_index_def_t* index_defs; /* Index definitions */ + mem_heap_t* heap; /* Heap for index definitions */ + trx_t* trx; /* Transaction */ + ulint num_of_idx; + ulint num_created = 0; + ibool dict_locked = FALSE; + ulint new_primary; + ulint error; + + DBUG_ENTER("ha_innobase::add_index"); + ut_a(table); + ut_a(key_info); + ut_a(num_of_keys); + + if (srv_created_new_raw || srv_force_recovery) { + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + update_thd(); + + heap = mem_heap_create(1024); + + /* In case MySQL calls this in the middle of a SELECT query, release + possible adaptive hash latch to avoid deadlocks of threads. */ + trx_search_latch_release_if_reserved(prebuilt->trx); + + /* Create a background transaction for the operations on + the data dictionary tables. 
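Its lifecycle in this function is, in outline (a sketch assembled from the calls below; the err_exit path additionally rolls it back with trx_general_rollback_for_mysql()):

	trx = innobase_trx_allocate(user_thd);
	trx_start_if_not_started(trx);
	...dictionary work, e.g. row_merge_create_index()...
	trx_commit_for_mysql(trx);
	trx_free_for_mysql(trx);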
*/ + trx = innobase_trx_allocate(user_thd); + trx_start_if_not_started(trx); + + innodb_table = indexed_table + = dict_table_get(prebuilt->table->name, FALSE); + + /* Check that index keys are sensible */ + + error = innobase_check_index_keys(key_info, num_of_keys); + + if (UNIV_UNLIKELY(error)) { +err_exit: + mem_heap_free(heap); + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx_free_for_mysql(trx); + trx_commit_for_mysql(prebuilt->trx); + DBUG_RETURN(error); + } + + /* Create table containing all indexes to be built in this + alter table add index so that they are in the correct order + in the table. */ + + num_of_idx = num_of_keys; + + index_defs = innobase_create_key_def( + trx, innodb_table, heap, key_info, num_of_idx); + + new_primary = DICT_CLUSTERED & index_defs[0].ind_type; + + /* Allocate memory for dictionary index definitions */ + + index = (dict_index_t**) mem_heap_alloc( + heap, num_of_idx * sizeof *index); + + /* Flag this transaction as a dictionary operation, so that + the data dictionary will be locked in crash recovery. */ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + /* Acquire a lock on the table before creating any indexes. */ + error = row_merge_lock_table(prebuilt->trx, innodb_table, + new_primary ? LOCK_X : LOCK_S); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + + goto error_handling; + } + + /* Latch the InnoDB data dictionary exclusively so that no deadlocks + or lock waits can happen in it during an index create operation. */ + + row_mysql_lock_data_dictionary(trx); + dict_locked = TRUE; + + /* If a new primary key is defined for the table we need + to drop the original table and rebuild all indexes. */ + + if (UNIV_UNLIKELY(new_primary)) { + /* This transaction should be the only one + operating on the table. */ + ut_a(innodb_table->n_mysql_handles_opened == 1); + + char* new_table_name = innobase_create_temporary_tablename( + heap, '1', innodb_table->name); + + /* Clone the table. */ + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + indexed_table = row_merge_create_temporary_table( + new_table_name, index_defs, innodb_table, trx); + + if (!indexed_table) { + + switch (trx->error_state) { + case DB_TABLESPACE_ALREADY_EXISTS: + case DB_DUPLICATE_KEY: + innobase_convert_tablename(new_table_name); + my_error(HA_ERR_TABLE_EXIST, MYF(0), + new_table_name); + error = HA_ERR_TABLE_EXIST; + break; + default: + error = convert_error_code_to_mysql( + trx->error_state, innodb_table->flags, + user_thd); + } + + row_mysql_unlock_data_dictionary(trx); + goto err_exit; + } + + trx->table_id = indexed_table->id; + } + + /* Create the indexes in SYS_INDEXES and load into dictionary. */ + + for (ulint i = 0; i < num_of_idx; i++) { + + index[i] = row_merge_create_index(trx, indexed_table, + &index_defs[i]); + + if (!index[i]) { + error = trx->error_state; + goto error_handling; + } + + num_created++; + } + + ut_ad(error == DB_SUCCESS); + + /* Commit the data dictionary transaction in order to release + the table locks on the system tables. Unfortunately, this + means that if MySQL crashes while creating a new primary key + inside row_merge_build_indexes(), indexed_table will not be + dropped on crash recovery. Thus, it will become orphaned. */ + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + dict_locked = FALSE; + + ut_a(trx->n_active_thrs == 0); + ut_a(UT_LIST_GET_LEN(trx->signals) == 0); + + if (UNIV_UNLIKELY(new_primary)) { + /* A primary key is to be built. Acquire an exclusive + table lock also on the table that is being created. 
*/ + ut_ad(indexed_table != innodb_table); + + error = row_merge_lock_table(prebuilt->trx, indexed_table, + LOCK_X); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + + goto error_handling; + } + } + + /* Read the clustered index of the table and build indexes + based on this information using temporary files and merge sort. */ + error = row_merge_build_indexes(prebuilt->trx, + innodb_table, indexed_table, + index, num_of_idx, table); + +error_handling: +#ifdef UNIV_DEBUG + /* TODO: At the moment we can't handle the following statement + in our debugging code below: + + alter table t drop index b, add index (b); + + The fix will have to parse the SQL and note that the index + being added has the same name as the one being dropped and + ignore that in the dup index check.*/ + //dict_table_check_for_dup_indexes(prebuilt->table); +#endif + + /* After an error, remove all those index definitions from the + dictionary which were defined. */ + + switch (error) { + const char* old_name; + char* tmp_name; + case DB_SUCCESS: + ut_a(!dict_locked); + row_mysql_lock_data_dictionary(trx); + dict_locked = TRUE; + + if (!new_primary) { + error = row_merge_rename_indexes(trx, indexed_table); + + if (error != DB_SUCCESS) { + row_merge_drop_indexes(trx, indexed_table, + index, num_created); + } + + goto convert_error; + } + + /* If a new primary key was defined for the table and + there was no error at this point, we can now rename + the old table as a temporary table, rename the new + temporary table as the old table and drop the old table. */ + old_name = innodb_table->name; + tmp_name = innobase_create_temporary_tablename(heap, '2', + old_name); + + error = row_merge_rename_tables(innodb_table, indexed_table, + tmp_name, trx); + + if (error != DB_SUCCESS) { + + row_merge_drop_table(trx, indexed_table); + + switch (error) { + case DB_TABLESPACE_ALREADY_EXISTS: + case DB_DUPLICATE_KEY: + innobase_convert_tablename(tmp_name); + my_error(HA_ERR_TABLE_EXIST, MYF(0), tmp_name); + error = HA_ERR_TABLE_EXIST; + break; + default: + goto convert_error; + } + break; + } + + trx_commit_for_mysql(prebuilt->trx); + row_prebuilt_free(prebuilt, TRUE); + prebuilt = row_create_prebuilt(indexed_table); + + indexed_table->n_mysql_handles_opened++; + + error = row_merge_drop_table(trx, innodb_table); + goto convert_error; + + case DB_TOO_BIG_RECORD: + my_error(HA_ERR_TO_BIG_ROW, MYF(0)); + goto error; + case DB_PRIMARY_KEY_IS_NULL: + my_error(ER_PRIMARY_CANT_HAVE_NULL, MYF(0)); + /* fall through */ + case DB_DUPLICATE_KEY: +error: + prebuilt->trx->error_info = NULL; + /* fall through */ + default: + if (new_primary) { + row_merge_drop_table(trx, indexed_table); + } else { + if (!dict_locked) { + row_mysql_lock_data_dictionary(trx); + dict_locked = TRUE; + } + + row_merge_drop_indexes(trx, indexed_table, + index, num_created); + } + +convert_error: + error = convert_error_code_to_mysql(error, + innodb_table->flags, + user_thd); + } + + mem_heap_free(heap); + trx_commit_for_mysql(trx); + if (prebuilt->trx) { + trx_commit_for_mysql(prebuilt->trx); + } + + if (dict_locked) { + row_mysql_unlock_data_dictionary(trx); + } + + trx_free_for_mysql(trx); + + /* There might be work for utility threads.*/ + srv_active_wake_master_thread(); + + DBUG_RETURN(error); +} + +/*********************************************************************** +Prepare to drop some indexes of a table.
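This is the first half of a two-phase protocol: prepare_drop_index() only validates the request and marks each candidate index with to_be_dropped = TRUE, while the actual removal happens in final_drop_index() further below; if any check fails, the marks are cleared again before returning. A sketch of the call order the server is expected to follow (a hedged reading of how mysql_alter_table() drives these handler methods):

	prepare_drop_index(table, key_num, num_of_keys);	// validate and mark
	final_drop_index(table);				// drop what was marked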
*/ +UNIV_INTERN +int +ha_innobase::prepare_drop_index( +/*============================*/ + /* out: 0 or error number */ + TABLE* table, /* in: Table where indexes are dropped */ + uint* key_num, /* in: Key nums to be dropped */ + uint num_of_keys) /* in: Number of keys to be dropped */ +{ + trx_t* trx; + int err = 0; + uint n_key; + + DBUG_ENTER("ha_innobase::prepare_drop_index"); + ut_ad(table); + ut_ad(key_num); + ut_ad(num_of_keys); + if (srv_created_new_raw || srv_force_recovery) { + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + update_thd(); + + trx_search_latch_release_if_reserved(prebuilt->trx); + trx = prebuilt->trx; + + /* Test and mark all the indexes to be dropped */ + + row_mysql_lock_data_dictionary(trx); + + /* Check that none of the indexes have previously been flagged + for deletion. */ + { + const dict_index_t* index + = dict_table_get_first_index(prebuilt->table); + do { + ut_a(!index->to_be_dropped); + index = dict_table_get_next_index(index); + } while (index); + } + + for (n_key = 0; n_key < num_of_keys; n_key++) { + const KEY* key; + dict_index_t* index; + + key = table->key_info + key_num[n_key]; + index = dict_table_get_index_on_name_and_min_id( + prebuilt->table, key->name); + + if (!index) { + sql_print_error("InnoDB could not find key n:o %u " + "with name %s for table %s", + key_num[n_key], + key ? key->name : "NULL", + prebuilt->table->name); + + err = HA_ERR_KEY_NOT_FOUND; + goto func_exit; + } + + /* Refuse to drop the clustered index. It would be + better to automatically generate a clustered index, + but mysql_alter_table() will call this method only + after ha_innobase::add_index(). */ + + if (dict_index_is_clust(index)) { + my_error(ER_REQUIRES_PRIMARY_KEY, MYF(0)); + err = -1; + goto func_exit; + } + + index->to_be_dropped = TRUE; + } + + /* If FOREIGN_KEY_CHECK = 1 you may not drop an index defined + for a foreign key constraint because InnoDB requires that both + tables contain indexes for the constraint. Note that CREATE + INDEX id ON table does a CREATE INDEX and DROP INDEX, and we + can ignore here foreign keys because a new index for the + foreign key has already been created. + + We check for the foreign key constraints after marking the + candidate indexes for deletion, because when we check for an + equivalent foreign index we don't want to select an index that + is later deleted. */ + + if (trx->check_foreigns + && thd_sql_command(user_thd) != SQLCOM_CREATE_INDEX) { + dict_index_t* index; + + for (index = dict_table_get_first_index(prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + dict_foreign_t* foreign; + + if (!index->to_be_dropped) { + + continue; + } + + /* Check if the index is referenced. */ + foreign = dict_table_get_referenced_constraint( + prebuilt->table, index); + + if (foreign) { +index_needed: + trx_set_detailed_error( + trx, + "Index needed in foreign key " + "constraint"); + + trx->error_info = index; + + err = HA_ERR_DROP_INDEX_FK; + break; + } else { + /* Check if this index references some + other table */ + foreign = dict_table_get_foreign_constraint( + prebuilt->table, index); + + if (foreign) { + ut_a(foreign->foreign_index == index); + + /* Search for an equivalent index that + the foreign key constraint could use + if this index were to be deleted. 
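For example (illustrative schema): if a FOREIGN KEY on t(a) is currently served by index i1(a) and the table also has an index i2(a, b), then i1 may be dropped, because dict_foreign_find_equiv_index() can hand the constraint over to i2; when no substitute exists the request fails with HA_ERR_DROP_INDEX_FK.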
*/ + if (!dict_foreign_find_equiv_index( + foreign)) { + + goto index_needed; + } + } + } + } + } else if (thd_sql_command(user_thd) == SQLCOM_CREATE_INDEX) { + /* This is a drop of a foreign key constraint index that + was created by MySQL when the constraint was added. MySQL + does this when the user creates an index explicitly which + can be used in place of the automatically generated index. */ + + dict_index_t* index; + + for (index = dict_table_get_first_index(prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + dict_foreign_t* foreign; + + if (!index->to_be_dropped) { + + continue; + } + + /* Check if this index references some other table */ + foreign = dict_table_get_foreign_constraint( + prebuilt->table, index); + + if (foreign == NULL) { + + continue; + } + + ut_a(foreign->foreign_index == index); + + /* Search for an equivalent index that the + foreign key constraint could use if this index + were to be deleted. */ + + if (!dict_foreign_find_equiv_index(foreign)) { + trx_set_detailed_error( + trx, + "Index needed in foreign key " + "constraint"); + + trx->error_info = foreign->foreign_index; + + err = HA_ERR_DROP_INDEX_FK; + break; + } + } + } + +func_exit: + if (err) { + /* Undo our changes since there was some sort of error. */ + dict_index_t* index + = dict_table_get_first_index(prebuilt->table); + + do { + index->to_be_dropped = FALSE; + index = dict_table_get_next_index(index); + } while (index); + } + + row_mysql_unlock_data_dictionary(trx); + + DBUG_RETURN(err); +} + +/*********************************************************************** +Drop the indexes that were passed to a successful prepare_drop_index(). */ +UNIV_INTERN +int +ha_innobase::final_drop_index( +/*==========================*/ + /* out: 0 or error number */ + TABLE* table) /* in: Table where indexes are dropped */ +{ + dict_index_t* index; /* Index to be dropped */ + trx_t* trx; /* Transaction */ + int err; + + DBUG_ENTER("ha_innobase::final_drop_index"); + ut_ad(table); + + if (srv_created_new_raw || srv_force_recovery) { + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + update_thd(); + + trx_search_latch_release_if_reserved(prebuilt->trx); + + /* Create a background transaction for the operations on + the data dictionary tables. */ + trx = innobase_trx_allocate(user_thd); + trx_start_if_not_started(trx); + + /* Flag this transaction as a dictionary operation, so that + the data dictionary will be locked in crash recovery. */ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + /* Lock the table exclusively, to ensure that no active + transaction depends on an index that is being dropped. */ + err = convert_error_code_to_mysql( + row_merge_lock_table(prebuilt->trx, prebuilt->table, LOCK_X), + prebuilt->table->flags, user_thd); + + row_mysql_lock_data_dictionary(trx); + + if (UNIV_UNLIKELY(err)) { + + /* Unmark the indexes to be dropped. */ + for (index = dict_table_get_first_index(prebuilt->table); + index; index = dict_table_get_next_index(index)) { + + index->to_be_dropped = FALSE; + } + + goto func_exit; + } + + /* Drop indexes marked to be dropped */ + + index = dict_table_get_first_index(prebuilt->table); + + while (index) { + dict_index_t* next_index; + + next_index = dict_table_get_next_index(index); + + if (index->to_be_dropped) { + + row_merge_drop_index(index, prebuilt->table, trx); + } + + index = next_index; + } + + /* Check that all flagged indexes were dropped. 
*/ + for (index = dict_table_get_first_index(prebuilt->table); + index; index = dict_table_get_next_index(index)) { + ut_a(!index->to_be_dropped); + } + +#ifdef UNIV_DEBUG + dict_table_check_for_dup_indexes(prebuilt->table); +#endif + +func_exit: + trx_commit_for_mysql(trx); + trx_commit_for_mysql(prebuilt->trx); + row_mysql_unlock_data_dictionary(trx); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + trx_free_for_mysql(trx); + + /* Tell the InnoDB server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + DBUG_RETURN(err); +} diff --git a/storage/xtradb/handler/handler0vars.h b/storage/xtradb/handler/handler0vars.h new file mode 100644 index 00000000000..ea9f305ce66 --- /dev/null +++ b/storage/xtradb/handler/handler0vars.h @@ -0,0 +1,68 @@ +/***************************************************************************** + +Copyright (c) 2008, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/*********************************************************************** +This file contains accessor functions for dynamic plugin on Windows. +***********************************************************************/ + +#if defined __WIN__ && defined MYSQL_DYNAMIC_PLUGIN +/*********************************************************************** +This is a list of externals that can not be resolved by delay loading. +They have to be resolved indirectly via their addresses in the .map file. +All of them are external variables. 
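(The reason, stated briefly: Windows delay loading resolves an import by patching the call thunk the first time a function is invoked, so it only works for functions. Data symbols have no call site to patch, so each variable below is instead reached through a wdl_* pointer that the plugin fills in from the server's .map file at load time, and a #define keeps the rest of the code source-compatible.)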
*/ +extern CHARSET_INFO* wdl_my_charset_bin; +extern CHARSET_INFO* wdl_my_charset_latin1; +extern CHARSET_INFO* wdl_my_charset_filename; +extern CHARSET_INFO** wdl_system_charset_info; +extern CHARSET_INFO** wdl_default_charset_info; +extern CHARSET_INFO** wdl_all_charsets; +extern system_variables* wdl_global_system_variables; +extern char* wdl_mysql_real_data_home; +extern char** wdl_mysql_data_home; +extern char** wdl_tx_isolation_names; +extern char** wdl_binlog_format_names; +extern char* wdl_reg_ext; +extern pthread_mutex_t* wdl_LOCK_thread_count; +extern key_map* wdl_key_map_full; +extern MY_TMPDIR* wdl_mysql_tmpdir_list; +extern bool* wdl_mysqld_embedded; +extern uint* wdl_lower_case_table_names; +extern ulong* wdl_specialflag; +extern int* wdl_my_umask; + +#define my_charset_bin (*wdl_my_charset_bin) +#define my_charset_latin1 (*wdl_my_charset_latin1) +#define my_charset_filename (*wdl_my_charset_filename) +#define system_charset_info (*wdl_system_charset_info) +#define default_charset_info (*wdl_default_charset_info) +#define all_charsets (wdl_all_charsets) +#define global_system_variables (*wdl_global_system_variables) +#define mysql_real_data_home (wdl_mysql_real_data_home) +#define mysql_data_home (*wdl_mysql_data_home) +#define tx_isolation_names (wdl_tx_isolation_names) +#define binlog_format_names (wdl_binlog_format_names) +#define reg_ext (wdl_reg_ext) +#define LOCK_thread_count (*wdl_LOCK_thread_count) +#define key_map_full (*wdl_key_map_full) +#define mysql_tmpdir_list (*wdl_mysql_tmpdir_list) +#define mysqld_embedded (*wdl_mysqld_embedded) +#define lower_case_table_names (*wdl_lower_case_table_names) +#define specialflag (*wdl_specialflag) +#define my_umask (*wdl_my_umask) + +#endif diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc new file mode 100644 index 00000000000..a5404d067ca --- /dev/null +++ b/storage/xtradb/handler/i_s.cc @@ -0,0 +1,2655 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +InnoDB INFORMATION SCHEMA tables interface to MySQL. 
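Once the engine is loaded, these show up as ordinary tables; e.g. (illustrative queries; XTRADB_ENHANCEMENTS is declared below in this file, while INNODB_BUFFER_POOL_PAGES is assumed from the i_s_innodb_buffer_pool_pages descriptor):

	SELECT * FROM INFORMATION_SCHEMA.XTRADB_ENHANCEMENTS;
	SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_PAGES;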
+ +Created July 18, 2007 Vasil Dimov +*******************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include "i_s.h" +#include "innodb_patch_info.h" +#include + +extern "C" { +#include "trx0i_s.h" +#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */ +#include "buf0buddy.h" /* for i_s_cmpmem */ +#include "buf0buf.h" /* for buf_pool and PAGE_ZIP_MIN_SIZE */ +#include "ha_prototypes.h" /* for innobase_convert_name() */ +#include "srv0start.h" /* for srv_was_started */ +#include "btr0btr.h" /* for btr_page_get_index_id */ +#include "dict0dict.h" /* for dict_index_get_if_in_cache */ +#include "trx0rseg.h" /* for trx_rseg_struct */ +#include "trx0sys.h" /* for trx_sys */ +/* from buf0buf.c */ +struct buf_chunk_struct{ + ulint mem_size; /* allocated size of the chunk */ + ulint size; /* size of frames[] and blocks[] */ + void* mem; /* pointer to the memory area which + was allocated for the frames */ + buf_block_t* blocks; /* array of buffer control blocks */ +}; +} +#include "handler0vars.h" + +static const char plugin_author[] = "Innobase Oy"; + +#define OK(expr) \ + if ((expr) != 0) { \ + DBUG_RETURN(1); \ + } + +#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \ +do { \ + if (!srv_was_started) { \ + push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, \ + ER_CANT_FIND_SYSTEM_REC, \ + "InnoDB: SELECTing from " \ + "INFORMATION_SCHEMA.%s but " \ + "the InnoDB storage engine " \ + "is not installed", plugin_name); \ + DBUG_RETURN(0); \ + } \ +} while (0) + +#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && !defined __INTEL_COMPILER +#define STRUCT_FLD(name, value) name: value +#else +#define STRUCT_FLD(name, value) value +#endif + +static const ST_FIELD_INFO END_OF_ST_FIELD_INFO = + {STRUCT_FLD(field_name, NULL), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_NULL), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}; + +/* +Use the following types mapping: + +C type ST_FIELD_INFO::field_type +--------------------------------- +long MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS) + +long unsigned MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +char* MYSQL_TYPE_STRING +(field_length=n) + +float MYSQL_TYPE_FLOAT +(field_length=0 is ignored) + +void* MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +boolean (if else) MYSQL_TYPE_LONG +(field_length=1) + +time_t MYSQL_TYPE_DATETIME +(field_length=0 ignored) +--------------------------------- +*/ + +/* XXX these are defined in mysql_priv.h inside #ifdef MYSQL_SERVER */ +bool schema_table_store_record(THD *thd, TABLE *table); +void localtime_to_TIME(MYSQL_TIME *to, struct tm *from); +bool check_global_access(THD *thd, ulong want_access); + +/*********************************************************************** +Common function to fill any of the dynamic tables: +INFORMATION_SCHEMA.innodb_trx +INFORMATION_SCHEMA.innodb_locks +INFORMATION_SCHEMA.innodb_lock_waits */ +static +int +trx_i_s_common_fill_table( +/*======================*/ + /* out: 0 on success */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond); /* in: condition (not used) */ + +/*********************************************************************** +Unbind a dynamic INFORMATION_SCHEMA table. 
*/ +static +int +i_s_common_deinit( +/*==============*/ + /* out: 0 on success */ + void* p); /* in/out: table schema object */ + +/*********************************************************************** +Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME +field. */ +static +int +field_store_time_t( +/*===============*/ + /* out: 0 on success */ + Field* field, /* in/out: target field for storage */ + time_t time) /* in: value to store */ +{ + MYSQL_TIME my_time; + struct tm tm_time; + +#if 0 + /* use this if you are sure that `variables' and `time_zone' + are always initialized */ + thd->variables.time_zone->gmt_sec_to_TIME( + &my_time, (my_time_t) time); +#else + localtime_r(&time, &tm_time); + localtime_to_TIME(&my_time, &tm_time); + my_time.time_type = MYSQL_TIMESTAMP_DATETIME; +#endif + + return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME)); +} + +/*********************************************************************** +Auxiliary function to store char* value in MYSQL_TYPE_STRING field. */ +static +int +field_store_string( +/*===============*/ + /* out: 0 on success */ + Field* field, /* in/out: target field for storage */ + const char* str) /* in: NUL-terminated utf-8 string, + or NULL */ +{ + int ret; + + if (str != NULL) { + + ret = field->store(str, strlen(str), + system_charset_info); + field->set_notnull(); + } else { + + ret = 0; /* success */ + field->set_null(); + } + + return(ret); +} + +/*********************************************************************** +Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field. +If the value is ULINT_UNDEFINED then the field is set to NULL. */ +static +int +field_store_ulint( +/*==============*/ + /* out: 0 on success */ + Field* field, /* in/out: target field for storage */ + ulint n) /* in: value to store */ +{ + int ret; + + if (n != ULINT_UNDEFINED) { + + ret = field->store(n); + field->set_notnull(); + } else { + + ret = 0; /* success */ + field->set_null(); + } + + return(ret); +} + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_patches */ +static ST_FIELD_INFO innodb_patches_fields_info[] = +{ +#define IDX_PATCH_NAME 0 + {STRUCT_FLD(field_name, "name"), + STRUCT_FLD(field_length, 255), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_PATCH_DESCR 1 + {STRUCT_FLD(field_name, "description"), + STRUCT_FLD(field_length, 255), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_PATCH_COMMENT 2 + {STRUCT_FLD(field_name, "comment"), + STRUCT_FLD(field_length, 100), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_PATCH_LINK 3 + {STRUCT_FLD(field_name, "link"), + STRUCT_FLD(field_length, 255), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static struct st_mysql_information_schema i_s_info = +{ + MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION +}; + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_patches */ +static +int +innodb_patches_fill( +/*=============*/ + /*
out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + int i; + Field** fields; + + + DBUG_ENTER("innodb_patches_fill"); + fields = table->field; + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + for (i = 0; innodb_enhancements[i].file; i++) { + + field_store_string(fields[0],innodb_enhancements[i].file); + field_store_string(fields[1],innodb_enhancements[i].name); + field_store_string(fields[2],innodb_enhancements[i].comment); + field_store_string(fields[3],innodb_enhancements[i].link); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + + } + + + DBUG_RETURN(status); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_patches. */ +static +int +innodb_patches_init( +/*=========*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("innodb_patches_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_patches_fields_info; + schema->fill_table = innodb_patches_fill; + + DBUG_RETURN(0); +} + + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_patches = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "XTRADB_ENHANCEMENTS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, "Percona"), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Enhancements applied to InnoDB plugin"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_patches_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + + +static ST_FIELD_INFO i_s_innodb_buffer_pool_pages_fields_info[] = +{ + {STRUCT_FLD(field_name, "page_type"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "space_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "page_no"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, 
SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "lru_position"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "fix_count"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "flush_type"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static ST_FIELD_INFO i_s_innodb_buffer_pool_pages_index_fields_info[] = +{ + {STRUCT_FLD(field_name, "schema_name"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "table_name"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "index_name"), + STRUCT_FLD(field_length, 64), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "space_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "page_no"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "n_recs"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "data_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "hashed"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "accessed"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "modified"), + STRUCT_FLD(field_length, 
MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "dirty"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "old"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "lru_position"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "fix_count"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "flush_type"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static ST_FIELD_INFO i_s_innodb_buffer_pool_pages_blob_fields_info[] = +{ + {STRUCT_FLD(field_name, "space_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "page_no"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "compressed"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "part_len"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "next_page_no"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "lru_position"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "fix_count"), + STRUCT_FLD(field_length, 
MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "flush_type"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_buffer_pool_pages. */ +static +int +i_s_innodb_buffer_pool_pages_fill( +/*================*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + + ulint n_chunks, n_blocks; + + buf_chunk_t* chunk; + + DBUG_ENTER("i_s_innodb_buffer_pool_pages_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + buf_pool_mutex_enter(); + mutex_enter(&(dict_sys->mutex)); + + chunk = buf_pool->chunks; + + for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) { + buf_block_t* block = chunk->blocks; + + for (n_blocks = chunk->size; n_blocks--; block++) { + const buf_frame_t* frame = block->frame; + + char page_type[64]; + + switch(fil_page_get_type(frame)) + { + case FIL_PAGE_INDEX: + strcpy(page_type, "index"); + break; + case FIL_PAGE_UNDO_LOG: + strcpy(page_type, "undo_log"); + break; + case FIL_PAGE_INODE: + strcpy(page_type, "inode"); + break; + case FIL_PAGE_IBUF_FREE_LIST: + strcpy(page_type, "ibuf_free_list"); + break; + case FIL_PAGE_TYPE_ALLOCATED: + strcpy(page_type, "allocated"); + break; + case FIL_PAGE_IBUF_BITMAP: + strcpy(page_type, "bitmap"); + break; + case FIL_PAGE_TYPE_SYS: + strcpy(page_type, "sys"); + break; + case FIL_PAGE_TYPE_TRX_SYS: + strcpy(page_type, "trx_sys"); + break; + case FIL_PAGE_TYPE_FSP_HDR: + strcpy(page_type, "fsp_hdr"); + break; + case FIL_PAGE_TYPE_XDES: + strcpy(page_type, "xdes"); + break; + case FIL_PAGE_TYPE_BLOB: + strcpy(page_type, "blob"); + break; + case FIL_PAGE_TYPE_ZBLOB: + strcpy(page_type, "zblob"); + break; + case FIL_PAGE_TYPE_ZBLOB2: + strcpy(page_type, "zblob2"); + break; + default: + sprintf(page_type, "unknown (type=%lu)", fil_page_get_type(frame)); + } + + field_store_string(table->field[0], page_type); + table->field[1]->store(block->page.space); + table->field[2]->store(block->page.offset); + table->field[3]->store(block->page.LRU_position); + table->field[4]->store(block->page.buf_fix_count); + table->field[5]->store(block->page.flush_type); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + + } + } + + mutex_exit(&(dict_sys->mutex)); + buf_pool_mutex_exit(); + + DBUG_RETURN(status); +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_buffer_pool_pages_index.
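+Only pages of type FIL_PAGE_INDEX are reported. For each such page the +index id stored on the page is looked up in the dictionary cache with +dict_index_get_if_in_cache_low(); pages whose index is not cached are +skipped. index->table_name is kept in filename encoding as "db/table", so +it is split at the '/' and both halves are decoded with +filename_to_tablename(). For example, a page of an index on test.t1 +(stored as "test/t1") is reported with schema_name 'test' and +table_name 't1'.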
*/ +static +int +i_s_innodb_buffer_pool_pages_index_fill( +/*================*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + + ulint n_chunks, n_blocks; + dict_index_t* index; + dulint index_id; + + char *p; + char db_name_raw[NAME_LEN*5+1], db_name[NAME_LEN+1]; + char table_name_raw[NAME_LEN*5+1], table_name[NAME_LEN+1]; + + buf_chunk_t* chunk; + + DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + buf_pool_mutex_enter(); + mutex_enter(&(dict_sys->mutex)); + + chunk = buf_pool->chunks; + + for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) { + buf_block_t* block = chunk->blocks; + + for (n_blocks = chunk->size; n_blocks--; block++) { + const buf_frame_t* frame = block->frame; + + if (fil_page_get_type(frame) == FIL_PAGE_INDEX) { + index_id = btr_page_get_index_id(frame); + index = dict_index_get_if_in_cache_low(index_id); + if(index) + { + if((p = strchr(index->table_name, '/'))) + { + strncpy(db_name_raw, index->table_name, p-index->table_name); + db_name_raw[p-index->table_name] = 0; + filename_to_tablename(db_name_raw, db_name, sizeof(db_name)); + field_store_string(table->field[0], db_name); + p++; + } else { + field_store_string(table->field[0], NULL); + p = (char *)index->table_name; + } + strcpy(table_name_raw, p); + filename_to_tablename(table_name_raw, table_name, sizeof(table_name)); + field_store_string(table->field[1], table_name); + field_store_string(table->field[2], index->name); + + table->field[3]->store(block->page.space); + table->field[4]->store(block->page.offset); + table->field[5]->store(page_get_n_recs(frame)); + table->field[6]->store(page_get_data_size(frame)); + table->field[7]->store(block->is_hashed); + table->field[8]->store(block->page.accessed); + table->field[9]->store(block->page.newest_modification != 0); + table->field[10]->store(block->page.oldest_modification != 0); + table->field[11]->store(block->page.old); + table->field[12]->store(block->page.LRU_position); + table->field[13]->store(block->page.buf_fix_count); + table->field[14]->store(block->page.flush_type); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + } + } + } + } + + mutex_exit(&(dict_sys->mutex)); + buf_pool_mutex_exit(); + + DBUG_RETURN(status); +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_buffer_pool_pages_blob.
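+Uncompressed BLOB pages carry a small header at offset FIL_PAGE_DATA: four +bytes of part length (BTR_BLOB_HDR_PART_LEN) followed by four bytes holding +the page number of the next page of the chain (BTR_BLOB_HDR_NEXT_PAGE_NO); +a next page number of FIL_NULL marks the end of the chain and is reported +as 0 below. For compressed BLOB pages the part length cannot be read this +way, so 0 is reported for it.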
*/ +static +int +i_s_innodb_buffer_pool_pages_blob_fill( +/*================*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + + ulint n_chunks, n_blocks; + buf_chunk_t* chunk; + page_zip_des_t* block_page_zip; + + ulint part_len; + ulint next_page_no; + + DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + buf_pool_mutex_enter(); + mutex_enter(&(dict_sys->mutex)); + + chunk = buf_pool->chunks; + + for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) { + buf_block_t* block = chunk->blocks; + + for (n_blocks = chunk->size; n_blocks--; block++) { + const buf_frame_t* frame = block->frame; + + /* the zip descriptor belongs to the individual + block, so fetch it inside the per-block loop */ + block_page_zip = buf_block_get_page_zip(block); + + if (fil_page_get_type(frame) == FIL_PAGE_TYPE_BLOB) { + + if (UNIV_LIKELY_NULL(block_page_zip)) { + part_len = 0; /* not available for + compressed BLOB pages */ + + next_page_no = mach_read_from_4( + buf_block_get_frame(block) + + FIL_PAGE_NEXT); + } else { + part_len = mach_read_from_4( + buf_block_get_frame(block) + + FIL_PAGE_DATA + + 0 /*BTR_BLOB_HDR_PART_LEN*/); + + next_page_no = mach_read_from_4( + buf_block_get_frame(block) + + FIL_PAGE_DATA + + 4 /*BTR_BLOB_HDR_NEXT_PAGE_NO*/); + } + + table->field[0]->store(block->page.space); + table->field[1]->store(block->page.offset); + table->field[2]->store(block_page_zip != NULL); + table->field[3]->store(part_len); + + if(next_page_no == FIL_NULL) + { + table->field[4]->store(0); + } else { + table->field[4]->store(next_page_no); + } + + table->field[5]->store(block->page.LRU_position); + table->field[6]->store(block->page.buf_fix_count); + table->field[7]->store(block->page.flush_type); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + + } + } + } + + mutex_exit(&(dict_sys->mutex)); + buf_pool_mutex_exit(); + + DBUG_RETURN(status); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_buffer_pool_pages. */ +static +int +i_s_innodb_buffer_pool_pages_init( +/*=========*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_innodb_buffer_pool_pages_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_buffer_pool_pages_fields_info; + schema->fill_table = i_s_innodb_buffer_pool_pages_fill; + + DBUG_RETURN(0); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_buffer_pool_pages_index. */ +static +int +i_s_innodb_buffer_pool_pages_index_init( +/*=========*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_buffer_pool_pages_index_fields_info; + schema->fill_table = i_s_innodb_buffer_pool_pages_index_fill; + + DBUG_RETURN(0); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_buffer_pool_pages_blob.
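+All of these init functions follow the same pattern: the server passes in +a ST_SCHEMA_TABLE and the function only wires up fields_info and +fill_table. No state is allocated, which is why they can all share the +no-op i_s_common_deinit() as their deinit hook.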
*/ +static +int +i_s_innodb_buffer_pool_pages_blob_init( +/*=========*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_buffer_pool_pages_blob_fields_info; + schema->fill_table = i_s_innodb_buffer_pool_pages_blob_fill; + + DBUG_RETURN(0); +} + + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB buffer pool pages"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, 0x0100 /* 1.0 */), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_index = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_INDEX"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB buffer pool index pages"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_index_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, 0x0100 /* 1.0 */), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_blob = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_BLOB"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for 
SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB buffer pool blob pages"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_blob_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, 0x0100 /* 1.0 */), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */ +static ST_FIELD_INFO innodb_trx_fields_info[] = +{ +#define IDX_TRX_ID 0 + {STRUCT_FLD(field_name, "trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_STATE 1 + {STRUCT_FLD(field_name, "trx_state"), + STRUCT_FLD(field_length, TRX_QUE_STATE_STR_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_STARTED 2 + {STRUCT_FLD(field_name, "trx_started"), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_REQUESTED_LOCK_ID 3 + {STRUCT_FLD(field_name, "trx_requested_lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_WAIT_STARTED 4 + {STRUCT_FLD(field_name, "trx_wait_started"), + STRUCT_FLD(field_length, 0), + STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_WEIGHT 5 + {STRUCT_FLD(field_name, "trx_weight"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_MYSQL_THREAD_ID 6 + {STRUCT_FLD(field_name, "trx_mysql_thread_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_TRX_QUERY 7 + {STRUCT_FLD(field_name, "trx_query"), + STRUCT_FLD(field_length, TRX_I_S_TRX_QUERY_MAX_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*********************************************************************** +Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx +table with it. 
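+A transaction that is not waiting for a lock has trx_requested_lock_id and +trx_wait_started set to NULL. As a usage sketch, the waiting transactions +can be listed with e.g.: + + SELECT trx_id, trx_state, trx_wait_started, trx_query + FROM INFORMATION_SCHEMA.INNODB_TRX + WHERE trx_requested_lock_id IS NOT NULL;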
*/ +static +int +fill_innodb_trx_from_cache( +/*=======================*/ + /* out: 0 on success */ + trx_i_s_cache_t* cache, /* in: cache to read from */ + THD* thd, /* in: used to call + schema_table_store_record() */ + TABLE* table) /* in/out: fill this table */ +{ + Field** fields; + ulint rows_num; + char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_trx_from_cache"); + + fields = table->field; + + rows_num = trx_i_s_cache_get_rows_used(cache, + I_S_INNODB_TRX); + + for (i = 0; i < rows_num; i++) { + + i_s_trx_row_t* row; + char trx_id[TRX_ID_MAX_LEN + 1]; + + row = (i_s_trx_row_t*) + trx_i_s_cache_get_nth_row( + cache, I_S_INNODB_TRX, i); + + /* trx_id */ + ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, row->trx_id); + OK(field_store_string(fields[IDX_TRX_ID], trx_id)); + + /* trx_state */ + OK(field_store_string(fields[IDX_TRX_STATE], + row->trx_state)); + + /* trx_started */ + OK(field_store_time_t(fields[IDX_TRX_STARTED], + (time_t) row->trx_started)); + + /* trx_requested_lock_id */ + /* trx_wait_started */ + if (row->trx_wait_started != 0) { + + OK(field_store_string( + fields[IDX_TRX_REQUESTED_LOCK_ID], + trx_i_s_create_lock_id( + row->requested_lock_row, + lock_id, sizeof(lock_id)))); + /* field_store_string() sets it to notnull */ + + OK(field_store_time_t( + fields[IDX_TRX_WAIT_STARTED], + (time_t) row->trx_wait_started)); + fields[IDX_TRX_WAIT_STARTED]->set_notnull(); + } else { + + fields[IDX_TRX_REQUESTED_LOCK_ID]->set_null(); + fields[IDX_TRX_WAIT_STARTED]->set_null(); + } + + /* trx_weight */ + OK(fields[IDX_TRX_WEIGHT]->store((longlong) row->trx_weight, + true)); + + /* trx_mysql_thread_id */ + OK(fields[IDX_TRX_MYSQL_THREAD_ID]->store( + row->trx_mysql_thread_id)); + + /* trx_query */ + OK(field_store_string(fields[IDX_TRX_QUERY], + row->trx_query)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*********************************************************************** +Bind the dynamic table INFORMATION_SCHEMA.innodb_trx */ +static +int +innodb_trx_init( +/*============*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_trx_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_trx_fields_info; + schema->fill_table = trx_i_s_common_fill_table; + + DBUG_RETURN(0); +} + + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_trx = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_TRX"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB transactions"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_trx_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /*
reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */ +static ST_FIELD_INFO innodb_locks_fields_info[] = +{ +#define IDX_LOCK_ID 0 + {STRUCT_FLD(field_name, "lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_TRX_ID 1 + {STRUCT_FLD(field_name, "lock_trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_MODE 2 + {STRUCT_FLD(field_name, "lock_mode"), + /* S[,GAP] X[,GAP] IS[,GAP] IX[,GAP] AUTO_INC UNKNOWN */ + STRUCT_FLD(field_length, 32), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_TYPE 3 + {STRUCT_FLD(field_name, "lock_type"), + STRUCT_FLD(field_length, 32 /* RECORD|TABLE|UNKNOWN */), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_TABLE 4 + {STRUCT_FLD(field_name, "lock_table"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_INDEX 5 + {STRUCT_FLD(field_name, "lock_index"), + STRUCT_FLD(field_length, 1024), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_SPACE 6 + {STRUCT_FLD(field_name, "lock_space"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_PAGE 7 + {STRUCT_FLD(field_name, "lock_page"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_REC 8 + {STRUCT_FLD(field_name, "lock_rec"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_LOCK_DATA 9 + {STRUCT_FLD(field_name, "lock_data"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_DATA_MAX_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*********************************************************************** +Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks +table with it. 
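+lock_space, lock_page and lock_rec are only meaningful for record locks; +for table locks the cache stores ULINT_UNDEFINED in them and +field_store_ulint() turns that into an SQL NULL. lock_table and lock_index +are passed through innobase_convert_name() so that they are displayed in a +quoted, human-readable form.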
*/ +static +int +fill_innodb_locks_from_cache( +/*=========================*/ + /* out: 0 on success */ + trx_i_s_cache_t* cache, /* in: cache to read from */ + THD* thd, /* in: MySQL client connection */ + TABLE* table) /* in/out: fill this table */ +{ + Field** fields; + ulint rows_num; + char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_locks_from_cache"); + + fields = table->field; + + rows_num = trx_i_s_cache_get_rows_used(cache, + I_S_INNODB_LOCKS); + + for (i = 0; i < rows_num; i++) { + + i_s_locks_row_t* row; + + /* note that the decoded database or table name is + never expected to be longer than NAME_LEN; + NAME_LEN for database name + 2 for surrounding quotes around database name + NAME_LEN for table name + 2 for surrounding quotes around table name + 1 for the separating dot (.) + 9 for the #mysql50# prefix */ + char buf[2 * NAME_LEN + 14]; + const char* bufend; + + char lock_trx_id[TRX_ID_MAX_LEN + 1]; + + row = (i_s_locks_row_t*) + trx_i_s_cache_get_nth_row( + cache, I_S_INNODB_LOCKS, i); + + /* lock_id */ + trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id)); + OK(field_store_string(fields[IDX_LOCK_ID], + lock_id)); + + /* lock_trx_id */ + ut_snprintf(lock_trx_id, sizeof(lock_trx_id), + TRX_ID_FMT, row->lock_trx_id); + OK(field_store_string(fields[IDX_LOCK_TRX_ID], lock_trx_id)); + + /* lock_mode */ + OK(field_store_string(fields[IDX_LOCK_MODE], + row->lock_mode)); + + /* lock_type */ + OK(field_store_string(fields[IDX_LOCK_TYPE], + row->lock_type)); + + /* lock_table */ + bufend = innobase_convert_name(buf, sizeof(buf), + row->lock_table, + strlen(row->lock_table), + thd, TRUE); + OK(fields[IDX_LOCK_TABLE]->store(buf, bufend - buf, + system_charset_info)); + + /* lock_index */ + if (row->lock_index != NULL) { + + bufend = innobase_convert_name(buf, sizeof(buf), + row->lock_index, + strlen(row->lock_index), + thd, FALSE); + OK(fields[IDX_LOCK_INDEX]->store(buf, bufend - buf, + system_charset_info)); + fields[IDX_LOCK_INDEX]->set_notnull(); + } else { + + fields[IDX_LOCK_INDEX]->set_null(); + } + + /* lock_space */ + OK(field_store_ulint(fields[IDX_LOCK_SPACE], + row->lock_space)); + + /* lock_page */ + OK(field_store_ulint(fields[IDX_LOCK_PAGE], + row->lock_page)); + + /* lock_rec */ + OK(field_store_ulint(fields[IDX_LOCK_REC], + row->lock_rec)); + + /* lock_data */ + OK(field_store_string(fields[IDX_LOCK_DATA], + row->lock_data)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*********************************************************************** +Bind the dynamic table INFORMATION_SCHEMA.innodb_locks */ +static +int +innodb_locks_init( +/*==============*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_locks_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_locks_fields_info; + schema->fill_table = trx_i_s_common_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_locks = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_LOCKS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB conflicting 
locks"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_locks_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */ +static ST_FIELD_INFO innodb_lock_waits_fields_info[] = +{ +#define IDX_REQUESTING_TRX_ID 0 + {STRUCT_FLD(field_name, "requesting_trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_REQUESTED_LOCK_ID 1 + {STRUCT_FLD(field_name, "requested_lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BLOCKING_TRX_ID 2 + {STRUCT_FLD(field_name, "blocking_trx_id"), + STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + +#define IDX_BLOCKING_LOCK_ID 3 + {STRUCT_FLD(field_name, "blocking_lock_id"), + STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*********************************************************************** +Read data from cache buffer and fill the +INFORMATION_SCHEMA.innodb_lock_waits table with it. 
*/ +static +int +fill_innodb_lock_waits_from_cache( +/*==============================*/ + /* out: 0 on success */ + trx_i_s_cache_t* cache, /* in: cache to read from */ + THD* thd, /* in: used to call + schema_table_store_record() */ + TABLE* table) /* in/out: fill this table */ +{ + Field** fields; + ulint rows_num; + char requested_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + char blocking_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_lock_waits_from_cache"); + + fields = table->field; + + rows_num = trx_i_s_cache_get_rows_used(cache, + I_S_INNODB_LOCK_WAITS); + + for (i = 0; i < rows_num; i++) { + + i_s_lock_waits_row_t* row; + + char requesting_trx_id[TRX_ID_MAX_LEN + 1]; + char blocking_trx_id[TRX_ID_MAX_LEN + 1]; + + row = (i_s_lock_waits_row_t*) + trx_i_s_cache_get_nth_row( + cache, I_S_INNODB_LOCK_WAITS, i); + + /* requesting_trx_id */ + ut_snprintf(requesting_trx_id, sizeof(requesting_trx_id), + TRX_ID_FMT, row->requested_lock_row->lock_trx_id); + OK(field_store_string(fields[IDX_REQUESTING_TRX_ID], + requesting_trx_id)); + + /* requested_lock_id */ + OK(field_store_string( + fields[IDX_REQUESTED_LOCK_ID], + trx_i_s_create_lock_id( + row->requested_lock_row, + requested_lock_id, + sizeof(requested_lock_id)))); + + /* blocking_trx_id */ + ut_snprintf(blocking_trx_id, sizeof(blocking_trx_id), + TRX_ID_FMT, row->blocking_lock_row->lock_trx_id); + OK(field_store_string(fields[IDX_BLOCKING_TRX_ID], + blocking_trx_id)); + + /* blocking_lock_id */ + OK(field_store_string( + fields[IDX_BLOCKING_LOCK_ID], + trx_i_s_create_lock_id( + row->blocking_lock_row, + blocking_lock_id, + sizeof(blocking_lock_id)))); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*********************************************************************** +Bind the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */ +static +int +innodb_lock_waits_init( +/*===================*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_lock_waits_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = innodb_lock_waits_fields_info; + schema->fill_table = trx_i_s_common_fill_table; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_lock_waits = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_LOCK_WAITS"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, "Innobase Oy"), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB which lock is blocking which"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, innodb_lock_waits_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + 
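+/*********************************************************************** +Note on the cache protocol used below: the three tables above share one +global trx_i_s_cache. A fill first takes the cache in write mode and lets +trx_i_s_possibly_fetch_data_into_cache() refresh it if its contents are too +old, then re-reads it under a read lock, so concurrent SELECTs from the +three tables see one consistent snapshot. */ +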
+/*********************************************************************** +Common function to fill any of the dynamic tables: +INFORMATION_SCHEMA.innodb_trx +INFORMATION_SCHEMA.innodb_locks +INFORMATION_SCHEMA.innodb_lock_waits */ +static +int +trx_i_s_common_fill_table( +/*======================*/ + /* out: 0 on success */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (not used) */ +{ + const char* table_name; + int ret; + trx_i_s_cache_t* cache; + + DBUG_ENTER("trx_i_s_common_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + /* minimize the number of places where global variables are + referenced */ + cache = trx_i_s_cache; + + /* which table do we have to fill? */ + table_name = tables->schema_table_name; + /* or table_name = tables->schema_table->table_name; */ + + RETURN_IF_INNODB_NOT_STARTED(table_name); + + /* update the cache */ + trx_i_s_cache_start_write(cache); + trx_i_s_possibly_fetch_data_into_cache(cache); + trx_i_s_cache_end_write(cache); + + if (trx_i_s_cache_is_truncated(cache)) { + + /* XXX show warning to user if possible */ + fprintf(stderr, "Warning: data in %s truncated due to " + "memory limit of %d bytes\n", table_name, + TRX_I_S_MEM_LIMIT); + } + + ret = 0; + + trx_i_s_cache_start_read(cache); + + if (innobase_strcasecmp(table_name, "innodb_trx") == 0) { + + if (fill_innodb_trx_from_cache( + cache, thd, tables->table) != 0) { + + ret = 1; + } + + } else if (innobase_strcasecmp(table_name, "innodb_locks") == 0) { + + if (fill_innodb_locks_from_cache( + cache, thd, tables->table) != 0) { + + ret = 1; + } + + } else if (innobase_strcasecmp(table_name, "innodb_lock_waits") == 0) { + + if (fill_innodb_lock_waits_from_cache( + cache, thd, tables->table) != 0) { + + ret = 1; + } + + } else { + + /* huh! what happened!? */ + fprintf(stderr, + "InnoDB: trx_i_s_common_fill_table() was " + "called to fill unknown table: %s.\n" + "This function only knows how to fill " + "innodb_trx, innodb_locks and " + "innodb_lock_waits tables.\n", table_name); + + ret = 1; + } + + trx_i_s_cache_end_read(cache); + +#if 0 + DBUG_RETURN(ret); +#else + /* if this function returns something other than 0 then a + deadlock occurs between the mysqld server and mysql client, + see http://bugs.mysql.com/29900 ; when that bug is resolved + we can enable the DBUG_RETURN(ret) above */ + DBUG_RETURN(0); +#endif +} + +/* Fields of the dynamic table information_schema.innodb_cmp.
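+One row is produced per compressed page size, computed in the fill function +as PAGE_ZIP_MIN_SIZE << i; with the usual 16K InnoDB pages this yields the +sizes 1024, 2048, 4096, 8192 and 16384.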
*/ +static ST_FIELD_INFO i_s_cmp_fields_info[] = +{ + {STRUCT_FLD(field_name, "page_size"), + STRUCT_FLD(field_length, 5), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Compressed Page Size"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "compress_ops"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Number of Compressions"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "compress_ops_ok"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Number of" + " Successful Compressions"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "compress_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Duration of Compressions," + " in Seconds"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "uncompress_ops"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Number of Decompressions"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "uncompress_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Duration of Decompressions," + " in Seconds"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_cmp or +innodb_cmp_reset. */ +static +int +i_s_cmp_fill_low( +/*=============*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond, /* in: condition (ignored) */ + ibool reset) /* in: TRUE=reset cumulated counts */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + + DBUG_ENTER("i_s_cmp_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + for (uint i = 0; i < PAGE_ZIP_NUM_SSIZE - 1; i++) { + page_zip_stat_t* zip_stat = &page_zip_stat[i]; + + table->field[0]->store(PAGE_ZIP_MIN_SIZE << i); + + /* The cumulated counts are not protected by any + mutex. Thus, some operation in page0zip.c could + increment a counter between the time we read it and + clear it. We could introduce mutex protection, but it + could cause a measurable performance hit in + page0zip.c.
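+The practical consequence is that the figures are approximate: an +increment that lands between the reads below and the memset() done for +the _reset variant is simply lost.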
*/ + table->field[1]->store(zip_stat->compressed); + table->field[2]->store(zip_stat->compressed_ok); + table->field[3]->store( + (ulong) (zip_stat->compressed_usec / 1000000)); + table->field[4]->store(zip_stat->decompressed); + table->field[5]->store( + (ulong) (zip_stat->decompressed_usec / 1000000)); + + if (reset) { + memset(zip_stat, 0, sizeof *zip_stat); + } + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + } + + DBUG_RETURN(status); +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_cmp. */ +static +int +i_s_cmp_fill( +/*=========*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, FALSE)); +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_cmp_reset. */ +static +int +i_s_cmp_reset_fill( +/*===============*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, TRUE)); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_cmp. */ +static +int +i_s_cmp_init( +/*=========*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_fill; + + DBUG_RETURN(0); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_cmp_reset. 
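+Selecting from the _reset variant returns the same rows as innodb_cmp but +zeroes the counters afterwards, so polling it at a fixed interval yields +per-interval deltas, e.g.: + + SELECT page_size, compress_ops, compress_ops_ok + FROM INFORMATION_SCHEMA.INNODB_CMP_RESET;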
*/ +static +int +i_s_cmp_reset_init( +/*===============*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_reset_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMP"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compression"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmp_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMP_RESET"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compression;" + " reset cumulated counts"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmp_reset_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +/* Fields of the dynamic table information_schema.innodb_cmpmem. 
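+One row is produced per buddy allocator block size, BUF_BUDDY_LOW << x for +x = 0 .. BUF_BUDDY_SIZES; for the largest size there is no free list in +buf_pool->zip_free[], so pages_free is reported as 0.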
*/ +static ST_FIELD_INFO i_s_cmpmem_fields_info[] = +{ + {STRUCT_FLD(field_name, "page_size"), + STRUCT_FLD(field_length, 5), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Buddy Block Size"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "pages_used"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Currently in Use"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "pages_free"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Currently Available"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "relocation_ops"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Number of Relocations"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "relocation_time"), + STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, "Total Duration of Relocations," + " in Seconds"), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_cmpmem or +innodb_cmpmem_reset. */ +static +int +i_s_cmpmem_fill_low( +/*================*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond, /* in: condition (ignored) */ + ibool reset) /* in: TRUE=reset cumulated counts */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + + DBUG_ENTER("i_s_cmpmem_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + buf_pool_mutex_enter(); + + for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { + buf_buddy_stat_t* buddy_stat = &buf_buddy_stat[x]; + + table->field[0]->store(BUF_BUDDY_LOW << x); + table->field[1]->store(buddy_stat->used); + table->field[2]->store(UNIV_LIKELY(x < BUF_BUDDY_SIZES) + ? UT_LIST_GET_LEN(buf_pool->zip_free[x]) + : 0); + table->field[3]->store((longlong) buddy_stat->relocated, true); + table->field[4]->store( + (ulong) (buddy_stat->relocated_usec / 1000000)); + + if (reset) { + /* This is protected by buf_pool_mutex. */ + buddy_stat->relocated = 0; + buddy_stat->relocated_usec = 0; + } + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + } + + buf_pool_mutex_exit(); + DBUG_RETURN(status); +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_cmpmem. */ +static +int +i_s_cmpmem_fill( +/*============*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE)); +} + +/*********************************************************************** +Fill the dynamic table information_schema.innodb_cmpmem_reset. 
*/ +static +int +i_s_cmpmem_reset_fill( +/*==================*/ + /* out: 0 on success, 1 on failure */ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE)); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_cmpmem. */ +static +int +i_s_cmpmem_init( +/*============*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_fill; + + DBUG_RETURN(0); +} + +/*********************************************************************** +Bind the dynamic table information_schema.innodb_cmpmem_reset. */ +static +int +i_s_cmpmem_reset_init( +/*==================*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_reset_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMPMEM"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmpmem_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_CMPMEM_RESET"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;" + " reset cumulated counts"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_cmpmem_reset_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + 
/* unsigned int */ + STRUCT_FLD(version, INNODB_VERSION_SHORT), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; + +/*********************************************************************** +Unbind a dynamic INFORMATION_SCHEMA table. */ +static +int +i_s_common_deinit( +/*==============*/ + /* out: 0 on success */ + void* p) /* in/out: table schema object */ +{ + DBUG_ENTER("i_s_common_deinit"); + + /* Do nothing */ + + DBUG_RETURN(0); +} + +/*********************************************************************** +*/ +static ST_FIELD_INFO i_s_innodb_rseg_fields_info[] = +{ + {STRUCT_FLD(field_name, "rseg_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "space_id"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "zip_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "page_no"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "max_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "curr_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static +int +i_s_innodb_rseg_fill( +/*=================*/ + THD* thd, /* in: thread */ + TABLE_LIST* tables, /* in/out: tables to fill */ + COND* cond) /* in: condition (ignored) */ +{ + TABLE* table = (TABLE *) tables->table; + int status = 0; + trx_rseg_t* rseg; + + DBUG_ENTER("i_s_innodb_rseg_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + while (rseg) { + table->field[0]->store(rseg->id); + table->field[1]->store(rseg->space); + table->field[2]->store(rseg->zip_size); + table->field[3]->store(rseg->page_no); + table->field[4]->store(rseg->max_size); + table->field[5]->store(rseg->curr_size); + + if (schema_table_store_record(thd, table)) { + status = 1; + break; + } + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } + + DBUG_RETURN(status); +} + +static +int +i_s_innodb_rseg_init( +/*=================*/ + /* out: 0 on success */ + void* p) /* 
in/out: table schema object */ +{ + DBUG_ENTER("i_s_innodb_rseg_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_rseg_fields_info; + schema->fill_table = i_s_innodb_rseg_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_rseg = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + + /* pointer to type-specific plugin descriptor */ + /* void* */ + STRUCT_FLD(info, &i_s_info), + + /* plugin name */ + /* const char* */ + STRUCT_FLD(name, "INNODB_RSEG"), + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(author, plugin_author), + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + STRUCT_FLD(descr, "InnoDB rollback segment information"), + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + STRUCT_FLD(init, i_s_innodb_rseg_init), + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + STRUCT_FLD(deinit, i_s_common_deinit), + + /* plugin version (for SHOW PLUGINS) */ + /* unsigned int */ + STRUCT_FLD(version, 0x0100 /* 1.0 */), + + /* struct st_mysql_show_var* */ + STRUCT_FLD(status_vars, NULL), + + /* struct st_mysql_sys_var** */ + STRUCT_FLD(system_vars, NULL), + + /* reserved for dependency checking */ + /* void* */ + STRUCT_FLD(__reserved1, NULL) +}; diff --git a/storage/xtradb/handler/i_s.h b/storage/xtradb/handler/i_s.h new file mode 100644 index 00000000000..de0e7610646 --- /dev/null +++ b/storage/xtradb/handler/i_s.h @@ -0,0 +1,41 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +InnoDB INFORMATION SCHEMA tables interface to MySQL. 
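The descriptors above fill struct st_mysql_plugin field by field through the STRUCT_FLD macro. A minimal compilable sketch of that pattern, assuming a C99 compiler with designated initializers; the macro body and the struct here are illustrative, not the definitions the plugin actually uses:

#include <stdio.h>

/* Illustrative stand-in for the STRUCT_FLD pattern: expanding to a
   designated initializer ties each value to a named field, so the
   initializer stays correct even if the struct fields are reordered. */
#define STRUCT_FLD(name, value) .name = value

struct demo_plugin {
	int		type;
	const char*	name;
	const char*	descr;
};

static struct demo_plugin demo = {
	STRUCT_FLD(type, 1),
	STRUCT_FLD(name, "DEMO"),
	STRUCT_FLD(descr, "field-by-field initialization")
};

int main(void)
{
	printf("%d %s: %s\n", demo.type, demo.name, demo.descr);
	return 0;
}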
+ +Created July 18, 2007 Vasil Dimov +*******************************************************/ + +#ifndef i_s_h +#define i_s_h + +extern struct st_mysql_plugin i_s_innodb_buffer_pool_pages; +extern struct st_mysql_plugin i_s_innodb_buffer_pool_pages_index; +extern struct st_mysql_plugin i_s_innodb_buffer_pool_pages_blob; +extern struct st_mysql_plugin i_s_innodb_trx; +extern struct st_mysql_plugin i_s_innodb_locks; +extern struct st_mysql_plugin i_s_innodb_lock_waits; +extern struct st_mysql_plugin i_s_innodb_cmp; +extern struct st_mysql_plugin i_s_innodb_cmp_reset; +extern struct st_mysql_plugin i_s_innodb_cmpmem; +extern struct st_mysql_plugin i_s_innodb_cmpmem_reset; +extern struct st_mysql_plugin i_s_innodb_patches; +extern struct st_mysql_plugin i_s_innodb_rseg; + +#endif /* i_s_h */ diff --git a/storage/xtradb/handler/innodb_patch_info.h b/storage/xtradb/handler/innodb_patch_info.h new file mode 100644 index 00000000000..976e6eefe1c --- /dev/null +++ b/storage/xtradb/handler/innodb_patch_info.h @@ -0,0 +1,35 @@ +/* Copyright (C) 2002-2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif + +struct innodb_enhancement { + const char *file; + const char *name; + const char *comment; + const char *link; +}innodb_enhancements[] = { +{"xtradb_show_enhancements","I_S.XTRADB_ENHANCEMENTS","","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_show_status","Improvements to SHOW INNODB STATUS","Memory information and lock info fixes","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_io","Improvements to InnoDB IO","","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_opt_lru_count","Fix of buffer_pool mutex","Decreases contention on buffer_pool mutex on LRU operations","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_buffer_pool_pages","Information about buffer pool content","","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_expand_undo_slots","Expandable maximum number of undo slots","From 1024 (default) to about 4000","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_extra_rseg","Allow creating extra rollback segments","When creating a new database, the new parameter allows creating more rollback segments","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_overwrite_relay_log_info","Overwrite relay-log.info during slave recovery","When built as a plugin, it is not used.","http://www.percona.com/docs/wiki/percona-xtradb:innodb_overwrite_relay_log_info"}, +{NULL, NULL, NULL, NULL} +}; diff --git a/storage/xtradb/handler/mysql_addons.cc b/storage/xtradb/handler/mysql_addons.cc new file mode 100644 index 00000000000..a5d9c82c3e3 --- /dev/null +++ b/storage/xtradb/handler/mysql_addons.cc @@ -0,0 +1,41 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy.
All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +This file contains functions that need to be added to +MySQL code but have not been added yet. + +Whenever you add a function here, submit a MySQL bug +report (feature request) with the implementation. Then +write the bug number in the comment before the +function in this file. + +When MySQL incorporates the function, it can be deleted from +here. In a perfect world this file exists but is empty. + +Created November 07, 2007 Vasil Dimov +*******************************************************/ + +#ifndef MYSQL_SERVER +#define MYSQL_SERVER +#endif /* MYSQL_SERVER */ + +#include <mysql_priv.h> + +#include "mysql_addons.h" +#include "univ.i" diff --git a/storage/xtradb/handler/win_delay_loader.cc b/storage/xtradb/handler/win_delay_loader.cc new file mode 100644 index 00000000000..1572df42e30 --- /dev/null +++ b/storage/xtradb/handler/win_delay_loader.cc @@ -0,0 +1,1036 @@ +/***************************************************************************** + +Copyright (c) 2008, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/*********************************************************************** +This file contains functions that implement the delay loader on Windows. + +This is a customized version of the delay loader with limited functionality. +It does not support: + +* (manual) unloading +* multiple delay loaded DLLs +* multiple loading of the same DLL + +This delay loader is used only by the InnoDB plugin. Other components (DLLs) +can still use the default delay loader, provided by MSVC. + +Several acronyms used by Microsoft: + * IAT: import address table + * INT: import name table + * RVA: Relative Virtual Address + +See http://msdn.microsoft.com/en-us/magazine/bb985992.aspx for details of +the PE format.
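As a portable illustration of the RVA-to-VA conversion just mentioned, a small sketch without any Windows headers; image_base and the 16-byte offset are made-up values, not anything from this loader:

#include <stdio.h>
#include <stdint.h>

/* Sketch of RVA-to-VA conversion: a relative virtual address is just an
   offset from the image base, so resolving it is base + rva.  The names
   rva_to_va and image are illustrative, not from the real code. */
typedef uint32_t RVA;

static void* rva_to_va(const unsigned char* image_base, RVA rva)
{
	return (void*) (image_base + rva);
}

int main(void)
{
	unsigned char image[64];	/* stand-in for a mapped PE image */

	/* an RVA of 16 resolves to 16 bytes past the image base */
	printf("base=%p va=%p\n", (void*) image, rva_to_va(image, 16));
	return 0;
}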
+***********************************************************************/ +#if defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) +# define WIN32_LEAN_AND_MEAN +# include <windows.h> +# include <delayimp.h> +# include <mysql_priv.h> + +extern "C" { +# include "univ.i" +# include "hash0hash.h" +} + +/*********************************************************************** +The following is a list of externals that cannot be resolved by +delay loading. They have to be resolved indirectly via their addresses +in the .map file. All of them are external variables. */ +CHARSET_INFO* wdl_my_charset_bin; +CHARSET_INFO* wdl_my_charset_latin1; +CHARSET_INFO* wdl_my_charset_filename; +CHARSET_INFO** wdl_system_charset_info; +CHARSET_INFO** wdl_default_charset_info; +CHARSET_INFO** wdl_all_charsets; +system_variables* wdl_global_system_variables; +char* wdl_mysql_real_data_home; +char** wdl_mysql_data_home; +char** wdl_tx_isolation_names; +char** wdl_binlog_format_names; +char* wdl_reg_ext; +pthread_mutex_t* wdl_LOCK_thread_count; +key_map* wdl_key_map_full; +MY_TMPDIR* wdl_mysql_tmpdir_list; +bool* wdl_mysqld_embedded; +uint* wdl_lower_case_table_names; +ulong* wdl_specialflag; +int* wdl_my_umask; + +/*********************************************************************** +The following is defined in ha_innodb.cc. It is used for copying the +system variables from the builtin innodb plugin to the dynamic plugin. +*/ +extern struct st_mysql_plugin* builtin_innobase_plugin_ptr; + +/*********************************************************************** +The preferred load address as defined in the PE (portable executable) +format. */ +#if defined(_M_IA64) +#pragma section(".base", long, read) +extern "C" +__declspec(allocate(".base")) +const IMAGE_DOS_HEADER __ImageBase; +#else +extern "C" +const IMAGE_DOS_HEADER __ImageBase; +#endif + +/*********************************************************************** +A template function for converting a relative address (RVA) to an +absolute address (VA). This is needed because the pointers in the delay +descriptor (ImgDelayDescr in delayimp.h) have been changed from +VAs to RVAs to work on both 32- and 64-bit platforms. */ +template <class X> +X PFromRva(RVA rva) { + return X(PBYTE(&__ImageBase) + rva); +} + +/*********************************************************************** +Convert to the old format for convenience. The structure as well as its +element names follow the definition of ImgDelayDescr in delayimp.h. */ +struct InternalImgDelayDescr { + DWORD grAttrs; /* attributes */ + LPCSTR szName; /* pointer to dll name */ + HMODULE* phmod; /* address of module handle */ + PImgThunkData pIAT; /* address of the IAT */ + PCImgThunkData pINT; /* address of the INT */ + PCImgThunkData pBoundIAT; /* address of the optional bound IAT */ + PCImgThunkData pUnloadIAT; /* address of optional copy of + original IAT */ + DWORD dwTimeStamp; /* 0 if not bound, + otherwise date/time stamp of DLL + bound to (Old BIND) */ +}; + +typedef struct map_hash_chain_struct map_hash_chain_t; + +struct map_hash_chain_struct { + char* symbol; /* pointer to a symbol */ + ulint value; /* address of the symbol */ + map_hash_chain_t* next; /* pointer to the next cell + in the same fold. */ + map_hash_chain_t* chain; /* a linear chain used for + cleanup.
*/ +}; + +static HMODULE my_hmod = 0; +static struct hash_table_struct* m_htbl = NULL ; +static map_hash_chain_t* chain_header = NULL; +static ibool wdl_init = FALSE; +const ulint MAP_HASH_CELLS_NUM = 10000; + +#ifndef DBUG_OFF +/*********************************************************************** +In the dynamic plugin, it is required to call the following dbug functions +in the server: + _db_pargs_ + _db_doprnt_ + _db_enter_ + _db_return_ + _db_dump_ + +The plugin will get those function pointers during the initialization. +*/ +typedef void (__cdecl* pfn_db_enter_)( + const char* _func_, + const char* _file_, + uint _line_, + const char** _sfunc_, + const char** _sfile_, + uint* _slevel_, + char***); + +typedef void (__cdecl* pfn_db_return_)( + uint _line_, + const char** _sfunc_, + const char** _sfile_, + uint* _slevel_); + +typedef void (__cdecl* pfn_db_pargs_)( + uint _line_, + const char* keyword); + +typedef void (__cdecl* pfn_db_doprnt_)( + const char* format, + ...); + +typedef void (__cdecl* pfn_db_dump_)( + uint _line_, + const char* keyword, + const unsigned char* memory, + size_t length); + +static pfn_db_enter_ wdl_db_enter_; +static pfn_db_return_ wdl_db_return_; +static pfn_db_pargs_ wdl_db_pargs_; +static pfn_db_doprnt_ wdl_db_doprnt_; +static pfn_db_dump_ wdl_db_dump_; +#endif /* !DBUG_OFF */ + +/***************************************************************** +Creates a hash table with >= n array cells. The actual number of cells is +chosen to be a prime number slightly bigger than n. + +This is the same function as hash_create in hash0hash.c, except the +memory allocation. This function is invoked before the engine is +initialized, and buffer pools are not ready yet. */ +static +hash_table_t* +wdl_hash_create( +/*============*/ + /* out, own: created hash table */ + ulint n) /* in: number of array cells */ +{ + hash_cell_t* array; + ulint prime; + hash_table_t* table; + + prime = ut_find_prime(n); + + table = (hash_table_t*) malloc(sizeof(hash_table_t)); + if (table == NULL) { + return(NULL); + } + + array = (hash_cell_t*) malloc(sizeof(hash_cell_t) * prime); + if (array == NULL) { + free(table); + return(NULL); + } + + table->array = array; + table->n_cells = prime; + table->n_mutexes = 0; + table->mutexes = NULL; + table->heaps = NULL; + table->heap = NULL; + table->magic_n = HASH_TABLE_MAGIC_N; + + /* Initialize the cell array */ + hash_table_clear(table); + + return(table); +} + +/***************************************************************** +Frees a hash table. */ +static +void +wdl_hash_table_free( +/*================*/ + hash_table_t* table) /* in, own: hash table */ +{ + ut_a(table != NULL); + ut_a(table->mutexes == NULL); + + free(table->array); + free(table); +} + +/*********************************************************************** +Function for calculating the count of imports given the base of the IAT. */ +static +ulint +wdl_import_count( +/*=============*/ + /* out: number of imports */ + PCImgThunkData pitd_base) /* in: base of the IAT */ +{ + ulint ret = 0; + PCImgThunkData pitd = pitd_base; + + while (pitd->u1.Function) { + pitd++; + ret++; + } + + return(ret); +} + +/*********************************************************************** +Read Mapfile to a hashtable for faster access */ +static +ibool +wdl_load_mapfile( +/*=============*/ + /* out: TRUE if the mapfile is + loaded successfully. */ + const char* filename) /* in: name of the mapfile. 
*/ +{ + FILE* fp; + const size_t nSize = 256; + char tmp_buf[nSize]; + char* func_name; + char* func_addr; + ulint load_addr = 0; + ibool valid_load_addr = FALSE; +#ifdef _WIN64 + const char* tmp_string = " Preferred load address is %16llx"; +#else + const char* tmp_string = " Preferred load address is %08x"; +#endif + + fp = fopen(filename, "r"); + if (fp == NULL) { + + return(FALSE); + } + + /* Check whether to create the hashtable */ + if (m_htbl == NULL) { + + m_htbl = wdl_hash_create(MAP_HASH_CELLS_NUM); + + if (m_htbl == NULL) { + + fclose(fp); + return(FALSE); + } + } + + /* Search start of symbol list and get the preferred load address */ + while (fgets(tmp_buf, sizeof(tmp_buf), fp)) { + + if (sscanf(tmp_buf, tmp_string, &load_addr) == 1) { + + valid_load_addr = TRUE; + } + + if (strstr(tmp_buf, "Rva+Base") != NULL) { + + break; + } + } + + if (valid_load_addr == FALSE) { + + /* No "Preferred load address", the map file is wrong. */ + fclose(fp); + return(FALSE); + } + + /* Read symbol list */ + while (fgets(tmp_buf, sizeof(tmp_buf), fp)) + { + map_hash_chain_t* map_cell; + ulint map_fold; + + if (*tmp_buf == 0) { + + continue; + } + + func_name = strtok(tmp_buf, " "); + func_name = strtok(NULL, " "); + func_addr = strtok(NULL, " "); + + if (func_name && func_addr) { + + ut_snprintf(tmp_buf, nSize, "0x%s", func_addr); + if (*func_name == '_') { + + func_name++; + } + + map_cell = (map_hash_chain_t*) + malloc(sizeof(map_hash_chain_t)); + if (map_cell == NULL) { + fclose(fp); + return(FALSE); + } + + /* Chain all cells together */ + map_cell->chain = chain_header; + chain_header = map_cell; + + map_cell->symbol = strdup(func_name); + map_cell->value = (ulint) strtoull(tmp_buf, NULL, 0) + - load_addr; + map_fold = ut_fold_string(map_cell->symbol); + + HASH_INSERT(map_hash_chain_t, + next, + m_htbl, + map_fold, + map_cell); + } + } + + fclose(fp); + + return(TRUE); +} + +/***************************************************************** +Cleanup during DLL unload. */ +static +void +wdl_cleanup(void) +/*=============*/ +{ + while (chain_header != NULL) { + map_hash_chain_t* tmp; + + tmp = chain_header->chain; + free(chain_header->symbol); + free(chain_header); + chain_header = tmp; + } + + if (m_htbl != NULL) { + + wdl_hash_table_free(m_htbl); + } +} + +/*********************************************************************** +Load the map file mysqld.map. */ +static +HMODULE +wdl_get_mysqld_mapfile(void) +/*========================*/ + /* out: the module handle */ +{ + char file_name[MAX_PATH]; + char* ext; + ulint err; + + if (my_hmod == 0) { + + size_t nSize = MAX_PATH - strlen(".map") -1; + + /* First find out the name of the current executable */ + my_hmod = GetModuleHandle(NULL); + if (my_hmod == 0) { + + return(my_hmod); + } + + err = GetModuleFileName(my_hmod, file_name, nSize); + if (err == 0) { + + my_hmod = 0; + return(my_hmod); + } + + ext = strrchr(file_name, '.'); + if (ext != NULL) { + + *ext = 0; + strcat(file_name, ".map"); + + err = wdl_load_mapfile(file_name); + if (err == 0) { + + my_hmod = 0; + } + } else { + + my_hmod = 0; + } + } + + return(my_hmod); +} + +/*********************************************************************** +Retrieves the address of an exported function. It follows the convention +of GetProcAddress(). */ +static +FARPROC +wdl_get_procaddr_from_map( +/*======================*/ + /* out: address of exported + function.
*/ + HANDLE m_handle, /* in: module handle */ + const char* import_proc) /* in: procedure name */ +{ + map_hash_chain_t* hash_chain; + ulint map_fold; + + map_fold = ut_fold_string(import_proc); + HASH_SEARCH( + next, + m_htbl, + map_fold, + map_hash_chain_t*, + hash_chain, + , + (ut_strcmp(hash_chain->symbol, import_proc) == 0)); + + if (hash_chain == NULL) { + +#ifdef _WIN64 + /* On Win64, the leading '_' may not be taken out. In this + case, search again without the leading '_'. */ + if (*import_proc == '_') { + + import_proc++; + } + + map_fold = ut_fold_string(import_proc); + HASH_SEARCH( + next, + m_htbl, + map_fold, + map_hash_chain_t*, + hash_chain, + , + (ut_strcmp(hash_chain->symbol, import_proc) == 0)); + + if (hash_chain == NULL) { +#endif + if (wdl_init == TRUE) { + + sql_print_error( + "InnoDB: the procedure pointer of %s" + " is not found.", + import_proc); + } + + return(0); +#ifdef _WIN64 + } +#endif + } + + return((FARPROC) ((ulint) m_handle + hash_chain->value)); +} + +/*********************************************************************** +Retrieves the address of an exported variable. +Note: It does not follow the Windows call convention FARPROC. */ +static +void* +wdl_get_varaddr_from_map( +/*=====================*/ + /* out: address of exported + variable. */ + HANDLE m_handle, /* in: module handle */ + const char* import_variable) /* in: variable name */ +{ + map_hash_chain_t* hash_chain; + ulint map_fold; + + map_fold = ut_fold_string(import_variable); + HASH_SEARCH( + next, + m_htbl, + map_fold, + map_hash_chain_t*, + hash_chain, + , + (ut_strcmp(hash_chain->symbol, import_variable) == 0)); + + if (hash_chain == NULL) { + +#ifdef _WIN64 + /* On Win64, the leading '_' may not be taken out. In this + case, search again without the leading '_'. */ + if (*import_variable == '_') { + + import_variable++; + } + + map_fold = ut_fold_string(import_variable); + HASH_SEARCH( + next, + m_htbl, + map_fold, + map_hash_chain_t*, + hash_chain, + , + (ut_strcmp(hash_chain->symbol, import_variable) == 0)); + + if (hash_chain == NULL) { +#endif + if (wdl_init == TRUE) { + + sql_print_error( + "InnoDB: the variable address of %s" + " is not found.", + import_variable); + } + + return(0); +#ifdef _WIN64 + } +#endif + } + + return((void*) ((ulint) m_handle + hash_chain->value)); +} + +/*********************************************************************** +Bind all unresolved external variables from the MySQL executable. 
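A standalone sketch of how a symbol line from an MSVC linker .map file is reduced to the image-relative offset kept in the hash table by the code above; the sample line and load address are invented, and the real parser additionally strips a leading underscore and folds the name into a hash value:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Assumed symbol-line shape, based on the strtok calls above:
   "<section:offset> <symbol> <address> ..."; subtracting the preferred
   load address yields an image-relative offset. */
int main(void)
{
	char line[] = " 0001:000001a0 ?example@@YAXXZ 004011a0 f example.obj";
	unsigned long load_addr = 0x00400000;	/* "Preferred load address" */
	char* tok;
	char* name;
	char* addr;

	tok = strtok(line, " ");	/* section:offset, ignored */
	name = strtok(NULL, " ");
	addr = strtok(NULL, " ");

	if (tok && name && addr) {
		unsigned long rel = strtoul(addr, NULL, 16) - load_addr;

		/* prints "?example@@YAXXZ -> +0x11a0" */
		printf("%s -> +0x%lx\n", name, rel);
	}
	return 0;
}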
*/ +static +bool +wdl_get_external_variables(void) +/*============================*/ + /* out: TRUE if successful */ +{ + HMODULE hmod = wdl_get_mysqld_mapfile(); + + if (hmod == 0) { + + return(FALSE); + } + +#define GET_SYM(sym, var, type) \ + var = (type*) wdl_get_varaddr_from_map(hmod, sym); \ + if (var == NULL) return(FALSE) +#ifdef _WIN64 +#define GET_SYM2(sym1, sym2, var, type) \ + var = (type*) wdl_get_varaddr_from_map(hmod, sym1); \ + if (var == NULL) return(FALSE) +#else +#define GET_SYM2(sym1, sym2, var, type) \ + var = (type*) wdl_get_varaddr_from_map(hmod, sym2); \ + if (var == NULL) return(FALSE) +#endif // (_WIN64) +#define GET_C_SYM(sym, type) GET_SYM(#sym, wdl_##sym, type) +#define GET_PROC_ADDR(sym) \ + wdl##sym = (pfn##sym) wdl_get_procaddr_from_map(hmod, #sym) + + GET_C_SYM(my_charset_bin, CHARSET_INFO); + GET_C_SYM(my_charset_latin1, CHARSET_INFO); + GET_C_SYM(my_charset_filename, CHARSET_INFO); + GET_C_SYM(default_charset_info, CHARSET_INFO*); + GET_C_SYM(all_charsets, CHARSET_INFO*); + GET_C_SYM(my_umask, int); + + GET_SYM("?global_system_variables@@3Usystem_variables@@A", + wdl_global_system_variables, struct system_variables); + GET_SYM("?mysql_real_data_home@@3PADA", + wdl_mysql_real_data_home, char); + GET_SYM("?reg_ext@@3PADA", wdl_reg_ext, char); + GET_SYM("?LOCK_thread_count@@3U_RTL_CRITICAL_SECTION@@A", + wdl_LOCK_thread_count, pthread_mutex_t); + GET_SYM("?key_map_full@@3V?$Bitmap@$0EA@@@A", + wdl_key_map_full, key_map); + GET_SYM("?mysql_tmpdir_list@@3Ust_my_tmpdir@@A", + wdl_mysql_tmpdir_list, MY_TMPDIR); + GET_SYM("?mysqld_embedded@@3_NA", + wdl_mysqld_embedded, bool); + GET_SYM("?lower_case_table_names@@3IA", + wdl_lower_case_table_names, uint); + GET_SYM("?specialflag@@3KA", wdl_specialflag, ulong); + + GET_SYM2("?system_charset_info@@3PEAUcharset_info_st@@EA", + "?system_charset_info@@3PAUcharset_info_st@@A", + wdl_system_charset_info, CHARSET_INFO*); + GET_SYM2("?mysql_data_home@@3PEADEA", + "?mysql_data_home@@3PADA", + wdl_mysql_data_home, char*); + GET_SYM2("?tx_isolation_names@@3PAPEBDA", + "?tx_isolation_names@@3PAPBDA", + wdl_tx_isolation_names, char*); + GET_SYM2("?binlog_format_names@@3PAPEBDA", + "?binlog_format_names@@3PAPBDA", + wdl_binlog_format_names, char*); + + /* It is fine if builtin_innobase_plugin is not available. */ + builtin_innobase_plugin_ptr = (struct st_mysql_plugin*) + wdl_get_varaddr_from_map( + hmod, + "?builtin_innobase_plugin@@3PAUst_mysql_plugin@@A"); + +#ifndef DBUG_OFF + GET_PROC_ADDR(_db_enter_); + GET_PROC_ADDR(_db_return_); + GET_PROC_ADDR(_db_pargs_); + GET_PROC_ADDR(_db_doprnt_); + GET_PROC_ADDR(_db_dump_); + + /* If any of the dbug functions is not available, just make them + all invalid. This is the case when working with a non-debug + version of the server. */ + if (wdl_db_enter_ == NULL || wdl_db_return_ == NULL + || wdl_db_pargs_ == NULL || wdl_db_doprnt_ == NULL + || wdl_db_dump_ == NULL) { + + wdl_db_enter_ = NULL; + wdl_db_return_ = NULL; + wdl_db_pargs_ = NULL; + wdl_db_doprnt_ = NULL; + wdl_db_dump_ = NULL; + } +#endif /* !DBUG_OFF */ + + wdl_init = TRUE; + return(TRUE); + +#undef GET_SYM +#undef GET_SYM2 +#undef GET_C_SYM +#undef GET_PROC_ADDR +} + +/*********************************************************************** +The DLL Delayed Loading Helper Function for resolving externals. + +The function may fail due to one of the three reasons: + +* Invalid parameter, which happens if the attributes in pidd aren't + specified correctly. +* Failed to load the map file mysqld.map. 
+* Failed to find an external name in the map file mysqld.map. + +Note: this function is called by run-time as well as __HrLoadAllImportsForDll. +So, it has to follow Windows call convention. */ +extern "C" +FARPROC WINAPI +__delayLoadHelper2( +/*===============*/ + /* out: the address of the imported + function*/ + PCImgDelayDescr pidd, /* in: a const pointer to a + ImgDelayDescr, see delayimp.h. */ + FARPROC* iat_entry) /* in/out: A pointer to the slot in + the delay load import address table + to be updated with the address of the + imported function. */ +{ + ulint iIAT, iINT; + HMODULE hmod; + PCImgThunkData pitd; + FARPROC fun = NULL; + + /* Set up data used for the hook procs */ + InternalImgDelayDescr idd = { + pidd->grAttrs, + PFromRva(pidd->rvaDLLName), + PFromRva(pidd->rvaHmod), + PFromRva(pidd->rvaIAT), + PFromRva(pidd->rvaINT), + PFromRva(pidd->rvaBoundIAT), + PFromRva(pidd->rvaUnloadIAT), + pidd->dwTimeStamp + }; + + DelayLoadInfo dli = { + sizeof(DelayLoadInfo), + pidd, + iat_entry, + idd.szName, + {0}, + 0, + 0, + 0 + }; + + /* Check the Delay Load Attributes, log an error of invalid + parameter, which happens if the attributes in pidd are not + specified correctly. */ + if ((idd.grAttrs & dlattrRva) == 0) { + + sql_print_error("InnoDB: invalid parameter for delay loader."); + return(0); + } + + hmod = *idd.phmod; + + /* Calculate the index for the IAT entry in the import address table. + The INT entries are ordered the same as the IAT entries so the + calculation can be done on the IAT side. */ + iIAT = (PCImgThunkData) iat_entry - idd.pIAT; + iINT = iIAT; + + pitd = &(idd.pINT[iINT]); + + dli.dlp.fImportByName = !IMAGE_SNAP_BY_ORDINAL(pitd->u1.Ordinal); + + if (dli.dlp.fImportByName) { + + dli.dlp.szProcName = (LPCSTR) (PFromRva + ((RVA) ((UINT_PTR) pitd->u1.AddressOfData))->Name); + } else { + + dli.dlp.dwOrdinal = (ulint) IMAGE_ORDINAL(pitd->u1.Ordinal); + } + + /* Now, load the mapfile, if it has not been done yet */ + if (hmod == 0) { + + hmod = wdl_get_mysqld_mapfile(); + } + + if (hmod == 0) { + /* LoadLibrary failed. */ + PDelayLoadInfo rgpdli[1] = {&dli}; + + dli.dwLastError = ::GetLastError(); + + sql_print_error( + "InnoDB: failed to load mysqld.map with error %d.", + dli.dwLastError); + + return(0); + } + + /* Store the library handle. */ + idd.phmod = &hmod; + + /* Go for the procedure now. */ + dli.hmodCur = hmod; + + if (pidd->rvaBoundIAT && pidd->dwTimeStamp) { + + /* Bound imports exist, check the timestamp from the target + image */ + PIMAGE_NT_HEADERS pinh; + + pinh = (PIMAGE_NT_HEADERS) ((byte*) hmod + + ((PIMAGE_DOS_HEADER) hmod)->e_lfanew); + + if (pinh->Signature == IMAGE_NT_SIGNATURE + && pinh->FileHeader.TimeDateStamp == idd.dwTimeStamp + && (DWORD) hmod == pinh->OptionalHeader.ImageBase) { + + /* We have a decent address in the bound IAT. */ + fun = (FARPROC) (UINT_PTR) + idd.pBoundIAT[iIAT].u1.Function; + + if (fun) { + + *iat_entry = fun; + return(fun); + } + } + } + + fun = wdl_get_procaddr_from_map(hmod, dli.dlp.szProcName); + + if (fun == 0) { + + return(0); + } + + *iat_entry = fun; + return(fun); +} + +/*********************************************************************** +Unload a DLL that was delay loaded. This function is called by run-time. */ +extern "C" +BOOL WINAPI +__FUnloadDelayLoadedDLL2( +/*=====================*/ + /* out: TRUE is returned if the DLL is found + and the IAT matches the original one. 
*/ + LPCSTR module_name) /* in: DLL name */ +{ + return(TRUE); +} + +/****************************************************************** +Load all imports from a DLL that was specified with the /delayload linker +option. +Note: this function is called by run-time. So, it has to follow Windows call +convention. */ +extern "C" +HRESULT WINAPI +__HrLoadAllImportsForDll( +/*=====================*/ + /* out: S_OK if the DLL matches, otherwise + ERROR_MOD_NOT_FOUND is returned. */ + LPCSTR module_name) /* in: DLL name */ +{ + PIMAGE_NT_HEADERS img; + PCImgDelayDescr pidd; + IMAGE_DATA_DIRECTORY* image_data; + LPCSTR current_module; + HRESULT ret = ERROR_MOD_NOT_FOUND; + HMODULE hmod = (HMODULE) &__ImageBase; + + img = (PIMAGE_NT_HEADERS) ((byte*) hmod + + ((PIMAGE_DOS_HEADER) hmod)->e_lfanew); + image_data = + &img->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT]; + + /* Scan the delay load IAT/INT for the DLL */ + if (image_data->Size) { + + pidd = PFromRva(image_data->VirtualAddress); + + /* Check all of the listed DLLs we want to load. */ + while (pidd->rvaDLLName) { + + current_module = PFromRva(pidd->rvaDLLName); + + if (stricmp(module_name, current_module) == 0) { + + /* Found it, break out with pidd and + current_module set appropriately */ + break; + } + + /* To the next delay import descriptor */ + pidd++; + } + + if (pidd->rvaDLLName) { + + /* Found a matching DLL, now process it. */ + FARPROC* iat_entry; + size_t count; + + iat_entry = PFromRva(pidd->rvaIAT); + count = wdl_import_count((PCImgThunkData) iat_entry); + + /* now load all the imports from the DLL */ + while (count > 0) { + + /* No need to check the return value */ + __delayLoadHelper2(pidd, iat_entry); + iat_entry++; + count--; + } + + ret = S_OK; + } + } + + return ret; +} + +/****************************************************************** +The main function of a DLL */ +BOOL +WINAPI +DllMain( +/*====*/ + /* out: TRUE if the call succeeds */ + HINSTANCE hinstDLL, /* in: handle to the DLL module */ + DWORD fdwReason, /* Reason code that indicates why the + DLL entry-point function is being + called.*/ + LPVOID lpvReserved) /* in: additional parameter based on + fdwReason */ +{ + BOOL success = TRUE; + + switch (fdwReason) { + + case DLL_PROCESS_ATTACH: + success = wdl_get_external_variables(); + break; + + case DLL_PROCESS_DETACH: + wdl_cleanup(); + break; + } + + return(success); +} + +#ifndef DBUG_OFF +/****************************************************************** +Process entry point to user function. It makes the call to _db_enter_ +in mysqld.exe. The DBUG functions are defined in my_dbug.h. */ +extern "C" UNIV_INTERN +void +_db_enter_( + const char* _func_, /* in: current function name */ + const char* _file_, /* in: current file name */ + uint _line_, /* in: current source line number */ + const char** _sfunc_, /* out: previous _func_ */ + const char** _sfile_, /* out: previous _file_ */ + uint* _slevel_, /* out: previous nesting level */ + char*** _sframep_) /* out: previous frame pointer */ +{ + if (wdl_db_enter_ != NULL) { + + wdl_db_enter_(_func_, _file_, _line_, _sfunc_, _sfile_, + _slevel_, _sframep_); + } +} + +/****************************************************************** +Process exit from user function. It makes the call to _db_return_() +in the server. 
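The _db_* wrappers here all follow the same guard-and-forward shape: call through a pointer resolved at load time, and degrade to a no-op when the target was not exported (a non-debug server). A minimal sketch of that pattern with an invented log function, not the actual dbug interface:

#include <stdio.h>
#include <stddef.h>

/* Guard-and-forward pattern: the wrapper is always safe to call, and
   does nothing until the pointer has been resolved.  Names are
   illustrative. */
typedef void (*log_fn_t)(const char* msg);

static log_fn_t resolved_log = NULL;	/* filled in during initialization */

static void log_msg(const char* msg)
{
	if (resolved_log != NULL) {
		resolved_log(msg);
	}
}

static void real_log(const char* msg)
{
	printf("log: %s\n", msg);
}

int main(void)
{
	log_msg("dropped");		/* no-op: pointer not resolved yet */
	resolved_log = real_log;	/* simulate successful resolution */
	log_msg("delivered");
	return 0;
}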
*/ +extern "C" UNIV_INTERN +void +_db_return_( + uint _line_, /* in: current source line number */ + const char** _sfunc_, /* out: previous _func_ */ + const char** _sfile_, /* out: previous _file_ */ + uint* _slevel_) /* out: previous level */ +{ + if (wdl_db_return_ != NULL) { + + wdl_db_return_(_line_, _sfunc_, _sfile_, _slevel_); + } +} + +/****************************************************************** +Log arguments for subsequent use. It makes the call to _db_pargs_() +in the server. */ +extern "C" UNIV_INTERN +void +_db_pargs_( + uint _line_, /* in: current source line number */ + const char* keyword) /* in: keyword for current macro */ +{ + if (wdl_db_pargs_ != NULL) { + + wdl_db_pargs_(_line_, keyword); + } +} + +/****************************************************************** +Handle print of debug lines. It saves the text into a buffer first, +then makes the call to _db_doprnt_() in the server. The text is +truncated to the size of the buffer. */ +extern "C" UNIV_INTERN +void +_db_doprnt_( + const char* format, /* in: the format string */ + ...) /* in: list of arguments */ +{ + va_list argp; + char buffer[512]; + + if (wdl_db_doprnt_ != NULL) { + + va_start(argp, format); + /* it is OK to ignore the truncation. */ + _vsnprintf(buffer, sizeof(buffer), format, argp); + wdl_db_doprnt_(buffer); + va_end(argp); + } +} + +/****************************************************************** +Dump a string in hex. It makes the call to _db_dump_() in the server. */ +extern "C" UNIV_INTERN +void +_db_dump_( + uint _line_, /* in: current source line + number */ + const char* keyword, /* in: keyword list */ + const unsigned char* memory, /* in: memory to dump */ + size_t length) /* in: bytes to dump */ +{ + if (wdl_db_dump_ != NULL) { + + wdl_db_dump_(_line_, keyword, memory, length); + } +} + +#endif /* !DBUG_OFF */ +#endif /* defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) */ diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.c new file mode 100644 index 00000000000..ac678471312 --- /dev/null +++ b/storage/xtradb/ibuf/ibuf0ibuf.c @@ -0,0 +1,3598 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#include "ibuf0ibuf.h" + +#ifdef UNIV_NONINL +#include "ibuf0ibuf.ic" +#endif + +#include "buf0buf.h" +#include "buf0rea.h" +#include "fsp0fsp.h" +#include "trx0sys.h" +#include "fil0fil.h" +#include "thr0loc.h" +#include "rem0rec.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "sync0sync.h" +#include "dict0boot.h" +#include "fut0lst.h" +#include "lock0lock.h" +#include "log0recv.h" +#include "que0que.h" + +/* STRUCTURE OF AN INSERT BUFFER RECORD + +In versions < 4.1.x: + +1. The first field is the page number. +2. The second field is an array which stores type info for each subsequent + field. We store the information which affects the ordering of records, and + also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it + is 10 bytes. +3. Next we have the fields of the actual index record. + +In versions >= 4.1.x: + +Note that, contrary to what we planned in the 1990s, there will only be one +insert buffer tree, and that is in the system tablespace of InnoDB. + +1. The first field is the space id. +2. The second field is a one-byte marker (0) which differentiates records from + the < 4.1.x storage format. +3. The third field is the page number. +4. The fourth field contains the type info, where we have also added 2 bytes to + store the charset. In the compressed table format of 5.0.x we must add more + information here so that we can build a dummy 'index' struct which 5.0.x + can use in the binary search on the index page in the ibuf merge phase. +5. The rest of the fields contain the fields of the actual index record. + +In versions >= 5.0.3: + +The first byte of the fourth field is an additional marker (0) if the record +is in the compact format. The presence of this marker can be detected by +looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE. + +The high-order bit of the character set field in the type info is the +"nullable" flag for the field. */ + + +/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM + +If an OS thread performs any operation that brings in disk pages from +non-system tablespaces into the buffer pool, or creates such a page there, +then the operation may have as a side effect an insert buffer index tree +compression. Thus, the tree latch of the insert buffer tree may be acquired +in the x-mode, and also the file space latch of the system tablespace may +be acquired in the x-mode. + +Also, an insert to an index in a non-system tablespace can have the same +effect. How do we know this cannot lead to a deadlock of OS threads? There +is a problem with the i/o-handler threads: they break the latching order +because they own x-latches to pages which are on a lower level than the +insert buffer tree latch, its page latches, and the tablespace latch an +insert buffer operation can reserve. + +The solution is the following: Let all the tree and page latches connected +with the insert buffer be later in the latching order than the fsp latch and +fsp page latches.
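For reference, the >= 4.1.x record layout described at the top of this file, expressed as illustrative field positions; the enum names are editorial, not identifiers from this file:

#include <stdio.h>

/* Editorial sketch: field positions of a >= 4.1.x insert buffer record
   as described in the STRUCTURE OF AN INSERT BUFFER RECORD comment. */
enum ibuf_rec_field {
	IBUF_REC_SPACE = 0,		/* space id */
	IBUF_REC_MARKER = 1,		/* one-byte marker (0) distinguishing
					the record from the < 4.1.x format */
	IBUF_REC_PAGE = 2,		/* page number */
	IBUF_REC_TYPE_INFO = 3,		/* type info plus 2-byte charset */
	IBUF_REC_USER_FIELDS = 4	/* first field of the index record */
};

int main(void)
{
	printf("user record fields start at position %d\n",
	       IBUF_REC_USER_FIELDS);
	return 0;
}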
+ +Insert buffer pages must be such that the insert buffer is never invoked +when these pages are accessed as this would result in a recursion violating +the latching order. We let a special i/o-handler thread take care of i/o to +the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap +pages and the first inode page, which contains the inode of the ibuf tree: let +us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead +access both non-ibuf and ibuf pages. + +Then an i/o-handler for the insert buffer never needs to access the insert +buffer tree recursively and thus obeys the latching order. On the other hand, +other i/o-handlers for other tablespaces may require access to the insert +buffer, but because all the latches they need to access there are later in +the latching order, no violation of the latching order occurs in this case, +either. + +A problem is how to grow and contract an insert buffer tree. As it is later +in the latching order than the fsp management, we have to reserve the fsp +latch first, before adding or removing pages from the insert buffer tree. +We let the insert buffer tree have its own file space management: a free +list of pages linked to the tree root. To prevent recursive use of the +insert buffer when adding pages to the tree, we must first load these pages +to memory, obtaining a latch on them, and only after that add them to the +free list of the insert buffer tree. Removing pages from the free list is +more difficult. If there is an excess of pages in the free list of the +ibuf tree, they might be needed if some thread reserves the fsp latch, +intending to allocate more file space. So we do the following: if a thread +reserves the fsp latch, we check the writer count field of the latch. If +this field has value 1, it means that the thread did not own the latch +before entering the fsp system, and the mtr of the thread contains no +modifications to the fsp pages. Now we are free to reserve the ibuf latch, +and check if there is an excess of pages in the free list. We can then, in a +separate mini-transaction, take them out of the free list and free them to +the fsp system. + +To avoid deadlocks in the ibuf system, we divide file pages into three levels: + +(1) non-ibuf pages, +(2) ibuf tree pages and the pages in the ibuf tree free list, and +(3) ibuf bitmap pages. + +No OS thread is allowed to access higher level pages if it has latches to +lower level pages; even if the thread owns a B-tree latch it must not access +the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead +is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle +exclusively level 1 i/o. A dedicated i/o-handler thread handles exclusively +level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e., +it uses synchronous aio, it can access any pages, as long as it obeys the +access order rules. */ + +/* Ratio of the buffer pool size to the maximum insert buffer size */ +#define IBUF_POOL_SIZE_PER_MAX_SIZE 2 + +/* Table name for the insert buffer. */ +#define IBUF_TABLE_NAME "SYS_IBUF_TABLE" + +/** Operations that can currently be buffered.
*/ +UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_INSERT; + +/** The insert buffer control structure */ +UNIV_INTERN ibuf_t* ibuf = NULL; + +UNIV_INTERN ulint ibuf_flush_count = 0; + +#ifdef UNIV_IBUF_COUNT_DEBUG +/* Dimensions for the ibuf_count array */ +#define IBUF_COUNT_N_SPACES 4 +#define IBUF_COUNT_N_PAGES 130000 + +/* Buffered entry counts for file pages, used in debugging */ +static ulint ibuf_counts[IBUF_COUNT_N_SPACES][IBUF_COUNT_N_PAGES]; + +/********************************************************************** +Checks that the indexes to ibuf_counts[][] are within limits. */ +UNIV_INLINE +void +ibuf_count_check( +/*=============*/ + ulint space_id, /* in: space identifier */ + ulint page_no) /* in: page number */ +{ + if (space_id < IBUF_COUNT_N_SPACES && page_no < IBUF_COUNT_N_PAGES) { + return; + } + + fprintf(stderr, + "InnoDB: UNIV_IBUF_COUNT_DEBUG limits space_id and page_no\n" + "InnoDB: and breaks crash recovery.\n" + "InnoDB: space_id=%lu, should be 0<=space_id<%lu\n" + "InnoDB: page_no=%lu, should be 0<=page_no<%lu\n", + (ulint) space_id, (ulint) IBUF_COUNT_N_SPACES, + (ulint) page_no, (ulint) IBUF_COUNT_N_PAGES); + ut_error; +} +#endif + +/* The start address for an insert buffer bitmap page bitmap */ +#define IBUF_BITMAP PAGE_DATA + +/* Offsets in bits for the bits describing a single page in the bitmap */ +#define IBUF_BITMAP_FREE 0 +#define IBUF_BITMAP_BUFFERED 2 +#define IBUF_BITMAP_IBUF 3 /* TRUE if page is a part of the ibuf + tree, excluding the root page, or is + in the free list of the ibuf */ + +/* Number of bits describing a single page */ +#define IBUF_BITS_PER_PAGE 4 +#if IBUF_BITS_PER_PAGE % 2 +# error "IBUF_BITS_PER_PAGE must be an even number!" +#endif + +/* The mutex used to block pessimistic inserts to ibuf trees */ +static mutex_t ibuf_pessimistic_insert_mutex; + +/* The mutex protecting the insert buffer structs */ +static mutex_t ibuf_mutex; + +/* The mutex protecting the insert buffer bitmaps */ +static mutex_t ibuf_bitmap_mutex; + +/* The area in pages from which contract looks for page numbers for merge */ +#define IBUF_MERGE_AREA 8 + +/* Inside the merge area, pages which have at most 1/IBUF_MERGE_THRESHOLD +fewer buffered entries than the maximum volume that can be buffered for a +single page are merged along with the page whose buffer became full */ +#define IBUF_MERGE_THRESHOLD 4 + +/* In ibuf_contract at most this number of pages is read to memory in one +batch, in order to merge the entries for them in the insert buffer */ +#define IBUF_MAX_N_PAGES_MERGED IBUF_MERGE_AREA + +/* If the combined size of the ibuf trees exceeds ibuf->max_size by this +many pages, we start to contract it in connection with inserts there, using +non-synchronous contract */ +#define IBUF_CONTRACT_ON_INSERT_NON_SYNC 0 + +/* Same as above, but use synchronous contract */ +#define IBUF_CONTRACT_ON_INSERT_SYNC 5 + +/* Same as above, but no insert is done, only contract is called */ +#define IBUF_CONTRACT_DO_NOT_INSERT 10 + +/* TODO: how to cope with drop table if there are records in the insert +buffer for the indexes of the table? Is there actually any problem, +because ibuf merge is done to a page when it is read in, and it is +still physically like the index page even if the index has been +dropped! So, there seems to be no problem. */ + +/********************************************************************** +Sets the flag in the current OS thread local storage denoting that it is +inside an insert buffer routine.
*/ +UNIV_INLINE +void +ibuf_enter(void) +/*============*/ +{ + ibool* ptr; + + ptr = thr_local_get_in_ibuf_field(); + + ut_ad(*ptr == FALSE); + + *ptr = TRUE; +} + +/********************************************************************** +Sets the flag in the current OS thread local storage denoting that it is +exiting an insert buffer routine. */ +UNIV_INLINE +void +ibuf_exit(void) +/*===========*/ +{ + ibool* ptr; + + ptr = thr_local_get_in_ibuf_field(); + + ut_ad(*ptr == TRUE); + + *ptr = FALSE; +} + +/********************************************************************** +Returns TRUE if the current OS thread is performing an insert buffer +routine. */ +UNIV_INTERN +ibool +ibuf_inside(void) +/*=============*/ + /* out: TRUE if inside an insert buffer routine: for instance, + a read-ahead of non-ibuf pages is then forbidden */ +{ + return(*thr_local_get_in_ibuf_field()); +} + +/********************************************************************** +Gets the ibuf header page and x-latches it. */ +static +page_t* +ibuf_header_page_get( +/*=================*/ + /* out: insert buffer header page */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + + ut_ad(!ibuf_inside()); + + block = buf_page_get( + IBUF_SPACE_ID, 0, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_IBUF_HEADER); + + return(buf_block_get_frame(block)); +} + +/********************************************************************** +Gets the root page and x-latches it. */ +static +page_t* +ibuf_tree_root_get( +/*===============*/ + /* out: insert buffer tree root page */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + + ut_ad(ibuf_inside()); + + mtr_x_lock(dict_index_get_lock(ibuf->index), mtr); + + block = buf_page_get( + IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + return(buf_block_get_frame(block)); +} + +#ifdef UNIV_IBUF_COUNT_DEBUG +/********************************************************************** +Gets the ibuf count for a given page. */ +UNIV_INTERN +ulint +ibuf_count_get( +/*===========*/ + /* out: number of entries in the insert buffer + currently buffered for this page */ + ulint space, /* in: space id */ + ulint page_no)/* in: page number */ +{ + ibuf_count_check(space, page_no); + + return(ibuf_counts[space][page_no]); +} + +/********************************************************************** +Sets the ibuf count for a given page. */ +static +void +ibuf_count_set( +/*===========*/ + ulint space, /* in: space id */ + ulint page_no,/* in: page number */ + ulint val) /* in: value to set */ +{ + ibuf_count_check(space, page_no); + ut_a(val < UNIV_PAGE_SIZE); + + ibuf_counts[space][page_no] = val; +} +#endif + +/********************************************************************** +Updates the size information of the ibuf, assuming the segment size has not +changed. 
*/ +static +void +ibuf_size_update( +/*=============*/ + const page_t* root, /* in: ibuf tree root */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mutex_own(&ibuf_mutex)); + + ibuf->free_list_len = flst_get_len(root + PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST, mtr); + + ibuf->height = 1 + btr_page_get_level(root, mtr); + + /* the '1 +' is the ibuf header page */ + ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len); + + ibuf->empty = page_get_n_recs(root) == 0; +} + +/********************************************************************** +Creates the insert buffer data structure at a database startup and initializes +the data structures for the insert buffer. */ +UNIV_INTERN +void +ibuf_init_at_db_start(void) +/*=======================*/ +{ + page_t* root; + mtr_t mtr; + dict_table_t* table; + mem_heap_t* heap; + dict_index_t* index; + ulint n_used; + page_t* header_page; + ulint error; + + ibuf = mem_alloc(sizeof(ibuf_t)); + + memset(ibuf, 0, sizeof(*ibuf)); + + /* Note that also a pessimistic delete can sometimes make a B-tree + grow in size, as the references on the upper levels of the tree can + change */ + + /* The default for ibuf_max_size is calculated from the requested + buffer pool size srv_buf_pool_size, not the actual size as returned + by buf_pool_get_curr_size(). The latter can differ from the former + by one page due to alignment requirements, and we do not want a + user-visible variable like INNODB_IBUF_MAX_SIZE to vary at random. */ + ibuf->max_size = ut_min( srv_buf_pool_size / UNIV_PAGE_SIZE + / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE); + + srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE; + + mutex_create(&ibuf_pessimistic_insert_mutex, + SYNC_IBUF_PESS_INSERT_MUTEX); + + mutex_create(&ibuf_mutex, SYNC_IBUF_MUTEX); + + mutex_create(&ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX); + + mtr_start(&mtr); + + mutex_enter(&ibuf_mutex); + + mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, NULL), &mtr); + + header_page = ibuf_header_page_get(&mtr); + + fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + &n_used, &mtr); + ibuf_enter(); + + ut_ad(n_used >= 2); + + ibuf->seg_size = n_used; + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + root = buf_block_get_frame(block); + } + + ibuf_size_update(root, &mtr); + mutex_exit(&ibuf_mutex); + + mtr_commit(&mtr); + + ibuf_exit(); + + heap = mem_heap_create(450); + + /* Use old-style record format for the insert buffer. */ + table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0); + + dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0); + + table->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID); + + dict_table_add_to_cache(table, heap); + mem_heap_free(heap); + + index = dict_mem_index_create( + IBUF_TABLE_NAME, "CLUST_IND", + IBUF_SPACE_ID, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, 1); + + dict_mem_index_add_field(index, "DUMMY_COLUMN", 0); + + index->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID); + + error = dict_index_add_to_cache(table, index, + FSP_IBUF_TREE_ROOT_PAGE_NO, FALSE); + ut_a(error == DB_SUCCESS); + + ibuf->index = dict_table_get_first_index(table); +} + +/************************************************************************* +Initializes an ibuf bitmap page. 
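Before the bitmap code below, a worked example of the max_size clamp computed in ibuf_init_at_db_start() above, with the usual 16 KiB page size; the pool and cap figures are invented for illustration:

#include <stdio.h>

/* Worked example of ibuf->max_size = min(pool pages / 2, user cap). */
int main(void)
{
	unsigned long page_size = 16384;		/* UNIV_PAGE_SIZE */
	unsigned long pool_bytes = 128UL << 20;		/* 128 MiB buffer pool */
	unsigned long per_max = 2;			/* IBUF_POOL_SIZE_PER_MAX_SIZE */
	unsigned long user_cap_bytes = 1024UL << 20;	/* srv_ibuf_max_size */

	unsigned long a = pool_bytes / page_size / per_max;	/* 4096 pages */
	unsigned long b = user_cap_bytes / page_size;		/* 65536 pages */
	unsigned long max_size = a < b ? a : b;

	/* 4096 pages * 16 KiB = 64 MiB: half the pool caps the ibuf here */
	printf("ibuf max_size = %lu pages\n", max_size);
	return 0;
}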
*/ +UNIV_INTERN +void +ibuf_bitmap_page_init( +/*==================*/ + buf_block_t* block, /* in: bitmap page */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* page; + ulint byte_offset; + ulint zip_size = buf_block_get_zip_size(block); + + ut_a(ut_is_2pow(zip_size)); + + page = buf_block_get_frame(block); + fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP); + + /* Write all zeros to the bitmap */ + + if (!zip_size) { + byte_offset = UT_BITS_IN_BYTES(UNIV_PAGE_SIZE + * IBUF_BITS_PER_PAGE); + } else { + byte_offset = UT_BITS_IN_BYTES(zip_size * IBUF_BITS_PER_PAGE); + } + + memset(page + IBUF_BITMAP, 0, byte_offset); + + /* The remaining area (up to the page trailer) is uninitialized. */ + + mlog_write_initial_log_record(page, MLOG_IBUF_BITMAP_INIT, mtr); +} + +/************************************************************************* +Parses a redo log record of an ibuf bitmap page init. */ +UNIV_INTERN +byte* +ibuf_parse_bitmap_init( +/*===================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + buf_block_t* block, /* in: block or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + if (block) { + ibuf_bitmap_page_init(block, mtr); + } + + return(ptr); +} + +/************************************************************************ +Gets the desired bits for a given page from a bitmap page. */ +UNIV_INLINE +ulint +ibuf_bitmap_page_get_bits( +/*======================*/ + /* out: value of bits */ + const page_t* page, /* in: bitmap page */ + ulint page_no,/* in: page whose bits to get */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint bit, /* in: IBUF_BITMAP_FREE, + IBUF_BITMAP_BUFFERED, ... */ + mtr_t* mtr __attribute__((unused))) + /* in: mtr containing an + x-latch to the bitmap page */ +{ + ulint byte_offset; + ulint bit_offset; + ulint map_byte; + ulint value; + + ut_ad(bit < IBUF_BITS_PER_PAGE); +#if IBUF_BITS_PER_PAGE % 2 +# error "IBUF_BITS_PER_PAGE % 2 != 0" +#endif + ut_ad(ut_is_2pow(zip_size)); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + + if (!zip_size) { + bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE + + bit; + } else { + bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE + + bit; + } + + byte_offset = bit_offset / 8; + bit_offset = bit_offset % 8; + + ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE); + + map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset); + + value = ut_bit_get_nth(map_byte, bit_offset); + + if (bit == IBUF_BITMAP_FREE) { + ut_ad(bit_offset + 1 < 8); + + value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1); + } + + return(value); +} + +/************************************************************************ +Sets the desired bit for a given page in a bitmap page. */ +static +void +ibuf_bitmap_page_set_bits( +/*======================*/ + page_t* page, /* in: bitmap page */ + ulint page_no,/* in: page whose bits to set */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint bit, /* in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... 
*/ + ulint val, /* in: value to set */ + mtr_t* mtr) /* in: mtr containing an x-latch to the bitmap page */ +{ + ulint byte_offset; + ulint bit_offset; + ulint map_byte; + + ut_ad(bit < IBUF_BITS_PER_PAGE); +#if IBUF_BITS_PER_PAGE % 2 +# error "IBUF_BITS_PER_PAGE % 2 != 0" +#endif + ut_ad(ut_is_2pow(zip_size)); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a((bit != IBUF_BITMAP_BUFFERED) || (val != FALSE) + || (0 == ibuf_count_get(page_get_space_id(page), + page_no))); +#endif + if (!zip_size) { + bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE + + bit; + } else { + bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE + + bit; + } + + byte_offset = bit_offset / 8; + bit_offset = bit_offset % 8; + + ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE); + + map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset); + + if (bit == IBUF_BITMAP_FREE) { + ut_ad(bit_offset + 1 < 8); + ut_ad(val <= 3); + + map_byte = ut_bit_set_nth(map_byte, bit_offset, val / 2); + map_byte = ut_bit_set_nth(map_byte, bit_offset + 1, val % 2); + } else { + ut_ad(val <= 1); + map_byte = ut_bit_set_nth(map_byte, bit_offset, val); + } + + mlog_write_ulint(page + IBUF_BITMAP + byte_offset, map_byte, + MLOG_1BYTE, mtr); +} + +/************************************************************************ +Calculates the bitmap page number for a given page number. */ +UNIV_INLINE +ulint +ibuf_bitmap_page_no_calc( +/*=====================*/ + /* out: the bitmap page number where + the file page is mapped */ + ulint zip_size, /* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no) /* in: tablespace page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return(FSP_IBUF_BITMAP_OFFSET + + (page_no & ~(UNIV_PAGE_SIZE - 1))); + } else { + return(FSP_IBUF_BITMAP_OFFSET + + (page_no & ~(zip_size - 1))); + } +} + +/************************************************************************ +Gets the ibuf bitmap page where the bits describing a given file page are +stored. */ +static +page_t* +ibuf_bitmap_get_map_page( +/*=====================*/ + /* out: bitmap page where the file page is mapped, + that is, the bitmap page containing the descriptor + bits for the file page; the bitmap page is + x-latched */ + ulint space, /* in: space id of the file page */ + ulint page_no,/* in: page number of the file page */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + + block = buf_page_get(space, zip_size, + ibuf_bitmap_page_no_calc(zip_size, page_no), + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP); + + return(buf_block_get_frame(block)); +} + +/**************************************************************************** +Sets the free bits of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. 
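Note on the encoding used by ibuf_bitmap_page_set_bits() above: the free-space value val passed below is a 2-bit level (0..3) packed into two adjacent bitmap bits, the first holding val / 2 and the second val % 2; for example, val == 2 sets the bit pair to 1,0.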
*/ +UNIV_INLINE +void +ibuf_set_free_bits_low( +/*===================*/ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + const buf_block_t* block, /* in: index page; free bits are set if + the index is non-clustered and page + level is 0 */ + ulint val, /* in: value to set: < 4 */ + mtr_t* mtr) /* in/out: mtr */ +{ + page_t* bitmap_page; + ulint space; + ulint page_no; + + if (!page_is_leaf(buf_block_get_frame(block))) { + + return; + } + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); +#ifdef UNIV_IBUF_DEBUG +# if 0 + fprintf(stderr, + "Setting space %lu page %lu free bits to %lu should be %lu\n", + space, page_no, val, + ibuf_index_page_calc_free(zip_size, block)); +# endif + + ut_a(val <= ibuf_index_page_calc_free(zip_size, block)); +#endif /* UNIV_IBUF_DEBUG */ + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, val, mtr); +} + +/**************************************************************************** +Sets the free bit of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. */ +UNIV_INTERN +void +ibuf_set_free_bits_func( +/*====================*/ + buf_block_t* block, /* in: index page of a non-clustered index; + free bit is reset if page level is 0 */ +#ifdef UNIV_IBUF_DEBUG + ulint max_val,/* in: ULINT_UNDEFINED or a maximum + value which the bits must have before + setting; this is for debugging */ +#endif /* UNIV_IBUF_DEBUG */ + ulint val) /* in: value to set: < 4 */ +{ + mtr_t mtr; + page_t* page; + page_t* bitmap_page; + ulint space; + ulint page_no; + ulint zip_size; + + page = buf_block_get_frame(block); + + if (!page_is_leaf(page)) { + + return; + } + + mtr_start(&mtr); + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + zip_size = buf_block_get_zip_size(block); + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr); + +#ifdef UNIV_IBUF_DEBUG + if (max_val != ULINT_UNDEFINED) { + ulint old_val; + + old_val = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, &mtr); +# if 0 + if (old_val != max_val) { + fprintf(stderr, + "Ibuf: page %lu old val %lu max val %lu\n", + page_get_page_no(page), + old_val, max_val); + } +# endif + + ut_a(old_val <= max_val); + } +# if 0 + fprintf(stderr, "Setting page no %lu free bits to %lu should be %lu\n", + page_get_page_no(page), val, + ibuf_index_page_calc_free(zip_size, block)); +# endif + + ut_a(val <= ibuf_index_page_calc_free(zip_size, block)); +#endif /* UNIV_IBUF_DEBUG */ + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, val, &mtr); + mtr_commit(&mtr); +} + +/**************************************************************************** +Resets the free bits of the page in the ibuf bitmap. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to decrement or reset the bits in the bitmap in a mini-transaction +that is committed before the mini-transaction that affects the free +space. 
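The reason this ordering is safe (a one-line sketch of the argument): if the server crashes between the two mini-transactions, the recovered bits can only understate the free space, which merely makes the insert buffer decline some buffering; overstated bits could let a merge overflow the page.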
*/ +UNIV_INTERN +void +ibuf_reset_free_bits( +/*=================*/ + buf_block_t* block) /* in: index page; free bits are set to 0 + if the index is a non-clustered + non-unique, and page level is 0 */ +{ + ibuf_set_free_bits(block, 0, ULINT_UNDEFINED); +} + +/************************************************************************** +Updates the free bits for an uncompressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_low( +/*======================*/ + const buf_block_t* block, /* in: index page */ + ulint max_ins_size, /* in: value of + maximum insert size + with reorganize before + the latest operation + performed to the page */ + mtr_t* mtr) /* in/out: mtr */ +{ + ulint before; + ulint after; + + ut_a(!buf_block_get_page_zip(block)); + + before = ibuf_index_page_calc_free_bits(0, max_ins_size); + + after = ibuf_index_page_calc_free(0, block); + + /* This approach cannot be used on compressed pages, since the + computed value of "before" often does not match the current + state of the bitmap. This is because the free space may + increase or decrease when a compressed page is reorganized. */ + if (before != after) { + ibuf_set_free_bits_low(0, block, after, mtr); + } +} + +/************************************************************************** +Updates the free bits for a compressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_zip( +/*======================*/ + buf_block_t* block, /* in/out: index page */ + mtr_t* mtr) /* in/out: mtr */ +{ + page_t* bitmap_page; + ulint space; + ulint page_no; + ulint zip_size; + ulint after; + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + zip_size = buf_block_get_zip_size(block); + + ut_a(page_is_leaf(buf_block_get_frame(block))); + ut_a(zip_size); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); + + after = ibuf_index_page_calc_free_zip(zip_size, block); + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, after, mtr); +} + +/************************************************************************** +Updates the free bits for the two pages to reflect the present state. +Does this in the mtr given, which means that the latching order rules +virtually prevent any further operations until mtr is committed. +NOTE: The free bits in the insert buffer bitmap must never exceed the +free space on a page. It is safe to set the free bits in the same +mini-transaction that updated the pages. 
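Concretely, the bitmap mutex acquired below rules out the classic two-latch deadlock: without it, one thread could hold the bitmap page of block1 while waiting for that of block2, while a second thread holds block2's bitmap page and waits for block1's.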
*/ +UNIV_INTERN +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + buf_block_t* block1, /* in: index page */ + buf_block_t* block2, /* in: index page */ + mtr_t* mtr) /* in: mtr */ +{ + ulint state; + + /* As we have to x-latch two random bitmap pages, we have to acquire + the bitmap mutex to prevent a deadlock with a similar operation + performed by another OS thread. */ + + mutex_enter(&ibuf_bitmap_mutex); + + state = ibuf_index_page_calc_free(zip_size, block1); + + ibuf_set_free_bits_low(zip_size, block1, state, mtr); + + state = ibuf_index_page_calc_free(zip_size, block2); + + ibuf_set_free_bits_low(zip_size, block2, state, mtr); + + mutex_exit(&ibuf_bitmap_mutex); +} + +/************************************************************************** +Returns TRUE if the page is one of the fixed address ibuf pages. */ +UNIV_INLINE +ibool +ibuf_fixed_addr_page( +/*=================*/ + /* out: TRUE if a fixed address ibuf i/o page */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/* in: page number */ +{ + return((space == IBUF_SPACE_ID && page_no == IBUF_TREE_ROOT_PAGE_NO) + || ibuf_bitmap_page(zip_size, page_no)); +} + +/*************************************************************************** +Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==TRUE. */ +UNIV_INTERN +ibool +ibuf_page( +/*======*/ + /* out: TRUE if level 2 or level 3 page */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint page_no,/* in: page number */ + mtr_t* mtr) /* in: mtr which will contain an x-latch to the + bitmap page if the page is not one of the fixed + address ibuf pages, or NULL, in which case a new + transaction is created. */ +{ + ibool ret; + mtr_t local_mtr; + page_t* bitmap_page; + + ut_ad(!recv_no_ibuf_operations); + + if (ibuf_fixed_addr_page(space, zip_size, page_no)) { + + return(TRUE); + } else if (space != IBUF_SPACE_ID) { + + return(FALSE); + } + + ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TABLESPACE); + + if (mtr == NULL) { + mtr = &local_mtr; + mtr_start(mtr); + } + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr); + + ret = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_IBUF, mtr); + + if (mtr == &local_mtr) { + mtr_commit(mtr); + } + + return(ret); +} + +/************************************************************************ +Returns the page number field of an ibuf record. */ +static +ulint +ibuf_rec_get_page_no( +/*=================*/ + /* out: page number */ + const rec_t* rec) /* in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(ibuf_inside()); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, 1, &len); + + if (len == 1) { + /* This is of the >= 4.1.x record format */ + ut_a(trx_sys_multiple_tablespace_format); + + field = rec_get_nth_field_old(rec, 2, &len); + } else { + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + field = rec_get_nth_field_old(rec, 0, &len); + } + + ut_a(len == 4); + + return(mach_read_from_4(field)); +} + +/************************************************************************ +Returns the space id field of an ibuf record. For < 4.1.x format records +returns 0. 
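For reference, the two on-disk ibuf record layouts that this function and ibuf_rec_get_page_no() above decode, as implied by the field accesses (a sketch, not a normative definition):
    < 4.1.x:  field 0 = page number (4 bytes); field 1 = type info (n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE bytes); fields 2.. = the buffered entry.
    >= 4.1.x: field 0 = space id (4 bytes); field 1 = marker byte 0 (1 byte); field 2 = page number (4 bytes); field 3 = type info; fields 4.. = the buffered entry.
The formats are told apart by the length of field 1: exactly one byte means >= 4.1.x.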
*/ +static +ulint +ibuf_rec_get_space( +/*===============*/ + /* out: space id */ + const rec_t* rec) /* in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(ibuf_inside()); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, 1, &len); + + if (len == 1) { + /* This is of the >= 4.1.x record format */ + + ut_a(trx_sys_multiple_tablespace_format); + field = rec_get_nth_field_old(rec, 0, &len); + ut_a(len == 4); + + return(mach_read_from_4(field)); + } + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + return(0); +} + +/************************************************************************ +Creates a dummy index for inserting a record to a non-clustered index. +*/ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + /* out: dummy index */ + ulint n, /* in: number of fields */ + ibool comp) /* in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + + table = dict_mem_table_create("IBUF_DUMMY", + DICT_HDR_SPACE, n, + comp ? DICT_TF_COMPACT : 0); + + index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY", + DICT_HDR_SPACE, 0, n); + + index->table = table; + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + return(index); +} +/************************************************************************ +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*=====================*/ + dict_index_t* index, /* in: dummy index */ + const dtype_t* type, /* in: the data type of the column */ + ulint len) /* in: length of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, NULL, NULL, + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type)); + dict_index_add_col(index, index->table, + dict_table_get_nth_col(index->table, i), len); +} +/************************************************************************ +Deallocates a dummy index for inserting a record to a non-clustered index. +*/ +static +void +ibuf_dummy_index_free( +/*==================*/ + dict_index_t* index) /* in: dummy index */ +{ + dict_table_t* table = index->table; + + dict_mem_index_free(index); + dict_mem_table_free(table); +} + +/************************************************************************* +Builds the entry to insert into a non-clustered index when we have the +corresponding record in an ibuf index. */ +UNIV_INLINE +dtuple_t* +ibuf_build_entry_pre_4_1_x( +/*=======================*/ + /* out, own: entry to insert to + a non-clustered index; NOTE that + as we copy pointers to fields in + ibuf_rec, the caller must hold a + latch to the ibuf_rec page as long + as the entry is used! 
*/ + const rec_t* ibuf_rec, /* in: record in an insert buffer */ + mem_heap_t* heap, /* in: heap where built */ + dict_index_t** pindex) /* out, own: dummy index that + describes the entry */ +{ + ulint i; + ulint len; + const byte* types; + dtuple_t* tuple; + ulint n_fields; + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; + tuple = dtuple_create(heap, n_fields); + types = rec_get_nth_field_old(ibuf_rec, 1, &len); + + ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + const byte* data; + dfield_t* field; + + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); + + dfield_set_data(field, data, len); + + dtype_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } + + *pindex = ibuf_dummy_index_create(n_fields, FALSE); + + return(tuple); +} + +/************************************************************************* +Builds the entry to insert into a non-clustered index when we have the +corresponding record in an ibuf index. */ +static +dtuple_t* +ibuf_build_entry_from_ibuf_rec( +/*===========================*/ + /* out, own: entry to insert to + a non-clustered index; NOTE that + as we copy pointers to fields in + ibuf_rec, the caller must hold a + latch to the ibuf_rec page as long + as the entry is used! */ + const rec_t* ibuf_rec, /* in: record in an insert buffer */ + mem_heap_t* heap, /* in: heap where built */ + dict_index_t** pindex) /* out, own: dummy index that + describes the entry */ +{ + dtuple_t* tuple; + dfield_t* field; + ulint n_fields; + const byte* types; + const byte* data; + ulint len; + ulint i; + dict_index_t* index; + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); + + if (len > 1) { + /* This is a < 4.1.x format record */ + + return(ibuf_build_entry_pre_4_1_x(ibuf_rec, heap, pindex)); + } + + /* This is a >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + ut_a(*data == 0); + ut_a(rec_get_n_fields_old(ibuf_rec) > 4); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; + + tuple = dtuple_create(heap, n_fields); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + index = ibuf_dummy_index_create( + n_fields, len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + len--; + ut_a(*types == 0); + types++; + } + + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); + + dfield_set_data(field, data, len); + + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + ibuf_dummy_index_add_col(index, dfield_get_type(field), len); + } + + /* Prevent a ut_ad() failure in page_zip_write_rec() by + adding system columns to the dummy table pointed to by the + dummy secondary index. The insert buffer is only used for + secondary indexes, whose records never contain any system + columns, such as DB_TRX_ID. */ + ut_d(dict_table_add_system_columns(index->table, index->table->heap)); + + *pindex = index; + + return(tuple); +} + +/************************************************************************ +Returns the space taken by a stored non-clustered index entry if converted to +an index record.
*/ +static +ulint +ibuf_rec_get_volume( +/*================*/ + /* out: size of index record in bytes + + an upper limit of the space taken in the + page directory */ + const rec_t* ibuf_rec)/* in: ibuf record */ +{ + dtype_t dtype; + ibool new_format = FALSE; + ulint data_size = 0; + ulint n_fields; + const byte* types; + const byte* data; + ulint len; + ulint i; + + ut_ad(ibuf_inside()); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, 1, &len); + + if (len > 1) { + /* < 4.1.x format record */ + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields_old(ibuf_rec) - 2; + + types = rec_get_nth_field_old(ibuf_rec, 1, &len); + + ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } else { + /* >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + ut_a(*data == 0); + + types = rec_get_nth_field_old(ibuf_rec, 3, &len); + + ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1); + if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + /* compact record format */ + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + dtuple_t* entry = ibuf_build_entry_from_ibuf_rec( + ibuf_rec, heap, &dummy_index); + volume = rec_get_converted_size(dummy_index, entry, 0); + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + return(volume + page_dir_calc_reserved_space(1)); + } + + n_fields = rec_get_n_fields_old(ibuf_rec) - 4; + + new_format = TRUE; + } + + for (i = 0; i < n_fields; i++) { + if (new_format) { + data = rec_get_nth_field_old(ibuf_rec, i + 4, &len); + + dtype_new_read_for_order_and_null_size( + &dtype, types + i + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + } else { + data = rec_get_nth_field_old(ibuf_rec, i + 2, &len); + + dtype_read_for_order_and_null_size( + &dtype, types + i + * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } + + if (len == UNIV_SQL_NULL) { + data_size += dtype_get_sql_null_size(&dtype); + } else { + data_size += len; + } + } + + return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0) + + page_dir_calc_reserved_space(1)); +} + +/************************************************************************* +Builds the tuple to insert to an ibuf tree when we have an entry for a +non-clustered index. */ +static +dtuple_t* +ibuf_entry_build( +/*=============*/ + /* out, own: entry to insert into an ibuf + index tree; NOTE that the original entry + must be kept because we copy pointers to its + fields */ + dict_index_t* index, /* in: non-clustered index */ + const dtuple_t* entry, /* in: entry for a non-clustered index */ + ulint space, /* in: space id */ + ulint page_no,/* in: index page number where entry should + be inserted */ + mem_heap_t* heap) /* in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + const dfield_t* entry_field; + ulint n_fields; + byte* buf; + byte* buf2; + ulint i; + + /* Starting from 4.1.x, we have to build a tuple whose + (1) first field is the space id, + (2) the second field is a single marker byte (0) to tell that this + is a new format record, + (3) the third contains the page number, and + (4) the fourth contains the relevant type information of each data + field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is + (a) 0 for b-trees in the old format, and + (b) 1 for b-trees in the compact format, the first byte of the field + being the marker (0); + (5) and the rest of the fields are copied from entry (see the example + below).
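For example (an illustrative sketch): buffering an entry ('aa', 'bb') of a two-column secondary index destined for page 17 of space 5 yields a six-field ibuf tuple: field 0 = 00 00 00 05, field 1 = 00, field 2 = 00 00 00 11 (17 decimal), field 3 = the type info bytes written below, and fields 4 and 5 = 'aa' and 'bb'.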
All fields + in the tuple are ordered like the type binary in our insert buffer + tree. */ + + n_fields = dtuple_get_n_fields(entry); + + tuple = dtuple_create(heap, n_fields + 4); + + /* Store the space id in tuple */ + + field = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* Store the marker byte field in tuple */ + + field = dtuple_get_nth_field(tuple, 1); + + buf = mem_heap_alloc(heap, 1); + + /* We set the marker byte zero */ + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, 2); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + /* Store the type info in buf2, and add the fields from entry to + tuple */ + buf2 = mem_heap_alloc(heap, n_fields + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + dict_table_is_comp(index->table)); + if (dict_table_is_comp(index->table)) { + *buf2++ = 0; /* write the compact format indicator */ + } + for (i = 0; i < n_fields; i++) { + ulint fixed_len; + const dict_field_t* ifield; + + /* We add 4 below because we have the 4 extra fields at the + start of an ibuf record */ + + field = dtuple_get_nth_field(tuple, i + 4); + entry_field = dtuple_get_nth_field(entry, i); + dfield_copy(field, entry_field); + + ifield = dict_index_get_nth_field(index, i); + /* Prefix index columns of fixed-length columns are of + fixed length. However, in the function call below, + dfield_get_type(entry_field) contains the fixed length + of the column in the clustered index. Replace it with + the fixed length of the secondary index column. */ + fixed_len = ifield->fixed_len; + +#ifdef UNIV_DEBUG + if (fixed_len) { + /* dict_index_add_col() should guarantee these */ + ut_ad(fixed_len <= (ulint) + dfield_get_type(entry_field)->len); + if (ifield->prefix_len) { + ut_ad(ifield->prefix_len == fixed_len); + } else { + ut_ad(fixed_len == (ulint) + dfield_get_type(entry_field)->len); + } + } +#endif /* UNIV_DEBUG */ + + dtype_new_store_for_order_and_null_size( + buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE, + dfield_get_type(entry_field), fixed_len); + } + + /* Store the type info in buf2 to field 3 of tuple */ + + field = dtuple_get_nth_field(tuple, 3); + + if (dict_table_is_comp(index->table)) { + buf2--; + } + + dfield_set_data(field, buf2, n_fields + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + + dict_table_is_comp(index->table)); + /* Set all the types in the new tuple binary */ + + dtuple_set_types_binary(tuple, n_fields + 4); + + return(tuple); +} + +/************************************************************************* +Builds a search tuple used to search buffered inserts for an index page. 
+This is for < 4.1.x format records */ +static +dtuple_t* +ibuf_search_tuple_build( +/*====================*/ + /* out, own: search tuple */ + ulint space, /* in: space id */ + ulint page_no,/* in: index page number */ + mem_heap_t* heap) /* in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + + ut_a(space == 0); + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + tuple = dtuple_create(heap, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + dtuple_set_types_binary(tuple, 1); + + return(tuple); +} + +/************************************************************************* +Builds a search tuple used to search buffered inserts for an index page. +This is for >= 4.1.x format records. */ +static +dtuple_t* +ibuf_new_search_tuple_build( +/*========================*/ + /* out, own: search tuple */ + ulint space, /* in: space id */ + ulint page_no,/* in: index page number */ + mem_heap_t* heap) /* in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + + ut_a(trx_sys_multiple_tablespace_format); + + tuple = dtuple_create(heap, 3); + + /* Store the space id in tuple */ + + field = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* Store the new format record marker byte */ + + field = dtuple_get_nth_field(tuple, 1); + + buf = mem_heap_alloc(heap, 1); + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, 2); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + dtuple_set_types_binary(tuple, 3); + + return(tuple); +} + +/************************************************************************* +Checks if there are enough pages in the free list of the ibuf tree that we +dare to start a pessimistic insert to the insert buffer. */ +UNIV_INLINE +ibool +ibuf_data_enough_free_for_insert(void) +/*==================================*/ + /* out: TRUE if enough free pages in list */ +{ + ut_ad(mutex_own(&ibuf_mutex)); + + /* We want a big margin of free pages, because a B-tree can sometimes + grow in size also if records are deleted from it, as the node pointers + can change, and we must make sure that we are able to delete the + inserts buffered for pages that we read to the buffer pool, without + any risk of running out of free space in the insert buffer. */ + + return(ibuf->free_list_len >= (ibuf->size / 2) + 3 * ibuf->height); +} + +/************************************************************************* +Checks if there are enough pages in the free list of the ibuf tree that we +should remove them and free to the file space management. */ +UNIV_INLINE +ibool +ibuf_data_too_much_free(void) +/*=========================*/ + /* out: TRUE if enough free pages in list */ +{ + ut_ad(mutex_own(&ibuf_mutex)); + + return(ibuf->free_list_len >= 3 + (ibuf->size / 2) + 3 * ibuf->height); +} + +/************************************************************************* +Allocates a new page from the ibuf file segment and adds it to the free +list. 
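To make the two predicates above concrete: with ibuf->size == 100 pages and ibuf->height == 3, ibuf_data_enough_free_for_insert() requires free_list_len >= 100/2 + 3*3 == 59, while ibuf_data_too_much_free() triggers at free_list_len >= 3 + 100/2 + 3*3 == 62, so the free list is kept within a narrow band by the allocation below and the removal that follows it.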
*/ +static +ulint +ibuf_add_free_page(void) +/*====================*/ + /* out: DB_SUCCESS, or DB_STRONG_FAIL + if no space left */ +{ + mtr_t mtr; + page_t* header_page; + ulint flags; + ulint zip_size; + ulint page_no; + page_t* page; + page_t* root; + page_t* bitmap_page; + + mtr_start(&mtr); + + /* Acquire the fsp latch before the ibuf header, obeying the latching + order */ + mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr); + zip_size = dict_table_flags_to_zip_size(flags); + + header_page = ibuf_header_page_get(&mtr); + + /* Allocate a new page: NOTE that if the page has been a part of a + non-clustered index which has subsequently been dropped, then the + page may have buffered inserts in the insert buffer, and these + should be deleted from there. These get deleted when the page + allocation creates the page in buffer. Thus the call below may end + up calling the insert buffer routines and, as we yet have no latches + to insert buffer tree pages, these routines can run without a risk + of a deadlock. This is the reason why we created a special ibuf + header page apart from the ibuf tree. */ + + page_no = fseg_alloc_free_page( + header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP, + &mtr); + + if (page_no == FIL_NULL) { + mtr_commit(&mtr); + + return(DB_STRONG_FAIL); + } + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); + + + page = buf_block_get_frame(block); + } + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + + root = ibuf_tree_root_get(&mtr); + + /* Add the page to the free list and update the ibuf size data */ + + flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST, + MLOG_2BYTES, &mtr); + + ibuf->seg_size++; + ibuf->free_list_len++; + + /* Set the bit indicating that this page is now an ibuf tree page + (level 2 page) */ + + bitmap_page = ibuf_bitmap_get_map_page( + IBUF_SPACE_ID, page_no, zip_size, &mtr); + + ibuf_bitmap_page_set_bits( + bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, TRUE, &mtr); + + mtr_commit(&mtr); + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + return(DB_SUCCESS); +} + +/************************************************************************* +Removes a page from the free list and frees it to the fsp system. 
*/ +static +void +ibuf_remove_free_page(void) +/*=======================*/ +{ + mtr_t mtr; + mtr_t mtr2; + page_t* header_page; + ulint flags; + ulint zip_size; + ulint page_no; + page_t* page; + page_t* root; + page_t* bitmap_page; + + mtr_start(&mtr); + + /* Acquire the fsp latch before the ibuf header, obeying the latching + order */ + mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr); + zip_size = dict_table_flags_to_zip_size(flags); + + header_page = ibuf_header_page_get(&mtr); + + /* Prevent pessimistic inserts into insert buffer trees for a while */ + mutex_enter(&ibuf_pessimistic_insert_mutex); + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + + if (!ibuf_data_too_much_free()) { + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + mutex_exit(&ibuf_pessimistic_insert_mutex); + + mtr_commit(&mtr); + + return; + } + + mtr_start(&mtr2); + + root = ibuf_tree_root_get(&mtr2); + + page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + &mtr2).page; + + /* NOTE that we must release the latch on the ibuf tree root + because in fseg_free_page we access level 1 pages, and the root + is a level 2 page. */ + + mtr_commit(&mtr2); + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + /* Since pessimistic inserts were prevented, we know that the + page is still in the free list. NOTE that deletes may also take + pages from the free list, but they take them from the start, and + the free list was so long that they cannot have taken the last + page from it. */ + + fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_SPACE_ID, page_no, &mtr); + +#ifdef UNIV_DEBUG_FILE_ACCESSES + buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no); +#endif + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + + root = ibuf_tree_root_get(&mtr); + + ut_ad(page_no == flst_get_last(root + PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST, &mtr).page); + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + + page = buf_block_get_frame(block); + } + + /* Remove the page from the free list and update the ibuf size data */ + + flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + + ibuf->seg_size--; + ibuf->free_list_len--; + + mutex_exit(&ibuf_pessimistic_insert_mutex); + + /* Set the bit indicating that this page is no longer an ibuf tree page + (level 2 page) */ + + bitmap_page = ibuf_bitmap_get_map_page( + IBUF_SPACE_ID, page_no, zip_size, &mtr); + + ibuf_bitmap_page_set_bits( + bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr); + +#ifdef UNIV_DEBUG_FILE_ACCESSES + buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no); +#endif + mtr_commit(&mtr); + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); +} + +/*************************************************************************** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call.
*/ +UNIV_INTERN +void +ibuf_free_excess_pages(void) +/*========================*/ +{ + ulint i; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(fil_space_get_latch(IBUF_SPACE_ID, NULL), + RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(rw_lock_get_x_lock_count( + fil_space_get_latch(IBUF_SPACE_ID, NULL)) == 1); + + ut_ad(!ibuf_inside()); + + /* NOTE: We require that the thread did not own the latch before, + because then we know that we can obey the correct latching order + for ibuf latches */ + + if (!ibuf) { + /* Not yet initialized; not sure if this is possible, but + does no harm to check for it. */ + + return; + } + + /* Free at most a few pages at a time, so that we do not delay the + requested service too much */ + + for (i = 0; i < 4; i++) { + + mutex_enter(&ibuf_mutex); + + if (!ibuf_data_too_much_free()) { + + mutex_exit(&ibuf_mutex); + + return; + } + + mutex_exit(&ibuf_mutex); + + ibuf_remove_free_page(); + } +} + +/************************************************************************* +Reads page numbers from a leaf in an ibuf tree. */ +static +ulint +ibuf_get_merge_page_nos( +/*====================*/ + /* out: a lower limit for the combined volume + of records which will be merged */ + ibool contract,/* in: TRUE if this function is called to + contract the tree, FALSE if this is called + when a single page becomes full and we look + if it pays to read also nearby pages */ + rec_t* rec, /* in: record from which we read up and down + in the chain of records */ + ulint* space_ids,/* in/out: space id's of the pages */ + ib_int64_t* space_versions,/* in/out: tablespace version + timestamps; used to prevent reading in old + pages after DISCARD + IMPORT tablespace */ + ulint* page_nos,/* in/out: buffer for at least + IBUF_MAX_N_PAGES_MERGED many page numbers; + the page numbers are in an ascending order */ + ulint* n_stored)/* out: number of page numbers stored to + page_nos in this function */ +{ + ulint prev_page_no; + ulint prev_space_id; + ulint first_page_no; + ulint first_space_id; + ulint rec_page_no; + ulint rec_space_id; + ulint sum_volumes; + ulint volume_for_page; + ulint rec_volume; + ulint limit; + ulint n_pages; + + *n_stored = 0; + + limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool->curr_size / 4); + + if (page_rec_is_supremum(rec)) { + + rec = page_rec_get_prev(rec); + } + + if (page_rec_is_infimum(rec)) { + + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return(0); + } + + first_page_no = ibuf_rec_get_page_no(rec); + first_space_id = ibuf_rec_get_space(rec); + n_pages = 0; + prev_page_no = 0; + prev_space_id = 0; + + /* Go backwards from the first rec until we reach the border of the + 'merge area', or the page start or the limit of storeable pages is + reached */ + + while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) { + + rec_page_no = ibuf_rec_get_page_no(rec); + rec_space_id = ibuf_rec_get_space(rec); + + if (rec_space_id != first_space_id + || (rec_page_no / IBUF_MERGE_AREA) + != (first_page_no / IBUF_MERGE_AREA)) { + + break; + } + + if (rec_page_no != prev_page_no + || rec_space_id != prev_space_id) { + n_pages++; + } + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + rec = page_rec_get_prev(rec); + } + + rec = page_rec_get_next(rec); + + /* At the loop start there is no prev page; we mark this with a pair + of space id, page no (0, 0) for which there can never be entries in + the insert buffer */ + + prev_page_no = 0; + prev_space_id = 0; + sum_volumes = 0; + volume_for_page = 0; + + while 
(*n_stored < limit) { + if (page_rec_is_supremum(rec)) { + /* When no more records available, mark this with + another 'impossible' pair of space id, page no */ + rec_page_no = 1; + rec_space_id = 0; + } else { + rec_page_no = ibuf_rec_get_page_no(rec); + rec_space_id = ibuf_rec_get_space(rec); + ut_ad(rec_page_no > IBUF_TREE_ROOT_PAGE_NO); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED); +#endif + if ((rec_space_id != prev_space_id + || rec_page_no != prev_page_no) + && (prev_space_id != 0 || prev_page_no != 0)) { + + if ((prev_page_no == first_page_no + && prev_space_id == first_space_id) + || contract + || (volume_for_page + > ((IBUF_MERGE_THRESHOLD - 1) + * 4 * UNIV_PAGE_SIZE + / IBUF_PAGE_SIZE_PER_FREE_SPACE) + / IBUF_MERGE_THRESHOLD)) { + + space_ids[*n_stored] = prev_space_id; + space_versions[*n_stored] + = fil_space_get_version(prev_space_id); + page_nos[*n_stored] = prev_page_no; + + (*n_stored)++; + + sum_volumes += volume_for_page; + } + + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { + + break; + } + + volume_for_page = 0; + } + + if (rec_page_no == 1 && rec_space_id == 0) { + /* Supremum record */ + + break; + } + + rec_volume = ibuf_rec_get_volume(rec); + + volume_for_page += rec_volume; + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + rec = page_rec_get_next(rec); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif +#if 0 + fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n", + *n_stored, sum_volumes); +#endif + return(sum_volumes); +} + +/************************************************************************* +Contracts insert buffer trees by reading pages to the buffer pool. */ +static +ulint +ibuf_contract_ext( +/*==============*/ + /* out: a lower limit for the combined size in bytes + of entries which will be merged from ibuf trees to the + pages read, 0 if ibuf is empty */ + ulint* n_pages,/* out: number of pages to which merged */ + ibool sync) /* in: TRUE if the caller wants to wait for the + issued read with the highest tablespace address + to complete */ +{ + btr_pcur_t pcur; + ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; + ulint n_stored; + ulint sum_sizes; + mtr_t mtr; + + *n_pages = 0; + ut_ad(!ibuf_inside()); + + mutex_enter(&ibuf_mutex); + + if (ibuf->empty) { +ibuf_is_empty: + mutex_exit(&ibuf_mutex); + + return(0); + } + + mtr_start(&mtr); + + ibuf_enter(); + + /* Open a cursor to a randomly chosen leaf of the tree, at a random + position within the leaf */ + + btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr); + + if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) { + /* When the ibuf tree is emptied completely, the last record + is removed using an optimistic delete and ibuf_size_update + is not called, causing ibuf->empty to remain FALSE. If we do + not reset it to TRUE here then database shutdown will hang + in the loop in ibuf_contract_for_n_pages. 
*/ + + ibuf->empty = TRUE; + + ibuf_exit(); + + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + goto ibuf_is_empty; + } + + mutex_exit(&ibuf_mutex); + + sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), + space_ids, space_versions, + page_nos, &n_stored); +#if 0 /* defined UNIV_IBUF_DEBUG */ + fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n", + sync, n_stored, sum_sizes); +#endif + ibuf_exit(); + + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos, + n_stored); + *n_pages = n_stored; + + return(sum_sizes + 1); +} + +/************************************************************************* +Contracts insert buffer trees by reading pages to the buffer pool. */ +UNIV_INTERN +ulint +ibuf_contract( +/*==========*/ + /* out: a lower limit for the combined size in bytes + of entries which will be merged from ibuf trees to the + pages read, 0 if ibuf is empty */ + ibool sync) /* in: TRUE if the caller wants to wait for the + issued read with the highest tablespace address + to complete */ +{ + ulint n_pages; + + return(ibuf_contract_ext(&n_pages, sync)); +} + +/************************************************************************* +Contracts insert buffer trees by reading pages to the buffer pool. */ +UNIV_INTERN +ulint +ibuf_contract_for_n_pages( +/*======================*/ + /* out: a lower limit for the combined size in bytes + of entries which will be merged from ibuf trees to the + pages read, 0 if ibuf is empty */ + ibool sync, /* in: TRUE if the caller wants to wait for the + issued read with the highest tablespace address + to complete */ + ulint n_pages)/* in: try to read at least this many pages to + the buffer pool and merge the ibuf contents to + them */ +{ + ulint sum_bytes = 0; + ulint sum_pages = 0; + ulint n_bytes; + ulint n_pag2; + + while (sum_pages < n_pages) { + n_bytes = ibuf_contract_ext(&n_pag2, sync); + + if (n_bytes == 0) { + return(sum_bytes); + } + + sum_bytes += n_bytes; + sum_pages += n_pag2; + } + + return(sum_bytes); +} + +/************************************************************************* +Contract insert buffer trees after insert if they are too big. */ +UNIV_INLINE +void +ibuf_contract_after_insert( +/*=======================*/ + ulint entry_size) /* in: size of a record which was inserted + into an ibuf tree */ +{ + ibool sync; + ulint sum_sizes; + ulint size; + + mutex_enter(&ibuf_mutex); + + if (!srv_ibuf_active_contract) { + if (ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) { + mutex_exit(&ibuf_mutex); + + return; + } + } + + sync = FALSE; + + if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_ON_INSERT_SYNC) { + + sync = TRUE; + } + + mutex_exit(&ibuf_mutex); + + /* Contract at least entry_size many bytes */ + sum_sizes = 0; + size = 1; + + while ((size > 0) && (sum_sizes < entry_size)) { + + size = ibuf_contract(sync); + sum_sizes += size; + } +} + +/************************************************************************* +Gets an upper limit for the combined size of entries buffered in the insert +buffer for a given page. 
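Returning UNIV_PAGE_SIZE in the give-up cases below is deliberately pessimistic: no page has that much free space, so the caller (ibuf_insert_low) will conclude that the already buffered entries plus the new one cannot be guaranteed to fit, and will refuse to buffer more changes for this page.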
*/ +static +ulint +ibuf_get_volume_buffered( +/*=====================*/ + /* out: upper limit for the volume of + buffered inserts for the index page, in bytes; + we may also return UNIV_PAGE_SIZE, if the + entries for the index page span on several + pages in the insert buffer */ + btr_pcur_t* pcur, /* in: pcur positioned at a place in an + insert buffer tree where we would insert an + entry for the index page whose number is + page_no, latch mode has to be BTR_MODIFY_PREV + or BTR_MODIFY_TREE */ + ulint space, /* in: space id */ + ulint page_no,/* in: page number of an index page */ + mtr_t* mtr) /* in: mtr */ +{ + ulint volume; + rec_t* rec; + page_t* page; + ulint prev_page_no; + page_t* prev_page; + ulint next_page_no; + page_t* next_page; + + ut_a(trx_sys_multiple_tablespace_format); + + ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) + || (pcur->latch_mode == BTR_MODIFY_TREE)); + + /* Count the volume of records earlier in the alphabetical order than + pcur */ + + volume = 0; + + rec = btr_pcur_get_rec(pcur); + page = page_align(rec); + + if (page_rec_is_supremum(rec)) { + rec = page_rec_get_prev(rec); + } + + for (;;) { + if (page_rec_is_infimum(rec)) { + + break; + } + + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { + + goto count_later; + } + + volume += ibuf_rec_get_volume(rec); + + rec = page_rec_get_prev(rec); + } + + /* Look at the previous page */ + + prev_page_no = btr_page_get_prev(page, mtr); + + if (prev_page_no == FIL_NULL) { + + goto count_later; + } + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + + prev_page = buf_block_get_frame(block); + } + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_next(prev_page, mtr) + == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + rec = page_get_supremum_rec(prev_page); + rec = page_rec_get_prev(rec); + + for (;;) { + if (page_rec_is_infimum(rec)) { + + /* We cannot go to yet a previous page, because we + do not have the x-latch on it, and cannot acquire one + because of the latching order: we have to give up */ + + return(UNIV_PAGE_SIZE); + } + + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { + + goto count_later; + } + + volume += ibuf_rec_get_volume(rec); + + rec = page_rec_get_prev(rec); + } + +count_later: + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_supremum(rec)) { + rec = page_rec_get_next(rec); + } + + for (;;) { + if (page_rec_is_supremum(rec)) { + + break; + } + + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { + + return(volume); + } + + volume += ibuf_rec_get_volume(rec); + + rec = page_rec_get_next(rec); + } + + /* Look at the next page */ + + next_page_no = btr_page_get_next(page, mtr); + + if (next_page_no == FIL_NULL) { + + return(volume); + } + + { + buf_block_t* block; + + block = buf_page_get( + IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH, mtr); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + + + next_page = buf_block_get_frame(block); + } + +#ifdef UNIV_BTR_DEBUG + ut_a(btr_page_get_prev(next_page, mtr) == page_get_page_no(page)); +#endif /* UNIV_BTR_DEBUG */ + + rec = page_get_infimum_rec(next_page); + rec = page_rec_get_next(rec); + + for (;;) { + if (page_rec_is_supremum(rec)) { + + /* We give up */ + + return(UNIV_PAGE_SIZE); + } + + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { + + return(volume); + } + + volume += ibuf_rec_get_volume(rec); + + 
rec = page_rec_get_next(rec); + } +} + +/************************************************************************* +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ +UNIV_INTERN +void +ibuf_update_max_tablespace_id(void) +/*===============================*/ +{ + ulint max_space_id; + const rec_t* rec; + const byte* field; + ulint len; + btr_pcur_t pcur; + mtr_t mtr; + + ut_a(!dict_table_is_comp(ibuf->index->table)); + + ibuf_enter(); + + mtr_start(&mtr); + + btr_pcur_open_at_index_side( + FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + + btr_pcur_move_to_prev(&pcur, &mtr); + + if (btr_pcur_is_before_first_on_page(&pcur)) { + /* The tree is empty */ + + max_space_id = 0; + } else { + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, 0, &len); + + ut_a(len == 4); + + max_space_id = mach_read_from_4(field); + } + + mtr_commit(&mtr); + ibuf_exit(); + + /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ + + fil_set_max_space_id_if_bigger(max_space_id); +} + +/************************************************************************* +Makes an index insert to the insert buffer, instead of directly to the disk +page, if this is possible. */ +static +ulint +ibuf_insert_low( +/*============*/ + /* out: DB_SUCCESS, DB_FAIL, DB_STRONG_FAIL */ + ulint mode, /* in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */ + const dtuple_t* entry, /* in: index entry to insert */ + ulint entry_size, + /* in: rec_get_converted_size(index, entry) */ + dict_index_t* index, /* in: index where to insert; must not be + unique or clustered */ + ulint space, /* in: space id where to insert */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint page_no,/* in: page number where to insert */ + que_thr_t* thr) /* in: query thread */ +{ + big_rec_t* dummy_big_rec; + btr_pcur_t pcur; + btr_cur_t* cursor; + dtuple_t* ibuf_entry; + mem_heap_t* heap; + ulint buffered; + rec_t* ins_rec; + ibool old_bit_value; + page_t* bitmap_page; + page_t* root; + ulint err; + ibool do_merge; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED]; + ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; + ulint n_stored; + ulint bits; + mtr_t mtr; + mtr_t bitmap_mtr; + + ut_a(!dict_index_is_clust(index)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(ut_is_2pow(zip_size)); + + ut_a(trx_sys_multiple_tablespace_format); + + do_merge = FALSE; + + mutex_enter(&ibuf_mutex); + + if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) { + /* Insert buffer is now too big, contract it but do not try + to insert */ + + mutex_exit(&ibuf_mutex); + +#ifdef UNIV_IBUF_DEBUG + fputs("Ibuf too big\n", stderr); +#endif + /* Use synchronous contract (== TRUE) */ + ibuf_contract(TRUE); + + return(DB_STRONG_FAIL); + } + + mutex_exit(&ibuf_mutex); + + if (mode == BTR_MODIFY_TREE) { + mutex_enter(&ibuf_pessimistic_insert_mutex); + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + + while (!ibuf_data_enough_free_for_insert()) { + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + mutex_exit(&ibuf_pessimistic_insert_mutex); + + err = ibuf_add_free_page(); + + if (err == DB_STRONG_FAIL) { + + return(err); + } + + mutex_enter(&ibuf_pessimistic_insert_mutex); + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + } + } else { + ibuf_enter(); + } + + heap = mem_heap_create(512); + + /* Build the entry which contains the space id and the page number as + the first fields and the type information for other fields, and which + 
will be inserted to the insert buffer. */ + + ibuf_entry = ibuf_entry_build(index, entry, space, page_no, heap); + + /* Open a cursor to the insert buffer tree to calculate if we can add + the new entry to it without exceeding the free space limit for the + page. */ + + mtr_start(&mtr); + + btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr); + + /* Find out the volume of already buffered inserts for the same index + page */ + buffered = ibuf_get_volume_buffered(&pcur, space, page_no, &mtr); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a((buffered == 0) || ibuf_count_get(space, page_no)); +#endif + mtr_start(&bitmap_mtr); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, + zip_size, &bitmap_mtr); + + /* We check if the index page is suitable for buffered entries */ + + if (buf_page_peek(space, page_no) + || lock_rec_expl_exist_on_page(space, page_no)) { + err = DB_STRONG_FAIL; + + mtr_commit(&bitmap_mtr); + + goto function_exit; + } + + bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, &bitmap_mtr); + + if (buffered + entry_size + page_dir_calc_reserved_space(1) + > ibuf_index_page_calc_free_from_bits(zip_size, bits)) { + mtr_commit(&bitmap_mtr); + + /* It may not fit */ + err = DB_STRONG_FAIL; + + do_merge = TRUE; + + ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), + space_ids, space_versions, + page_nos, &n_stored); + goto function_exit; + } + + /* Set the bitmap bit denoting that the insert buffer contains + buffered entries for this index page, if the bit is not set yet */ + + old_bit_value = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, &bitmap_mtr); + + if (!old_bit_value) { + ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, TRUE, + &bitmap_mtr); + } + + mtr_commit(&bitmap_mtr); + + cursor = btr_pcur_get_btr_cur(&pcur); + + if (mode == BTR_MODIFY_PREV) { + err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + if (err == DB_SUCCESS) { + /* Update the page max trx id field */ + page_update_max_trx_id(btr_cur_get_block(cursor), NULL, + thr_get_trx(thr)->id); + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* We acquire an x-latch to the root page before the insert, + because a pessimistic insert releases the tree x-latch, + which would cause the x-latching of the root after that to + break the latching order. 
*/ + + root = ibuf_tree_root_get(&mtr); + + err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG, + cursor, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + if (err == DB_SUCCESS) { + /* Update the page max trx id field */ + page_update_max_trx_id(btr_cur_get_block(cursor), NULL, + thr_get_trx(thr)->id); + } + + ibuf_size_update(root, &mtr); + } + +function_exit: +#ifdef UNIV_IBUF_COUNT_DEBUG + if (err == DB_SUCCESS) { + fprintf(stderr, + "Incrementing ibuf count of space %lu page %lu\n" + "from %lu by 1\n", space, page_no, + ibuf_count_get(space, page_no)); + + ibuf_count_set(space, page_no, + ibuf_count_get(space, page_no) + 1); + } +#endif + if (mode == BTR_MODIFY_TREE) { + + mutex_exit(&ibuf_mutex); + mutex_exit(&ibuf_pessimistic_insert_mutex); + } + + mtr_commit(&mtr); + btr_pcur_close(&pcur); + ibuf_exit(); + + mem_heap_free(heap); + + if (err == DB_SUCCESS) { + mutex_enter(&ibuf_mutex); + + ibuf->empty = FALSE; + ibuf->n_inserts++; + + mutex_exit(&ibuf_mutex); + + if (mode == BTR_MODIFY_TREE) { + ibuf_contract_after_insert(entry_size); + } + } + + if (do_merge) { +#ifdef UNIV_IBUF_DEBUG + ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif + buf_read_ibuf_merge_pages(FALSE, space_ids, space_versions, + page_nos, n_stored); + } + + return(err); +} + +/************************************************************************* +Makes an index insert to the insert buffer, instead of directly to the disk +page, if this is possible. Does not do insert if the index is clustered +or unique. */ +UNIV_INTERN +ibool +ibuf_insert( +/*========*/ + /* out: TRUE if success */ + const dtuple_t* entry, /* in: index entry to insert */ + dict_index_t* index, /* in: index where to insert */ + ulint space, /* in: space id where to insert */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint page_no,/* in: page number where to insert */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + ulint entry_size; + + ut_a(trx_sys_multiple_tablespace_format); + ut_ad(dtuple_check_typed(entry)); + ut_ad(ut_is_2pow(zip_size)); + + ut_a(!dict_index_is_clust(index)); + + switch (UNIV_EXPECT(ibuf_use, IBUF_USE_INSERT)) { + case IBUF_USE_NONE: + return(FALSE); + case IBUF_USE_INSERT: + goto do_insert; + case IBUF_USE_COUNT: + break; + } + + ut_error; /* unknown value of ibuf_use */ + +do_insert: + entry_size = rec_get_converted_size(index, entry, 0); + + if (entry_size + >= (page_get_free_space_of_empty(dict_table_is_comp(index->table)) + / 2)) { + return(FALSE); + } + + err = ibuf_insert_low(BTR_MODIFY_PREV, entry, entry_size, + index, space, zip_size, page_no, thr); + if (err == DB_FAIL) { + err = ibuf_insert_low(BTR_MODIFY_TREE, entry, entry_size, + index, space, zip_size, page_no, thr); + } + + if (err == DB_SUCCESS) { +#ifdef UNIV_IBUF_DEBUG + /* fprintf(stderr, "Ibuf insert for page no %lu of index %s\n", + page_no, index->name); */ +#endif + return(TRUE); + + } else { + ut_a(err == DB_STRONG_FAIL); + + return(FALSE); + } +} + +/************************************************************************ +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. 
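In outline, the function below does the following (a summary of the code, not additional behavior): it positions a page cursor with PAGE_CUR_LE; if an existing record matches the entry on all fields, that record was delete-marked while its page was absent from the buffer pool, and it is simply un-delete-marked; otherwise the entry is inserted, reorganizing the page once if the optimistic insert fails for lack of space.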
*/ +static +void +ibuf_insert_to_index_page( +/*======================*/ + dtuple_t* entry, /* in: buffered entry to insert */ + buf_block_t* block, /* in/out: index page where the buffered entry + should be placed */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_cur_t page_cur; + ulint low_match; + page_t* page = buf_block_get_frame(block); + rec_t* rec; + page_t* bitmap_page; + ulint old_bits; + + ut_ad(ibuf_inside()); + ut_ad(dtuple_check_typed(entry)); + + if (UNIV_UNLIKELY(dict_table_is_comp(index->table) + != (ibool)!!page_is_comp(page))) { + fputs("InnoDB: Trying to insert a record from" + " the insert buffer to an index page\n" + "InnoDB: but the 'compact' flag does not match!\n", + stderr); + goto dump; + } + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (UNIV_UNLIKELY(rec_get_n_fields(rec, index) + != dtuple_get_n_fields(entry))) { + fputs("InnoDB: Trying to insert a record from" + " the insert buffer to an index page\n" + "InnoDB: but the number of fields does not match!\n", + stderr); +dump: + buf_page_print(page, 0); + + dtuple_print(stderr, entry); + + fputs("InnoDB: The table where" + " this index record belongs\n" + "InnoDB: is now probably corrupt." + " Please run CHECK TABLE on\n" + "InnoDB: your tables.\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com!\n", stderr); + + return; + } + + low_match = page_cur_search(block, index, entry, + PAGE_CUR_LE, &page_cur); + + if (low_match == dtuple_get_n_fields(entry)) { + page_zip_des_t* page_zip; + + rec = page_cur_get_rec(&page_cur); + page_zip = buf_block_get_page_zip(block); + + btr_cur_del_unmark_for_ibuf(rec, page_zip, mtr); + } else { + rec = page_cur_tuple_insert(&page_cur, entry, index, 0, mtr); + + if (UNIV_LIKELY(rec != NULL)) { + return; + } + + /* If the record did not fit, reorganize */ + + btr_page_reorganize(block, index, mtr); + page_cur_search(block, index, entry, PAGE_CUR_LE, &page_cur); + + /* This time the record must fit */ + if (UNIV_UNLIKELY + (!page_cur_tuple_insert(&page_cur, entry, index, + 0, mtr))) { + ulint space; + ulint page_no; + ulint zip_size; + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Insert buffer insert" + " fails; page free %lu," + " dtuple size %lu\n", + (ulong) page_get_max_insert_size( + page, 1), + (ulong) rec_get_converted_size( + index, entry, 0)); + fputs("InnoDB: Cannot insert index record ", + stderr); + dtuple_print(stderr, entry); + fputs("\nInnoDB: The table where" + " this index record belongs\n" + "InnoDB: is now probably corrupt." + " Please run CHECK TABLE on\n" + "InnoDB: that table.\n", stderr); + + space = page_get_space_id(page); + zip_size = buf_block_get_zip_size(block); + page_no = page_get_page_no(page); + + bitmap_page = ibuf_bitmap_get_map_page( + space, page_no, zip_size, mtr); + old_bits = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, mtr); + + fprintf(stderr, + "InnoDB: space %lu, page %lu," + " zip_size %lu, bitmap bits %lu\n", + (ulong) space, (ulong) page_no, + (ulong) zip_size, (ulong) old_bits); + + fputs("InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + } + } +} + +/************************************************************************* +Deletes from ibuf the record on which pcur is positioned. If we have to +resort to a pessimistic delete, this function commits mtr and closes +the cursor.
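+The return value matters to callers; the merge loop below uses the
+pattern (a sketch copied from ibuf_merge_or_delete_for_page()):
+
+	if (ibuf_delete_rec(space, page_no, &pcur, search_tuple, &mtr)) {
+		/* pessimistic path: mtr is already committed and pcur
+		closed, so restart the scan from the search tuple */
+		goto loop;
+	}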
*/ +static +ibool +ibuf_delete_rec( +/*============*/ + /* out: TRUE if mtr was committed and pcur + closed in this operation */ + ulint space, /* in: space id */ + ulint page_no,/* in: index page number where the record + should belong */ + btr_pcur_t* pcur, /* in: pcur positioned on the record to + delete, having latch mode BTR_MODIFY_LEAF */ + const dtuple_t* search_tuple, + /* in: search tuple for entries of page_no */ + mtr_t* mtr) /* in: mtr */ +{ + ibool success; + page_t* root; + ulint err; + + ut_ad(ibuf_inside()); + ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); + ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no); + ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space); + + success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr); + + if (success) { +#ifdef UNIV_IBUF_COUNT_DEBUG + fprintf(stderr, + "Decrementing ibuf count of space %lu page %lu\n" + "from %lu by 1\n", space, page_no, + ibuf_count_get(space, page_no)); + ibuf_count_set(space, page_no, + ibuf_count_get(space, page_no) - 1); +#endif + return(FALSE); + } + + ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); + ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no); + ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space); + + /* We have to resort to a pessimistic delete from ibuf */ + btr_pcur_store_position(pcur, mtr); + + btr_pcur_commit_specify_mtr(pcur, mtr); + + mutex_enter(&ibuf_mutex); + + mtr_start(mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr); + + if (!success) { + if (fil_space_get_flags(space) == ULINT_UNDEFINED) { + /* The tablespace has been dropped. It is possible + that another thread has deleted the insert buffer + entry. Do not complain. */ + goto commit_and_exit; + } + + fprintf(stderr, + "InnoDB: ERROR: Submit the output to" + " http://bugs.mysql.com\n" + "InnoDB: ibuf cursor restoration fails!\n" + "InnoDB: ibuf record inserted to page %lu\n", + (ulong) page_no); + fflush(stderr); + + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); + dtuple_print(stderr, search_tuple); + + rec_print_old(stderr, + page_rec_get_next(btr_pcur_get_rec(pcur))); + fflush(stderr); + + btr_pcur_commit_specify_mtr(pcur, mtr); + + fputs("InnoDB: Validating insert buffer tree:\n", stderr); + if (!btr_validate_index(ibuf->index, NULL)) { + ut_error; + } + + fprintf(stderr, "InnoDB: ibuf tree ok\n"); + fflush(stderr); + + goto func_exit; + } + + root = ibuf_tree_root_get(mtr); + + btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), + RB_NONE, mtr); + ut_a(err == DB_SUCCESS); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1); +#endif + ibuf_size_update(root, mtr); + +commit_and_exit: + btr_pcur_commit_specify_mtr(pcur, mtr); + +func_exit: + btr_pcur_close(pcur); + + mutex_exit(&ibuf_mutex); + + return(TRUE); +} + +/************************************************************************* +When an index page is read from a disk to the buffer pool, this function +inserts to the page the possible index entries buffered in the insert buffer. +The entries are deleted from the insert buffer. If the page is not read, but +created in the buffer pool, this function deletes its buffered entries from +the insert buffer; there can exist entries for such a page if the page +belonged to an index which subsequently was dropped. 
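+The two call shapes implied by the parameter comments below are, as a
+sketch:
+
+	/* page was read from disk: merge its buffered entries into it */
+	ibuf_merge_or_delete_for_page(block, space, page_no, zip_size, TRUE);
+
+	/* tablespace being dropped: just discard the buffered entries */
+	ibuf_merge_or_delete_for_page(NULL, space, page_no,
+				      ULINT_UNDEFINED, FALSE);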
*/ +UNIV_INTERN +void +ibuf_merge_or_delete_for_page( +/*==========================*/ + buf_block_t* block, /* in: if page has been read from + disk, pointer to the page x-latched, + else NULL */ + ulint space, /* in: space id of the index page */ + ulint page_no,/* in: page number of the index page */ + ulint zip_size,/* in: compressed page size in bytes, + or 0 */ + ibool update_ibuf_bitmap)/* in: normally this is set + to TRUE, but if we have deleted or are + deleting the tablespace, then we + naturally do not want to update a + non-existent bitmap page */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + dtuple_t* search_tuple; + ulint n_inserts; +#ifdef UNIV_IBUF_DEBUG + ulint volume; +#endif + page_zip_des_t* page_zip = NULL; + ibool tablespace_being_deleted = FALSE; + ibool corruption_noticed = FALSE; + mtr_t mtr; + + ut_ad(!block || buf_block_get_space(block) == space); + ut_ad(!block || buf_block_get_page_no(block) == page_no); + ut_ad(!block || buf_block_get_zip_size(block) == zip_size); + + if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE + || trx_sys_hdr_page(space, page_no)) { + return; + } + + /* We cannot refer to zip_size in the following, because + zip_size is passed as ULINT_UNDEFINED (it is unknown) when + buf_read_ibuf_merge_pages() is merging (discarding) changes + for a dropped tablespace. When block != NULL or + update_ibuf_bitmap is specified, the zip_size must be known. + That is why we will repeat the check below, with zip_size in + place of 0. Passing zip_size as 0 assumes that the + uncompressed page size always is a power-of-2 multiple of the + compressed page size. */ + + if (ibuf_fixed_addr_page(space, 0, page_no) + || fsp_descr_page(0, page_no)) { + return; + } + + if (UNIV_LIKELY(update_ibuf_bitmap)) { + ut_a(ut_is_2pow(zip_size)); + + if (ibuf_fixed_addr_page(space, zip_size, page_no) + || fsp_descr_page(zip_size, page_no)) { + return; + } + + /* If the following returns FALSE, we get the counter + incremented, and must decrement it when we leave this + function. When the counter is > 0, that prevents tablespace + from being dropped. */ + + tablespace_being_deleted = fil_inc_pending_ibuf_merges(space); + + if (UNIV_UNLIKELY(tablespace_being_deleted)) { + /* Do not try to read the bitmap page from space; + just delete the ibuf records for the page */ + + block = NULL; + update_ibuf_bitmap = FALSE; + } else { + page_t* bitmap_page; + + mtr_start(&mtr); + + bitmap_page = ibuf_bitmap_get_map_page( + space, page_no, zip_size, &mtr); + + if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no, + zip_size, + IBUF_BITMAP_BUFFERED, + &mtr)) { + /* No inserts buffered for this page */ + mtr_commit(&mtr); + + if (!tablespace_being_deleted) { + fil_decr_pending_ibuf_merges(space); + } + + return; + } + mtr_commit(&mtr); + } + } else if (block + && (ibuf_fixed_addr_page(space, zip_size, page_no) + || fsp_descr_page(zip_size, page_no))) { + + return; + } + + ibuf_enter(); + + heap = mem_heap_create(512); + + if (!trx_sys_multiple_tablespace_format) { + ut_a(trx_doublewrite_must_reset_space_ids); + search_tuple = ibuf_search_tuple_build(space, page_no, heap); + } else { + search_tuple = ibuf_new_search_tuple_build(space, page_no, + heap); + } + + if (block) { + /* Move the ownership of the x-latch on the page to this OS + thread, so that we can acquire a second x-latch on it. This + is needed for the insert operations to the index page to pass + the debug checks. 
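+(An x-latch can be acquired recursively only by the thread registered
+as its owner, so the ownership must be moved to this thread before the
+second x-latch below is legal.)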
*/ + + rw_lock_x_lock_move_ownership(&(block->lock)); + page_zip = buf_block_get_page_zip(block); + + if (UNIV_UNLIKELY(fil_page_get_type(block->frame) + != FIL_PAGE_INDEX) + || UNIV_UNLIKELY(!page_is_leaf(block->frame))) { + + page_t* bitmap_page; + + corruption_noticed = TRUE; + + ut_print_timestamp(stderr); + + mtr_start(&mtr); + + fputs(" InnoDB: Dump of the ibuf bitmap page:\n", + stderr); + + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, + zip_size, &mtr); + buf_page_print(bitmap_page, 0); + + mtr_commit(&mtr); + + fputs("\nInnoDB: Dump of the page:\n", stderr); + + buf_page_print(block->frame, 0); + + fprintf(stderr, + "InnoDB: Error: corruption in the tablespace." + " Bitmap shows insert\n" + "InnoDB: buffer records to page n:o %lu" + " though the page\n" + "InnoDB: type is %lu, which is" + " not an index leaf page!\n" + "InnoDB: We try to resolve the problem" + " by skipping the insert buffer\n" + "InnoDB: merge for this page." + " Please run CHECK TABLE on your tables\n" + "InnoDB: to determine if they are corrupt" + " after this.\n\n" + "InnoDB: Please submit a detailed bug report" + " to http://bugs.mysql.com\n\n", + (ulong) page_no, + (ulong) + fil_page_get_type(block->frame)); + } + } + + n_inserts = 0; +#ifdef UNIV_IBUF_DEBUG + volume = 0; +#endif +loop: + mtr_start(&mtr); + + if (block) { + ibool success; + + success = buf_page_get_known_nowait( + RW_X_LATCH, block, + BUF_KEEP_OLD, __FILE__, __LINE__, &mtr); + + ut_a(success); + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + } + + /* Position pcur in the insert buffer at the first entry for this + index page */ + btr_pcur_open_on_user_rec( + ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); + + goto reset_bit; + } + + for (;;) { + rec_t* rec; + + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this index page */ + if (ibuf_rec_get_page_no(rec) != page_no + || ibuf_rec_get_space(rec) != space) { + + if (block) { + page_header_reset_last_insert( + block->frame, page_zip, &mtr); + } + + goto reset_bit; + } + + if (UNIV_UNLIKELY(corruption_noticed)) { + fputs("InnoDB: Discarding record\n ", stderr); + rec_print_old(stderr, rec); + fputs("\nInnoDB: from the insert buffer!\n\n", stderr); + } else if (block) { + /* Now we have at pcur a record which should be + inserted to the index page; NOTE that the call below + copies pointers to fields in rec, and we must + keep the latch to the rec page until the + insertion is finished! 
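+(The tuple built by ibuf_build_entry_from_ibuf_rec() references the
+field data inside rec in place instead of copying it, so the page
+holding rec must stay latched for as long as the tuple is in use.)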
*/ + dtuple_t* entry; + dulint max_trx_id; + dict_index_t* dummy_index; + + max_trx_id = page_get_max_trx_id(page_align(rec)); + page_update_max_trx_id(block, page_zip, max_trx_id); + + entry = ibuf_build_entry_from_ibuf_rec( + rec, heap, &dummy_index); +#ifdef UNIV_IBUF_DEBUG + volume += rec_get_converted_size(dummy_index, entry, 0) + + page_dir_calc_reserved_space(1); + ut_a(volume <= 4 * UNIV_PAGE_SIZE + / IBUF_PAGE_SIZE_PER_FREE_SPACE); +#endif + ibuf_insert_to_index_page(entry, block, + dummy_index, &mtr); + ibuf_dummy_index_free(dummy_index); + } + + n_inserts++; + + /* Delete the record from ibuf */ + if (ibuf_delete_rec(space, page_no, &pcur, search_tuple, + &mtr)) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + goto loop; + } else if (btr_pcur_is_after_last_on_page(&pcur)) { + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + goto loop; + } + } + +reset_bit: +#ifdef UNIV_IBUF_COUNT_DEBUG + if (ibuf_count_get(space, page_no) > 0) { + /* btr_print_tree(ibuf_data->index->tree, 100); + ibuf_print(); */ + } +#endif + if (UNIV_LIKELY(update_ibuf_bitmap)) { + page_t* bitmap_page; + + bitmap_page = ibuf_bitmap_get_map_page( + space, page_no, zip_size, &mtr); + + ibuf_bitmap_page_set_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_BUFFERED, FALSE, &mtr); + + if (block) { + ulint old_bits = ibuf_bitmap_page_get_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, &mtr); + + ulint new_bits = ibuf_index_page_calc_free( + zip_size, block); + + if (old_bits != new_bits) { + ibuf_bitmap_page_set_bits( + bitmap_page, page_no, zip_size, + IBUF_BITMAP_FREE, new_bits, &mtr); + } + } + } + + mtr_commit(&mtr); + btr_pcur_close(&pcur); + mem_heap_free(heap); + + /* Protect our statistics keeping from race conditions */ + mutex_enter(&ibuf_mutex); + + ibuf->n_merges++; + ibuf->n_merged_recs += n_inserts; + + mutex_exit(&ibuf_mutex); + + if (update_ibuf_bitmap && !tablespace_being_deleted) { + + fil_decr_pending_ibuf_merges(space); + } + + ibuf_exit(); + +#ifdef UNIV_IBUF_COUNT_DEBUG + ut_a(ibuf_count_get(space, page_no) == 0); +#endif +} + +/************************************************************************* +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! 
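+A sketch of the intended use, on the DISCARD/IMPORT TABLESPACE paths:
+
+	ibuf_delete_for_discarded_space(space_id);
+
+The stale bitmap bits left behind are tolerable only because the whole
+space is being thrown away.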
*/ +UNIV_INTERN +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space) /* in: space id */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + dtuple_t* search_tuple; + rec_t* ibuf_rec; + ulint page_no; + ibool closed; + ulint n_inserts; + mtr_t mtr; + + heap = mem_heap_create(512); + + /* Use page number 0 to build the search tuple so that we get the + cursor positioned at the first entry for this space id */ + + search_tuple = ibuf_new_search_tuple_build(space, 0, heap); + + n_inserts = 0; +loop: + ibuf_enter(); + + mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for the + space */ + btr_pcur_open_on_user_rec( + ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); + + goto leave_loop; + } + + for (;;) { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + ibuf_rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this space */ + if (ibuf_rec_get_space(ibuf_rec) != space) { + + goto leave_loop; + } + + page_no = ibuf_rec_get_page_no(ibuf_rec); + + n_inserts++; + + /* Delete the record from ibuf */ + closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple, + &mtr); + if (closed) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + ibuf_exit(); + + goto loop; + } + + if (btr_pcur_is_after_last_on_page(&pcur)) { + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + ibuf_exit(); + + goto loop; + } + } + +leave_loop: + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + /* Protect our statistics keeping from race conditions */ + mutex_enter(&ibuf_mutex); + + ibuf->n_merges++; + ibuf->n_merged_recs += n_inserts; + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + mem_heap_free(heap); +} + +/********************************************************************** +Checks whether the insert buffer is empty. */ +UNIV_INTERN +ibool +ibuf_is_empty(void) +/*===============*/ + /* out: TRUE if empty */ +{ + ibool is_empty; + const page_t* root; + mtr_t mtr; + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + + mtr_start(&mtr); + + root = ibuf_tree_root_get(&mtr); + + if (page_get_n_recs(root) == 0) { + + is_empty = TRUE; + + if (ibuf->empty == FALSE) { + fprintf(stderr, + "InnoDB: Warning: insert buffer tree is empty" + " but the data struct does not\n" + "InnoDB: know it. This condition is legal" + " if the master thread has not yet\n" + "InnoDB: run to completion.\n"); + } + } else { + ut_a(ibuf->empty == FALSE); + + is_empty = FALSE; + } + + mtr_commit(&mtr); + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + return(is_empty); +} + +/********************************************************************** +Prints info of ibuf.
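+The output typically surfaces in the InnoDB monitor (SHOW ENGINE INNODB
+STATUS); given the format strings below, it looks like, for example:
+
+	Ibuf: size 1, free list len 0, seg size 2,
+	0 inserts, 0 merged recs, 0 merges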
*/ +UNIV_INTERN +void +ibuf_print( +/*=======*/ + FILE* file) /* in: file where to print */ +{ +#ifdef UNIV_IBUF_COUNT_DEBUG + ulint i; + ulint j; +#endif + + mutex_enter(&ibuf_mutex); + + fprintf(file, + "Ibuf: size %lu, free list len %lu, seg size %lu,\n" + "%lu inserts, %lu merged recs, %lu merges\n", + (ulong) ibuf->size, + (ulong) ibuf->free_list_len, + (ulong) ibuf->seg_size, + (ulong) ibuf->n_inserts, + (ulong) ibuf->n_merged_recs, + (ulong) ibuf->n_merges); +#ifdef UNIV_IBUF_COUNT_DEBUG + for (i = 0; i < IBUF_COUNT_N_SPACES; i++) { + for (j = 0; j < IBUF_COUNT_N_PAGES; j++) { + ulint count = ibuf_count_get(i, j); + + if (count > 0) { + fprintf(stderr, + "Ibuf count for space/page %lu/%lu" + " is %lu\n", + (ulong) i, (ulong) j, (ulong) count); + } + } + } +#endif /* UNIV_IBUF_COUNT_DEBUG */ + + mutex_exit(&ibuf_mutex); +} diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h new file mode 100644 index 00000000000..298942bd542 --- /dev/null +++ b/storage/xtradb/include/btr0btr.h @@ -0,0 +1,494 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0btr_h +#define btr0btr_h + +#include "univ.i" + +#include "dict0dict.h" +#include "data0data.h" +#include "page0cur.h" +#include "mtr0mtr.h" +#include "btr0types.h" + +/* Maximum record size which can be stored on a page, without using the +special big record storage structure */ + +#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200) + +/* Maximum depth of a B-tree in InnoDB. Note that this isn't a maximum as +such; none of the tree operations avoid producing trees bigger than this. It +is instead a "max depth that other code must work with", useful for e.g. +fixed-size arrays that must store some information about each level in a +tree. In other words: if a B-tree with bigger depth than this is +encountered, it is not acceptable for it to lead to mysterious memory +corruption, but it is acceptable for the program to die with a clear assert +failure. */ +#define BTR_MAX_LEVELS 100 + +/* Latching modes for btr_cur_search_to_nth_level(). 
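+These can be ORed with the BTR_INSERT, BTR_ESTIMATE and
+BTR_IGNORE_SEC_UNIQUE flags defined below; for example, a sketch of a
+leaf search that permits buffering the insert:
+
+	btr_cur_search_to_nth_level(index, 0, tuple, PAGE_CUR_LE,
+				    BTR_MODIFY_LEAF | BTR_INSERT,
+				    &cursor, 0, &mtr);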
*/ +#define BTR_SEARCH_LEAF RW_S_LATCH +#define BTR_MODIFY_LEAF RW_X_LATCH +#define BTR_NO_LATCHES RW_NO_LATCH +#define BTR_MODIFY_TREE 33 +#define BTR_CONT_MODIFY_TREE 34 +#define BTR_SEARCH_PREV 35 +#define BTR_MODIFY_PREV 36 + +/* If this is ORed to the latch mode, it means that the search tuple will be +inserted to the index, at the searched position */ +#define BTR_INSERT 512 + +/* This flag ORed to latch mode says that we do the search in query +optimization */ +#define BTR_ESTIMATE 1024 + +/* This flag ORed to latch mode says that we can ignore possible +UNIQUE definition on secondary indexes when we decide if we can use the +insert buffer to speed up inserts */ +#define BTR_IGNORE_SEC_UNIQUE 2048 + +/****************************************************************** +Gets the root node of a tree and x-latches it. */ +UNIV_INTERN +page_t* +btr_root_get( +/*=========*/ + /* out: root page, x-latched */ + dict_index_t* index, /* in: index tree */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +buf_block_t* +btr_block_get( +/*==========*/ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + ulint mode, /* in: latch mode */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +page_t* +btr_page_get( +/*=========*/ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + ulint mode, /* in: latch mode */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Gets the index id field of a page. */ +UNIV_INLINE +dulint +btr_page_get_index_id( +/*==================*/ + /* out: index id */ + const page_t* page); /* in: index page */ +/************************************************************ +Gets the node level field in an index page. */ +UNIV_INLINE +ulint +btr_page_get_level_low( +/*===================*/ + /* out: level, leaf level == 0 */ + const page_t* page); /* in: index page */ +/************************************************************ +Gets the node level field in an index page. */ +UNIV_INLINE +ulint +btr_page_get_level( +/*===============*/ + /* out: level, leaf level == 0 */ + const page_t* page, /* in: index page */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Gets the next index page number. */ +UNIV_INLINE +ulint +btr_page_get_next( +/*==============*/ + /* out: next page number */ + const page_t* page, /* in: index page */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Gets the previous index page number. */ +UNIV_INLINE +ulint +btr_page_get_prev( +/*==============*/ + /* out: prev page number */ + const page_t* page, /* in: index page */ + mtr_t* mtr); /* in: mini-transaction handle */ +/***************************************************************** +Gets pointer to the previous user record in the tree. It is assumed +that the caller has appropriate latches on the page and its neighbor. 
*/ +UNIV_INTERN +rec_t* +btr_get_prev_user_rec( +/*==================*/ + /* out: previous user record, NULL if there is none */ + rec_t* rec, /* in: record on leaf level */ + mtr_t* mtr); /* in: mtr holding a latch on the page, and if + needed, also to the previous page */ +/***************************************************************** +Gets pointer to the next user record in the tree. It is assumed +that the caller has appropriate latches on the page and its neighbor. */ +UNIV_INTERN +rec_t* +btr_get_next_user_rec( +/*==================*/ + /* out: next user record, NULL if there is none */ + rec_t* rec, /* in: record on leaf level */ + mtr_t* mtr); /* in: mtr holding a latch on the page, and if + needed, also to the next page */ +/****************************************************************** +Releases the latch on a leaf page and bufferunfixes it. */ +UNIV_INLINE +void +btr_leaf_page_release( +/*==================*/ + buf_block_t* block, /* in: buffer block */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Gets the child node file address in a node pointer. */ +UNIV_INLINE +ulint +btr_node_ptr_get_child_page_no( +/*===========================*/ + /* out: child node address */ + const rec_t* rec, /* in: node pointer record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/**************************************************************** +Creates the root node for a new index tree. */ +UNIV_INTERN +ulint +btr_create( +/*=======*/ + /* out: page number of the created root, + FIL_NULL if did not succeed */ + ulint type, /* in: type of the index */ + ulint space, /* in: space where created */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + dulint index_id,/* in: index id */ + dict_index_t* index, /* in: index */ + mtr_t* mtr); /* in: mini-transaction handle */ +/**************************************************************** +Frees a B-tree except the root page, which MUST be freed after this +by calling btr_free_root. */ +UNIV_INTERN +void +btr_free_but_not_root( +/*==================*/ + ulint space, /* in: space where created */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no); /* in: root page number */ +/**************************************************************** +Frees the B-tree root page. The rest of the tree MUST already have been +freed. */ +UNIV_INTERN +void +btr_free_root( +/*==========*/ + ulint space, /* in: space where created */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint root_page_no, /* in: root page number */ + mtr_t* mtr); /* in: a mini-transaction which has already + been started */ +/***************************************************************** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called.
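+(Callers guarantee this by reserving free extents beforehand; compare
+the note at btr_page_alloc() below, which assumes the caller has made
+that reservation.)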
*/ +UNIV_INTERN +rec_t* +btr_root_raise_and_insert( +/*======================*/ + /* out: inserted record */ + btr_cur_t* cursor, /* in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + const dtuple_t* tuple, /* in: tuple to insert */ + ulint n_ext, /* in: number of externally stored columns */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Reorganizes an index page. +IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf +page of a non-clustered index, the caller must update the insert +buffer free bits in the same mini-transaction in such a way that the +modification will be redo-logged. */ +UNIV_INTERN +ibool +btr_page_reorganize( +/*================*/ + /* out: TRUE on success, FALSE on failure */ + buf_block_t* block, /* in: page to be reorganized */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Decides if the page should be split at the convergence point of +inserts converging to left. */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_left( +/*===========================*/ + /* out: TRUE if split recommended */ + btr_cur_t* cursor, /* in: cursor at which to insert */ + rec_t** split_rec);/* out: if split recommended, + the first record on upper half page, + or NULL if tuple should be first */ +/***************************************************************** +Decides if the page should be split at the convergence point of +inserts converging to right. */ +UNIV_INTERN +ibool +btr_page_get_split_rec_to_right( +/*============================*/ + /* out: TRUE if split recommended */ + btr_cur_t* cursor, /* in: cursor at which to insert */ + rec_t** split_rec);/* out: if split recommended, + the first record on upper half page, + or NULL if tuple should be first */ +/***************************************************************** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch +is released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore +enough free disk space must be guaranteed to be available before +this function is called. */ +UNIV_INTERN +rec_t* +btr_page_split_and_insert( +/*======================*/ + /* out: inserted record; NOTE: the tree + x-latch is released! NOTE: 2 free disk + pages must be available! */ + btr_cur_t* cursor, /* in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + const dtuple_t* tuple, /* in: tuple to insert */ + ulint n_ext, /* in: number of externally stored columns */ + mtr_t* mtr); /* in: mtr */ +/*********************************************************** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +UNIV_INTERN +void +btr_insert_on_non_leaf_level( +/*=========================*/ + dict_index_t* index, /* in: index */ + ulint level, /* in: level, must be > 0 */ + dtuple_t* tuple, /* in: the record to be inserted */ + mtr_t* mtr); /* in: mtr */ +/******************************************************************** +Sets a record as the predefined minimum record. 
*/ +UNIV_INTERN +void +btr_set_min_rec_mark( +/*=================*/ + rec_t* rec, /* in/out: record */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Deletes on the upper level the node pointer to a page. */ +UNIV_INTERN +void +btr_node_ptr_delete( +/*================*/ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: page whose node pointer is deleted */ + mtr_t* mtr); /* in: mtr */ +#ifdef UNIV_DEBUG +/**************************************************************** +Checks that the node pointer to a page is appropriate. */ +UNIV_INTERN +ibool +btr_check_node_ptr( +/*===============*/ + /* out: TRUE */ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: index page */ + mtr_t* mtr); /* in: mtr */ +#endif /* UNIV_DEBUG */ +/***************************************************************** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the +brother reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to +the brothers, if they exist. */ +UNIV_INTERN +ibool +btr_compress( +/*=========*/ + /* out: TRUE on success */ + btr_cur_t* cursor, /* in: cursor on the page to merge or lift; + the page must not be empty: in record delete + use btr_discard_page if the page would become + empty */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +UNIV_INTERN +void +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /* in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr); /* in: mtr */ +/******************************************************************** +Parses the redo log record for setting an index record as the predefined +minimum record. */ +UNIV_INTERN +byte* +btr_parse_set_min_rec_mark( +/*=======================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ +/*************************************************************** +Parses a redo log record of reorganizing a page. */ +UNIV_INTERN +byte* +btr_parse_page_reorganize( +/*======================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + dict_index_t* index, /* in: record descriptor */ + buf_block_t* block, /* in: page to be reorganized, or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ +/****************************************************************** +Gets the number of pages in a B-tree. */ +UNIV_INTERN +ulint +btr_get_size( +/*=========*/ + /* out: number of pages */ + dict_index_t* index, /* in: index */ + ulint flag); /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ +/****************************************************************** +Allocates a new file page to be used in an index tree. 
NOTE: we assume +that the caller has made the reservation for free extents! */ +UNIV_INTERN +buf_block_t* +btr_page_alloc( +/*===========*/ + /* out: new allocated block, x-latched; + NULL if out of space */ + dict_index_t* index, /* in: index tree */ + ulint hint_page_no, /* in: hint of a good page */ + byte file_direction, /* in: direction where a possible + page split is made */ + ulint level, /* in: level where the page is placed + in the tree */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Frees a file page used in an index tree. NOTE: cannot free field external +storage pages because the page must contain info on its level. */ +UNIV_INTERN +void +btr_page_free( +/*==========*/ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: block to be freed, x-latched */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Frees a file page used in an index tree. Can also be used for BLOB +external storage pages, because the page level 0 can be given as an +argument. */ +UNIV_INTERN +void +btr_page_free_low( +/*==============*/ + dict_index_t* index, /* in: index tree */ + buf_block_t* block, /* in: block to be freed, x-latched */ + ulint level, /* in: page level */ + mtr_t* mtr); /* in: mtr */ +#ifdef UNIV_BTR_PRINT +/***************************************************************** +Prints size info of a B-tree. */ +UNIV_INTERN +void +btr_print_size( +/*===========*/ + dict_index_t* index); /* in: index tree */ +/****************************************************************** +Prints directories and other info of all nodes in the index. */ +UNIV_INTERN +void +btr_print_index( +/*============*/ + dict_index_t* index, /* in: index */ + ulint width); /* in: print this many entries from start + and end */ +#endif /* UNIV_BTR_PRINT */ +/**************************************************************** +Checks the size and number of fields in a record based on the definition of +the index. */ +UNIV_INTERN +ibool +btr_index_rec_validate( +/*===================*/ + /* out: TRUE if ok */ + const rec_t* rec, /* in: index record */ + const dict_index_t* index, /* in: index */ + ibool dump_on_error); /* in: TRUE if the function + should print hex dump of record + and page on error */ +/****************************************************************** +Checks the consistency of an index tree. */ +UNIV_INTERN +ibool +btr_validate_index( +/*===============*/ + /* out: TRUE if ok */ + dict_index_t* index, /* in: index */ + trx_t* trx); /* in: transaction or NULL */ + +#define BTR_N_LEAF_PAGES 1 +#define BTR_TOTAL_SIZE 2 + +#ifndef UNIV_NONINL +#include "btr0btr.ic" +#endif + +#endif diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic new file mode 100644 index 00000000000..a8d934ecc87 --- /dev/null +++ b/storage/xtradb/include/btr0btr.ic @@ -0,0 +1,301 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "page0zip.h" + +#define BTR_MAX_NODE_LEVEL 50 /* used in debug checking */ + +/****************************************************************** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +buf_block_t* +btr_block_get( +/*==========*/ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + ulint mode, /* in: latch mode */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + + block = buf_page_get(space, zip_size, page_no, mode, mtr); + + if (mode != RW_NO_LATCH) { + + buf_block_dbg_add_level(block, SYNC_TREE_NODE); + } + + return(block); +} + +/****************************************************************** +Gets a buffer page and declares its latching order level. */ +UNIV_INLINE +page_t* +btr_page_get( +/*=========*/ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + ulint mode, /* in: latch mode */ + mtr_t* mtr) /* in: mtr */ +{ + return(buf_block_get_frame(btr_block_get(space, zip_size, page_no, + mode, mtr))); +} + +/****************************************************************** +Sets the index id field of a page. */ +UNIV_INLINE +void +btr_page_set_index_id( +/*==================*/ + page_t* page, /* in: page to be created */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + dulint id, /* in: index id */ + mtr_t* mtr) /* in: mtr */ +{ + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_INDEX_ID), + 8, mtr); + } else { + mlog_write_dulint(page + (PAGE_HEADER + PAGE_INDEX_ID), + id, mtr); + } +} + +/****************************************************************** +Gets the index id field of a page. */ +UNIV_INLINE +dulint +btr_page_get_index_id( +/*==================*/ + /* out: index id */ + const page_t* page) /* in: index page */ +{ + return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)); +} + +/************************************************************ +Gets the node level field in an index page. */ +UNIV_INLINE +ulint +btr_page_get_level_low( +/*===================*/ + /* out: level, leaf level == 0 */ + const page_t* page) /* in: index page */ +{ + ulint level; + + ut_ad(page); + + level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL); + + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + return(level); +} + +/************************************************************ +Gets the node level field in an index page. 
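+Leaf pages are at level 0, and the level grows toward the root.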
*/ +UNIV_INLINE +ulint +btr_page_get_level( +/*===============*/ + /* out: level, leaf level == 0 */ + const page_t* page, /* in: index page */ + mtr_t* mtr __attribute__((unused))) + /* in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + return(btr_page_get_level_low(page)); +} + +/************************************************************ +Sets the node level field in an index page. */ +UNIV_INLINE +void +btr_page_set_level( +/*===============*/ + page_t* page, /* in: index page */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint level, /* in: level, leaf level == 0 */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ut_ad(page && mtr); + ut_ad(level <= BTR_MAX_NODE_LEVEL); + + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_LEVEL), + 2, mtr); + } else { + mlog_write_ulint(page + (PAGE_HEADER + PAGE_LEVEL), level, + MLOG_2BYTES, mtr); + } +} + +/************************************************************ +Gets the next index page number. */ +UNIV_INLINE +ulint +btr_page_get_next( +/*==============*/ + /* out: next page number */ + const page_t* page, /* in: index page */ + mtr_t* mtr __attribute__((unused))) + /* in: mini-transaction handle */ +{ + ut_ad(page && mtr); + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX)); + + return(mach_read_from_4(page + FIL_PAGE_NEXT)); +} + +/************************************************************ +Sets the next index page field. */ +UNIV_INLINE +void +btr_page_set_next( +/*==============*/ + page_t* page, /* in: index page */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint next, /* in: next page number */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_4(page + FIL_PAGE_NEXT, next); + page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr); + } else { + mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr); + } +} + +/************************************************************ +Gets the previous index page number. */ +UNIV_INLINE +ulint +btr_page_get_prev( +/*==============*/ + /* out: prev page number */ + const page_t* page, /* in: index page */ + mtr_t* mtr __attribute__((unused))) /* in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + return(mach_read_from_4(page + FIL_PAGE_PREV)); +} + +/************************************************************ +Sets the previous index page field. */ +UNIV_INLINE +void +btr_page_set_prev( +/*==============*/ + page_t* page, /* in: index page */ + page_zip_des_t* page_zip,/* in: compressed page whose uncompressed + part will be updated, or NULL */ + ulint prev, /* in: previous page number */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ut_ad(page && mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_4(page + FIL_PAGE_PREV, prev); + page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr); + } else { + mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr); + } +} + +/****************************************************************** +Gets the child node file address in a node pointer. 
*/ +UNIV_INLINE +ulint +btr_node_ptr_get_child_page_no( +/*===========================*/ + /* out: child node address */ + const rec_t* rec, /* in: node pointer record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + const byte* field; + ulint len; + ulint page_no; + + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + /* The child address is in the last field */ + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, &len); + + ut_ad(len == 4); + + page_no = mach_read_from_4(field); + + if (UNIV_UNLIKELY(page_no == 0)) { + fprintf(stderr, + "InnoDB: a nonsensical page number 0" + " in a node ptr record at offset %lu\n", + (ulong) page_offset(rec)); + buf_page_print(page_align(rec), 0); + } + + return(page_no); +} + +/****************************************************************** +Releases the latches on a leaf page and bufferunfixes it. */ +UNIV_INLINE +void +btr_leaf_page_release( +/*==================*/ + buf_block_t* block, /* in: buffer block */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF); + ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)); + + mtr_memo_release(mtr, block, + latch_mode == BTR_SEARCH_LEAF + ? MTR_MEMO_PAGE_S_FIX + : MTR_MEMO_PAGE_X_FIX); +} diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h new file mode 100644 index 00000000000..c3a478c4bb7 --- /dev/null +++ b/storage/xtradb/include/btr0cur.h @@ -0,0 +1,742 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0cur_h +#define btr0cur_h + +#include "univ.i" +#include "dict0dict.h" +#include "page0cur.h" +#include "btr0types.h" +#include "que0types.h" +#include "row0types.h" +#include "ha0ha.h" + +/* Mode flags for btr_cur operations; these can be ORed */ +#define BTR_NO_UNDO_LOG_FLAG 1 /* do no undo logging */ +#define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */ +#define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the + update vector or inserted entry */ + +#define BTR_CUR_ADAPT +#define BTR_CUR_HASH_ADAPT + +#ifdef UNIV_DEBUG +/************************************************************* +Returns the page cursor component of a tree cursor. 
*/ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + /* out: pointer to page cursor + component */ + const btr_cur_t* cursor);/* in: tree cursor */ +#else /* UNIV_DEBUG */ +# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur) +#endif /* UNIV_DEBUG */ +/************************************************************* +Returns the buffer block on which the tree cursor is positioned. */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + /* out: pointer to buffer block */ + btr_cur_t* cursor);/* in: tree cursor */ +/************************************************************* +Returns the record pointer of a tree cursor. */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + /* out: pointer to record */ + btr_cur_t* cursor);/* in: tree cursor */ +/************************************************************* +Returns the compressed page on which the tree cursor is positioned. */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + /* out: pointer to compressed page, + or NULL if the page is not compressed */ + btr_cur_t* cursor);/* in: tree cursor */ +/************************************************************* +Invalidates a tree cursor by setting record pointer to NULL. */ +UNIV_INLINE +void +btr_cur_invalidate( +/*===============*/ + btr_cur_t* cursor);/* in: tree cursor */ +/************************************************************* +Returns the page of a tree cursor. */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + /* out: pointer to page */ + btr_cur_t* cursor);/* in: tree cursor */ +/************************************************************* +Returns the index of a cursor. */ +UNIV_INLINE +dict_index_t* +btr_cur_get_index( +/*==============*/ + /* out: index */ + btr_cur_t* cursor);/* in: B-tree cursor */ +/************************************************************* +Positions a tree cursor at a given record. */ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record in tree */ + buf_block_t* block, /* in: buffer block of rec */ + btr_cur_t* cursor);/* in: cursor */ +/************************************************************************ +Searches an index tree and positions a tree cursor on a given level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +Note that if mode is PAGE_CUR_LE, which is used in inserts, then +cursor->up_match and cursor->low_match both will have sensible values. +If mode is PAGE_CUR_GE, then up_match will have a sensible value. */ +UNIV_INTERN +void +btr_cur_search_to_nth_level( +/*========================*/ + dict_index_t* index, /* in: index */ + ulint level, /* in: the tree level of search */ + const dtuple_t* tuple, /* in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + ulint mode, /* in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be PAGE_CUR_LE, + not PAGE_CUR_GE, as the latter may end up on + the previous page of the record! Inserts + should always be made using PAGE_CUR_LE to + search the position!
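+ (PAGE_CUR_LE leaves the cursor on the
+ predecessor of the insert position,
+ which is what the insert functions
+ expect.)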
*/ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ..., ORed with + BTR_INSERT and BTR_ESTIMATE; + cursor->left_block is used to store a pointer + to the left neighbor page, in the cases + BTR_SEARCH_PREV and BTR_MODIFY_PREV; + NOTE that if has_search_latch + is != 0, we may not have a latch set + on the cursor page; we assume + the caller uses his search latch + to protect the record! */ + btr_cur_t* cursor, /* in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ + ulint has_search_latch,/* in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************* +Opens a cursor at either end of an index. */ +UNIV_INTERN +void +btr_cur_open_at_index_side( +/*=======================*/ + ibool from_left, /* in: TRUE if open to the low end, + FALSE if to the high end */ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: latch mode */ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INTERN +void +btr_cur_open_at_rnd_pos( +/*====================*/ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /* in/out: B-tree cursor */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. */ +UNIV_INTERN +ulint +btr_cur_optimistic_insert( +/*======================*/ + /* out: DB_SUCCESS, DB_WAIT_LOCK, + DB_FAIL, or error number */ + ulint flags, /* in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /* in: cursor on page after which to insert; + cursor stays valid */ + dtuple_t* entry, /* in/out: entry to insert */ + rec_t** rec, /* out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /* in: number of externally stored columns */ + que_thr_t* thr, /* in: query thread or NULL */ + mtr_t* mtr); /* in: mtr; if this function returns + DB_SUCCESS on a leaf page of a secondary + index in a compressed tablespace, the + mtr must be committed before latching + any further pages */ +/***************************************************************** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist.
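+Callers normally try btr_cur_optimistic_insert() first and fall back to
+this function only on DB_FAIL; a sketch (the cursor must be
+re-positioned under the tree x-latch before the pessimistic call):
+
+	err = btr_cur_optimistic_insert(flags, cursor, entry, &rec,
+					&big_rec, n_ext, thr, mtr);
+	if (err == DB_FAIL) {
+		err = btr_cur_pessimistic_insert(flags, cursor, entry, &rec,
+						 &big_rec, n_ext, thr, mtr);
+	}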
*/ +UNIV_INTERN +ulint +btr_cur_pessimistic_insert( +/*=======================*/ + /* out: DB_SUCCESS or error number */ + ulint flags, /* in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /* in: cursor after which to insert; + cursor stays valid */ + dtuple_t* entry, /* in/out: entry to insert */ + rec_t** rec, /* out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or + NULL */ + ulint n_ext, /* in: number of externally stored columns */ + que_thr_t* thr, /* in: query thread or NULL */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Updates a record when the update causes no size changes in its fields. */ +UNIV_INTERN +ulint +btr_cur_update_in_place( +/*====================*/ + /* out: DB_SUCCESS or error number */ + ulint flags, /* in: undo logging and locking flags */ + btr_cur_t* cursor, /* in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + const upd_t* update, /* in: update vector */ + ulint cmpl_info,/* in: compiler info on secondary index + updates */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr); /* in: mtr; must be committed before + latching any further pages */ +/***************************************************************** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. */ +UNIV_INTERN +ulint +btr_cur_optimistic_update( +/*======================*/ + /* out: DB_SUCCESS, or DB_OVERFLOW if the + updated record does not fit, DB_UNDERFLOW + if the page would become too empty, or + DB_ZIP_OVERFLOW if there is not enough + space left on the compressed page */ + ulint flags, /* in: undo logging and locking flags */ + btr_cur_t* cursor, /* in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + const upd_t* update, /* in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/* in: compiler info on secondary index + updates */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr); /* in: mtr; must be committed before + latching any further pages */ +/***************************************************************** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. 
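+Like the insert path, updates are tiered: callers usually try
+btr_cur_update_in_place() or btr_cur_optimistic_update() first and fall
+back to this function when those report DB_OVERFLOW or DB_UNDERFLOW
+(see the return codes documented above).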
*/ +UNIV_INTERN +ulint +btr_cur_pessimistic_update( +/*=======================*/ + /* out: DB_SUCCESS or error code */ + ulint flags, /* in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /* in: cursor on the record to update */ + mem_heap_t** heap, /* in/out: pointer to memory heap, or NULL */ + big_rec_t** big_rec,/* out: big rec vector whose fields have to + be stored externally by the caller, or NULL */ + const upd_t* update, /* in: update vector; this is allowed to also + contain trx id and roll ptr fields, but + the values in the update vector have no effect */ + ulint cmpl_info,/* in: compiler info on secondary index + updates */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr); /* in: mtr; must be committed before + latching any further pages */ +/*************************************************************** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field a pointer to the +undo log record created. */ +UNIV_INTERN +ulint +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, or error + number */ + ulint flags, /* in: undo logging and locking flags */ + btr_cur_t* cursor, /* in: cursor */ + ibool val, /* in: value to set */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr); /* in: mtr */ +/*************************************************************** +Sets a secondary index record delete mark to TRUE or FALSE. */ +UNIV_INTERN +ulint +btr_cur_del_mark_set_sec_rec( +/*=========================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, or error + number */ + ulint flags, /* in: locking flag */ + btr_cur_t* cursor, /* in: cursor */ + ibool val, /* in: value to set */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr); /* in: mtr */ +/*************************************************************** +Clears a secondary index record's delete mark. This function is only +used by the insert buffer insert merge mechanism. */ +UNIV_INTERN +void +btr_cur_del_unmark_for_ibuf( +/*========================*/ + rec_t* rec, /* in/out: record to delete unmark */ + page_zip_des_t* page_zip, /* in/out: compressed page + corresponding to rec, or NULL + when the tablespace is + uncompressed */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! */ +UNIV_INTERN +ibool +btr_cur_compress_if_useful( +/*=======================*/ + /* out: TRUE if compression occurred */ + btr_cur_t* cursor, /* in: cursor on the page to compress; + cursor does not stay valid if compression + occurs */ + mtr_t* mtr); /* in: mtr */ +/*********************************************************** +Removes the record on which the tree cursor is positioned. It is assumed +that the mtr has an x-latch on the page where the cursor is positioned, +but no latch on the whole tree.
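
Note how deletion is split across these functions: a user-level DELETE only delete-marks the record (btr_cur_del_mark_set_clust_rec and btr_cur_del_mark_set_sec_rec above), and the physical removal declared below happens later, typically in purge or rollback, again optimistic-first. A condensed sketch (editor's illustration; the re-positioning of the cursor under tree latches between the two attempts is omitted):

#include "btr0cur.h"

/* Sketch: remove the record under 'cursor', falling back to the
tree-modifying variant when the page-local delete is refused. */
static ibool
sketch_remove_rec(btr_cur_t* cursor, enum trx_rb_ctx rb_ctx, mtr_t* mtr)
{
        ulint   err;

        if (btr_cur_optimistic_delete(cursor, mtr)) {

                return(TRUE);   /* the page did not become too empty */
        }

        /* The pessimistic variant may merge pages and update node
        pointers; with has_reserved_extents == FALSE the caller must
        be prepared to retry on DB_OUT_OF_FILE_SPACE (see
        BTR_CUR_RETRY_DELETE_N_TIMES further down). */
        btr_cur_pessimistic_delete(&err, FALSE, cursor, rb_ctx, mtr);

        return(err == DB_SUCCESS);
}
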
*/ +UNIV_INTERN +ibool +btr_cur_optimistic_delete( +/*======================*/ + /* out: TRUE if success, i.e., the page + did not become too empty */ + btr_cur_t* cursor, /* in: cursor on the record to delete; + cursor stays valid: if deletion succeeds, + on function exit it points to the successor + of the deleted record */ + mtr_t* mtr); /* in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +/***************************************************************** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. */ +UNIV_INTERN +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + /* out: TRUE if compression occurred */ + ulint* err, /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /* in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /* in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + enum trx_rb_ctx rb_ctx, /* in: rollback context */ + mtr_t* mtr); /* in: mtr */ +/*************************************************************** +Parses a redo log record of updating a record in-place. */ +UNIV_INTERN +byte* +btr_cur_parse_update_in_place( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dict_index_t* index); /* in: index corresponding to page */ +/******************************************************************** +Parses the redo log record for delete marking or unmarking of a clustered +index record. */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_clust_rec( +/*=================================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dict_index_t* index); /* in: index corresponding to page */ +/******************************************************************** +Parses the redo log record for delete marking or unmarking of a secondary +index record. */ +UNIV_INTERN +byte* +btr_cur_parse_del_mark_set_sec_rec( +/*===============================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip);/* in/out: compressed page, or NULL */ +/*********************************************************************** +Estimates the number of rows in a given index range. 
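
The estimate declared below is what the optimizer consumes for index dives: the function descends the tree along both borders of the range and extrapolates a row count from the two search paths (recorded in btr_cur_t::path_arr). A small usage sketch (editor's illustration; the border tuples would be built from a real search condition):

#include "btr0cur.h"
#include "page0cur.h"   /* PAGE_CUR_GE, PAGE_CUR_LE */

/* Sketch: estimate how many rows of 'index' fall in [low, high].
PAGE_CUR_GE on the low end and PAGE_CUR_LE on the high end make both
borders inclusive; an empty tuple makes a border unbounded. */
static ib_int64_t
sketch_estimate_range(dict_index_t* index,
                      const dtuple_t* low, const dtuple_t* high)
{
        return(btr_estimate_n_rows_in_range(index, low, PAGE_CUR_GE,
                                            high, PAGE_CUR_LE));
}
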
*/ +UNIV_INTERN +ib_int64_t +btr_estimate_n_rows_in_range( +/*=========================*/ + /* out: estimated number of rows */ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple1, /* in: range start, may also be empty tuple */ + ulint mode1, /* in: search mode for range start */ + const dtuple_t* tuple2, /* in: range end, may also be empty tuple */ + ulint mode2); /* in: search mode for range end */ +/*********************************************************************** +Estimates the number of different key values in a given index, for +each n-column prefix of the index where n <= dict_index_get_n_unique(index). +The estimates are stored in the array index->stat_n_diff_key_vals. */ +UNIV_INTERN +void +btr_estimate_number_of_different_key_vals( +/*======================================*/ + dict_index_t* index); /* in: index */ +/*********************************************************************** +Marks extern fields that were not updated as not-owned by this record. +The ownership is transferred to the updated record, which is inserted +elsewhere in the index tree. In purge, only the owner of an externally +stored field is allowed to free the field. */ +UNIV_INTERN +void +btr_cur_mark_extern_inherited_fields( +/*=================================*/ + page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed + part will be updated, or NULL */ + rec_t* rec, /* in/out: record in a clustered index */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + const upd_t* update, /* in: update vector */ + mtr_t* mtr); /* in: mtr, or NULL if not logged */ +/*********************************************************************** +The complement of the previous function: in an update, an entry may inherit +some externally stored fields from a record. We must mark them as inherited +in entry, so that they are not freed in a rollback. */ +UNIV_INTERN +void +btr_cur_mark_dtuple_inherited_extern( +/*=================================*/ + dtuple_t* entry, /* in/out: updated entry to be + inserted to clustered index */ + const upd_t* update); /* in: update vector */ +/*********************************************************************** +Marks all extern fields in a dtuple as owned by the record. */ +UNIV_INTERN +void +btr_cur_unmark_dtuple_extern_fields( +/*================================*/ + dtuple_t* entry); /* in/out: clustered index entry */ +/*********************************************************************** +Stores the fields in big_rec_vec to the tablespace and puts pointers to +them in rec. The extern flags in rec will have to be set beforehand. +The fields are stored on pages allocated from the leaf node +file segment of the index tree.
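
btr_store_big_rec_extern_fields, declared next, is the second half of the big_rec hand-off from the insert and update routines earlier in this header. A hedged sketch of that hand-off (editor's illustration; dtuple_big_rec_free is assumed to be the matching release function, and real callers recompute offsets and re-latch the record block as the comments require):

#include "btr0cur.h"
#include "data0data.h"

/* Sketch: after an insert returned a non-NULL big_rec, store the
overflowing fields externally and free the vector. 'offsets' must be
rec_get_offsets(rec, index) computed by the caller. */
static ulint
sketch_store_big_rec(dict_index_t* index, buf_block_t* rec_block,
                     rec_t* rec, const ulint* offsets,
                     big_rec_t* big_rec, mtr_t* mtr)
{
        ulint   err;

        /* The index tree must be x-latched while the BLOB pages are
        allocated from its leaf node file segment. */
        err = btr_store_big_rec_extern_fields(index, rec_block, rec,
                                              offsets, big_rec, mtr);

        dtuple_big_rec_free(big_rec);

        return(err);
}
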
*/ +UNIV_INTERN +ulint +btr_store_big_rec_extern_fields( +/*============================*/ + /* out: DB_SUCCESS or error */ + dict_index_t* index, /* in: index of rec; the index tree + MUST be X-latched */ + buf_block_t* rec_block, /* in/out: block containing rec */ + rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec, index); + the "external storage" flags in offsets + will not correspond to rec when + this function returns */ + big_rec_t* big_rec_vec, /* in: vector containing fields + to be stored externally */ + mtr_t* local_mtr); /* in: mtr containing the latch to + rec and to the tree */ +/*********************************************************************** +Frees the space in an externally stored field to the file space +management if the field in data is owned by the externally stored field; +in a rollback we may have the additional condition that the field must +not be inherited. */ +UNIV_INTERN +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /* in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /* in/out: field reference */ + const rec_t* rec, /* in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const ulint* offsets, /* in: rec_get_offsets(rec, index), + or NULL */ + page_zip_des_t* page_zip, /* in: compressed page corresponding + to rec, or NULL if rec == NULL */ + ulint i, /* in: field number of field_ref; + ignored if rec == NULL */ + enum trx_rb_ctx rb_ctx, /* in: rollback context */ + mtr_t* local_mtr); /* in: mtr containing the latch to + data and an X-latch to the index + tree */ +/*********************************************************************** +Copies the prefix of an externally stored field of a record. The +clustered index record must be protected by a lock or a page latch. */ +UNIV_INTERN +ulint +btr_copy_externally_stored_field_prefix( +/*====================================*/ + /* out: the length of the copied field, + or 0 if the column is being or has been + deleted */ + byte* buf, /* out: the field, or a prefix of it */ + ulint len, /* in: length of buf, in bytes */ + ulint zip_size,/* in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + const byte* data, /* in: 'internally' stored part of the + field containing also the reference to + the external part; must be protected by + a lock or a page latch */ + ulint local_len);/* in: length of data, in bytes */ +/*********************************************************************** +Copies an externally stored field of a record to mem heap. */ +UNIV_INTERN +byte* +btr_rec_copy_externally_stored_field( +/*=================================*/ + /* out: the field copied to heap */ + const rec_t* rec, /* in: record in a clustered index; + must be protected by a lock or a page latch */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint zip_size,/* in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ + ulint no, /* in: field number */ + ulint* len, /* out: length of the field */ + mem_heap_t* heap); /* in: mem heap */ +/*********************************************************************** +Flags the data tuple fields that are marked as extern storage in the +update vector.
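
On the read side, the two copy routines above follow the 20-byte field reference stored at the end of the local prefix (see the BTR_EXTERN_* layout later in this header) out to the BLOB pages. A hedged read sketch (editor's illustration; the wrapper and its argument set are illustrative only):

#include "btr0cur.h"

/* Sketch: copy externally stored field 'no' of a clustered index
record into 'heap'. The record must stay protected by a lock or a
page latch for the whole duration of the copy. */
static byte*
sketch_read_blob(const rec_t* rec, const ulint* offsets,
                 ulint zip_size, ulint no, ulint* len,
                 mem_heap_t* heap)
{
        /* zip_size is the compressed BLOB page size, or zero when
        the tablespace is uncompressed. */
        return(btr_rec_copy_externally_stored_field(rec, offsets,
                                                    zip_size, no,
                                                    len, heap));
}
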
We use this function to remember which fields we must +mark as extern storage in a record inserted for an update. */ +UNIV_INTERN +ulint +btr_push_update_extern_fields( +/*==========================*/ + /* out: number of flagged external columns */ + dtuple_t* tuple, /* in/out: data tuple */ + const upd_t* update, /* in: update vector */ + mem_heap_t* heap) /* in: memory heap */ + __attribute__((nonnull)); + +/*######################################################################*/ + +/* In the pessimistic delete, if the page data size drops below this +limit, merging it to a neighbor is tried */ + +#define BTR_CUR_PAGE_COMPRESS_LIMIT (UNIV_PAGE_SIZE / 2) + +/* A slot in the path array. We store here info on a search path down the +tree. Each slot contains data on a single level of the tree. */ + +typedef struct btr_path_struct btr_path_t; +struct btr_path_struct{ + ulint nth_rec; /* index of the record + where the page cursor stopped on + this level (index in alphabetical + order); value ULINT_UNDEFINED + denotes array end */ + ulint n_recs; /* number of records on the page */ +}; + +#define BTR_PATH_ARRAY_N_SLOTS 250 /* size of path array (in slots) */ + +/* The tree cursor: the definition appears here only for the compiler +to know struct size! */ + +struct btr_cur_struct { + dict_index_t* index; /* index where positioned */ + page_cur_t page_cur; /* page cursor */ + buf_block_t* left_block; /* this field is used to store + a pointer to the left neighbor + page, in the cases + BTR_SEARCH_PREV and + BTR_MODIFY_PREV */ + /*------------------------------*/ + que_thr_t* thr; /* this field is only used when + btr_cur_search_... is called for an + index entry insertion: the calling + query thread is passed here to be + used in the insert buffer */ + /*------------------------------*/ + /* The following fields are used in btr_cur_search... to pass + information: */ + ulint flag; /* BTR_CUR_HASH, BTR_CUR_HASH_FAIL, + BTR_CUR_BINARY, or + BTR_CUR_INSERT_TO_IBUF */ + ulint tree_height; /* Tree height if the search is done + for a pessimistic insert or update + operation */ + ulint up_match; /* If the search mode was PAGE_CUR_LE, + the number of matched fields to + the first user record to the right of + the cursor record after + btr_cur_search_...; + for the mode PAGE_CUR_GE, the matched + fields to the first user record AT THE + CURSOR or to the right of it; + NOTE that the up_match and low_match + values may exceed the correct values + for comparison to the adjacent user + record if that record is on a + different leaf page! (See the note in + row_ins_duplicate_key.) */ + ulint up_bytes; /* number of matched bytes to the + right at the time the cursor was + positioned; only used internally in + searches: not defined after the + search */ + ulint low_match; /* if search mode was PAGE_CUR_LE, + the number of matched fields to the + first user record AT THE CURSOR or + to the left of it after + btr_cur_search_...; + NOT defined for PAGE_CUR_GE or any + other search modes; see also the NOTE + in up_match!
*/ + ulint low_bytes; /* number of matched bytes to the + right at the time the cursor was + positioned; only used internally in + searches: not defined after the + search */ + ulint n_fields; /* prefix length used in a hash + search if hash_node != NULL */ + ulint n_bytes; /* hash prefix bytes if hash_node != + NULL */ + ulint fold; /* fold value used in the search if + flag is BTR_CUR_HASH */ + /*------------------------------*/ + btr_path_t* path_arr; /* in estimating the number of + rows in range, we store in this array + information of the path through + the tree */ +}; + +/* Values for the flag documenting the used search method */ +#define BTR_CUR_HASH 1 /* successful shortcut using the hash + index */ +#define BTR_CUR_HASH_FAIL 2 /* failure using hash, success using + binary search: the misleading hash + reference is stored in the field + hash_node, and might be necessary to + update */ +#define BTR_CUR_BINARY 3 /* success using the binary search */ +#define BTR_CUR_INSERT_TO_IBUF 4 /* performed the intended insert to + the insert buffer */ + +/* If a pessimistic delete fails because of lack of file space, +there is still a good chance of success a little later: try this many times, +and sleep this many microseconds in between */ +#define BTR_CUR_RETRY_DELETE_N_TIMES 100 +#define BTR_CUR_RETRY_SLEEP_TIME 50000 + +/* The reference in a field for which data is stored on a different page. +The reference is at the end of the 'locally' stored part of the field. +'Locally' means storage in the index record. +We store locally a long enough prefix of each column so that we can determine +the ordering parts of each index record without looking into the externally +stored part. */ + +/*--------------------------------------*/ +#define BTR_EXTERN_SPACE_ID 0 /* space id where stored */ +#define BTR_EXTERN_PAGE_NO 4 /* page no where stored */ +#define BTR_EXTERN_OFFSET 8 /* offset of BLOB header + on that page */ +#define BTR_EXTERN_LEN 12 /* 8 bytes containing the + length of the externally + stored part of the BLOB. + The 2 highest bits are + reserved for the flags below. */ +/*--------------------------------------*/ +/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */ + +/* The highest bit of BTR_EXTERN_LEN (i.e., the highest bit of the byte +at the lowest address) is set to 1 if this field does not 'own' the externally +stored field; only the owner field is allowed to free the field in purge! +If the 2nd highest bit is 1, then the externally stored field +was inherited from an earlier version of the row. In a rollback we are not +allowed to free an inherited external field. */ + +#define BTR_EXTERN_OWNER_FLAG 128 +#define BTR_EXTERN_INHERITED_FLAG 64 + +extern ulint btr_cur_n_non_sea; +extern ulint btr_cur_n_sea; +extern ulint btr_cur_n_non_sea_old; +extern ulint btr_cur_n_sea_old; + +#ifndef UNIV_NONINL +#include "btr0cur.ic" +#endif + +#endif diff --git a/storage/xtradb/include/btr0cur.ic b/storage/xtradb/include/btr0cur.ic new file mode 100644 index 00000000000..84a3a5cba0b --- /dev/null +++ b/storage/xtradb/include/btr0cur.ic @@ -0,0 +1,200 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#ifdef UNIV_DEBUG +/************************************************************* +Returns the page cursor component of a tree cursor. */ +UNIV_INLINE +page_cur_t* +btr_cur_get_page_cur( +/*=================*/ + /* out: pointer to page cursor + component */ + const btr_cur_t* cursor) /* in: tree cursor */ +{ + return(&((btr_cur_t*) cursor)->page_cur); +} +#endif /* UNIV_DEBUG */ +/************************************************************* +Returns the buffer block on which the tree cursor is positioned. */ +UNIV_INLINE +buf_block_t* +btr_cur_get_block( +/*==============*/ + /* out: pointer to buffer block */ + btr_cur_t* cursor) /* in: tree cursor */ +{ + return(page_cur_get_block(btr_cur_get_page_cur(cursor))); +} + +/************************************************************* +Returns the record pointer of a tree cursor. */ +UNIV_INLINE +rec_t* +btr_cur_get_rec( +/*============*/ + /* out: pointer to record */ + btr_cur_t* cursor) /* in: tree cursor */ +{ + return(page_cur_get_rec(&(cursor->page_cur))); +} + +/************************************************************* +Returns the compressed page on which the tree cursor is positioned. */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + /* out: pointer to compressed page, + or NULL if the page is not compressed */ + btr_cur_t* cursor) /* in: tree cursor */ +{ + return(buf_block_get_page_zip(btr_cur_get_block(cursor))); +} + +/************************************************************* +Invalidates a tree cursor by setting record pointer to NULL. */ +UNIV_INLINE +void +btr_cur_invalidate( +/*===============*/ + btr_cur_t* cursor) /* in: tree cursor */ +{ + page_cur_invalidate(&(cursor->page_cur)); +} + +/************************************************************* +Returns the page of a tree cursor. */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + /* out: pointer to page */ + btr_cur_t* cursor) /* in: tree cursor */ +{ + return(page_align(page_cur_get_rec(&(cursor->page_cur)))); +} + +/************************************************************* +Returns the index of a cursor. */ +UNIV_INLINE +dict_index_t* +btr_cur_get_index( +/*==============*/ + /* out: index */ + btr_cur_t* cursor) /* in: B-tree cursor */ +{ + return(cursor->index); +} + +/************************************************************* +Positions a tree cursor at a given record. 
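
These accessors are the sanctioned way to look inside a btr_cur_t, and btr_cur_position below is the low-level way to aim a cursor at a record that the caller already holds. A small sketch combining the two (editor's illustration; the asserts are used the way the surrounding code uses ut_a):

#include "btr0cur.h"

/* Sketch: point 'cursor' at 'rec' on 'block', then read the position
back through the accessors. */
static void
sketch_position(dict_index_t* index, rec_t* rec,
                buf_block_t* block, btr_cur_t* cursor)
{
        page_zip_des_t* page_zip;

        btr_cur_position(index, rec, block, cursor);

        ut_a(btr_cur_get_rec(cursor) == rec);
        ut_a(btr_cur_get_block(cursor) == block);
        ut_a(btr_cur_get_index(cursor) == index);

        page_zip = btr_cur_get_page_zip(cursor);

        if (page_zip != NULL) {
                /* The page has a compressed counterpart: changes must
                also be applied to it (page_zip_* routines). */
        }
}
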
*/ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record in tree */ + buf_block_t* block, /* in: buffer block of rec */ + btr_cur_t* cursor) /* out: cursor */ +{ + ut_ad(page_align(rec) == block->frame); + + page_cur_position(rec, block, btr_cur_get_page_cur(cursor)); + + cursor->index = index; +} + +/************************************************************************* +Checks if compressing an index page where a btr cursor is placed makes +sense. */ +UNIV_INLINE +ibool +btr_cur_compress_recommendation( +/*============================*/ + /* out: TRUE if compression is recommended */ + btr_cur_t* cursor, /* in: btr cursor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* page; + + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT) + || ((btr_page_get_next(page, mtr) == FIL_NULL) + && (btr_page_get_prev(page, mtr) == FIL_NULL))) { + + /* The page fillfactor has dropped below a predefined + minimum value OR the level in the B-tree contains just + one page: we recommend compression if this is not the + root page. */ + + return(dict_index_get_page(cursor->index) + != page_get_page_no(page)); + } + + return(FALSE); +} + +/************************************************************************* +Checks if the record on which the cursor is placed can be deleted without +making tree compression necessary (or, recommended). */ +UNIV_INLINE +ibool +btr_cur_can_delete_without_compress( +/*================================*/ + /* out: TRUE if can be deleted without + recommended compression */ + btr_cur_t* cursor, /* in: btr cursor */ + ulint rec_size,/* in: rec_get_size(btr_cur_get_rec(cursor))*/ + mtr_t* mtr) /* in: mtr */ +{ + page_t* page; + + ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT) + || ((btr_page_get_next(page, mtr) == FIL_NULL) + && (btr_page_get_prev(page, mtr) == FIL_NULL)) + || (page_get_n_recs(page) < 2)) { + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + return(dict_index_get_page(cursor->index) + == page_get_page_no(page)); + } + + return(TRUE); +} diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h new file mode 100644 index 00000000000..1fdd102d32a --- /dev/null +++ b/storage/xtradb/include/btr0pcur.h @@ -0,0 +1,545 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + +#ifndef btr0pcur_h +#define btr0pcur_h + +#include "univ.i" +#include "dict0dict.h" +#include "data0data.h" +#include "mtr0mtr.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0btr.h" +#include "btr0types.h" + +/* Relative positions for a stored cursor position */ +#define BTR_PCUR_ON 1 +#define BTR_PCUR_BEFORE 2 +#define BTR_PCUR_AFTER 3 +/* Note that if the tree is not empty, btr_pcur_store_position does not +use the following, but only uses the above three alternatives, where the +position is stored relative to a specific record: this makes implementation +of a scroll cursor easier */ +#define BTR_PCUR_BEFORE_FIRST_IN_TREE 4 /* in an empty tree */ +#define BTR_PCUR_AFTER_LAST_IN_TREE 5 /* in an empty tree */ + +/****************************************************************** +Allocates memory for a persistent cursor object and initializes the cursor. */ +UNIV_INTERN +btr_pcur_t* +btr_pcur_create_for_mysql(void); +/*============================*/ + /* out, own: persistent cursor */ +/****************************************************************** +Frees the memory for a persistent cursor object. */ +UNIV_INTERN +void +btr_pcur_free_for_mysql( +/*====================*/ + btr_pcur_t* cursor); /* in, own: persistent cursor */ +/****************************************************************** +Copies the stored position of a pcur to another pcur. */ +UNIV_INTERN +void +btr_pcur_copy_stored_position( +/*==========================*/ + btr_pcur_t* pcur_receive, /* in: pcur which will receive the + position info */ + btr_pcur_t* pcur_donate); /* in: pcur from which the info is + copied */ +/****************************************************************** +Sets the old_rec_buf field to NULL. */ +UNIV_INLINE +void +btr_pcur_init( +/*==========*/ + btr_pcur_t* pcur); /* in: persistent cursor */ +/****************************************************************** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +UNIV_INLINE +void +btr_pcur_open( +/*==========*/ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple, /* in: tuple on which search done */ + ulint mode, /* in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/* in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Opens a persistent cursor to an index tree without initializing the +cursor.
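
btr_pcur_open above, together with the move and close functions declared later in this header, is the backbone of an index range scan. A minimal scan sketch (editor's illustration; tuple construction and record processing are omitted, and a real scan would periodically store its position and commit the mtr instead of holding latches throughout):

#include "btr0pcur.h"

/* Sketch: visit every user record >= 'tuple' in 'index'. */
static void
sketch_scan(dict_index_t* index, const dtuple_t* tuple)
{
        btr_pcur_t      pcur;
        mtr_t           mtr;

        mtr_start(&mtr);

        /* PAGE_CUR_GE positions the cursor on the first record >=
        tuple (see the NOTE above about unique-prefix searches). */
        btr_pcur_open(index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
                      &pcur, &mtr);

        for (;;) {
                if (btr_pcur_is_on_user_rec(&pcur)) {
                        const rec_t*    rec = btr_pcur_get_rec(&pcur);

                        /* ... process rec ... */
                        (void) rec;
                }

                if (!btr_pcur_move_to_next(&pcur, &mtr)) {

                        break;  /* after last in tree */
                }
        }

        btr_pcur_close(&pcur);
        mtr_commit(&mtr);
}
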
*/ +UNIV_INLINE +void +btr_pcur_open_with_no_init( +/*=======================*/ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple, /* in: tuple on which search done */ + ulint mode, /* in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page of the + record! */ + ulint latch_mode,/* in: BTR_SEARCH_LEAF, ...; + NOTE that if has_search_latch != 0 then + we may not acquire a latch on the cursor + page, but assume that the caller uses his + btr search latch to protect the record! */ + btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */ + ulint has_search_latch,/* in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************* +Opens a persistent cursor at either end of an index. */ +UNIV_INLINE +void +btr_pcur_open_at_index_side( +/*========================*/ + ibool from_left, /* in: TRUE if open to the low end, + FALSE if to the high end */ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: latch mode */ + btr_pcur_t* pcur, /* in: cursor */ + ibool do_init, /* in: TRUE if should be initialized */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Gets the up_match value for a pcur after a search. */ +UNIV_INLINE +ulint +btr_pcur_get_up_match( +/*==================*/ + /* out: number of matched fields at the cursor + or to the right if search mode was PAGE_CUR_GE, + otherwise undefined */ + btr_pcur_t* cursor); /* in: memory buffer for persistent cursor */ +/****************************************************************** +Gets the low_match value for a pcur after a search. */ +UNIV_INLINE +ulint +btr_pcur_get_low_match( +/*===================*/ + /* out: number of matched fields at the cursor + or to the right if search mode was PAGE_CUR_LE, + otherwise undefined */ + btr_pcur_t* cursor); /* in: memory buffer for persistent cursor */ +/****************************************************************** +If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first +user record satisfying the search condition, in the case PAGE_CUR_L or +PAGE_CUR_LE, on the last user record. If no such user record exists, then +in the first case sets the cursor after last in tree, and in the latter case +before first in tree. The latching mode must be BTR_SEARCH_LEAF or +BTR_MODIFY_LEAF. */ +UNIV_INTERN +void +btr_pcur_open_on_user_rec( +/*======================*/ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple, /* in: tuple on which search done */ + ulint mode, /* in: PAGE_CUR_L, ... */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /* in: memory buffer for persistent + cursor */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INLINE +void +btr_pcur_open_at_rnd_pos( +/*=====================*/ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /* in/out: B-tree pcur */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Frees the possible old_rec_buf buffer of a persistent cursor and sets the +latch mode of the persistent cursor to BTR_NO_LATCHES.
*/ +UNIV_INLINE +void +btr_pcur_close( +/*===========*/ + btr_pcur_t* cursor); /* in: persistent cursor */ +/****************************************************************** +The position of the cursor is stored by taking an initial segment of the +record the cursor is positioned on, before, or after, and copying it to the +cursor data structure, or just setting a flag if the cursor is before the +first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the +page where the cursor is positioned must not be empty if the index tree is +not totally empty! */ +UNIV_INTERN +void +btr_pcur_store_position( +/*====================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +Restores the stored position of a persistent cursor bufferfixing the page and +obtaining the specified latches. If the cursor position was saved when the +(1) cursor was positioned on a user record: this function restores the position +to the last record LESS OR EQUAL to the stored record; +(2) cursor was positioned on a page infimum record: restores the position to +the last record LESS than the user record which was the successor of the page +infimum; +(3) cursor was positioned on the page supremum: restores to the first record +GREATER than the user record which was the predecessor of the supremum. +(4) cursor was positioned before the first or after the last in an empty tree: +restores to before first or after the last in the tree. */ +UNIV_INTERN +ibool +btr_pcur_restore_position( +/*======================*/ + /* out: TRUE if the cursor position + was stored when it was on a user record + and it can be restored on a user record + whose ordering fields are identical to + the ones of the original user record */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /* in: detached persistent cursor */ + mtr_t* mtr); /* in: mtr */ +/****************************************************************** +If the latch mode of the cursor is BTR_SEARCH_LEAF or BTR_MODIFY_LEAF, +releases the page latch and bufferfix reserved by the cursor. +NOTE! In the case of BTR_MODIFY_LEAF, there should not exist changes +made by the current mini-transaction to the data protected by the +cursor latch, as then the latch must not be released until mtr_commit. */ +UNIV_INTERN +void +btr_pcur_release_leaf( +/*==================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Gets the rel_pos field for a cursor whose position has been stored. */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + /* out: BTR_PCUR_ON, ... */ + const btr_pcur_t* cursor);/* in: persistent cursor */ +/************************************************************* +Sets the mtr field for a pcur. */ +UNIV_INLINE +void +btr_pcur_set_mtr( +/*=============*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr); /* in, own: mtr */ +/************************************************************* +Gets the mtr field for a pcur. */ +UNIV_INLINE +mtr_t* +btr_pcur_get_mtr( +/*=============*/ + /* out: mtr */ + btr_pcur_t* cursor); /* in: persistent cursor */ +/****************************************************************** +Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES, +that is, the cursor becomes detached.
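
btr_pcur_store_position and btr_pcur_restore_position above are what make the cursor 'persistent': a scan can give up every latch (so that control can return to MySQL between rows, or pages can be evicted) and later resume where it stopped. The canonical suspend/resume pattern (editor's sketch; one mtr object is reused across the gap for brevity):

#include "btr0pcur.h"

/* Sketch: suspend a positioned persistent cursor, then resume it. */
static void
sketch_suspend_resume(btr_pcur_t* pcur, mtr_t* mtr)
{
        /* Remember the current record, or our position relative to
        it, inside the cursor object... */
        btr_pcur_store_position(pcur, mtr);

        /* ...then commit the mtr: all latches go, and the cursor
        becomes detached. */
        btr_pcur_commit_specify_mtr(pcur, mtr);

        /* Much later: restart a mini-transaction and re-latch. A
        FALSE return means the exact record could not be restored
        (e.g. it was purged); the cursor is then on the closest
        position <= the stored one and must be re-checked. */
        mtr_start(mtr);

        if (!btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr)) {
                /* re-validate the record under the cursor here */
        }
}
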
If there have been modifications +to the page where pcur is positioned, this can be used instead of +btr_pcur_release_leaf. Function btr_pcur_store_position should be used +before calling this, if restoration of cursor is wanted later. */ +UNIV_INLINE +void +btr_pcur_commit( +/*============*/ + btr_pcur_t* pcur); /* in: persistent cursor */ +/****************************************************************** +Differs from btr_pcur_commit in that we can specify the mtr to commit. */ +UNIV_INLINE +void +btr_pcur_commit_specify_mtr( +/*========================*/ + btr_pcur_t* pcur, /* in: persistent cursor */ + mtr_t* mtr); /* in: mtr to commit */ +/****************************************************************** +Tests if a cursor is detached: that is the latch mode is BTR_NO_LATCHES. */ +UNIV_INLINE +ibool +btr_pcur_is_detached( +/*=================*/ + /* out: TRUE if detached */ + btr_pcur_t* pcur); /* in: persistent cursor */ +/************************************************************* +Moves the persistent cursor to the next record in the tree. If no records are +left, the cursor stays 'after last in tree'. */ +UNIV_INLINE +ibool +btr_pcur_move_to_next( +/*==================*/ + /* out: TRUE if the cursor was not after last + in tree */ + btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. */ +UNIV_INTERN +ibool +btr_pcur_move_to_prev( +/*==================*/ + /* out: TRUE if the cursor was not before first + in tree */ + btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Moves the persistent cursor to the last record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_last_on_page( +/*==========================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Moves the persistent cursor to the next user record in the tree. If no user +records are left, the cursor ends up 'after last in tree'. */ +UNIV_INLINE +ibool +btr_pcur_move_to_next_user_rec( +/*===========================*/ + /* out: TRUE if the cursor moved forward, + ending on a user record */ + btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Moves the persistent cursor to the first record on the next page. +Releases the latch on the current page, and bufferunfixes it. +Note that there must not be modifications on the current page, +as then the x-latch can be released only in mtr_commit. */ +UNIV_INTERN +void +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /* in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Moves the persistent cursor backward if it is on the first record +of the page. Releases the latch on the current page, and bufferunfixes +it. 
Note that to prevent a possible deadlock, the operation first +stores the position of the cursor, releases the leaf latch, acquires +necessary latches and restores the cursor position again before returning. +The alphabetical position of the cursor is guaranteed to be sensible +on return, but it may happen that the cursor is not positioned on the +last record of any page, because the structure of the tree may have +changed while the cursor had no latches. */ +UNIV_INTERN +void +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /* in: persistent cursor, must be on the + first record of the current page */ + mtr_t* mtr); /* in: mtr */ +#ifdef UNIV_DEBUG +/************************************************************* +Returns the btr cursor component of a persistent cursor. */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + /* out: pointer to + btr cursor component */ + const btr_pcur_t* cursor); /* in: persistent cursor */ +/************************************************************* +Returns the page cursor component of a persistent cursor. */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + /* out: pointer to + page cursor component */ + const btr_pcur_t* cursor); /* in: persistent cursor */ +#else /* UNIV_DEBUG */ +# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur) +# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur) +#endif /* UNIV_DEBUG */ +/************************************************************* +Returns the page of a persistent cursor. */ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + /* out: pointer to the page */ + btr_pcur_t* cursor);/* in: persistent cursor */ +/************************************************************* +Returns the buffer block of a persistent cursor. */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + /* out: pointer to the block */ + btr_pcur_t* cursor);/* in: persistent cursor */ +/************************************************************* +Returns the record of a persistent cursor. */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + /* out: pointer to the record */ + btr_pcur_t* cursor);/* in: persistent cursor */ +/************************************************************* +Checks if the persistent cursor is on a user record. */ +UNIV_INLINE +ibool +btr_pcur_is_on_user_rec( +/*====================*/ + const btr_pcur_t* cursor);/* in: persistent cursor */ +/************************************************************* +Checks if the persistent cursor is after the last user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_on_page( +/*===========================*/ + const btr_pcur_t* cursor);/* in: persistent cursor */ +/************************************************************* +Checks if the persistent cursor is before the first user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_on_page( +/*=============================*/ + const btr_pcur_t* cursor);/* in: persistent cursor */ +/************************************************************* +Checks if the persistent cursor is before the first user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_in_tree( +/*=============================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Checks if the persistent cursor is after the last user record in +the index tree. 
*/ +UNIV_INLINE +ibool +btr_pcur_is_after_last_in_tree( +/*===========================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr); /* in: mtr */ +/************************************************************* +Moves the persistent cursor to the next record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_next_on_page( +/*==========================*/ + btr_pcur_t* cursor);/* in/out: persistent cursor */ +/************************************************************* +Moves the persistent cursor to the previous record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_prev_on_page( +/*==========================*/ + btr_pcur_t* cursor);/* in/out: persistent cursor */ + + +/* The persistent B-tree cursor structure. This is used mainly for SQL +selects, updates, and deletes. */ + +struct btr_pcur_struct{ + btr_cur_t btr_cur; /* a B-tree cursor */ + ulint latch_mode; /* see TODO note below! + BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, + BTR_MODIFY_TREE, or BTR_NO_LATCHES, + depending on the latching state of + the page and tree where the cursor is + positioned; the last value means that + the cursor is not currently positioned: + we say then that the cursor is + detached; it can be restored to + attached if the old position was + stored in old_rec */ + ulint old_stored; /* BTR_PCUR_OLD_STORED + or BTR_PCUR_OLD_NOT_STORED */ + rec_t* old_rec; /* if cursor position is stored, + contains an initial segment of the + latest record cursor was positioned + either on, before, or after */ + ulint old_n_fields; /* number of fields in old_rec */ + ulint rel_pos; /* BTR_PCUR_ON, BTR_PCUR_BEFORE, or + BTR_PCUR_AFTER, depending on whether + cursor was on, before, or after the + old_rec record */ + buf_block_t* block_when_stored;/* buffer block when the position was + stored */ + ib_uint64_t modify_clock; /* the modify clock value of the + buffer block when the cursor position + was stored */ + ulint pos_state; /* see TODO note below! + BTR_PCUR_IS_POSITIONED, + BTR_PCUR_WAS_POSITIONED, + BTR_PCUR_NOT_POSITIONED */ + ulint search_mode; /* PAGE_CUR_G, ... */ + trx_t* trx_if_known; /* the transaction, if we know it; + otherwise this field is not defined; + can ONLY BE USED in error prints in + fatal assertion failures! */ + /*-----------------------------*/ + /* NOTE that the following fields may possess dynamically allocated + memory which should be freed if not needed anymore! */ + + mtr_t* mtr; /* NULL, or this field may contain + a mini-transaction which holds the + latch on the cursor page */ + byte* old_rec_buf; /* NULL, or a dynamically allocated + buffer for old_rec */ + ulint buf_size; /* old_rec_buf size if old_rec_buf + is not NULL */ +}; + +#define BTR_PCUR_IS_POSITIONED 1997660512 /* TODO: currently, the state + can be BTR_PCUR_IS_POSITIONED, + though it really should be + BTR_PCUR_WAS_POSITIONED, + because we have no obligation + to commit the cursor with + mtr; similarly latch_mode may + be out of date. This can + lead to problems if btr_pcur + is not used the right way; + all current code should be + ok. 
*/ +#define BTR_PCUR_WAS_POSITIONED 1187549791 +#define BTR_PCUR_NOT_POSITIONED 1328997689 + +#define BTR_PCUR_OLD_STORED 908467085 +#define BTR_PCUR_OLD_NOT_STORED 122766467 + +#ifndef UNIV_NONINL +#include "btr0pcur.ic" +#endif + +#endif diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic new file mode 100644 index 00000000000..bde7413820a --- /dev/null +++ b/storage/xtradb/include/btr0pcur.ic @@ -0,0 +1,656 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + + +/************************************************************* +Gets the rel_pos field for a cursor whose position has been stored. */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + /* out: BTR_PCUR_ON, ... */ + const btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor); + ut_ad(cursor->old_rec); + ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(cursor->rel_pos); +} + +/************************************************************* +Sets the mtr field for a pcur. */ +UNIV_INLINE +void +btr_pcur_set_mtr( +/*=============*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr) /* in, own: mtr */ +{ + ut_ad(cursor); + + cursor->mtr = mtr; +} + +/************************************************************* +Gets the mtr field for a pcur. */ +UNIV_INLINE +mtr_t* +btr_pcur_get_mtr( +/*=============*/ + /* out: mtr */ + btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor); + + return(cursor->mtr); +} + +#ifdef UNIV_DEBUG +/************************************************************* +Returns the btr cursor component of a persistent cursor. */ +UNIV_INLINE +btr_cur_t* +btr_pcur_get_btr_cur( +/*=================*/ + /* out: pointer to + btr cursor component */ + const btr_pcur_t* cursor) /* in: persistent cursor */ +{ + const btr_cur_t* btr_cur = &cursor->btr_cur; + return((btr_cur_t*) btr_cur); +} + +/************************************************************* +Returns the page cursor component of a persistent cursor. */ +UNIV_INLINE +page_cur_t* +btr_pcur_get_page_cur( +/*==================*/ + /* out: pointer to page cursor + component */ + const btr_pcur_t* cursor) /* in: persistent cursor */ +{ + return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor))); +} +#endif /* UNIV_DEBUG */ +/************************************************************* +Returns the page of a persistent cursor. 
*/ +UNIV_INLINE +page_t* +btr_pcur_get_page( +/*==============*/ + /* out: pointer to the page */ + btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor))); +} + +/************************************************************* +Returns the buffer block of a persistent cursor. */ +UNIV_INLINE +buf_block_t* +btr_pcur_get_block( +/*===============*/ + /* out: pointer to the block */ + btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor))); +} + +/************************************************************* +Returns the record of a persistent cursor. */ +UNIV_INLINE +rec_t* +btr_pcur_get_rec( +/*=============*/ + /* out: pointer to the record */ + btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor))); +} + +/****************************************************************** +Gets the up_match value for a pcur after a search. */ +UNIV_INLINE +ulint +btr_pcur_get_up_match( +/*==================*/ + /* out: number of matched fields at the cursor + or to the right if search mode was PAGE_CUR_GE, + otherwise undefined */ + btr_pcur_t* cursor) /* in: memory buffer for persistent cursor */ +{ + btr_cur_t* btr_cursor; + + ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED) + || (cursor->pos_state == BTR_PCUR_IS_POSITIONED)); + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + ut_ad(btr_cursor->up_match != ULINT_UNDEFINED); + + return(btr_cursor->up_match); +} + +/****************************************************************** +Gets the low_match value for a pcur after a search. */ +UNIV_INLINE +ulint +btr_pcur_get_low_match( +/*===================*/ + /* out: number of matched fields at the cursor + or to the right if search mode was PAGE_CUR_LE, + otherwise undefined */ + btr_pcur_t* cursor) /* in: memory buffer for persistent cursor */ +{ + btr_cur_t* btr_cursor; + + ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED) + || (cursor->pos_state == BTR_PCUR_IS_POSITIONED)); + + btr_cursor = btr_pcur_get_btr_cur(cursor); + ut_ad(btr_cursor->low_match != ULINT_UNDEFINED); + + return(btr_cursor->low_match); +} + +/************************************************************* +Checks if the persistent cursor is after the last user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_on_page( +/*===========================*/ + const btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor))); +} + +/************************************************************* +Checks if the persistent cursor is before the first user record on +a page. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_on_page( +/*=============================*/ + const btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor))); +} + +/************************************************************* +Checks if the persistent cursor is on a user record. 
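
The up_match and low_match accessors above let a caller decide whether a search actually hit its key without comparing the record again. For example, after a PAGE_CUR_GE search on a unique index (editor's sketch; note the caveat in btr0cur.h that the match values can be misleading when the adjacent record is on another leaf page):

#include "btr0pcur.h"
#include "dict0dict.h"

/* Sketch: TRUE if the PAGE_CUR_GE search that positioned 'pcur'
landed on a record whose first n_unique fields equal the search
tuple. */
static ibool
sketch_found_exact(btr_pcur_t* pcur, dict_index_t* index)
{
        return(btr_pcur_is_on_user_rec(pcur)
               && btr_pcur_get_up_match(pcur)
               == dict_index_get_n_unique(index));
}
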
*/ +UNIV_INLINE +ibool +btr_pcur_is_on_user_rec( +/*====================*/ + const btr_pcur_t* cursor) /* in: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_pcur_is_before_first_on_page(cursor) + || btr_pcur_is_after_last_on_page(cursor)) { + + return(FALSE); + } + + return(TRUE); +} + +/************************************************************* +Checks if the persistent cursor is before the first user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_before_first_in_tree( +/*=============================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { + + return(FALSE); + } + + return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor))); +} + +/************************************************************* +Checks if the persistent cursor is after the last user record in +the index tree. */ +UNIV_INLINE +ibool +btr_pcur_is_after_last_in_tree( +/*===========================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) { + + return(FALSE); + } + + return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor))); +} + +/************************************************************* +Moves the persistent cursor to the next record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_next_on_page( +/*==========================*/ + btr_pcur_t* cursor) /* in/out: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_move_to_next(btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/************************************************************* +Moves the persistent cursor to the previous record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_prev_on_page( +/*==========================*/ + btr_pcur_t* cursor) /* in/out: persistent cursor */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_move_to_prev(btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/************************************************************* +Moves the persistent cursor to the last record on the same page. */ +UNIV_INLINE +void +btr_pcur_move_to_last_on_page( +/*==========================*/ + btr_pcur_t* cursor, /* in: persistent cursor */ + mtr_t* mtr) /* in: mtr */ +{ + UT_NOT_USED(mtr); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_after_last(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +} + +/************************************************************* +Moves the persistent cursor to the next user record in the tree. If no user +records are left, the cursor ends up 'after last in tree'. 
*/ +UNIV_INLINE +ibool +btr_pcur_move_to_next_user_rec( +/*===========================*/ + /* out: TRUE if the cursor moved forward, + ending on a user record */ + btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; +loop: + if (btr_pcur_is_after_last_on_page(cursor)) { + + if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_to_next_page(cursor, mtr); + } else { + btr_pcur_move_to_next_on_page(cursor); + } + + if (btr_pcur_is_on_user_rec(cursor)) { + + return(TRUE); + } + + goto loop; +} + +/************************************************************* +Moves the persistent cursor to the next record in the tree. If no records are +left, the cursor stays 'after last in tree'. */ +UNIV_INLINE +ibool +btr_pcur_move_to_next( +/*==================*/ + /* out: TRUE if the cursor was not after last + in tree */ + btr_pcur_t* cursor, /* in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + if (btr_pcur_is_after_last_on_page(cursor)) { + + if (btr_pcur_is_after_last_in_tree(cursor, mtr)) { + + return(FALSE); + } + + btr_pcur_move_to_next_page(cursor, mtr); + + return(TRUE); + } + + btr_pcur_move_to_next_on_page(cursor); + + return(TRUE); +} + +/****************************************************************** +Commits the pcur mtr and sets the pcur latch mode to BTR_NO_LATCHES, +that is, the cursor becomes detached. If there have been modifications +to the page where pcur is positioned, this can be used instead of +btr_pcur_release_leaf. Function btr_pcur_store_position should be used +before calling this, if restoration of cursor is wanted later. */ +UNIV_INLINE +void +btr_pcur_commit( +/*============*/ + btr_pcur_t* pcur) /* in: persistent cursor */ +{ + ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED); + + pcur->latch_mode = BTR_NO_LATCHES; + + mtr_commit(pcur->mtr); + + pcur->pos_state = BTR_PCUR_WAS_POSITIONED; +} + +/****************************************************************** +Differs from btr_pcur_commit in that we can specify the mtr to commit. */ +UNIV_INLINE +void +btr_pcur_commit_specify_mtr( +/*========================*/ + btr_pcur_t* pcur, /* in: persistent cursor */ + mtr_t* mtr) /* in: mtr to commit */ +{ + ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED); + + pcur->latch_mode = BTR_NO_LATCHES; + + mtr_commit(mtr); + + pcur->pos_state = BTR_PCUR_WAS_POSITIONED; +} + +/****************************************************************** +Sets the pcur latch mode to BTR_NO_LATCHES. */ +UNIV_INLINE +void +btr_pcur_detach( +/*============*/ + btr_pcur_t* pcur) /* in: persistent cursor */ +{ + ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED); + + pcur->latch_mode = BTR_NO_LATCHES; + + pcur->pos_state = BTR_PCUR_WAS_POSITIONED; +} + +/****************************************************************** +Tests if a cursor is detached: that is the latch mode is BTR_NO_LATCHES. 
*/ +UNIV_INLINE +ibool +btr_pcur_is_detached( +/*=================*/ + /* out: TRUE if detached */ + btr_pcur_t* pcur) /* in: persistent cursor */ +{ + if (pcur->latch_mode == BTR_NO_LATCHES) { + + return(TRUE); + } + + return(FALSE); +} + +/****************************************************************** +Sets the old_rec_buf field to NULL. */ +UNIV_INLINE +void +btr_pcur_init( +/*==========*/ + btr_pcur_t* pcur) /* in: persistent cursor */ +{ + pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; + pcur->old_rec_buf = NULL; + pcur->old_rec = NULL; +} + +/****************************************************************** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +UNIV_INLINE +void +btr_pcur_open( +/*==========*/ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple, /* in: tuple on which search done */ + ulint mode, /* in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page from the + record! */ + ulint latch_mode,/* in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */ + mtr_t* mtr) /* in: mtr */ +{ + btr_cur_t* btr_cursor; + + /* Initialize the cursor */ + + btr_pcur_init(cursor); + + cursor->latch_mode = latch_mode; + cursor->search_mode = mode; + + /* Search with the tree cursor */ + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, + btr_cursor, 0, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + cursor->trx_if_known = NULL; +} + +/****************************************************************** +Opens a persistent cursor to an index tree without initializing the +cursor. */ +UNIV_INLINE +void +btr_pcur_open_with_no_init( +/*=======================*/ + dict_index_t* index, /* in: index */ + const dtuple_t* tuple, /* in: tuple on which search done */ + ulint mode, /* in: PAGE_CUR_L, ...; + NOTE that if the search is made using a unique + prefix of a record, mode should be + PAGE_CUR_LE, not PAGE_CUR_GE, as the latter + may end up on the previous page of the + record! */ + ulint latch_mode,/* in: BTR_SEARCH_LEAF, ...; + NOTE that if has_search_latch != 0 then + we might not acquire a latch on the cursor + page, but assume that the caller uses his + btr search latch to protect the record! */ + btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */ + ulint has_search_latch,/* in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, or 0 */ + mtr_t* mtr) /* in: mtr */ +{ + btr_cur_t* btr_cursor; + + cursor->latch_mode = latch_mode; + cursor->search_mode = mode; + + /* Search with the tree cursor */ + + btr_cursor = btr_pcur_get_btr_cur(cursor); + + btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode, + btr_cursor, has_search_latch, mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->trx_if_known = NULL; +} + +/********************************************************************* +Opens a persistent cursor at either end of an index. 
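+
+For example, a caller could start a full index scan from the low end
+along these lines (an illustrative fragment; declarations and error
+handling omitted):
+
+	mtr_start(&mtr);
+	btr_pcur_open_at_index_side(TRUE, index, BTR_SEARCH_LEAF,
+				    &pcur, TRUE, &mtr);
+	/* the cursor is now before the first record in the tree;
+	step onto the first user record, if any */
+	btr_pcur_move_to_next_user_rec(&pcur, &mtr);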
*/ +UNIV_INLINE +void +btr_pcur_open_at_index_side( +/*========================*/ + ibool from_left, /* in: TRUE if open to the low end, + FALSE if to the high end */ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: latch mode */ + btr_pcur_t* pcur, /* in: cursor */ + ibool do_init, /* in: TRUE if should be initialized */ + mtr_t* mtr) /* in: mtr */ +{ + pcur->latch_mode = latch_mode; + + if (from_left) { + pcur->search_mode = PAGE_CUR_G; + } else { + pcur->search_mode = PAGE_CUR_L; + } + + if (do_init) { + btr_pcur_init(pcur); + } + + btr_cur_open_at_index_side(from_left, index, latch_mode, + btr_pcur_get_btr_cur(pcur), mtr); + pcur->pos_state = BTR_PCUR_IS_POSITIONED; + + pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; + + pcur->trx_if_known = NULL; +} + +/************************************************************************** +Positions a cursor at a randomly chosen position within a B-tree. */ +UNIV_INLINE +void +btr_pcur_open_at_rnd_pos( +/*=====================*/ + dict_index_t* index, /* in: index */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */ + btr_pcur_t* cursor, /* in/out: B-tree pcur */ + mtr_t* mtr) /* in: mtr */ +{ + /* Initialize the cursor */ + + cursor->latch_mode = latch_mode; + cursor->search_mode = PAGE_CUR_G; + + btr_pcur_init(cursor); + + btr_cur_open_at_rnd_pos(index, latch_mode, + btr_pcur_get_btr_cur(cursor), mtr); + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->trx_if_known = NULL; +} + +/****************************************************************** +Frees the possible memory heap of a persistent cursor and sets the latch +mode of the persistent cursor to BTR_NO_LATCHES. */ +UNIV_INLINE +void +btr_pcur_close( +/*===========*/ + btr_pcur_t* cursor) /* in: persistent cursor */ +{ + if (cursor->old_rec_buf != NULL) { + + mem_free(cursor->old_rec_buf); + + cursor->old_rec = NULL; + cursor->old_rec_buf = NULL; + } + + cursor->btr_cur.page_cur.rec = NULL; + cursor->btr_cur.page_cur.block = NULL; + cursor->old_rec = NULL; + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + cursor->latch_mode = BTR_NO_LATCHES; + cursor->pos_state = BTR_PCUR_NOT_POSITIONED; + + cursor->trx_if_known = NULL; +} diff --git a/storage/xtradb/include/btr0sea.h b/storage/xtradb/include/btr0sea.h new file mode 100644 index 00000000000..074e6595258 --- /dev/null +++ b/storage/xtradb/include/btr0sea.h @@ -0,0 +1,298 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0sea_h +#define btr0sea_h + +#include "univ.i" + +#include "rem0rec.h" +#include "dict0dict.h" +#include "btr0types.h" +#include "mtr0mtr.h" +#include "ha0ha.h" + +/********************************************************************* +Creates and initializes the adaptive search system at a database start. */ +UNIV_INTERN +void +btr_search_sys_create( +/*==================*/ + ulint hash_size); /* in: hash index hash table size */ + +/************************************************************************ +Disable the adaptive hash search system and empty the index. */ +UNIV_INTERN +void +btr_search_disable(void); +/*====================*/ +/************************************************************************ +Enable the adaptive hash search system. */ +UNIV_INTERN +void +btr_search_enable(void); +/*====================*/ + +/************************************************************************ +Returns search info for an index. */ +UNIV_INLINE +btr_search_t* +btr_search_get_info( +/*================*/ + /* out: search info; search mutex reserved */ + dict_index_t* index); /* in: index */ +/********************************************************************* +Creates and initializes a search info struct. */ +UNIV_INTERN +btr_search_t* +btr_search_info_create( +/*===================*/ + /* out, own: search info struct */ + mem_heap_t* heap); /* in: heap where created */ +/********************************************************************* +Returns the value of ref_count. The value is protected by +btr_search_latch. */ +UNIV_INTERN +ulint +btr_search_info_get_ref_count( +/*==========================*/ + /* out: ref_count value. */ + btr_search_t* info); /* in: search info. */ +/************************************************************************* +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /* in: index of the cursor */ + btr_cur_t* cursor);/* in: cursor which was just positioned */ +/********************************************************************** +Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. */ +UNIV_INTERN +ibool +btr_search_guess_on_hash( +/*=====================*/ + /* out: TRUE if succeeded */ + dict_index_t* index, /* in: index */ + btr_search_t* info, /* in: index search info */ + const dtuple_t* tuple, /* in: logical record */ + ulint mode, /* in: PAGE_CUR_L, ... */ + ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */ + btr_cur_t* cursor, /* out: tree cursor */ + ulint has_search_latch,/* in: latch mode the caller + currently has on btr_search_latch: + RW_S_LATCH, RW_X_LATCH, or 0 */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************ +Moves or deletes hash entries for moved records. 
If new_page is already hashed, +then the hash index for page, if any, is dropped. If new_page is not hashed, +and page is hashed, then a new hash index is built to new_page with the same +parameters as page (this often happens when a page is split). */ +UNIV_INTERN +void +btr_search_move_or_delete_hash_entries( +/*===================================*/ + buf_block_t* new_block, /* in: records are copied + to this page */ + buf_block_t* block, /* in: index page from which + records were copied, and the + copied records will be deleted + from this page */ + dict_index_t* index); /* in: record descriptor */ +/************************************************************************ +Drops a page hash index. */ +UNIV_INTERN +void +btr_search_drop_page_hash_index( +/*============================*/ + buf_block_t* block); /* in: block containing index page, + s- or x-latched, or an index page + for which we know that + block->buf_fix_count == 0 */ +/************************************************************************ +Drops a page hash index when a page is freed from a fseg to the file system. +Drops possible hash index if the page happens to be in the buffer pool. */ +UNIV_INTERN +void +btr_search_drop_page_hash_when_freed( +/*=================================*/ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no); /* in: page number */ +/************************************************************************ +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_node_on_insert( +/*==================================*/ + btr_cur_t* cursor);/* in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +/************************************************************************ +Updates the page hash index when a single record is inserted on a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_insert( +/*=============================*/ + btr_cur_t* cursor);/* in: cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor */ +/************************************************************************ +Updates the page hash index when a single record is deleted from a page. */ +UNIV_INTERN +void +btr_search_update_hash_on_delete( +/*=============================*/ + btr_cur_t* cursor);/* in: cursor which was positioned on the + record to delete using btr_cur_search_..., + the record is not yet deleted */ +/************************************************************************ +Validates the search system. */ +UNIV_INTERN +ibool +btr_search_validate(void); +/*======================*/ + /* out: TRUE if ok */ + +/* Flag: has the search system been enabled? +Protected by btr_search_latch and btr_search_enabled_mutex. */ +extern char btr_search_enabled; + +/* The search info struct in an index */ + +struct btr_search_struct{ + ulint ref_count; /* Number of blocks in this index tree + that have search index built + i.e. block->index points to this index. + Protected by btr_search_latch except + when during initialization in + btr_search_info_create(). */ + + /* The following fields are not protected by any latch. + Unfortunately, this means that they must be aligned to + the machine word, i.e., they cannot be turned into bit-fields. 
*/ + buf_block_t* root_guess;/* the root page frame when it was last time + fetched, or NULL */ + ulint hash_analysis; /* when this exceeds BTR_SEARCH_HASH_ANALYSIS, + the hash analysis starts; this is reset if no + success noticed */ + ibool last_hash_succ; /* TRUE if the last search would have + succeeded, or did succeed, using the hash + index; NOTE that the value here is not exact: + it is not calculated for every search, and the + calculation itself is not always accurate! */ + ulint n_hash_potential; + /* number of consecutive searches + which would have succeeded, or did succeed, + using the hash index; + the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */ + /*----------------------*/ + ulint n_fields; /* recommended prefix length for hash search: + number of full fields */ + ulint n_bytes; /* recommended prefix: number of bytes in + an incomplete field; + see also BTR_PAGE_MAX_REC_SIZE */ + ibool left_side; /* TRUE or FALSE, depending on whether + the leftmost record of several records with + the same prefix should be indexed in the + hash index */ + /*----------------------*/ +#ifdef UNIV_SEARCH_PERF_STAT + ulint n_hash_succ; /* number of successful hash searches thus + far */ + ulint n_hash_fail; /* number of failed hash searches */ + ulint n_patt_succ; /* number of successful pattern searches thus + far */ + ulint n_searches; /* number of searches */ +#endif /* UNIV_SEARCH_PERF_STAT */ +#ifdef UNIV_DEBUG + ulint magic_n; /* magic number */ +# define BTR_SEARCH_MAGIC_N 1112765 +#endif /* UNIV_DEBUG */ +}; + +/* The hash index system */ + +typedef struct btr_search_sys_struct btr_search_sys_t; + +struct btr_search_sys_struct{ + hash_table_t* hash_index; +}; + +extern btr_search_sys_t* btr_search_sys; + +/* The latch protecting the adaptive search system: this latch protects the +(1) hash index; +(2) columns of a record to which we have a pointer in the hash index; + +but does NOT protect: + +(3) next record offset field in a record; +(4) next or previous records on the same page. + +Bear in mind (3) and (4) when using the hash index. +*/ + +extern rw_lock_t* btr_search_latch_temp; + +#define btr_search_latch (*btr_search_latch_temp) + +#ifdef UNIV_SEARCH_PERF_STAT +extern ulint btr_search_n_succ; +extern ulint btr_search_n_hash_fail; +#endif /* UNIV_SEARCH_PERF_STAT */ + +/* After change in n_fields or n_bytes in info, this many rounds are waited +before starting the hash analysis again: this is to save CPU time when there +is no hope in building a hash index. */ + +#define BTR_SEARCH_HASH_ANALYSIS 17 + +/* Limit of consecutive searches for trying a search shortcut on the search +pattern */ + +#define BTR_SEARCH_ON_PATTERN_LIMIT 3 + +/* Limit of consecutive searches for trying a search shortcut using the hash +index */ + +#define BTR_SEARCH_ON_HASH_LIMIT 3 + +/* We do this many searches before trying to keep the search latch over calls +from MySQL. If we notice someone waiting for the latch, we again set this +much timeout. This is to reduce contention. */ + +#define BTR_SEA_TIMEOUT 10000 + +#ifndef UNIV_NONINL +#include "btr0sea.ic" +#endif + +#endif diff --git a/storage/xtradb/include/btr0sea.ic b/storage/xtradb/include/btr0sea.ic new file mode 100644 index 00000000000..c948d7e92af --- /dev/null +++ b/storage/xtradb/include/btr0sea.ic @@ -0,0 +1,83 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "dict0mem.h" +#include "btr0cur.h" +#include "buf0buf.h" + +/************************************************************************* +Updates the search info. */ +UNIV_INTERN +void +btr_search_info_update_slow( +/*========================*/ + btr_search_t* info, /* in/out: search info */ + btr_cur_t* cursor);/* in: cursor which was just positioned */ + +/************************************************************************ +Returns search info for an index. */ +UNIV_INLINE +btr_search_t* +btr_search_get_info( +/*================*/ + /* out: search info; search mutex reserved */ + dict_index_t* index) /* in: index */ +{ + ut_ad(index); + + return(index->search_info); +} + +/************************************************************************* +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /* in: index of the cursor */ + btr_cur_t* cursor) /* in: cursor which was just positioned */ +{ + btr_search_t* info; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + info = btr_search_get_info(index); + + info->hash_analysis++; + + if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) { + + /* Do nothing */ + + return; + + } + + ut_ad(cursor->flag != BTR_CUR_HASH); + + btr_search_info_update_slow(info, cursor); +} diff --git a/storage/xtradb/include/btr0types.h b/storage/xtradb/include/btr0types.h new file mode 100644 index 00000000000..074b15fa68d --- /dev/null +++ b/storage/xtradb/include/btr0types.h @@ -0,0 +1,47 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The index tree general types + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0types_h +#define btr0types_h + +#include "univ.i" + +#include "rem0types.h" +#include "page0types.h" + +typedef struct btr_pcur_struct btr_pcur_t; +typedef struct btr_cur_struct btr_cur_t; +typedef struct btr_search_struct btr_search_t; + +/* The size of a reference to data stored on a different page. +The reference is stored at the end of the prefix of the field +in the index record. */ +#define BTR_EXTERN_FIELD_REF_SIZE 20 + +/* A BLOB field reference full of zero, for use in assertions and tests. +Initially, BLOB field references are set to zero, in +dtuple_convert_big_rec(). */ +extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]; + +#endif diff --git a/storage/xtradb/include/buf0buddy.h b/storage/xtradb/include/buf0buddy.h new file mode 100644 index 00000000000..f3e593151b5 --- /dev/null +++ b/storage/xtradb/include/buf0buddy.h @@ -0,0 +1,89 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#ifndef buf0buddy_h +#define buf0buddy_h + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "univ.i" +#include "buf0types.h" + +/************************************************************************** +Allocate a block. The thread calling this function must hold +buf_pool_mutex and must not hold buf_pool_zip_mutex or any +block->mutex. The buf_pool_mutex may only be released and reacquired +if lru != NULL. This function should only be used for allocating +compressed page frames or control blocks (buf_page_t). Allocated +control blocks must be properly initialized immediately after +buf_buddy_alloc() has returned the memory, before releasing +buf_pool_mutex. 
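+
+A possible call pattern, as a sketch only ('zip_size' stands for the
+compressed page size the caller needs):
+
+	ibool	lru	= FALSE;
+	void*	frame;
+
+	buf_pool_mutex_enter();
+	frame = buf_buddy_alloc(zip_size, &lru);
+	/* if lru was set to TRUE, buf_pool_mutex was released and
+	reacquired during the allocation */
+	...
+	buf_pool_mutex_exit();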
*/ +UNIV_INLINE +void* +buf_buddy_alloc( +/*============*/ + /* out: allocated block, + possibly NULL if lru == NULL */ + ulint size, /* in: block size, up to UNIV_PAGE_SIZE */ + ibool* lru) /* in: pointer to a variable that will be assigned + TRUE if storage was allocated from the LRU list + and buf_pool_mutex was temporarily released, + or NULL if the LRU list should not be used */ + __attribute__((malloc)); + +/************************************************************************** +Release a block. */ +UNIV_INLINE +void +buf_buddy_free( +/*===========*/ + void* buf, /* in: block to be freed, must not be + pointed to by the buffer pool */ + ulint size) /* in: block size, up to UNIV_PAGE_SIZE */ + __attribute__((nonnull)); + +/** Statistics of buddy blocks of a given size. */ +struct buf_buddy_stat_struct { + /** Number of blocks allocated from the buddy system. */ + ulint used; + /** Number of blocks relocated by the buddy system. */ + ib_uint64_t relocated; + /** Total duration of block relocations, in microseconds. */ + ib_uint64_t relocated_usec; +}; + +typedef struct buf_buddy_stat_struct buf_buddy_stat_t; + +/** Statistics of the buddy system, indexed by block size. +Protected by buf_pool_mutex. */ +extern buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES + 1]; + +#ifndef UNIV_NONINL +# include "buf0buddy.ic" +#endif + +#endif /* buf0buddy_h */ diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic new file mode 100644 index 00000000000..769b9d11d94 --- /dev/null +++ b/storage/xtradb/include/buf0buddy.ic @@ -0,0 +1,129 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Binary buddy allocator for compressed pages + +Created December 2006 by Marko Makela +*******************************************************/ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "buf0buf.h" +#include "buf0buddy.h" +#include "ut0ut.h" +#include "sync0sync.h" + +/************************************************************************** +Allocate a block. The thread calling this function must hold +buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex. +The buf_pool_mutex may only be released and reacquired if lru != NULL. 
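+
+Note that the index i corresponds to a block size of BUF_BUDDY_LOW << i
+bytes: i == 0 requests the smallest buddy block, and i == BUF_BUDDY_SIZES
+a whole UNIV_PAGE_SIZE block (see buf_buddy_get_slot() below for the
+inverse mapping from a byte size to an index).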
*/ +UNIV_INTERN +void* +buf_buddy_alloc_low( +/*================*/ + /* out: allocated block, + possibly NULL if lru==NULL */ + ulint i, /* in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + ibool* lru) /* in: pointer to a variable that will be assigned + TRUE if storage was allocated from the LRU list + and buf_pool_mutex was temporarily released, + or NULL if the LRU list should not be used */ + __attribute__((malloc)); + +/************************************************************************** +Deallocate a block. */ +UNIV_INTERN +void +buf_buddy_free_low( +/*===============*/ + void* buf, /* in: block to be freed, must not be + pointed to by the buffer pool */ + ulint i) /* in: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + __attribute__((nonnull)); + +/************************************************************************** +Get the index of buf_pool->zip_free[] for a given block size. */ +UNIV_INLINE +ulint +buf_buddy_get_slot( +/*===============*/ + /* out: index of buf_pool->zip_free[], + or BUF_BUDDY_SIZES */ + ulint size) /* in: block size */ +{ + ulint i; + ulint s; + + for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) { + } + + ut_ad(i <= BUF_BUDDY_SIZES); + return(i); +} + +/************************************************************************** +Allocate a block. The thread calling this function must hold +buf_pool_mutex and must not hold buf_pool_zip_mutex or any +block->mutex. The buf_pool_mutex may only be released and reacquired +if lru != NULL. This function should only be used for allocating +compressed page frames or control blocks (buf_page_t). Allocated +control blocks must be properly initialized immediately after +buf_buddy_alloc() has returned the memory, before releasing +buf_pool_mutex. */ +UNIV_INLINE +void* +buf_buddy_alloc( +/*============*/ + /* out: allocated block, + possibly NULL if lru == NULL */ + ulint size, /* in: block size, up to UNIV_PAGE_SIZE */ + ibool* lru) /* in: pointer to a variable that will be assigned + TRUE if storage was allocated from the LRU list + and buf_pool_mutex was temporarily released, + or NULL if the LRU list should not be used */ +{ + ut_ad(buf_pool_mutex_own()); + + return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru)); +} + +/************************************************************************** +Deallocate a block. */ +UNIV_INLINE +void +buf_buddy_free( +/*===========*/ + void* buf, /* in: block to be freed, must not be + pointed to by the buffer pool */ + ulint size) /* in: block size, up to UNIV_PAGE_SIZE */ +{ + ut_ad(buf_pool_mutex_own()); + + buf_buddy_free_low(buf, buf_buddy_get_slot(size)); +} + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h new file mode 100644 index 00000000000..65b7ad18da9 --- /dev/null +++ b/storage/xtradb/include/buf0buf.h @@ -0,0 +1,1411 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer pool high-level routines + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0buf_h +#define buf0buf_h + +#include "univ.i" +#include "fil0fil.h" +#include "mtr0types.h" +#include "buf0types.h" +#include "sync0rw.h" +#include "hash0hash.h" +#include "ut0byte.h" +#include "os0proc.h" +#include "page0types.h" + +/* Modes for buf_page_get_gen */ +#define BUF_GET 10 /* get always */ +#define BUF_GET_IF_IN_POOL 11 /* get if in pool */ +#define BUF_GET_NO_LATCH 14 /* get and bufferfix, but set no latch; + we have separated this case, because + it is error-prone programming not to + set a latch, and it should be used + with care */ +/* Modes for buf_page_get_known_nowait */ +#define BUF_MAKE_YOUNG 51 +#define BUF_KEEP_OLD 52 +/* Magic value to use instead of checksums when they are disabled */ +#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL + +extern buf_pool_t* buf_pool; /* The buffer pool of the database */ +#ifdef UNIV_DEBUG +extern ibool buf_debug_prints;/* If this is set TRUE, the program + prints info whenever read or flush + occurs */ +#endif /* UNIV_DEBUG */ +extern ulint srv_buf_pool_write_requests; /* variable to count write request + issued */ + +/* States of a control block (@see buf_page_struct). +The enumeration values must be 0..7. */ +enum buf_page_state { + BUF_BLOCK_ZIP_FREE = 0, /* contains a free compressed page */ + BUF_BLOCK_ZIP_PAGE, /* contains a clean compressed page */ + BUF_BLOCK_ZIP_DIRTY, /* contains a compressed page that is + in the buf_pool->flush_list */ + + /* The constants for compressed-only pages must precede + BUF_BLOCK_NOT_USED; @see buf_block_state_valid() */ + + BUF_BLOCK_NOT_USED, /* is in the free list */ + BUF_BLOCK_READY_FOR_USE, /* when buf_LRU_get_free_block returns + a block, it is in this state */ + BUF_BLOCK_FILE_PAGE, /* contains a buffered file page */ + BUF_BLOCK_MEMORY, /* contains some main memory object */ + BUF_BLOCK_REMOVE_HASH /* hash index should be removed + before putting to the free list */ +}; + +/************************************************************************ +Creates the buffer pool. */ +UNIV_INTERN +buf_pool_t* +buf_pool_init(void); +/*===============*/ + /* out, own: buf_pool object, NULL if not + enough memory or error */ +/************************************************************************ +Frees the buffer pool at shutdown. This must not be invoked before +freeing all mutexes. */ +UNIV_INTERN +void +buf_pool_free(void); +/*===============*/ + +/************************************************************************ +Drops the adaptive hash index. To prevent a livelock, this function +is only to be called while holding btr_search_latch and while +btr_search_enabled == FALSE. */ +UNIV_INTERN +void +buf_pool_drop_hash_index(void); +/*==========================*/ + +/************************************************************************ +Relocate a buffer control block. Relocates the block on the LRU list +and in buf_pool->page_hash. Does not relocate bpage->list. +The caller must take care of relocating bpage->list. 
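+
+For instance, when bpage is a clean compressed-only page, the caller
+would follow up roughly like this (a sketch only; which list has to be
+relocated depends on buf_page_get_state(bpage)):
+
+	buf_relocate(bpage, dpage);
+	UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
+	UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage);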
*/ +UNIV_INTERN +void +buf_relocate( +/*=========*/ + buf_page_t* bpage, /* in/out: control block being relocated; + buf_page_get_state(bpage) must be + BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ + buf_page_t* dpage) /* in/out: destination control block */ + __attribute__((nonnull)); +/************************************************************************ +Resizes the buffer pool. */ +UNIV_INTERN +void +buf_pool_resize(void); +/*=================*/ +/************************************************************************* +Gets the current size of buffer buf_pool in bytes. */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void); +/*========================*/ + /* out: size in bytes */ +/************************************************************************ +Gets the smallest oldest_modification lsn for any page in the pool. Returns +zero if all modified pages have been flushed to disk. */ +UNIV_INLINE +ib_uint64_t +buf_pool_get_oldest_modification(void); +/*==================================*/ + /* out: oldest modification in pool, + zero if none */ +/************************************************************************ +Allocates a buffer block. */ +UNIV_INLINE +buf_block_t* +buf_block_alloc( +/*============*/ + /* out, own: the allocated block, + in state BUF_BLOCK_MEMORY */ + ulint zip_size); /* in: compressed page size in bytes, + or 0 if uncompressed tablespace */ +/************************************************************************ +Frees a buffer block which does not contain a file page. */ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block); /* in, own: block to be freed */ +/************************************************************************* +Copies contents of a buffer frame to a given buffer. */ +UNIV_INLINE +byte* +buf_frame_copy( +/*===========*/ + /* out: buf */ + byte* buf, /* in: buffer to copy to */ + const buf_frame_t* frame); /* in: buffer frame */ +/****************************************************************** +NOTE! The following macros should be used instead of buf_page_get_gen, +to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed +in LA! */ +#define buf_page_get(SP, ZS, OF, LA, MTR) buf_page_get_gen(\ + SP, ZS, OF, LA, NULL,\ + BUF_GET, __FILE__, __LINE__, MTR) +/****************************************************************** +Use these macros to bufferfix a page with no latching. Remember not to +read the contents of the page unless you know it is safe. Do not modify +the contents of the page! We have separated this case, because it is +error-prone programming not to set a latch, and it should be used +with care. */ +#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\ + SP, ZS, OF, RW_NO_LATCH, NULL,\ + BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR) +/****************************************************************** +NOTE! The following macros should be used instead of +buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and +RW_X_LATCH are allowed as LA! */ +#define buf_page_optimistic_get(LA, BL, MC, MTR) \ + buf_page_optimistic_get_func(LA, BL, MC, __FILE__, __LINE__, MTR) +/************************************************************************ +This is the general function used to get optimistic access to a database +page. 
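+
+The intended pattern is roughly the following (a sketch; the
+buf_page_optimistic_get macro defined above supplies __FILE__ and
+__LINE__):
+
+	/* read the clock while still holding a latch on the block */
+	ib_uint64_t	modify_clock = buf_block_get_modify_clock(block);
+
+	/* ... the latch is released and time passes ... */
+
+	if (buf_page_optimistic_get(RW_S_LATCH, block,
+				    modify_clock, &mtr)) {
+		/* the guess succeeded: the block was not modified
+		meanwhile and is latched again */
+	}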
*/ +UNIV_INTERN +ibool +buf_page_optimistic_get_func( +/*=========================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /* in: guessed block */ + ib_uint64_t modify_clock,/* in: modify clock value if mode is + ..._GUESS_ON_CLOCK */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr); /* in: mini-transaction */ +/************************************************************************ +This is used to get access to a known database page, when no waiting can be +done. */ +UNIV_INTERN +ibool +buf_page_get_known_nowait( +/*======================*/ + /* out: TRUE if success */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH */ + buf_block_t* block, /* in: the known page */ + ulint mode, /* in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr); /* in: mini-transaction */ + +/*********************************************************************** +Given a tablespace id and page number tries to get that page. If the +page is not in the buffer pool it is not loaded and NULL is returned. +Suitable for using when holding the kernel mutex. */ + +const buf_block_t* +buf_page_try_get_func( +/*==================*/ + ulint space_id,/* in: tablespace id */ + ulint page_no,/* in: page number */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr); /* in: mini-transaction */ + +#define buf_page_try_get(space_id, page_no, mtr) \ + buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr); + +/************************************************************************ +Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with buf_page_release_zip(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. */ +UNIV_INTERN +buf_page_t* +buf_page_get_zip( +/*=============*/ + /* out: pointer to the block, + or NULL if not compressed */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size */ + ulint offset);/* in: page number */ +/************************************************************************ +This is the general function used to get access to a database page. */ +UNIV_INTERN +buf_block_t* +buf_page_get_gen( +/*=============*/ + /* out: pointer to the block or NULL */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint offset, /* in: page number */ + ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ + buf_block_t* guess, /* in: guessed block or NULL */ + ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL, + BUF_GET_NO_LATCH */ + const char* file, /* in: file name */ + ulint line, /* in: line where called */ + mtr_t* mtr); /* in: mini-transaction */ +/************************************************************************ +Initializes a page to the buffer buf_pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => +FILE_PAGE (the other is buf_page_get_gen). 
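+
+A creation sketch (illustrative only; 'space', 'page_no' and 'zip_size'
+are assumed to come from a preceding page allocation):
+
+	mtr_start(&mtr);
+	block = buf_page_create(space, page_no, zip_size, &mtr);
+	/* the block is bufferfixed; initialize the frame before the
+	mini-transaction commits */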
*/ +UNIV_INTERN +buf_block_t* +buf_page_create( +/*============*/ + /* out: pointer to the block, page bufferfixed */ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space in units of + a page */ + ulint zip_size,/* in: compressed page size, or 0 */ + mtr_t* mtr); /* in: mini-transaction handle */ +#ifdef UNIV_HOTBACKUP +/************************************************************************ +Inits a page to the buffer buf_pool, for use in ibbackup --restore. */ +UNIV_INTERN +void +buf_page_init_for_backup_restore( +/*=============================*/ + ulint space, /* in: space id */ + ulint offset, /* in: offset of the page within space + in units of a page */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + buf_block_t* block); /* in: block to init */ +#endif /* UNIV_HOTBACKUP */ +/************************************************************************ +Releases a compressed-only page acquired with buf_page_get_zip(). */ +UNIV_INLINE +void +buf_page_release_zip( +/*=================*/ + buf_page_t* bpage); /* in: buffer block */ +/************************************************************************ +Decrements the bufferfix count of a buffer control block and releases +a latch, if specified. */ +UNIV_INLINE +void +buf_page_release( +/*=============*/ + buf_block_t* block, /* in: buffer block */ + ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH, + RW_NO_LATCH */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************ +Moves a page to the start of the buffer pool LRU list. This high-level +function can be used to prevent an important page from slipping out of +the buffer pool. */ +UNIV_INTERN +void +buf_page_make_young( +/*================*/ + buf_page_t* bpage); /* in: buffer block of a file page */ +/************************************************************************ +Returns TRUE if the page can be found in the buffer pool hash table. NOTE +that it is possible that the page is not yet read from disk, though. */ +UNIV_INLINE +ibool +buf_page_peek( +/*==========*/ + /* out: TRUE if found from page hash table, + NOTE that the page is not necessarily yet read + from disk! */ + ulint space, /* in: space id */ + ulint offset);/* in: page number */ +/************************************************************************ +Resets the check_index_page_at_flush field of a page if found in the buffer +pool. */ +UNIV_INTERN +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /* in: space id */ + ulint offset);/* in: page number */ +#ifdef UNIV_DEBUG_FILE_ACCESSES +/************************************************************************ +Sets file_page_was_freed TRUE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. */ +UNIV_INTERN +buf_page_t* +buf_page_set_file_page_was_freed( +/*=============================*/ + /* out: control block if found in page hash table, + otherwise NULL */ + ulint space, /* in: space id */ + ulint offset);/* in: page number */ +/************************************************************************ +Sets file_page_was_freed FALSE if the page is found in the buffer pool. +This function should be called when we free a file page and want the +debug version to check that it is not accessed any more unless +reallocated. 
*/ +UNIV_INTERN +buf_page_t* +buf_page_reset_file_page_was_freed( +/*===============================*/ + /* out: control block if found in page hash table, + otherwise NULL */ + ulint space, /* in: space id */ + ulint offset); /* in: page number */ +#endif /* UNIV_DEBUG_FILE_ACCESSES */ +/************************************************************************ +Reads the freed_page_clock of a buffer block. */ +UNIV_INLINE +ulint +buf_page_get_freed_page_clock( +/*==========================*/ + /* out: freed_page_clock */ + const buf_page_t* bpage) /* in: block */ + __attribute__((pure)); +/************************************************************************ +Reads the freed_page_clock of a buffer block. */ +UNIV_INLINE +ulint +buf_block_get_freed_page_clock( +/*===========================*/ + /* out: freed_page_clock */ + const buf_block_t* block) /* in: block */ + __attribute__((pure)); + +/************************************************************************ +Recommends a move of a block to the start of the LRU list if there is danger +of dropping from the buffer pool. NOTE: does not reserve the buffer pool +mutex. */ +UNIV_INLINE +ibool +buf_page_peek_if_too_old( +/*=====================*/ + /* out: TRUE if should be made + younger */ + const buf_page_t* bpage); /* in: block to make younger */ +/************************************************************************ +Returns the current state of is_hashed of a page. FALSE if the page is +not in the pool. NOTE that this operation does not fix the page in the +pool if it is found there. */ +UNIV_INTERN +ibool +buf_page_peek_if_search_hashed( +/*===========================*/ + /* out: TRUE if page hash index is built in search + system */ + ulint space, /* in: space id */ + ulint offset);/* in: page number */ +/************************************************************************ +Gets the youngest modification log sequence number for a frame. +Returns zero if not file page or no modification occurred yet. */ +UNIV_INLINE +ib_uint64_t +buf_page_get_newest_modification( +/*=============================*/ + /* out: newest modification to page */ + const buf_page_t* bpage); /* in: block containing the + page frame */ +/************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +void +buf_block_modify_clock_inc( +/*=======================*/ + buf_block_t* block); /* in: block */ +/************************************************************************ +Returns the value of the modify clock. The caller must have an s-lock +or x-lock on the block. */ +UNIV_INLINE +ib_uint64_t +buf_block_get_modify_clock( +/*=======================*/ + /* out: value */ + buf_block_t* block); /* in: block */ +/************************************************************************ +Calculates a page checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value +on 32-bit and 64-bit architectures. */ +UNIV_INTERN +ulint +buf_calc_page_new_checksum( +/*=======================*/ + /* out: checksum */ + const byte* page); /* in: buffer page */ +/************************************************************************ +In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only +looked at the first few bytes of the page. This calculates that old +checksum. 
+NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! */ +UNIV_INTERN +ulint +buf_calc_page_old_checksum( +/*=======================*/ + /* out: checksum */ + const byte* page); /* in: buffer page */ +/************************************************************************ +Checks if a page is corrupt. */ +UNIV_INTERN +ibool +buf_page_is_corrupted( +/*==================*/ + /* out: TRUE if corrupted */ + const byte* read_buf, /* in: a database page */ + ulint zip_size); /* in: size of compressed page; + 0 for uncompressed pages */ +/************************************************************************** +Gets the space id, page offset, and byte offset within page of a +pointer pointing to a buffer frame containing a file page. */ +UNIV_INLINE +void +buf_ptr_get_fsp_addr( +/*=================*/ + const void* ptr, /* in: pointer to a buffer frame */ + ulint* space, /* out: space id */ + fil_addr_t* addr); /* out: page offset and byte offset */ +/************************************************************************** +Gets the hash value of a block. This can be used in searches in the +lock hash table. */ +UNIV_INLINE +ulint +buf_block_get_lock_hash_val( +/*========================*/ + /* out: lock hash value */ + const buf_block_t* block) /* in: block */ + __attribute__((pure)); +#ifdef UNIV_DEBUG +/************************************************************************* +Finds a block in the buffer pool that points to a +given compressed page. */ +UNIV_INTERN +buf_block_t* +buf_pool_contains_zip( +/*==================*/ + /* out: buffer block pointing to + the compressed page, or NULL */ + const void* data); /* in: pointer to compressed page */ +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************* +Validates the buffer pool data structure. */ +UNIV_INTERN +ibool +buf_validate(void); +/*==============*/ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************* +Prints info of the buffer pool data structure. */ +UNIV_INTERN +void +buf_print(void); +/*============*/ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +/************************************************************************ +Prints a page to stderr. */ +UNIV_INTERN +void +buf_page_print( +/*===========*/ + const byte* read_buf, /* in: a database page */ + ulint zip_size); /* in: compressed page size, or + 0 for uncompressed pages */ +#ifdef UNIV_DEBUG +/************************************************************************* +Returns the number of latched pages in the buffer pool. */ +UNIV_INTERN +ulint +buf_get_latched_pages_number(void); +/*==============================*/ +#endif /* UNIV_DEBUG */ +/************************************************************************* +Returns the number of pending buf pool ios. */ +UNIV_INTERN +ulint +buf_get_n_pending_ios(void); +/*=======================*/ +/************************************************************************* +Prints info of the buffer i/o. 
*/ +UNIV_INTERN +void +buf_print_io( +/*=========*/ + FILE* file); /* in: file where to print */ +/************************************************************************* +Returns the ratio in percents of modified pages in the buffer pool / +database pages in the buffer pool. */ +UNIV_INTERN +ulint +buf_get_modified_ratio_pct(void); +/*============================*/ +/************************************************************************** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +buf_refresh_io_stats(void); +/*======================*/ +/************************************************************************* +Checks that all file pages in the buffer are in a replaceable state. */ +UNIV_INTERN +ibool +buf_all_freed(void); +/*===============*/ +/************************************************************************* +Checks that there currently are no pending i/o-operations for the buffer +pool. */ +UNIV_INTERN +ibool +buf_pool_check_no_pending_io(void); +/*==============================*/ + /* out: TRUE if there is no pending i/o */ +/************************************************************************* +Invalidates the file pages in the buffer pool when an archive recovery is +completed. All the file pages buffered must be in a replaceable state when +this function is called: not latched and not modified. */ +UNIV_INTERN +void +buf_pool_invalidate(void); +/*=====================*/ + +/*======================================================================== +--------------------------- LOWER LEVEL ROUTINES ------------------------- +=========================================================================*/ + +#ifdef UNIV_SYNC_DEBUG +/************************************************************************* +Adds latch level info for the rw-lock protecting the buffer frame. This +should be called in the debug version after a successful latching of a +page if we know the latching order level of the acquired latch. */ +UNIV_INLINE +void +buf_block_dbg_add_level( +/*====================*/ + buf_block_t* block, /* in: buffer page + where we have acquired latch */ + ulint level); /* in: latching order level */ +#else /* UNIV_SYNC_DEBUG */ +# define buf_block_dbg_add_level(block, level) /* nothing */ +#endif /* UNIV_SYNC_DEBUG */ +/************************************************************************* +Gets the state of a block. */ +UNIV_INLINE +enum buf_page_state +buf_page_get_state( +/*===============*/ + /* out: state */ + const buf_page_t* bpage); /* in: pointer to the control block */ +/************************************************************************* +Gets the state of a block. */ +UNIV_INLINE +enum buf_page_state +buf_block_get_state( +/*================*/ + /* out: state */ + const buf_block_t* block) /* in: pointer to the control block */ + __attribute__((pure)); +/************************************************************************* +Sets the state of a block. */ +UNIV_INLINE +void +buf_page_set_state( +/*===============*/ + buf_page_t* bpage, /* in/out: pointer to control block */ + enum buf_page_state state); /* in: state */ +/************************************************************************* +Sets the state of a block. 
*/ +UNIV_INLINE +void +buf_block_set_state( +/*================*/ + buf_block_t* block, /* in/out: pointer to control block */ + enum buf_page_state state); /* in: state */ +/************************************************************************* +Determines if a block is mapped to a tablespace. */ +UNIV_INLINE +ibool +buf_page_in_file( +/*=============*/ + /* out: TRUE if mapped */ + const buf_page_t* bpage) /* in: pointer to control block */ + __attribute__((pure)); +/************************************************************************* +Determines if a block should be on unzip_LRU list. */ +UNIV_INLINE +ibool +buf_page_belongs_to_unzip_LRU( +/*==========================*/ + /* out: TRUE if block belongs + to unzip_LRU */ + const buf_page_t* bpage) /* in: pointer to control block */ + __attribute__((pure)); +/************************************************************************* +Determine the approximate LRU list position of a block. */ +UNIV_INLINE +ulint +buf_page_get_LRU_position( +/*======================*/ + /* out: LRU list position */ + const buf_page_t* bpage) /* in: control block */ + __attribute__((pure)); + +/************************************************************************* +Gets the mutex of a block. */ +UNIV_INLINE +mutex_t* +buf_page_get_mutex( +/*===============*/ + /* out: pointer to mutex + protecting bpage */ + const buf_page_t* bpage) /* in: pointer to control block */ + __attribute__((pure)); + +/************************************************************************* +Get the flush type of a page. */ +UNIV_INLINE +enum buf_flush +buf_page_get_flush_type( +/*====================*/ + /* out: flush type */ + const buf_page_t* bpage) /* in: buffer page */ + __attribute__((pure)); +/************************************************************************* +Set the flush type of a page. */ +UNIV_INLINE +void +buf_page_set_flush_type( +/*====================*/ + buf_page_t* bpage, /* in: buffer page */ + enum buf_flush flush_type); /* in: flush type */ +/************************************************************************* +Map a block to a file page. */ +UNIV_INLINE +void +buf_block_set_file_page( +/*====================*/ + buf_block_t* block, /* in/out: pointer to control block */ + ulint space, /* in: tablespace id */ + ulint page_no);/* in: page number */ +/************************************************************************* +Gets the io_fix state of a block. */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix( +/*================*/ + /* out: io_fix state */ + const buf_page_t* bpage) /* in: pointer to the control block */ + __attribute__((pure)); +/************************************************************************* +Gets the io_fix state of a block. */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix( +/*================*/ + /* out: io_fix state */ + const buf_block_t* block) /* in: pointer to the control block */ + __attribute__((pure)); +/************************************************************************* +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_page_set_io_fix( +/*================*/ + buf_page_t* bpage, /* in/out: control block */ + enum buf_io_fix io_fix);/* in: io_fix state */ +/************************************************************************* +Sets the io_fix state of a block. 
*/ +UNIV_INLINE +void +buf_block_set_io_fix( +/*=================*/ + buf_block_t* block, /* in/out: control block */ + enum buf_io_fix io_fix);/* in: io_fix state */ + +/************************************************************************ +Determine if a buffer block can be relocated in memory. The block +can be dirty, but it must not be I/O-fixed or bufferfixed. */ +UNIV_INLINE +ibool +buf_page_can_relocate( +/*==================*/ + const buf_page_t* bpage) /* control block being relocated */ + __attribute__((pure)); + +/************************************************************************* +Determine if a block has been flagged old. */ +UNIV_INLINE +ibool +buf_page_is_old( +/*============*/ + /* out: TRUE if old */ + const buf_page_t* bpage) /* in: control block */ + __attribute__((pure)); +/************************************************************************* +Flag a block old. */ +UNIV_INLINE +void +buf_page_set_old( +/*=============*/ + buf_page_t* bpage, /* in/out: control block */ + ibool old); /* in: old */ +/************************************************************************* +Determine if a block has been accessed in the buffer pool. */ +UNIV_INLINE +ibool +buf_page_is_accessed( +/*=================*/ + /* out: TRUE if accessed */ + const buf_page_t* bpage) /* in: control block */ + __attribute__((pure)); +/************************************************************************* +Flag a block accessed. */ +UNIV_INLINE +void +buf_page_set_accessed( +/*==================*/ + buf_page_t* bpage, /* in/out: control block */ + ibool accessed); /* in: accessed */ +/************************************************************************* +Gets the buf_block_t handle of a buffered file block if an uncompressed +page frame exists, or NULL. */ +UNIV_INLINE +buf_block_t* +buf_page_get_block( +/*===============*/ + /* out: control block, or NULL */ + buf_page_t* bpage) /* in: control block, or NULL */ + __attribute__((pure)); +#ifdef UNIV_DEBUG +/************************************************************************* +Gets a pointer to the memory frame of a block. */ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + /* out: pointer to the frame */ + const buf_block_t* block) /* in: pointer to the control block */ + __attribute__((pure)); +#else /* UNIV_DEBUG */ +# define buf_block_get_frame(block) (block)->frame +#endif /* UNIV_DEBUG */ +/************************************************************************* +Gets the space id of a block. */ +UNIV_INLINE +ulint +buf_page_get_space( +/*===============*/ + /* out: space id */ + const buf_page_t* bpage) /* in: pointer to the control block */ + __attribute__((pure)); +/************************************************************************* +Gets the space id of a block. */ +UNIV_INLINE +ulint +buf_block_get_space( +/*================*/ + /* out: space id */ + const buf_block_t* block) /* in: pointer to the control block */ + __attribute__((pure)); +/************************************************************************* +Gets the page number of a block. */ +UNIV_INLINE +ulint +buf_page_get_page_no( +/*=================*/ + /* out: page number */ + const buf_page_t* bpage) /* in: pointer to the control block */ + __attribute__((pure)); +/************************************************************************* +Gets the page number of a block. 
*/
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+			/* out: page number */
+	const buf_block_t*	block)	/* in: pointer to the control block */
+	__attribute__((pure));
+/*************************************************************************
+Gets the compressed page size of a block. */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+			/* out: compressed page size, or 0 */
+	const buf_page_t*	bpage)	/* in: pointer to the control block */
+	__attribute__((pure));
+/*************************************************************************
+Gets the compressed page size of a block. */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+			/* out: compressed page size, or 0 */
+	const buf_block_t*	block)	/* in: pointer to the control block */
+	__attribute__((pure));
+/*************************************************************************
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+	(UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+/***********************************************************************
+Gets the block whose frame the given pointer points to. */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+				/* out: pointer to block, never NULL */
+	const byte*	ptr);	/* in: pointer to a frame */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*************************************************************************
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+				/* out: compressed page descriptor, or NULL */
+	const byte*	ptr);	/* in: pointer to the page */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/************************************************************************
+This function tells whether an i/o operation is in progress on a
+buffer page. */
+UNIV_INLINE
+ibool
+buf_page_io_query(
+/*==============*/
+				/* out: TRUE if io going on */
+	buf_page_t*	bpage);	/* in: pool block, must be bufferfixed */
+/************************************************************************
+Initializes a page for reading into the buffer pool. If the page is
+(1) already in buf_pool, or
+(2) we are to read only ibuf pages and the page is not an ibuf page, or
+(3) the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later. */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+				/* out: pointer to the block or NULL */
+	ulint*		err,	/* out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+	ulint		mode,	/* in: BUF_READ_IBUF_PAGES_ONLY, ... */
+	ulint		space,	/* in: space id */
+	ulint		zip_size,/* in: compressed page size, or 0 */
+	ibool		unzip,	/* in: TRUE=request uncompressed page */
+	ib_int64_t	tablespace_version,/* in: prevents reading from a wrong
+				version of the tablespace in case we have done
+				DISCARD + IMPORT */
+	ulint		offset);/* in: page number */
+/************************************************************************
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool.
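Editorial sketch of the read path these declarations imply, not part of the patch (space, zip_size, tablespace_version and offset are assumed inputs; BUF_READ_ANY_PAGE is the mode constant from buf0rea.h):

	ulint		err;
	buf_page_t*	bpage;

	bpage = buf_page_init_for_read(&err, BUF_READ_ANY_PAGE,
				       space, zip_size, FALSE,
				       tablespace_version, offset);

If bpage is non-NULL, the caller submits the asynchronous read through the fil module; the i/o handler thread then finishes the request with buf_page_io_complete(bpage), which clears the BUF_IO_READ fix and releases the frame lock.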
*/
+UNIV_INTERN
+void
+buf_page_io_complete(
+/*=================*/
+	buf_page_t*	bpage);	/* in: pointer to the block in question */
+/************************************************************************
+Calculates a folded value of a file page address to use in the page hash
+table. */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+			/* out: the folded value */
+	ulint	space,	/* in: space id */
+	ulint	offset)	/* in: offset of the page within space */
+	__attribute__((const));
+/**********************************************************************
+Returns the control block of a file page, NULL if not found. */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+			/* out: block, NULL if not found */
+	ulint	space,	/* in: space id */
+	ulint	offset);/* in: offset of the page within space */
+/**********************************************************************
+Returns the control block of a file page, NULL if not found
+or an uncompressed page frame does not exist. */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get(
+/*===============*/
+			/* out: block, NULL if not found */
+	ulint	space,	/* in: space id */
+	ulint	offset);/* in: offset of the page within space */
+/***********************************************************************
+Increments the pool clock by one and returns its new value. Remember that
+in the 32 bit version the clock wraps around at 4 billion! */
+UNIV_INLINE
+ulint
+buf_pool_clock_tic(void);
+/*====================*/
+			/* out: new clock value */
+/*************************************************************************
+Gets the current length of the free list of buffer blocks. */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void);
+/*=======================*/
+
+
+
+/* The common buffer control block structure
+for compressed and uncompressed frames */
+
+struct buf_page_struct{
+	/* None of the following bit-fields may be modified without
+	holding buf_page_get_mutex() [block->mutex or buf_pool_zip_mutex],
+	since they can be stored in the same machine word. Some of them are
+	additionally protected by buf_pool_mutex. */
+
+	unsigned	space:32;	/* tablespace id; also protected
+					by buf_pool_mutex. */
+	unsigned	offset:32;	/* page number; also protected
+					by buf_pool_mutex. */
+
+	unsigned	state:3;	/* state of the control block
+					(@see enum buf_page_state); also
+					protected by buf_pool_mutex.
+					State transitions from
+					BUF_BLOCK_READY_FOR_USE to
+					BUF_BLOCK_MEMORY need not be
+					protected by buf_page_get_mutex(). */
+	unsigned	flush_type:2;	/* if this block is currently being
+					flushed to disk, this tells the
+					flush_type (@see enum buf_flush) */
+	unsigned	accessed:1;	/* TRUE if the page has been accessed
+					while in the buffer pool: read-ahead
+					may read in pages which have not been
+					accessed yet; a thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
+	unsigned	io_fix:2;	/* type of pending I/O operation
+					(@see enum buf_io_fix); also
+					protected by buf_pool_mutex */
+	unsigned	buf_fix_count:24;/* count of how many times this block
+					is currently bufferfixed */
+
+	page_zip_des_t	zip;		/* compressed page; zip.data
+					(but not the data it points to) is
+					also protected by buf_pool_mutex */
+	buf_page_t*	hash;		/* node used in chaining to
+					buf_pool->page_hash or
+					buf_pool->zip_hash */
+#ifdef UNIV_DEBUG
+	ibool		in_page_hash;	/* TRUE if in buf_pool->page_hash */
+	ibool		in_zip_hash;	/* TRUE if in buf_pool->zip_hash */
+#endif /* UNIV_DEBUG */
+
+	/* 2.
Page flushing fields; protected by buf_pool_mutex */ + + UT_LIST_NODE_T(buf_page_t) list; + /* based on state, this is a list + node in one of the following lists + in buf_pool: + + BUF_BLOCK_NOT_USED: free + BUF_BLOCK_FILE_PAGE: flush_list + BUF_BLOCK_ZIP_DIRTY: flush_list + BUF_BLOCK_ZIP_PAGE: zip_clean + BUF_BLOCK_ZIP_FREE: zip_free[] */ +#ifdef UNIV_DEBUG + ibool in_flush_list; /* TRUE if in buf_pool->flush_list; + when buf_pool_mutex is free, the + following should hold: in_flush_list + == (state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_ZIP_DIRTY) */ + ibool in_free_list; /* TRUE if in buf_pool->free; when + buf_pool_mutex is free, the following + should hold: in_free_list + == (state == BUF_BLOCK_NOT_USED) */ +#endif /* UNIV_DEBUG */ + ib_uint64_t newest_modification; + /* log sequence number of the youngest + modification to this block, zero if + not modified */ + ib_uint64_t oldest_modification; + /* log sequence number of the START of + the log entry written of the oldest + modification to this block which has + not yet been flushed on disk; zero if + all modifications are on disk */ + + /* 3. LRU replacement algorithm fields; protected by + buf_pool_mutex only (not buf_pool_zip_mutex or block->mutex) */ + + UT_LIST_NODE_T(buf_page_t) LRU; + /* node of the LRU list */ +//#ifdef UNIV_DEBUG + ibool in_LRU_list; /* TRUE if the page is in the LRU list; + used in debugging */ +//#endif /* UNIV_DEBUG */ + unsigned old:1; /* TRUE if the block is in the old + blocks in the LRU list */ + unsigned LRU_position:31;/* value which monotonically decreases + (or may stay constant if old==TRUE) + toward the end of the LRU list, if + buf_pool->ulint_clock has not wrapped + around: NOTE that this value can only + be used in heuristic algorithms, + because of the possibility of a + wrap-around! */ + unsigned freed_page_clock:32;/* the value of + buf_pool->freed_page_clock when this + block was the last time put to the + head of the LRU list; a thread is + allowed to read this for heuristic + purposes without holding any mutex or + latch */ +#ifdef UNIV_DEBUG_FILE_ACCESSES + ibool file_page_was_freed; + /* this is set to TRUE when fsp + frees a page in buffer pool */ +#endif /* UNIV_DEBUG_FILE_ACCESSES */ +}; + +/* The buffer control block structure */ + +struct buf_block_struct{ + + /* 1. 
General fields */ + + buf_page_t page; /* page information; this must + be the first field, so that + buf_pool->page_hash can point + to buf_page_t or buf_block_t */ + UT_LIST_NODE_T(buf_block_t) unzip_LRU; + /* node of the decompressed LRU list; + a block is in the unzip_LRU list + if page.state == BUF_BLOCK_FILE_PAGE + and page.zip.data != NULL */ +#ifdef UNIV_DEBUG + ibool in_unzip_LRU_list;/* TRUE if the page is in the + decompressed LRU list; + used in debugging */ +#endif /* UNIV_DEBUG */ + byte* frame; /* pointer to buffer frame which + is of size UNIV_PAGE_SIZE, and + aligned to an address divisible by + UNIV_PAGE_SIZE */ + mutex_t mutex; /* mutex protecting this block: + state (also protected by the buffer + pool mutex), io_fix, buf_fix_count, + and accessed; we introduce this new + mutex in InnoDB-5.1 to relieve + contention on the buffer pool mutex */ + rw_lock_t lock; /* read-write lock of the buffer + frame */ + unsigned lock_hash_val:32;/* hashed value of the page address + in the record lock hash table */ + unsigned check_index_page_at_flush:1; + /* TRUE if we know that this is + an index page, and want the database + to check its consistency before flush; + note that there may be pages in the + buffer pool which are index pages, + but this flag is not set because + we do not keep track of all pages */ + + /* 2. Optimistic search field */ + + ib_uint64_t modify_clock; /* this clock is incremented every + time a pointer to a record on the + page may become obsolete; this is + used in the optimistic cursor + positioning: if the modify clock has + not changed, we know that the pointer + is still valid; this field may be + changed if the thread (1) owns the + pool mutex and the page is not + bufferfixed, or (2) the thread has an + x-latch on the block */ + + /* 3. Hash search fields: NOTE that the first 4 fields are NOT + protected by any semaphore! */ + + ulint n_hash_helps; /* counter which controls building + of a new hash index for the page */ + ulint n_fields; /* recommended prefix length for hash + search: number of full fields */ + ulint n_bytes; /* recommended prefix: number of bytes + in an incomplete field */ + ibool left_side; /* TRUE or FALSE, depending on + whether the leftmost record of several + records with the same prefix should be + indexed in the hash index */ + + /* These 6 fields may only be modified when we have + an x-latch on btr_search_latch AND + a) we are holding an s-latch or x-latch on block->lock or + b) we know that block->buf_fix_count == 0. + + An exception to this is when we init or create a page + in the buffer pool in buf0buf.c. */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ulint n_pointers; /* used in debugging: the number of + pointers in the adaptive hash index + pointing to this frame */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + unsigned is_hashed:1; /* TRUE if hash index has already been + built on this page; note that it does + not guarantee that the index is + complete, though: there may have been + hash collisions, record deletions, + etc. */ + unsigned curr_n_fields:10;/* prefix length for hash indexing: + number of full fields */ + unsigned curr_n_bytes:15;/* number of bytes in hash indexing */ + unsigned curr_left_side:1;/* TRUE or FALSE in hash indexing */ + dict_index_t* index; /* Index for which the adaptive + hash index has been created. */ + /* 4. 
Debug fields */
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_t	debug_latch;	/* in the debug version, each thread
+					which bufferfixes the block acquires
+					an s-latch here; so we can use the
+					debug utilities in sync0rw */
+#endif
+};
+
+/* Check if a buf_block_t object is in a valid state. */
+#define buf_block_state_valid(block) \
+(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED \
+ && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH))
+
+/**************************************************************************
+Compute the hash fold value for blocks in buf_pool->zip_hash. */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+
+/* The buffer pool structure. NOTE! The definition appears here only for
+other modules of this directory (buf) to see it. Do not use from outside! */
+
+struct buf_pool_struct{
+
+	/* 1. General fields */
+
+	ulint		n_chunks;	/* number of buffer pool chunks */
+	buf_chunk_t*	chunks;		/* buffer pool chunks */
+	ulint		curr_size;	/* current pool size in pages */
+	hash_table_t*	page_hash;	/* hash table of buf_page_t or
+					buf_block_t file pages,
+					buf_page_in_file() == TRUE,
+					indexed by (space_id, offset) */
+	hash_table_t*	zip_hash;	/* hash table of buf_block_t blocks
+					whose frames are allocated to the
+					zip buddy system,
+					indexed by block->frame */
+	ulint		n_pend_reads;	/* number of pending read operations */
+	ulint		n_pend_unzip;	/* number of pending decompressions */
+
+	time_t		last_printout_time; /* when buf_print was last
+					called */
+	ulint		n_pages_read;	/* number of read operations */
+	ulint		n_pages_written;/* number of write operations */
+	ulint		n_pages_created;/* number of pages created in the pool
+					with no read */
+	ulint		n_page_gets;	/* number of page gets performed;
+					also successful searches through
+					the adaptive hash index are
+					counted as page gets; this field
+					is NOT protected by the buffer
+					pool mutex */
+	ulint		n_page_gets_old;/* n_page_gets when buf_print was
+					last called: used to calculate
+					the hit rate */
+	ulint		n_pages_read_old;/* n_pages_read when buf_print was
+					last called */
+	ulint		n_pages_written_old;/* n_pages_written when buf_print
+					was last called */
+	ulint		n_pages_created_old;/* n_pages_created when buf_print
+					was last called */
+	/* 2. Page flushing algorithm fields */
+
+	UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+					/* base node of the modified block
+					list */
+	ibool		init_flush[BUF_FLUSH_N_TYPES];
+					/* this is TRUE when a flush of the
+					given type is being initialized */
+	ulint		n_flush[BUF_FLUSH_N_TYPES];
+					/* this is the number of pending
+					writes in the given flush type */
+	os_event_t	no_flush[BUF_FLUSH_N_TYPES];
+					/* this is in the set state when there
+					is no flush batch of the given type
+					running */
+	ulint		ulint_clock;	/* a sequence number used to count
+					time. NOTE! This counter wraps
+					around at 4 billion (if ulint ==
+					32 bits)! */
+	ulint		freed_page_clock;/* a sequence number used to count the
+					number of buffer blocks removed from
+					the end of the LRU list; NOTE that
+					this counter may wrap around at 4
+					billion! A thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
+	ulint		LRU_flush_ended;/* when an LRU flush ends for a page,
+					this is incremented by one; this is
+					set to zero when a buffer block is
+					allocated */
+
+	/* 3.
LRU replacement algorithm fields */ + + UT_LIST_BASE_NODE_T(buf_page_t) free; + /* base node of the free block list */ + UT_LIST_BASE_NODE_T(buf_page_t) LRU; + /* base node of the LRU list */ + buf_page_t* LRU_old; /* pointer to the about 3/8 oldest + blocks in the LRU list; NULL if LRU + length less than BUF_LRU_OLD_MIN_LEN; + NOTE: when LRU_old != NULL, its length + should always equal LRU_old_len */ + ulint LRU_old_len; /* length of the LRU list from + the block to which LRU_old points + onward, including that block; + see buf0lru.c for the restrictions + on this value; not defined if + LRU_old == NULL; + NOTE: LRU_old_len must be adjusted + whenever LRU_old shrinks or grows! */ + + UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU; + /* base node of the unzip_LRU list */ + + /* 4. Fields for the buddy allocator of compressed pages */ + UT_LIST_BASE_NODE_T(buf_page_t) zip_clean; + /* unmodified compressed pages */ + UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES]; + /* buddy free lists */ +#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE +# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE" +#endif +#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE +# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE" +#endif +}; + +/* mutex protecting the buffer pool struct and control blocks, except the +read-write lock in them */ +extern mutex_t buf_pool_mutex; +/* mutex protecting the control blocks of compressed-only pages +(of type buf_page_t, not buf_block_t) */ +extern mutex_t buf_pool_zip_mutex; + +/* Accessors for buf_pool_mutex. Use these instead of accessing +buf_pool_mutex directly. */ + +/* Test if buf_pool_mutex is owned. */ +#define buf_pool_mutex_own() mutex_own(&buf_pool_mutex) +/* Acquire the buffer pool mutex. */ +#define buf_pool_mutex_enter() do { \ + ut_ad(!mutex_own(&buf_pool_zip_mutex)); \ + mutex_enter(&buf_pool_mutex); \ +} while (0) + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/** Flag to forbid the release of the buffer pool mutex. +Protected by buf_pool_mutex. */ +extern ulint buf_pool_mutex_exit_forbidden; +/* Forbid the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_forbid() do { \ + ut_ad(buf_pool_mutex_own()); \ + buf_pool_mutex_exit_forbidden++; \ +} while (0) +/* Allow the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_allow() do { \ + ut_ad(buf_pool_mutex_own()); \ + ut_a(buf_pool_mutex_exit_forbidden); \ + buf_pool_mutex_exit_forbidden--; \ +} while (0) +/* Release the buffer pool mutex. */ +# define buf_pool_mutex_exit() do { \ + ut_a(!buf_pool_mutex_exit_forbidden); \ + mutex_exit(&buf_pool_mutex); \ +} while (0) +#else +/* Forbid the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_forbid() ((void) 0) +/* Allow the release of the buffer pool mutex. */ +# define buf_pool_mutex_exit_allow() ((void) 0) +/* Release the buffer pool mutex. */ +# define buf_pool_mutex_exit() mutex_exit(&buf_pool_mutex) +#endif + +/************************************************************************ +Let us list the consistency conditions for different control block states. 
+ +NOT_USED: is in free list, not in LRU list, not in flush list, nor + page hash table +READY_FOR_USE: is not in free list, LRU list, or flush list, nor page + hash table +MEMORY: is not in free list, LRU list, or flush list, nor page + hash table +FILE_PAGE: space and offset are defined, is in page hash table + if io_fix == BUF_IO_WRITE, + pool: no_flush[flush_type] is in reset state, + pool: n_flush[flush_type] > 0 + + (1) if buf_fix_count == 0, then + is in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + is x-locked, + if and only if io_fix == BUF_IO_READ + is s-locked, + if and only if io_fix == BUF_IO_WRITE + + (2) if buf_fix_count > 0, then + is not in LRU list, not in free list + is in flush list, + if and only if oldest_modification > 0 + if io_fix == BUF_IO_READ, + is x-locked + if io_fix == BUF_IO_WRITE, + is s-locked + +State transitions: + +NOT_USED => READY_FOR_USE +READY_FOR_USE => MEMORY +READY_FOR_USE => FILE_PAGE +MEMORY => NOT_USED +FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if + (1) buf_fix_count == 0, + (2) oldest_modification == 0, and + (3) io_fix == 0. +*/ + +#ifndef UNIV_NONINL +#include "buf0buf.ic" +#endif + +#endif diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic new file mode 100644 index 00000000000..a1dbfe2ebde --- /dev/null +++ b/storage/xtradb/include/buf0buf.ic @@ -0,0 +1,1076 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0flu.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "mtr0mtr.h" + +/************************************************************************ +Reads the freed_page_clock of a buffer block. */ +UNIV_INLINE +ulint +buf_page_get_freed_page_clock( +/*==========================*/ + /* out: freed_page_clock */ + const buf_page_t* bpage) /* in: block */ +{ + /* This is sometimes read without holding buf_pool_mutex. */ + return(bpage->freed_page_clock); +} + +/************************************************************************ +Reads the freed_page_clock of a buffer block. 
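Editorial worked example of the heuristic this counter feeds (see buf_page_peek_if_too_old() below): with curr_size == 65536 pages, a block is recommended for a move to the head of the LRU list only once

	buf_pool->freed_page_clock
	>= bpage->freed_page_clock + 1 + 65536 / 4

that is, once roughly a quarter of the pool has been evicted since the block was last made young.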
*/ +UNIV_INLINE +ulint +buf_block_get_freed_page_clock( +/*===========================*/ + /* out: freed_page_clock */ + const buf_block_t* block) /* in: block */ +{ + return(buf_page_get_freed_page_clock(&block->page)); +} + +/************************************************************************ +Recommends a move of a block to the start of the LRU list if there is danger +of dropping from the buffer pool. NOTE: does not reserve the buffer pool +mutex. */ +UNIV_INLINE +ibool +buf_page_peek_if_too_old( +/*=====================*/ + /* out: TRUE if should be made + younger */ + const buf_page_t* bpage) /* in: block to make younger */ +{ + return(buf_pool->freed_page_clock + >= buf_page_get_freed_page_clock(bpage) + + 1 + (buf_pool->curr_size / 4)); +} + +/************************************************************************* +Gets the current size of buffer buf_pool in bytes. */ +UNIV_INLINE +ulint +buf_pool_get_curr_size(void) +/*========================*/ + /* out: size in bytes */ +{ + return(buf_pool->curr_size * UNIV_PAGE_SIZE); +} + +/************************************************************************ +Gets the smallest oldest_modification lsn for any page in the pool. Returns +zero if all modified pages have been flushed to disk. */ +UNIV_INLINE +ib_uint64_t +buf_pool_get_oldest_modification(void) +/*==================================*/ + /* out: oldest modification in pool, + zero if none */ +{ + buf_page_t* bpage; + ib_uint64_t lsn; + + buf_pool_mutex_enter(); + + bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + + if (bpage == NULL) { + lsn = 0; + } else { + ut_ad(bpage->in_flush_list); + lsn = bpage->oldest_modification; + } + + buf_pool_mutex_exit(); + + /* The returned answer may be out of date: the flush_list can + change after the mutex has been released. */ + + return(lsn); +} + +/*********************************************************************** +Increments the buf_pool clock by one and returns its new value. Remember +that in the 32 bit version the clock wraps around at 4 billion! */ +UNIV_INLINE +ulint +buf_pool_clock_tic(void) +/*====================*/ + /* out: new clock value */ +{ + ut_ad(buf_pool_mutex_own()); + + buf_pool->ulint_clock++; + + return(buf_pool->ulint_clock); +} + +/************************************************************************* +Gets the state of a block. */ +UNIV_INLINE +enum buf_page_state +buf_page_get_state( +/*===============*/ + /* out: state */ + const buf_page_t* bpage) /* in: pointer to the control block */ +{ + enum buf_page_state state = (enum buf_page_state) bpage->state; + +#ifdef UNIV_DEBUG + switch (state) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_FILE_PAGE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + return(state); +} +/************************************************************************* +Gets the state of a block. */ +UNIV_INLINE +enum buf_page_state +buf_block_get_state( +/*================*/ + /* out: state */ + const buf_block_t* block) /* in: pointer to the control block */ +{ + return(buf_page_get_state(&block->page)); +} +/************************************************************************* +Sets the state of a block. 
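Editorial illustration, not part of the patch: the assertions in the body below admit only the transitions listed at the end of buf0buf.h. For example, assuming block->page starts in state BUF_BLOCK_NOT_USED, allocating the block and releasing it again is the legal chain

	buf_page_set_state(&block->page, BUF_BLOCK_READY_FOR_USE);
	buf_page_set_state(&block->page, BUF_BLOCK_MEMORY);
	buf_page_set_state(&block->page, BUF_BLOCK_NOT_USED);

whereas skipping straight from BUF_BLOCK_NOT_USED to BUF_BLOCK_MEMORY would trip ut_a() in a UNIV_DEBUG build.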
*/ +UNIV_INLINE +void +buf_page_set_state( +/*===============*/ + buf_page_t* bpage, /* in/out: pointer to control block */ + enum buf_page_state state) /* in: state */ +{ +#ifdef UNIV_DEBUG + enum buf_page_state old_state = buf_page_get_state(bpage); + + switch (old_state) { + case BUF_BLOCK_ZIP_FREE: + ut_error; + break; + case BUF_BLOCK_ZIP_PAGE: + ut_a(state == BUF_BLOCK_ZIP_DIRTY); + break; + case BUF_BLOCK_ZIP_DIRTY: + ut_a(state == BUF_BLOCK_ZIP_PAGE); + break; + case BUF_BLOCK_NOT_USED: + ut_a(state == BUF_BLOCK_READY_FOR_USE); + break; + case BUF_BLOCK_READY_FOR_USE: + ut_a(state == BUF_BLOCK_MEMORY + || state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_NOT_USED); + break; + case BUF_BLOCK_MEMORY: + ut_a(state == BUF_BLOCK_NOT_USED); + break; + case BUF_BLOCK_FILE_PAGE: + ut_a(state == BUF_BLOCK_NOT_USED + || state == BUF_BLOCK_REMOVE_HASH); + break; + case BUF_BLOCK_REMOVE_HASH: + ut_a(state == BUF_BLOCK_MEMORY); + break; + } +#endif /* UNIV_DEBUG */ + bpage->state = state; + ut_ad(buf_page_get_state(bpage) == state); +} + +/************************************************************************* +Sets the state of a block. */ +UNIV_INLINE +void +buf_block_set_state( +/*================*/ + buf_block_t* block, /* in/out: pointer to control block */ + enum buf_page_state state) /* in: state */ +{ + buf_page_set_state(&block->page, state); +} + +/************************************************************************* +Determines if a block is mapped to a tablespace. */ +UNIV_INLINE +ibool +buf_page_in_file( +/*=============*/ + /* out: TRUE if mapped */ + const buf_page_t* bpage) /* in: pointer to control block */ +{ + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_FREE: + /* This is a free page in buf_pool->zip_free[]. + Such pages should only be accessed by the buddy allocator. */ + ut_error; + break; + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_FILE_PAGE: + return(TRUE); + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + return(FALSE); +} + +/************************************************************************* +Determines if a block should be on unzip_LRU list. */ +UNIV_INLINE +ibool +buf_page_belongs_to_unzip_LRU( +/*==========================*/ + /* out: TRUE if block belongs + to unzip_LRU */ + const buf_page_t* bpage) /* in: pointer to control block */ +{ + ut_ad(buf_page_in_file(bpage)); + + return(bpage->zip.data + && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); +} + +/************************************************************************* +Determine the approximate LRU list position of a block. */ +UNIV_INLINE +ulint +buf_page_get_LRU_position( +/*======================*/ + /* out: LRU list position */ + const buf_page_t* bpage) /* in: control block */ +{ + ut_ad(buf_page_in_file(bpage)); + ut_ad(buf_pool_mutex_own()); + + return(bpage->LRU_position); +} + +/************************************************************************* +Gets the mutex of a block. 
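The canonical calling pattern (editorial sketch; buf_page_get_newest_modification() later in this file follows it):

	mutex_t*	block_mutex = buf_page_get_mutex(bpage);
	ibool		accessed;

	mutex_enter(block_mutex);
	accessed = buf_page_is_accessed(bpage);
	mutex_exit(block_mutex);

The returned mutex is buf_pool_zip_mutex for compressed-only pages and the block's own mutex otherwise, as the switch below shows.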
*/ +UNIV_INLINE +mutex_t* +buf_page_get_mutex( +/*===============*/ + /* out: pointer to mutex + protecting bpage */ + const buf_page_t* bpage) /* in: pointer to control block */ +{ + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_FREE: + ut_error; + return(NULL); + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + return(&buf_pool_zip_mutex); + default: + return(&((buf_block_t*) bpage)->mutex); + } +} + +/************************************************************************* +Get the flush type of a page. */ +UNIV_INLINE +enum buf_flush +buf_page_get_flush_type( +/*====================*/ + /* out: flush type */ + const buf_page_t* bpage) /* in: buffer page */ +{ + enum buf_flush flush_type = (enum buf_flush) bpage->flush_type; + +#ifdef UNIV_DEBUG + switch (flush_type) { + case BUF_FLUSH_LRU: + case BUF_FLUSH_SINGLE_PAGE: + case BUF_FLUSH_LIST: + return(flush_type); + case BUF_FLUSH_N_TYPES: + break; + } + ut_error; +#endif /* UNIV_DEBUG */ + return(flush_type); +} +/************************************************************************* +Set the flush type of a page. */ +UNIV_INLINE +void +buf_page_set_flush_type( +/*====================*/ + buf_page_t* bpage, /* in: buffer page */ + enum buf_flush flush_type) /* in: flush type */ +{ + bpage->flush_type = flush_type; + ut_ad(buf_page_get_flush_type(bpage) == flush_type); +} + +/************************************************************************* +Map a block to a file page. */ +UNIV_INLINE +void +buf_block_set_file_page( +/*====================*/ + buf_block_t* block, /* in/out: pointer to control block */ + ulint space, /* in: tablespace id */ + ulint page_no)/* in: page number */ +{ + buf_block_set_state(block, BUF_BLOCK_FILE_PAGE); + block->page.space = space; + block->page.offset = page_no; +} + +/************************************************************************* +Gets the io_fix state of a block. */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix( +/*================*/ + /* out: io_fix state */ + const buf_page_t* bpage) /* in: pointer to the control block */ +{ + enum buf_io_fix io_fix = (enum buf_io_fix) bpage->io_fix; +#ifdef UNIV_DEBUG + switch (io_fix) { + case BUF_IO_NONE: + case BUF_IO_READ: + case BUF_IO_WRITE: + return(io_fix); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(io_fix); +} + +/************************************************************************* +Gets the io_fix state of a block. */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix( +/*================*/ + /* out: io_fix state */ + const buf_block_t* block) /* in: pointer to the control block */ +{ + return(buf_page_get_io_fix(&block->page)); +} + +/************************************************************************* +Sets the io_fix state of a block. */ +UNIV_INLINE +void +buf_page_set_io_fix( +/*================*/ + buf_page_t* bpage, /* in/out: control block */ + enum buf_io_fix io_fix) /* in: io_fix state */ +{ + ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + bpage->io_fix = io_fix; + ut_ad(buf_page_get_io_fix(bpage) == io_fix); +} + +/************************************************************************* +Sets the io_fix state of a block. 
*/ +UNIV_INLINE +void +buf_block_set_io_fix( +/*=================*/ + buf_block_t* block, /* in/out: control block */ + enum buf_io_fix io_fix) /* in: io_fix state */ +{ + buf_page_set_io_fix(&block->page, io_fix); +} + +/************************************************************************ +Determine if a buffer block can be relocated in memory. The block +can be dirty, but it must not be I/O-fixed or bufferfixed. */ +UNIV_INLINE +ibool +buf_page_can_relocate( +/*==================*/ + const buf_page_t* bpage) /* control block being relocated */ +{ + ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + return(buf_page_get_io_fix(bpage) == BUF_IO_NONE + && bpage->buf_fix_count == 0); +} + +/************************************************************************* +Determine if a block has been flagged old. */ +UNIV_INLINE +ibool +buf_page_is_old( +/*============*/ + /* out: TRUE if old */ + const buf_page_t* bpage) /* in: control block */ +{ + ut_ad(buf_page_in_file(bpage)); + ut_ad(buf_pool_mutex_own()); + + return(bpage->old); +} + +/************************************************************************* +Flag a block old. */ +UNIV_INLINE +void +buf_page_set_old( +/*=============*/ + buf_page_t* bpage, /* in/out: control block */ + ibool old) /* in: old */ +{ + ut_a(buf_page_in_file(bpage)); + ut_ad(buf_pool_mutex_own()); + ut_ad(bpage->in_LRU_list); + +#ifdef UNIV_LRU_DEBUG + if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage) + && UT_LIST_GET_PREV(LRU, bpage)->old + == UT_LIST_GET_NEXT(LRU, bpage)->old) { + ut_a(UT_LIST_GET_PREV(LRU, bpage)->old == old); + } +#endif /* UNIV_LRU_DEBUG */ + + bpage->old = old; +} + +/************************************************************************* +Determine if a block has been accessed in the buffer pool. */ +UNIV_INLINE +ibool +buf_page_is_accessed( +/*=================*/ + /* out: TRUE if accessed */ + const buf_page_t* bpage) /* in: control block */ +{ + ut_ad(buf_page_in_file(bpage)); + + return(bpage->accessed); +} + +/************************************************************************* +Flag a block accessed. */ +UNIV_INLINE +void +buf_page_set_accessed( +/*==================*/ + buf_page_t* bpage, /* in/out: control block */ + ibool accessed) /* in: accessed */ +{ + ut_a(buf_page_in_file(bpage)); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + bpage->accessed = accessed; +} + +/************************************************************************* +Gets the buf_block_t handle of a buffered file block if an uncompressed +page frame exists, or NULL. */ +UNIV_INLINE +buf_block_t* +buf_page_get_block( +/*===============*/ + /* out: control block, or NULL */ + buf_page_t* bpage) /* in: control block, or NULL */ +{ + if (UNIV_LIKELY(bpage != NULL)) { + ut_ad(buf_page_in_file(bpage)); + + if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { + return((buf_block_t*) bpage); + } + } + + return(NULL); +} + +#ifdef UNIV_DEBUG +/************************************************************************* +Gets a pointer to the memory frame of a block. 
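Editorial example, not part of the patch (block is an assumed bufferfixed file page, which the debug assertions below require): the page number stored in the frame header must agree with the control block,

	buf_frame_t*	frame = buf_block_get_frame(block);

	ut_a(mach_read_from_4(frame + FIL_PAGE_OFFSET)
	     == buf_block_get_page_no(block));

FIL_PAGE_OFFSET is the same header field that buf_ptr_get_fsp_addr() reads later in this file.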
*/ +UNIV_INLINE +buf_frame_t* +buf_block_get_frame( +/*================*/ + /* out: pointer to the frame */ + const buf_block_t* block) /* in: pointer to the control block */ +{ + ut_ad(block); + + switch (buf_block_get_state(block)) { + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + case BUF_BLOCK_NOT_USED: + ut_error; + break; + case BUF_BLOCK_FILE_PAGE: + ut_a(block->page.buf_fix_count > 0); + /* fall through */ + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + goto ok; + } + ut_error; +ok: + return((buf_frame_t*) block->frame); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Gets the space id of a block. */ +UNIV_INLINE +ulint +buf_page_get_space( +/*===============*/ + /* out: space id */ + const buf_page_t* bpage) /* in: pointer to the control block */ +{ + ut_ad(bpage); + ut_a(buf_page_in_file(bpage)); + + return(bpage->space); +} + +/************************************************************************* +Gets the space id of a block. */ +UNIV_INLINE +ulint +buf_block_get_space( +/*================*/ + /* out: space id */ + const buf_block_t* block) /* in: pointer to the control block */ +{ + ut_ad(block); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + return(block->page.space); +} + +/************************************************************************* +Gets the page number of a block. */ +UNIV_INLINE +ulint +buf_page_get_page_no( +/*=================*/ + /* out: page number */ + const buf_page_t* bpage) /* in: pointer to the control block */ +{ + ut_ad(bpage); + ut_a(buf_page_in_file(bpage)); + + return(bpage->offset); +} + +/************************************************************************* +Gets the page number of a block. */ +UNIV_INLINE +ulint +buf_block_get_page_no( +/*==================*/ + /* out: page number */ + const buf_block_t* block) /* in: pointer to the control block */ +{ + ut_ad(block); + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + + return(block->page.offset); +} + +/************************************************************************* +Gets the compressed page size of a block. */ +UNIV_INLINE +ulint +buf_page_get_zip_size( +/*==================*/ + /* out: compressed page size, or 0 */ + const buf_page_t* bpage) /* in: pointer to the control block */ +{ + return(bpage->zip.ssize ? 512 << bpage->zip.ssize : 0); +} + +/************************************************************************* +Gets the compressed page size of a block. */ +UNIV_INLINE +ulint +buf_block_get_zip_size( +/*===================*/ + /* out: compressed page size, or 0 */ + const buf_block_t* block) /* in: pointer to the control block */ +{ + return(block->page.zip.ssize ? 512 << block->page.zip.ssize : 0); +} + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +/************************************************************************* +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. */ +UNIV_INLINE +const page_zip_des_t* +buf_frame_get_page_zip( +/*===================*/ + /* out: compressed page descriptor, or NULL */ + const byte* ptr) /* in: pointer to the page */ +{ + return(buf_block_get_page_zip(buf_block_align(ptr))); +} +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + +/************************************************************************** +Gets the space id, page offset, and byte offset within page of a +pointer pointing to a buffer frame containing a file page. 
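Editorial usage sketch (ptr is an assumed pointer into some buffer frame):

	ulint		space;
	fil_addr_t	addr;

	buf_ptr_get_fsp_addr(ptr, &space, &addr);

Afterwards addr.page holds the page number and addr.boffset the byte offset of ptr within that page, as the implementation below shows.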
*/
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+	const void*	ptr,	/* in: pointer to a buffer frame */
+	ulint*		space,	/* out: space id */
+	fil_addr_t*	addr)	/* out: page offset and byte offset */
+{
+	const page_t*	page = (const page_t*) ut_align_down(ptr,
+							     UNIV_PAGE_SIZE);
+
+	*space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET);
+	addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE);
+}
+
+/**************************************************************************
+Gets the hash value of the page the pointer is pointing to. This can be used
+in searches in the lock hash table. */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+				/* out: lock hash value */
+	const buf_block_t*	block)	/* in: block */
+{
+	return(block->lock_hash_val);
+}
+
+/************************************************************************
+Allocates a buffer block. */
+UNIV_INLINE
+buf_block_t*
+buf_block_alloc(
+/*============*/
+				/* out, own: the allocated block,
+				in state BUF_BLOCK_MEMORY */
+	ulint	zip_size)	/* in: compressed page size in bytes,
+				or 0 if uncompressed tablespace */
+{
+	buf_block_t*	block;
+
+	block = buf_LRU_get_free_block(zip_size);
+
+	buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+	return(block);
+}
+
+/************************************************************************
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block)	/* in, own: block to be freed */
+{
+	buf_pool_mutex_enter();
+
+	mutex_enter(&block->mutex);
+
+	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+	buf_LRU_block_free_non_file_page(block);
+
+	mutex_exit(&block->mutex);
+
+	buf_pool_mutex_exit();
+}
+
+/*************************************************************************
+Copies contents of a buffer frame to a given buffer. */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+					/* out: buf */
+	byte*			buf,	/* in: buffer to copy to */
+	const buf_frame_t*	frame)	/* in: buffer frame */
+{
+	ut_ad(buf && frame);
+
+	ut_memcpy(buf, frame, UNIV_PAGE_SIZE);
+
+	return(buf);
+}
+
+/************************************************************************
+Calculates a folded value of a file page address to use in the page hash
+table. */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+			/* out: the folded value */
+	ulint	space,	/* in: space id */
+	ulint	offset)	/* in: offset of the page within space */
+{
+	return((space << 20) + space + offset);
+}
+
+/************************************************************************
+This function tells whether an i/o operation is in progress on a
+buffer page. */
+UNIV_INLINE
+ibool
+buf_page_io_query(
+/*==============*/
+				/* out: TRUE if io going on */
+	buf_page_t*	bpage)	/* in: buf_pool block, must be bufferfixed */
+{
+	ibool	io_fixed;
+
+	buf_pool_mutex_enter();
+
+	ut_ad(buf_page_in_file(bpage));
+	ut_ad(bpage->buf_fix_count > 0);
+
+	io_fixed = buf_page_get_io_fix(bpage) != BUF_IO_NONE;
+	buf_pool_mutex_exit();
+
+	return(io_fixed);
+}
+
+/************************************************************************
+Gets the youngest modification log sequence number for a frame.
+Returns zero if it is not a file page or no modification has occurred yet.
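Editorial note: a page is known to be unmodified since it was read into the pool exactly when

	buf_page_get_newest_modification(bpage) == 0

while a dirty page additionally has a nonzero oldest_modification and sits in buf_pool->flush_list (see buf_page_struct in buf0buf.h).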
*/ +UNIV_INLINE +ib_uint64_t +buf_page_get_newest_modification( +/*=============================*/ + /* out: newest modification to page */ + const buf_page_t* bpage) /* in: block containing the + page frame */ +{ + ib_uint64_t lsn; + mutex_t* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_page_in_file(bpage)) { + lsn = bpage->newest_modification; + } else { + lsn = 0; + } + + mutex_exit(block_mutex); + + return(lsn); +} + +/************************************************************************ +Increments the modify clock of a frame by 1. The caller must (1) own the +buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock +on the block. */ +UNIV_INLINE +void +buf_block_modify_clock_inc( +/*=======================*/ + buf_block_t* block) /* in: block */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad((buf_pool_mutex_own() + && (block->page.buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); +#endif /* UNIV_SYNC_DEBUG */ + + block->modify_clock++; +} + +/************************************************************************ +Returns the value of the modify clock. The caller must have an s-lock +or x-lock on the block. */ +UNIV_INLINE +ib_uint64_t +buf_block_get_modify_clock( +/*=======================*/ + /* out: value */ + buf_block_t* block) /* in: block */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); +#endif /* UNIV_SYNC_DEBUG */ + + return(block->modify_clock); +} + +/*********************************************************************** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +#ifdef UNIV_SYNC_DEBUG + const char* file, /* in: file name */ + ulint line, /* in: line */ +#endif /* UNIV_SYNC_DEBUG */ + buf_block_t* block) /* in: block to bufferfix */ +{ +#ifdef UNIV_SYNC_DEBUG + ibool ret; + + ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line); + ut_a(ret); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&block->mutex)); + + block->page.buf_fix_count++; +} +#ifdef UNIV_SYNC_DEBUG +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b) +#else /* UNIV_SYNC_DEBUG */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b) +#endif /* UNIV_SYNC_DEBUG */ + +/*********************************************************************** +Decrements the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_dec( +/*==================*/ + buf_block_t* block) /* in: block to bufferunfix */ +{ + ut_ad(mutex_own(&block->mutex)); + + block->page.buf_fix_count--; +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&block->debug_latch); +#endif +} + +/********************************************************************** +Returns the control block of a file page, NULL if not found. 
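The required calling pattern (editorial; the implementation below asserts buf_pool_mutex_own(), and buf_page_peek() in this file does exactly this):

	buf_page_t*	bpage;

	buf_pool_mutex_enter();

	bpage = buf_page_hash_get(space, offset);

	buf_pool_mutex_exit();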
*/ +UNIV_INLINE +buf_page_t* +buf_page_hash_get( +/*==============*/ + /* out: block, NULL if not found */ + ulint space, /* in: space id */ + ulint offset) /* in: offset of the page within space */ +{ + buf_page_t* bpage; + ulint fold; + + ut_ad(buf_pool); + ut_ad(buf_pool_mutex_own()); + + /* Look for the page in the hash table */ + + fold = buf_page_address_fold(space, offset); + + HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage, + ut_ad(bpage->in_page_hash && !bpage->in_zip_hash + && buf_page_in_file(bpage)), + bpage->space == space && bpage->offset == offset); + if (bpage) { + ut_a(buf_page_in_file(bpage)); + ut_ad(bpage->in_page_hash); + ut_ad(!bpage->in_zip_hash); + UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage); + } + + return(bpage); +} + +/********************************************************************** +Returns the control block of a file page, NULL if not found +or an uncompressed page frame does not exist. */ +UNIV_INLINE +buf_block_t* +buf_block_hash_get( +/*===============*/ + /* out: block, NULL if not found */ + ulint space, /* in: space id */ + ulint offset) /* in: offset of the page within space */ +{ + return(buf_page_get_block(buf_page_hash_get(space, offset))); +} + +/************************************************************************ +Returns TRUE if the page can be found in the buffer pool hash table. NOTE +that it is possible that the page is not yet read from disk, though. */ +UNIV_INLINE +ibool +buf_page_peek( +/*==========*/ + /* out: TRUE if found from page hash table, + NOTE that the page is not necessarily yet read + from disk! */ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + const buf_page_t* bpage; + + buf_pool_mutex_enter(); + + bpage = buf_page_hash_get(space, offset); + + buf_pool_mutex_exit(); + + return(bpage != NULL); +} + +/************************************************************************ +Releases a compressed-only page acquired with buf_page_get_zip(). */ +UNIV_INLINE +void +buf_page_release_zip( +/*=================*/ + buf_page_t* bpage) /* in: buffer block */ +{ + buf_block_t* block; + + ut_ad(bpage); + ut_a(bpage->buf_fix_count > 0); + + switch (buf_page_get_state(bpage)) { + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_ZIP_DIRTY: + mutex_enter(&buf_pool_zip_mutex); + bpage->buf_fix_count--; + mutex_exit(&buf_pool_zip_mutex); + return; + case BUF_BLOCK_FILE_PAGE: + block = (buf_block_t*) bpage; + mutex_enter(&block->mutex); +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&block->debug_latch); +#endif + bpage->buf_fix_count--; + mutex_exit(&block->mutex); + return; + case BUF_BLOCK_ZIP_FREE: + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_READY_FOR_USE: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + break; + } + + ut_error; +} + +/************************************************************************ +Decrements the bufferfix count of a buffer control block and releases +a latch, if specified. 
*/ +UNIV_INLINE +void +buf_page_release( +/*=============*/ + buf_block_t* block, /* in: buffer block */ + ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH, + RW_NO_LATCH */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(block); + + ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_a(block->page.buf_fix_count > 0); + + if (rw_latch == RW_X_LATCH && mtr->modifications) { + buf_pool_mutex_enter(); + buf_flush_note_modification(block, mtr); + buf_pool_mutex_exit(); + } + + mutex_enter(&block->mutex); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_s_unlock(&(block->debug_latch)); +#endif + block->page.buf_fix_count--; + + /* Dirty blocks should be in the flush list. */ + ut_ad(!block->page.oldest_modification + || block->page.in_flush_list); + + mutex_exit(&block->mutex); + + if (rw_latch == RW_S_LATCH) { + rw_lock_s_unlock(&(block->lock)); + } else if (rw_latch == RW_X_LATCH) { + rw_lock_x_unlock(&(block->lock)); + } +} + +#ifdef UNIV_SYNC_DEBUG +/************************************************************************* +Adds latch level info for the rw-lock protecting the buffer frame. This +should be called in the debug version after a successful latching of a +page if we know the latching order level of the acquired latch. */ +UNIV_INLINE +void +buf_block_dbg_add_level( +/*====================*/ + buf_block_t* block, /* in: buffer page + where we have acquired latch */ + ulint level) /* in: latching order level */ +{ + sync_thread_add_level(&block->lock, level); +} +#endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h new file mode 100644 index 00000000000..11a37351479 --- /dev/null +++ b/storage/xtradb/include/buf0flu.h @@ -0,0 +1,151 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0flu_h +#define buf0flu_h + +#include "univ.i" +#include "buf0types.h" +#include "ut0byte.h" +#include "mtr0types.h" + +/************************************************************************ +Remove a block from the flush list of modified blocks. */ +UNIV_INTERN +void +buf_flush_remove( +/*=============*/ + buf_page_t* bpage); /* in: pointer to the block in question */ +/************************************************************************ +Updates the flush system data structures when a write is completed. 
*/
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+	buf_page_t*	bpage);	/* in: pointer to the block in question */
+/*************************************************************************
+Flushes pages from the end of the LRU list if there is too small
+a margin of replaceable pages there. */
+UNIV_INTERN
+void
+buf_flush_free_margin(
+/*==================*/
+	ibool	wait);	/* in: TRUE=wait for a running LRU flush
+			batch to end, if any */
+/************************************************************************
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+	byte*		page,		/* in/out: page */
+	void*		page_zip_,	/* in/out: compressed page, or NULL */
+	ib_uint64_t	newest_lsn);	/* in: newest modification lsn
+					to the page */
+/***********************************************************************
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages! */
+UNIV_INTERN
+ulint
+buf_flush_batch(
+/*============*/
+					/* out: number of blocks for which the
+					write request was queued;
+					ULINT_UNDEFINED if there was a flush
+					of the same type already running */
+	enum buf_flush	flush_type,	/* in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+					then the caller must not own any
+					latches on pages */
+	ulint		min_n,		/* in: desired minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though) */
+	ib_uint64_t	lsn_limit);	/* in: in the case of BUF_FLUSH_LIST,
+					all blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+/**********************************************************************
+Waits until a flush batch of the given type ends. */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+	enum buf_flush	type);	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+/************************************************************************
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+	buf_block_t*	block,	/* in: block which is modified */
+	mtr_t*		mtr);	/* in: mtr */
+/************************************************************************
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+	buf_block_t*	block,		/* in: block which is modified */
+	ib_uint64_t	start_lsn,	/* in: start lsn of the first mtr in a
+					set of mtr's */
+	ib_uint64_t	end_lsn);	/* in: end lsn of the last mtr in the
+					set of mtr's */
+/************************************************************************
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., transition FILE_PAGE => NOT_USED allowed.
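Editorial paraphrase, per the FILE_PAGE => NOT_USED transition rule stated in buf0buf.h: for a block in the LRU list the check amounts to

	bpage->oldest_modification == 0
	&& bpage->buf_fix_count == 0
	&& buf_page_get_io_fix(bpage) == BUF_IO_NONE

i.e. the page is clean, not bufferfixed, and has no i/o pending.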
*/ +UNIV_INTERN +ibool +buf_flush_ready_for_replace( +/*========================*/ + /* out: TRUE if can replace immediately */ + buf_page_t* bpage); /* in: buffer control block, must be + buf_page_in_file(bpage) and in the LRU list */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/********************************************************************** +Validates the flush list. */ +UNIV_INTERN +ibool +buf_flush_validate(void); +/*====================*/ + /* out: TRUE if ok */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/* When buf_flush_free_margin is called, it tries to make this many blocks +available to replacement in the free list and at the end of the LRU list (to +make sure that a read-ahead batch can be read efficiently in a single +sweep). */ + +#define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA) +#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100) + +#ifndef UNIV_NONINL +#include "buf0flu.ic" +#endif + +#endif diff --git a/storage/xtradb/include/buf0flu.ic b/storage/xtradb/include/buf0flu.ic new file mode 100644 index 00000000000..2dfa7e68d41 --- /dev/null +++ b/storage/xtradb/include/buf0flu.ic @@ -0,0 +1,120 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer pool flush algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0buf.h" +#include "mtr0mtr.h" + +/************************************************************************ +Inserts a modified block into the flush list. */ +UNIV_INTERN +void +buf_flush_insert_into_flush_list( +/*=============================*/ + buf_block_t* block); /* in/out: block which is modified */ +/************************************************************************ +Inserts a modified block into the flush list in the right sorted position. +This function is used by recovery, because there the modifications do not +necessarily come in the order of lsn's. */ +UNIV_INTERN +void +buf_flush_insert_sorted_into_flush_list( +/*====================================*/ + buf_block_t* block); /* in/out: block which is modified */ + +/************************************************************************ +This function should be called at a mini-transaction commit, if a page was +modified in it. Puts the block to the list of modified blocks, if it is not +already in it. 
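The expected call site (editorial; this mirrors buf_page_release() in buf0buf.ic, which makes exactly this call when an x-latched page is released inside a mini-transaction):

	if (rw_latch == RW_X_LATCH && mtr->modifications) {
		buf_pool_mutex_enter();
		buf_flush_note_modification(block, mtr);
		buf_pool_mutex_exit();
	}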
*/ +UNIV_INLINE +void +buf_flush_note_modification( +/*========================*/ + buf_block_t* block, /* in: block which is modified */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(block); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(buf_pool_mutex_own()); + + ut_ad(mtr->start_lsn != 0); + ut_ad(mtr->modifications); + ut_ad(block->page.newest_modification <= mtr->end_lsn); + + block->page.newest_modification = mtr->end_lsn; + + if (!block->page.oldest_modification) { + + block->page.oldest_modification = mtr->start_lsn; + ut_ad(block->page.oldest_modification != 0); + + buf_flush_insert_into_flush_list(block); + } else { + ut_ad(block->page.oldest_modification <= mtr->start_lsn); + } + + ++srv_buf_pool_write_requests; +} + +/************************************************************************ +This function should be called when recovery has modified a buffer page. */ +UNIV_INLINE +void +buf_flush_recv_note_modification( +/*=============================*/ + buf_block_t* block, /* in: block which is modified */ + ib_uint64_t start_lsn, /* in: start lsn of the first mtr in a + set of mtr's */ + ib_uint64_t end_lsn) /* in: end lsn of the last mtr in the + set of mtr's */ +{ + ut_ad(block); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count > 0); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + buf_pool_mutex_enter(); + + ut_ad(block->page.newest_modification <= end_lsn); + + block->page.newest_modification = end_lsn; + + if (!block->page.oldest_modification) { + + block->page.oldest_modification = start_lsn; + + ut_ad(block->page.oldest_modification != 0); + + buf_flush_insert_sorted_into_flush_list(block); + } else { + ut_ad(block->page.oldest_modification <= start_lsn); + } + + buf_pool_mutex_exit(); +} diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h new file mode 100644 index 00000000000..e73869580bd --- /dev/null +++ b/storage/xtradb/include/buf0lru.h @@ -0,0 +1,265 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer pool LRU replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0lru_h +#define buf0lru_h + +#include "univ.i" +#include "ut0byte.h" +#include "buf0types.h" + +/** The return type of buf_LRU_free_block() */ +enum buf_lru_free_block_status { + /** freed */ + BUF_LRU_FREED = 0, + /** not freed because the caller asked to remove the + uncompressed frame but the control block cannot be + relocated */ + BUF_LRU_CANNOT_RELOCATE, + /** not freed because of some other reason */ + BUF_LRU_NOT_FREED +}; + +/********************************************************************** +Tries to remove LRU flushed blocks from the end of the LRU list and put them +to the free list. This is beneficial for the efficiency of the insert buffer +operation, as flushed pages from non-unique non-clustered indexes are here +taken out of the buffer pool, and their inserts redirected to the insert +buffer. Otherwise, the flushed blocks could get modified again before read +operations need new buffer blocks, and the i/o work done in flushing would be +wasted. */ +UNIV_INTERN +void +buf_LRU_try_free_flushed_blocks(void); +/*==================================*/ +/********************************************************************** +Returns TRUE if less than 25 % of the buffer pool is available. This can be +used in heuristics to prevent huge transactions eating up the whole buffer +pool for their locks. */ +UNIV_INTERN +ibool +buf_LRU_buf_pool_running_out(void); +/*==============================*/ + /* out: TRUE if less than 25 % of buffer pool + left */ + +/*####################################################################### +These are low-level functions +#########################################################################*/ + +/* Minimum LRU list length for which the LRU_old pointer is defined */ + +#define BUF_LRU_OLD_MIN_LEN 80 + +#define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA) + +/********************************************************************** +Invalidates all pages belonging to a given tablespace when we are deleting +the data file(s) of that tablespace. A PROBLEM: if readahead is being started, +what guarantees that it will not try to read in pages after this operation has +completed? */ +UNIV_INTERN +void +buf_LRU_invalidate_tablespace( +/*==========================*/ + ulint id); /* in: space id */ +/********************************************************************** +Gets the minimum LRU_position field for the blocks in an initial segment +(determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not +guaranteed to be precise, because the ulint_clock may wrap around. */ +UNIV_INTERN +ulint +buf_LRU_get_recent_limit(void); +/*==========================*/ + /* out: the limit; zero if could not determine it */ +/************************************************************************ +Insert a compressed block into buf_pool->zip_clean in the LRU order. 
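[Editorial note: a hedged sketch of how a caller might consult the 25%
heuristic declared above; the reaction shown is invented for illustration.]

	if (buf_LRU_buf_pool_running_out()) {
		/* e.g. refuse the operation that would pin still more
		buffer pool pages, and report an error instead */
	}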
*/ +UNIV_INTERN +void +buf_LRU_insert_zip_clean( +/*=====================*/ + buf_page_t* bpage); /* in: pointer to the block in question */ + +/********************************************************************** +Try to free a block. If bpage is a descriptor of a compressed-only +page, the descriptor object will be freed as well. + +NOTE: If this function returns BUF_LRU_FREED, it will not temporarily +release buf_pool_mutex. Furthermore, the page frame will no longer be +accessible via bpage. + +The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and +release these two mutexes after the call. No other +buf_page_get_mutex() may be held when calling this function. */ +UNIV_INTERN +enum buf_lru_free_block_status +buf_LRU_free_block( +/*===============*/ + /* out: BUF_LRU_FREED if freed, + BUF_LRU_CANNOT_RELOCATE or + BUF_LRU_NOT_FREED otherwise. */ + buf_page_t* bpage, /* in: block to be freed */ + ibool zip, /* in: TRUE if should remove also the + compressed page of an uncompressed page */ + ibool* buf_pool_mutex_released); + /* in: pointer to a variable that will + be assigned TRUE if buf_pool_mutex + was temporarily released, or NULL */ +/********************************************************************** +Try to free a replaceable block. */ +UNIV_INTERN +ibool +buf_LRU_search_and_free_block( +/*==========================*/ + /* out: TRUE if found and freed */ + ulint n_iterations); /* in: how many times this has been called + repeatedly without result: a high value means + that we should search farther; if + n_iterations < 10, then we search + n_iterations / 10 * buf_pool->curr_size + pages from the end of the LRU list; if + n_iterations < 5, then we will also search + n_iterations / 5 of the unzip_LRU list. */ +/********************************************************************** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, returns NULL. */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_only(void); +/*=======================*/ + /* out: a free control block, or NULL + if the buf_block->free list is empty */ +/********************************************************************** +Returns a free block from the buf_pool. The block is taken off the +free list. If it is empty, blocks are moved from the end of the +LRU list to the free list. */ +UNIV_INTERN +buf_block_t* +buf_LRU_get_free_block( +/*===================*/ + /* out: the free control block, + in state BUF_BLOCK_READY_FOR_USE */ + ulint zip_size); /* in: compressed page size in bytes, + or 0 if uncompressed tablespace */ + +/********************************************************************** +Puts a block back to the free list. */ +UNIV_INTERN +void +buf_LRU_block_free_non_file_page( +/*=============================*/ + buf_block_t* block); /* in: block, must not contain a file page */ +/********************************************************************** +Adds a block to the LRU list. */ +UNIV_INTERN +void +buf_LRU_add_block( +/*==============*/ + buf_page_t* bpage, /* in: control block */ + ibool old); /* in: TRUE if should be put to the old + blocks in the LRU list, else put to the + start; if the LRU list is very short, added to + the start regardless of this parameter */ +/********************************************************************** +Adds a block to the LRU list of decompressed zip pages. 
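[Editorial note: a sketch of the mutex protocol for buf_LRU_free_block()
described above; block_mutex stands for buf_page_get_mutex(bpage), and the
post-call state of the mutexes is glossed over.]

	ibool	released = FALSE;

	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(block_mutex));

	switch (buf_LRU_free_block(bpage, TRUE, &released)) {
	case BUF_LRU_FREED:
		/* the page frame is gone: bpage must no longer
		be dereferenced */
		break;
	case BUF_LRU_CANNOT_RELOCATE:
	case BUF_LRU_NOT_FREED:
		if (released) {
			/* buf_pool_mutex was dropped and re-acquired
			inside: any cached LRU state is now stale */
		}
		break;
	}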
*/ +UNIV_INTERN +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /* in: control block */ + ibool old); /* in: TRUE if should be put to the end + of the list, else put to the start */ +/********************************************************************** +Moves a block to the start of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_young( +/*=====================*/ + buf_page_t* bpage); /* in: control block */ +/********************************************************************** +Moves a block to the end of the LRU list. */ +UNIV_INTERN +void +buf_LRU_make_block_old( +/*===================*/ + buf_page_t* bpage); /* in: control block */ +/************************************************************************ +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +UNIV_INTERN +void +buf_LRU_stat_update(void); +/*=====================*/ + +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************** +Validates the LRU list. */ +UNIV_INTERN +ibool +buf_LRU_validate(void); +/*==================*/ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************** +Prints the LRU list. */ +UNIV_INTERN +void +buf_LRU_print(void); +/*===============*/ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ + +/********************************************************************** +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics we decide +if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */ + +/** Statistics for selecting the LRU list for eviction. */ +struct buf_LRU_stat_struct +{ + ulint io; /**< Counter of buffer pool I/O operations. */ + ulint unzip; /**< Counter of page_zip_decompress operations. */ +}; + +typedef struct buf_LRU_stat_struct buf_LRU_stat_t; + +/** Current operation counters. Not protected by any mutex. +Cleared by buf_LRU_stat_update(). */ +extern buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Protected by buf_pool_mutex. */ +extern buf_LRU_stat_t buf_LRU_stat_sum; + +/************************************************************************ +Increments the I/O counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++ +/************************************************************************ +Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++ + +#ifndef UNIV_NONINL +#include "buf0lru.ic" +#endif + +#endif diff --git a/storage/xtradb/include/buf0lru.ic b/storage/xtradb/include/buf0lru.ic new file mode 100644 index 00000000000..f4c40e0b606 --- /dev/null +++ b/storage/xtradb/include/buf0lru.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/buf0rea.h b/storage/xtradb/include/buf0rea.h
new file mode 100644
index 00000000000..6d138a3a02b
--- /dev/null
+++ b/storage/xtradb/include/buf0rea.h
@@ -0,0 +1,136 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "univ.i"
+#include "buf0types.h"
+
+/************************************************************************
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread. Does a random read-ahead if it seems
+sensible. */
+UNIV_INTERN
+ulint
+buf_read_page(
+/*==========*/
+			/* out: number of page read requests issued: this can
+			be > 1 if read-ahead occurred */
+	ulint	space,	/* in: space id */
+	ulint	zip_size,/* in: compressed page size in bytes, or 0 */
+	ulint	offset);/* in: page number */
+/************************************************************************
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed, as sketched below.
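[Editorial note: the usage pattern just described, sketched; space, zip_size
and offset are assumed to identify the page being accessed, and
buf_read_ahead_linear() is the function declared directly below.]

	/* issue the read itself, possibly with random read-ahead */
	ulint	n_issued = buf_read_page(space, zip_size, offset);

	/* on the first access, once the page is buffer-fixed, let the
	linear read-ahead heuristic inspect the surrounding area */
	n_issued += buf_read_ahead_linear(space, zip_size, offset);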
+NOTE 1: as this function looks at the natural predecessor and successor +fields on the page, what happens, if these are not initialized to any +sensible value? No problem, before applying read-ahead we check that the +area to read is within the span of the space, if not, read-ahead is not +applied. An uninitialized value may result in a useless read operation, but +only very improbably. +NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this +function must be written such that it cannot end up waiting for these +latches! +NOTE 3: the calling thread must want access to the page given: this rule is +set to prevent unintended read-aheads performed by ibuf routines, a situation +which could result in a deadlock if the OS does not support asynchronous io. */ +UNIV_INTERN +ulint +buf_read_ahead_linear( +/*==================*/ + /* out: number of page read requests issued */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint offset);/* in: page number of a page; NOTE: the current thread + must want access to this page (see NOTE 3 above) */ +/************************************************************************ +Issues read requests for pages which the ibuf module wants to read in, in +order to contract the insert buffer tree. Technically, this function is like +a read-ahead function. */ +UNIV_INTERN +void +buf_read_ibuf_merge_pages( +/*======================*/ + ibool sync, /* in: TRUE if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + const ulint* space_ids, /* in: array of space ids */ + const ib_int64_t* space_versions,/* in: the spaces must have + this version number + (timestamp), otherwise we + discard the read; we use this + to cancel reads if DISCARD + + IMPORT may have changed the + tablespace size */ + const ulint* page_nos, /* in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored); /* in: number of elements + in the arrays */ +/************************************************************************ +Issues read requests for pages which recovery wants to read in. */ +UNIV_INTERN +void +buf_read_recv_pages( +/*================*/ + ibool sync, /* in: TRUE if the caller + wants this function to wait + for the highest address page + to get read in, before this + function returns */ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in + bytes, or 0 */ + const ulint* page_nos, /* in: array of page numbers + to read, with the highest page + number the last in the + array */ + ulint n_stored); /* in: number of page numbers + in the array */ + +/* The size in pages of the area which the read-ahead algorithms read if +invoked */ + +#define BUF_READ_AHEAD_AREA \ + ut_min(64, ut_2_power_up(buf_pool->curr_size / 32)) + +/* Modes used in read-ahead */ +#define BUF_READ_IBUF_PAGES_ONLY 131 +#define BUF_READ_ANY_PAGE 132 + +#endif diff --git a/storage/xtradb/include/buf0types.h b/storage/xtradb/include/buf0types.h new file mode 100644 index 00000000000..f2721da85f9 --- /dev/null +++ b/storage/xtradb/include/buf0types.h @@ -0,0 +1,70 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The database buffer pool global types for the directory + +Created 11/17/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0types_h +#define buf0types_h + +typedef struct buf_page_struct buf_page_t; +typedef struct buf_block_struct buf_block_t; +typedef struct buf_chunk_struct buf_chunk_t; +typedef struct buf_pool_struct buf_pool_t; + +/* The 'type' used of a buffer frame */ +typedef byte buf_frame_t; + +/* Flags for flush types */ +enum buf_flush { + BUF_FLUSH_LRU = 0, + BUF_FLUSH_SINGLE_PAGE, + BUF_FLUSH_LIST, + BUF_FLUSH_N_TYPES /* index of last element + 1 */ +}; + +/* Flags for io_fix types */ +enum buf_io_fix { + BUF_IO_NONE = 0, /**< no pending I/O */ + BUF_IO_READ, /**< read pending */ + BUF_IO_WRITE /**< write pending */ +}; + +/* Parameters of binary buddy system for compressed pages (buf0buddy.h) */ +#if UNIV_WORD_SIZE <= 4 /* 32-bit system */ +# define BUF_BUDDY_LOW_SHIFT 6 +#else /* 64-bit system */ +# define BUF_BUDDY_LOW_SHIFT 7 +#endif +#define BUF_BUDDY_LOW (1 << BUF_BUDDY_LOW_SHIFT) + /* minimum block size in the binary + buddy system; must be at least + sizeof(buf_page_t) */ +#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT) + /* number of buddy sizes */ + +/* twice the maximum block size of the buddy system; +the underlying memory is aligned by this amount: +this must be equal to UNIV_PAGE_SIZE */ +#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES) + +#endif + diff --git a/storage/xtradb/include/data0data.h b/storage/xtradb/include/data0data.h new file mode 100644 index 00000000000..1190a7ae45a --- /dev/null +++ b/storage/xtradb/include/data0data.h @@ -0,0 +1,480 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
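[Editorial note: a worked example for the binary buddy constants defined in
buf0types.h above, assuming a 64-bit system and the usual
UNIV_PAGE_SIZE_SHIFT of 14, i.e. 16 KiB pages:

	BUF_BUDDY_LOW   = 1 << 7   = 128 bytes (smallest buddy block)
	BUF_BUDDY_SIZES = 14 - 7   = 7 size classes (128 ... 8192 bytes)
	BUF_BUDDY_HIGH  = 128 << 7 = 16384 = UNIV_PAGE_SIZE ]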
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef data0data_h +#define data0data_h + +#include "univ.i" + +#include "data0types.h" +#include "data0type.h" +#include "mem0mem.h" +#include "dict0types.h" + +typedef struct big_rec_struct big_rec_t; + +#ifdef UNIV_DEBUG +/************************************************************************* +Gets pointer to the type struct of SQL data field. */ +UNIV_INLINE +dtype_t* +dfield_get_type( +/*============*/ + /* out: pointer to the type struct */ + const dfield_t* field); /* in: SQL data field */ +/************************************************************************* +Gets pointer to the data in a field. */ +UNIV_INLINE +void* +dfield_get_data( +/*============*/ + /* out: pointer to data */ + const dfield_t* field); /* in: field */ +#else /* UNIV_DEBUG */ +# define dfield_get_type(field) (&(field)->type) +# define dfield_get_data(field) ((field)->data) +#endif /* UNIV_DEBUG */ +/************************************************************************* +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /* in: SQL data field */ + dtype_t* type); /* in: pointer to data type struct */ +/************************************************************************* +Gets length of field data. */ +UNIV_INLINE +ulint +dfield_get_len( +/*===========*/ + /* out: length of data; UNIV_SQL_NULL if + SQL null data */ + const dfield_t* field); /* in: field */ +/************************************************************************* +Sets length in a field. */ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /* in: field */ + ulint len); /* in: length or UNIV_SQL_NULL */ +/************************************************************************* +Determines if a field is SQL NULL */ +UNIV_INLINE +ulint +dfield_is_null( +/*===========*/ + /* out: nonzero if SQL null data */ + const dfield_t* field); /* in: field */ +/************************************************************************* +Determines if a field is externally stored */ +UNIV_INLINE +ulint +dfield_is_ext( +/*==========*/ + /* out: nonzero if externally stored */ + const dfield_t* field); /* in: field */ +/************************************************************************* +Sets the "external storage" flag */ +UNIV_INLINE +void +dfield_set_ext( +/*===========*/ + dfield_t* field); /* in/out: field */ +/************************************************************************* +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /* in: field */ + const void* data, /* in: data */ + ulint len); /* in: length or UNIV_SQL_NULL */ +/************************************************************************* +Sets a data field to SQL NULL. */ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field); /* in/out: field */ +/************************************************************************** +Writes an SQL null field full of zeros. 
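[Editorial note: a small sketch of the dfield accessors declared above; field
is assumed to point at a dfield_t owned by the caller, and mach_write_to_4()
from mach0data.h is assumed for the byte encoding.]

	byte	buf[4];

	mach_write_to_4(buf, 42);	/* encode 42 in 4 bytes */
	dfield_set_data(field, buf, 4);	/* field now points into buf */

	ut_ad(!dfield_is_null(field));
	ut_ad(dfield_get_len(field) == 4);

	dfield_set_null(field);		/* data = NULL, len = UNIV_SQL_NULL */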
*/
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+	byte*	data,	/* in: pointer to a buffer of size len */
+	ulint	len);	/* in: SQL null size in bytes */
+/*************************************************************************
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+	dfield_t*	field1,	/* out: field to copy to */
+	const dfield_t*	field2);/* in: field to copy from */
+/*************************************************************************
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+	dfield_t*	field1,	/* out: field to copy to */
+	const dfield_t*	field2);/* in: field to copy from */
+/*************************************************************************
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+	dfield_t*	field,	/* in/out: data field */
+	mem_heap_t*	heap);	/* in: memory heap where allocated */
+/*************************************************************************
+Tests if data length and content are equal for two dfields. */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+				/* out: TRUE if equal */
+	const dfield_t*	field1,	/* in: field */
+	const dfield_t*	field2);/* in: field */
+/*************************************************************************
+Tests if dfield data length and content are equal to the given. */
+UNIV_INTERN
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+				/* out: TRUE if equal */
+	const dfield_t*	field,	/* in: field */
+	ulint		len,	/* in: data length or UNIV_SQL_NULL */
+	const byte*	data);	/* in: data */
+/*************************************************************************
+Gets number of fields in a data tuple. */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+				/* out: number of fields */
+	const dtuple_t*	tuple);	/* in: tuple */
+#ifdef UNIV_DEBUG
+/*************************************************************************
+Gets nth field of a tuple. */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+				/* out: nth field */
+	const dtuple_t*	tuple,	/* in: tuple */
+	ulint		n);	/* in: index of field */
+#else /* UNIV_DEBUG */
+# define dtuple_get_nth_field(tuple, n) ((tuple)->fields + (n))
+#endif /* UNIV_DEBUG */
+/*************************************************************************
+Gets info bits in a data tuple. */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+				/* out: info bits */
+	const dtuple_t*	tuple);	/* in: tuple */
+/*************************************************************************
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+	dtuple_t*	tuple,		/* in: tuple */
+	ulint		info_bits);	/* in: info bits */
+/*************************************************************************
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+				/* out: number of fields used in comparisons
+				in rem0cmp.* */
+	const dtuple_t*	tuple);	/* in: tuple */
+/*************************************************************************
+Sets number of fields used in record comparisons.
*/ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /* in: tuple */ + ulint n_fields_cmp); /* in: number of fields used in + comparisons in rem0cmp.* */ +/************************************************************** +Creates a data tuple to a memory heap. The default value for number +of fields used in record comparisons for this tuple is n_fields. */ +UNIV_INLINE +dtuple_t* +dtuple_create( +/*==========*/ + /* out, own: created tuple */ + mem_heap_t* heap, /* in: memory heap where the tuple + is created */ + ulint n_fields); /* in: number of fields */ + +/************************************************************** +Wrap data fields in a tuple. The default value for number +of fields used in record comparisons for this tuple is n_fields. */ +UNIV_INLINE +const dtuple_t* +dtuple_from_fields( +/*===============*/ + /* out: data tuple */ + dtuple_t* tuple, /* in: storage for data tuple */ + const dfield_t* fields, /* in: fields */ + ulint n_fields); /* in: number of fields */ + +/************************************************************************* +Sets number of fields used in a tuple. Normally this is set in +dtuple_create, but if you want later to set it smaller, you can use this. */ +UNIV_INTERN +void +dtuple_set_n_fields( +/*================*/ + dtuple_t* tuple, /* in: tuple */ + ulint n_fields); /* in: number of fields */ +/************************************************************************* +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + /* out, own: copy of tuple */ + const dtuple_t* tuple, /* in: tuple to copy from */ + mem_heap_t* heap); /* in: memory heap + where the tuple is created */ +/************************************************************** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + /* out: sum of data lens */ + const dtuple_t* tuple); /* in: typed data tuple */ +/************************************************************************* +Computes the number of externally stored fields in a data tuple. */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + /* out: number of fields */ + const dtuple_t* tuple); /* in: tuple */ +/**************************************************************** +Compare two data tuples, respecting the collation of character fields. */ +UNIV_INTERN +int +dtuple_coll_cmp( +/*============*/ + /* out: 1, 0 , -1 if tuple1 is greater, equal, + less, respectively, than tuple2 */ + const dtuple_t* tuple1, /* in: tuple 1 */ + const dtuple_t* tuple2);/* in: tuple 2 */ +/**************************************************************** +Folds a prefix given as the number of fields of a tuple. */ +UNIV_INLINE +ulint +dtuple_fold( +/*========*/ + /* out: the folded value */ + const dtuple_t* tuple, /* in: the tuple */ + ulint n_fields,/* in: number of complete fields to fold */ + ulint n_bytes,/* in: number of bytes to fold in an + incomplete last field */ + dulint tree_id)/* in: index tree id */ + __attribute__((pure)); +/*********************************************************************** +Sets types of fields binary in a tuple. 
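[Editorial note: a minimal tuple-building sketch with the functions declared
above; the field contents are illustrative only, and in a debug build the
field types would still have to be set before dtuple_check_typed() passes.]

	mem_heap_t*	heap = mem_heap_create(256);
	dtuple_t*	tuple = dtuple_create(heap, 2);

	dfield_set_data(dtuple_get_nth_field(tuple, 0), "key", 3);
	dfield_set_data(dtuple_get_nth_field(tuple, 1), "value", 5);

	/* ... use the tuple; dfield_dup() would copy the field data
	onto heap if the tuple must outlive the source buffers ... */

	mem_heap_free(heap);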
*/ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /* in: data tuple */ + ulint n); /* in: number of fields to set */ +/************************************************************************** +Checks if a dtuple contains an SQL null value. */ +UNIV_INLINE +ibool +dtuple_contains_null( +/*=================*/ + /* out: TRUE if some field is SQL null */ + const dtuple_t* tuple); /* in: dtuple */ +/************************************************************** +Checks that a data field is typed. Asserts an error if not. */ +UNIV_INTERN +ibool +dfield_check_typed( +/*===============*/ + /* out: TRUE if ok */ + const dfield_t* field); /* in: data field */ +/************************************************************** +Checks that a data tuple is typed. Asserts an error if not. */ +UNIV_INTERN +ibool +dtuple_check_typed( +/*===============*/ + /* out: TRUE if ok */ + const dtuple_t* tuple); /* in: tuple */ +/************************************************************** +Checks that a data tuple is typed. */ +UNIV_INTERN +ibool +dtuple_check_typed_no_assert( +/*=========================*/ + /* out: TRUE if ok */ + const dtuple_t* tuple); /* in: tuple */ +#ifdef UNIV_DEBUG +/************************************************************** +Validates the consistency of a tuple which must be complete, i.e, +all fields must have been set. */ +UNIV_INTERN +ibool +dtuple_validate( +/*============*/ + /* out: TRUE if ok */ + const dtuple_t* tuple); /* in: tuple */ +#endif /* UNIV_DEBUG */ +/***************************************************************** +Pretty prints a dfield value according to its data type. */ +UNIV_INTERN +void +dfield_print( +/*=========*/ + const dfield_t* dfield);/* in: dfield */ +/***************************************************************** +Pretty prints a dfield value according to its data type. Also the hex string +is printed if a string contains non-printable characters. */ +UNIV_INTERN +void +dfield_print_also_hex( +/*==================*/ + const dfield_t* dfield); /* in: dfield */ +/************************************************************** +The following function prints the contents of a tuple. */ +UNIV_INTERN +void +dtuple_print( +/*=========*/ + FILE* f, /* in: output stream */ + const dtuple_t* tuple); /* in: tuple */ +/****************************************************************** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. */ +UNIV_INTERN +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + /* out, own: created big record vector, + NULL if we are not able to shorten + the entry enough, i.e., if there are + too many fixed-length or short fields + in entry or the index is clustered */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in/out: index entry */ + ulint* n_ext); /* in/out: number of + externally stored columns */ +/****************************************************************** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. 
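[Editorial note: the convert / convert-back protocol implied above, sketched;
index, entry and n_ext are assumed to come from the caller's insert path.]

	big_rec_t*	big_rec = dtuple_convert_big_rec(index, entry, &n_ext);

	if (big_rec != NULL) {
		/* entry is now short enough for an index page; the long
		field data moved into big_rec for external storage */

		/* ... try to insert entry, store big_rec externally ... */

		/* to undo, put the data back; this also frees big_rec: */
		dtuple_convert_back_big_rec(index, entry, big_rec);
	}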
*/ +UNIV_INTERN +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: entry whose data was put to vector */ + big_rec_t* vector);/* in, own: big rec vector; it is + freed in this function */ +/****************************************************************** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector); /* in, own: big rec vector; it is + freed in this function */ + +/*######################################################################*/ + +/* Structure for an SQL data field */ +struct dfield_struct{ + void* data; /* pointer to data */ + unsigned ext:1; /* TRUE=externally stored, FALSE=local */ + unsigned len:32; /* data length; UNIV_SQL_NULL if SQL null */ + dtype_t type; /* type of data */ +}; + +struct dtuple_struct { + ulint info_bits; /* info bits of an index record: + the default is 0; this field is used + if an index record is built from + a data tuple */ + ulint n_fields; /* number of fields in dtuple */ + ulint n_fields_cmp; /* number of fields which should + be used in comparison services + of rem0cmp.*; the index search + is performed by comparing only these + fields, others are ignored; the + default value in dtuple creation is + the same value as n_fields */ + dfield_t* fields; /* fields */ + UT_LIST_NODE_T(dtuple_t) tuple_list; + /* data tuples can be linked into a + list using this field */ +#ifdef UNIV_DEBUG + ulint magic_n; +# define DATA_TUPLE_MAGIC_N 65478679 +#endif /* UNIV_DEBUG */ +}; + +/* A slot for a field in a big rec vector */ + +typedef struct big_rec_field_struct big_rec_field_t; +struct big_rec_field_struct { + ulint field_no; /* field number in record */ + ulint len; /* stored data len */ + const void* data; /* stored data */ +}; + +/* Storage format for overflow data in a big record, that is, a record +which needs external storage of data fields */ + +struct big_rec_struct { + mem_heap_t* heap; /* memory heap from which allocated */ + ulint n_fields; /* number of stored fields */ + big_rec_field_t* fields; /* stored fields */ +}; + +#ifndef UNIV_NONINL +#include "data0data.ic" +#endif + +#endif diff --git a/storage/xtradb/include/data0data.ic b/storage/xtradb/include/data0data.ic new file mode 100644 index 00000000000..f11dbd9fce6 --- /dev/null +++ b/storage/xtradb/include/data0data.ic @@ -0,0 +1,608 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0mem.h" +#include "ut0rnd.h" + +#ifdef UNIV_DEBUG +extern byte data_error; + +/************************************************************************* +Gets pointer to the type struct of SQL data field. */ +UNIV_INLINE +dtype_t* +dfield_get_type( +/*============*/ + /* out: pointer to the type struct */ + const dfield_t* field) /* in: SQL data field */ +{ + ut_ad(field); + + return((dtype_t*) &(field->type)); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Sets the type struct of SQL data field. */ +UNIV_INLINE +void +dfield_set_type( +/*============*/ + dfield_t* field, /* in: SQL data field */ + dtype_t* type) /* in: pointer to data type struct */ +{ + ut_ad(field && type); + + field->type = *type; +} + +#ifdef UNIV_DEBUG +/************************************************************************* +Gets pointer to the data in a field. */ +UNIV_INLINE +void* +dfield_get_data( +/*============*/ + /* out: pointer to data */ + const dfield_t* field) /* in: field */ +{ + ut_ad(field); + ut_ad((field->len == UNIV_SQL_NULL) + || (field->data != &data_error)); + + return((void*) field->data); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Gets length of field data. */ +UNIV_INLINE +ulint +dfield_get_len( +/*===========*/ + /* out: length of data; UNIV_SQL_NULL if + SQL null data */ + const dfield_t* field) /* in: field */ +{ + ut_ad(field); + ut_ad((field->len == UNIV_SQL_NULL) + || (field->data != &data_error)); + + return(field->len); +} + +/************************************************************************* +Sets length in a field. */ +UNIV_INLINE +void +dfield_set_len( +/*===========*/ + dfield_t* field, /* in: field */ + ulint len) /* in: length or UNIV_SQL_NULL */ +{ + ut_ad(field); +#ifdef UNIV_VALGRIND_DEBUG + if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(field->data, len); +#endif /* UNIV_VALGRIND_DEBUG */ + + field->ext = 0; + field->len = len; +} + +/************************************************************************* +Determines if a field is SQL NULL */ +UNIV_INLINE +ulint +dfield_is_null( +/*===========*/ + /* out: nonzero if SQL null data */ + const dfield_t* field) /* in: field */ +{ + ut_ad(field); + + return(field->len == UNIV_SQL_NULL); +} + +/************************************************************************* +Determines if a field is externally stored */ +UNIV_INLINE +ulint +dfield_is_ext( +/*==========*/ + /* out: nonzero if externally stored */ + const dfield_t* field) /* in: field */ +{ + ut_ad(field); + + return(UNIV_UNLIKELY(field->ext)); +} + +/************************************************************************* +Sets the "external storage" flag */ +UNIV_INLINE +void +dfield_set_ext( +/*===========*/ + dfield_t* field) /* in/out: field */ +{ + ut_ad(field); + + field->ext = 1; +} + +/************************************************************************* +Sets pointer to the data and length in a field. 
*/ +UNIV_INLINE +void +dfield_set_data( +/*============*/ + dfield_t* field, /* in: field */ + const void* data, /* in: data */ + ulint len) /* in: length or UNIV_SQL_NULL */ +{ + ut_ad(field); + +#ifdef UNIV_VALGRIND_DEBUG + if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(data, len); +#endif /* UNIV_VALGRIND_DEBUG */ + field->data = (void*) data; + field->ext = 0; + field->len = len; +} + +/************************************************************************* +Sets a data field to SQL NULL. */ +UNIV_INLINE +void +dfield_set_null( +/*============*/ + dfield_t* field) /* in/out: field */ +{ + dfield_set_data(field, NULL, UNIV_SQL_NULL); +} + +/************************************************************************* +Copies the data and len fields. */ +UNIV_INLINE +void +dfield_copy_data( +/*=============*/ + dfield_t* field1, /* out: field to copy to */ + const dfield_t* field2) /* in: field to copy from */ +{ + ut_ad(field1 && field2); + + field1->data = field2->data; + field1->len = field2->len; + field1->ext = field2->ext; +} + +/************************************************************************* +Copies a data field to another. */ +UNIV_INLINE +void +dfield_copy( +/*========*/ + dfield_t* field1, /* out: field to copy to */ + const dfield_t* field2) /* in: field to copy from */ +{ + *field1 = *field2; +} + +/************************************************************************* +Copies the data pointed to by a data field. */ +UNIV_INLINE +void +dfield_dup( +/*=======*/ + dfield_t* field, /* in/out: data field */ + mem_heap_t* heap) /* in: memory heap where allocated */ +{ + if (!dfield_is_null(field)) { + UNIV_MEM_ASSERT_RW(field->data, field->len); + field->data = mem_heap_dup(heap, field->data, field->len); + } +} + +/************************************************************************* +Tests if data length and content is equal for two dfields. */ +UNIV_INLINE +ibool +dfield_datas_are_binary_equal( +/*==========================*/ + /* out: TRUE if equal */ + const dfield_t* field1, /* in: field */ + const dfield_t* field2) /* in: field */ +{ + ulint len; + + len = field1->len; + + return(len == field2->len + && (len == UNIV_SQL_NULL + || !memcmp(field1->data, field2->data, len))); +} + +/************************************************************************* +Gets info bits in a data tuple. */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + /* out: info bits */ + const dtuple_t* tuple) /* in: tuple */ +{ + ut_ad(tuple); + + return(tuple->info_bits); +} + +/************************************************************************* +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /* in: tuple */ + ulint info_bits) /* in: info bits */ +{ + ut_ad(tuple); + + tuple->info_bits = info_bits; +} + +/************************************************************************* +Gets number of fields used in record comparisons. */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + /* out: number of fields used in comparisons + in rem0cmp.* */ + const dtuple_t* tuple) /* in: tuple */ +{ + ut_ad(tuple); + + return(tuple->n_fields_cmp); +} + +/************************************************************************* +Sets number of fields used in record comparisons. 
*/ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /* in: tuple */ + ulint n_fields_cmp) /* in: number of fields used in + comparisons in rem0cmp.* */ +{ + ut_ad(tuple); + ut_ad(n_fields_cmp <= tuple->n_fields); + + tuple->n_fields_cmp = n_fields_cmp; +} + +/************************************************************************* +Gets number of fields in a data tuple. */ +UNIV_INLINE +ulint +dtuple_get_n_fields( +/*================*/ + /* out: number of fields */ + const dtuple_t* tuple) /* in: tuple */ +{ + ut_ad(tuple); + + return(tuple->n_fields); +} + +#ifdef UNIV_DEBUG +/************************************************************************* +Gets nth field of a tuple. */ +UNIV_INLINE +dfield_t* +dtuple_get_nth_field( +/*=================*/ + /* out: nth field */ + const dtuple_t* tuple, /* in: tuple */ + ulint n) /* in: index of field */ +{ + ut_ad(tuple); + ut_ad(n < tuple->n_fields); + + return((dfield_t*) tuple->fields + n); +} +#endif /* UNIV_DEBUG */ + +/************************************************************** +Creates a data tuple to a memory heap. The default value for number +of fields used in record comparisons for this tuple is n_fields. */ +UNIV_INLINE +dtuple_t* +dtuple_create( +/*==========*/ + /* out, own: created tuple */ + mem_heap_t* heap, /* in: memory heap where the tuple + is created */ + ulint n_fields) /* in: number of fields */ +{ + dtuple_t* tuple; + + ut_ad(heap); + + tuple = (dtuple_t*) mem_heap_alloc(heap, sizeof(dtuple_t) + + n_fields * sizeof(dfield_t)); + tuple->info_bits = 0; + tuple->n_fields = n_fields; + tuple->n_fields_cmp = n_fields; + tuple->fields = (dfield_t*) &tuple[1]; + +#ifdef UNIV_DEBUG + tuple->magic_n = DATA_TUPLE_MAGIC_N; + + { /* In the debug version, initialize fields to an error value */ + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_t* field; + + field = dtuple_get_nth_field(tuple, i); + + dfield_set_len(field, UNIV_SQL_NULL); + field->data = &data_error; + dfield_get_type(field)->mtype = DATA_ERROR; + } + } + + UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields); +#endif + return(tuple); +} + +/************************************************************** +Wrap data fields in a tuple. The default value for number +of fields used in record comparisons for this tuple is n_fields. */ +UNIV_INLINE +const dtuple_t* +dtuple_from_fields( +/*===============*/ + /* out: data tuple */ + dtuple_t* tuple, /* in: storage for data tuple */ + const dfield_t* fields, /* in: fields */ + ulint n_fields) /* in: number of fields */ +{ + tuple->info_bits = 0; + tuple->n_fields = tuple->n_fields_cmp = n_fields; + tuple->fields = (dfield_t*) fields; + ut_d(tuple->magic_n = DATA_TUPLE_MAGIC_N); + + return(tuple); +} + +/************************************************************************* +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. 
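[Editorial note: the deep-copy idiom suggested above, spelled out; tuple and
heap are assumed to come from the caller.]

	dtuple_t*	copy = dtuple_copy(tuple, heap);
	ulint		i;

	/* duplicate each field's data onto the same heap, so the copy
	no longer aliases the original field buffers */
	for (i = 0; i < dtuple_get_n_fields(copy); i++) {
		dfield_dup(dtuple_get_nth_field(copy, i), heap);
	}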
*/ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + /* out, own: copy of tuple */ + const dtuple_t* tuple, /* in: tuple to copy from */ + mem_heap_t* heap) /* in: memory heap + where the tuple is created */ +{ + ulint n_fields = dtuple_get_n_fields(tuple); + dtuple_t* new_tuple = dtuple_create(heap, n_fields); + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_copy(dtuple_get_nth_field(new_tuple, i), + dtuple_get_nth_field(tuple, i)); + } + + return(new_tuple); +} + +/************************************************************** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. Neither +is possible space in externally stored parts of the field. */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + /* out: sum of data lengths */ + const dtuple_t* tuple) /* in: typed data tuple */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + ulint sum = 0; + + ut_ad(tuple); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = tuple->n_fields; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (len == UNIV_SQL_NULL) { + len = dtype_get_sql_null_size(dfield_get_type(field)); + } + + sum += len; + } + + return(sum); +} + +/************************************************************************* +Computes the number of externally stored fields in a data tuple. */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + /* out: number of externally stored fields */ + const dtuple_t* tuple) /* in: tuple */ +{ + ulint n_ext = 0; + ulint n_fields = tuple->n_fields; + ulint i; + + ut_ad(tuple); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + for (i = 0; i < n_fields; i++) { + n_ext += dtuple_get_nth_field(tuple, i)->ext; + } + + return(n_ext); +} + +/*********************************************************************** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /* in: data tuple */ + ulint n) /* in: number of fields to set */ +{ + dtype_t* dfield_type; + ulint i; + + for (i = 0; i < n; i++) { + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dtype_set(dfield_type, DATA_BINARY, 0, 0); + } +} + +/**************************************************************** +Folds a prefix given as the number of fields of a tuple. 
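[Editorial note: a usage sketch for the fold function above; tree_id is
assumed to be the dulint id of the index tree the tuple belongs to.]

	/* fold all complete fields of the tuple, with no incomplete
	last field (n_bytes = 0) */
	ulint	fold = dtuple_fold(tuple, dtuple_get_n_fields(tuple),
				   0, tree_id);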
*/ +UNIV_INLINE +ulint +dtuple_fold( +/*========*/ + /* out: the folded value */ + const dtuple_t* tuple, /* in: the tuple */ + ulint n_fields,/* in: number of complete fields to fold */ + ulint n_bytes,/* in: number of bytes to fold in an + incomplete last field */ + dulint tree_id)/* in: index tree id */ +{ + const dfield_t* field; + ulint i; + const byte* data; + ulint len; + ulint fold; + + ut_ad(tuple); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(tuple)); + + fold = ut_fold_dulint(tree_id); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + field = dtuple_get_nth_field(tuple, i); + + data = (const byte*) dfield_get_data(field); + len = dfield_get_len(field); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/************************************************************************** +Writes an SQL null field full of zeros. */ +UNIV_INLINE +void +data_write_sql_null( +/*================*/ + byte* data, /* in: pointer to a buffer of size len */ + ulint len) /* in: SQL null size in bytes */ +{ + memset(data, 0, len); +} + +/************************************************************************** +Checks if a dtuple contains an SQL null value. */ +UNIV_INLINE +ibool +dtuple_contains_null( +/*=================*/ + /* out: TRUE if some field is SQL null */ + const dtuple_t* tuple) /* in: dtuple */ +{ + ulint n; + ulint i; + + n = dtuple_get_n_fields(tuple); + + for (i = 0; i < n; i++) { + if (dfield_is_null(dtuple_get_nth_field(tuple, i))) { + + return(TRUE); + } + } + + return(FALSE); +} + +/****************************************************************** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /* in, own: big rec vector; it is + freed in this function */ +{ + mem_heap_free(vector->heap); +} diff --git a/storage/xtradb/include/data0type.h b/storage/xtradb/include/data0type.h new file mode 100644 index 00000000000..1f10878984b --- /dev/null +++ b/storage/xtradb/include/data0type.h @@ -0,0 +1,471 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#ifndef data0type_h +#define data0type_h + +#include "univ.i" + +extern ulint data_mysql_default_charset_coll; +#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8 +#define DATA_MYSQL_BINARY_CHARSET_COLL 63 + +/* SQL data type struct */ +typedef struct dtype_struct dtype_t; + +/*-------------------------------------------*/ +/* The 'MAIN TYPE' of a column */ +#define DATA_VARCHAR 1 /* character varying of the + latin1_swedish_ci charset-collation; note + that the MySQL format for this, DATA_BINARY, + DATA_VARMYSQL, is also affected by whether the + 'precise type' contains + DATA_MYSQL_TRUE_VARCHAR */ +#define DATA_CHAR 2 /* fixed length character of the + latin1_swedish_ci charset-collation */ +#define DATA_FIXBINARY 3 /* binary string of fixed length */ +#define DATA_BINARY 4 /* binary string */ +#define DATA_BLOB 5 /* binary large object, or a TEXT type; + if prtype & DATA_BINARY_TYPE == 0, then this is + actually a TEXT column (or a BLOB created + with < 4.0.14; since column prefix indexes + came only in 4.0.14, the missing flag in BLOBs + created before that does not cause any harm) */ +#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */ +#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */ +#define DATA_SYS 8 /* system column */ + +/* Data types >= DATA_FLOAT must be compared using the whole field, not as +binary strings */ + +#define DATA_FLOAT 9 +#define DATA_DOUBLE 10 +#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */ +#define DATA_VARMYSQL 12 /* any charset varying length char */ +#define DATA_MYSQL 13 /* any charset fixed length char */ + /* NOTE that 4.1.1 used DATA_MYSQL and + DATA_VARMYSQL for all character sets, and the + charset-collation for tables created with it + can also be latin1_swedish_ci */ +#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size() + requires the values are <= 63 */ +/*-------------------------------------------*/ +/* The 'PRECISE TYPE' of a column */ +/* +Tables created by a MySQL user have the following convention: + +- In the least significant byte in the precise type we store the MySQL type +code (not applicable for system columns). + +- In the second least significant byte we OR flags DATA_NOT_NULL, +DATA_UNSIGNED, DATA_BINARY_TYPE. + +- In the third least significant byte of the precise type of string types we +store the MySQL charset-collation code. In DATA_BLOB columns created with +< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there +are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no +problem, though. + +Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the +precise type, since the charset was always the default charset of the MySQL +installation. If the stored charset code is 0 in the system table SYS_COLUMNS +of InnoDB, that means that the default charset of this MySQL installation +should be used. 
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define DATA_ROW_ID 0 /* row id: a dulint */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
+
+#define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL 256 /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED 512 /* this is ORed to the precise type when
+ we have an unsigned integer type */
+#define DATA_BINARY_TYPE 1024 /* if the data type is a binary character
+ string, this is ORed to the precise type:
+ this only holds for tables created with
+ >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/
+#define DATA_LONG_TRUE_VARCHAR 4096 /* this is ORed to the precise data
+ type when the column is true VARCHAR where
+ MySQL uses 2 bytes to store the data len;
+ for shorter VARCHARs MySQL uses only 1 byte */
+/*-------------------------------------------*/
+
+/* This many bytes we need to store the type information affecting the
+alphabetical order for a single field and decide the storage size of an
+SQL null */
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
+
+/*************************************************************************
+Gets the MySQL type code from a dtype.
*/ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + /* out: MySQL type code; this is NOT an InnoDB + type code! */ + const dtype_t* type); /* in: type struct */ +/************************************************************************* +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. */ +UNIV_INTERN +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + /* out: length of the prefix, + in bytes */ + ulint prtype, /* in: precise type */ + ulint mbminlen, /* in: minimum length of a + multi-byte character */ + ulint mbmaxlen, /* in: maximum length of a + multi-byte character */ + ulint prefix_len, /* in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /* in: length of str (in bytes) */ + const char* str); /* in: the string whose prefix + length is being determined */ +/************************************************************************* +Checks if a data main type is a string type. Also a BLOB is considered a +string type. */ +UNIV_INTERN +ibool +dtype_is_string_type( +/*=================*/ + /* out: TRUE if string type */ + ulint mtype); /* in: InnoDB main data type code: DATA_CHAR, ... */ +/************************************************************************* +Checks if a type is a binary string type. Note that for tables created with +< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For +those DATA_BLOB columns this function currently returns FALSE. */ +UNIV_INTERN +ibool +dtype_is_binary_string_type( +/*========================*/ + /* out: TRUE if binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype);/* in: precise type */ +/************************************************************************* +Checks if a type is a non-binary string type. That is, dtype_is_string_type is +TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created +with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. +For those DATA_BLOB columns this function currently returns TRUE. */ +UNIV_INTERN +ibool +dtype_is_non_binary_string_type( +/*============================*/ + /* out: TRUE if non-binary string type */ + ulint mtype, /* in: main data type */ + ulint prtype);/* in: precise type */ +/************************************************************************* +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /* in: type struct to init */ + ulint mtype, /* in: main data type */ + ulint prtype, /* in: precise type */ + ulint len); /* in: precision of type */ +/************************************************************************* +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /* in: type struct to copy to */ + const dtype_t* type2); /* in: type struct to copy from */ +/************************************************************************* +Gets the SQL main data type. */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type); +/************************************************************************* +Gets the precise data type. */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type); +/************************************************************************* +Compute the mbminlen and mbmaxlen members of a data type structure. 
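+For example, for a latin1 column both lengths are 1, while for a (3-byte)
+utf8 column mbminlen is 1 and mbmaxlen is 3.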
*/ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type (and collation) */ + ulint* mbminlen, /* out: minimum length of a + multi-byte character */ + ulint* mbmaxlen); /* out: maximum length of a + multi-byte character */ +/************************************************************************* +Gets the MySQL charset-collation code for MySQL string types. */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype);/* in: precise data type */ +/************************************************************************* +Forms a precise type from the < 4.1.2 format precise type plus the +charset-collation code. */ +UNIV_INTERN +ulint +dtype_form_prtype( +/*==============*/ + ulint old_prtype, /* in: the MySQL type code and the flags + DATA_BINARY_TYPE etc. */ + ulint charset_coll); /* in: MySQL charset-collation code */ +/************************************************************************* +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. */ +UNIV_INLINE +ibool +dtype_is_utf8( +/*==========*/ + /* out: TRUE if a subset of UTF-8 */ + ulint prtype);/* in: precise data type */ +/************************************************************************* +Gets the type length. */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type); +/************************************************************************* +Gets the minimum length of a character, in bytes. */ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + /* out: minimum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type); /* in: type */ +/************************************************************************* +Gets the maximum length of a character, in bytes. */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + /* out: maximum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type); /* in: type */ +/************************************************************************* +Gets the padding character code for the type. */ +UNIV_INLINE +ulint +dtype_get_pad_char( +/*===============*/ + /* out: padding character code, or + ULINT_UNDEFINED if no padding specified */ + ulint mtype, /* in: main type */ + ulint prtype); /* in: precise type */ +/*************************************************************************** +Returns the size of a fixed size data type, 0 if not a fixed size type. */ +UNIV_INLINE +ulint +dtype_get_fixed_size_low( +/*=====================*/ + /* out: fixed size, or 0 */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + ulint len, /* in: length */ + ulint mbminlen, /* in: minimum length of a multibyte char */ + ulint mbmaxlen); /* in: maximum length of a multibyte char */ +/*************************************************************************** +Returns the minimum size of a data type. */ +UNIV_INLINE +ulint +dtype_get_min_size_low( +/*===================*/ + /* out: minimum size */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + ulint len, /* in: length */ + ulint mbminlen, /* in: minimum length of a multibyte char */ + ulint mbmaxlen); /* in: maximum length of a multibyte char */ +/*************************************************************************** +Returns the maximum size of a data type. 
Note: types in system tables may be +incomplete and return incorrect information. */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + /* out: maximum size */ + ulint mtype, /* in: main type */ + ulint len); /* in: length */ +/*************************************************************************** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + /* out: SQL null storage size + in ROW_FORMAT=REDUNDANT */ + const dtype_t* type); /* in: type */ +/************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /* in: type struct */ + const byte* buf); /* in: buffer for the stored order info */ +/************************************************************************** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /* in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /* in: type struct */ + ulint prefix_len);/* in: prefix length to + replace type->len, or 0 */ +/************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /* in: type struct */ + const byte* buf); /* in: buffer for stored type order info */ + +/************************************************************************* +Validates a data type structure. */ +UNIV_INTERN +ibool +dtype_validate( +/*===========*/ + /* out: TRUE if ok */ + const dtype_t* type); /* in: type struct to validate */ +/************************************************************************* +Prints a data type structure. */ +UNIV_INTERN +void +dtype_print( +/*========*/ + const dtype_t* type); /* in: type */ + +/* Structure for an SQL data type. +If you add fields to this structure, be sure to initialize them everywhere. 
+This structure is initialized in the following functions: +dtype_set() +dtype_read_for_order_and_null_size() +dtype_new_read_for_order_and_null_size() +sym_tab_add_null_lit() */ + +struct dtype_struct{ + unsigned mtype:8; /* main data type */ + unsigned prtype:24; /* precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /* length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + + unsigned mbminlen:2; /* minimum length of a + character, in bytes */ + unsigned mbmaxlen:3; /* maximum length of a + character, in bytes */ +}; + +#ifndef UNIV_NONINL +#include "data0type.ic" +#endif + +#endif diff --git a/storage/xtradb/include/data0type.ic b/storage/xtradb/include/data0type.ic new file mode 100644 index 00000000000..d4c1080bebe --- /dev/null +++ b/storage/xtradb/include/data0type.ic @@ -0,0 +1,587 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "ha_prototypes.h" + +/************************************************************************* +Gets the MySQL charset-collation code for MySQL string types. */ +UNIV_INLINE +ulint +dtype_get_charset_coll( +/*===================*/ + ulint prtype) /* in: precise data type */ +{ + return((prtype >> 16) & 0xFFUL); +} + +/************************************************************************* +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. */ +UNIV_INLINE +ibool +dtype_is_utf8( +/*==========*/ + /* out: TRUE if a subset of UTF-8 */ + ulint prtype) /* in: precise data type */ +{ + /* These codes have been copied from strings/ctype-extra.c + and strings/ctype-utf8.c. */ + switch (dtype_get_charset_coll(prtype)) { + case 11: /* ascii_general_ci */ + case 65: /* ascii_bin */ + case 33: /* utf8_general_ci */ + case 83: /* utf8_bin */ + case 254: /* utf8_general_cs */ + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Gets the MySQL type code from a dtype. 
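+As the one-line body below shows, this is simply the least significant
+byte of the precise type.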
*/ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + /* out: MySQL type code; this is NOT an InnoDB + type code! */ + const dtype_t* type) /* in: type struct */ +{ + return(type->prtype & 0xFFUL); +} + +/************************************************************************* +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_get_mblen( +/*============*/ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type (and collation) */ + ulint* mbminlen, /* out: minimum length of a + multi-byte character */ + ulint* mbmaxlen) /* out: maximum length of a + multi-byte character */ +{ + if (dtype_is_string_type(mtype)) { +#ifndef UNIV_HOTBACKUP + innobase_get_cset_width(dtype_get_charset_coll(prtype), + mbminlen, mbmaxlen); + ut_ad(*mbminlen <= *mbmaxlen); + ut_ad(*mbminlen <= 2); /* mbminlen in dtype_t is 0..3 */ + ut_ad(*mbmaxlen < 1 << 3); /* mbmaxlen in dtype_t is 0..7 */ +#else /* !UNIV_HOTBACKUP */ + ut_a(mtype <= DATA_BINARY); + *mbminlen = *mbmaxlen = 1; +#endif /* !UNIV_HOTBACKUP */ + } else { + *mbminlen = *mbmaxlen = 0; + } +} + +/************************************************************************* +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_set_mblen( +/*============*/ + dtype_t* type) /* in/out: type */ +{ + ulint mbminlen; + ulint mbmaxlen; + + dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen); + type->mbminlen = mbminlen; + type->mbmaxlen = mbmaxlen; + + ut_ad(dtype_validate(type)); +} + +/************************************************************************* +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /* in: type struct to init */ + ulint mtype, /* in: main data type */ + ulint prtype, /* in: precise type */ + ulint len) /* in: precision of type */ +{ + ut_ad(type); + ut_ad(mtype <= DATA_MTYPE_MAX); + + type->mtype = mtype; + type->prtype = prtype; + type->len = len; + + dtype_set_mblen(type); +} + +/************************************************************************* +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /* in: type struct to copy to */ + const dtype_t* type2) /* in: type struct to copy from */ +{ + *type1 = *type2; + + ut_ad(dtype_validate(type1)); +} + +/************************************************************************* +Gets the SQL main data type. */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type) +{ + ut_ad(type); + + return(type->mtype); +} + +/************************************************************************* +Gets the precise data type. */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type) +{ + ut_ad(type); + + return(type->prtype); +} + +/************************************************************************* +Gets the type length. */ +UNIV_INLINE +ulint +dtype_get_len( +/*==========*/ + const dtype_t* type) +{ + ut_ad(type); + + return(type->len); +} + +/************************************************************************* +Gets the minimum length of a character, in bytes. 
*/ +UNIV_INLINE +ulint +dtype_get_mbminlen( +/*===============*/ + /* out: minimum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type) /* in: type */ +{ + ut_ad(type); + return(type->mbminlen); +} +/************************************************************************* +Gets the maximum length of a character, in bytes. */ +UNIV_INLINE +ulint +dtype_get_mbmaxlen( +/*===============*/ + /* out: maximum length of a char, in bytes, + or 0 if this is not a character type */ + const dtype_t* type) /* in: type */ +{ + ut_ad(type); + return(type->mbmaxlen); +} + +/************************************************************************* +Gets the padding character code for a type. */ +UNIV_INLINE +ulint +dtype_get_pad_char( +/*===============*/ + /* out: padding character code, or + ULINT_UNDEFINED if no padding specified */ + ulint mtype, /* in: main type */ + ulint prtype) /* in: precise type */ +{ + switch (mtype) { + case DATA_FIXBINARY: + case DATA_BINARY: + if (UNIV_UNLIKELY(dtype_get_charset_coll(prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL)) { + /* Starting from 5.0.18, do not pad + VARBINARY or BINARY columns. */ + return(ULINT_UNDEFINED); + } + /* Fall through */ + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + /* Space is the padding character for all char and binary + strings, and starting from 5.0.3, also for TEXT strings. */ + + return(0x20); + case DATA_BLOB: + if (!(prtype & DATA_BINARY_TYPE)) { + return(0x20); + } + /* Fall through */ + default: + /* No padding specified */ + return(ULINT_UNDEFINED); + } +} + +/************************************************************************** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /* in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /* in: type struct */ + ulint prefix_len)/* in: prefix length to + replace type->len, or 0 */ +{ +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + ulint len; + + buf[0] = (byte)(type->mtype & 0xFFUL); + + if (type->prtype & DATA_BINARY_TYPE) { + buf[0] = buf[0] | 128; + } + + /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) { + buf[0] = buf[0] | 64; + } + */ + + buf[1] = (byte)(type->prtype & 0xFFUL); + + len = prefix_len ? prefix_len : type->len; + + mach_write_to_2(buf + 2, len & 0xFFFFUL); + + ut_ad(dtype_get_charset_coll(type->prtype) < 256); + mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype)); + + if (type->prtype & DATA_NOT_NULL) { + buf[4] |= 128; + } +} + +/************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the < 4.1.x +storage format. 
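+In this 4-byte format, byte 0 carries the main type in its low 6 bits and
+the DATA_BINARY_TYPE flag in its top bit, byte 1 carries the old-style
+precise type, and bytes 2-3 carry the length, as the body below shows.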
*/ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /* in: type struct */ + const byte* buf) /* in: buffer for stored type order info */ +{ +#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE +# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE" +#endif + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype = type->prtype | DATA_BINARY_TYPE; + } + + type->len = mach_read_from_2(buf + 2); + + type->prtype = dtype_form_prtype(type->prtype, + data_mysql_default_charset_coll); + dtype_set_mblen(type); +} + +/************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the >= 4.1.x +storage format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /* in: type struct */ + const byte* buf) /* in: buffer for stored type order info */ +{ + ulint charset_coll; + +#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE +#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE" +#endif + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype |= DATA_BINARY_TYPE; + } + + if (buf[4] & 128) { + type->prtype |= DATA_NOT_NULL; + } + + type->len = mach_read_from_2(buf + 2); + + charset_coll = mach_read_from_2(buf + 4) & 0x7fff; + + if (dtype_is_string_type(type->mtype)) { + ut_a(charset_coll < 256); + + if (charset_coll == 0) { + /* This insert buffer record was inserted with MySQL + version < 4.1.2, and the charset-collation code was not + explicitly stored to dtype->prtype at that time. It + must be the default charset-collation of this MySQL + installation. */ + + charset_coll = data_mysql_default_charset_coll; + } + + type->prtype = dtype_form_prtype(type->prtype, charset_coll); + } + dtype_set_mblen(type); +} + +/*************************************************************************** +Returns the size of a fixed size data type, 0 if not a fixed size type. */ +UNIV_INLINE +ulint +dtype_get_fixed_size_low( +/*=====================*/ + /* out: fixed size, or 0 */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + ulint len, /* in: length */ + ulint mbminlen, /* in: minimum length of a multibyte char */ + ulint mbmaxlen) /* in: maximum length of a multibyte char */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return(len); + case DATA_MYSQL: + if (prtype & DATA_BINARY_TYPE) { + return(len); + } else { +#ifdef UNIV_HOTBACKUP + if (mbminlen == mbmaxlen) { + return(len); + } +#else /* UNIV_HOTBACKUP */ + /* We play it safe here and ask MySQL for + mbminlen and mbmaxlen. Although + mbminlen and mbmaxlen are + initialized if and only if prtype + is (in one of the 3 functions in this file), + it could be that none of these functions + has been called. 
*/ + + ulint i_mbminlen, i_mbmaxlen; + + innobase_get_cset_width( + dtype_get_charset_coll(prtype), + &i_mbminlen, &i_mbmaxlen); + + if (UNIV_UNLIKELY(mbminlen != i_mbminlen) + || UNIV_UNLIKELY(mbmaxlen != i_mbmaxlen)) { + + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: " + "mbminlen=%lu, " + "mbmaxlen=%lu, " + "type->mbminlen=%lu, " + "type->mbmaxlen=%lu\n", + (ulong) i_mbminlen, + (ulong) i_mbmaxlen, + (ulong) mbminlen, + (ulong) mbmaxlen); + } + if (mbminlen == mbmaxlen) { + return(len); + } +#endif /* !UNIV_HOTBACKUP */ + } + /* fall through for variable-length charsets */ + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +/*************************************************************************** +Returns the minimum size of a data type. */ +UNIV_INLINE +ulint +dtype_get_min_size_low( +/*===================*/ + /* out: minimum size */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + ulint len, /* in: length */ + ulint mbminlen, /* in: minimum length of a multibyte char */ + ulint mbmaxlen) /* in: maximum length of a multibyte char */ +{ + switch (mtype) { + case DATA_SYS: +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case DATA_ROW_ID: + ut_ad(len == DATA_ROW_ID_LEN); + break; + case DATA_TRX_ID: + ut_ad(len == DATA_TRX_ID_LEN); + break; + case DATA_ROLL_PTR: + ut_ad(len == DATA_ROLL_PTR_LEN); + break; + default: + ut_ad(0); + return(0); + } +#endif /* UNIV_DEBUG */ + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + return(len); + case DATA_MYSQL: + if ((prtype & DATA_BINARY_TYPE) || mbminlen == mbmaxlen) { + return(len); + } + /* this is a variable-length character set */ + ut_a(mbminlen > 0); + ut_a(mbmaxlen > mbminlen); + ut_a(len % mbmaxlen == 0); + return(len * mbminlen / mbmaxlen); + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + case DATA_BLOB: + return(0); + default: + ut_error; + } + + return(0); +} + +/*************************************************************************** +Returns the maximum size of a data type. Note: types in system tables may be +incomplete and return incorrect information. */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + /* out: maximum size */ + ulint mtype, /* in: main type */ + ulint len) /* in: length */ +{ + switch (mtype) { + case DATA_SYS: + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_MYSQL: + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + return(len); + case DATA_BLOB: + break; + default: + ut_error; + } + + return(ULINT_MAX); +} + +/*************************************************************************** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. 
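+For example, a DATA_INT column of length 4 reports 4 here, and
+data_write_sql_null() then writes such a NULL as 4 zero bytes.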
*/ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + /* out: SQL null storage size + in ROW_FORMAT=REDUNDANT */ + const dtype_t* type) /* in: type */ +{ + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + type->mbminlen, type->mbmaxlen)); +} diff --git a/storage/xtradb/include/data0types.h b/storage/xtradb/include/data0types.h new file mode 100644 index 00000000000..9e536478d68 --- /dev/null +++ b/storage/xtradb/include/data0types.h @@ -0,0 +1,35 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +Some type definitions + +Created 9/21/2000 Heikki Tuuri +*************************************************************************/ + +#ifndef data0types_h +#define data0types_h + +/* SQL data field struct */ +typedef struct dfield_struct dfield_t; + +/* SQL data tuple struct */ +typedef struct dtuple_struct dtuple_t; + +#endif + diff --git a/storage/xtradb/include/db0err.h b/storage/xtradb/include/db0err.h new file mode 100644 index 00000000000..d6d2a9785a5 --- /dev/null +++ b/storage/xtradb/include/db0err.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+
+enum db_err {
+ DB_SUCCESS = 10,
+
+ /* The following are error codes */
+ DB_ERROR,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_QUE_THR_SUSPENDED,
+ DB_MISSING_HISTORY, /* required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_MUST_GET_MORE_FILE_SPACE, /* the database has to be stopped
+ and restarted with more file space */
+ DB_TABLE_IS_BEING_USED,
+ DB_TOO_BIG_RECORD, /* a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /* lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /* referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /* cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /* adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /* data structure corruption noticed */
+ DB_COL_APPEARS_TWICE_IN_INDEX, /* InnoDB cannot handle an index
+ where same column appears twice */
+ DB_CANNOT_DROP_CONSTRAINT, /* dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /* no savepoint exists with the given
+ name */
+ DB_TABLESPACE_ALREADY_EXISTS, /* we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /* tablespace does not exist or is
+ being dropped right now */
+ DB_LOCK_TABLE_FULL, /* lock structs have exhausted the
+ buffer pool (for big transactions,
+ InnoDB stores the lock structs in the
+ buffer pool) */
+ DB_FOREIGN_DUPLICATE_KEY, /* foreign key constraints
+ activated by the operation would
+ lead to a duplicate key in some
+ table */
+ DB_TOO_MANY_CONCURRENT_TRXS, /* when InnoDB runs out of the
+ preconfigured undo slots, this can
+ only happen when there are too many
+ concurrent transactions */
+ DB_UNSUPPORTED, /* when InnoDB sees any artefact or
+ a feature that it can't recognize or
+ work with, e.g. FT indexes created by
+ a later version of the engine. */
+
+ DB_PRIMARY_KEY_IS_NULL, /* a column in the PRIMARY KEY
+ was found to be NULL */
+
+ /* The following are partial failure codes */
+ DB_FAIL = 1000,
+ DB_OVERFLOW,
+ DB_UNDERFLOW,
+ DB_STRONG_FAIL,
+ DB_ZIP_OVERFLOW,
+ DB_RECORD_NOT_FOUND = 1500,
+ DB_END_OF_INDEX
+};
+
+#endif
diff --git a/storage/xtradb/include/dict0boot.h b/storage/xtradb/include/dict0boot.h
new file mode 100644
index 00000000000..e1556bdb16e
--- /dev/null
+++ b/storage/xtradb/include/dict0boot.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0boot_h +#define dict0boot_h + +#include "univ.i" + +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "buf0buf.h" +#include "fsp0fsp.h" +#include "dict0dict.h" + +typedef byte dict_hdr_t; + +/************************************************************************** +Gets a pointer to the dictionary header and x-latches its page. */ +UNIV_INTERN +dict_hdr_t* +dict_hdr_get( +/*=========*/ + /* out: pointer to the dictionary header, + page x-latched */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +Returns a new row, table, index, or tree id. */ +UNIV_INTERN +dulint +dict_hdr_get_new_id( +/*================*/ + /* out: the new id */ + ulint type); /* in: DICT_HDR_ROW_ID, ... */ +/************************************************************************** +Returns a new row id. */ +UNIV_INLINE +dulint +dict_sys_get_new_row_id(void); +/*=========================*/ + /* out: the new id */ +/************************************************************************** +Reads a row id from a record or other 6-byte stored form. */ +UNIV_INLINE +dulint +dict_sys_read_row_id( +/*=================*/ + /* out: row id */ + byte* field); /* in: record field */ +/************************************************************************** +Writes a row id to a record or other 6-byte stored form. */ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /* in: record field */ + dulint row_id);/* in: row id */ +/********************************************************************* +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. */ +UNIV_INTERN +void +dict_boot(void); +/*===========*/ +/********************************************************************* +Creates and initializes the data dictionary at the database creation. */ +UNIV_INTERN +void +dict_create(void); +/*=============*/ + + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ids for the basic system tables and their indexes */ +#define DICT_TABLES_ID ut_dulint_create(0, 1) +#define DICT_COLUMNS_ID ut_dulint_create(0, 2) +#define DICT_INDEXES_ID ut_dulint_create(0, 3) +#define DICT_FIELDS_ID ut_dulint_create(0, 4) +/* The following is a secondary index on SYS_TABLES */ +#define DICT_TABLE_IDS_ID ut_dulint_create(0, 5) + +#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. 
start
+ from this number, except for basic
+ system tables and their above defined
+ indexes; ibuf tables and indexes are
+ assigned the id
+ DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFFUL, 0)
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MIX_ID 24 /* Obsolete, always 0. */
+#define DICT_HDR_TABLES 32 /* Root of the table index tree */
+#define DICT_HDR_TABLE_IDS 36 /* Root of the secondary index
+ on table ids */
+#define DICT_HDR_COLUMNS 40 /* Root of the column index tree */
+#define DICT_HDR_INDEXES 44 /* Root of the index index tree */
+#define DICT_HDR_FIELDS 48 /* Root of the index field
+ index tree */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
+
+/* The field number of the page number field in the sys_indexes table
+clustered index */
+#define DICT_SYS_INDEXES_PAGE_NO_FIELD 8
+#define DICT_SYS_INDEXES_SPACE_NO_FIELD 7
+#define DICT_SYS_INDEXES_TYPE_FIELD 6
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
+
+#ifndef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0boot.ic b/storage/xtradb/include/dict0boot.ic
new file mode 100644
index 00000000000..9b45f9e84be
--- /dev/null
+++ b/storage/xtradb/include/dict0boot.ic
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**************************************************************************
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+
+
+/**************************************************************************
+Returns a new row id.
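+As the implementation below shows, whenever the counter is zero modulo
+DICT_HDR_ROW_ID_WRITE_MARGIN (256) it is also flushed to the dictionary
+header page, so the on-disk value never lags the in-memory counter by more
+than that margin.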
*/ +UNIV_INLINE +dulint +dict_sys_get_new_row_id(void) +/*=========================*/ + /* out: the new id */ +{ + dulint id; + + mutex_enter(&(dict_sys->mutex)); + + id = dict_sys->row_id; + + if (0 == (ut_dulint_get_low(id) % DICT_HDR_ROW_ID_WRITE_MARGIN)) { + + dict_hdr_flush_row_id(); + } + + UT_DULINT_INC(dict_sys->row_id); + + mutex_exit(&(dict_sys->mutex)); + + return(id); +} + +/************************************************************************** +Reads a row id from a record or other 6-byte stored form. */ +UNIV_INLINE +dulint +dict_sys_read_row_id( +/*=================*/ + /* out: row id */ + byte* field) /* in: record field */ +{ +#if DATA_ROW_ID_LEN != 6 +# error "DATA_ROW_ID_LEN != 6" +#endif + + return(mach_read_from_6(field)); +} + +/************************************************************************** +Writes a row id to a record or other 6-byte stored form. */ +UNIV_INLINE +void +dict_sys_write_row_id( +/*==================*/ + byte* field, /* in: record field */ + dulint row_id) /* in: row id */ +{ +#if DATA_ROW_ID_LEN != 6 +# error "DATA_ROW_ID_LEN != 6" +#endif + + mach_write_to_6(field, row_id); +} + + diff --git a/storage/xtradb/include/dict0crea.h b/storage/xtradb/include/dict0crea.h new file mode 100644 index 00000000000..9ac3e408f1f --- /dev/null +++ b/storage/xtradb/include/dict0crea.h @@ -0,0 +1,199 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0crea_h +#define dict0crea_h + +#include "univ.i" +#include "dict0types.h" +#include "dict0dict.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/************************************************************************* +Creates a table create graph. */ +UNIV_INTERN +tab_node_t* +tab_create_graph_create( +/*====================*/ + /* out, own: table create node */ + dict_table_t* table, /* in: table to create, built as a memory data + structure */ + mem_heap_t* heap); /* in: heap where created */ +/************************************************************************* +Creates an index create graph. */ +UNIV_INTERN +ind_node_t* +ind_create_graph_create( +/*====================*/ + /* out, own: index create node */ + dict_index_t* index, /* in: index to create, built as a memory data + structure */ + mem_heap_t* heap); /* in: heap where created */ +/*************************************************************** +Creates a table. This is a high-level function used in SQL execution graphs. 
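+Execution is driven by the node states (TABLE_BUILD_TABLE_DEF etc.)
+defined further below in this header.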
*/
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/***************************************************************
+Creates an index. This is a high-level function used in SQL execution
+graphs. */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ /* out: query thread to run next or NULL */
+ que_thr_t* thr); /* in: query thread */
+/***********************************************************************
+Truncates the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+ /* out: new root page number, or
+ FIL_NULL on failure */
+ dict_table_t* table, /* in: the table the index belongs to */
+ ulint space, /* in: 0=truncate,
+ nonzero=create the index tree in the
+ given tablespace */
+ btr_pcur_t* pcur, /* in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
+ mtr_t* mtr); /* in: mtr having the latch
+ on the record page. The mtr may be
+ committed and restarted in this call. */
+/***********************************************************************
+Drops the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+ rec_t* rec, /* in/out: record in the clustered index
+ of SYS_INDEXES table */
+ mtr_t* mtr); /* in: mtr having the latch on the record page */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************
+Creates the foreign key constraints system tables inside InnoDB
+at database creation or database start if they are not found or are
+not of the right form. */
+UNIV_INTERN
+ulint
+dict_create_or_check_foreign_constraint_tables(void);
+/*================================================*/
+ /* out: DB_SUCCESS or error code */
+/************************************************************************
+Adds foreign key definitions to data dictionary tables in the database. We
+look at table->foreign_list, and also generate names for constraints that were
+not named by the user. A generated constraint has a name of the format
+databasename/tablename_ibfk_<number>, where the numbers start from 1, and are
+given locally for this table, that is, the number is not global, as it used
+to be in the old format constraints (< 4.0.18). */
+UNIV_INTERN
+ulint
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ /* out: error code or DB_SUCCESS */
+ ulint start_id,/* in: if we are actually doing ALTER TABLE
+ ADD CONSTRAINT, we want to generate constraint
+ numbers which are bigger than in the table so
+ far; we number the constraints from
+ start_id + 1 up; start_id should be set to 0 if
+ we are creating a new table, or if the table
+ so far has no constraints for which the name
+ was generated here */
+ dict_table_t* table, /* in: table */
+ trx_t* trx); /* in: transaction */
+#endif /* !UNIV_HOTBACKUP */
+
+/* Table create node structure */
+
+struct tab_node_struct{
+ que_common_t common; /* node type: QUE_NODE_TABLE_CREATE */
+ dict_table_t* table; /* table to create, built as a memory data
+ structure with dict_mem_...
functions */ + ins_node_t* tab_def; /* child node which does the insert of + the table definition; the row to be inserted + is built by the parent node */ + ins_node_t* col_def; /* child node which does the inserts of + the column definitions; the row to be inserted + is built by the parent node */ + commit_node_t* commit_node; + /* child node which performs a commit after + a successful table creation */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /* node execution state */ + ulint col_no; /* next column definition to insert */ + mem_heap_t* heap; /* memory heap used as auxiliary storage */ +}; + +/* Table create node states */ +#define TABLE_BUILD_TABLE_DEF 1 +#define TABLE_BUILD_COL_DEF 2 +#define TABLE_COMMIT_WORK 3 +#define TABLE_ADD_TO_CACHE 4 +#define TABLE_COMPLETED 5 + +/* Index create node struct */ + +struct ind_node_struct{ + que_common_t common; /* node type: QUE_NODE_INDEX_CREATE */ + dict_index_t* index; /* index to create, built as a memory data + structure with dict_mem_... functions */ + ins_node_t* ind_def; /* child node which does the insert of + the index definition; the row to be inserted + is built by the parent node */ + ins_node_t* field_def; /* child node which does the inserts of + the field definitions; the row to be inserted + is built by the parent node */ + commit_node_t* commit_node; + /* child node which performs a commit after + a successful index creation */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /* node execution state */ + ulint page_no;/* root page number of the index */ + dict_table_t* table; /* table which owns the index */ + dtuple_t* ind_row;/* index definition row built */ + ulint field_no;/* next field definition to insert */ + mem_heap_t* heap; /* memory heap used as auxiliary storage */ +}; + +/* Index create node states */ +#define INDEX_BUILD_INDEX_DEF 1 +#define INDEX_BUILD_FIELD_DEF 2 +#define INDEX_CREATE_INDEX_TREE 3 +#define INDEX_COMMIT_WORK 4 +#define INDEX_ADD_TO_CACHE 5 + +#ifndef UNIV_NONINL +#include "dict0crea.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0crea.ic b/storage/xtradb/include/dict0crea.ic new file mode 100644 index 00000000000..b05385fa121 --- /dev/null +++ b/storage/xtradb/include/dict0crea.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h new file mode 100644 index 00000000000..82a139a7ff9 --- /dev/null +++ b/storage/xtradb/include/dict0dict.h @@ -0,0 +1,1147 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0dict_h +#define dict0dict_h + +#include "univ.i" +#include "dict0types.h" +#include "dict0mem.h" +#include "data0type.h" +#include "data0data.h" +#include "sync0sync.h" +#include "sync0rw.h" +#include "mem0mem.h" +#include "rem0types.h" +#include "ut0mem.h" +#include "ut0lst.h" +#include "hash0hash.h" +#include "ut0rnd.h" +#include "ut0byte.h" +#include "trx0types.h" + +#ifndef UNIV_HOTBACKUP +/********************************************************************** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +dict_casedn_str( +/*============*/ + char* a); /* in/out: string to put in lower case */ +#endif /* !UNIV_HOTBACKUP */ +/************************************************************************ +Get the database name length in a table name. */ +UNIV_INTERN +ulint +dict_get_db_name_len( +/*=================*/ + /* out: database name length */ + const char* name); /* in: table name in the form + dbname '/' tablename */ +/************************************************************************ +Return the end of table name where we have removed dbname and '/'. */ + +const char* +dict_remove_db_name( +/*================*/ + /* out: table name */ + const char* name); /* in: table name in the form + dbname '/' tablename */ +/************************************************************************** +Returns a table object based on table id. */ +UNIV_INTERN +dict_table_t* +dict_table_get_on_id( +/*=================*/ + /* out: table, NULL if does not exist */ + dulint table_id, /* in: table id */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************ +Decrements the count of open MySQL handles to a table. 
*/ +UNIV_INTERN +void +dict_table_decrement_handle_count( +/*==============================*/ + dict_table_t* table, /* in/out: table */ + ibool dict_locked); /* in: TRUE=data dictionary locked */ +/************************************************************************** +Inits the data dictionary module. */ +UNIV_INTERN +void +dict_init(void); +/*===========*/ +/************************************************************************ +Gets the space id of every table of the data dictionary and makes a linear +list and a hash table of them to the data dictionary cache. This function +can be called at database startup if we did not need to do a crash recovery. +In crash recovery we must scan the space id's from the .ibd files in MySQL +database directories. */ +UNIV_INTERN +void +dict_load_space_id_list(void); +/*=========================*/ +/************************************************************************* +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /* in: column */ + dtype_t* type); /* out: data type */ +#ifdef UNIV_DEBUG +/************************************************************************* +Assert that a column and a data type match. */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + /* out: TRUE */ + const dict_col_t* col, /* in: column */ + const dtype_t* type); /* in: data type */ +#endif /* UNIV_DEBUG */ +/*************************************************************************** +Returns the minimum size of the column. */ +UNIV_INLINE +ulint +dict_col_get_min_size( +/*==================*/ + /* out: minimum size */ + const dict_col_t* col); /* in: column */ +/*************************************************************************** +Returns the maximum size of the column. */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + /* out: maximum size */ + const dict_col_t* col); /* in: column */ +/*************************************************************************** +Returns the size of a fixed size column, 0 if not a fixed size column. */ +UNIV_INLINE +ulint +dict_col_get_fixed_size( +/*====================*/ + /* out: fixed size, or 0 */ + const dict_col_t* col); /* in: column */ +/*************************************************************************** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. */ +UNIV_INLINE +ulint +dict_col_get_sql_null_size( +/*=======================*/ + /* out: SQL null storage size + in ROW_FORMAT=REDUNDANT */ + const dict_col_t* col); /* in: column */ + +/************************************************************************* +Gets the column number. */ +UNIV_INLINE +ulint +dict_col_get_no( +/*============*/ + const dict_col_t* col); +/************************************************************************* +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /* in: table column */ + const dict_index_t* clust_index); /* in: clustered index */ +/******************************************************************** +If the given column name is reserved for InnoDB system columns, return +TRUE. 
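+(These are the names of the system columns added by
+dict_table_add_system_columns(), such as "DB_ROW_ID", "DB_TRX_ID" and
+"DB_ROLL_PTR"; the exact list is assumed here and lives in the
+implementation, not in this header.)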
*/ +UNIV_INTERN +ibool +dict_col_name_is_reserved( +/*======================*/ + /* out: TRUE if name is reserved */ + const char* name); /* in: column name */ +/************************************************************************ +Acquire the autoinc lock.*/ +UNIV_INTERN +void +dict_table_autoinc_lock( +/*====================*/ + dict_table_t* table); /* in/out: table */ +/************************************************************************ +Unconditionally set the autoinc counter. */ +UNIV_INTERN +void +dict_table_autoinc_initialize( +/*==========================*/ + dict_table_t* table, /* in/out: table */ + ib_uint64_t value); /* in: next value to assign to a row */ +/************************************************************************ +Reads the next autoinc value (== autoinc counter value), 0 if not yet +initialized. */ +UNIV_INTERN +ib_uint64_t +dict_table_autoinc_read( +/*====================*/ + /* out: value for a new row, or 0 */ + const dict_table_t* table); /* in: table */ +/************************************************************************ +Updates the autoinc counter if the value supplied is greater than the +current value. */ +UNIV_INTERN +void +dict_table_autoinc_update_if_greater( +/*=================================*/ + + dict_table_t* table, /* in/out: table */ + ib_uint64_t value); /* in: value which was assigned to a row */ +/************************************************************************ +Release the autoinc lock.*/ +UNIV_INTERN +void +dict_table_autoinc_unlock( +/*======================*/ + dict_table_t* table); /* in/out: table */ +/************************************************************************** +Adds system columns to a table object. */ +UNIV_INTERN +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /* in/out: table */ + mem_heap_t* heap); /* in: temporary heap */ +/************************************************************************** +Adds a table object to the dictionary cache. */ +UNIV_INTERN +void +dict_table_add_to_cache( +/*====================*/ + dict_table_t* table, /* in: table */ + mem_heap_t* heap); /* in: temporary heap */ +/************************************************************************** +Removes a table object from the dictionary cache. */ +UNIV_INTERN +void +dict_table_remove_from_cache( +/*=========================*/ + dict_table_t* table); /* in, own: table */ +/************************************************************************** +Renames a table object. */ +UNIV_INTERN +ibool +dict_table_rename_in_cache( +/*=======================*/ + /* out: TRUE if success */ + dict_table_t* table, /* in/out: table */ + const char* new_name, /* in: new name */ + ibool rename_also_foreigns);/* in: in ALTER TABLE we want + to preserve the original table name + in constraints which reference it */ +/************************************************************************** +Removes an index from the dictionary cache. */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /* in/out: table */ + dict_index_t* index); /* in, own: index */ +/************************************************************************** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. 
*/
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+	dict_table_t*	table,	/* in/out: table object already in cache */
+	dulint		new_id);/* in: new id to set */
+/**************************************************************************
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in
+the cache. At least one of foreign table or referenced table must
+already be in the dictionary cache! */
+UNIV_INTERN
+ulint
+dict_foreign_add_to_cache(
+/*======================*/
+					/* out: DB_SUCCESS or error code */
+	dict_foreign_t*	foreign,	/* in, own: foreign key constraint */
+	ibool		check_charsets);/* in: TRUE=check charset
+					compatibility */
+/*************************************************************************
+Checks if the index is referenced by a foreign key; if it is, returns
+the matching instance, otherwise NULL. */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+				/* out: pointer to foreign key struct if index
+				is defined for foreign key, otherwise NULL */
+	dict_table_t*	table,	/* in: InnoDB table */
+	dict_index_t*	index);	/* in: InnoDB index */
+/*************************************************************************
+Checks if a table is referenced by foreign keys. */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+					/* out: TRUE if table is referenced
+					by a foreign key */
+	const dict_table_t*	table);	/* in: InnoDB table */
+/**************************************************************************
+Replace the index in the foreign key list that matches this index's
+definition with an equivalent index. */
+UNIV_INTERN
+void
+dict_table_replace_index_in_foreign_list(
+/*=====================================*/
+	dict_table_t*	table,	/* in/out: table */
+	dict_index_t*	index);	/* in: index to be replaced */
+/*************************************************************************
+Checks if an index is defined for a foreign key constraint. An index is
+part of a foreign key constraint if it is referenced by a foreign key
+or if it is itself a foreign key index. */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+				/* out: pointer to foreign key struct if index
+				is defined for foreign key, otherwise NULL */
+	dict_table_t*	table,	/* in: InnoDB table */
+	dict_index_t*	index);	/* in: InnoDB index */
+/*************************************************************************
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
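+For example (an illustrative fragment): a clause such as
+
+	FOREIGN KEY (a, b) REFERENCES table2(c, d)
+
+requires that the table being created has an index whose first fields
+are (a, b), and that table2 has an index whose first fields are (c, d),
+before this function is called.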
*/
+UNIV_INTERN
+ulint
+dict_create_foreign_constraints(
+/*============================*/
+					/* out: error code or DB_SUCCESS */
+	trx_t*		trx,		/* in: transaction */
+	const char*	sql_string,	/* in: table create statement where
+					foreign keys are declared like:
+					FOREIGN KEY (a, b) REFERENCES
+					table2(c, d), table2 can be written
+					also with the database
+					name before it: test.table2; the
+					default database is the database of
+					parameter name */
+	const char*	name,		/* in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks);	/* in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+/**************************************************************************
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. */
+UNIV_INTERN
+ulint
+dict_foreign_parse_drop_constraints(
+/*================================*/
+						/* out: DB_SUCCESS or
+						DB_CANNOT_DROP_CONSTRAINT if
+						syntax error or the constraint
+						id does not match */
+	mem_heap_t*	heap,			/* in: heap from which we can
+						allocate memory */
+	trx_t*		trx,			/* in: transaction */
+	dict_table_t*	table,			/* in: table */
+	ulint*		n,			/* out: number of constraints
+						to drop */
+	const char***	constraints_to_drop);	/* out: id's of the
+						constraints to drop */
+/**************************************************************************
+Returns a table object and optionally increments its MySQL open handle
+count. NOTE! This is a high-level function to be used mainly from outside
+the 'dict' directory. Inside this directory dict_table_get_low is usually
+the appropriate function. */
+UNIV_INTERN
+dict_table_t*
+dict_table_get(
+/*===========*/
+					/* out: table, NULL if
+					does not exist */
+	const char*	table_name,	/* in: table name */
+	ibool		inc_mysql_count);
+					/* in: whether to increment the open
+					handle count on the table */
+/**************************************************************************
+Returns an index object, based on table and index id, and memoryfixes it. */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_on_id_low(
+/*=====================*/
+					/* out: index, NULL if does not
+					exist */
+	dict_table_t*	table,		/* in: table */
+	dulint		index_id);	/* in: index id */
+/**************************************************************************
+Checks if a table is in the dictionary cache. */
+
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+					/* out: table, NULL if not found */
+	const char*	table_name);	/* in: table name */
+/**************************************************************************
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function. */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+					/* out: table, NULL if not found */
+	const char*	table_name);	/* in: table name */
+/**************************************************************************
+Returns a table object based on table id. */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+				/* out: table, NULL if does not exist */
+	dulint	table_id);	/* in: table id */
+/**************************************************************************
+Find an index that is equivalent to the one passed in and is not marked
+for deletion.
*/
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_equiv_index(
+/*==========================*/
+				/* out: index equivalent to
+				foreign->foreign_index, or NULL */
+	dict_foreign_t*	foreign);/* in: foreign key */
+/**************************************************************************
+Returns an index object by matching on the name and column names; if
+more than one index is found, returns the index with the higher id. */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_by_max_id(
+/*===========================*/
+				/* out: matching index, NULL if not found */
+	dict_table_t*	table,	/* in: table */
+	const char*	name,	/* in: the index name to find */
+	const char**	columns,/* in: array of column names */
+	ulint		n_cols);/* in: number of columns */
+/**************************************************************************
+Returns a column's name. */
+
+const char*
+dict_table_get_col_name(
+/*====================*/
+				/* out: column name. NOTE: not
+				guaranteed to stay valid if table is
+				modified in any way (columns added,
+				etc.). */
+	const dict_table_t*	table,	/* in: table */
+	ulint			col_nr);/* in: column number */
+
+/**************************************************************************
+Prints a table definition. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+	dict_table_t*	table);	/* in: table */
+/**************************************************************************
+Prints table data. */
+UNIV_INTERN
+void
+dict_table_print_low(
+/*=================*/
+	dict_table_t*	table);	/* in: table */
+/**************************************************************************
+Prints table data when we know the table name. */
+UNIV_INTERN
+void
+dict_table_print_by_name(
+/*=====================*/
+	const char*	name);
+/**************************************************************************
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+	ibool		create_table_format, /* in: if TRUE then print in
+				a format suitable to be inserted into
+				a CREATE TABLE, otherwise in the format
+				of SHOW TABLE STATUS */
+	FILE*		file,	/* in: file where to print */
+	trx_t*		trx,	/* in: transaction */
+	dict_table_t*	table);	/* in: table */
+/**************************************************************************
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+	FILE*		file,		/* in: file where to print */
+	trx_t*		trx,		/* in: transaction */
+	dict_foreign_t*	foreign,	/* in: foreign key constraint */
+	ibool		add_newline);	/* in: whether to add a newline */
+/************************************************************************
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+	FILE*			file,	/* in: output stream */
+	trx_t*			trx,	/* in: transaction */
+	const dict_index_t*	index);	/* in: index to print */
+#ifdef UNIV_DEBUG
+/************************************************************************
+Gets the first index on the table (the clustered index). */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+				/* out: index, NULL if none exists */
+	const dict_table_t*	table);	/* in: table */
+/************************************************************************
+Gets the next index on the table.
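+The usual way to walk all indexes of a table is (sketch):
+
+	dict_index_t*	index;
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+		... inspect the index ...
+	}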
*/ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + /* out: index, NULL if none left */ + const dict_index_t* index); /* in: index */ +#else /* UNIV_DEBUG */ +# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes) +# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index) +#endif /* UNIV_DEBUG */ +/************************************************************************ +Check whether the index is the clustered index. */ +UNIV_INLINE +ulint +dict_index_is_clust( +/*================*/ + /* out: nonzero for clustered index, + zero for other indexes */ + const dict_index_t* index) /* in: index */ + __attribute__((pure)); +/************************************************************************ +Check whether the index is unique. */ +UNIV_INLINE +ulint +dict_index_is_unique( +/*=================*/ + /* out: nonzero for unique index, + zero for other indexes */ + const dict_index_t* index) /* in: index */ + __attribute__((pure)); +/************************************************************************ +Check whether the index is the insert buffer tree. */ +UNIV_INLINE +ulint +dict_index_is_ibuf( +/*===============*/ + /* out: nonzero for insert buffer, + zero for other indexes */ + const dict_index_t* index) /* in: index */ + __attribute__((pure)); + +/************************************************************************ +Gets the number of user-defined columns in a table in the dictionary +cache. */ +UNIV_INLINE +ulint +dict_table_get_n_user_cols( +/*=======================*/ + /* out: number of user-defined + (e.g., not ROW_ID) + columns of a table */ + const dict_table_t* table); /* in: table */ +/************************************************************************ +Gets the number of system columns in a table in the dictionary cache. */ +UNIV_INLINE +ulint +dict_table_get_n_sys_cols( +/*======================*/ + /* out: number of system (e.g., + ROW_ID) columns of a table */ + const dict_table_t* table); /* in: table */ +/************************************************************************ +Gets the number of all columns (also system) in a table in the dictionary +cache. */ +UNIV_INLINE +ulint +dict_table_get_n_cols( +/*==================*/ + /* out: number of columns of a table */ + const dict_table_t* table); /* in: table */ +#ifdef UNIV_DEBUG +/************************************************************************ +Gets the nth column of a table. */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + /* out: pointer to column object */ + const dict_table_t* table, /* in: table */ + ulint pos); /* in: position of column */ +/************************************************************************ +Gets the given system column of a table. */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + /* out: pointer to column object */ + const dict_table_t* table, /* in: table */ + ulint sys); /* in: DATA_ROW_ID, ... */ +#else /* UNIV_DEBUG */ +#define dict_table_get_nth_col(table, pos) \ +((table)->cols + (pos)) +#define dict_table_get_sys_col(table, sys) \ +((table)->cols + (table)->n_cols + (sys) - DATA_N_SYS_COLS) +#endif /* UNIV_DEBUG */ +/************************************************************************ +Gets the given system column number of a table. */ +UNIV_INLINE +ulint +dict_table_get_sys_col_no( +/*======================*/ + /* out: column number */ + const dict_table_t* table, /* in: table */ + ulint sys); /* in: DATA_ROW_ID, ... 
*/ +/************************************************************************ +Returns the minimum data size of an index record. */ +UNIV_INLINE +ulint +dict_index_get_min_size( +/*====================*/ + /* out: minimum data size in bytes */ + const dict_index_t* index); /* in: index */ +/************************************************************************ +Check whether the table uses the compact page format. */ +UNIV_INLINE +ibool +dict_table_is_comp( +/*===============*/ + /* out: TRUE if table uses the + compact page format */ + const dict_table_t* table); /* in: table */ +/************************************************************************ +Determine the file format of a table. */ +UNIV_INLINE +ulint +dict_table_get_format( +/*==================*/ + /* out: file format version */ + const dict_table_t* table); /* in: table */ +/************************************************************************ +Set the file format of a table. */ +UNIV_INLINE +void +dict_table_set_format( +/*==================*/ + dict_table_t* table, /* in/out: table */ + ulint format);/* in: file format version */ +/************************************************************************ +Extract the compressed page size from table flags. */ +UNIV_INLINE +ulint +dict_table_flags_to_zip_size( +/*=========================*/ + /* out: compressed page size, + or 0 if not compressed */ + ulint flags) /* in: flags */ + __attribute__((const)); +/************************************************************************ +Check whether the table uses the compressed compact page format. */ +UNIV_INLINE +ulint +dict_table_zip_size( +/*================*/ + /* out: compressed page size, + or 0 if not compressed */ + const dict_table_t* table); /* in: table */ +/************************************************************************ +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. */ +UNIV_INTERN +ibool +dict_table_col_in_clustered_key( +/*============================*/ + /* out: TRUE if the column, or its + prefix, is in the clustered key */ + const dict_table_t* table, /* in: table */ + ulint n); /* in: column number */ +/*********************************************************************** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +UNIV_INTERN +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /* in/out: data tuple */ + const dict_table_t* table); /* in: table */ +/************************************************************************** +Looks for an index with the given id. NOTE that we do not reserve +the dictionary mutex: this function is for emergency purposes like +printing info of a corrupt database page! */ +UNIV_INTERN +dict_index_t* +dict_index_find_on_id_low( +/*======================*/ + /* out: index or NULL if not found from cache */ + dulint id); /* in: index id */ +/************************************************************************** +Adds an index to the dictionary cache. */ +UNIV_INTERN +ulint +dict_index_add_to_cache( +/*====================*/ + /* out: DB_SUCCESS or error code */ + dict_table_t* table, /* in: table on which the index is */ + dict_index_t* index, /* in, own: index; NOTE! The index memory + object is freed in this function! 
*/ + ulint page_no,/* in: root page number of the index */ + ibool strict);/* in: TRUE=refuse to create the index + if records could be too big to fit in + an B-tree page */ +/************************************************************************** +Removes an index from the dictionary cache. */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /* in/out: table */ + dict_index_t* index); /* in, own: index */ +/************************************************************************ +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. */ +UNIV_INLINE +ulint +dict_index_get_n_fields( +/*====================*/ + /* out: number of fields */ + const dict_index_t* index); /* in: an internal + representation of index (in + the dictionary cache) */ +/************************************************************************ +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. */ +UNIV_INLINE +ulint +dict_index_get_n_unique( +/*====================*/ + /* out: number of fields */ + const dict_index_t* index); /* in: an internal representation + of index (in the dictionary cache) */ +/************************************************************************ +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. */ +UNIV_INLINE +ulint +dict_index_get_n_unique_in_tree( +/*============================*/ + /* out: number of fields */ + const dict_index_t* index); /* in: an internal representation + of index (in the dictionary cache) */ +/************************************************************************ +Gets the number of user-defined ordering fields in the index. In the internal +representation we add the row id to the ordering fields to make all indexes +unique, but this function returns the number of fields the user defined +in the index as ordering fields. */ +UNIV_INLINE +ulint +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + /* out: number of fields */ + const dict_index_t* index); /* in: an internal representation + of index (in the dictionary cache) */ +#ifdef UNIV_DEBUG +/************************************************************************ +Gets the nth field of an index. */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + /* out: pointer to field object */ + const dict_index_t* index, /* in: index */ + ulint pos); /* in: position of field */ +#else /* UNIV_DEBUG */ +# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos)) +#endif /* UNIV_DEBUG */ +/************************************************************************ +Gets pointer to the nth column in an index. */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + /* out: column */ + const dict_index_t* index, /* in: index */ + ulint pos); /* in: position of the field */ +/************************************************************************ +Gets the column number of the nth field in an index. 
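+For example (a sketch): to find the name of the table column behind
+the first field of an index,
+
+	ulint		col_no = dict_index_get_nth_col_no(index, 0);
+	const char*	col_name
+		= dict_table_get_col_name(index->table, col_no);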
*/ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + /* out: column number */ + const dict_index_t* index, /* in: index */ + ulint pos); /* in: position of the field */ +/************************************************************************ +Looks for column n in an index. */ +UNIV_INTERN +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + /* out: position in internal + representation of the index; + if not contained, returns + ULINT_UNDEFINED */ + const dict_index_t* index, /* in: index */ + ulint n); /* in: column number */ +/************************************************************************ +Returns TRUE if the index contains a column or a prefix of that column. */ +UNIV_INTERN +ibool +dict_index_contains_col_or_prefix( +/*==============================*/ + /* out: TRUE if contains the column + or its prefix */ + const dict_index_t* index, /* in: index */ + ulint n); /* in: column number */ +/************************************************************************ +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. */ +UNIV_INTERN +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + /* out: position in internal + representation of the index; + if not contained, returns + ULINT_UNDEFINED */ + const dict_index_t* index, /* in: index from which to search */ + const dict_index_t* index2, /* in: index */ + ulint n); /* in: field number in index2 */ +/************************************************************************ +Looks for column n position in the clustered index. */ +UNIV_INTERN +ulint +dict_table_get_nth_col_pos( +/*=======================*/ + /* out: position in internal + representation of + the clustered index */ + const dict_table_t* table, /* in: table */ + ulint n); /* in: column number */ +/************************************************************************ +Returns the position of a system column in an index. */ +UNIV_INLINE +ulint +dict_index_get_sys_col_pos( +/*=======================*/ + /* out: position, + ULINT_UNDEFINED if not contained */ + const dict_index_t* index, /* in: index */ + ulint type); /* in: DATA_ROW_ID, ... */ +/*********************************************************************** +Adds a column to index. */ +UNIV_INTERN +void +dict_index_add_col( +/*===============*/ + dict_index_t* index, /* in/out: index */ + const dict_table_t* table, /* in: table */ + dict_col_t* col, /* in: column */ + ulint prefix_len); /* in: column prefix length */ +/*********************************************************************** +Copies types of fields contained in index to tuple. */ +UNIV_INTERN +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /* in/out: data tuple */ + const dict_index_t* index, /* in: index */ + ulint n_fields); /* in: number of + field types to copy */ +/************************************************************************* +Gets the field column. */ +UNIV_INLINE +const dict_col_t* +dict_field_get_col( +/*===============*/ + const dict_field_t* field); + +/************************************************************************** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys->mutex is already being held. 
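+A caller that does not yet hold the mutex would, for illustration,
+bracket the call itself:
+
+	mutex_enter(&(dict_sys->mutex));
+	index = dict_index_get_if_in_cache_low(index_id);
+	mutex_exit(&(dict_sys->mutex));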
*/ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + /* out: index, NULL if not found */ + dulint index_id); /* in: index id */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +/************************************************************************** +Returns an index object if it is found in the dictionary cache. */ +UNIV_INTERN +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + /* out: index, NULL if not found */ + dulint index_id); /* in: index id */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#ifdef UNIV_DEBUG +/************************************************************************** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. */ +UNIV_INTERN +ibool +dict_index_check_search_tuple( +/*==========================*/ + /* out: TRUE if ok */ + const dict_index_t* index, /* in: index tree */ + const dtuple_t* tuple); /* in: tuple used in a search */ +/************************************************************************** +Check for duplicate index entries in a table [using the index name] */ +UNIV_INTERN +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table); /* in: Check for dup indexes + in this table */ + +#endif /* UNIV_DEBUG */ +/************************************************************************** +Builds a node pointer out of a physical record and a page number. */ +UNIV_INTERN +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + /* out, own: node pointer */ + const dict_index_t* index, /* in: index */ + const rec_t* rec, /* in: record for which to build node + pointer */ + ulint page_no,/* in: page number to put in node + pointer */ + mem_heap_t* heap, /* in: memory heap where pointer + created */ + ulint level); /* in: level of rec in tree: + 0 means leaf level */ +/************************************************************************** +Copies an initial segment of a physical record, long enough to specify an +index entry uniquely. */ +UNIV_INTERN +rec_t* +dict_index_copy_rec_order_prefix( +/*=============================*/ + /* out: pointer to the prefix record */ + const dict_index_t* index, /* in: index */ + const rec_t* rec, /* in: record for which to + copy prefix */ + ulint* n_fields,/* out: number of fields copied */ + byte** buf, /* in/out: memory buffer for the + copied prefix, or NULL */ + ulint* buf_size);/* in/out: buffer size */ +/************************************************************************** +Builds a typed data tuple out of a physical record. */ +UNIV_INTERN +dtuple_t* +dict_index_build_data_tuple( +/*========================*/ + /* out, own: data tuple */ + dict_index_t* index, /* in: index */ + rec_t* rec, /* in: record for which to build data tuple */ + ulint n_fields,/* in: number of data fields */ + mem_heap_t* heap); /* in: memory heap where tuple created */ +/************************************************************************* +Gets the space id of the root of the index tree. */ +UNIV_INLINE +ulint +dict_index_get_space( +/*=================*/ + /* out: space id */ + const dict_index_t* index); /* in: index */ +/************************************************************************* +Sets the space id of the root of the index tree. 
*/ +UNIV_INLINE +void +dict_index_set_space( +/*=================*/ + dict_index_t* index, /* in/out: index */ + ulint space); /* in: space id */ +/************************************************************************* +Gets the page number of the root of the index tree. */ +UNIV_INLINE +ulint +dict_index_get_page( +/*================*/ + /* out: page number */ + const dict_index_t* tree); /* in: index */ +/************************************************************************* +Sets the page number of the root of index tree. */ +UNIV_INLINE +void +dict_index_set_page( +/*================*/ + dict_index_t* index, /* in/out: index */ + ulint page); /* in: page number */ +/************************************************************************* +Gets the read-write lock of the index tree. */ +UNIV_INLINE +rw_lock_t* +dict_index_get_lock( +/*================*/ + /* out: read-write lock */ + dict_index_t* index); /* in: index */ +/************************************************************************ +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void); +/*==============================*/ + /* out: number of free bytes on page, + reserved for updates */ +/************************************************************************* +Calculates the minimum record length in an index. */ +UNIV_INTERN +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index); /* in: index */ +/************************************************************************* +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. */ +UNIV_INTERN +void +dict_update_statistics_low( +/*=======================*/ + dict_table_t* table, /* in/out: table */ + ibool has_dict_mutex);/* in: TRUE if the caller has the + dictionary mutex */ +/************************************************************************* +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. */ +UNIV_INTERN +void +dict_update_statistics( +/*===================*/ + dict_table_t* table); /* in/out: table */ +/************************************************************************ +Reserves the dictionary system mutex for MySQL. */ +UNIV_INTERN +void +dict_mutex_enter_for_mysql(void); +/*============================*/ +/************************************************************************ +Releases the dictionary system mutex for MySQL. */ +UNIV_INTERN +void +dict_mutex_exit_for_mysql(void); +/*===========================*/ +/************************************************************************ +Checks if the database name in two table names is the same. 
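+For example (illustrative): "test/t1" and "test/t2" share the database
+name "test", while "test/t1" and "mysql/user" do not:
+
+	ut_a(dict_tables_have_same_db("test/t1", "test/t2"));
+	ut_a(!dict_tables_have_same_db("test/t1", "mysql/user"));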
*/ +UNIV_INTERN +ibool +dict_tables_have_same_db( +/*=====================*/ + /* out: TRUE if same db name */ + const char* name1, /* in: table name in the form + dbname '/' tablename */ + const char* name2); /* in: table name in the form + dbname '/' tablename */ +/************************************************************************* +Removes an index from the cache */ +UNIV_INTERN +void +dict_index_remove_from_cache( +/*=========================*/ + dict_table_t* table, /* in/out: table */ + dict_index_t* index); /* in, own: index */ +/************************************************************************** +Get index by name */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name( +/*=========================*/ + /* out: index, NULL if does not exist */ + dict_table_t* table, /* in: table */ + const char* name); /* in: name of the index to find */ +/************************************************************************** +In case there is more than one index with the same name return the index +with the min(id). */ +UNIV_INTERN +dict_index_t* +dict_table_get_index_on_name_and_min_id( +/*====================================*/ + /* out: index, NULL if does not exist */ + dict_table_t* table, /* in: table */ + const char* name); /* in: name of the index to find */ +/* Buffers for storing detailed information about the latest foreign key +and unique key errors */ +extern FILE* dict_foreign_err_file; +extern mutex_t dict_foreign_err_mutex; /* mutex protecting the buffers */ + +extern dict_sys_t* dict_sys; /* the dictionary system */ +extern rw_lock_t dict_operation_lock; + +/* Dictionary system struct */ +struct dict_sys_struct{ + mutex_t mutex; /* mutex protecting the data + dictionary; protects also the + disk-based dictionary system tables; + this mutex serializes CREATE TABLE + and DROP TABLE, as well as reading + the dictionary data for a table from + system tables */ + dulint row_id; /* the next row id to assign; + NOTE that at a checkpoint this + must be written to the dict system + header and flushed to a file; in + recovery this must be derived from + the log records */ + hash_table_t* table_hash; /* hash table of the tables, based + on name */ + hash_table_t* table_id_hash; /* hash table of the tables, based + on id */ + UT_LIST_BASE_NODE_T(dict_table_t) + table_LRU; /* LRU list of tables */ + ulint size; /* varying space in bytes occupied + by the data dictionary table and + index objects */ + dict_table_t* sys_tables; /* SYS_TABLES table */ + dict_table_t* sys_columns; /* SYS_COLUMNS table */ + dict_table_t* sys_indexes; /* SYS_INDEXES table */ + dict_table_t* sys_fields; /* SYS_FIELDS table */ +}; + +#ifndef UNIV_NONINL +#include "dict0dict.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic new file mode 100644 index 00000000000..628d207b329 --- /dev/null +++ b/storage/xtradb/include/dict0dict.ic @@ -0,0 +1,785 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Data dictionary system + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + +#include "dict0load.h" +#include "rem0types.h" +#include "data0type.h" + +/************************************************************************* +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /* in: column */ + dtype_t* type) /* out: data type */ +{ + ut_ad(col && type); + + type->mtype = col->mtype; + type->prtype = col->prtype; + type->len = col->len; + type->mbminlen = col->mbminlen; + type->mbmaxlen = col->mbmaxlen; +} + +#ifdef UNIV_DEBUG +/************************************************************************* +Assert that a column and a data type match. */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + /* out: TRUE */ + const dict_col_t* col, /* in: column */ + const dtype_t* type) /* in: data type */ +{ + ut_ad(col); + ut_ad(type); + + ut_ad(col->mtype == type->mtype); + ut_ad(col->prtype == type->prtype); + ut_ad(col->len == type->len); + ut_ad(col->mbminlen == type->mbminlen); + ut_ad(col->mbmaxlen == type->mbmaxlen); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/*************************************************************************** +Returns the minimum size of the column. */ +UNIV_INLINE +ulint +dict_col_get_min_size( +/*==================*/ + /* out: minimum size */ + const dict_col_t* col) /* in: column */ +{ + return(dtype_get_min_size_low(col->mtype, col->prtype, col->len, + col->mbminlen, col->mbmaxlen)); +} +/*************************************************************************** +Returns the maximum size of the column. */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + /* out: maximum size */ + const dict_col_t* col) /* in: column */ +{ + return(dtype_get_max_size_low(col->mtype, col->len)); +} +/*************************************************************************** +Returns the size of a fixed size column, 0 if not a fixed size column. */ +UNIV_INLINE +ulint +dict_col_get_fixed_size( +/*====================*/ + /* out: fixed size, or 0 */ + const dict_col_t* col) /* in: column */ +{ + return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len, + col->mbminlen, col->mbmaxlen)); +} +/*************************************************************************** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. */ +UNIV_INLINE +ulint +dict_col_get_sql_null_size( +/*=======================*/ + /* out: SQL null storage size + in ROW_FORMAT=REDUNDANT */ + const dict_col_t* col) /* in: column */ +{ + return(dict_col_get_fixed_size(col)); +} + +/************************************************************************* +Gets the column number. */ +UNIV_INLINE +ulint +dict_col_get_no( +/*============*/ + const dict_col_t* col) +{ + ut_ad(col); + + return(col->ind); +} + +/************************************************************************* +Gets the column position in the clustered index. 
*/ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /* in: table column */ + const dict_index_t* clust_index) /* in: clustered index */ +{ + ulint i; + + ut_ad(col); + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + + for (i = 0; i < clust_index->n_def; i++) { + const dict_field_t* field = &clust_index->fields[i]; + + if (!field->prefix_len && field->col == col) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +#ifdef UNIV_DEBUG +/************************************************************************ +Gets the first index on the table (the clustered index). */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + /* out: index, NULL if none exists */ + const dict_table_t* table) /* in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes)); +} + +/************************************************************************ +Gets the next index on the table. */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + /* out: index, NULL if none left */ + const dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index)); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************ +Check whether the index is the clustered index. */ +UNIV_INLINE +ulint +dict_index_is_clust( +/*================*/ + /* out: nonzero for clustered index, + zero for other indexes */ + const dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(UNIV_UNLIKELY(index->type & DICT_CLUSTERED)); +} +/************************************************************************ +Check whether the index is unique. */ +UNIV_INLINE +ulint +dict_index_is_unique( +/*=================*/ + /* out: nonzero for unique index, + zero for other indexes */ + const dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(UNIV_UNLIKELY(index->type & DICT_UNIQUE)); +} + +/************************************************************************ +Check whether the index is the insert buffer tree. */ +UNIV_INLINE +ulint +dict_index_is_ibuf( +/*===============*/ + /* out: nonzero for insert buffer, + zero for other indexes */ + const dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(UNIV_UNLIKELY(index->type & DICT_IBUF)); +} + +/************************************************************************ +Gets the number of user-defined columns in a table in the dictionary +cache. */ +UNIV_INLINE +ulint +dict_table_get_n_user_cols( +/*=======================*/ + /* out: number of user-defined + (e.g., not ROW_ID) + columns of a table */ + const dict_table_t* table) /* in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + return(table->n_cols - DATA_N_SYS_COLS); +} + +/************************************************************************ +Gets the number of system columns in a table in the dictionary cache. 
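+Together with dict_table_get_n_user_cols() and dict_table_get_n_cols(),
+the counts satisfy the invariant (sketch):
+
+	dict_table_get_n_cols(table)
+		== dict_table_get_n_user_cols(table)
+		   + dict_table_get_n_sys_cols(table)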
*/
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+					/* out: number of system (e.g.,
+					ROW_ID) columns of a table */
+	const dict_table_t*	table __attribute__((unused)))	/* in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(table->cached);
+
+	return(DATA_N_SYS_COLS);
+}
+
+/************************************************************************
+Gets the number of all columns (also system) in a table in the dictionary
+cache. */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+				/* out: number of columns of a table */
+	const dict_table_t*	table)	/* in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return(table->n_cols);
+}
+
+#ifdef UNIV_DEBUG
+/************************************************************************
+Gets the nth column of a table. */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+				/* out: pointer to column object */
+	const dict_table_t*	table,	/* in: table */
+	ulint			pos)	/* in: position of column */
+{
+	ut_ad(table);
+	ut_ad(pos < table->n_def);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return((dict_col_t*) (table->cols) + pos);
+}
+
+/************************************************************************
+Gets the given system column of a table. */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+				/* out: pointer to column object */
+	const dict_table_t*	table,	/* in: table */
+	ulint			sys)	/* in: DATA_ROW_ID, ... */
+{
+	dict_col_t*	col;
+
+	ut_ad(table);
+	ut_ad(sys < DATA_N_SYS_COLS);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	col = dict_table_get_nth_col(table, table->n_cols
+				     - DATA_N_SYS_COLS + sys);
+	ut_ad(col->mtype == DATA_SYS);
+	ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+	return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************************
+Gets the given system column number of a table. */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+				/* out: column number */
+	const dict_table_t*	table,	/* in: table */
+	ulint			sys)	/* in: DATA_ROW_ID, ... */
+{
+	ut_ad(table);
+	ut_ad(sys < DATA_N_SYS_COLS);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return(table->n_cols - DATA_N_SYS_COLS + sys);
+}
+
+/************************************************************************
+Check whether the table uses the compact page format. */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+				/* out: TRUE if table uses the
+				compact page format */
+	const dict_table_t*	table)	/* in: table */
+{
+	ut_ad(table);
+
+#if DICT_TF_COMPACT != TRUE
+#error
+#endif
+
+	return(UNIV_LIKELY(table->flags & DICT_TF_COMPACT));
+}
+
+/************************************************************************
+Determine the file format of a table. */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+				/* out: file format version */
+	const dict_table_t*	table)	/* in: table */
+{
+	ut_ad(table);
+
+	return((table->flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT);
+}
+
+/************************************************************************
+Set the file format of a table.
*/ +UNIV_INLINE +void +dict_table_set_format( +/*==================*/ + dict_table_t* table, /* in/out: table */ + ulint format) /* in: file format version */ +{ + ut_ad(table); + + table->flags = (table->flags & ~DICT_TF_FORMAT_MASK) + | (format << DICT_TF_FORMAT_SHIFT); +} + +/************************************************************************ +Extract the compressed page size from table flags. */ +UNIV_INLINE +ulint +dict_table_flags_to_zip_size( +/*=========================*/ + /* out: compressed page size, + or 0 if not compressed */ + ulint flags) /* in: flags */ +{ + ulint zip_size = flags & DICT_TF_ZSSIZE_MASK; + + if (UNIV_UNLIKELY(zip_size)) { + zip_size = ((PAGE_ZIP_MIN_SIZE >> 1) + << (zip_size >> DICT_TF_ZSSIZE_SHIFT)); + + ut_ad(zip_size <= UNIV_PAGE_SIZE); + } + + return(zip_size); +} + +/************************************************************************ +Check whether the table uses the compressed compact page format. */ +UNIV_INLINE +ulint +dict_table_zip_size( +/*================*/ + /* out: compressed page size, + or 0 if not compressed */ + const dict_table_t* table) /* in: table */ +{ + ut_ad(table); + + return(dict_table_flags_to_zip_size(table->flags)); +} + +/************************************************************************ +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. */ +UNIV_INLINE +ulint +dict_index_get_n_fields( +/*====================*/ + /* out: number of fields */ + const dict_index_t* index) /* in: an internal + representation of index (in + the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->n_fields); +} + +/************************************************************************ +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. */ +UNIV_INLINE +ulint +dict_index_get_n_unique( +/*====================*/ + /* out: number of fields */ + const dict_index_t* index) /* in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + return(index->n_uniq); +} + +/************************************************************************ +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. */ +UNIV_INLINE +ulint +dict_index_get_n_unique_in_tree( +/*============================*/ + /* out: number of fields */ + const dict_index_t* index) /* in: an internal representation + of index (in the dictionary cache) */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(index->cached); + + if (dict_index_is_clust(index)) { + + return(dict_index_get_n_unique(index)); + } + + return(dict_index_get_n_fields(index)); +} + +/************************************************************************ +Gets the number of user-defined ordering fields in the index. In the internal +representation of clustered indexes we add the row id to the ordering fields +to make a clustered index unique, but this function returns the number of +fields the user defined in the index as ordering fields. 
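+For example (illustrative): for a table without a user-defined PRIMARY
+KEY, InnoDB generates a clustered index on the internal row id; for that
+index this function returns 0, although the index does have an ordering
+field internally.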
*/ +UNIV_INLINE +ulint +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + /* out: number of fields */ + const dict_index_t* index) /* in: an internal representation + of index (in the dictionary cache) */ +{ + return(index->n_user_defined_cols); +} + +#ifdef UNIV_DEBUG +/************************************************************************ +Gets the nth field of an index. */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + /* out: pointer to field object */ + const dict_index_t* index, /* in: index */ + ulint pos) /* in: position of field */ +{ + ut_ad(index); + ut_ad(pos < index->n_def); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return((dict_field_t*) (index->fields) + pos); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************ +Returns the position of a system column in an index. */ +UNIV_INLINE +ulint +dict_index_get_sys_col_pos( +/*=======================*/ + /* out: position, + ULINT_UNDEFINED if not contained */ + const dict_index_t* index, /* in: index */ + ulint type) /* in: DATA_ROW_ID, ... */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(!(index->type & DICT_UNIVERSAL)); + + if (dict_index_is_clust(index)) { + + return(dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, type), + index)); + } + + return(dict_index_get_nth_col_pos( + index, dict_table_get_sys_col_no(index->table, type))); +} + +/************************************************************************* +Gets the field column. */ +UNIV_INLINE +const dict_col_t* +dict_field_get_col( +/*===============*/ + const dict_field_t* field) +{ + ut_ad(field); + + return(field->col); +} + +/************************************************************************ +Gets pointer to the nth column in an index. */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + /* out: column */ + const dict_index_t* index, /* in: index */ + ulint pos) /* in: position of the field */ +{ + return(dict_field_get_col(dict_index_get_nth_field(index, pos))); +} + +/************************************************************************ +Gets the column number the nth field in an index. */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + /* out: column number */ + const dict_index_t* index, /* in: index */ + ulint pos) /* in: position of the field */ +{ + return(dict_col_get_no(dict_index_get_nth_col(index, pos))); +} + +/************************************************************************ +Returns the minimum data size of an index record. */ +UNIV_INLINE +ulint +dict_index_get_min_size( +/*====================*/ + /* out: minimum data size in bytes */ + const dict_index_t* index) /* in: index */ +{ + ulint n = dict_index_get_n_fields(index); + ulint size = 0; + + while (n--) { + size += dict_col_get_min_size(dict_index_get_nth_col(index, + n)); + } + + return(size); +} + +/************************************************************************* +Gets the space id of the root of the index tree. */ +UNIV_INLINE +ulint +dict_index_get_space( +/*=================*/ + /* out: space id */ + const dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->space); +} + +/************************************************************************* +Sets the space id of the root of the index tree. 
*/ +UNIV_INLINE +void +dict_index_set_space( +/*=================*/ + dict_index_t* index, /* in/out: index */ + ulint space) /* in: space id */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->space = space; +} + +/************************************************************************* +Gets the page number of the root of the index tree. */ +UNIV_INLINE +ulint +dict_index_get_page( +/*================*/ + /* out: page number */ + const dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(index->page); +} + +/************************************************************************* +Sets the page number of the root of index tree. */ +UNIV_INLINE +void +dict_index_set_page( +/*================*/ + dict_index_t* index, /* in/out: index */ + ulint page) /* in: page number */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->page = page; +} + +/************************************************************************* +Gets the read-write lock of the index tree. */ +UNIV_INLINE +rw_lock_t* +dict_index_get_lock( +/*================*/ + /* out: read-write lock */ + dict_index_t* index) /* in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + return(&(index->lock)); +} + +/************************************************************************ +Returns free space reserved for future updates of records. This is +relevant only in the case of many consecutive inserts, as updates +which make the records bigger might fragment the index. */ +UNIV_INLINE +ulint +dict_index_get_space_reserve(void) +/*==============================*/ + /* out: number of free bytes on page, + reserved for updates */ +{ + return(UNIV_PAGE_SIZE / 16); +} + +/************************************************************************** +Checks if a table is in the dictionary cache. */ +UNIV_INLINE +dict_table_t* +dict_table_check_if_in_cache_low( +/*=============================*/ + /* out: table, NULL if not found */ + const char* table_name) /* in: table name */ +{ + dict_table_t* table; + ulint table_fold; + + ut_ad(table_name); + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Look for the table name in the hash table */ + table_fold = ut_fold_string(table_name); + + HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, + dict_table_t*, table, ut_ad(table->cached), + !strcmp(table->name, table_name)); + return(table); +} + +/************************************************************************** +Gets a table; loads it to the dictionary cache if necessary. A low-level +function. */ +UNIV_INLINE +dict_table_t* +dict_table_get_low( +/*===============*/ + /* out: table, NULL if not found */ + const char* table_name) /* in: table name */ +{ + dict_table_t* table; + + ut_ad(table_name); + ut_ad(mutex_own(&(dict_sys->mutex))); + + table = dict_table_check_if_in_cache_low(table_name); + + if (table == NULL) { + table = dict_load_table(table_name); + } + + ut_ad(!table || table->cached); + + return(table); +} + +/************************************************************************** +Returns a table object based on table id. 
*/ +UNIV_INLINE +dict_table_t* +dict_table_get_on_id_low( +/*=====================*/ + /* out: table, NULL if does not exist */ + dulint table_id) /* in: table id */ +{ + dict_table_t* table; + ulint fold; + + ut_ad(mutex_own(&(dict_sys->mutex))); + + /* Look for the table name in the hash table */ + fold = ut_fold_dulint(table_id); + + HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, + dict_table_t*, table, ut_ad(table->cached), + !ut_dulint_cmp(table->id, table_id)); + if (table == NULL) { + table = dict_load_table_on_id(table_id); + } + + ut_ad(!table || table->cached); + + /* TODO: should get the type information from MySQL */ + + return(table); +} + diff --git a/storage/xtradb/include/dict0load.h b/storage/xtradb/include/dict0load.h new file mode 100644 index 00000000000..759cbcdb14a --- /dev/null +++ b/storage/xtradb/include/dict0load.h @@ -0,0 +1,119 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0load_h +#define dict0load_h + +#include "univ.i" +#include "dict0types.h" +#include "ut0byte.h" +#include "mem0mem.h" + +/************************************************************************ +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). + +In a normal startup, we create the tablespace objects for every table in +InnoDB's data dictionary, if the corresponding .ibd file exists. +We also scan the biggest space id, and store it to fil_system. */ +UNIV_INTERN +void +dict_check_tablespaces_and_store_max_id( +/*====================================*/ + ibool in_crash_recovery); /* in: are we doing a crash recovery */ +/************************************************************************ +Finds the first table name in the given database. */ +UNIV_INTERN +char* +dict_get_first_table_name_in_db( +/*============================*/ + /* out, own: table name, NULL if + does not exist; the caller must free + the memory in the string! */ + const char* name); /* in: database name which ends in '/' */ +/************************************************************************ +Loads a table definition and also all its index definitions, and also +the cluster definition if the table is a member in a cluster. Also loads +all foreign key constraints where the foreign key is in the table or where +a foreign key references columns in this table. 
*/ +UNIV_INTERN +dict_table_t* +dict_load_table( +/*============*/ + /* out: table, NULL if does not exist; + if the table is stored in an .ibd file, + but the file does not exist, + then we set the ibd_file_missing flag TRUE + in the table object we return */ + const char* name); /* in: table name in the + databasename/tablename format */ +/*************************************************************************** +Loads a table object based on the table id. */ +UNIV_INTERN +dict_table_t* +dict_load_table_on_id( +/*==================*/ + /* out: table; NULL if table does not exist */ + dulint table_id); /* in: table id */ +/************************************************************************ +This function is called when the database is booted. +Loads system table index definitions except for the clustered index which +is added to the dictionary cache at booting before calling this function. */ +UNIV_INTERN +void +dict_load_sys_table( +/*================*/ + dict_table_t* table); /* in: system table */ +#ifndef UNIV_HOTBACKUP +/*************************************************************************** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. Note that we know that the dictionary +cache already contains all constraints where the other relevant table is +already in the dictionary cache. */ +UNIV_INTERN +ulint +dict_load_foreigns( +/*===============*/ + /* out: DB_SUCCESS or error code */ + const char* table_name, /* in: table name */ + ibool check_charsets);/* in: TRUE=check charsets + compatibility */ +#endif /* !UNIV_HOTBACKUP */ +/************************************************************************ +Prints to the standard output information on all tables found in the data +dictionary system table. */ +UNIV_INTERN +void +dict_print(void); +/*============*/ + + +#ifndef UNIV_NONINL +#include "dict0load.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0load.ic b/storage/xtradb/include/dict0load.ic new file mode 100644 index 00000000000..72eac2f621a --- /dev/null +++ b/storage/xtradb/include/dict0load.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h new file mode 100644 index 00000000000..e2b3cfa3679 --- /dev/null +++ b/storage/xtradb/include/dict0mem.h @@ -0,0 +1,501 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0mem_h +#define dict0mem_h + +#include "univ.i" +#include "dict0types.h" +#include "data0type.h" +#include "mem0mem.h" +#include "rem0types.h" +#include "btr0types.h" +#include "ut0mem.h" +#include "ut0lst.h" +#include "ut0rnd.h" +#include "ut0byte.h" +#include "sync0rw.h" +#include "lock0types.h" +#include "hash0hash.h" +#include "que0types.h" +#include "trx0types.h" + +/* Type flags of an index: OR'ing of the flags is allowed to define a +combination of types */ +#define DICT_CLUSTERED 1 /* clustered index */ +#define DICT_UNIQUE 2 /* unique index */ +#define DICT_UNIVERSAL 4 /* index which can contain records from any + other index */ +#define DICT_IBUF 8 /* insert buffer tree */ + +/* Types for a table object */ +#define DICT_TABLE_ORDINARY 1 +#if 0 /* not implemented */ +#define DICT_TABLE_CLUSTER_MEMBER 2 +#define DICT_TABLE_CLUSTER 3 /* this means that the table is + really a cluster definition */ +#endif + +/* Table flags. All unused bits must be 0. */ +#define DICT_TF_COMPACT 1 /* Compact page format. + This must be set for + new file formats + (later than + DICT_TF_FORMAT_51). 
*/ + +/* compressed page size (0=uncompressed, up to 15 compressed sizes) */ +#define DICT_TF_ZSSIZE_SHIFT 1 +#define DICT_TF_ZSSIZE_MASK (15 << DICT_TF_ZSSIZE_SHIFT) +#define DICT_TF_ZSSIZE_MAX (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 1) + + +#define DICT_TF_FORMAT_SHIFT 5 /* file format */ +#define DICT_TF_FORMAT_MASK (127 << DICT_TF_FORMAT_SHIFT) +#define DICT_TF_FORMAT_51 0 /* InnoDB/MySQL up to 5.1 */ +#define DICT_TF_FORMAT_ZIP 1 /* InnoDB plugin for 5.1: + compressed tables, + new BLOB treatment */ +#define DICT_TF_FORMAT_MAX DICT_TF_FORMAT_ZIP + +#define DICT_TF_BITS 6 /* number of flag bits */ +#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX +# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX" +#endif + +/************************************************************************** +Creates a table memory object. */ +UNIV_INTERN +dict_table_t* +dict_mem_table_create( +/*==================*/ + /* out, own: table object */ + const char* name, /* in: table name */ + ulint space, /* in: space where the clustered index + of the table is placed; this parameter + is ignored if the table is made + a member of a cluster */ + ulint n_cols, /* in: number of columns */ + ulint flags); /* in: table flags */ +/******************************************************************** +Free a table memory object. */ +UNIV_INTERN +void +dict_mem_table_free( +/*================*/ + dict_table_t* table); /* in: table */ +/************************************************************************** +Adds a column definition to a table. */ +UNIV_INTERN +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /* in: table */ + mem_heap_t* heap, /* in: temporary memory heap, or NULL */ + const char* name, /* in: column name, or NULL */ + ulint mtype, /* in: main datatype */ + ulint prtype, /* in: precise type */ + ulint len); /* in: precision */ +/************************************************************************** +Creates an index memory object. */ +UNIV_INTERN +dict_index_t* +dict_mem_index_create( +/*==================*/ + /* out, own: index object */ + const char* table_name, /* in: table name */ + const char* index_name, /* in: index name */ + ulint space, /* in: space where the index tree is + placed, ignored if the index is of + the clustered type */ + ulint type, /* in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields); /* in: number of fields */ +/************************************************************************** +Adds a field definition to an index. NOTE: does not take a copy +of the column name if the field is a column. The memory occupied +by the column name may be released only after publishing the index. */ +UNIV_INTERN +void +dict_mem_index_add_field( +/*=====================*/ + dict_index_t* index, /* in: index */ + const char* name, /* in: column name */ + ulint prefix_len); /* in: 0 or the column prefix length + in a MySQL index like + INDEX (textcol(25)) */ +/************************************************************************** +Frees an index memory object. */ +UNIV_INTERN +void +dict_mem_index_free( +/*================*/ + dict_index_t* index); /* in: index */ +/************************************************************************** +Creates and initializes a foreign constraint memory object. 
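The DICT_TF_* macros above pack two small fields into a single flags word: bits 1-4 carry the compressed page-size code and the bits from DICT_TF_FORMAT_SHIFT upward carry the file format, with DICT_TF_BITS capping how many bits are actually persisted. A hedged sketch of composing and decoding such a word (the TF_* macro and function names here are illustrative, not the patch's own):

#include <stdio.h>

#define TF_COMPACT		1	/* bit 0, cf. DICT_TF_COMPACT */
#define TF_ZSSIZE_SHIFT		1	/* cf. DICT_TF_ZSSIZE_SHIFT */
#define TF_ZSSIZE_MASK		(15 << TF_ZSSIZE_SHIFT)
#define TF_FORMAT_SHIFT		5	/* cf. DICT_TF_FORMAT_SHIFT */
#define TF_FORMAT_MASK		(127 << TF_FORMAT_SHIFT)

/* Compose a flags word from a zip-size code and a format code. */
static unsigned
tf_make(unsigned zssize, unsigned format)
{
	return(TF_COMPACT
	       | (zssize << TF_ZSSIZE_SHIFT)
	       | (format << TF_FORMAT_SHIFT));
}

int
main(void)
{
	unsigned	flags = tf_make(4, 1);	/* zip code 4, ZIP format */

	printf("zssize=%u format=%u\n",
	       (flags & TF_ZSSIZE_MASK) >> TF_ZSSIZE_SHIFT,
	       (flags & TF_FORMAT_MASK) >> TF_FORMAT_SHIFT);

	return(0);
}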
*/ +UNIV_INTERN +dict_foreign_t* +dict_mem_foreign_create(void); +/*=========================*/ + /* out, own: foreign constraint struct */ + +/* Data structure for a column in a table */ +struct dict_col_struct{ + /*----------------------*/ + /* The following are copied from dtype_t, + so that all bit-fields can be packed tightly. */ + unsigned mtype:8; /* main data type */ + unsigned prtype:24; /* precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /* length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + + unsigned mbminlen:2; /* minimum length of a + character, in bytes */ + unsigned mbmaxlen:3; /* maximum length of a + character, in bytes */ + /*----------------------*/ + /* End of definitions copied from dtype_t */ + + unsigned ind:10; /* table column position + (starting from 0) */ + unsigned ord_part:1; /* nonzero if this column + appears in the ordering fields + of an index */ +}; + +/* DICT_MAX_INDEX_COL_LEN is measured in bytes and is the maximum +indexed column length (or indexed prefix length). It is set to 3*256, +so that one can create a column prefix index on 256 characters of a +TEXT or VARCHAR column also in the UTF-8 charset. In that charset, +a character may take at most 3 bytes. +This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ + +#define DICT_MAX_INDEX_COL_LEN REC_MAX_INDEX_COL_LEN + +/* Data structure for a field in an index */ +struct dict_field_struct{ + dict_col_t* col; /* pointer to the table column */ + const char* name; /* name of the column */ + unsigned prefix_len:10; /* 0 or the length of the column + prefix in bytes in a MySQL index of + type, e.g., INDEX (textcol(25)); + must be smaller than + DICT_MAX_INDEX_COL_LEN; NOTE that + in the UTF-8 charset, MySQL sets this + to 3 * the prefix len in UTF-8 chars */ + unsigned fixed_len:10; /* 0 or the fixed length of the + column if smaller than + DICT_MAX_INDEX_COL_LEN */ +}; + +/* Data structure for an index. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_index_create(). 
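dict_col_struct above leans on C bit-fields so that the type, length and flag fields share a couple of machine words rather than taking a full integer each; dict_index_struct below uses the same trick. A small, compiler-dependent illustration of the space saving (the struct names are mine, and the exact sizes are up to the ABI):

#include <stdio.h>

struct col_packed {			/* bit-field layout, as above */
	unsigned	mtype:8;
	unsigned	prtype:24;
	unsigned	len:16;
	unsigned	mbminlen:2;
	unsigned	mbmaxlen:3;
	unsigned	ind:10;
	unsigned	ord_part:1;
};

struct col_plain {			/* naive layout: one word each */
	unsigned	mtype;
	unsigned	prtype;
	unsigned	len;
	unsigned	mbminlen;
	unsigned	mbmaxlen;
	unsigned	ind;
	unsigned	ord_part;
};

int
main(void)
{
	/* Typically prints something like packed=8 plain=28. */
	printf("packed=%zu plain=%zu\n",
	       sizeof(struct col_packed), sizeof(struct col_plain));

	return(0);
}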
*/ +struct dict_index_struct{ + dulint id; /* id of the index */ + mem_heap_t* heap; /* memory heap */ + const char* name; /* index name */ + const char* table_name; /* table name */ + dict_table_t* table; /* back pointer to table */ + unsigned space:32; + /* space where the index tree is placed */ + unsigned page:32;/* index tree root page number */ + unsigned type:4; /* index type (DICT_CLUSTERED, DICT_UNIQUE, + DICT_UNIVERSAL, DICT_IBUF) */ + unsigned trx_id_offset:10;/* position of the trx id column + in a clustered index record, if the fields + before it are known to be of a fixed size, + 0 otherwise */ + unsigned n_user_defined_cols:10; + /* number of columns the user defined to + be in the index: in the internal + representation we add more columns */ + unsigned n_uniq:10;/* number of fields from the beginning + which are enough to determine an index + entry uniquely */ + unsigned n_def:10;/* number of fields defined so far */ + unsigned n_fields:10;/* number of fields in the index */ + unsigned n_nullable:10;/* number of nullable fields */ + unsigned cached:1;/* TRUE if the index object is in the + dictionary cache */ + unsigned to_be_dropped:1; + /* TRUE if this index is marked to be + dropped in ha_innobase::prepare_drop_index(), + otherwise FALSE */ + dict_field_t* fields; /* array of field descriptions */ + UT_LIST_NODE_T(dict_index_t) + indexes;/* list of indexes of the table */ + btr_search_t* search_info; /* info used in optimistic searches */ + /*----------------------*/ + ib_int64_t* stat_n_diff_key_vals; + /* approximate number of different key values + for this index, for each n-column prefix + where n <= dict_get_n_unique(index); we + periodically calculate new estimates */ + ulint stat_index_size; + /* approximate index size in database pages */ + ulint stat_n_leaf_pages; + /* approximate number of leaf pages in the + index tree */ + rw_lock_t lock; /* read-write lock protecting the upper levels + of the index tree */ +#ifdef ROW_MERGE_IS_INDEX_USABLE + dulint trx_id; /* id of the transaction that created this + index, or ut_dulint_zero if the index existed + when InnoDB was started up */ +#endif /* ROW_MERGE_IS_INDEX_USABLE */ +#ifdef UNIV_DEBUG + ulint magic_n;/* magic number */ +# define DICT_INDEX_MAGIC_N 76789786 +#endif +}; + +/* Data structure for a foreign key constraint; an example: +FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be +initialized to 0, NULL or FALSE in dict_mem_foreign_create(). 
*/ + +struct dict_foreign_struct{ + mem_heap_t* heap; /* this object is allocated from + this memory heap */ + char* id; /* id of the constraint as a + null-terminated string */ + unsigned n_fields:10; /* number of indexes' first fields + for which the foreign key + constraint is defined: we allow the + indexes to contain more fields than + mentioned in the constraint, as long + as the first fields are as mentioned */ + unsigned type:6; /* 0 or DICT_FOREIGN_ON_DELETE_CASCADE + or DICT_FOREIGN_ON_DELETE_SET_NULL */ + char* foreign_table_name;/* foreign table name */ + dict_table_t* foreign_table; /* table where the foreign key is */ + const char** foreign_col_names;/* names of the columns in the + foreign key */ + char* referenced_table_name;/* referenced table name */ + dict_table_t* referenced_table;/* table where the referenced key + is */ + const char** referenced_col_names;/* names of the referenced + columns in the referenced table */ + dict_index_t* foreign_index; /* foreign index; we require that + both tables contain explicitly defined + indexes for the constraint: InnoDB + does not generate new indexes + implicitly */ + dict_index_t* referenced_index;/* referenced index */ + UT_LIST_NODE_T(dict_foreign_t) + foreign_list; /* list node for foreign keys of the + table */ + UT_LIST_NODE_T(dict_foreign_t) + referenced_list;/* list node for referenced keys of the + table */ +}; + +/* The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that +a foreign key constraint is enforced, therefore RESTRICT just means no flag */ +#define DICT_FOREIGN_ON_DELETE_CASCADE 1 +#define DICT_FOREIGN_ON_DELETE_SET_NULL 2 +#define DICT_FOREIGN_ON_UPDATE_CASCADE 4 +#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8 +#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16 +#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32 + + +/* Data structure for a database table. Most fields will be +initialized to 0, NULL or FALSE in dict_mem_table_create(). */ +struct dict_table_struct{ + dulint id; /* id of the table */ + mem_heap_t* heap; /* memory heap */ + const char* name; /* table name */ + const char* dir_path_of_temp_table;/* NULL or the directory path + where a TEMPORARY table that was explicitly + created by a user should be placed if + innodb_file_per_table is defined in my.cnf; + in Unix this is usually /tmp/..., in Windows + \temp\... */ + unsigned space:32; + /* space where the clustered index of the + table is placed */ + unsigned flags:DICT_TF_BITS;/* DICT_TF_COMPACT, ... */ + unsigned ibd_file_missing:1; + /* TRUE if this is in a single-table + tablespace and the .ibd file is missing; then + we must return in ha_innodb.cc an error if the + user tries to query such an orphaned table */ + unsigned tablespace_discarded:1; + /* this flag is set TRUE when the user + calls DISCARD TABLESPACE on this + table, and reset to FALSE in IMPORT + TABLESPACE */ + unsigned cached:1;/* TRUE if the table object has been added + to the dictionary cache */ + unsigned n_def:10;/* number of columns defined so far */ + unsigned n_cols:10;/* number of columns */ + dict_col_t* cols; /* array of column descriptions */ + const char* col_names; + /* Column names packed in a character string + "name1\0name2\0...nameN\0". Until + the string contains n_cols, it will be + allocated from a temporary heap. The final + string will be allocated from table->heap. 
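The col_names encoding above keeps every column name in one buffer as consecutive NUL-terminated strings, so reaching the i-th name means skipping i terminators. A sketch of that traversal (the helper name is mine, not InnoDB's):

#include <stdio.h>
#include <string.h>

/* Return the i-th name in a "name1\0name2\0...nameN\0" packed string. */
static const char*
packed_name(const char* names, unsigned i)
{
	while (i--) {
		names += strlen(names) + 1;	/* skip one name + NUL */
	}

	return(names);
}

int
main(void)
{
	/* The string literal supplies the final NUL terminator itself. */
	static const char	names[] = "id\0name\0created_at";

	/* prints: id name created_at */
	printf("%s %s %s\n", packed_name(names, 0),
	       packed_name(names, 1), packed_name(names, 2));

	return(0);
}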
*/ + hash_node_t name_hash; /* hash chain node */ + hash_node_t id_hash; /* hash chain node */ + UT_LIST_BASE_NODE_T(dict_index_t) + indexes; /* list of indexes of the table */ + UT_LIST_BASE_NODE_T(dict_foreign_t) + foreign_list;/* list of foreign key constraints + in the table; these refer to columns + in other tables */ + UT_LIST_BASE_NODE_T(dict_foreign_t) + referenced_list;/* list of foreign key constraints + which refer to this table */ + UT_LIST_NODE_T(dict_table_t) + table_LRU; /* node of the LRU list of tables */ + ulint n_mysql_handles_opened; + /* count of how many handles MySQL has opened + to this table; dropping of the table is + NOT allowed until this count gets to zero; + MySQL does NOT itself check the number of + open handles at drop */ + ulint n_foreign_key_checks_running; + /* count of how many foreign key check + operations are currently being performed + on the table: we cannot drop the table while + there are foreign key checks running on + it! */ + dulint query_cache_inv_trx_id; + /* transactions whose trx id is less than this + number are not allowed to store to the MySQL + query cache or retrieve from it; when a trx + with undo logs commits, it sets this to the + value of the trx id counter for the tables it + had an IX lock on */ + UT_LIST_BASE_NODE_T(lock_t) + locks; /* list of locks on the table */ +#ifdef UNIV_DEBUG + /*----------------------*/ + ibool does_not_fit_in_memory; + /* this field is used to specify in simulations + tables which are so big that disk should be + accessed: disk access is simulated by + putting the thread to sleep for a while; + NOTE that this flag is not stored to the data + dictionary on disk, and the database will + forget about value TRUE if it has to reload + the table definition from disk */ +#endif /* UNIV_DEBUG */ + /*----------------------*/ + unsigned big_rows:1; + /* flag: TRUE if the maximum length of + a single row exceeds BIG_ROW_SIZE; + initialized in dict_table_add_to_cache() */ + unsigned stat_initialized:1; /* TRUE if statistics have + been calculated the first time + after database startup or table creation */ + ib_int64_t stat_n_rows; + /* approximate number of rows in the table; + we periodically calculate new estimates */ + ulint stat_clustered_index_size; + /* approximate clustered index size in + database pages */ + ulint stat_sum_of_other_index_sizes; + /* other indexes in database pages */ + ulint stat_modified_counter; + /* when a row is inserted, updated, or deleted, + we add 1 to this number; we calculate new + estimates for the stat_... values for the + table and the indexes at an interval of 2 GB + or when about 1 / 16 of the table has been + modified; also when the estimate operation is + called for MySQL SHOW TABLE STATUS; the + counter is reset to zero at statistics + calculation; this counter is not protected by + any latch, because this is only used for + heuristics */ + /*----------------------*/ + /* The following fields are used by the + AUTOINC code. The actual collection of + tables locked during AUTOINC read/write is + kept in trx_t. In order to quickly determine + whether a transaction has locked the AUTOINC + lock we keep a pointer to the transaction + here in the autoinc_trx variable. This is to + avoid acquiring the kernel mutex and scanning + the vector in trx_t. 
+ + When an AUTOINC lock has to wait, the + corresponding lock instance is created on + the trx lock heap rather than using the + pre-allocated instance in autoinc_lock below.*/ + lock_t* autoinc_lock; + /* a buffer for an AUTOINC lock + for this table: we allocate the memory here + so that individual transactions can get it + and release it without a need to allocate + space from the lock heap of the trx: + otherwise the lock heap would grow rapidly + if we do a large insert from a select */ + mutex_t autoinc_mutex; + /* mutex protecting the autoincrement + counter */ + ib_uint64_t autoinc;/* autoinc counter value to give to the + next inserted row */ + ulong n_waiting_or_granted_auto_inc_locks; + /* This counter is used to track the number + of granted and pending autoinc locks on this + table. This value is set after acquiring the + kernel mutex but we peek the contents to + determine whether other transactions have + acquired the AUTOINC lock or not. Of course + only one transaction can be granted the + lock but there can be multiple waiters. */ + const trx_t* autoinc_trx; + /* The transaction that currently holds + the AUTOINC lock on this table. */ + /*----------------------*/ + +#ifdef UNIV_DEBUG + ulint magic_n;/* magic number */ +# define DICT_TABLE_MAGIC_N 76333786 +#endif /* UNIV_DEBUG */ +}; + +#ifndef UNIV_NONINL +#include "dict0mem.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0mem.ic b/storage/xtradb/include/dict0mem.ic new file mode 100644 index 00000000000..6916393a9cd --- /dev/null +++ b/storage/xtradb/include/dict0mem.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Data dictionary memory object creation + +Created 1/8/1996 Heikki Tuuri +***********************************************************************/ + + diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h new file mode 100644 index 00000000000..b93e995e01b --- /dev/null +++ b/storage/xtradb/include/dict0types.h @@ -0,0 +1,45 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Data dictionary global types + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0types_h +#define dict0types_h + +#include "ut0list.h" + +typedef struct dict_sys_struct dict_sys_t; +typedef struct dict_col_struct dict_col_t; +typedef struct dict_field_struct dict_field_t; +typedef struct dict_index_struct dict_index_t; +typedef struct dict_table_struct dict_table_t; +typedef struct dict_foreign_struct dict_foreign_t; + +/* A cluster object is a table object with the type field set to +DICT_CLUSTERED */ + +typedef dict_table_t dict_cluster_t; + +typedef struct ind_node_struct ind_node_t; +typedef struct tab_node_struct tab_node_t; + +#endif diff --git a/storage/xtradb/include/dyn0dyn.h b/storage/xtradb/include/dyn0dyn.h new file mode 100644 index 00000000000..c06d6b88d2f --- /dev/null +++ b/storage/xtradb/include/dyn0dyn.h @@ -0,0 +1,182 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dyn0dyn_h +#define dyn0dyn_h + +#include "univ.i" +#include "ut0lst.h" +#include "mem0mem.h" + +typedef struct dyn_block_struct dyn_block_t; +typedef dyn_block_t dyn_array_t; + + +/* This is the initial 'payload' size of a dynamic array; +this must be > MLOG_BUF_MARGIN + 30! */ +#define DYN_ARRAY_DATA_SIZE 512 + +/************************************************************************* +Initializes a dynamic array. */ +UNIV_INLINE +dyn_array_t* +dyn_array_create( +/*=============*/ + /* out: initialized dyn array */ + dyn_array_t* arr); /* in: pointer to a memory buffer of + size sizeof(dyn_array_t) */ +/**************************************************************** +Frees a dynamic array. */ +UNIV_INLINE +void +dyn_array_free( +/*===========*/ + dyn_array_t* arr); /* in: dyn array */ +/************************************************************************* +Makes room on top of a dyn array and returns a pointer to a buffer in it. +After copying the elements, the caller must close the buffer using +dyn_array_close. 
*/ +UNIV_INLINE +byte* +dyn_array_open( +/*===========*/ + /* out: pointer to the buffer */ + dyn_array_t* arr, /* in: dynamic array */ + ulint size); /* in: size in bytes of the buffer; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +/************************************************************************* +Closes the buffer returned by dyn_array_open. */ +UNIV_INLINE +void +dyn_array_close( +/*============*/ + dyn_array_t* arr, /* in: dynamic array */ + byte* ptr); /* in: buffer space from ptr up was not used */ +/************************************************************************* +Makes room on top of a dyn array and returns a pointer to +the added element. The caller must copy the element to +the pointer returned. */ +UNIV_INLINE +void* +dyn_array_push( +/*===========*/ + /* out: pointer to the element */ + dyn_array_t* arr, /* in: dynamic array */ + ulint size); /* in: size in bytes of the element */ +/**************************************************************** +Returns pointer to an element in dyn array. */ +UNIV_INLINE +void* +dyn_array_get_element( +/*==================*/ + /* out: pointer to element */ + dyn_array_t* arr, /* in: dyn array */ + ulint pos); /* in: position of element as bytes + from array start */ +/**************************************************************** +Returns the size of stored data in a dyn array. */ +UNIV_INLINE +ulint +dyn_array_get_data_size( +/*====================*/ + /* out: data size in bytes */ + dyn_array_t* arr); /* in: dyn array */ +/**************************************************************** +Gets the first block in a dyn array. */ +UNIV_INLINE +dyn_block_t* +dyn_array_get_first_block( +/*======================*/ + dyn_array_t* arr); /* in: dyn array */ +/**************************************************************** +Gets the last block in a dyn array. */ +UNIV_INLINE +dyn_block_t* +dyn_array_get_last_block( +/*=====================*/ + dyn_array_t* arr); /* in: dyn array */ +/************************************************************************ +Gets the next block in a dyn array. */ +UNIV_INLINE +dyn_block_t* +dyn_array_get_next_block( +/*=====================*/ + /* out: pointer to next, NULL if end of list */ + dyn_array_t* arr, /* in: dyn array */ + dyn_block_t* block); /* in: dyn array block */ +/************************************************************************ +Gets the number of used bytes in a dyn array block. */ +UNIV_INLINE +ulint +dyn_block_get_used( +/*===============*/ + /* out: number of bytes used */ + dyn_block_t* block); /* in: dyn array block */ +/************************************************************************ +Gets pointer to the start of data in a dyn array block. */ +UNIV_INLINE +byte* +dyn_block_get_data( +/*===============*/ + /* out: pointer to data */ + dyn_block_t* block); /* in: dyn array block */ +/************************************************************ +Pushes n bytes to a dyn array. */ +UNIV_INLINE +void +dyn_push_string( +/*============*/ + dyn_array_t* arr, /* in: dyn array */ + const byte* str, /* in: string to write */ + ulint len); /* in: string length */ + +/*#################################################################*/ + +/* NOTE! Do not use the fields of the struct directly: the definition +appears here only for the compiler to know its size! 
*/ +struct dyn_block_struct{ + mem_heap_t* heap; /* in the first block this is != NULL + if dynamic allocation has been needed */ + ulint used; /* number of data bytes used in this block */ + byte data[DYN_ARRAY_DATA_SIZE]; + /* storage for array elements */ + UT_LIST_BASE_NODE_T(dyn_block_t) base; + /* linear list of dyn blocks: this node is + used only in the first block */ + UT_LIST_NODE_T(dyn_block_t) list; + /* linear list node: used in all blocks */ +#ifdef UNIV_DEBUG + ulint buf_end;/* only in the debug version: if dyn array is + opened, this is the buffer end offset, else + this is 0 */ + ulint magic_n; +#endif +}; + + +#ifndef UNIV_NONINL +#include "dyn0dyn.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dyn0dyn.ic b/storage/xtradb/include/dyn0dyn.ic new file mode 100644 index 00000000000..1ef8b284a99 --- /dev/null +++ b/storage/xtradb/include/dyn0dyn.ic @@ -0,0 +1,362 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The dynamically allocated array + +Created 2/5/1996 Heikki Tuuri +*******************************************************/ + +#define DYN_BLOCK_MAGIC_N 375767 +#define DYN_BLOCK_FULL_FLAG 0x1000000UL + +/**************************************************************** +Adds a new block to a dyn array. */ +UNIV_INTERN +dyn_block_t* +dyn_array_add_block( +/*================*/ + /* out: created block */ + dyn_array_t* arr); /* in: dyn array */ + + +/**************************************************************** +Gets the first block in a dyn array. */ +UNIV_INLINE +dyn_block_t* +dyn_array_get_first_block( +/*======================*/ + dyn_array_t* arr) /* in: dyn array */ +{ + return(arr); +} + +/**************************************************************** +Gets the last block in a dyn array. */ +UNIV_INLINE +dyn_block_t* +dyn_array_get_last_block( +/*=====================*/ + dyn_array_t* arr) /* in: dyn array */ +{ + if (arr->heap == NULL) { + + return(arr); + } + + return(UT_LIST_GET_LAST(arr->base)); +} + +/************************************************************************ +Gets the next block in a dyn array. */ +UNIV_INLINE +dyn_block_t* +dyn_array_get_next_block( +/*=====================*/ + /* out: pointer to next, NULL if end of list */ + dyn_array_t* arr, /* in: dyn array */ + dyn_block_t* block) /* in: dyn array block */ +{ + ut_ad(arr && block); + + if (arr->heap == NULL) { + ut_ad(arr == block); + + return(NULL); + } + + return(UT_LIST_GET_NEXT(list, block)); +} + +/************************************************************************ +Gets the number of used bytes in a dyn array block. 
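Taken together, the API above supports the usual pattern: create the array in a stack buffer, push elements, then walk the block list to read everything back. A usage sketch against these declarations (illustrative only, assuming the XtraDB include path; not code from the patch):

#include "dyn0dyn.h"

static ulint
sum_pushed_ints(void)
{
	dyn_array_t	arr;
	dyn_block_t*	block;
	ulint		i;
	ulint		sum = 0;

	dyn_array_create(&arr);		/* arr itself lives on the stack */

	for (i = 0; i < 1000; i++) {
		*(ulint*) dyn_array_push(&arr, sizeof(ulint)) = i;
	}

	/* The data may have spilled into heap-allocated blocks. */
	for (block = dyn_array_get_first_block(&arr);
	     block != NULL;
	     block = dyn_array_get_next_block(&arr, block)) {

		const byte*	data = dyn_block_get_data(block);
		ulint		used = dyn_block_get_used(block);
		ulint		ofs;

		for (ofs = 0; ofs < used; ofs += sizeof(ulint)) {
			sum += *(const ulint*) (data + ofs);
		}
	}

	dyn_array_free(&arr);		/* releases the heap blocks, if any */

	return(sum);
}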
*/ +UNIV_INLINE +ulint +dyn_block_get_used( +/*===============*/ + /* out: number of bytes used */ + dyn_block_t* block) /* in: dyn array block */ +{ + ut_ad(block); + + return((block->used) & ~DYN_BLOCK_FULL_FLAG); +} + +/************************************************************************ +Gets pointer to the start of data in a dyn array block. */ +UNIV_INLINE +byte* +dyn_block_get_data( +/*===============*/ + /* out: pointer to data */ + dyn_block_t* block) /* in: dyn array block */ +{ + ut_ad(block); + + return(block->data); +} + +/************************************************************************* +Initializes a dynamic array. */ +UNIV_INLINE +dyn_array_t* +dyn_array_create( +/*=============*/ + /* out: initialized dyn array */ + dyn_array_t* arr) /* in: pointer to a memory buffer of + size sizeof(dyn_array_t) */ +{ + ut_ad(arr); +#if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG +# error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG" +#endif + + arr->heap = NULL; + arr->used = 0; + +#ifdef UNIV_DEBUG + arr->buf_end = 0; + arr->magic_n = DYN_BLOCK_MAGIC_N; +#endif + return(arr); +} + +/**************************************************************** +Frees a dynamic array. */ +UNIV_INLINE +void +dyn_array_free( +/*===========*/ + dyn_array_t* arr) /* in: dyn array */ +{ + if (arr->heap != NULL) { + mem_heap_free(arr->heap); + } + +#ifdef UNIV_DEBUG + arr->magic_n = 0; +#endif +} + +/************************************************************************* +Makes room on top of a dyn array and returns a pointer to the added element. +The caller must copy the element to the pointer returned. */ +UNIV_INLINE +void* +dyn_array_push( +/*===========*/ + /* out: pointer to the element */ + dyn_array_t* arr, /* in: dynamic array */ + ulint size) /* in: size in bytes of the element */ +{ + dyn_block_t* block; + ulint used; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + ut_ad(size <= DYN_ARRAY_DATA_SIZE); + ut_ad(size); + + block = arr; + used = block->used; + + if (used + size > DYN_ARRAY_DATA_SIZE) { + /* Get the last array block */ + + block = dyn_array_get_last_block(arr); + used = block->used; + + if (used + size > DYN_ARRAY_DATA_SIZE) { + block = dyn_array_add_block(arr); + used = block->used; + } + } + + block->used = used + size; + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + + return((block->data) + used); +} + +/************************************************************************* +Makes room on top of a dyn array and returns a pointer to a buffer in it. +After copying the elements, the caller must close the buffer using +dyn_array_close. */ +UNIV_INLINE +byte* +dyn_array_open( +/*===========*/ + /* out: pointer to the buffer */ + dyn_array_t* arr, /* in: dynamic array */ + ulint size) /* in: size in bytes of the buffer; MUST be + smaller than DYN_ARRAY_DATA_SIZE! 
*/ +{ + dyn_block_t* block; + ulint used; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + ut_ad(size <= DYN_ARRAY_DATA_SIZE); + ut_ad(size); + + block = arr; + used = block->used; + + if (used + size > DYN_ARRAY_DATA_SIZE) { + /* Get the last array block */ + + block = dyn_array_get_last_block(arr); + used = block->used; + + if (used + size > DYN_ARRAY_DATA_SIZE) { + block = dyn_array_add_block(arr); + used = block->used; + ut_a(size <= DYN_ARRAY_DATA_SIZE); + } + } + + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); +#ifdef UNIV_DEBUG + ut_ad(arr->buf_end == 0); + + arr->buf_end = used + size; +#endif + return((block->data) + used); +} + +/************************************************************************* +Closes the buffer returned by dyn_array_open. */ +UNIV_INLINE +void +dyn_array_close( +/*============*/ + dyn_array_t* arr, /* in: dynamic array */ + byte* ptr) /* in: buffer space from ptr up was not used */ +{ + dyn_block_t* block; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + block = dyn_array_get_last_block(arr); + + ut_ad(arr->buf_end + block->data >= ptr); + + block->used = ptr - block->data; + + ut_ad(block->used <= DYN_ARRAY_DATA_SIZE); + +#ifdef UNIV_DEBUG + arr->buf_end = 0; +#endif +} + +/**************************************************************** +Returns pointer to an element in dyn array. */ +UNIV_INLINE +void* +dyn_array_get_element( +/*==================*/ + /* out: pointer to element */ + dyn_array_t* arr, /* in: dyn array */ + ulint pos) /* in: position of element as bytes + from array start */ +{ + dyn_block_t* block; + ulint used; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + /* Get the first array block */ + block = dyn_array_get_first_block(arr); + + if (arr->heap != NULL) { + used = dyn_block_get_used(block); + + while (pos >= used) { + pos -= used; + block = UT_LIST_GET_NEXT(list, block); + ut_ad(block); + + used = dyn_block_get_used(block); + } + } + + ut_ad(block); + ut_ad(dyn_block_get_used(block) >= pos); + + return(block->data + pos); +} + +/**************************************************************** +Returns the size of stored data in a dyn array. */ +UNIV_INLINE +ulint +dyn_array_get_data_size( +/*====================*/ + /* out: data size in bytes */ + dyn_array_t* arr) /* in: dyn array */ +{ + dyn_block_t* block; + ulint sum = 0; + + ut_ad(arr); + ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N); + + if (arr->heap == NULL) { + + return(arr->used); + } + + /* Get the first array block */ + block = dyn_array_get_first_block(arr); + + while (block != NULL) { + sum += dyn_block_get_used(block); + block = dyn_array_get_next_block(arr, block); + } + + return(sum); +} + +/************************************************************ +Pushes n bytes to a dyn array. 
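dyn_array_open() and dyn_array_close() above implement a reserve-then-trim protocol: the caller reserves the worst-case size, writes only as much as it really needs, and closes at the final write position. A hedged sketch of that protocol, again assuming the headers above (not part of the patch):

#include <string.h>
#include "dyn0dyn.h"

/* Append a length-prefixed string, reserving the worst case first. */
static void
append_record(dyn_array_t* arr, const byte* s, ulint len)
{
	/* Reserve 1 length byte + up to 100 data bytes; the reservation
	must stay below DYN_ARRAY_DATA_SIZE. */
	byte*	p = dyn_array_open(arr, 1 + 100);

	ut_a(len <= 100);

	*p++ = (byte) len;		/* 1-byte length prefix */
	memcpy(p, s, len);
	p += len;

	dyn_array_close(arr, p);	/* keep only the bytes written */
}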
*/ +UNIV_INLINE +void +dyn_push_string( +/*============*/ + dyn_array_t* arr, /* in: dyn array */ + const byte* str, /* in: string to write */ + ulint len) /* in: string length */ +{ + ulint n_copied; + + while (len > 0) { + if (len > DYN_ARRAY_DATA_SIZE) { + n_copied = DYN_ARRAY_DATA_SIZE; + } else { + n_copied = len; + } + + memcpy(dyn_array_push(arr, n_copied), str, n_copied); + + str += n_copied; + len -= n_copied; + } +} diff --git a/storage/xtradb/include/eval0eval.h b/storage/xtradb/include/eval0eval.h new file mode 100644 index 00000000000..75cf9b38c3a --- /dev/null +++ b/storage/xtradb/include/eval0eval.h @@ -0,0 +1,113 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef eval0eval_h +#define eval0eval_h + +#include "univ.i" +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/********************************************************************* +Free the buffer from global dynamic memory for a value of a que_node, +if it has been allocated in the above function. The freeing for pushed +column values is done in sel_col_prefetch_buf_free. */ +UNIV_INTERN +void +eval_node_free_val_buf( +/*===================*/ + que_node_t* node); /* in: query graph node */ +/********************************************************************* +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node); /* in: symbol table node */ +/********************************************************************* +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node); /* in: expression */ +/********************************************************************* +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /* in: expression node */ + lint val); /* in: value to set */ +/********************************************************************* +Gets an integer value from an expression node. */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + /* out: integer value */ + que_node_t* node); /* in: expression node */ +/********************************************************************* +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. 
*/ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /* in: query graph node */ + const byte* str, /* in: binary string */ + ulint len); /* in: string length or UNIV_SQL_NULL */ +/********************************************************************* +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /* in: node to copy to */ + que_node_t* node2); /* in: node to copy from */ +/********************************************************************* +Gets an ibool value from a query node. */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + /* out: ibool value */ + que_node_t* node); /* in: query graph node */ +/********************************************************************* +Evaluates a comparison node. */ +UNIV_INTERN +ibool +eval_cmp( +/*=====*/ + /* out: the result of the comparison */ + func_node_t* cmp_node); /* in: comparison node */ + + +#ifndef UNIV_NONINL +#include "eval0eval.ic" +#endif + +#endif diff --git a/storage/xtradb/include/eval0eval.ic b/storage/xtradb/include/eval0eval.ic new file mode 100644 index 00000000000..a6330ae441f --- /dev/null +++ b/storage/xtradb/include/eval0eval.ic @@ -0,0 +1,250 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" +#include "rem0cmp.h" +#include "pars0grm.h" + +/********************************************************************* +Evaluates a function node. */ +UNIV_INTERN +void +eval_func( +/*======*/ + func_node_t* func_node); /* in: function node */ +/********************************************************************* +Allocate a buffer from global dynamic memory for a value of a que_node. +NOTE that this memory must be explicitly freed when the query graph is +freed. If the node already has an allocated buffer, that buffer is freed +here. NOTE that this is the only function where dynamic memory should be +allocated for a query node val field. */ +UNIV_INTERN +byte* +eval_node_alloc_val_buf( +/*====================*/ + /* out: pointer to allocated buffer */ + que_node_t* node, /* in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size); /* in: buffer size */ + + +/********************************************************************* +Allocates a new buffer if needed. 
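As the bodies below show, eval_node_set_int_val() and eval_node_get_int_val() store the integer portably as 4 big-endian bytes via mach_write_to_4()/mach_read_from_4(). A self-contained sketch of that byte-order convention, with my own helper names standing in for the mach0data.h routines:

#include <assert.h>

typedef unsigned char byte;

/* Big-endian store, in the spirit of mach_write_to_4(). */
static void
write4(byte* b, unsigned long n)
{
	b[0] = (byte) (n >> 24);
	b[1] = (byte) (n >> 16);
	b[2] = (byte) (n >> 8);
	b[3] = (byte) n;
}

/* Big-endian load, in the spirit of mach_read_from_4(). */
static unsigned long
read4(const byte* b)
{
	return(((unsigned long) b[0] << 24)
	       | ((unsigned long) b[1] << 16)
	       | ((unsigned long) b[2] << 8)
	       | (unsigned long) b[3]);
}

int
main(void)
{
	byte	buf[4];

	write4(buf, 123456789UL);
	assert(read4(buf) == 123456789UL);	/* round trip */

	return(0);
}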
*/ +UNIV_INLINE +byte* +eval_node_ensure_val_buf( +/*=====================*/ + /* out: pointer to buffer */ + que_node_t* node, /* in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size) /* in: buffer size */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + dfield_set_len(dfield, size); + + data = dfield_get_data(dfield); + + if (!data || que_node_get_val_buf_size(node) < size) { + + data = eval_node_alloc_val_buf(node, size); + } + + return(data); +} + +/********************************************************************* +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node) /* in: symbol table node */ +{ + + ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + if (sym_node->indirection) { + /* The symbol table node is an alias for a variable or a + column */ + + dfield_copy_data(que_node_get_val(sym_node), + que_node_get_val(sym_node->indirection)); + } +} + +/********************************************************************* +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node) /* in: expression */ +{ + if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) { + + eval_sym((sym_node_t*)exp_node); + + return; + } + + eval_func(exp_node); +} + +/********************************************************************* +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /* in: expression node */ + lint val) /* in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = dfield_get_data(dfield); + + if (data == NULL) { + data = eval_node_alloc_val_buf(node, 4); + } + + ut_ad(dfield_get_len(dfield) == 4); + + mach_write_to_4(data, (ulint)val); +} + +/********************************************************************* +Gets an integer non-SQL null value from an expression node. */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + /* out: integer value */ + que_node_t* node) /* in: expression node */ +{ + dfield_t* dfield; + + dfield = que_node_get_val(node); + + ut_ad(dfield_get_len(dfield) == 4); + + return((int)mach_read_from_4(dfield_get_data(dfield))); +} + +/********************************************************************* +Gets an ibool value from a query node. */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + /* out: ibool value */ + que_node_t* node) /* in: query graph node */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = dfield_get_data(dfield); + + ut_ad(data != NULL); + + return(mach_read_from_1(data)); +} + +/********************************************************************* +Sets an ibool value as the value of a function node. */ +UNIV_INLINE +void +eval_node_set_ibool_val( +/*====================*/ + func_node_t* func_node, /* in: function node */ + ibool val) /* in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(func_node); + + data = dfield_get_data(dfield); + + if (data == NULL) { + /* Allocate 1 byte to hold the value */ + + data = eval_node_alloc_val_buf(func_node, 1); + } + + ut_ad(dfield_get_len(dfield) == 1); + + mach_write_to_1(data, val); +} + +/********************************************************************* +Copies a binary string value as the value of a query graph node. 
Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /* in: query graph node */ + const byte* str, /* in: binary string */ + ulint len) /* in: string length or UNIV_SQL_NULL */ +{ + byte* data; + + if (len == UNIV_SQL_NULL) { + dfield_set_len(que_node_get_val(node), len); + + return; + } + + data = eval_node_ensure_val_buf(node, len); + + ut_memcpy(data, str, len); +} + +/********************************************************************* +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /* in: node to copy to */ + que_node_t* node2) /* in: node to copy from */ +{ + dfield_t* dfield2; + + dfield2 = que_node_get_val(node2); + + eval_node_copy_and_alloc_val(node1, dfield_get_data(dfield2), + dfield_get_len(dfield2)); +} diff --git a/storage/xtradb/include/eval0proc.h b/storage/xtradb/include/eval0proc.h new file mode 100644 index 00000000000..58937c18124 --- /dev/null +++ b/storage/xtradb/include/eval0proc.h @@ -0,0 +1,103 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#ifndef eval0proc_h +#define eval0proc_h + +#include "univ.i" +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/************************************************************************** +Performs an execution step of a procedure node. */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of an if-statement node. */ +UNIV_INTERN +que_thr_t* +if_step( +/*====*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of a while-statement node. */ +UNIV_INTERN +que_thr_t* +while_step( +/*=======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of a for-loop node. */ +UNIV_INTERN +que_thr_t* +for_step( +/*=====*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of an assignment statement node. 
*/ +UNIV_INTERN +que_thr_t* +assign_step( +/*========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of a procedure call node. */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of an exit statement node. */ +UNIV_INTERN +que_thr_t* +exit_step( +/*======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of a return-statement node. */ +UNIV_INTERN +que_thr_t* +return_step( +/*========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ + + +#ifndef UNIV_NONINL +#include "eval0proc.ic" +#endif + +#endif diff --git a/storage/xtradb/include/eval0proc.ic b/storage/xtradb/include/eval0proc.ic new file mode 100644 index 00000000000..6bd978ad3fc --- /dev/null +++ b/storage/xtradb/include/eval0proc.ic @@ -0,0 +1,87 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" + +/************************************************************************** +Performs an execution step of a procedure node. */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + proc_node_t* node; + + ut_ad(thr); + + node = thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_PROC); + + if (thr->prev_node == que_node_get_parent(node)) { + /* Start execution from the first statement in the statement + list */ + + thr->run_node = node->stat_list; + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/************************************************************************** +Performs an execution step of a procedure call node. 
 */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+				/* out: query thread to run next or NULL */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	func_node_t*	node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+	/* Evaluate the procedure */
+
+	eval_exp(node);
+
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
new file mode 100644
index 00000000000..587e5ee48a8
--- /dev/null
+++ b/storage/xtradb/include/fil0fil.h
@@ -0,0 +1,711 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fil0fil_h
+#define fil0fil_h
+
+#include "univ.i"
+#include "sync0rw.h"
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "os0file.h"
+
+/* When mysqld is run, the default directory "." is the mysqld datadir, but in
+ibbackup we must set it explicitly; the path must NOT contain the trailing
+'/' or '\' */
+extern const char*	fil_path_to_mysql_datadir;
+
+/* Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE	4
+
+/* 'null' (undefined) page offset in the context of file spaces */
+#define	FIL_NULL	ULINT32_UNDEFINED
+
+/* Space address data type; this is intended to be used when
+addresses accurate to a byte are stored in file pages. If the page part
+of the address is FIL_NULL, the address is considered undefined. */
+
+typedef	byte	fil_faddr_t;	/* 'type' definition in C: an address
+				stored in a file page is a string of bytes */
+#define FIL_ADDR_PAGE	0	/* first in address is the page offset */
+#define	FIL_ADDR_BYTE	4	/* then comes 2-byte byte offset within page*/
+
+#define	FIL_ADDR_SIZE	6	/* address size is 6 bytes */
+
+/* A struct for storing a space address FIL_ADDR, when it is used
+in C program data structures. */
+
+typedef	struct fil_addr_struct	fil_addr_t;
+struct fil_addr_struct{
+	ulint	page;		/* page number within a space */
+	ulint	boffset;	/* byte offset within the page */
+};
+
+/* Null file address */
+extern fil_addr_t	fil_addr_null;
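A file address therefore serializes into FIL_ADDR_SIZE = 6 bytes: a 4-byte
page number at FIL_ADDR_PAGE followed by a 2-byte in-page offset at
FIL_ADDR_BYTE, both in the big-endian byte order used by the mach_* helpers.
A standalone sketch of that packing (addr_pack/addr_unpack are illustrative
stand-ins, not InnoDB APIs):

	#include <assert.h>

	enum { ADDR_PAGE = 0, ADDR_BYTE = 4, ADDR_SIZE = 6 };

	struct addr { unsigned long page; unsigned boffset; };

	/* Packs a file address as on a page: 4-byte page number,
	then 2-byte byte offset, most significant byte first. */
	static void addr_pack(unsigned char* b, struct addr a)
	{
		b[ADDR_PAGE]     = (unsigned char)(a.page >> 24);
		b[ADDR_PAGE + 1] = (unsigned char)(a.page >> 16);
		b[ADDR_PAGE + 2] = (unsigned char)(a.page >> 8);
		b[ADDR_PAGE + 3] = (unsigned char)(a.page);
		b[ADDR_BYTE]     = (unsigned char)(a.boffset >> 8);
		b[ADDR_BYTE + 1] = (unsigned char)(a.boffset);
	}

	static struct addr addr_unpack(const unsigned char* b)
	{
		struct addr	a;

		a.page = ((unsigned long)b[0] << 24)
			| ((unsigned long)b[1] << 16)
			| ((unsigned long)b[2] << 8)
			| (unsigned long)b[3];
		a.boffset = ((unsigned)b[4] << 8) | (unsigned)b[5];
		return(a);
	}

	int main(void)
	{
		unsigned char	buf[ADDR_SIZE];
		struct addr	a = {5, 120}, b;

		addr_pack(buf, a);
		b = addr_unpack(buf);
		assert(b.page == 5 && b.boffset == 120);
		return(0);
	}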
+
+/* The byte offsets on a file page for various variables */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0	/* in < MySQL-4.0.14 space id the
+					page belongs to (== 0) but in later
+					versions the 'new' checksum of the
+					page */
+#define FIL_PAGE_OFFSET		4	/* page offset inside space */
+#define FIL_PAGE_PREV		8	/* if there is a 'natural' predecessor
+					of the page, its offset.
+					Otherwise FIL_NULL.
+					This field is not set on BLOB pages,
+					which are stored as a singly-linked
+					list. See also FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT		12	/* if there is a 'natural' successor
+					of the page, its offset.
+					Otherwise FIL_NULL.
+					B-tree index pages
+					(FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+					on the same PAGE_LEVEL are maintained
+					as a doubly linked list via
+					FIL_PAGE_PREV and FIL_PAGE_NEXT
+					in the collation order of the
+					smallest user record on each page. */
+#define FIL_PAGE_LSN		16	/* lsn of the end of the newest
+					modification log record to the page */
+#define	FIL_PAGE_TYPE		24	/* file page type: FIL_PAGE_INDEX,...,
+					2 bytes.
+
+					The contents of this field can only
+					be trusted in the following case:
+					if the page is an uncompressed
+					B-tree index page, then it is
+					guaranteed that the value is
+					FIL_PAGE_INDEX.
+					The opposite does not hold.
+
+					In tablespaces created by
+					MySQL/InnoDB 5.1.7 or later, the
+					contents of this field are valid
+					for all uncompressed pages. */
+#define FIL_PAGE_FILE_FLUSH_LSN	26	/* this is only defined for the
+					first page in a data file: the file
+					has been flushed to disk at least up
+					to this lsn */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID  34 /* starting from 4.1.x this
+					contains the space id of the page */
+#define FIL_PAGE_DATA		38	/* start of the data on the page */
+
+/* File page trailer */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8	/* the low 4 bytes of this are used
+					to store the page checksum, the
+					last 4 bytes should be identical
+					to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END	8
+
+/* File page types (values of FIL_PAGE_TYPE) */
+#define FIL_PAGE_INDEX		17855	/* B-tree node */
+#define FIL_PAGE_UNDO_LOG	2	/* Undo log page */
+#define FIL_PAGE_INODE		3	/* Index node */
+#define FIL_PAGE_IBUF_FREE_LIST	4	/* Insert buffer free list */
+/* File page types introduced in MySQL/InnoDB 5.1.7 */
+#define FIL_PAGE_TYPE_ALLOCATED	0	/* Freshly allocated page */
+#define FIL_PAGE_IBUF_BITMAP	5	/* Insert buffer bitmap */
+#define FIL_PAGE_TYPE_SYS	6	/* System page */
+#define FIL_PAGE_TYPE_TRX_SYS	7	/* Transaction system data */
+#define FIL_PAGE_TYPE_FSP_HDR	8	/* File space header */
+#define FIL_PAGE_TYPE_XDES	9	/* Extent descriptor page */
+#define FIL_PAGE_TYPE_BLOB	10	/* Uncompressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB	11	/* First compressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB2	12	/* Subsequent compressed BLOB page */
+
+/* Space types */
+#define FIL_TABLESPACE		501
+#define FIL_LOG			502
+
+extern ulint	fil_n_log_flushes;
+
+extern ulint	fil_n_pending_log_flushes;
+extern ulint	fil_n_pending_tablespace_flushes;
+
+
+/***********************************************************************
+Returns the version number of a tablespace, -1 if not found. */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+			/* out: version number, -1 if the tablespace does not
+			exist in the memory cache */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Returns the latch of a file space. */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+			/* out: latch protecting storage allocation */
+	ulint	id,	/* in: space id */
+	ulint*	zip_size);/* out: compressed page size, or
+			0 for uncompressed tablespaces */
+/***********************************************************************
+Returns the type of a file space. */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+			/* out: FIL_TABLESPACE or FIL_LOG */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Appends a new file to the chain of files of a space. File must be closed. */
+UNIV_INTERN
+void
+fil_node_create(
+/*============*/
+	const char*	name,	/* in: file name (file must be closed) */
+	ulint		size,	/* in: file size in database blocks, rounded
+				downwards to an integer */
+	ulint		id,	/* in: space id where to append */
+	ibool		is_raw);/* in: TRUE if a raw device or
+				a raw disk partition */
+#ifdef UNIV_LOG_ARCHIVE
+/********************************************************************
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+	ulint	id,		/* in: space id */
+	ulint	trunc_len);	/* in: truncate by this much; it is an error
+				if this does not equal the combined size of
+				some initial files in the space */
+#endif /* UNIV_LOG_ARCHIVE */
+/***********************************************************************
+Creates a space memory object and puts it to the 'fil system' hash table. If
+there is an error, prints an error message to the .err log. */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+				/* out: TRUE if success */
+	const char*	name,	/* in: space name */
+	ulint		id,	/* in: space id */
+	ulint		zip_size,/* in: compressed page size, or
+				0 for uncompressed tablespaces */
+	ulint		purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */
+/***********************************************************************
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. */
+UNIV_INTERN
+ibool
+fil_space_free(
+/*===========*/
+			/* out: TRUE if success */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache. */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+			/* out: space size, 0 if space not found */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Returns the flags of the space. The tablespace must be cached
+in the memory cache. */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+			/* out: flags, ULINT_UNDEFINED if space not found */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache. */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+			/* out: compressed page size, ULINT_UNDEFINED
+			if space not found */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space. The tablespace must be cached in the memory cache. */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+			/* out: TRUE if the address is meaningful */
+	ulint	id,	/* in: space id */
+	ulint	page_no);/* in: page number */
+/********************************************************************
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+	ulint	max_n_open);	/* in: max number of open files */
+/***********************************************************************
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void);
+/*==========================================*/
+/***********************************************************************
+Closes all open files. There must not be any pending i/o's or unflushed
+modifications in the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void);
+/*=====================*/
+/***********************************************************************
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+	ulint	max_id);/* in: maximum known id */
+/********************************************************************
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace. */
+UNIV_INTERN
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+					/* out: DB_SUCCESS or error number */
+	ib_uint64_t	lsn,		/* in: lsn to write */
+	ulint		arch_log_no);	/* in: latest archived log
+					file number */
+/***********************************************************************
+Reads the flushed lsn and arch no fields from a data file at database
+startup. */
+UNIV_INTERN
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+	os_file_t	data_file,		/* in: open data file */
+	ibool		one_read_already,	/* in: TRUE if min and max
+						parameters below already
+						contain sensible data */
+#ifdef UNIV_LOG_ARCHIVE
+	ulint*		min_arch_log_no,	/* in/out: */
+	ulint*		max_arch_log_no,	/* in/out: */
+#endif /* UNIV_LOG_ARCHIVE */
+	ib_uint64_t*	min_flushed_lsn,	/* in/out: */
+	ib_uint64_t*	max_flushed_lsn);	/* in/out: */
+/***********************************************************************
+Increments the count of pending insert buffer page merges, if space is not
+being deleted. */
+UNIV_INTERN
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+			/* out: TRUE if being deleted, and ibuf merges should
+			be skipped */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Decrements the count of pending insert buffer page merges. */
+UNIV_INTERN
+void
+fil_decr_pending_ibuf_merges(
+/*=========================*/
+	ulint	id);	/* in: space id */
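The inc/dec pair above implements a stop-flag-plus-counter handshake:
deletion first forbids new merges, then waits for the pending count to
drain. A generic standalone sketch of that pattern with pthreads (all
names hypothetical; the real code uses the fil system mutex and its own
space struct):

	#include <pthread.h>

	struct space {
		pthread_mutex_t	mutex;
		int		stop_new_ops;	/* set when deletion starts */
		int		n_pending;	/* pending ibuf merges */
	};

	/* Returns nonzero if the space is being deleted and the merge
	should be skipped; otherwise registers one pending operation. */
	static int space_inc_pending(struct space* s)
	{
		int	skip;

		pthread_mutex_lock(&s->mutex);
		skip = s->stop_new_ops;
		if (!skip) {
			s->n_pending++;
		}
		pthread_mutex_unlock(&s->mutex);
		return(skip);
	}

	/* Deregisters a pending operation when the merge completes. */
	static void space_dec_pending(struct space* s)
	{
		pthread_mutex_lock(&s->mutex);
		s->n_pending--;
		pthread_mutex_unlock(&s->mutex);
	}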
+/***********************************************************************
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations. */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+				/* out: end of log record, or NULL if the
+				record was not completely contained between
+				ptr and end_ptr */
+	byte*	ptr,		/* in: buffer containing the log record body,
+				or an initial segment of it, if the record does
+				not fit completely between ptr and end_ptr */
+	byte*	end_ptr,	/* in: buffer end */
+	ulint	type,		/* in: the type of this log record */
+	ulint	space_id);	/* in: the space id of the tablespace in
+				question, or 0 if the log record should
+				only be parsed but not replayed */
+/***********************************************************************
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache. */
+UNIV_INTERN
+ibool
+fil_delete_tablespace(
+/*==================*/
+			/* out: TRUE if success */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+1) we do not drop the table from the data dictionary;
+2) we remove all insert buffer entries for the tablespace immediately; in DROP
+TABLE they are only removed gradually in the background;
+3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
+as it originally had. */
+UNIV_INTERN
+ibool
+fil_discard_tablespace(
+/*===================*/
+			/* out: TRUE if success */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache. */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+					/* out: TRUE if success */
+	const char*	old_name,	/* in: old table name in the standard
+					databasename/tablename format of
+					InnoDB, or NULL if we do the rename
+					based on the space id only */
+	ulint		id,		/* in: space id */
+	const char*	new_name);	/* in: new table name in the standard
+					databasename/tablename format
+					of InnoDB */
+
+/***********************************************************************
+Creates a new single-table tablespace in a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply the
+path '.'. Tables created with CREATE TEMPORARY TABLE are placed in the temp
+dir of the mysqld server. */
+UNIV_INTERN
+ulint
+fil_create_new_single_table_tablespace(
+/*===================================*/
+					/* out: DB_SUCCESS or error code */
+	ulint*		space_id,	/* in/out: space id; if this is != 0,
+					then this is an input parameter,
+					otherwise output */
+	const char*	tablename,	/* in: the table name in the usual
+					databasename/tablename format
+					of InnoDB, or a dir path to a temp
+					table */
+	ibool		is_temp,	/* in: TRUE if a table created with
+					CREATE TEMPORARY TABLE */
+	ulint		flags,		/* in: tablespace flags */
+	ulint		size);		/* in: the initial size of the
+					tablespace file in pages,
+					must be >= FIL_IBD_FILE_INITIAL_SIZE */
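Creating the .ibd file itself amounts to pre-allocating size zero-filled
pages, after which InnoDB formats page 0 as the space header. A standalone
sketch under the assumption of 16 KB pages (UNIV_PAGE_SIZE is a build-time
constant; create_ibd_skeleton is a hypothetical name, not the XtraDB
function):

	#include <stdio.h>
	#include <stdlib.h>

	#define PAGE_SIZE		16384	/* assumed UNIV_PAGE_SIZE */
	#define IBD_INITIAL_PAGES	4	/* FIL_IBD_FILE_INITIAL_SIZE */

	/* Pre-allocates an empty, zero-filled .ibd file of the minimum
	initial size; returns 0 on success, -1 on failure. */
	static int create_ibd_skeleton(const char* path)
	{
		FILE*	f = fopen(path, "wb");
		char*	page;
		int	i;

		if (!f) {
			return(-1);
		}
		page = calloc(1, PAGE_SIZE);
		if (!page) {
			fclose(f);
			return(-1);
		}
		for (i = 0; i < IBD_INITIAL_PAGES; i++) {
			if (fwrite(page, PAGE_SIZE, 1, f) != 1) {
				free(page);
				fclose(f);
				return(-1);
			}
		}
		free(page);
		return(fclose(f) == 0 ? 0 : -1);
	}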
+/************************************************************************
+Tries to open a single-table tablespace and optionally checks that the space
+id in it is right. If it does not succeed, prints an error message to the
+.err log. This function is used to open a tablespace when we start up mysqld,
+and also in IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it. */
+UNIV_INTERN
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+					/* out: TRUE if success */
+	ibool		check_space_id,	/* in: should we check that the space
+					id in the file is right; we assume
+					that this function runs much faster
+					if no check is made, since accessing
+					the file inode probably is much
+					faster (the OS caches them) than
+					accessing the first page of the file */
+	ulint		id,		/* in: space id */
+	ulint		flags,		/* in: tablespace flags */
+	const char*	name);		/* in: table name in the
+					databasename/tablename format */
+/************************************************************************
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that at
+shutdown it stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN field in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn. */
+UNIV_INTERN
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+					/* out: TRUE if success */
+	const char*	name,		/* in: table name in the
+					databasename/tablename format */
+	ib_uint64_t	current_lsn);	/* in: reset lsn's if the lsn stamped
+					to FIL_PAGE_FILE_FLUSH_LSN in the
+					first page is too high */
+/************************************************************************
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0. */
+UNIV_INTERN
+ulint
+fil_load_single_table_tablespaces(void);
+/*===================================*/
+			/* out: DB_SUCCESS or error number */
+/************************************************************************
+If we need crash recovery, and we have called
+fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(),
+we can call this function to print an error message of orphaned .ibd files
+for which there is not a data dictionary entry with a matching table name
+and space id. */
+UNIV_INTERN
+void
+fil_print_orphaned_tablespaces(void);
+/*================================*/
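The decision fil_reset_too_high_lsns() describes boils down to one
comparison against the 8-byte flush lsn stamped at byte offset
FIL_PAGE_FILE_FLUSH_LSN (26) of the first page. A standalone sketch
(read_flush_lsn and needs_lsn_reset are illustrative stand-ins, not the
mach_*-based XtraDB code):

	#define PAGE_FILE_FLUSH_LSN	26	/* FIL_PAGE_FILE_FLUSH_LSN */

	/* Reads the 8-byte big-endian flush lsn stamped in the first
	page of an .ibd file. */
	static unsigned long long read_flush_lsn(const unsigned char* page0)
	{
		unsigned long long	lsn = 0;
		int			i;

		for (i = 0; i < 8; i++) {
			lsn = (lsn << 8) | page0[PAGE_FILE_FLUSH_LSN + i];
		}
		return(lsn);
	}

	/* The import-time decision: reset page lsn's only if the stamped
	flush lsn is ahead of the running system lsn. */
	static int needs_lsn_reset(const unsigned char* page0,
				   unsigned long long current_lsn)
	{
		return(read_flush_lsn(page0) > current_lsn);
	}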
+/***********************************************************************
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there. */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+				/* out: TRUE if it does not exist or is
+				being deleted */
+	ulint		id,	/* in: space id */
+	ib_int64_t	version);/* in: tablespace_version should be this; if
+				you pass -1 as the value of this, then this
+				parameter is ignored */
+/***********************************************************************
+Returns TRUE if a single-table tablespace exists in the memory cache. */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+			/* out: TRUE if exists */
+	ulint	id);	/* in: space id */
+/***********************************************************************
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache. */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+					/* out: TRUE if a matching tablespace
+					exists in the memory cache */
+	ulint		id,		/* in: space id */
+	const char*	name,		/* in: table name in the standard
+					'databasename/tablename' format or
+					the dir path to a temp table */
+	ibool		is_temp,	/* in: TRUE if created with CREATE
+					TEMPORARY TABLE */
+	ibool		mark_space,	/* in: in crash recovery, at database
+					startup we mark all spaces which have
+					an associated table in the InnoDB
+					data dictionary, so that
+					we can print a warning about orphaned
+					tablespaces */
+	ibool		print_error_if_does_not_exist);
+					/* in: print detailed error
+					information to the .err log if a
+					matching tablespace is not found from
+					memory */
+/**************************************************************************
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing. */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+				/* out: TRUE if success */
+	ulint*	actual_size,	/* out: size of the space after extension;
+				if we ran out of disk space this may be lower
+				than the desired size */
+	ulint	space_id,	/* in: space id */
+	ulint	size_after_extend);/* in: desired size in pages after the
+				extension; if the current space size is bigger
+				than this already, the function does nothing */
+#ifdef UNIV_HOTBACKUP
+/************************************************************************
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be applied, but that may have left spaces still too small compared to
+the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void);
+/*======================================*/
+#endif
+/***********************************************************************
+Tries to reserve free extents in a file space. */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+				/* out: TRUE if success */
+	ulint	id,		/* in: space id */
+	ulint	n_free_now,	/* in: number of free extents now */
+	ulint	n_to_reserve);	/* in: how many one wants to reserve */
+/***********************************************************************
+Releases free extents in a file space.
*/ +UNIV_INTERN +void +fil_space_release_free_extents( +/*===========================*/ + ulint id, /* in: space id */ + ulint n_reserved); /* in: how many one reserved */ +/*********************************************************************** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ +UNIV_INTERN +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id); /* in: space id */ +/************************************************************************ +Reads or writes data. This operation is asynchronous (aio). */ +UNIV_INTERN +ulint +fil_io( +/*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ + ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE, + ORed to OS_FILE_LOG, if a log i/o + and ORed to OS_AIO_SIMULATED_WAKE_LATER + if simulated aio and we want to post a + batch of i/os; NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! */ + ibool sync, /* in: TRUE if synchronous aio is desired */ + ulint space_id, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /* in: offset in number of blocks */ + ulint byte_offset, /* in: remainder of offset in bytes; in + aio this must be divisible by the OS block + size */ + ulint len, /* in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ + void* buf, /* in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ + void* message); /* in: message for aio handler if non-sync + aio used, else ignored */ +/************************************************************************** +Waits for an aio operation to complete. This function is used to write the +handler for completed requests. The aio array of pending requests is divided +into segments (see os0file.c for more info). The thread specifies which +segment it wants to wait for. */ +UNIV_INTERN +void +fil_aio_wait( +/*=========*/ + ulint segment); /* in: the number of the segment in the aio + array to wait for */ +/************************************************************************** +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ +UNIV_INTERN +void +fil_flush( +/*======*/ + ulint space_id); /* in: file space id (this can be a group of + log files or a tablespace of the database) */ +/************************************************************************** +Flushes to disk writes in file spaces of the given type possibly cached by +the OS. */ +UNIV_INTERN +void +fil_flush_file_spaces( +/*==================*/ + ulint purpose); /* in: FIL_TABLESPACE, FIL_LOG */ +/********************************************************************** +Checks the consistency of the tablespace cache. */ +UNIV_INTERN +ibool +fil_validate(void); +/*==============*/ + /* out: TRUE if ok */ +/************************************************************************ +Returns TRUE if file address is undefined. 
 */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+				/* out: TRUE if undefined */
+	fil_addr_t	addr);	/* in: address */
+/************************************************************************
+Accessor functions for a file page */
+UNIV_INTERN
+ulint
+fil_page_get_prev(const byte*	page);
+ulint
+fil_page_get_next(const byte*	page);
+/*************************************************************************
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+	byte*	page,	/* in: file page */
+	ulint	type);	/* in: type */
+/*************************************************************************
+Gets the file page type. */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+				/* out: type; NOTE that if the type
+				has not been written to the page, the
+				return value is not defined */
+	const byte*	page);	/* in: file page */
+
+/*************************************************************************
+Return local hash table information. */
+
+ulint
+fil_system_hash_cells(void);
+/*========================*/
+
+ulint
+fil_system_hash_nodes(void);
+/*========================*/
+
+typedef	struct fil_space_struct	fil_space_t;
+
+#endif
diff --git a/storage/xtradb/include/fsp0fsp.h b/storage/xtradb/include/fsp0fsp.h
new file mode 100644
index 00000000000..1f6ae4b614b
--- /dev/null
+++ b/storage/xtradb/include/fsp0fsp.h
@@ -0,0 +1,433 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+#include "page0types.h"
+
+/* If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page): */
+#define	FSP_UP		((byte)111)	/* alphabetically upwards */
+#define	FSP_DOWN	((byte)112)	/* alphabetically downwards */
+#define	FSP_NO_DIR	((byte)113)	/* no order */
+
+/* File space extent size (one megabyte) in pages */
+#define	FSP_EXTENT_SIZE		(1 << (20 - UNIV_PAGE_SIZE_SHIFT))
+
+/* On a page of any file segment, data may be put starting from this offset: */
+#define FSEG_PAGE_DATA		FIL_PAGE_DATA
+
+/* File segment header which points to the inode describing the file segment */
+typedef	byte	fseg_header_t;
+
+#define FSEG_HDR_SPACE		0	/* space id of the inode */
+#define FSEG_HDR_PAGE_NO	4	/* page number of the inode */
+#define FSEG_HDR_OFFSET		8	/* byte offset of the inode */
+
+#define FSEG_HEADER_SIZE	10
+
+/**************************************************************************
+Initializes the file space system. */
+UNIV_INTERN
+void
+fsp_init(void);
+/*==========*/
+/**************************************************************************
+Gets the current free limit of the system tablespace. The free limit
+means the place of the first page which has never been put to the
+free list for allocation. The space above that address is initialized
+to zero. Sets also the global variable log_fsp_current_free_limit. */
+UNIV_INTERN
+ulint
+fsp_header_get_free_limit(void);
+/*===========================*/
+			/* out: free limit in megabytes */
+/**************************************************************************
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller. */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void);
+/*================================*/
+			/* out: size in pages */
+/**************************************************************************
+Reads the file space size stored in the header page. */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+			/* out: tablespace size stored in the space header */
+	page_t*	page);	/* in: header page (page 0 in the tablespace) */
+/**************************************************************************
+Reads the space id from the first page of a tablespace. */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+				/* out: space id, ULINT_UNDEFINED if error */
+	const page_t*	page);	/* in: first page of a tablespace */
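The FSEG_HDR_* offsets above define a 10-byte on-page pointer to a segment
inode: 4-byte space id, 4-byte inode page number, 2-byte byte offset. A
standalone sketch of writing and reading such a header with generic
big-endian helpers (stand-ins, not the mach_* routines):

	#include <assert.h>

	enum { HDR_SPACE = 0, HDR_PAGE_NO = 4, HDR_OFFSET = 8,
	       HDR_SIZE = 10 };

	/* Writes v as an n-byte big-endian integer. */
	static void write_n(unsigned char* b, unsigned long v, int n)
	{
		int	i;

		for (i = 0; i < n; i++) {
			b[i] = (unsigned char)(v >> (8 * (n - 1 - i)));
		}
	}

	/* Reads an n-byte big-endian integer. */
	static unsigned long read_n(const unsigned char* b, int n)
	{
		unsigned long	v = 0;
		int		i;

		for (i = 0; i < n; i++) {
			v = (v << 8) | b[i];
		}
		return(v);
	}

	int main(void)
	{
		unsigned char	hdr[HDR_SIZE];

		write_n(hdr + HDR_SPACE, 0, 4);	/* system tablespace */
		write_n(hdr + HDR_PAGE_NO, 2, 4); /* FSP_FIRST_INODE_PAGE_NO */
		write_n(hdr + HDR_OFFSET, 50, 2);
		assert(read_n(hdr + HDR_PAGE_NO, 4) == 2);
		return(0);
	}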
+/**************************************************************************
+Reads the space flags from the first page of a tablespace. */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+				/* out: flags */
+	const page_t*	page);	/* in: first page of a tablespace */
+/**************************************************************************
+Reads the compressed page size from the first page of a tablespace. */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+				/* out: compressed page size in bytes,
+				or 0 if uncompressed */
+	const page_t*	page);	/* in: first page of a tablespace */
+/**************************************************************************
+Writes the space id and compressed page size to a tablespace header.
+This function is used, bypassing the buffer pool, when we create
+a new single-table tablespace in fil0fil.c. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+	page_t*	page,		/* in/out: first page in the space */
+	ulint	space_id,	/* in: space id */
+	ulint	flags);		/* in: tablespace flags (FSP_SPACE_FLAGS):
+				0, or table->flags if newer than COMPACT */
+/**************************************************************************
+Initializes the space header of a newly created space and also creates the
+insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+	ulint	space,	/* in: space id */
+	ulint	size,	/* in: current size in blocks */
+	mtr_t*	mtr);	/* in: mini-transaction handle */
+/**************************************************************************
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+	ulint	space,	/* in: space id */
+	ulint	size_inc,/* in: size increment in pages */
+	mtr_t*	mtr);	/* in: mini-transaction handle */
+/**************************************************************************
+Creates a new segment. */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+			/* out: the block where the segment header is placed,
+			x-latched, NULL if could not create segment
+			because of lack of space */
+	ulint	space,	/* in: space id */
+	ulint	page,	/* in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /* in: byte offset of the created segment header
+			on the page */
+	mtr_t*	mtr);	/* in: mtr */
+/**************************************************************************
+Creates a new segment. */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+			/* out: the block where the segment header is placed,
+			x-latched, NULL if could not create segment
+			because of lack of space */
+	ulint	space,	/* in: space id */
+	ulint	page,	/* in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /* in: byte offset of the created segment header
+			on the page */
+	ibool	has_done_reservation, /* in: TRUE if the caller has already
+			done the reservation for the pages with
+			fsp_reserve_free_extents (at least 2 extents: one for
+			the inode and the other for the segment) then there is
+			no need to do the check for this individual
+			operation */
+	mtr_t*	mtr);	/* in: mtr */
+/**************************************************************************
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used. */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+				/* out: number of reserved pages */
+	fseg_header_t*	header,	/* in: segment header */
+	ulint*		used,	/* out: number of pages used (<= reserved) */
+	mtr_t*		mtr);	/* in: mtr handle */
+/**************************************************************************
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation. */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page(
+/*=================*/
+				/* out: the allocated page offset
+				FIL_NULL if no page could be allocated */
+	fseg_header_t*	seg_header, /* in: segment header */
+	ulint		hint,	/* in: hint of which page would be desirable */
+	byte		direction, /* in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	mtr_t*		mtr);	/* in: mtr handle */
+/**************************************************************************
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation. */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page_general(
+/*=========================*/
+				/* out: allocated page offset, FIL_NULL if no
+				page could be allocated */
+	fseg_header_t*	seg_header,/* in: segment header */
+	ulint		hint,	/* in: hint of which page would be desirable */
+	byte		direction,/* in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	ibool		has_done_reservation, /* in: TRUE if the caller has
+				already done the reservation for the page
+				with fsp_reserve_free_extents, then there
+				is no need to do the check for this individual
+				page */
+	mtr_t*		mtr);	/* in: mtr handle */
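The reservation discipline documented below (reserve extents up front,
allocate with has_done_reservation = TRUE, then always release) can be
shown with a standalone toy model; reserve_free_extents and
release_free_extents here only mimic fsp_reserve_free_extents() and
fil_space_release_free_extents(), they are not the real functions:

	#include <stdio.h>

	static unsigned long	n_free_extents = 10;	/* toy free-space pool */

	/* Mimics fsp_reserve_free_extents(): all-or-nothing reservation. */
	static int reserve_free_extents(unsigned long* n_reserved,
					unsigned long n_ext)
	{
		if (n_free_extents < n_ext) {
			return(0);	/* reservation failed */
		}
		n_free_extents -= n_ext;
		*n_reserved = n_ext;
		return(1);
	}

	/* Mimics fil_space_release_free_extents(). */
	static void release_free_extents(unsigned long n_reserved)
	{
		n_free_extents += n_reserved;
	}

	int main(void)
	{
		unsigned long	n_reserved;

		if (!reserve_free_extents(&n_reserved, 2)) {
			return(1);	/* out of file space */
		}
		/* ... perform the B-tree page split, allocating pages
		with has_done_reservation = TRUE ... */
		release_free_extents(n_reserved);
		printf("free extents: %lu\n", n_free_extents);
		return(0);
	}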
+/**************************************************************************
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available. */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+			/* out: TRUE if we were able to make the reservation */
+	ulint*	n_reserved,/* out: number of extents actually reserved; if we
+			return TRUE and the tablespace size is < 64 pages,
+			then this can be 0, otherwise it is n_ext */
+	ulint	space,	/* in: space id */
+	ulint	n_ext,	/* in: number of extents to reserve */
+	ulint	alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+	mtr_t*	mtr);	/* in: mtr */
+/**************************************************************************
+This function should be used to get information on how much we still
+will be able to insert new data to the database without running out of the
+tablespace. Only free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents. */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+			/* out: available space in kB */
+	ulint	space);	/* in: space id */
+/**************************************************************************
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+	fseg_header_t*	seg_header, /* in: segment header */
+	ulint		space,	/* in: space id */
+	ulint		page,	/* in: page offset */
+	mtr_t*		mtr);	/* in: mtr handle */
+/***********************************************************************
+Frees a segment. The freeing is performed in several mini-transactions,
+so that there is no danger of bufferfixing too many buffer pages. */
+UNIV_INTERN
+void
+fseg_free(
+/*======*/
+	ulint	space,	/* in: space id */
+	ulint	zip_size,/* in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	page_no,/* in: page number where the segment header is
+			placed */
+	ulint	offset);/* in: byte offset of the segment header on that
+			page */
+/**************************************************************************
+Frees part of a segment. This function can be used to free a segment
+by repeatedly calling this function in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction. */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+				/* out: TRUE if freeing completed */
+	fseg_header_t*	header,	/* in, own: segment header; NOTE: if the header
+				resides on the first page of the frag list
+				of the segment, this pointer becomes obsolete
+				after the last freeing step */
+	mtr_t*		mtr);	/* in: mtr */
+/**************************************************************************
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed. */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+				/* out: TRUE if freeing completed, except the
+				header page */
+	fseg_header_t*	header,	/* in: segment header which must reside on
+				the first fragment page of the segment */
+	mtr_t*		mtr);	/* in: mtr */
+/***************************************************************************
+Checks if a page address is an extent descriptor page address.
*/ +UNIV_INLINE +ibool +fsp_descr_page( +/*===========*/ + /* out: TRUE if a descriptor page */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no);/* in: page number */ +/*************************************************************** +Parses a redo log record of a file page init. */ +UNIV_INTERN +byte* +fsp_parse_init_file_page( +/*=====================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr, /* in: buffer end */ + buf_block_t* block); /* in: block or NULL */ +/*********************************************************************** +Validates the file space system and its segments. */ +UNIV_INTERN +ibool +fsp_validate( +/*=========*/ + /* out: TRUE if ok */ + ulint space); /* in: space id */ +/*********************************************************************** +Prints info of a file space. */ +UNIV_INTERN +void +fsp_print( +/*======*/ + ulint space); /* in: space id */ +/*********************************************************************** +Validates a segment. */ +UNIV_INTERN +ibool +fseg_validate( +/*==========*/ + /* out: TRUE if ok */ + fseg_header_t* header, /* in: segment header */ + mtr_t* mtr2); /* in: mtr */ +#ifdef UNIV_BTR_PRINT +/*********************************************************************** +Writes info of a segment. */ +UNIV_INTERN +void +fseg_print( +/*=======*/ + fseg_header_t* header, /* in: segment header */ + mtr_t* mtr); /* in: mtr */ +#endif /* UNIV_BTR_PRINT */ + +/* Flags for fsp_reserve_free_extents */ +#define FSP_NORMAL 1000000 +#define FSP_UNDO 2000000 +#define FSP_CLEANING 3000000 + +/* Number of pages described in a single descriptor page: currently each page +description takes less than 1 byte; a descriptor page is repeated every +this many file pages */ +/* #define XDES_DESCRIBED_PER_PAGE UNIV_PAGE_SIZE */ +/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */ + +/* The space low address page map */ +/*--------------------------------------*/ + /* The following two pages are repeated + every XDES_DESCRIBED_PER_PAGE pages in + every tablespace. */ +#define FSP_XDES_OFFSET 0 /* extent descriptor */ +#define FSP_IBUF_BITMAP_OFFSET 1 /* insert buffer bitmap */ + /* The ibuf bitmap pages are the ones whose + page number is the number above plus a + multiple of XDES_DESCRIBED_PER_PAGE */ + +#define FSP_FIRST_INODE_PAGE_NO 2 /* in every tablespace */ + /* The following pages exist + in the system tablespace (space 0). */ +#define FSP_IBUF_HEADER_PAGE_NO 3 /* in tablespace 0 */ +#define FSP_IBUF_TREE_ROOT_PAGE_NO 4 /* in tablespace 0 */ + /* The ibuf tree root page number in + tablespace 0; its fseg inode is on the page + number FSP_FIRST_INODE_PAGE_NO */ +#define FSP_TRX_SYS_PAGE_NO 5 /* in tablespace 0 */ +#define FSP_FIRST_RSEG_PAGE_NO 6 /* in tablespace 0 */ +#define FSP_DICT_HDR_PAGE_NO 7 /* in tablespace 0 */ +/*--------------------------------------*/ + +#ifndef UNIV_NONINL +#include "fsp0fsp.ic" +#endif + +#endif diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic new file mode 100644 index 00000000000..f0301cc5e18 --- /dev/null +++ b/storage/xtradb/include/fsp0fsp.ic @@ -0,0 +1,44 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +File space management + +Created 12/18/1995 Heikki Tuuri +*******************************************************/ + +/*************************************************************************** +Checks if a page address is an extent descriptor page address. */ +UNIV_INLINE +ibool +fsp_descr_page( +/*===========*/ + /* out: TRUE if a descriptor page */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/* in: page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1)) + == FSP_XDES_OFFSET)); + } + + return(UNIV_UNLIKELY((page_no & (zip_size - 1)) == FSP_XDES_OFFSET)); +} diff --git a/storage/xtradb/include/fut0fut.h b/storage/xtradb/include/fut0fut.h new file mode 100644 index 00000000000..4de0c97294c --- /dev/null +++ b/storage/xtradb/include/fut0fut.h @@ -0,0 +1,54 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + + +#ifndef fut0fut_h +#define fut0fut_h + +#include "univ.i" + +#include "fil0fil.h" +#include "mtr0mtr.h" + +/************************************************************************ +Gets a pointer to a file address and latches the page. 
*/ +UNIV_INLINE +byte* +fut_get_ptr( +/*========*/ + /* out: pointer to a byte in a frame; the file + page in the frame is bufferfixed and latched */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + fil_addr_t addr, /* in: file address */ + ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH */ + mtr_t* mtr); /* in: mtr handle */ + +#ifndef UNIV_NONINL +#include "fut0fut.ic" +#endif + +#endif + diff --git a/storage/xtradb/include/fut0fut.ic b/storage/xtradb/include/fut0fut.ic new file mode 100644 index 00000000000..f7e820da008 --- /dev/null +++ b/storage/xtradb/include/fut0fut.ic @@ -0,0 +1,55 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +File-based utilities + +Created 12/13/1995 Heikki Tuuri +***********************************************************************/ + +#include "sync0rw.h" +#include "buf0buf.h" + +/************************************************************************ +Gets a pointer to a file address and latches the page. */ +UNIV_INLINE +byte* +fut_get_ptr( +/*========*/ + /* out: pointer to a byte in a frame; the file + page in the frame is bufferfixed and latched */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + fil_addr_t addr, /* in: file address */ + ulint rw_latch, /* in: RW_S_LATCH, RW_X_LATCH */ + mtr_t* mtr) /* in: mtr handle */ +{ + buf_block_t* block; + byte* ptr; + + ut_ad(addr.boffset < UNIV_PAGE_SIZE); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); + + block = buf_page_get(space, zip_size, addr.page, rw_latch, mtr); + ptr = buf_block_get_frame(block) + addr.boffset; + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + return(ptr); +} diff --git a/storage/xtradb/include/fut0lst.h b/storage/xtradb/include/fut0lst.h new file mode 100644 index 00000000000..f812874fe00 --- /dev/null +++ b/storage/xtradb/include/fut0lst.h @@ -0,0 +1,214 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef fut0lst_h +#define fut0lst_h + +#include "univ.i" + +#include "fil0fil.h" +#include "mtr0mtr.h" + + +/* The C 'types' of base node and list node: these should be used to +write self-documenting code. Of course, the sizeof macro cannot be +applied to these types! */ + +typedef byte flst_base_node_t; +typedef byte flst_node_t; + +/* The physical size of a list base node in bytes */ +#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE) + +/* The physical size of a list node in bytes */ +#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE) + + +/************************************************************************ +Initializes a list base node. */ +UNIV_INLINE +void +flst_init( +/*======*/ + flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Adds a node as the last node in a list. */ +UNIV_INTERN +void +flst_add_last( +/*==========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node, /* in: node to add */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Adds a node as the first node in a list. */ +UNIV_INTERN +void +flst_add_first( +/*===========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node, /* in: node to add */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Inserts a node after another in a list. */ +UNIV_INTERN +void +flst_insert_after( +/*==============*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node1, /* in: node to insert after */ + flst_node_t* node2, /* in: node to add */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Inserts a node before another in a list. */ +UNIV_INTERN +void +flst_insert_before( +/*===============*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: node to insert */ + flst_node_t* node3, /* in: node to insert before */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Removes a node. */ +UNIV_INTERN +void +flst_remove( +/*========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: node to remove */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Cuts off the tail of the list, including the node given. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. 
*/ +UNIV_INTERN +void +flst_cut_end( +/*=========*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: first node to remove */ + ulint n_nodes,/* in: number of nodes to remove, + must be >= 1 */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Cuts off the tail of the list, not including the given node. The number of +nodes which will be removed must be provided by the caller, as this function +does not measure the length of the tail. */ +UNIV_INTERN +void +flst_truncate_end( +/*==============*/ + flst_base_node_t* base, /* in: pointer to base node of list */ + flst_node_t* node2, /* in: first node not to remove */ + ulint n_nodes,/* in: number of nodes to remove */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Gets list length. */ +UNIV_INLINE +ulint +flst_get_len( +/*=========*/ + /* out: length */ + const flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Gets list first node address. */ +UNIV_INLINE +fil_addr_t +flst_get_first( +/*===========*/ + /* out: file address */ + const flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Gets list last node address. */ +UNIV_INLINE +fil_addr_t +flst_get_last( +/*==========*/ + /* out: file address */ + const flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Gets list next node address. */ +UNIV_INLINE +fil_addr_t +flst_get_next_addr( +/*===============*/ + /* out: file address */ + const flst_node_t* node, /* in: pointer to node */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Gets list prev node address. */ +UNIV_INLINE +fil_addr_t +flst_get_prev_addr( +/*===============*/ + /* out: file address */ + const flst_node_t* node, /* in: pointer to node */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Writes a file address. */ +UNIV_INLINE +void +flst_write_addr( +/*============*/ + fil_faddr_t* faddr, /* in: pointer to file faddress */ + fil_addr_t addr, /* in: file address */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Reads a file address. */ +UNIV_INLINE +fil_addr_t +flst_read_addr( +/*===========*/ + /* out: file address */ + const fil_faddr_t* faddr, /* in: pointer to file faddress */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************ +Validates a file-based list. */ +UNIV_INTERN +ibool +flst_validate( +/*==========*/ + /* out: TRUE if ok */ + const flst_base_node_t* base, /* in: pointer to base node of list */ + mtr_t* mtr1); /* in: mtr */ +/************************************************************************ +Prints info of a file-based list. 
*/ +UNIV_INTERN +void +flst_print( +/*=======*/ + const flst_base_node_t* base, /* in: pointer to base node of list */ + mtr_t* mtr); /* in: mtr */ + + +#ifndef UNIV_NONINL +#include "fut0lst.ic" +#endif + +#endif diff --git a/storage/xtradb/include/fut0lst.ic b/storage/xtradb/include/fut0lst.ic new file mode 100644 index 00000000000..5899e996059 --- /dev/null +++ b/storage/xtradb/include/fut0lst.ic @@ -0,0 +1,166 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0fut.h" +#include "mtr0log.h" +#include "buf0buf.h" + +/* We define the field offsets of a node for the list */ +#define FLST_PREV 0 /* 6-byte address of the previous list element; + the page part of address is FIL_NULL, if no + previous element */ +#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next + list element; the page part of address + is FIL_NULL, if no next element */ + +/* We define the field offsets of a base node for the list */ +#define FLST_LEN 0 /* 32-bit list length field */ +#define FLST_FIRST 4 /* 6-byte address of the first element + of the list; undefined if empty list */ +#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the + last element of the list; undefined + if empty list */ + +/************************************************************************ +Writes a file address. */ +UNIV_INLINE +void +flst_write_addr( +/*============*/ + fil_faddr_t* faddr, /* in: pointer to file faddress */ + fil_addr_t addr, /* in: file address */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ut_ad(faddr && mtr); + ut_ad(mtr_memo_contains_page(mtr, faddr, MTR_MEMO_PAGE_X_FIX)); + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + + mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr); + mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset, + MLOG_2BYTES, mtr); +} + +/************************************************************************ +Reads a file address. 
*/ +UNIV_INLINE +fil_addr_t +flst_read_addr( +/*===========*/ + /* out: file address */ + const fil_faddr_t* faddr, /* in: pointer to file faddress */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + fil_addr_t addr; + + ut_ad(faddr && mtr); + + addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr); + addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES, + mtr); + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA); + return(addr); +} + +/************************************************************************ +Initializes a list base node. */ +UNIV_INLINE +void +flst_init( +/*======*/ + flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); + + mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr); + flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); + flst_write_addr(base + FLST_LAST, fil_addr_null, mtr); +} + +/************************************************************************ +Gets list length. */ +UNIV_INLINE +ulint +flst_get_len( +/*=========*/ + /* out: length */ + const flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + return(mtr_read_ulint(base + FLST_LEN, MLOG_4BYTES, mtr)); +} + +/************************************************************************ +Gets list first node address. */ +UNIV_INLINE +fil_addr_t +flst_get_first( +/*===========*/ + /* out: file address */ + const flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + return(flst_read_addr(base + FLST_FIRST, mtr)); +} + +/************************************************************************ +Gets list last node address. */ +UNIV_INLINE +fil_addr_t +flst_get_last( +/*==========*/ + /* out: file address */ + const flst_base_node_t* base, /* in: pointer to base node */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + return(flst_read_addr(base + FLST_LAST, mtr)); +} + +/************************************************************************ +Gets list next node address. */ +UNIV_INLINE +fil_addr_t +flst_get_next_addr( +/*===============*/ + /* out: file address */ + const flst_node_t* node, /* in: pointer to node */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + return(flst_read_addr(node + FLST_NEXT, mtr)); +} + +/************************************************************************ +Gets list prev node address. */ +UNIV_INLINE +fil_addr_t +flst_get_prev_addr( +/*===============*/ + /* out: file address */ + const flst_node_t* node, /* in: pointer to node */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + return(flst_read_addr(node + FLST_PREV, mtr)); +} diff --git a/storage/xtradb/include/ha0ha.h b/storage/xtradb/include/ha0ha.h new file mode 100644 index 00000000000..768f3d7aca3 --- /dev/null +++ b/storage/xtradb/include/ha0ha.h @@ -0,0 +1,188 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
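
On the page, the 6-byte file address handled by flst_write_addr()/flst_read_addr() is a 4-byte page number at offset FIL_ADDR_PAGE followed by a 2-byte in-page offset at FIL_ADDR_BYTE, stored big-endian as InnoDB's mach_write_to_4()/mach_write_to_2() routines do. A self-contained sketch of that encoding, leaving out the mini-transaction logging the real functions go through:

#include <stdio.h>

#define FIL_ADDR_PAGE 0   /* offset of the page number in the address */
#define FIL_ADDR_BYTE 4   /* offset of the in-page byte offset */
#define FIL_ADDR_SIZE 6

typedef struct { unsigned long page; unsigned boffset; } addr_t;

/* Big-endian store, as the mach_write_to_* helpers do. */
static void write_addr(unsigned char* faddr, addr_t addr)
{
    faddr[FIL_ADDR_PAGE + 0] = (unsigned char)(addr.page >> 24);
    faddr[FIL_ADDR_PAGE + 1] = (unsigned char)(addr.page >> 16);
    faddr[FIL_ADDR_PAGE + 2] = (unsigned char)(addr.page >> 8);
    faddr[FIL_ADDR_PAGE + 3] = (unsigned char)(addr.page);
    faddr[FIL_ADDR_BYTE + 0] = (unsigned char)(addr.boffset >> 8);
    faddr[FIL_ADDR_BYTE + 1] = (unsigned char)(addr.boffset);
}

static addr_t read_addr(const unsigned char* faddr)
{
    addr_t a;

    a.page = ((unsigned long) faddr[0] << 24)
           | ((unsigned long) faddr[1] << 16)
           | ((unsigned long) faddr[2] << 8)
           | faddr[3];
    a.boffset = (faddr[4] << 8) | faddr[5];
    return a;
}

int main(void)
{
    unsigned char buf[FIL_ADDR_SIZE];
    addr_t in = { 4711, 120 }, out;

    write_addr(buf, in);
    out = read_addr(buf);
    printf("page=%lu boffset=%u\n", out.page, out.boffset);
    return 0;
}
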
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The hash table with external chains + +Created 8/18/1994 Heikki Tuuri +*******************************************************/ + +#ifndef ha0ha_h +#define ha0ha_h + +#include "univ.i" + +#include "hash0hash.h" +#include "page0types.h" +#include "buf0types.h" + +/***************************************************************** +Looks for an element in a hash table. */ +UNIV_INLINE +void* +ha_search_and_get_data( +/*===================*/ + /* out: pointer to the data of the first hash + table node in chain having the fold number, + NULL if not found */ + hash_table_t* table, /* in: hash table */ + ulint fold); /* in: folded value of the searched data */ +/************************************************************* +Looks for an element when we know the pointer to the data and updates +the pointer to data if found. */ +UNIV_INTERN +void +ha_search_and_update_if_found_func( +/*===============================*/ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of the searched data */ + void* data, /* in: pointer to the data */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* new_block,/* in: block containing new_data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + void* new_data);/* in: new pointer to the data */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found_func(table,fold,data,new_block,new_data) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \ + ha_search_and_update_if_found_func(table,fold,data,new_data) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/***************************************************************** +Creates a hash table with >= n array cells. The actual number of cells is +chosen to be a prime number slightly bigger than n. */ +UNIV_INTERN +hash_table_t* +ha_create_func( +/*===========*/ + /* out, own: created table */ + ulint n, /* in: number of array cells */ +#ifdef UNIV_SYNC_DEBUG + ulint mutex_level, /* in: level of the mutexes in the latching + order: this is used in the debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_mutexes); /* in: number of mutexes to protect the + hash table: must be a power of 2 */ +#ifdef UNIV_SYNC_DEBUG +# define ha_create(n_c,n_m,level) ha_create_func(n_c,level,n_m) +#else /* UNIV_SYNC_DEBUG */ +# define ha_create(n_c,n_m,level) ha_create_func(n_c,n_m) +#endif /* UNIV_SYNC_DEBUG */ + +/***************************************************************** +Empties a hash table and frees the memory heaps. */ +UNIV_INTERN +void +ha_clear( +/*=====*/ + hash_table_t* table); /* in, own: hash table */ + +/***************************************************************** +Inserts an entry into a hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. 
*/ +UNIV_INTERN +ibool +ha_insert_for_fold_func( +/*====================*/ + /* out: TRUE if succeed, FALSE if no more + memory could be allocated */ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of data; if a node with + the same fold value already exists, it is + updated to point to the same data, and no new + node is created! */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /* in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + void* data); /* in: data, must not be NULL */ + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,b,d) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,d) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/***************************************************************** +Deletes an entry from a hash table. */ +UNIV_INTERN +void +ha_delete( +/*======*/ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of data */ + void* data); /* in: data, must not be NULL and must exist + in the hash table */ +/************************************************************* +Looks for an element when we know the pointer to the data and deletes +it from the hash table if found. */ +UNIV_INLINE +ibool +ha_search_and_delete_if_found( +/*==========================*/ + /* out: TRUE if found */ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of the searched data */ + void* data); /* in: pointer to the data */ +/********************************************************************* +Removes from the chain determined by fold all nodes whose data pointer +points to the page given. */ +UNIV_INTERN +void +ha_remove_all_nodes_to_page( +/*========================*/ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: fold value */ + const page_t* page); /* in: buffer page */ +/***************************************************************** +Validates a given range of the cells in hash table. */ +UNIV_INTERN +ibool +ha_validate( +/*========*/ + /* out: TRUE if ok */ + hash_table_t* table, /* in: hash table */ + ulint start_index, /* in: start index */ + ulint end_index); /* in: end index */ +/***************************************************************** +Prints info of a hash table. */ +UNIV_INTERN +void +ha_print_info( +/*==========*/ + FILE* file, /* in: file where to print */ + hash_table_t* table); /* in: hash table */ + +/* The hash table external chain node */ + +typedef struct ha_node_struct ha_node_t; +struct ha_node_struct { + ha_node_t* next; /* next chain node or NULL if none */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block; /* buffer block containing the data, or NULL */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + void* data; /* pointer to the data */ + ulint fold; /* fold value for the data */ +}; + +#ifndef UNIV_NONINL +#include "ha0ha.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ha0ha.ic b/storage/xtradb/include/ha0ha.ic new file mode 100644 index 00000000000..35fd802eaef --- /dev/null +++ b/storage/xtradb/include/ha0ha.ic @@ -0,0 +1,214 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. 
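
As the comment on ha_insert_for_fold() above notes, inserting under a fold value that is already present updates the existing node's data pointer instead of chaining a second node. A standalone model of that behavior, with plain malloc and a fixed cell array in place of hash_table_t, its mutexes, and its heaps:

#include <stdio.h>
#include <stdlib.h>

typedef struct node {
    struct node*  next;   /* chain link */
    unsigned long fold;   /* folded value */
    void*         data;   /* user data pointer */
} node_t;

#define N_CELLS 8
static node_t* cells[N_CELLS];

/* Insert data under fold; if a node with this fold exists, update it
   in place, as ha_insert_for_fold() does. */
static void insert_for_fold(unsigned long fold, void* data)
{
    node_t** cell = &cells[fold % N_CELLS];
    node_t*  n;

    for (n = *cell; n != NULL; n = n->next) {
        if (n->fold == fold) { n->data = data; return; }
    }
    n = malloc(sizeof *n);
    n->next = *cell; n->fold = fold; n->data = data;
    *cell = n;
}

static void* search_and_get_data(unsigned long fold)
{
    node_t* n;

    for (n = cells[fold % N_CELLS]; n != NULL; n = n->next)
        if (n->fold == fold) return n->data;
    return NULL;
}

int main(void)
{
    int a = 1, b = 2;

    insert_for_fold(42, &a);
    insert_for_fold(42, &b);  /* same fold: overwrites, no new node */
    printf("%d\n", *(int*) search_and_get_data(42));  /* prints 2 */
    return 0;
}

The real function differs in the details (it allocates nodes from per-cell memory heaps and can fail when memory runs out, hence its ibool return), but the update-in-place rule is the same.
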
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The hash table with external chains + +Created 8/18/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0rnd.h" +#include "mem0mem.h" + +/*************************************************************** +Deletes a hash node. */ +UNIV_INTERN +void +ha_delete_hash_node( +/*================*/ + hash_table_t* table, /* in: hash table */ + ha_node_t* del_node); /* in: node to be deleted */ + +/********************************************************************** +Gets a hash node data. */ +UNIV_INLINE +void* +ha_node_get_data( +/*=============*/ + /* out: pointer to the data */ + ha_node_t* node) /* in: hash chain node */ +{ + return(node->data); +} + +/********************************************************************** +Sets hash node data. */ +UNIV_INLINE +void +ha_node_set_data_func( +/*==================*/ + ha_node_t* node, /* in: hash chain node */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /* in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + void* data) /* in: pointer to the data */ +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data = data; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/********************************************************************** +Gets the next node in a hash chain. */ +UNIV_INLINE +ha_node_t* +ha_chain_get_next( +/*==============*/ + /* out: next node, NULL if none */ + ha_node_t* node) /* in: hash chain node */ +{ + return(node->next); +} + +/********************************************************************** +Gets the first node in a hash chain. */ +UNIV_INLINE +ha_node_t* +ha_chain_get_first( +/*===============*/ + /* out: first node, NULL if none */ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: fold value determining the chain */ +{ + return((ha_node_t*) + hash_get_nth_cell(table, hash_calc_hash(fold, table))->node); +} + +/***************************************************************** +Looks for an element in a hash table. 
*/ +UNIV_INLINE +ha_node_t* +ha_search( +/*======*/ + /* out: pointer to the first hash table node + in chain having the fold number, NULL if not + found */ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: folded value of the searched data */ +{ + ha_node_t* node; + + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->fold == fold) { + + return(node); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +/***************************************************************** +Looks for an element in a hash table. */ +UNIV_INLINE +void* +ha_search_and_get_data( +/*===================*/ + /* out: pointer to the data of the first hash + table node in chain having the fold number, + NULL if not found */ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: folded value of the searched data */ +{ + ha_node_t* node; + + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->fold == fold) { + + return(node->data); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +/************************************************************* +Looks for an element when we know the pointer to the data. */ +UNIV_INLINE +ha_node_t* +ha_search_with_data( +/*================*/ + /* out: pointer to the hash table node, NULL + if not found in the table */ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of the searched data */ + void* data) /* in: pointer to the data */ +{ + ha_node_t* node; + + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->data == data) { + + return(node); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +/************************************************************* +Looks for an element when we know the pointer to the data, and deletes +it from the hash table, if found. */ +UNIV_INLINE +ibool +ha_search_and_delete_if_found( +/*==========================*/ + /* out: TRUE if found */ + hash_table_t* table, /* in: hash table */ + ulint fold, /* in: folded value of the searched data */ + void* data) /* in: pointer to the data */ +{ + ha_node_t* node; + + ut_ad(!table->mutexes || mutex_own(hash_get_mutex(table, fold))); + + node = ha_search_with_data(table, fold, data); + + if (node) { + ha_delete_hash_node(table, node); + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/xtradb/include/ha0storage.h b/storage/xtradb/include/ha0storage.h new file mode 100644 index 00000000000..f5a3938f434 --- /dev/null +++ b/storage/xtradb/include/ha0storage.h @@ -0,0 +1,129 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
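
ha_search_and_delete_if_found() above first locates the node by its data pointer and then unlinks it via ha_delete_hash_node(). The unlink step is the classic singly-linked-chain deletion; a compact standalone rendering using a pointer-to-pointer walk (a stylistic choice for brevity, not the code InnoDB uses):

#include <stdio.h>
#include <stdlib.h>

typedef struct node {
    struct node*  next;
    unsigned long fold;
    void*         data;
} node_t;

/* Unlink and free the first node in the chain whose data pointer
   matches; return 1 if found, 0 otherwise (cf.
   ha_search_and_delete_if_found()). */
static int delete_if_found(node_t** chain, void* data)
{
    node_t** pp;

    for (pp = chain; *pp != NULL; pp = &(*pp)->next) {
        if ((*pp)->data == data) {
            node_t* victim = *pp;
            *pp = victim->next;  /* bypass the node in the chain */
            free(victim);
            return 1;
        }
    }
    return 0;
}

int main(void)
{
    int     x = 7;
    node_t* head = malloc(sizeof *head);

    head->next = NULL; head->fold = 99; head->data = &x;
    printf("deleted=%d\n", delete_if_found(&head, &x));  /* 1 */
    printf("deleted=%d\n", delete_if_found(&head, &x));  /* 0 */
    return 0;
}
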
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#ifndef ha0storage_h +#define ha0storage_h + +#include "univ.i" + +/* This value is used by default by ha_storage_create(). More memory +is allocated later when/if it is needed. */ +#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024 + +/* This value is used by default by ha_storage_create(). It is a +constant per ha_storage's lifetime. */ +#define HA_STORAGE_DEFAULT_HASH_CELLS 4096 + +typedef struct ha_storage_struct ha_storage_t; + +/*********************************************************************** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + /* out, own: hash storage */ + ulint initial_heap_bytes, /* in: initial heap's size */ + ulint initial_hash_cells); /* in: initial number of cells + in the hash table */ + +/*********************************************************************** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". */ + +const void* +ha_storage_put_memlim( +/*==================*/ + /* out: pointer to the copy */ + ha_storage_t* storage, /* in/out: hash storage */ + const void* data, /* in: data to store */ + ulint data_len, /* in: data length */ + ulint memlim); /* in: memory limit to obey */ + +/*********************************************************************** +Same as ha_storage_put_memlim() but without memory limit. */ + +#define ha_storage_put(storage, data, data_len) \ + ha_storage_put_memlim((storage), (data), (data_len), 0) + +/*********************************************************************** +Copies string into the storage and returns a pointer to the copy. If the +same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. */ + +#define ha_storage_put_str(storage, str) \ + ((const char*) ha_storage_put((storage), (str), strlen(str) + 1)) + +/*********************************************************************** +Copies string into the storage and returns a pointer to the copy obeying +a memory limit. */ + +#define ha_storage_put_str_memlim(storage, str, memlim) \ + ((const char*) ha_storage_put_memlim((storage), (str), \ + strlen(str) + 1, (memlim))) + +/*********************************************************************** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. 
*/
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+	ha_storage_t**	storage);	/* in/out: hash storage */
+
+/***********************************************************************
+Frees a hash storage and everything it contains; it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put().
+*/
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+	ha_storage_t*	storage);	/* in/out: hash storage */
+
+/***********************************************************************
+Gets the size of the memory used by a storage. */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+					/* out: bytes used */
+	const ha_storage_t*	storage);	/* in: hash storage */
+
+#ifndef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+#endif /* ha0storage_h */
diff --git a/storage/xtradb/include/ha0storage.ic b/storage/xtradb/include/ha0storage.ic
new file mode 100644
index 00000000000..7ab43bc00ba
--- /dev/null
+++ b/storage/xtradb/include/ha0storage.ic
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+struct ha_storage_struct {
+	mem_heap_t*	heap;	/* storage from which memory is
+				allocated */
+	hash_table_t*	hash;	/* hash table used to avoid
+				duplicates */
+};
+
+/* Objects of this type are put in the hash */
+typedef struct ha_storage_node_struct	ha_storage_node_t;
+struct ha_storage_node_struct {
+	ulint			data_len;/* length of the data */
+	const void*		data;	/* pointer to data */
+	ha_storage_node_t*	next;	/* next node in hash chain */
+};
+
+/***********************************************************************
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
*/
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+					/* out, own: hash storage */
+	ulint	initial_heap_bytes,	/* in: initial heap's size */
+	ulint	initial_hash_cells)	/* in: initial number of cells
+					in the hash table */
+{
+	ha_storage_t*	storage;
+	mem_heap_t*	heap;
+
+	if (initial_heap_bytes == 0) {
+
+		initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES;
+	}
+
+	if (initial_hash_cells == 0) {
+
+		initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS;
+	}
+
+	/* we put "storage" within "storage->heap" */
+
+	heap = mem_heap_create(sizeof(ha_storage_t)
+			       + initial_heap_bytes);
+
+	storage = (ha_storage_t*) mem_heap_alloc(heap,
+						 sizeof(ha_storage_t));
+
+	storage->heap = heap;
+	storage->hash = hash_create(initial_hash_cells);
+
+	return(storage);
+}
+
+/***********************************************************************
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+	ha_storage_t**	storage)	/* in/out: hash storage */
+{
+	ha_storage_t	temp_storage;
+
+	temp_storage.heap = (*storage)->heap;
+	temp_storage.hash = (*storage)->hash;
+
+	hash_table_clear(temp_storage.hash);
+	mem_heap_empty(temp_storage.heap);
+
+	*storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap,
+						  sizeof(ha_storage_t));
+
+	(*storage)->heap = temp_storage.heap;
+	(*storage)->hash = temp_storage.hash;
+}
+
+/***********************************************************************
+Frees a hash storage and everything it contains; it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put().
+*/
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+	ha_storage_t*	storage)	/* in/out: hash storage */
+{
+	/* order is important because the pointer storage->hash is
+	within the heap */
+	hash_table_free(storage->hash);
+	mem_heap_free(storage->heap);
+}
+
+/***********************************************************************
+Gets the size of the memory used by a storage. */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+					/* out: bytes used */
+	const ha_storage_t*	storage)	/* in: hash storage */
+{
+	ulint	ret;
+
+	ret = mem_heap_get_size(storage->heap);
+
+	/* this assumes hash->heap and hash->heaps are NULL */
+	ret += sizeof(hash_table_t);
+	ret += sizeof(hash_cell_t) * hash_get_n_cells(storage->hash);
+
+	return(ret);
+}
diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h
new file mode 100644
index 00000000000..116242b32e4
--- /dev/null
+++ b/storage/xtradb/include/ha_prototypes.h
@@ -0,0 +1,249 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
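
Putting the ha0storage pieces together: a put hashes the chunk's bytes, probes the chain for an equal chunk (same length, memcmp == 0), returns the existing copy if one is found, and otherwise stores a new copy unless doing so would exceed the memory limit. A standalone sketch of that logic, with malloc in place of mem_heap_t and an FNV-style hash standing in for the fold computation:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct chunk {
    struct chunk* next;     /* hash chain link */
    size_t        len;      /* length of the data */
    unsigned char data[];   /* the stored copy itself */
} chunk_t;

#define N_CELLS 64
static chunk_t* cells[N_CELLS];
static size_t   total_bytes;

static size_t fold(const void* p, size_t len)
{
    const unsigned char* s = p;
    size_t h = 2166136261u;

    while (len--) h = (h ^ *s++) * 16777619u;
    return h;
}

/* Return a stored copy of data, reusing an existing equal chunk if
   present; return NULL if storing would exceed memlim (0 = no limit).
   Cf. ha_storage_put_memlim(). */
static const void* storage_put_memlim(const void* data, size_t len,
                                      size_t memlim)
{
    chunk_t** cell = &cells[fold(data, len) % N_CELLS];
    chunk_t*  c;

    for (c = *cell; c != NULL; c = c->next)
        if (c->len == len && memcmp(c->data, data, len) == 0)
            return c->data;             /* duplicate: no new copy */
    if (memlim != 0 && total_bytes + len > memlim)
        return NULL;                    /* would exceed the limit */
    c = malloc(sizeof *c + len);
    c->next = *cell; c->len = len;
    memcpy(c->data, data, len);
    *cell = c;
    total_bytes += len;
    return c->data;
}

int main(void)
{
    const void* a = storage_put_memlim("abc", 4, 0);
    const void* b = storage_put_memlim("abc", 4, 0);

    printf("same pointer: %d\n", a == b);   /* 1: deduplicated */
    return 0;
}
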
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +#ifndef HA_INNODB_PROTOTYPES_H +#define HA_INNODB_PROTOTYPES_H + +#ifndef UNIV_HOTBACKUP + +#include "univ.i" /* ulint, uint */ +#include "m_ctype.h" /* CHARSET_INFO */ + +/* Prototypes for global functions in ha_innodb.cc that are called by +InnoDB's C-code. */ + +/************************************************************************* +Wrapper around MySQL's copy_and_convert function, see it for +documentation. */ +UNIV_INTERN +ulint +innobase_convert_string( +/*====================*/ + void* to, + ulint to_length, + CHARSET_INFO* to_cs, + const void* from, + ulint from_length, + CHARSET_INFO* from_cs, + uint* errors); + +/*********************************************************************** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). */ +UNIV_INTERN +ulint +innobase_raw_format( +/*================*/ + /* out: number of bytes + that were written */ + const char* data, /* in: raw data */ + ulint data_len, /* in: raw data length + in bytes */ + ulint charset_coll, /* in: charset collation */ + char* buf, /* out: output buffer */ + ulint buf_size); /* in: output buffer size + in bytes */ + +/********************************************************************* +Convert a table or index name to the MySQL system_charset_info (UTF-8) +and quote it if needed. */ +UNIV_INTERN +char* +innobase_convert_name( +/*==================*/ + /* out: pointer to the end of buf */ + char* buf, /* out: buffer for converted identifier */ + ulint buflen, /* in: length of buf, in bytes */ + const char* id, /* in: identifier to convert */ + ulint idlen, /* in: length of id, in bytes */ + void* thd, /* in: MySQL connection thread, or NULL */ + ibool table_id);/* in: TRUE=id is a table or database name; + FALSE=id is an index name */ + +/********************************************************************** +Returns true if the thread is the replication thread on the slave +server. Used in srv_conc_enter_innodb() to determine if the thread +should be allowed to enter InnoDB - the replication thread is treated +differently than other threads. Also used in +srv_conc_force_exit_innodb(). */ +UNIV_INTERN +ibool +thd_is_replication_slave_thread( +/*============================*/ + /* out: true if thd is the replication thread */ + void* thd); /* in: thread handle (THD*) */ + +/********************************************************************** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. 
*/
+UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+			/* out: true if non-transactional tables have
+			been edited */
+	void*	thd);	/* in: thread handle (THD*) */
+
+/*****************************************************************
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+	FILE*	f,		/* in: output stream */
+	void*	thd,		/* in: pointer to a MySQL THD object */
+	uint	max_query_len);	/* in: max query length to print, or 0 to
+				use the default max length */
+
+/******************************************************************
+Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. */
+UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+					/* out: DATA_BINARY,
+					DATA_VARCHAR, ... */
+	ulint*		unsigned_flag,	/* out: DATA_UNSIGNED if an
+					'unsigned type';
+					at least ENUM and SET,
+					and unsigned integer
+					types are 'unsigned types' */
+	const void*	field)		/* in: MySQL Field */
+	__attribute__((nonnull));
+
+/*****************************************************************
+If you want to print a thd that is not associated with the current thread,
+you must call this function before reserving the InnoDB kernel_mutex, to
+protect MySQL from setting thd->query NULL. If you print a thd of the current
+thread, we know that MySQL cannot modify thd->query, and it is not necessary
+to call this. Call innobase_mysql_end_print_arbitrary_thd() after you release
+the kernel_mutex. */
+UNIV_INTERN
+void
+innobase_mysql_prepare_print_arbitrary_thd(void);
+/*============================================*/
+
+/*****************************************************************
+Releases the mutex reserved by innobase_mysql_prepare_print_arbitrary_thd().
+In the InnoDB latching order, the mutex sits right above the
+kernel_mutex. In debug builds, we assert that the kernel_mutex is
+released before this function is invoked. */
+UNIV_INTERN
+void
+innobase_mysql_end_print_arbitrary_thd(void);
+/*========================================*/
+
+/**********************************************************************
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+	ulint	cset,		/* in: MySQL charset-collation code */
+	ulint*	mbminlen,	/* out: minimum length of a char (in bytes) */
+	ulint*	mbmaxlen);	/* out: maximum length of a char (in bytes) */
+
+/**********************************************************************
+Compares NUL-terminated UTF-8 strings case insensitively. */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+				/* out: 0 if a=b, <0 if a<b, >0 if a>b */
+	const char*	a,	/* in: first string to compare */
+	const char*	b);	/* in: second string to compare */
+
+/**********************************************************************
+Returns true if the thread is executing a SELECT statement. */
+
+ibool
+thd_is_select(
+/*==========*/
+				/* out: true if thd is executing SELECT */
+	const void*	thd);	/* in: thread handle (THD*) */
+
+/**********************************************************************
+Converts an identifier to a table name.
*/ +UNIV_INTERN +void +innobase_convert_from_table_id( +/*===========================*/ + struct charset_info_st* cs, /* in: the 'from' character set */ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len); /* in: length of 'to', in bytes; should + be at least 5 * strlen(to) + 1 */ +/********************************************************************** +Converts an identifier to UTF-8. */ +UNIV_INTERN +void +innobase_convert_from_id( +/*=====================*/ + struct charset_info_st* cs, /* in: the 'from' character set */ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len); /* in: length of 'to', in bytes; should + be at least 3 * strlen(to) + 1 */ +/********************************************************************** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +innobase_casedn_str( +/*================*/ + char* a); /* in/out: string to put in lower case */ + +/************************************************************************** +Determines the connection character set. */ +struct charset_info_st* +innobase_get_charset( +/*=================*/ + /* out: connection character set */ + void* mysql_thd); /* in: MySQL thread handle */ + +/********************************************************************** +Returns true if the thread supports XA, +global value of innodb_supports_xa if thd is NULL. */ + +ibool +thd_supports_xa( +/*============*/ + /* out: true if thd supports XA */ + void* thd); /* in: thread handle (THD*), or NULL to query + the global innodb_supports_xa */ + +/********************************************************************** +Returns the lock wait timeout for the current connection. */ + +ulong +thd_lock_wait_timeout( +/*==================*/ + /* out: the lock wait timeout, in seconds */ + void* thd); /* in: thread handle (THD*), or NULL to query + the global innodb_lock_wait_timeout */ + +#endif +#endif diff --git a/storage/xtradb/include/handler0alter.h b/storage/xtradb/include/handler0alter.h new file mode 100644 index 00000000000..69488b67b2b --- /dev/null +++ b/storage/xtradb/include/handler0alter.h @@ -0,0 +1,41 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Smart ALTER TABLE +*******************************************************/ + +/***************************************************************** +Copies an InnoDB record to table->record[0]. 
*/ +UNIV_INTERN +void +innobase_rec_to_mysql( +/*==================*/ + TABLE* table, /* in/out: MySQL table */ + const rec_t* rec, /* in: record */ + const dict_index_t* index, /* in: index */ + const ulint* offsets); /* in: rec_get_offsets( + rec, index, ...) */ + +/***************************************************************** +Resets table->record[0]. */ +UNIV_INTERN +void +innobase_rec_reset( +/*===============*/ + TABLE* table); /* in/out: MySQL table */ diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h new file mode 100644 index 00000000000..2b3eea62754 --- /dev/null +++ b/storage/xtradb/include/hash0hash.h @@ -0,0 +1,429 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#ifndef hash0hash_h +#define hash0hash_h + +#include "univ.i" +#include "mem0mem.h" +#include "sync0sync.h" + +typedef struct hash_table_struct hash_table_t; +typedef struct hash_cell_struct hash_cell_t; + +typedef void* hash_node_t; + +/* Fix Bug #13859: symbol collision between imap/mysql */ +#define hash_create hash0_create + +/***************************************************************** +Creates a hash table with >= n array cells. The actual number +of cells is chosen to be a prime number slightly bigger than n. */ +UNIV_INTERN +hash_table_t* +hash_create( +/*========*/ + /* out, own: created table */ + ulint n); /* in: number of array cells */ +/***************************************************************** +Creates a mutex array to protect a hash table. */ +UNIV_INTERN +void +hash_create_mutexes_func( +/*=====================*/ + hash_table_t* table, /* in: hash table */ +#ifdef UNIV_SYNC_DEBUG + ulint sync_level, /* in: latching order level of the + mutexes: used in the debug version */ +#endif /* UNIV_SYNC_DEBUG */ + ulint n_mutexes); /* in: number of mutexes */ +#ifdef UNIV_SYNC_DEBUG +# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,level,n) +#else /* UNIV_SYNC_DEBUG */ +# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,n) +#endif /* UNIV_SYNC_DEBUG */ + +/***************************************************************** +Frees a hash table. */ +UNIV_INTERN +void +hash_table_free( +/*============*/ + hash_table_t* table); /* in, own: hash table */ +/****************************************************************** +Calculates the hash value from a folded value. 
*/ +UNIV_INLINE +ulint +hash_calc_hash( +/*===========*/ + /* out: hashed value */ + ulint fold, /* in: folded value */ + hash_table_t* table); /* in: hash table */ +/************************************************************************ +Assert that the mutex for the table in a hash operation is owned. */ +#define HASH_ASSERT_OWNED(TABLE, FOLD) \ +ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD))); + +/*********************************************************************** +Inserts a struct to a hash table. */ + +#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + HASH_ASSERT_OWNED(TABLE, FOLD)\ +\ + (DATA)->NAME = NULL;\ +\ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ +\ + if (cell3333->node == NULL) {\ + cell3333->node = DATA;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != NULL) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + }\ +\ + struct3333->NAME = DATA;\ + }\ +} while (0) + +#ifdef UNIV_HASH_DEBUG +# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) +# define HASH_INVALIDATE(DATA, NAME) DATA->NAME = (void*) -1 +#else +# define HASH_ASSERT_VALID(DATA) do {} while (0) +# define HASH_INVALIDATE(DATA, NAME) do {} while (0) +#endif + +/*********************************************************************** +Deletes a struct from a hash table. */ + +#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + HASH_ASSERT_OWNED(TABLE, FOLD)\ +\ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ +\ + if (cell3333->node == DATA) {\ + HASH_ASSERT_VALID(DATA->NAME);\ + cell3333->node = DATA->NAME;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != DATA) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + ut_a(struct3333);\ + }\ +\ + struct3333->NAME = DATA->NAME;\ + }\ + HASH_INVALIDATE(DATA, NAME);\ +} while (0) + +/*********************************************************************** +Gets the first struct in a hash chain, NULL if none. */ + +#define HASH_GET_FIRST(TABLE, HASH_VAL)\ + (hash_get_nth_cell(TABLE, HASH_VAL)->node) + +/*********************************************************************** +Gets the next struct in a hash chain, NULL if none. */ + +#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME) + +/************************************************************************ +Looks for a struct in a hash table. */ +#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\ +{\ +\ + HASH_ASSERT_OWNED(TABLE, FOLD)\ +\ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\ + HASH_ASSERT_VALID(DATA);\ +\ + while ((DATA) != NULL) {\ + ASSERTION;\ + if (TEST) {\ + break;\ + } else {\ + HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\ + }\ + }\ +} + +/************************************************************************ +Looks for an item in all hash buckets. 
*/ +#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \ +do { \ + ulint i3333; \ + \ + for (i3333 = (TABLE)->n_cells; i3333--; ) { \ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \ + \ + while ((DATA) != NULL) { \ + HASH_ASSERT_VALID(DATA); \ + ASSERTION; \ + \ + if (TEST) { \ + break; \ + } \ + \ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \ + } \ + \ + if ((DATA) != NULL) { \ + break; \ + } \ + } \ +} while (0) + +/**************************************************************** +Gets the nth cell in a hash table. */ +UNIV_INLINE +hash_cell_t* +hash_get_nth_cell( +/*==============*/ + /* out: pointer to cell */ + hash_table_t* table, /* in: hash table */ + ulint n); /* in: cell index */ + +/***************************************************************** +Clears a hash table so that all the cells become empty. */ +UNIV_INLINE +void +hash_table_clear( +/*=============*/ + hash_table_t* table); /* in/out: hash table */ + +/***************************************************************** +Returns the number of cells in a hash table. */ +UNIV_INLINE +ulint +hash_get_n_cells( +/*=============*/ + /* out: number of cells */ + hash_table_t* table); /* in: table */ +/*********************************************************************** +Deletes a struct which is stored in the heap of the hash table, and compacts +the heap. The fold value must be stored in the struct NODE in a field named +'fold'. */ + +#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\ +do {\ + TYPE* node111;\ + TYPE* top_node111;\ + hash_cell_t* cell111;\ + ulint fold111;\ +\ + fold111 = (NODE)->fold;\ +\ + HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\ +\ + top_node111 = (TYPE*)mem_heap_get_top(\ + hash_get_heap(TABLE, fold111),\ + sizeof(TYPE));\ +\ + /* If the node to remove is not the top node in the heap, compact the\ + heap of nodes by moving the top node in the place of NODE. */\ +\ + if (NODE != top_node111) {\ +\ + /* Copy the top node in place of NODE */\ +\ + *(NODE) = *top_node111;\ +\ + cell111 = hash_get_nth_cell(TABLE,\ + hash_calc_hash(top_node111->fold, TABLE));\ +\ + /* Look for the pointer to the top node, to update it */\ +\ + if (cell111->node == top_node111) {\ + /* The top node is the first in the chain */\ +\ + cell111->node = NODE;\ + } else {\ + /* We have to look for the predecessor of the top\ + node */\ + node111 = cell111->node;\ +\ + while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\ +\ + node111 = HASH_GET_NEXT(NAME, node111);\ + }\ +\ + /* Now we have the predecessor node */\ +\ + node111->NAME = NODE;\ + }\ + }\ +\ + /* Free the space occupied by the top node */\ +\ + mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\ +} while (0) + +/******************************************************************** +Move all hash table entries from OLD_TABLE to NEW_TABLE.*/ + +#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \ +do {\ + ulint i2222;\ + ulint cell_count2222;\ +\ + cell_count2222 = hash_get_n_cells(OLD_TABLE);\ +\ + for (i2222 = 0; i2222 < cell_count2222; i2222++) {\ + NODE_TYPE* node2222 = HASH_GET_FIRST((OLD_TABLE), i2222);\ +\ + while (node2222) {\ + NODE_TYPE* next2222 = node2222->PTR_NAME;\ + ulint fold2222 = FOLD_FUNC(node2222);\ +\ + HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\ + fold2222, node2222);\ +\ + node2222 = next2222;\ + }\ + }\ +} while (0) + + +/**************************************************************** +Gets the mutex index for a fold value in a hash table. 
*/ +UNIV_INLINE +ulint +hash_get_mutex_no( +/*==============*/ + /* out: mutex number */ + hash_table_t* table, /* in: hash table */ + ulint fold); /* in: fold */ +/**************************************************************** +Gets the nth heap in a hash table. */ +UNIV_INLINE +mem_heap_t* +hash_get_nth_heap( +/*==============*/ + /* out: mem heap */ + hash_table_t* table, /* in: hash table */ + ulint i); /* in: index of the heap */ +/**************************************************************** +Gets the heap for a fold value in a hash table. */ +UNIV_INLINE +mem_heap_t* +hash_get_heap( +/*==========*/ + /* out: mem heap */ + hash_table_t* table, /* in: hash table */ + ulint fold); /* in: fold */ +/**************************************************************** +Gets the nth mutex in a hash table. */ +UNIV_INLINE +mutex_t* +hash_get_nth_mutex( +/*===============*/ + /* out: mutex */ + hash_table_t* table, /* in: hash table */ + ulint i); /* in: index of the mutex */ +/**************************************************************** +Gets the mutex for a fold value in a hash table. */ +UNIV_INLINE +mutex_t* +hash_get_mutex( +/*===========*/ + /* out: mutex */ + hash_table_t* table, /* in: hash table */ + ulint fold); /* in: fold */ +/**************************************************************** +Reserves the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_enter( +/*=============*/ + hash_table_t* table, /* in: hash table */ + ulint fold); /* in: fold */ +/**************************************************************** +Releases the mutex for a fold value in a hash table. */ +UNIV_INTERN +void +hash_mutex_exit( +/*============*/ + hash_table_t* table, /* in: hash table */ + ulint fold); /* in: fold */ +/**************************************************************** +Reserves all the mutexes of a hash table, in an ascending order. */ +UNIV_INTERN +void +hash_mutex_enter_all( +/*=================*/ + hash_table_t* table); /* in: hash table */ +/**************************************************************** +Releases all the mutexes of a hash table. */ +UNIV_INTERN +void +hash_mutex_exit_all( +/*================*/ + hash_table_t* table); /* in: hash table */ + + +struct hash_cell_struct{ + void* node; /* hash chain node, NULL if none */ +}; + +/* The hash table structure */ +struct hash_table_struct { +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ibool adaptive;/* TRUE if this is the hash table of the + adaptive hash index */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + ulint n_cells;/* number of cells in the hash table */ + hash_cell_t* array; /* pointer to cell array */ + ulint n_mutexes;/* if mutexes != NULL, then the number of + mutexes, must be a power of 2 */ + mutex_t* mutexes;/* NULL, or an array of mutexes used to + protect segments of the hash table */ + mem_heap_t** heaps; /* if this is non-NULL, hash chain nodes for + external chaining can be allocated from these + memory heaps; there are then n_mutexes many of + these heaps */ + mem_heap_t* heap; + ulint magic_n; +}; + +#define HASH_TABLE_MAGIC_N 76561114 + +#ifndef UNIV_NONINL +#include "hash0hash.ic" +#endif + +#endif diff --git a/storage/xtradb/include/hash0hash.ic b/storage/xtradb/include/hash0hash.ic new file mode 100644 index 00000000000..792fdcbf4f8 --- /dev/null +++ b/storage/xtradb/include/hash0hash.ic @@ -0,0 +1,160 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. 
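
The HASH_INSERT/HASH_DELETE macros above are intrusive: the caller's struct carries the chain pointer itself, and the macro is passed that field's name as the NAME argument. Roughly what HASH_INSERT expands to, written out for a hypothetical item type (the type and field names below are made up for the illustration):

#include <stdio.h>

/* The struct supplies its own chain pointer, as HASH_INSERT(TYPE, NAME, ...)
   expects; the field "hash" plays the role of NAME here. */
typedef struct item { struct item* hash; unsigned long id; } item_t;

typedef struct { void* node; } cell_t;   /* cf. hash_cell_struct */

#define N_CELLS 16
static cell_t cells[N_CELLS];

/* Roughly what HASH_INSERT(item_t, hash, table, fold, data) does:
   append the struct at the end of its cell's chain. */
static void hash_insert(unsigned long fold, item_t* data)
{
    cell_t* cell = &cells[fold % N_CELLS];
    item_t* p;

    data->hash = NULL;
    if (cell->node == NULL) {
        cell->node = data;
        return;
    }
    for (p = cell->node; p->hash != NULL; p = p->hash) {}
    p->hash = data;
}

int main(void)
{
    item_t a = { NULL, 1 }, b = { NULL, 17 };  /* 17 % 16 == 1: same cell */

    hash_insert(a.id, &a);
    hash_insert(b.id, &b);
    printf("chain in cell 1: %lu -> %lu\n",
           ((item_t*) cells[1].node)->id,
           ((item_t*) cells[1].node)->hash->id);
    return 0;
}

Because the link lives inside the element, no per-node allocation is needed at insert time, which is why the buffer pool and lock system can use these macros on preallocated structs.
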
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#include "ut0rnd.h" + +/**************************************************************** +Gets the nth cell in a hash table. */ +UNIV_INLINE +hash_cell_t* +hash_get_nth_cell( +/*==============*/ + /* out: pointer to cell */ + hash_table_t* table, /* in: hash table */ + ulint n) /* in: cell index */ +{ + ut_ad(n < table->n_cells); + + return(table->array + n); +} + +/***************************************************************** +Clears a hash table so that all the cells become empty. */ +UNIV_INLINE +void +hash_table_clear( +/*=============*/ + hash_table_t* table) /* in/out: hash table */ +{ + memset(table->array, 0x0, + table->n_cells * sizeof(*table->array)); +} + +/***************************************************************** +Returns the number of cells in a hash table. */ +UNIV_INLINE +ulint +hash_get_n_cells( +/*=============*/ + /* out: number of cells */ + hash_table_t* table) /* in: table */ +{ + return(table->n_cells); +} + +/****************************************************************** +Calculates the hash value from a folded value. */ +UNIV_INLINE +ulint +hash_calc_hash( +/*===========*/ + /* out: hashed value */ + ulint fold, /* in: folded value */ + hash_table_t* table) /* in: hash table */ +{ + return(ut_hash_ulint(fold, table->n_cells)); +} + +/**************************************************************** +Gets the mutex index for a fold value in a hash table. */ +UNIV_INLINE +ulint +hash_get_mutex_no( +/*==============*/ + /* out: mutex number */ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: fold */ +{ + ut_ad(ut_is_2pow(table->n_mutexes)); + return(ut_2pow_remainder(hash_calc_hash(fold, table), + table->n_mutexes)); +} + +/**************************************************************** +Gets the nth heap in a hash table. */ +UNIV_INLINE +mem_heap_t* +hash_get_nth_heap( +/*==============*/ + /* out: mem heap */ + hash_table_t* table, /* in: hash table */ + ulint i) /* in: index of the heap */ +{ + ut_ad(i < table->n_mutexes); + + return(table->heaps[i]); +} + +/**************************************************************** +Gets the heap for a fold value in a hash table. */ +UNIV_INLINE +mem_heap_t* +hash_get_heap( +/*==========*/ + /* out: mem heap */ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: fold */ +{ + ulint i; + + if (table->heap) { + return(table->heap); + } + + i = hash_get_mutex_no(table, fold); + + return(hash_get_nth_heap(table, i)); +} + +/**************************************************************** +Gets the nth mutex in a hash table. 
*/ +UNIV_INLINE +mutex_t* +hash_get_nth_mutex( +/*===============*/ + /* out: mutex */ + hash_table_t* table, /* in: hash table */ + ulint i) /* in: index of the mutex */ +{ + ut_ad(i < table->n_mutexes); + + return(table->mutexes + i); +} + +/**************************************************************** +Gets the mutex for a fold value in a hash table. */ +UNIV_INLINE +mutex_t* +hash_get_mutex( +/*===========*/ + /* out: mutex */ + hash_table_t* table, /* in: hash table */ + ulint fold) /* in: fold */ +{ + ulint i; + + i = hash_get_mutex_no(table, fold); + + return(hash_get_nth_mutex(table, i)); +} diff --git a/storage/xtradb/include/ibuf0ibuf.h b/storage/xtradb/include/ibuf0ibuf.h new file mode 100644 index 00000000000..41e2392cc4a --- /dev/null +++ b/storage/xtradb/include/ibuf0ibuf.h @@ -0,0 +1,369 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0ibuf_h +#define ibuf0ibuf_h + +#include "univ.i" + +#include "dict0mem.h" +#include "mtr0mtr.h" +#include "que0types.h" +#include "ibuf0types.h" +#include "fsp0fsp.h" + +/** Combinations of operations that can be buffered. Because the enum +values are used for indexing innobase_change_buffering_values[], they +should start at 0 and there should not be any gaps. */ +typedef enum { + IBUF_USE_NONE = 0, + IBUF_USE_INSERT, /* insert */ + + IBUF_USE_COUNT /* number of entries in ibuf_use_t */ +} ibuf_use_t; + +/** Operations that can currently be buffered. */ +extern ibuf_use_t ibuf_use; + +/** The insert buffer control structure */ +extern ibuf_t* ibuf; + +/* The purpose of the insert buffer is to reduce random disk access. +When we wish to insert a record into a non-unique secondary index and +the B-tree leaf page where the record belongs to is not in the buffer +pool, we insert the record into the insert buffer B-tree, indexed by +(space_id, page_no). When the page is eventually read into the buffer +pool, we look up the insert buffer B-tree for any modifications to the +page, and apply these upon the completion of the read operation. This +is called the insert buffer merge. */ + +/* The insert buffer merge must always succeed. To guarantee this, +the insert buffer subsystem keeps track of the free space in pages for +which it can buffer operations. Two bits per page in the insert +buffer bitmap indicate the available space in coarse increments. The +free bits in the insert buffer bitmap must never exceed the free space +on a page. 
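/* A minimal standalone sketch of the mutex partitioning that
hash_calc_hash(), hash_get_mutex_no() and hash_get_mutex() above
implement: a fold is hashed into the cell range, and the result is
reduced modulo the power-of-two mutex count, which ut_2pow_remainder()
can do with a bit-mask. The toy_ names and the mixing constant are
illustrative stand-ins for the real ut0rnd.h primitives, not the
authoritative definitions. */

#include <assert.h>
#include <stdio.h>

#define TOY_HASH_RANDOM_MASK	1463735687UL	/* assumed mixing constant */

/* stand-in for ut_hash_ulint(): mix the fold, then reduce it modulo
the number of cells */
static unsigned long
toy_hash_ulint(unsigned long fold, unsigned long n_cells)
{
	return((fold ^ TOY_HASH_RANDOM_MASK) % n_cells);
}

/* stand-in for hash_get_mutex_no(): n_mutexes must be a power of 2,
so the remainder operation reduces to a mask */
static unsigned long
toy_get_mutex_no(unsigned long fold, unsigned long n_cells,
		 unsigned long n_mutexes)
{
	assert(n_mutexes && !(n_mutexes & (n_mutexes - 1)));
	return(toy_hash_ulint(fold, n_cells) & (n_mutexes - 1));
}

int
main(void)
{
	unsigned long	fold;

	for (fold = 0; fold < 8; fold++) {
		printf("fold %lu -> mutex %lu\n",
		       fold, toy_get_mutex_no(fold, 1009, 8));
	}
	return(0);
}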
It is safe to decrement or reset the bits in the bitmap in +a mini-transaction that is committed before the mini-transaction that +affects the free space. It is unsafe to increment the bits in a +separately committed mini-transaction, because in crash recovery, the +free bits could momentarily be set too high. */ + +/********************************************************************** +Creates the insert buffer data structure at a database startup and +initializes the data structures for the insert buffer of each tablespace. */ +UNIV_INTERN +void +ibuf_init_at_db_start(void); +/*=======================*/ +/************************************************************************* +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ +UNIV_INTERN +void +ibuf_update_max_tablespace_id(void); +/*===============================*/ +/************************************************************************* +Initializes an ibuf bitmap page. */ +UNIV_INTERN +void +ibuf_bitmap_page_init( +/*==================*/ + buf_block_t* block, /* in: bitmap page */ + mtr_t* mtr); /* in: mtr */ +/**************************************************************************** +Resets the free bits of the page in the ibuf bitmap. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to decrement or reset the bits in the bitmap in a mini-transaction +that is committed before the mini-transaction that affects the free +space. */ +UNIV_INTERN +void +ibuf_reset_free_bits( +/*=================*/ + buf_block_t* block); /* in: index page; free bits are set to 0 + if the index is a non-clustered + non-unique, and page level is 0 */ +/**************************************************************************** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. */ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /* in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/* in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase);/* in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +/************************************************************************** +Updates the free bits for an uncompressed page to reflect the present +state. 
Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_low( +/*======================*/ + const buf_block_t* block, /* in: index page */ + ulint max_ins_size, /* in: value of + maximum insert size + with reorganize before + the latest operation + performed to the page */ + mtr_t* mtr); /* in/out: mtr */ +/************************************************************************** +Updates the free bits for a compressed page to reflect the present +state. Does this in the mtr given, which means that the latching +order rules virtually prevent any further operations for this OS +thread until mtr is committed. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is safe +to set the free bits in the same mini-transaction that updated the +page. */ +UNIV_INTERN +void +ibuf_update_free_bits_zip( +/*======================*/ + buf_block_t* block, /* in/out: index page */ + mtr_t* mtr); /* in/out: mtr */ +/************************************************************************** +Updates the free bits for the two pages to reflect the present state. +Does this in the mtr given, which means that the latching order rules +virtually prevent any further operations until mtr is committed. +NOTE: The free bits in the insert buffer bitmap must never exceed the +free space on a page. It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +UNIV_INTERN +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + buf_block_t* block1, /* in: index page */ + buf_block_t* block2, /* in: index page */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /* in: index where to insert */ + ulint ignore_sec_unique); /* in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +/********************************************************************** +Returns TRUE if the current OS thread is performing an insert buffer +routine. */ +UNIV_INTERN +ibool +ibuf_inside(void); +/*=============*/ + /* out: TRUE if inside an insert buffer routine: for instance, + a read-ahead of non-ibuf pages is then forbidden */ +/*************************************************************************** +Checks if a page address is an ibuf bitmap page (level 3 page) address. */ +UNIV_INLINE +ibool +ibuf_bitmap_page( +/*=============*/ + /* out: TRUE if a bitmap page */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no);/* in: page number */ +/*************************************************************************** +Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==TRUE. 
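/* The ibuf_bitmap_page() test declared above reduces to power-of-two
arithmetic: one bitmap page recurs every page-size pages of a
tablespace, at a fixed offset within each such group. A self-contained
sketch follows; the 16k page size and offset 1 are assumptions for
illustration, the real values come from univ.i and fsp0fsp.h. */

#include <stdio.h>

#define TOY_PAGE_SIZE		16384UL	/* assumed UNIV_PAGE_SIZE */
#define TOY_IBUF_BITMAP_OFFSET	1UL	/* assumed FSP_IBUF_BITMAP_OFFSET */

static int
toy_ibuf_bitmap_page(unsigned long page_no)
{
	return((page_no & (TOY_PAGE_SIZE - 1)) == TOY_IBUF_BITMAP_OFFSET);
}

int
main(void)
{
	printf("%d %d %d\n",
	       toy_ibuf_bitmap_page(1),		/* 1: first bitmap page */
	       toy_ibuf_bitmap_page(2),		/* 0: an ordinary page */
	       toy_ibuf_bitmap_page(16385));	/* 1: next bitmap page */
	return(0);
}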
*/ +UNIV_INTERN +ibool +ibuf_page( +/*======*/ + /* out: TRUE if level 2 or level 3 page */ + ulint space, /* in: space id */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint page_no,/* in: page number */ + mtr_t* mtr); /* in: mtr which will contain an x-latch to the + bitmap page if the page is not one of the fixed + address ibuf pages, or NULL, in which case a new + transaction is created. */ +/*************************************************************************** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call. */ +UNIV_INTERN +void +ibuf_free_excess_pages(void); +/*========================*/ +/************************************************************************* +Makes an index insert to the insert buffer, instead of directly to the disk +page, if this is possible. Does not do insert if the index is clustered +or unique. */ +UNIV_INTERN +ibool +ibuf_insert( +/*========*/ + /* out: TRUE if success */ + const dtuple_t* entry, /* in: index entry to insert */ + dict_index_t* index, /* in: index where to insert */ + ulint space, /* in: space id where to insert */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint page_no,/* in: page number where to insert */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +When an index page is read from a disk to the buffer pool, this function +inserts to the page the possible index entries buffered in the insert buffer. +The entries are deleted from the insert buffer. If the page is not read, but +created in the buffer pool, this function deletes its buffered entries from +the insert buffer; there can exist entries for such a page if the page +belonged to an index which subsequently was dropped. */ +UNIV_INTERN +void +ibuf_merge_or_delete_for_page( +/*==========================*/ + buf_block_t* block, /* in: if page has been read from + disk, pointer to the page x-latched, + else NULL */ + ulint space, /* in: space id of the index page */ + ulint page_no,/* in: page number of the index page */ + ulint zip_size,/* in: compressed page size in bytes, + or 0 */ + ibool update_ibuf_bitmap);/* in: normally this is set + to TRUE, but if we have deleted or are + deleting the tablespace, then we + naturally do not want to update a + non-existent bitmap page */ +/************************************************************************* +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! */ +UNIV_INTERN +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space); /* in: space id */ +/************************************************************************* +Contracts insert buffer trees by reading pages to the buffer pool. 
*/ +UNIV_INTERN +ulint +ibuf_contract( +/*==========*/ + /* out: a lower limit for the combined size in bytes + of entries which will be merged from ibuf trees to the + pages read, 0 if ibuf is empty */ + ibool sync); /* in: TRUE if the caller wants to wait for the + issued read with the highest tablespace address + to complete */ +/************************************************************************* +Contracts insert buffer trees by reading pages to the buffer pool. */ +UNIV_INTERN +ulint +ibuf_contract_for_n_pages( +/*======================*/ + /* out: a lower limit for the combined size in bytes + of entries which will be merged from ibuf trees to the + pages read, 0 if ibuf is empty */ + ibool sync, /* in: TRUE if the caller wants to wait for the + issued read with the highest tablespace address + to complete */ + ulint n_pages);/* in: try to read at least this many pages to + the buffer pool and merge the ibuf contents to + them */ +/************************************************************************* +Parses a redo log record of an ibuf bitmap page init. */ +UNIV_INTERN +byte* +ibuf_parse_bitmap_init( +/*===================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: block or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ +#ifdef UNIV_IBUF_COUNT_DEBUG +/********************************************************************** +Gets the ibuf count for a given page. */ +UNIV_INTERN +ulint +ibuf_count_get( +/*===========*/ + /* out: number of entries in the insert buffer + currently buffered for this page */ + ulint space, /* in: space id */ + ulint page_no);/* in: page number */ +#endif +/********************************************************************** +Looks if the insert buffer is empty. */ +UNIV_INTERN +ibool +ibuf_is_empty(void); +/*===============*/ + /* out: TRUE if empty */ +/********************************************************************** +Prints info of ibuf. */ +UNIV_INTERN +void +ibuf_print( +/*=======*/ + FILE* file); /* in: file where to print */ + +#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO +#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO + +/* The ibuf header page currently contains only the file segment header +for the file segment from which the pages for the ibuf tree are allocated */ +#define IBUF_HEADER PAGE_DATA +#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */ + +/* The insert buffer tree itself is always located in space 0. */ +#define IBUF_SPACE_ID 0 + +#ifndef UNIV_NONINL +#include "ibuf0ibuf.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ibuf0ibuf.ic b/storage/xtradb/include/ibuf0ibuf.ic new file mode 100644 index 00000000000..170e5dba473 --- /dev/null +++ b/storage/xtradb/include/ibuf0ibuf.ic @@ -0,0 +1,325 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Insert buffer + +Created 7/19/1997 Heikki Tuuri +*******************************************************/ + +#include "buf0lru.h" +#include "page0page.h" +#include "page0zip.h" + +extern ulint ibuf_flush_count; + +/* If this number is n, an index page must contain at least the page size +per n bytes of free space for ibuf to try to buffer inserts to this page. +If there is this much of free space, the corresponding bits are set in the +ibuf bitmap. */ +#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32 + +/* Insert buffer struct */ + +struct ibuf_struct{ + ulint size; /* current size of the ibuf index + tree, in pages */ + ulint max_size; /* recommended maximum size of the + ibuf index tree, in pages */ + ulint seg_size; /* allocated pages of the file + segment containing ibuf header and + tree */ + ibool empty; /* after an insert to the ibuf tree + is performed, this is set to FALSE, + and if a contract operation finds + the tree empty, this is set to + TRUE */ + ulint free_list_len; /* length of the free list */ + ulint height; /* tree height */ + dict_index_t* index; /* insert buffer index */ + + ulint n_inserts; /* number of inserts made to + the insert buffer */ + ulint n_merges; /* number of pages merged */ + ulint n_merged_recs; /* number of records merged */ +}; + +/**************************************************************************** +Sets the free bit of the page in the ibuf bitmap. This is done in a separate +mini-transaction, hence this operation does not restrict further work to only +ibuf bitmap operations, which would result if the latch to the bitmap page +were kept. */ +UNIV_INTERN +void +ibuf_set_free_bits_func( +/*====================*/ + buf_block_t* block, /* in: index page of a non-clustered index; + free bit is reset if page level is 0 */ +#ifdef UNIV_IBUF_DEBUG + ulint max_val,/* in: ULINT_UNDEFINED or a maximum + value which the bits must have before + setting; this is for debugging */ +#endif /* UNIV_IBUF_DEBUG */ + ulint val); /* in: value to set: < 4 */ +#ifdef UNIV_IBUF_DEBUG +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v) +#else /* UNIV_IBUF_DEBUG */ +# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v) +#endif /* UNIV_IBUF_DEBUG */ + +/************************************************************************** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /* in: index where to insert */ + ulint ignore_sec_unique) /* in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +{ + if (ibuf_use != IBUF_USE_NONE + && !dict_index_is_clust(index) + && (ignore_sec_unique || !dict_index_is_unique(index))) { + + ibuf_flush_count++; + + if (ibuf_flush_count % 4 == 0) { + + buf_LRU_try_free_flushed_blocks(); + } + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************************** +Checks if a page address is an ibuf bitmap page address. 
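/* A condensed, self-contained restatement of the ibuf_should_try()
gate just defined: buffering is attempted only for non-clustered
indexes, for unique secondary indexes only when the caller allows the
UNIQUE check to be ignored, and every fourth positive answer prompts
the buffer pool to free flushed blocks. toy_index replaces
dict_index_t purely for illustration. */

#include <stdio.h>

struct toy_index {
	int	clustered;	/* stands in for dict_index_is_clust() */
	int	unique;		/* stands in for dict_index_is_unique() */
};

static unsigned long	toy_flush_count; /* counterpart of ibuf_flush_count */

static int
toy_should_try(const struct toy_index* index, int ignore_sec_unique,
	       int ibuf_enabled)
{
	if (ibuf_enabled
	    && !index->clustered
	    && (ignore_sec_unique || !index->unique)) {

		if (++toy_flush_count % 4 == 0) {
			/* the real code calls
			buf_LRU_try_free_flushed_blocks() here */
		}

		return(1);
	}

	return(0);
}

int
main(void)
{
	struct toy_index	sec = {0, 0};
	struct toy_index	clust = {1, 0};

	printf("secondary: %d\n", toy_should_try(&sec, 0, 1));
	printf("clustered: %d\n", toy_should_try(&clust, 0, 1));
	return(0);
}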
*/ +UNIV_INLINE +ibool +ibuf_bitmap_page( +/*=============*/ + /* out: TRUE if a bitmap page */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint page_no)/* in: page number */ +{ + ut_ad(ut_is_2pow(zip_size)); + + if (!zip_size) { + return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1)) + == FSP_IBUF_BITMAP_OFFSET)); + } + + return(UNIV_UNLIKELY((page_no & (zip_size - 1)) + == FSP_IBUF_BITMAP_OFFSET)); +} + +/************************************************************************* +Translates the free space on a page to a value in the ibuf bitmap.*/ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_bits( +/*===========================*/ + /* out: value for ibuf bitmap bits */ + ulint zip_size, /* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint max_ins_size) /* in: maximum insert size after reorganize + for the page */ +{ + ulint n; + ut_ad(ut_is_2pow(zip_size)); + ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + ut_ad(zip_size <= UNIV_PAGE_SIZE); + + if (zip_size) { + n = max_ins_size + / (zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } else { + n = max_ins_size + / (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + if (n == 3) { + n = 2; + } + + if (n > 3) { + n = 3; + } + + return(n); +} + +/************************************************************************* +Translates the ibuf free bits to the free space on a page in bytes. */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_from_bits( +/*================================*/ + /* out: maximum insert size after reorganize for the + page */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint bits) /* in: value for ibuf bitmap bits */ +{ + ut_ad(bits < 4); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + ut_ad(zip_size <= UNIV_PAGE_SIZE); + + if (zip_size) { + if (bits == 3) { + return(4 * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + return(bits * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + if (bits == 3) { + return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); + } + + return(bits * (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE)); +} + +/************************************************************************* +Translates the free space on a compressed page to a value in the ibuf bitmap.*/ +UNIV_INLINE +ulint +ibuf_index_page_calc_free_zip( +/*==========================*/ + /* out: value for ibuf bitmap bits */ + ulint zip_size, + /* in: compressed page size in bytes */ + const buf_block_t* block) /* in: buffer block */ +{ + ulint max_ins_size; + const page_zip_des_t* page_zip; + lint zip_max_ins; + + ut_ad(zip_size == buf_block_get_zip_size(block)); + ut_ad(zip_size); + + max_ins_size = page_get_max_insert_size_after_reorganize( + buf_block_get_frame(block), 1); + + page_zip = buf_block_get_page_zip(block); + zip_max_ins = page_zip_max_ins_size(page_zip, + FALSE/* not clustered */); + + if (UNIV_UNLIKELY(zip_max_ins < 0)) { + return(0); + } else if (UNIV_LIKELY(max_ins_size > (ulint) zip_max_ins)) { + max_ins_size = (ulint) zip_max_ins; + } + + return(ibuf_index_page_calc_free_bits(zip_size, max_ins_size)); +} + +/************************************************************************* +Translates the free space on a page to a value in the ibuf bitmap.*/ +UNIV_INLINE +ulint +ibuf_index_page_calc_free( +/*======================*/ + /* out: value for ibuf bitmap bits */ + ulint zip_size,/* in: compressed page size in bytes; + 0 for 
uncompressed pages */ + const buf_block_t* block) /* in: buffer block */ +{ + ut_ad(zip_size == buf_block_get_zip_size(block)); + + if (!zip_size) { + ulint max_ins_size; + + max_ins_size = page_get_max_insert_size_after_reorganize( + buf_block_get_frame(block), 1); + + return(ibuf_index_page_calc_free_bits(0, max_ins_size)); + } else { + return(ibuf_index_page_calc_free_zip(zip_size, block)); + } +} + +/**************************************************************************** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. */ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /* in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/* in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase)/* in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +{ + ulint before; + ulint after; + + ut_ad(!buf_block_get_page_zip(block)); + + before = ibuf_index_page_calc_free_bits(0, max_ins_size); + + if (max_ins_size >= increase) { +#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE +# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE" +#endif + after = ibuf_index_page_calc_free_bits(0, max_ins_size + - increase); +#ifdef UNIV_IBUF_DEBUG + ut_a(after <= ibuf_index_page_calc_free(0, block)); +#endif + } else { + after = ibuf_index_page_calc_free(0, block); + } + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + if (before > after) { + ibuf_set_free_bits(block, after, before); + } +} diff --git a/storage/xtradb/include/ibuf0types.h b/storage/xtradb/include/ibuf0types.h new file mode 100644 index 00000000000..264415196a1 --- /dev/null +++ b/storage/xtradb/include/ibuf0types.h @@ -0,0 +1,30 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
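/* A worked example of the 2-bit free space encoding implemented by
ibuf_index_page_calc_free_bits() and
ibuf_index_page_calc_free_from_bits() above, assuming a 16k page, so
that one unit is UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE = 512
bytes. The raw quotient 3 is demoted to 2 because the bit value 3
decodes as 4 units: keeping it would overstate the free space, and the
encoding must stay a conservative lower bound. */

#include <stdio.h>

#define TOY_PAGE_SIZE	16384UL
#define TOY_UNIT	(TOY_PAGE_SIZE / 32)	/* 512 bytes per unit */

static unsigned long
toy_calc_free_bits(unsigned long max_ins_size)
{
	unsigned long	n = max_ins_size / TOY_UNIT;

	if (n == 3) {
		n = 2;	/* 3 would decode as 4 units: too optimistic */
	}
	if (n > 3) {
		n = 3;	/* saturate: at least 4 units are free */
	}
	return(n);
}

static unsigned long
toy_calc_free_from_bits(unsigned long bits)
{
	return(bits == 3 ? 4 * TOY_UNIT : bits * TOY_UNIT);
}

int
main(void)
{
	unsigned long	sizes[] = {100, 600, 1600, 3000, 9000};
	int		i;

	for (i = 0; i < 5; i++) {
		unsigned long	b = toy_calc_free_bits(sizes[i]);

		printf("%5lu bytes free -> bits %lu -> >= %4lu bytes\n",
		       sizes[i], b, toy_calc_free_from_bits(b));
	}
	return(0);
}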
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Insert buffer global types + +Created 7/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef ibuf0types_h +#define ibuf0types_h + +typedef struct ibuf_struct ibuf_t; + +#endif diff --git a/storage/xtradb/include/lock0iter.h b/storage/xtradb/include/lock0iter.h new file mode 100644 index 00000000000..3cd47bb95d2 --- /dev/null +++ b/storage/xtradb/include/lock0iter.h @@ -0,0 +1,68 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Lock queue iterator type and function prototypes. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0iter_h +#define lock0iter_h + +#include "univ.i" +#include "lock0types.h" + +typedef struct lock_queue_iterator_struct { + const lock_t* current_lock; + /* In case this is a record lock queue (not table lock queue) + then bit_no is the record number within the heap in which the + record is stored. */ + ulint bit_no; +} lock_queue_iterator_t; + +/*********************************************************************** +Initializes the lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). There is exactly one bit set in the bitmap + of a wait lock. */ +UNIV_INTERN +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /* out: iterator */ + const lock_t* lock, /* in: lock to start from */ + ulint bit_no);/* in: record number in the + heap */ + +/*********************************************************************** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +moved back one position (if non-NULL is returned).
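/* The declarations above support exactly one traversal shape: start at
a given lock and walk toward the front of its queue until NULL. The
miniature list below mirrors that shape with toy_ stand-ins; the real
queue is hash-chained and the real functions take bit_no into account,
which this sketch omits. */

#include <stdio.h>

struct toy_lock {
	const char*		name;
	const struct toy_lock*	prev;	/* toward the front of the queue */
};

struct toy_iter {
	const struct toy_lock*	current;
};

static void
toy_iter_reset(struct toy_iter* it, const struct toy_lock* lock)
{
	it->current = lock;
}

/* returns NULL when the current lock is the first in the queue,
otherwise moves back one position and returns the new current lock */
static const struct toy_lock*
toy_iter_get_prev(struct toy_iter* it)
{
	if (it->current && it->current->prev) {
		it->current = it->current->prev;
		return(it->current);
	}
	return(NULL);
}

int
main(void)
{
	struct toy_lock		a = {"granted S", NULL};
	struct toy_lock		b = {"granted X", &a};
	struct toy_lock		c = {"waiting X", &b};
	struct toy_iter		it;
	const struct toy_lock*	l;

	toy_iter_reset(&it, &c);
	while ((l = toy_iter_get_prev(&it)) != NULL) {
		printf("prev: %s\n", l->name);
	}
	return(0);
}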
*/ + +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + /* out: previous lock or NULL */ + lock_queue_iterator_t* iter); /* in/out: iterator */ + +#endif /* lock0iter_h */ diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h new file mode 100644 index 00000000000..2deeb804737 --- /dev/null +++ b/storage/xtradb/include/lock0lock.h @@ -0,0 +1,838 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0lock_h +#define lock0lock_h + +#include "univ.i" +#include "buf0types.h" +#include "trx0types.h" +#include "rem0types.h" +#include "dict0types.h" +#include "que0types.h" +#include "lock0types.h" +#include "read0types.h" +#include "hash0hash.h" +#include "ut0vec.h" + +#ifdef UNIV_DEBUG +extern ibool lock_print_waits; +#endif /* UNIV_DEBUG */ +/* Buffer for storing information about the most recent deadlock error */ +extern FILE* lock_latest_err_file; + +/************************************************************************* +Gets the size of a lock struct. */ +UNIV_INTERN +ulint +lock_get_size(void); +/*===============*/ + /* out: size in bytes */ +/************************************************************************* +Creates the lock system at database start. */ +UNIV_INTERN +void +lock_sys_create( +/*============*/ + ulint n_cells); /* in: number of slots in lock hash table */ +/************************************************************************* +Checks if some transaction has an implicit x-lock on a record in a clustered +index. */ +UNIV_INLINE +trx_t* +lock_clust_rec_some_has_impl( +/*=========================*/ + /* out: transaction which has the x-lock, or + NULL */ + const rec_t* rec, /* in: user record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ +/************************************************************************* +Gets the heap_no of the smallest user record on a page. */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + /* out: heap_no of smallest + user record, or + PAGE_HEAP_NO_SUPREMUM */ + const buf_block_t* block); /* in: buffer block */ +/***************************************************************** +Updates the lock table when we have reorganized a page. NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. 
*/ +UNIV_INTERN +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /* in: old index page, now + reorganized */ + const buf_block_t* oblock);/* in: copy of the old, not + reorganized page */ +/***************************************************************** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +UNIV_INTERN +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /* in: index page to move to */ + const buf_block_t* block, /* in: index page */ + const rec_t* rec); /* in: record on page: this + is the first record moved */ +/***************************************************************** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +UNIV_INTERN +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /* in: index page to move to */ + const buf_block_t* block, /* in: index page */ + const rec_t* rec, /* in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end); /* in: old + previous-to-last + record on new_page + before the records + were copied */ +/***************************************************************** +Updates the lock table when a page is split to the right. */ +UNIV_INTERN +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /* in: right page */ + const buf_block_t* left_block); /* in: left page */ +/***************************************************************** +Updates the lock table when a page is merged to the right. */ +UNIV_INTERN +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /* in: right page to + which merged */ + const rec_t* orig_succ, /* in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block); /* in: merged index + page which will be + discarded */ +/***************************************************************** +Updates the lock table when the root page is copied to another in +btr_root_raise_and_insert. Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +UNIV_INTERN +void +lock_update_root_raise( +/*===================*/ + const buf_block_t* block, /* in: index page to which copied */ + const buf_block_t* root); /* in: root page */ +/***************************************************************** +Updates the lock table when a page is copied to another and the original page +is removed from the chain of leaf pages, except if page is the root! */ +UNIV_INTERN +void +lock_update_copy_and_discard( +/*=========================*/ + const buf_block_t* new_block, /* in: index page to + which copied */ + const buf_block_t* block); /* in: index page; + NOT the root! */ +/***************************************************************** +Updates the lock table when a page is split to the left. */ +UNIV_INTERN +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /* in: right page */ + const buf_block_t* left_block); /* in: left page */ +/***************************************************************** +Updates the lock table when a page is merged to the left. 
*/ +UNIV_INTERN +void +lock_update_merge_left( +/*===================*/ + const buf_block_t* left_block, /* in: left page to + which merged */ + const rec_t* orig_pred, /* in: original predecessor + of supremum on the left page + before merge */ + const buf_block_t* right_block); /* in: merged index page + which will be discarded */ +/***************************************************************** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec. */ +UNIV_INTERN +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t* heir_block, /* in: block containing the + record which inherits */ + const buf_block_t* block, /* in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /* in: heap_no of the + inheriting record */ + ulint heap_no); /* in: heap_no of the + donating record */ +/***************************************************************** +Updates the lock table when a page is discarded. */ +UNIV_INTERN +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /* in: index page + which will inherit the locks */ + ulint heir_heap_no, /* in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block); /* in: index page + which will be discarded */ +/***************************************************************** +Updates the lock table when a new user record is inserted. */ +UNIV_INTERN +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec); /* in: the inserted record */ +/***************************************************************** +Updates the lock table when a record is removed. */ +UNIV_INTERN +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec); /* in: the record to be removed */ +/************************************************************************* +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is in such an update moved, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +UNIV_INTERN +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec); /* in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +/************************************************************************* +Restores the state of explicit lock requests on a single record, where the +state was stored on the infimum of the page. */ +UNIV_INTERN +void +lock_rec_restore_from_page_infimum( +/*===============================*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec, /* in: record whose lock state + is restored */ + const buf_block_t* donator);/* in: page (rec is not + necessarily on this page) + whose infimum stored the lock + state; lock bits are reset on + the infimum */ +/************************************************************************* +Returns TRUE if there are explicit record locks on a page. 
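/* The store/restore pair above implements a "dummy carrier" pattern:
when an update changes a record's size and the record must be moved,
its lock bits are parked on the page infimum and re-attached once the
record has landed. A toy model of just that bookkeeping, with one lock
byte per heap number and slot 0 acting as the infimum: */

#include <stdio.h>
#include <string.h>

struct toy_page {
	unsigned char	lock_bits[8];	/* slot 0 plays the infimum */
};

static void
toy_store_on_infimum(struct toy_page* p, int heap_no)
{
	p->lock_bits[0] = p->lock_bits[heap_no];
	p->lock_bits[heap_no] = 0;	/* bits are reset on the record */
}

static void
toy_restore_from_infimum(struct toy_page* p, int new_heap_no)
{
	p->lock_bits[new_heap_no] = p->lock_bits[0];
	p->lock_bits[0] = 0;		/* bits are reset on the infimum */
}

int
main(void)
{
	struct toy_page	p;

	memset(&p, 0, sizeof(p));
	p.lock_bits[3] = 0x5;		/* some locks on heap_no 3 */

	toy_store_on_infimum(&p, 3);
	/* ... record deleted and reinserted, now at heap_no 5 ... */
	toy_restore_from_infimum(&p, 5);

	printf("heap_no 5 lock bits: 0x%x\n", (unsigned) p.lock_bits[5]);
	return(0);
}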
*/ +UNIV_INTERN +ibool +lock_rec_expl_exist_on_page( +/*========================*/ + /* out: TRUE if there are explicit record locks on + the page */ + ulint space, /* in: space id */ + ulint page_no);/* in: page number */ +/************************************************************************* +Checks if locks of other transactions prevent an immediate insert of +a record. If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. */ +UNIV_INTERN +ulint +lock_rec_insert_check_and_lock( +/*===========================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is + set, does nothing */ + rec_t* rec, /* in: record after which to insert */ + buf_block_t* block, /* in/out: buffer block of rec */ + dict_index_t* index, /* in: index */ + que_thr_t* thr, /* in: query thread */ + ibool* inherit);/* out: set to TRUE if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ +/************************************************************************* +Checks if locks of other transactions prevent an immediate modify (update, +delete mark, or delete unmark) of a clustered index record. If they do, +first tests if the query thread should anyway be suspended for some +reason; if not, then puts the transaction and the query thread to the +lock wait state and inserts a waiting request for a record x-lock to the +lock queue. */ +UNIV_INTERN +ulint +lock_clust_rec_modify_check_and_lock( +/*=================================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: record which should be + modified */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Checks if locks of other transactions prevent an immediate modify +(delete mark or delete unmark) of a secondary index record. */ +UNIV_INTERN +ulint +lock_sec_rec_modify_check_and_lock( +/*===============================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + buf_block_t* block, /* in/out: buffer block of rec */ + rec_t* rec, /* in: record which should be + modified; NOTE: as this is a secondary + index, we always have to modify the + clustered index record first: see the + comment below */ + dict_index_t* index, /* in: secondary index */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Like the counterpart for a clustered index below, but now we read a +secondary index record. 
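/* An editorial condensation of the control flow that the comment on
lock_rec_insert_check_and_lock() above describes: no conflicting lock
means the insert proceeds at once; otherwise a waiting gap-mode request
is enqueued and the caller is told to wait, unless deadlock detection
chooses this transaction as a victim. The toy_ codes mimic the DB_*
return values in shape only. */

#include <stdio.h>

enum toy_err { TOY_SUCCESS, TOY_LOCK_WAIT, TOY_DEADLOCK };

static enum toy_err
toy_insert_check_and_lock(int conflict_exists, int wait_would_deadlock)
{
	if (!conflict_exists) {
		return(TOY_SUCCESS);	/* insert may proceed at once */
	}
	if (wait_would_deadlock) {
		return(TOY_DEADLOCK);	/* this trx becomes the victim */
	}
	/* the real code enqueues a waiting request of type
	LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION here */
	return(TOY_LOCK_WAIT);
}

int
main(void)
{
	printf("%d %d %d\n",
	       toy_insert_check_and_lock(0, 0),	/* 0: success */
	       toy_insert_check_and_lock(1, 0),	/* 1: lock wait */
	       toy_insert_check_and_lock(1, 1));/* 2: deadlock */
	return(0);
}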
*/ +UNIV_INTERN +ulint +lock_sec_rec_read_check_and_lock( +/*=============================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /* in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. */ +UNIV_INTERN +ulint +lock_clust_rec_read_check_and_lock( +/*===============================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /* in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". 
*/ +UNIV_INTERN +ulint +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /* in: clustered index */ + enum lock_mode mode, /* in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Checks that a record is seen in a consistent read. */ +UNIV_INTERN +ibool +lock_clust_rec_cons_read_sees( +/*==========================*/ + /* out: TRUE if sees, or FALSE if an earlier + version of the record should be retrieved */ + const rec_t* rec, /* in: user record which should be read or + passed over by a read cursor */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + read_view_t* view); /* in: consistent read view */ +/************************************************************************* +Checks that a non-clustered index record is seen in a consistent read. */ +UNIV_INTERN +ulint +lock_sec_rec_cons_read_sees( +/*========================*/ + /* out: TRUE if certainly + sees, or FALSE if an earlier + version of the clustered index + record might be needed: NOTE + that a non-clustered index + page contains so little + information on its + modifications that also in the + case FALSE, the present + version of rec may be the + right, but we must check this + from the clustered index + record */ + const rec_t* rec, /* in: user record which + should be read or passed over + by a read cursor */ + const read_view_t* view); /* in: consistent read view */ +/************************************************************************* +Locks the specified database table in the mode given. If the lock cannot +be granted immediately, the query thread is put to wait. */ +UNIV_INTERN +ulint +lock_table( +/*=======*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + dict_table_t* table, /* in: database table in dictionary cache */ + enum lock_mode mode, /* in: lock mode */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Checks if there are any locks set on the table. */ +UNIV_INTERN +ibool +lock_is_on_table( +/*=============*/ + /* out: TRUE if there are lock(s) */ + dict_table_t* table); /* in: database table in dictionary cache */ +/***************************************************************** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. */ +UNIV_INTERN +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /* in: transaction that has + set a record lock */ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec, /* in: record */ + enum lock_mode lock_mode);/* in: LOCK_S or LOCK_X */ +/************************************************************************* +Releases a table lock. 
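/* How callers are expected to parameterize the *_read_check_and_lock()
functions above, judging from their comments and from the LOCK_* notes
later in this header: a plain locking read takes LOCK_S, SELECT ... FOR
UPDATE takes LOCK_X, and READ COMMITTED narrows the next-key
LOCK_ORDINARY down to LOCK_REC_NOT_GAP. A sketch of that pairing with
toy enumerations: */

#include <stdio.h>

enum toy_mode		{ TOY_LOCK_S, TOY_LOCK_X };
enum toy_gap_mode	{ TOY_ORDINARY, TOY_REC_NOT_GAP };

struct toy_lock_req {
	enum toy_mode		mode;
	enum toy_gap_mode	gap_mode;
};

static struct toy_lock_req
toy_choose(int for_update, int read_committed)
{
	struct toy_lock_req	r;

	r.mode = for_update ? TOY_LOCK_X : TOY_LOCK_S;
	r.gap_mode = read_committed ? TOY_REC_NOT_GAP : TOY_ORDINARY;
	return(r);
}

int
main(void)
{
	struct toy_lock_req	r = toy_choose(1, 0);

	printf("FOR UPDATE under REPEATABLE READ -> mode %d, gap mode %d\n",
	       r.mode, r.gap_mode);
	return(0);
}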
+Releases possible other transactions waiting for this lock. */ +UNIV_INTERN +void +lock_table_unlock( +/*==============*/ + lock_t* lock); /* in: lock */ +/************************************************************************* +Releases transaction locks, and releases possible other transactions waiting +because of these locks. */ +UNIV_INTERN +void +lock_release_off_kernel( +/*====================*/ + trx_t* trx); /* in: transaction */ +/************************************************************************* +Cancels a waiting lock request and releases possible other transactions +waiting behind it. */ +UNIV_INTERN +void +lock_cancel_waiting_and_release( +/*============================*/ + lock_t* lock); /* in: waiting lock request */ + +/************************************************************************* +Removes locks on a table to be dropped or truncated. +If remove_also_table_sx_locks is TRUE then table-level S and X locks are +also removed in addition to other table-level and record-level locks. +No lock, that is going to be removed, is allowed to be a wait lock. */ +UNIV_INTERN +void +lock_remove_all_on_table( +/*=====================*/ + dict_table_t* table, /* in: table to be dropped + or truncated */ + ibool remove_also_table_sx_locks);/* in: also removes + table S and X locks */ + +/************************************************************************* +Calculates the fold value of a page file address: used in inserting or +searching for a lock in the hash table. */ +UNIV_INLINE +ulint +lock_rec_fold( +/*==========*/ + /* out: folded value */ + ulint space, /* in: space */ + ulint page_no)/* in: page number */ + __attribute__((const)); +/************************************************************************* +Calculates the hash value of a page file address: used in inserting or +searching for a lock in the hash table. */ +UNIV_INLINE +ulint +lock_rec_hash( +/*==========*/ + /* out: hashed value */ + ulint space, /* in: space */ + ulint page_no);/* in: page number */ + +/************************************************************************** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. */ +UNIV_INTERN +ulint +lock_rec_find_set_bit( +/*==================*/ + /* out: bit index == heap number of + the record, or ULINT_UNDEFINED if none found */ + const lock_t* lock); /* in: record lock with at least one bit set */ + +/************************************************************************* +Gets the source table of an ALTER TABLE transaction. The table must be +covered by an IX or IS table lock. */ +UNIV_INTERN +dict_table_t* +lock_get_src_table( +/*===============*/ + /* out: the source table of transaction, + if it is covered by an IX or IS table lock; + dest if there is no source table, and + NULL if the transaction is locking more than + two tables or an inconsistency is found */ + trx_t* trx, /* in: transaction */ + dict_table_t* dest, /* in: destination of ALTER TABLE */ + enum lock_mode* mode); /* out: lock mode of the source table */ +/************************************************************************* +Determine if the given table is exclusively "owned" by the given +transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC +on the table. 
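/* A sketch of the two-step addressing that lock_rec_fold() and
lock_rec_hash() above declare: the page address (space, page_no) is
folded into a single value, which is then hashed into a cell of
lock_sys->rec_hash. The mixing arithmetic below is a stand-in for
ut_fold_ulint_pair(), not the real function. */

#include <stdio.h>

static unsigned long
toy_fold_pair(unsigned long a, unsigned long b)
{
	return(((a << 16) ^ b ^ (a >> 5)) * 2654435761UL); /* toy mixing */
}

static unsigned long
toy_rec_fold(unsigned long space, unsigned long page_no)
{
	return(toy_fold_pair(space, page_no));
}

static unsigned long
toy_rec_hash(unsigned long space, unsigned long page_no,
	     unsigned long n_cells)
{
	return(toy_rec_fold(space, page_no) % n_cells);
}

int
main(void)
{
	printf("lock for (0, 42) hashes to cell %lu of 1009\n",
	       toy_rec_hash(0, 42, 1009));
	return(0);
}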
*/ +UNIV_INTERN +ibool +lock_is_table_exclusive( +/*====================*/ + /* out: TRUE if table is only locked by trx, + with LOCK_IX, and possibly LOCK_AUTO_INC */ + dict_table_t* table, /* in: table */ + trx_t* trx); /* in: transaction */ +/************************************************************************* +Checks if a lock request lock1 has to wait for request lock2. */ +UNIV_INTERN +ibool +lock_has_to_wait( +/*=============*/ + /* out: TRUE if lock1 has to wait for + lock2 to be removed */ + const lock_t* lock1, /* in: waiting lock */ + const lock_t* lock2); /* in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +/************************************************************************* +Checks that a transaction id is sensible, i.e., not in the future. */ +UNIV_INTERN +ibool +lock_check_trx_id_sanity( +/*=====================*/ + /* out: TRUE if ok */ + dulint trx_id, /* in: trx id */ + const rec_t* rec, /* in: user record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + ibool has_kernel_mutex);/* in: TRUE if the caller owns the + kernel mutex */ +/************************************************************************* +Prints info of a table lock. */ +UNIV_INTERN +void +lock_table_print( +/*=============*/ + FILE* file, /* in: file where to print */ + const lock_t* lock); /* in: table type lock */ +/************************************************************************* +Prints info of a record lock. */ +UNIV_INTERN +void +lock_rec_print( +/*===========*/ + FILE* file, /* in: file where to print */ + const lock_t* lock); /* in: record type lock */ +/************************************************************************* +Prints info of locks for all transactions. */ +UNIV_INTERN +void +lock_print_info_summary( +/*====================*/ + FILE* file); /* in: file where to print */ +/************************************************************************* +Prints info of locks for each transaction. */ +UNIV_INTERN +void +lock_print_info_all_transactions( +/*=============================*/ + FILE* file); /* in: file where to print */ +/************************************************************************* +Returns the approximate number of record locks (bits set in the bitmap) for +this transaction. Since delete-marked records may be removed, the +record count will not be precise. */ +UNIV_INTERN +ulint +lock_number_of_rows_locked( +/*=======================*/ + trx_t* trx); /* in: transaction */ +/*********************************************************************** +Releases all the transaction's autoinc locks. */ +UNIV_INTERN +void +lock_release_autoinc_locks( +/*=======================*/ + trx_t* trx); /* in/out: transaction */ + +/*********************************************************************** +Gets the type of a lock. Non-inline version for use outside of the +lock module. */ +UNIV_INTERN +ulint +lock_get_type( +/*==========*/ + /* out: LOCK_TABLE or LOCK_REC */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +Gets the id of the transaction owning a lock. */ +UNIV_INTERN +ullint +lock_get_trx_id( +/*============*/ + /* out: transaction id */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +Gets the mode of a lock in a human readable string.
+The string should not be free()'d or modified. */ + +const char* +lock_get_mode_str( +/*==============*/ + /* out: lock mode */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +Gets the type of a lock in a human readable string. +The string should not be free()'d or modified. */ + +const char* +lock_get_type_str( +/*==============*/ + /* out: lock type */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +Gets the id of the table on which the lock is. */ +UNIV_INTERN +ullint +lock_get_table_id( +/*==============*/ + /* out: id of the table */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +Gets the name of the table on which the lock is. +The string should not be free()'d or modified. */ + +const char* +lock_get_table_name( +/*================*/ + /* out: name of the table */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +For a record lock, gets the index on which the lock is. */ + +const dict_index_t* +lock_rec_get_index( +/*===============*/ + /* out: index */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +For a record lock, gets the name of the index on which the lock is. +The string should not be free()'d or modified. */ + +const char* +lock_rec_get_index_name( +/*====================*/ + /* out: name of the index */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +For a record lock, gets the tablespace number on which the lock is. */ +UNIV_INTERN +ulint +lock_rec_get_space_id( +/*==================*/ + /* out: tablespace number */ + const lock_t* lock); /* in: lock */ + +/*********************************************************************** +For a record lock, gets the page number on which the lock is. 
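+Together with lock_rec_get_space_id() above, this identifies the page whose
+records the lock covers.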
*/ +UNIV_INTERN +ulint +lock_rec_get_page_no( +/*=================*/ + /* out: page number */ + const lock_t* lock); /* in: lock */ + +/* Lock modes and types */ +#define LOCK_MODE_MASK 0xFUL /* mask used to extract mode from the + type_mode field in a lock */ +/* Lock types */ +#define LOCK_TABLE 16 /* these type values should be so high that */ +#define LOCK_REC 32 /* they can be ORed to the lock mode */ +#define LOCK_TYPE_MASK 0xF0UL /* mask used to extract lock type from the + type_mode field in a lock */ +/* Waiting lock flag */ +#define LOCK_WAIT 256 /* this wait bit should be so high that + it can be ORed to the lock mode and type; + when this bit is set, it means that the + lock has not yet been granted, it is just + waiting for its turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /* this flag denotes an ordinary next-key lock + in contrast to LOCK_GAP or LOCK_REC_NOT_GAP */ +#define LOCK_GAP 512 /* this gap bit should be so high that + it can be ORed to the other flags; + when this bit is set, it means that the + lock holds only on the gap before the record; + for instance, an x-lock on the gap does not + give permission to modify the record on which + the bit is set; locks of this type are created + when records are removed from the index chain + of records */ +#define LOCK_REC_NOT_GAP 1024 /* this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048 /* this bit is set when we place a waiting + gap type record lock request in order to let + an insert of an index record to wait until + there are no conflicting locks by other + transactions on the gap; note that this flag + remains set when the waiting lock is granted, + or if the lock is inherited to a neighboring + record */ + +/* When lock bits are reset, the following flags are available: */ +#define LOCK_RELEASE_WAIT 1 +#define LOCK_NOT_RELEASE_WAIT 2 + +/* Lock operation struct */ +typedef struct lock_op_struct lock_op_t; +struct lock_op_struct{ + dict_table_t* table; /* table to be locked */ + enum lock_mode mode; /* lock mode */ +}; + +#define LOCK_OP_START 1 +#define LOCK_OP_COMPLETE 2 + +/* The lock system struct */ +struct lock_sys_struct{ + hash_table_t* rec_hash; /* hash table of the record locks */ +}; + +/* The lock system */ +extern lock_sys_t* lock_sys; + + +#ifndef UNIV_NONINL +#include "lock0lock.ic" +#endif + +#endif diff --git a/storage/xtradb/include/lock0lock.ic b/storage/xtradb/include/lock0lock.ic new file mode 100644 index 00000000000..f978cc70678 --- /dev/null +++ b/storage/xtradb/include/lock0lock.ic @@ -0,0 +1,123 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#include "sync0sync.h" +#include "srv0srv.h" +#include "dict0dict.h" +#include "row0row.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "row0vers.h" +#include "que0que.h" +#include "btr0cur.h" +#include "read0read.h" +#include "log0recv.h" + +/************************************************************************* +Calculates the fold value of a page file address: used in inserting or +searching for a lock in the hash table. */ +UNIV_INLINE +ulint +lock_rec_fold( +/*==========*/ + /* out: folded value */ + ulint space, /* in: space */ + ulint page_no)/* in: page number */ +{ + return(ut_fold_ulint_pair(space, page_no)); +} + +/************************************************************************* +Calculates the hash value of a page file address: used in inserting or +searching for a lock in the hash table. */ +UNIV_INLINE +ulint +lock_rec_hash( +/*==========*/ + /* out: hashed value */ + ulint space, /* in: space */ + ulint page_no)/* in: page number */ +{ + return(hash_calc_hash(lock_rec_fold(space, page_no), + lock_sys->rec_hash)); +} + +/************************************************************************* +Checks if some transaction has an implicit x-lock on a record in a clustered +index. */ +UNIV_INLINE +trx_t* +lock_clust_rec_some_has_impl( +/*=========================*/ + /* out: transaction which has the x-lock, or + NULL */ + const rec_t* rec, /* in: user record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + dulint trx_id; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(dict_index_is_clust(index)); + ut_ad(page_rec_is_user_rec(rec)); + + trx_id = row_get_rec_trx_id(rec, index, offsets); + + if (trx_is_active(trx_id)) { + /* The modifying or inserting transaction is active */ + + return(trx_get_on_id(trx_id)); + } + + return(NULL); +} + +/************************************************************************* +Gets the heap_no of the smallest user record on a page. */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + /* out: heap_no of smallest + user record, or + PAGE_HEAP_NO_SUPREMUM */ + const buf_block_t* block) /* in: buffer block */ +{ + const page_t* page = block->frame; + + if (page_is_comp(page)) { + return(rec_get_heap_no_new( + page + + rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE))); + } else { + return(rec_get_heap_no_old( + page + + rec_get_next_offs(page + PAGE_OLD_INFIMUM, + FALSE))); + } +} diff --git a/storage/xtradb/include/lock0priv.h b/storage/xtradb/include/lock0priv.h new file mode 100644 index 00000000000..0a0d41e6aaa --- /dev/null +++ b/storage/xtradb/include/lock0priv.h @@ -0,0 +1,106 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Lock module internal structures and methods. + +Created July 12, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0priv_h +#define lock0priv_h + +#ifndef LOCK_MODULE_IMPLEMENTATION +/* If you need to access members of the structures defined in this +file, please write appropriate functions that retrieve them and put +those functions in lock/ */ +#error Do not include lock0priv.h outside of the lock/ module +#endif + +#include "univ.i" +#include "dict0types.h" +#include "hash0hash.h" +#include "trx0types.h" +#include "ut0lst.h" + +/* A table lock */ +typedef struct lock_table_struct lock_table_t; +struct lock_table_struct { + dict_table_t* table; /* database table in dictionary + cache */ + UT_LIST_NODE_T(lock_t) + locks; /* list of locks on the same + table */ +}; + +/* Record lock for a page */ +typedef struct lock_rec_struct lock_rec_t; +struct lock_rec_struct { + ulint space; /* space id */ + ulint page_no; /* page number */ + ulint n_bits; /* number of bits in the lock + bitmap; NOTE: the lock bitmap is + placed immediately after the + lock struct */ +}; + +/* Lock struct */ +struct lock_struct { + trx_t* trx; /* transaction owning the + lock */ + UT_LIST_NODE_T(lock_t) + trx_locks; /* list of the locks of the + transaction */ + ulint type_mode; /* lock type, mode, LOCK_GAP or + LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION, + wait flag, ORed */ + hash_node_t hash; /* hash chain node for a record + lock */ + dict_index_t* index; /* index for a record lock */ + union { + lock_table_t tab_lock;/* table lock */ + lock_rec_t rec_lock;/* record lock */ + } un_member; +}; + +/************************************************************************* +Gets the type of a lock. */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + /* out: LOCK_TABLE or LOCK_REC */ + const lock_t* lock); /* in: lock */ + +/************************************************************************* +Gets the previous record lock set on a record. */ + +const lock_t* +lock_rec_get_prev( +/*==============*/ + /* out: previous lock on the same + record, NULL if none exists */ + const lock_t* in_lock,/* in: record lock */ + ulint heap_no);/* in: heap number of the record */ + +#ifndef UNIV_NONINL +#include "lock0priv.ic" +#endif + +#endif /* lock0priv_h */ diff --git a/storage/xtradb/include/lock0priv.ic b/storage/xtradb/include/lock0priv.ic new file mode 100644 index 00000000000..ae633a4fc61 --- /dev/null +++ b/storage/xtradb/include/lock0priv.ic @@ -0,0 +1,48 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Lock module internal inline methods. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +/* This file contains only methods which are used in +lock/lock0* files, other than lock/lock0lock.c. +I.e. lock/lock0lock.c contains more internal inline +methods but they are used only in that file. */ + +#ifndef LOCK_MODULE_IMPLEMENTATION +#error Do not include lock0priv.ic outside of the lock/ module +#endif + +/************************************************************************* +Gets the type of a lock. */ +UNIV_INLINE +ulint +lock_get_type_low( +/*==============*/ + /* out: LOCK_TABLE or LOCK_REC */ + const lock_t* lock) /* in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_TYPE_MASK); +} + +/* vim: set filetype=c: */ diff --git a/storage/xtradb/include/lock0types.h b/storage/xtradb/include/lock0types.h new file mode 100644 index 00000000000..52631b56532 --- /dev/null +++ b/storage/xtradb/include/lock0types.h @@ -0,0 +1,44 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The transaction lock system global types + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0types_h +#define lock0types_h + +#define lock_t ib_lock_t +typedef struct lock_struct lock_t; +typedef struct lock_sys_struct lock_sys_t; + +/* Basic lock modes */ +enum lock_mode { + LOCK_IS = 0, /* intention shared */ + LOCK_IX, /* intention exclusive */ + LOCK_S, /* shared */ + LOCK_X, /* exclusive */ + LOCK_AUTO_INC, /* locks the auto-inc counter of a table + in an exclusive mode */ + LOCK_NONE, /* this is used elsewhere to note consistent read */ + LOCK_NUM = LOCK_NONE/* number of lock modes */ +}; + +#endif diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h new file mode 100644 index 00000000000..51b57ae929c --- /dev/null +++ b/storage/xtradb/include/log0log.h @@ -0,0 +1,893 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#ifndef log0log_h +#define log0log_h + +#include "univ.i" +#include "ut0byte.h" +#include "sync0sync.h" +#include "sync0rw.h" + +typedef struct log_struct log_t; +typedef struct log_group_struct log_group_t; + +#ifdef UNIV_DEBUG +extern ibool log_do_write; +extern ibool log_debug_writes; +#else /* UNIV_DEBUG */ +# define log_do_write TRUE +#endif /* UNIV_DEBUG */ + +/* Wait modes for log_write_up_to */ +#define LOG_NO_WAIT 91 +#define LOG_WAIT_ONE_GROUP 92 +#define LOG_WAIT_ALL_GROUPS 93 +#define LOG_MAX_N_GROUPS 32 + +/******************************************************************** +Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint, +so that we know that the limit has been written to a log checkpoint field +on disk. */ +UNIV_INTERN +void +log_fsp_current_free_limit_set_and_checkpoint( +/*==========================================*/ + ulint limit); /* in: limit to set */ +/*********************************************************************** +Calculates where in log files we find a specified lsn. 
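+As a worked example of the arithmetic (with illustrative figures): if
+n_log_files is 2, each file holds log_file_size - LOG_FILE_HDR_SIZE bytes of
+log data, so an lsn lying 1.5 file capacities beyond first_header_lsn maps to
+file number 1 at log_file_offset = 0.5 * capacity + LOG_FILE_HDR_SIZE: the
+lsn positions wrap around the group of files, and each file starts with its
+header.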
*/ +UNIV_INTERN +ulint +log_calc_where_lsn_is( +/*==================*/ + /* out: log file number */ + ib_int64_t* log_file_offset, /* out: offset in that file + (including the header) */ + ib_uint64_t first_header_lsn, /* in: first log file start + lsn */ + ib_uint64_t lsn, /* in: lsn whose position to + determine */ + ulint n_log_files, /* in: total number of log + files */ + ib_int64_t log_file_size); /* in: log file size + (including the header) */ +/**************************************************************** +Writes to the log the string given. The log must be released with +log_release. */ +UNIV_INLINE +ib_uint64_t +log_reserve_and_write_fast( +/*=======================*/ + /* out: end lsn of the log record, + zero if did not succeed */ + byte* str, /* in: string */ + ulint len, /* in: string length */ + ib_uint64_t* start_lsn,/* out: start lsn of the log record */ + ibool* success);/* out: TRUE if success */ +/*************************************************************************** +Releases the log mutex. */ +UNIV_INLINE +void +log_release(void); +/*=============*/ +/*************************************************************************** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void); +/*================*/ +/**************************************************************** +Opens the log for log_write_low. The log must be closed with log_close and +released with log_release. */ +UNIV_INTERN +ib_uint64_t +log_reserve_and_open( +/*=================*/ + /* out: start lsn of the log record */ + ulint len); /* in: length of data to be catenated */ +/**************************************************************** +Writes to the log the string given. It is assumed that the caller holds the +log mutex. */ +UNIV_INTERN +void +log_write_low( +/*==========*/ + byte* str, /* in: string */ + ulint str_len); /* in: string length */ +/**************************************************************** +Closes the log. */ +UNIV_INTERN +ib_uint64_t +log_close(void); +/*===========*/ + /* out: lsn */ +/**************************************************************** +Gets the current lsn. */ +UNIV_INLINE +ib_uint64_t +log_get_lsn(void); +/*=============*/ + /* out: current lsn */ +/********************************************************** +Initializes the log. */ +UNIV_INTERN +void +log_init(void); +/*==========*/ +/********************************************************************** +Inits a log group to the log system. */ +UNIV_INTERN +void +log_group_init( +/*===========*/ + ulint id, /* in: group id */ + ulint n_files, /* in: number of log files */ + ulint file_size, /* in: log file size in bytes */ + ulint space_id, /* in: space id of the file space + which contains the log files of this + group */ + ulint archive_space_id); /* in: space id of the file space + which contains some archived log + files for this group; currently, only + for the first log group this is + used */ +/********************************************************** +Completes an i/o to a log file. 
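+It is called by the i/o-handler thread when a pending read or write to a log
+file has been completed.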
*/
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+ log_group_t* group); /* in: log group */
+/**********************************************************
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+ ib_uint64_t lsn, /* in: log sequence number up to which
+ the log should be written,
+ IB_ULONGLONG_MAX if not specified */
+ ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk);
+ /* in: TRUE if we want the written log
+ also to be flushed to disk */
+/********************************************************************
+Does a synchronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void);
+/*==========================*/
+/********************************************************************
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool and also may make a new checkpoint. NOTE: this function may only
+be called if the calling thread owns no synchronization objects! */
+UNIV_INTERN
+ibool
+log_preflush_pool_modified_pages(
+/*=============================*/
+ /* out: FALSE if there was a
+ flush batch of the same type
+ running, which means that we
+ could not start this flush
+ batch */
+ ib_uint64_t new_oldest, /* in: try to advance
+ oldest_modified_lsn at least
+ to this lsn */
+ ibool sync); /* in: TRUE if synchronous
+ operation is desired */
+/**********************************************************
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what the lsn of the oldest
+modification in the pool is, and writes information about the lsn in
+log files. Use log_make_checkpoint_at to also flush the pool. */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+ /* out: TRUE if success, FALSE if a checkpoint
+ write was already running */
+ ibool sync, /* in: TRUE if synchronous operation is
+ desired */
+ ibool write_always); /* in: the function normally checks if the
+ new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+/********************************************************************
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+ ib_uint64_t lsn, /* in: make a checkpoint at this or a
+ later lsn, if IB_ULONGLONG_MAX, makes
+ a checkpoint at the latest lsn */
+ ibool write_always); /* in: the function normally checks if
+ the new checkpoint would have a
+ greater lsn than the previous one: if
+ not, then no physical write is done;
+ by setting this parameter TRUE, a
+ physical write will always be made to
+ log files */
+/********************************************************************
+Makes a checkpoint at the latest lsn and writes it to the first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in log files to the log archive.
*/
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void);
+/*=======================================*/
+/**********************************************************
+Reads checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+ log_group_t* group, /* in: log group */
+ ulint field); /* in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+/***********************************************************************
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+ byte* buf, /* in: buffer containing checkpoint info */
+ ulint n, /* in: nth slot */
+ ulint* file_no,/* out: archived file number */
+ ulint* offset);/* out: archived file offset */
+/**********************************************************
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void);
+/*==================================*/
+#ifdef UNIV_HOTBACKUP
+/**********************************************************
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+ byte* hdr_buf,/* in: buffer which will be written to the
+ start of the first log file */
+ ib_uint64_t start); /* in: lsn of the start of the first log file;
+ we pretend that there is a checkpoint at
+ start + LOG_BLOCK_HDR_SIZE */
+#endif /* UNIV_HOTBACKUP */
+/************************************************************************
+Starts an archiving operation. */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+ /* out: TRUE if success, FALSE if an archiving
+ operation was already running */
+ ibool sync, /* in: TRUE if synchronous operation is desired */
+ ulint* n_bytes);/* out: archive log buffer size, 0 if nothing to
+ archive */
+/********************************************************************
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from a number one higher, so that the archiving will
+not write again to the archived log files which exist when this function
+returns. */
+UNIV_INTERN
+ulint
+log_archive_stop(void);
+/*==================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/********************************************************************
+Starts archiving again after it has been stopped. */
+UNIV_INTERN
+ulint
+log_archive_start(void);
+/*===================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/********************************************************************
+Stops archiving the log so that a gap may occur in the archived log files. */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void);
+/*==========================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/********************************************************************
+Starts archiving the log again; a gap may have occurred in the archived log
+files while archiving was switched off. */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void);
+/*========================*/
+ /* out: DB_SUCCESS or DB_ERROR */
+/**********************************************************
+Generates an archived log file name.
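+For example, file number 12 would typically yield a name like
+ib_arch_log_0000000012 in the configured archive directory (the exact pattern
+is determined by the implementation in log0log.c).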
*/ +UNIV_INTERN +void +log_archived_file_name_gen( +/*=======================*/ + char* buf, /* in: buffer where to write */ + ulint id, /* in: group id */ + ulint file_no);/* in: file number */ +/************************************************************************ +Checks that there is enough free space in the log to start a new query step. +Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this +function may only be called if the calling thread owns no synchronization +objects! */ +UNIV_INTERN +void +log_check_margins(void); +/*===================*/ +/********************************************************** +Reads a specified log segment to a buffer. */ +UNIV_INTERN +void +log_group_read_log_seg( +/*===================*/ + ulint type, /* in: LOG_ARCHIVE or LOG_RECOVER */ + byte* buf, /* in: buffer where to read */ + log_group_t* group, /* in: log group */ + ib_uint64_t start_lsn, /* in: read area start */ + ib_uint64_t end_lsn); /* in: read area end */ +/********************************************************** +Writes a buffer to a log file group. */ +UNIV_INTERN +void +log_group_write_buf( +/*================*/ + log_group_t* group, /* in: log group */ + byte* buf, /* in: buffer */ + ulint len, /* in: buffer len; must be divisible + by OS_FILE_LOG_BLOCK_SIZE */ + ib_uint64_t start_lsn, /* in: start lsn of the buffer; must + be divisible by + OS_FILE_LOG_BLOCK_SIZE */ + ulint new_data_offset);/* in: start offset of new data in + buf: this parameter is used to decide + if we have to write a new log file + header */ +/************************************************************ +Sets the field values in group to correspond to a given lsn. For this function +to work, the values must already be correctly initialized to correspond to +some lsn, for instance, a checkpoint lsn. */ +UNIV_INTERN +void +log_group_set_fields( +/*=================*/ + log_group_t* group, /* in: group */ + ib_uint64_t lsn); /* in: lsn for which the values should be + set */ +/********************************************************** +Calculates the data capacity of a log group, when the log file headers are not +included. */ +UNIV_INTERN +ulint +log_group_get_capacity( +/*===================*/ + /* out: capacity in bytes */ + log_group_t* group); /* in: log group */ +/**************************************************************** +Gets a log block flush bit. */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + /* out: TRUE if this block was the first + to be written in a log flush */ + byte* log_block); /* in: log block */ +/**************************************************************** +Gets a log block number stored in the header. */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + /* out: log block number stored in the block + header */ + byte* log_block); /* in: log block */ +/**************************************************************** +Gets a log block data length. */ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + /* out: log block data length measured as a + byte offset from the block start */ + byte* log_block); /* in: log block */ +/**************************************************************** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /* in: log block */ + ulint len); /* in: data length */ +/**************************************************************** +Calculates the checksum for a log block. 
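+The checksum is a simple shift-and-add sum over the block contents, excluding
+the trailer bytes; see the inline definition in log0log.ic. A reader can
+validate a block with a check of the form (sketch):
+
+	if (log_block_calc_checksum(block)
+	    != log_block_get_checksum(block)) {
+		... treat the block as corrupt or incompletely written ...
+	}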
*/ +UNIV_INLINE +ulint +log_block_calc_checksum( +/*====================*/ + /* out: checksum */ + const byte* block); /* in: log block */ +/**************************************************************** +Gets a log block checksum field value. */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + /* out: checksum */ + const byte* log_block); /* in: log block */ +/**************************************************************** +Sets a log block checksum field value. */ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /* in: log block */ + ulint checksum); /* in: checksum */ +/**************************************************************** +Gets a log block first mtr log record group offset. */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + /* out: first mtr log record group byte offset + from the block start, 0 if none */ + byte* log_block); /* in: log block */ +/**************************************************************** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /* in: log block */ + ulint offset); /* in: offset, 0 if none */ +/**************************************************************** +Gets a log block checkpoint number field (4 lowest bytes). */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + /* out: checkpoint no (4 lowest bytes) */ + byte* log_block); /* in: log block */ +/**************************************************************** +Initializes a log block in the log buffer. */ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /* in: pointer to the log buffer */ + ib_uint64_t lsn); /* in: lsn within the log block */ +/**************************************************************** +Initializes a log block in the log buffer in the old, < 3.23.52 format, where +there was no checksum yet. */ +UNIV_INLINE +void +log_block_init_in_old_format( +/*=========================*/ + byte* log_block, /* in: pointer to the log buffer */ + ib_uint64_t lsn); /* in: lsn within the log block */ +/**************************************************************** +Converts a lsn to a log block number. */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + /* out: log block number, + it is > 0 and <= 1G */ + ib_uint64_t lsn); /* in: lsn of a byte within the block */ +/********************************************************** +Prints info of the log. */ +UNIV_INTERN +void +log_print( +/*======*/ + FILE* file); /* in: file where to print */ +/********************************************************** +Peeks the current lsn. */ +UNIV_INTERN +ibool +log_peek_lsn( +/*=========*/ + /* out: TRUE if success, FALSE if + could not get the log system mutex */ + ib_uint64_t* lsn); /* out: if returns TRUE, current lsn is here */ +/************************************************************************** +Refreshes the statistics used to print per-second averages. 
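+This resets, e.g., log_sys->n_log_ios_old and log_sys->last_printout_time
+(see the corresponding fields of log_struct below).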
*/ +UNIV_INTERN +void +log_refresh_stats(void); +/*===================*/ + +extern log_t* log_sys; + +/* Values used as flags */ +#define LOG_FLUSH 7652559 +#define LOG_CHECKPOINT 78656949 +#ifdef UNIV_LOG_ARCHIVE +# define LOG_ARCHIVE 11122331 +#endif /* UNIV_LOG_ARCHIVE */ +#define LOG_RECOVER 98887331 + +/* The counting of lsn's starts from this value: this must be non-zero */ +#define LOG_START_LSN ((ib_uint64_t) (16 * OS_FILE_LOG_BLOCK_SIZE)) + +#define LOG_BUFFER_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE) +#define LOG_ARCHIVE_BUF_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE / 4) + +/* Offsets of a log block header */ +#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and + is allowed to wrap around at 2G; the + highest bit is set to 1 if this is the + first log block in a log flush write + segment */ +#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL + /* mask used to get the highest bit in + the preceding field */ +#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to + this block */ +#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an + mtr log record group in this log block, + 0 if none; if the value is the same + as LOG_BLOCK_HDR_DATA_LEN, it means + that the first rec group has not yet + been catenated to this log block, but + if it will, it will start at this + offset; an archive recovery can + start parsing the log records starting + from this offset in this log block, + if value not 0 */ +#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of + log_sys->next_checkpoint_no when the + log block was last written to: if the + block has not yet been written full, + this value is only updated before a + log buffer flush */ +#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in + bytes */ + +/* Offsets of a log block trailer from the end of the block */ +#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block + contents; in InnoDB versions + < 3.23.52 this did not contain the + checksum but the same value as + .._HDR_NO */ +#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */ + +/* Offsets for a checkpoint field */ +#define LOG_CHECKPOINT_NO 0 +#define LOG_CHECKPOINT_LSN 8 +#define LOG_CHECKPOINT_OFFSET 16 +#define LOG_CHECKPOINT_LOG_BUF_SIZE 20 +#define LOG_CHECKPOINT_ARCHIVED_LSN 24 +#define LOG_CHECKPOINT_GROUP_ARRAY 32 + +/* For each value < LOG_MAX_N_GROUPS the following 8 bytes: */ + +#define LOG_CHECKPOINT_ARCHIVED_FILE_NO 0 +#define LOG_CHECKPOINT_ARCHIVED_OFFSET 4 + +#define LOG_CHECKPOINT_ARRAY_END (LOG_CHECKPOINT_GROUP_ARRAY\ + + LOG_MAX_N_GROUPS * 8) +#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END +#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END) +#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END) + /* current fsp free limit in + tablespace 0, in units of one + megabyte; this information is only used + by ibbackup to decide if it can + truncate unused ends of + non-auto-extending data files in space + 0 */ +#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END) + /* this magic number tells if the + checkpoint contains the above field: + the field was added to + InnoDB-3.23.50 */ +#define LOG_CHECKPOINT_SIZE (16 + LOG_CHECKPOINT_ARRAY_END) + +#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243 + +/* Offsets of a log file header */ +#define LOG_GROUP_ID 0 /* log group number */ +#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this + log file */ +#define LOG_FILE_NO 12 /* 4-byte archived log file number; + this field is only 
defined in an + archived log file */ +#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16 + /* a 32-byte field which contains + the string 'ibbackup' and the + creation time if the log file was + created by ibbackup --restore; + when mysqld is first time started + on the restored database, it can + print helpful info for the user */ +#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE + /* this 4-byte field is TRUE when + the writing of an archived log file + has been completed; this field is + only defined in an archived log file */ +#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4) + /* lsn where the archived log file + at least extends: actually the + archived log file may extend to a + later lsn, as long as it is within the + same log block as this lsn; this field + is defined only when an archived log + file has been completely written */ +#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE + /* first checkpoint field in the log + header; we write alternately to the + checkpoint fields when we make new + checkpoints; this field is only defined + in the first log file of a log group */ +#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE) + /* second checkpoint field in the log + header */ +#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE) + +#define LOG_GROUP_OK 301 +#define LOG_GROUP_CORRUPTED 302 + +/* Log group consists of a number of log files, each of the same size; a log +group is implemented as a space in the sense of the module fil0fil. */ + +struct log_group_struct{ + /* The following fields are protected by log_sys->mutex */ + ulint id; /* log group id */ + ulint n_files; /* number of files in the group */ + ulint file_size; /* individual log file size in bytes, + including the log file header */ + ulint space_id; /* file space which implements the log + group */ + ulint state; /* LOG_GROUP_OK or + LOG_GROUP_CORRUPTED */ + ib_uint64_t lsn; /* lsn used to fix coordinates within + the log group */ + ulint lsn_offset; /* the offset of the above lsn */ + ulint n_pending_writes;/* number of currently pending flush + writes for this log group */ + byte** file_header_bufs;/* buffers for each file header in the + group */ + /*-----------------------------*/ + byte** archive_file_header_bufs;/* buffers for each file + header in the group */ + ulint archive_space_id;/* file space which implements the log + group archive */ + ulint archived_file_no;/* file number corresponding to + log_sys->archived_lsn */ + ulint archived_offset;/* file offset corresponding to + log_sys->archived_lsn, 0 if we have + not yet written to the archive file + number archived_file_no */ + ulint next_archived_file_no;/* during an archive write, + until the write is completed, we + store the next value for + archived_file_no here: the write + completion function then sets the new + value to ..._file_no */ + ulint next_archived_offset; /* like the preceding field */ + /*-----------------------------*/ + ib_uint64_t scanned_lsn; /* used only in recovery: recovery scan + succeeded up to this lsn in this log + group */ + byte* checkpoint_buf; /* checkpoint header is written from + this buffer to the group */ + UT_LIST_NODE_T(log_group_t) + log_groups; /* list of log groups */ +}; + +struct log_struct{ + byte pad[64]; /* padding to prevent other memory + update hotspots from residing on the + same memory cache line */ + ib_uint64_t lsn; /* log sequence number */ + ulint buf_free; /* first free offset within the log + buffer */ + mutex_t mutex; /* mutex protecting the log */ + byte* buf; /* log buffer */ + ulint 
buf_size; /* log buffer size in bytes */
+ ulint max_buf_free; /* recommended maximum value of
+ buf_free, after which the buffer is
+ flushed */
+ ulint old_buf_free; /* value of buf_free when log was
+ last time opened; only in the debug
+ version */
+ ib_uint64_t old_lsn; /* value of lsn when log was last time
+ opened; only in the debug version */
+ ibool check_flush_or_checkpoint;
+ /* this is set to TRUE when there may
+ be need to flush the log buffer, or
+ preflush buffer pool pages, or make
+ a checkpoint; this MUST be TRUE when
+ lsn - last_checkpoint_lsn >
+ max_checkpoint_age; this flag is
+ peeked at by log_free_check(), which
+ does not reserve the log mutex */
+ UT_LIST_BASE_NODE_T(log_group_t)
+ log_groups; /* log groups */
+
+ /* The fields involved in the log buffer flush */
+
+ ulint buf_next_to_write;/* first offset in the log buffer
+ whose byte content may not yet have
+ been written to file, e.g., the start
+ offset of a log record catenated
+ later; this is advanced when a flush
+ operation is completed to all the log
+ groups */
+ ib_uint64_t written_to_some_lsn;
+ /* first log sequence number not yet
+ written to any log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for any
+ one log group */
+ ib_uint64_t written_to_all_lsn;
+ /* first log sequence number not yet
+ written to some log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for all
+ log groups */
+ ib_uint64_t write_lsn; /* end lsn for the current running
+ write */
+ ulint write_end_offset;/* the data in buffer has been written
+ up to this offset when the current
+ write ends: this field will then
+ be copied to buf_next_to_write */
+ ib_uint64_t current_flush_lsn;/* end lsn for the current running
+ write + flush operation */
+ ib_uint64_t flushed_to_disk_lsn;
+ /* how far we have written the log
+ AND flushed to disk */
+ ulint n_pending_writes;/* number of currently pending flushes
+ or writes */
+ /* NOTE on the 'flush' in names of the fields below: starting from
+ 4.0.14, we separate the write of the log file and the actual fsync()
+ or other method to flush it to disk. The names below should really
+ be 'flush_or_write'! */
+ os_event_t no_flush_event; /* this event is in the reset state
+ when a flush or a write is running;
+ a thread should wait for this without
+ owning the log mutex, but NOTE that
+ to set or reset this event, the
+ thread MUST own the log mutex! */
+ ibool one_flushed; /* during a flush, this is first FALSE
+ and becomes TRUE when one log group
+ has been written or flushed */
+ os_event_t one_flushed_event;/* this event is reset when the
+ flush or write has not yet completed
+ for any log group; e.g., this means
+ that a transaction has been committed
+ when this is set; a thread should wait
+ for this without owning the log mutex,
+ but NOTE that to set or reset this
+ event, the thread MUST own the log
+ mutex!
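+ A typical pattern: the
+ thread that starts a flush resets the
+ event while holding the log mutex;
+ waiters release the mutex and only
+ then call os_event_wait() on the
+ event.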
*/ + ulint n_log_ios; /* number of log i/os initiated thus + far */ + ulint n_log_ios_old; /* number of log i/o's at the + previous printout */ + time_t last_printout_time;/* when log_print was last time + called */ + + /* Fields involved in checkpoints */ + ulint log_group_capacity; /* capacity of the log group; if + the checkpoint age exceeds this, it is + a serious error because it is possible + we will then overwrite log and spoil + crash recovery */ + ulint max_modified_age_async; + /* when this recommended value for lsn + - buf_pool_get_oldest_modification() + is exceeded, we start an asynchronous + preflush of pool pages */ + ulint max_modified_age_sync; + /* when this recommended value for lsn + - buf_pool_get_oldest_modification() + is exceeded, we start a synchronous + preflush of pool pages */ + ulint adm_checkpoint_interval; + /* administrator-specified checkpoint + interval in terms of log growth in + bytes; the interval actually used by + the database can be smaller */ + ulint max_checkpoint_age_async; + /* when this checkpoint age is exceeded + we start an asynchronous writing of a + new checkpoint */ + ulint max_checkpoint_age; + /* this is the maximum allowed value + for lsn - last_checkpoint_lsn when a + new query step is started */ + ib_uint64_t next_checkpoint_no; + /* next checkpoint number */ + ib_uint64_t last_checkpoint_lsn; + /* latest checkpoint lsn */ + ib_uint64_t next_checkpoint_lsn; + /* next checkpoint lsn */ + ulint n_pending_checkpoint_writes; + /* number of currently pending + checkpoint writes */ + rw_lock_t checkpoint_lock;/* this latch is x-locked when a + checkpoint write is running; a thread + should wait for this without owning + the log mutex */ + byte* checkpoint_buf; /* checkpoint header is read to this + buffer */ +#ifdef UNIV_LOG_ARCHIVE + /* Fields involved in archiving */ + ulint archiving_state;/* LOG_ARCH_ON, LOG_ARCH_STOPPING + LOG_ARCH_STOPPED, LOG_ARCH_OFF */ + ib_uint64_t archived_lsn; /* archiving has advanced to this + lsn */ + ulint max_archived_lsn_age_async; + /* recommended maximum age of + archived_lsn, before we start + asynchronous copying to the archive */ + ulint max_archived_lsn_age; + /* maximum allowed age for + archived_lsn */ + ib_uint64_t next_archived_lsn;/* during an archive write, + until the write is completed, we + store the next value for + archived_lsn here: the write + completion function then sets the new + value to archived_lsn */ + ulint archiving_phase;/* LOG_ARCHIVE_READ or + LOG_ARCHIVE_WRITE */ + ulint n_pending_archive_ios; + /* number of currently pending reads + or writes in archiving */ + rw_lock_t archive_lock; /* this latch is x-locked when an + archive write is running; a thread + should wait for this without owning + the log mutex */ + ulint archive_buf_size;/* size of archive_buf */ + byte* archive_buf; /* log segment is written to the + archive from this buffer */ + os_event_t archiving_on; /* if archiving has been stopped, + a thread can wait for this event to + become signaled */ +#endif /* UNIV_LOG_ARCHIVE */ +}; + +#define LOG_ARCH_ON 71 +#define LOG_ARCH_STOPPING 72 +#define LOG_ARCH_STOPPING2 73 +#define LOG_ARCH_STOPPED 74 +#define LOG_ARCH_OFF 75 + +#ifndef UNIV_NONINL +#include "log0log.ic" +#endif + +#endif diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic new file mode 100644 index 00000000000..85eebda4942 --- /dev/null +++ b/storage/xtradb/include/log0log.ic @@ -0,0 +1,407 @@ 
+/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" +#include "mach0data.h" +#include "mtr0mtr.h" + +/********************************************************** +Checks by parsing that the catenated log segment for a single mtr is +consistent. */ +UNIV_INTERN +ibool +log_check_log_recs( +/*===============*/ + byte* buf, /* in: pointer to the start of + the log segment in the + log_sys->buf log buffer */ + ulint len, /* in: segment length in bytes */ + ib_uint64_t buf_start_lsn); /* in: buffer start lsn */ + +/**************************************************************** +Gets a log block flush bit. */ +UNIV_INLINE +ibool +log_block_get_flush_bit( +/*====================*/ + /* out: TRUE if this block was the first + to be written in a log flush */ + byte* log_block) /* in: log block */ +{ + if (LOG_BLOCK_FLUSH_BIT_MASK + & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) { + + return(TRUE); + } + + return(FALSE); +} + +/**************************************************************** +Sets the log block flush bit. */ +UNIV_INLINE +void +log_block_set_flush_bit( +/*====================*/ + byte* log_block, /* in: log block */ + ibool val) /* in: value to set */ +{ + ulint field; + + field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO); + + if (val) { + field = field | LOG_BLOCK_FLUSH_BIT_MASK; + } else { + field = field & ~LOG_BLOCK_FLUSH_BIT_MASK; + } + + mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field); +} + +/**************************************************************** +Gets a log block number stored in the header. */ +UNIV_INLINE +ulint +log_block_get_hdr_no( +/*=================*/ + /* out: log block number stored in the block + header */ + byte* log_block) /* in: log block */ +{ + return(~LOG_BLOCK_FLUSH_BIT_MASK + & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)); +} + +/**************************************************************** +Sets the log block number stored in the header; NOTE that this must be set +before the flush bit! */ +UNIV_INLINE +void +log_block_set_hdr_no( +/*=================*/ + byte* log_block, /* in: log block */ + ulint n) /* in: log block number: must be > 0 and + < LOG_BLOCK_FLUSH_BIT_MASK */ +{ + ut_ad(n > 0); + ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK); + + mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n); +} + +/**************************************************************** +Gets a log block data length. 
*/ +UNIV_INLINE +ulint +log_block_get_data_len( +/*===================*/ + /* out: log block data length measured as a + byte offset from the block start */ + byte* log_block) /* in: log block */ +{ + return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN)); +} + +/**************************************************************** +Sets the log block data length. */ +UNIV_INLINE +void +log_block_set_data_len( +/*===================*/ + byte* log_block, /* in: log block */ + ulint len) /* in: data length */ +{ + mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len); +} + +/**************************************************************** +Gets a log block first mtr log record group offset. */ +UNIV_INLINE +ulint +log_block_get_first_rec_group( +/*==========================*/ + /* out: first mtr log record group byte offset + from the block start, 0 if none */ + byte* log_block) /* in: log block */ +{ + return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP)); +} + +/**************************************************************** +Sets the log block first mtr log record group offset. */ +UNIV_INLINE +void +log_block_set_first_rec_group( +/*==========================*/ + byte* log_block, /* in: log block */ + ulint offset) /* in: offset, 0 if none */ +{ + mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset); +} + +/**************************************************************** +Gets a log block checkpoint number field (4 lowest bytes). */ +UNIV_INLINE +ulint +log_block_get_checkpoint_no( +/*========================*/ + /* out: checkpoint no (4 lowest bytes) */ + byte* log_block) /* in: log block */ +{ + return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO)); +} + +/**************************************************************** +Sets a log block checkpoint number field (4 lowest bytes). */ +UNIV_INLINE +void +log_block_set_checkpoint_no( +/*========================*/ + byte* log_block, /* in: log block */ + ib_uint64_t no) /* in: checkpoint no */ +{ + mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO, (ulint) no); +} + +/**************************************************************** +Converts a lsn to a log block number. */ +UNIV_INLINE +ulint +log_block_convert_lsn_to_no( +/*========================*/ + /* out: log block number, + it is > 0 and <= 1G */ + ib_uint64_t lsn) /* in: lsn of a byte within the block */ +{ + return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1); +} + +/**************************************************************** +Calculates the checksum for a log block. */ +UNIV_INLINE +ulint +log_block_calc_checksum( +/*====================*/ + /* out: checksum */ + const byte* block) /* in: log block */ +{ + ulint sum; + ulint sh; + ulint i; + + sum = 1; + sh = 0; + + for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) { + ulint b = (ulint) block[i]; + sum &= 0x7FFFFFFFUL; + sum += b; + sum += b << sh; + sh++; + if (sh > 24) { + sh = 0; + } + } + + return(sum); +} + +/**************************************************************** +Gets a log block checksum field value. */ +UNIV_INLINE +ulint +log_block_get_checksum( +/*===================*/ + /* out: checksum */ + const byte* log_block) /* in: log block */ +{ + return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM)); +} + +/**************************************************************** +Sets a log block checksum field value. 
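+A writer typically finalizes each block just before the file write, for
+example with (sketch):
+
+	log_block_set_checksum(block, log_block_calc_checksum(block));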
*/ +UNIV_INLINE +void +log_block_set_checksum( +/*===================*/ + byte* log_block, /* in: log block */ + ulint checksum) /* in: checksum */ +{ + mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM, + checksum); +} + +/**************************************************************** +Initializes a log block in the log buffer. */ +UNIV_INLINE +void +log_block_init( +/*===========*/ + byte* log_block, /* in: pointer to the log buffer */ + ib_uint64_t lsn) /* in: lsn within the log block */ +{ + ulint no; + + ut_ad(mutex_own(&(log_sys->mutex))); + + no = log_block_convert_lsn_to_no(lsn); + + log_block_set_hdr_no(log_block, no); + + log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); + log_block_set_first_rec_group(log_block, 0); +} + +/**************************************************************** +Initializes a log block in the log buffer in the old format, where there +was no checksum yet. */ +UNIV_INLINE +void +log_block_init_in_old_format( +/*=========================*/ + byte* log_block, /* in: pointer to the log buffer */ + ib_uint64_t lsn) /* in: lsn within the log block */ +{ + ulint no; + + ut_ad(mutex_own(&(log_sys->mutex))); + + no = log_block_convert_lsn_to_no(lsn); + + log_block_set_hdr_no(log_block, no); + mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE + - LOG_BLOCK_CHECKSUM, no); + log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE); + log_block_set_first_rec_group(log_block, 0); +} + +/**************************************************************** +Writes to the log the string given. The log must be released with +log_release. */ +UNIV_INLINE +ib_uint64_t +log_reserve_and_write_fast( +/*=======================*/ + /* out: end lsn of the log record, + zero if did not succeed */ + byte* str, /* in: string */ + ulint len, /* in: string length */ + ib_uint64_t* start_lsn,/* out: start lsn of the log record */ + ibool* success)/* out: TRUE if success */ +{ + log_t* log = log_sys; + ulint data_len; + ib_uint64_t lsn; + + *success = TRUE; + + mutex_enter(&(log->mutex)); + + data_len = len + log->buf_free % OS_FILE_LOG_BLOCK_SIZE; + + if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + + /* The string does not fit within the current log block + or the log block would become full */ + + *success = FALSE; + + mutex_exit(&(log->mutex)); + + return(0); + } + + *start_lsn = log->lsn; + + ut_memcpy(log->buf + log->buf_free, str, len); + + log_block_set_data_len((byte*) ut_align_down(log->buf + log->buf_free, + OS_FILE_LOG_BLOCK_SIZE), + data_len); +#ifdef UNIV_LOG_DEBUG + log->old_buf_free = log->buf_free; + log->old_lsn = log->lsn; +#endif + log->buf_free += len; + + ut_ad(log->buf_free <= log->buf_size); + + lsn = log->lsn += len; + +#ifdef UNIV_LOG_DEBUG + log_check_log_recs(log->buf + log->old_buf_free, + log->buf_free - log->old_buf_free, log->old_lsn); +#endif + return(lsn); +} + +/*************************************************************************** +Releases the log mutex. */ +UNIV_INLINE +void +log_release(void) +/*=============*/ +{ + mutex_exit(&(log_sys->mutex)); +} + +/**************************************************************** +Gets the current lsn. 
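+Note that this function reserves and releases the log mutex internally (see
+the definition below), so it must not be called while holding the log mutex.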
*/ +UNIV_INLINE +ib_uint64_t +log_get_lsn(void) +/*=============*/ + /* out: current lsn */ +{ + ib_uint64_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + return(lsn); +} + +/*************************************************************************** +Checks if there is need for a log buffer flush or a new checkpoint, and does +this if yes. Any database operation should call this when it has modified +more than about 4 pages. NOTE that this function may only be called when the +OS thread owns no synchronization objects except the dictionary mutex. */ +UNIV_INLINE +void +log_free_check(void) +/*================*/ +{ + /* ut_ad(sync_thread_levels_empty()); */ + + if (log_sys->check_flush_or_checkpoint) { + + log_check_margins(); + } +} diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h new file mode 100644 index 00000000000..e3fe9ed330a --- /dev/null +++ b/storage/xtradb/include/log0recv.h @@ -0,0 +1,392 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#ifndef log0recv_h +#define log0recv_h + +#include "univ.i" +#include "ut0byte.h" +#include "buf0types.h" +#include "hash0hash.h" +#include "log0log.h" + +#ifdef UNIV_HOTBACKUP +extern ibool recv_replay_file_ops; + +/*********************************************************************** +Reads the checkpoint info needed in hot backup. */ +UNIV_INTERN +ibool +recv_read_cp_info_for_backup( +/*=========================*/ + /* out: TRUE if success */ + byte* hdr, /* in: buffer containing the log group + header */ + ib_uint64_t* lsn, /* out: checkpoint lsn */ + ulint* offset, /* out: checkpoint offset in the log group */ + ulint* fsp_limit,/* out: fsp limit of space 0, + 1000000000 if the database is running + with < version 3.23.50 of InnoDB */ + ib_uint64_t* cp_no, /* out: checkpoint number */ + ib_uint64_t* first_header_lsn); + /* out: lsn of the start of the + first log file */ +/*********************************************************************** +Scans the log segment and sets n_bytes_scanned to the length of the valid +log scanned.
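The usage rule stated for log_free_check() above can be made concrete with a small hedged sketch: a (hypothetical) operation that dirties many pages calls the function between units of work, while holding no synchronization objects except the dictionary mutex; the call is cheap unless the log system has raised check_flush_or_checkpoint:

void
do_multi_page_operation(void)
{
	ulint	i;

	for (i = 0; i < 100; i++) {
		/* ... modify a handful of pages here ... */

		/* fires log_check_margins() only when flagged */
		log_free_check();
	}
}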
*/ +UNIV_INTERN +void +recv_scan_log_seg_for_backup( +/*=========================*/ + byte* buf, /* in: buffer containing log data */ + ulint buf_len, /* in: data length in that buffer */ + ib_uint64_t* scanned_lsn, /* in/out: lsn of buffer start, + we return scanned lsn */ + ulint* scanned_checkpoint_no, + /* in/out: 4 lowest bytes of the + highest scanned checkpoint number so + far */ + ulint* n_bytes_scanned);/* out: how much we were able to + scan, smaller than buf_len if log + data ended here */ +#endif /* UNIV_HOTBACKUP */ +/*********************************************************************** +Returns TRUE if recovery is currently running. */ +UNIV_INLINE +ibool +recv_recovery_is_on(void); +/*=====================*/ +/*********************************************************************** +Returns TRUE if recovery from backup is currently running. */ +UNIV_INLINE +ibool +recv_recovery_from_backup_is_on(void); +/*=================================*/ +/**************************************************************************** +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. */ +UNIV_INTERN +void +recv_recover_page( +/*==============*/ + ibool recover_backup, + /* in: TRUE if we are recovering a backup + page: then we do not acquire any latches + since the page was read in outside the + buffer pool */ + ibool just_read_in, + /* in: TRUE if the i/o-handler calls this for + a freshly read page */ + buf_block_t* block); /* in: buffer block */ +/************************************************************ +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. */ +UNIV_INTERN +ulint +recv_recovery_from_checkpoint_start_func( +/*=====================================*/ + /* out: error code or DB_SUCCESS */ +#ifdef UNIV_LOG_ARCHIVE + ulint type, /* in: LOG_CHECKPOINT or LOG_ARCHIVE */ + ib_uint64_t limit_lsn, /* in: recover up to this lsn + if possible */ +#endif /* UNIV_LOG_ARCHIVE */ + ib_uint64_t min_flushed_lsn,/* in: min flushed lsn from + data files */ + ib_uint64_t max_flushed_lsn);/* in: max flushed lsn from + data files */ +#ifdef UNIV_LOG_ARCHIVE +# define recv_recovery_from_checkpoint_start(type,lim,min,max) \ + recv_recovery_from_checkpoint_start_func(type,lim,min,max) +#else /* UNIV_LOG_ARCHIVE */ +# define recv_recovery_from_checkpoint_start(type,lim,min,max) \ + recv_recovery_from_checkpoint_start_func(min,max) +#endif /* UNIV_LOG_ARCHIVE */ +/************************************************************ +Completes recovery from a checkpoint. */ +UNIV_INTERN +void +recv_recovery_from_checkpoint_finish(void); +/*======================================*/ +/*********************************************************** +Scans log from a buffer and stores new log data to the parsing buffer. Parses +and hashes the log records if new data found. 
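The wrapper macro above keeps call sites identical whether or not UNIV_LOG_ARCHIVE is compiled in; a sketch of a caller, where LOG_CHECKPOINT and IB_ULONGLONG_MAX are taken to be the usual constants from log0log.h and univ.i (an assumption, since neither appears in this hunk):

ulint
start_crash_recovery(ib_uint64_t min_flushed_lsn, ib_uint64_t max_flushed_lsn)
{
	ulint	err;

	/* expands to the 4-argument function with archiving compiled
	in, and to the 2-argument one without it */
	err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT,
						  IB_ULONGLONG_MAX,
						  min_flushed_lsn,
						  max_flushed_lsn);

	if (err == DB_SUCCESS) {
		/* new user transactions may start now; call
		recv_recovery_from_checkpoint_finish() later to
		complete recovery and free its resources */
	}

	return(err);
}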
*/ +UNIV_INTERN +ibool +recv_scan_log_recs( +/*===============*/ + /* out: TRUE if limit_lsn has been + reached, or not able to scan any more + in this log group */ + ibool apply_automatically,/* in: TRUE if we want this + function to apply log records + automatically when the hash table + becomes full; in the hot backup tool + the tool does the applying, not this + function */ + ulint available_memory,/* in: maximum size to which the hash + table of recs is allowed to grow */ + ibool store_to_hash, /* in: TRUE if the records should be + stored to the hash table; this is set + to FALSE if just debug checking is + needed */ + byte* buf, /* in: buffer containing a log segment + or garbage */ + ulint len, /* in: buffer length */ + ib_uint64_t start_lsn, /* in: buffer start lsn */ + ib_uint64_t* contiguous_lsn, /* in/out: it is known that all log + groups contain contiguous log data up + to this lsn */ + ib_uint64_t* group_scanned_lsn);/* out: scanning succeeded up to + this lsn */ +/********************************************************** +Resets the logs. The contents of log files will be lost! */ +UNIV_INTERN +void +recv_reset_logs( +/*============*/ + ib_uint64_t lsn, /* in: reset to this lsn + rounded up to be divisible by + OS_FILE_LOG_BLOCK_SIZE, after + which we add + LOG_BLOCK_HDR_SIZE */ +#ifdef UNIV_LOG_ARCHIVE + ulint arch_log_no, /* in: next archived log file number */ +#endif /* UNIV_LOG_ARCHIVE */ + ibool new_logs_created);/* in: TRUE if resetting logs + is done at the log creation; + FALSE if it is done after + archive recovery */ +#ifdef UNIV_HOTBACKUP +/********************************************************** +Creates new log files after a backup has been restored. */ +UNIV_INTERN +void +recv_reset_log_files_for_backup( +/*============================*/ + const char* log_dir, /* in: log file directory path */ + ulint n_log_files, /* in: number of log files */ + ulint log_file_size, /* in: log file size */ + ib_uint64_t lsn); /* in: new start lsn, must be + divisible by OS_FILE_LOG_BLOCK_SIZE */ +#endif /* UNIV_HOTBACKUP */ +/************************************************************ +Creates the recovery system. */ +UNIV_INTERN +void +recv_sys_create(void); +/*=================*/ +/************************************************************ +Inits the recovery system for a recovery operation. */ +UNIV_INTERN +void +recv_sys_init( +/*==========*/ + ibool recover_from_backup, /* in: TRUE if this is called + to recover from a hot backup */ + ulint available_memory); /* in: available memory in bytes */ +/*********************************************************************** +Empties the hash table of stored log records, applying them to appropriate +pages. */ +UNIV_INTERN +void +recv_apply_hashed_log_recs( +/*=======================*/ + ibool allow_ibuf); /* in: if TRUE, also ibuf operations are + allowed during the application; if FALSE, + no ibuf operations are allowed, and after + the application all file pages are flushed to + disk and invalidated in buffer pool: this + alternative means that no new log records + can be generated during the application */ +#ifdef UNIV_HOTBACKUP +/*********************************************************************** +Applies log records in the hash table to a backup.
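Taken together, the declarations above suggest the following order of operations during normal (non-backup) recovery; this is a hedged outline, and buf_pool_get_curr_size() is assumed to be the usual buffer-pool size accessor rather than something defined in this hunk:

void
recovery_outline(void)
{
	/* allocate and initialize the recovery system */
	recv_sys_create();
	recv_sys_init(FALSE, buf_pool_get_curr_size());

	/* ... recv_recovery_from_checkpoint_start() drives
	recv_scan_log_recs(), which parses log records and stores
	them in the hash table ... */

	/* replay everything that was hashed; TRUE permits insert
	buffer operations while the records are applied */
	recv_apply_hashed_log_recs(TRUE);
}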
*/ +UNIV_INTERN +void +recv_apply_log_recs_for_backup(void); +/*================================*/ +#endif +#ifdef UNIV_LOG_ARCHIVE +/************************************************************ +Recovers from archived log files, and also from log files, if they exist. */ +UNIV_INTERN +ulint +recv_recovery_from_archive_start( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + ib_uint64_t min_flushed_lsn,/* in: min flushed lsn field from the + data files */ + ib_uint64_t limit_lsn, /* in: recover up to this lsn if + possible */ + ulint first_log_no); /* in: number of the first archived + log file to use in the recovery; the + file will be searched from + INNOBASE_LOG_ARCH_DIR specified in + server config file */ +/************************************************************ +Completes recovery from archive. */ +UNIV_INTERN +void +recv_recovery_from_archive_finish(void); +/*===================================*/ +#endif /* UNIV_LOG_ARCHIVE */ + +/* Block of log record data */ +typedef struct recv_data_struct recv_data_t; +struct recv_data_struct{ + recv_data_t* next; /* pointer to the next block or NULL */ + /* the log record data is stored physically + immediately after this struct, max amount + RECV_DATA_BLOCK_SIZE bytes of it */ +}; + +/* Stored log record struct */ +typedef struct recv_struct recv_t; +struct recv_struct{ + byte type; /* log record type */ + ulint len; /* log record body length in bytes */ + recv_data_t* data; /* chain of blocks containing the log record + body */ + ib_uint64_t start_lsn;/* start lsn of the log segment written by + the mtr which generated this log record: NOTE + that this is not necessarily the start lsn of + this log record */ + ib_uint64_t end_lsn;/* end lsn of the log segment written by + the mtr which generated this log record: NOTE + that this is not necessarily the end lsn of + this log record */ + UT_LIST_NODE_T(recv_t) + rec_list;/* list of log records for this page */ +}; + +/* Hashed page file address struct */ +typedef struct recv_addr_struct recv_addr_t; +struct recv_addr_struct{ + ulint state; /* RECV_NOT_PROCESSED, RECV_BEING_PROCESSED, + or RECV_PROCESSED */ + ulint space; /* space id */ + ulint page_no;/* page number */ + UT_LIST_BASE_NODE_T(recv_t) + rec_list;/* list of log records for this page */ + hash_node_t addr_hash; +}; + +/* Recovery system data structure */ +typedef struct recv_sys_struct recv_sys_t; +struct recv_sys_struct{ + mutex_t mutex; /* mutex protecting the fields apply_log_recs, + n_addrs, and the state field in each recv_addr + struct */ + ibool apply_log_recs; + /* this is TRUE when log rec application to + pages is allowed; this flag tells the + i/o-handler if it should do log record + application */ + ibool apply_batch_on; + /* this is TRUE when a log rec application + batch is running */ + ib_uint64_t lsn; /* log sequence number */ + ulint last_log_buf_size; + /* size of the log buffer when the database + last time wrote to the log */ + byte* last_block; + /* possible incomplete last recovered log + block */ + byte* last_block_buf_start; + /* the nonaligned start address of the + preceding buffer */ + byte* buf; /* buffer for parsing log records */ + ulint len; /* amount of data in buf */ + ib_uint64_t parse_start_lsn; + /* this is the lsn from which we were able to + start parsing log records and adding them to + the hash table; zero if a suitable + start point not found yet */ + ib_uint64_t scanned_lsn; + /* the log data has been scanned up to this + lsn */ + ulint scanned_checkpoint_no; + 
/* the log data has been scanned up to this + checkpoint number (lowest 4 bytes) */ + ulint recovered_offset; + /* start offset of non-parsed log records in + buf */ + ib_uint64_t recovered_lsn; + /* the log records have been parsed up to + this lsn */ + ib_uint64_t limit_lsn;/* recovery should be made at most up to this + lsn */ + ibool found_corrupt_log; + /* this is set to TRUE if during the log + scan we find a corrupt log block, or a corrupt + log record, or there is a log parsing + buffer overflow */ +#ifdef UNIV_LOG_ARCHIVE + log_group_t* archive_group; + /* in archive recovery: the log group whose + archive is read */ +#endif /* UNIV_LOG_ARCHIVE */ + mem_heap_t* heap; /* memory heap of log records and file + addresses*/ + hash_table_t* addr_hash;/* hash table of file addresses of pages */ + ulint n_addrs;/* number of not processed hashed file + addresses in the hash table */ +}; + +extern recv_sys_t* recv_sys; +extern ibool recv_recovery_on; +extern ibool recv_no_ibuf_operations; +extern ibool recv_needed_recovery; + +extern ibool recv_lsn_checks_on; +#ifdef UNIV_HOTBACKUP +extern ibool recv_is_making_a_backup; +#endif /* UNIV_HOTBACKUP */ +extern ulint recv_max_parsed_page_no; + +/* Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many +times over! */ +#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024) + +/* Size of block reads when the log groups are scanned forward to do a +roll-forward */ +#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE) + +/* States of recv_addr_struct */ +#define RECV_NOT_PROCESSED 71 +#define RECV_BEING_READ 72 +#define RECV_BEING_PROCESSED 73 +#define RECV_PROCESSED 74 + +extern ulint recv_n_pool_free_frames; + +#ifndef UNIV_NONINL +#include "log0recv.ic" +#endif + +#endif diff --git a/storage/xtradb/include/log0recv.ic b/storage/xtradb/include/log0recv.ic new file mode 100644 index 00000000000..e114bede38f --- /dev/null +++ b/storage/xtradb/include/log0recv.ic @@ -0,0 +1,48 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#include "univ.i" + +extern ibool recv_recovery_from_backup_on; + +/*********************************************************************** +Returns TRUE if recovery is currently running. */ +UNIV_INLINE +ibool +recv_recovery_is_on(void) +/*=====================*/ +{ + return(UNIV_UNLIKELY(recv_recovery_on)); +} + +/*********************************************************************** +Returns TRUE if recovery from backup is currently running.
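A minimal sketch of how the predicate above is meant to be consulted: the i/o completion path applies hashed log records to a page it has just read, but only while recovery is running (the surrounding function is hypothetical; recv_recover_page() and buf_block_t are declared earlier in this patch):

void
on_page_read_completed(buf_block_t* block)
{
	if (recv_recovery_is_on()) {
		/* not a backup page (FALSE), and the i/o-handler
		has just read it in (TRUE) */
		recv_recover_page(FALSE, TRUE, block);
	}
}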
*/ +UNIV_INLINE +ibool +recv_recovery_from_backup_is_on(void) +/*=================================*/ +{ + return(recv_recovery_from_backup_on); +} + diff --git a/storage/xtradb/include/mach0data.h b/storage/xtradb/include/mach0data.h new file mode 100644 index 00000000000..78b48af0836 --- /dev/null +++ b/storage/xtradb/include/mach0data.h @@ -0,0 +1,398 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef mach0data_h +#define mach0data_h + +#include "univ.i" +#include "ut0byte.h" + +/* The data and all fields are always stored in a database file +in the same format: ascii, big-endian, ... . +All data in the files MUST be accessed using the functions in this +module. */ + +/*********************************************************** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /* in: pointer to byte where to store */ + ulint n); /* in: ulint integer to be stored, >= 0, < 256 */ +/************************************************************ +The following function is used to fetch data from one byte. */ +UNIV_INLINE +ulint +mach_read_from_1( +/*=============*/ + /* out: ulint integer, >= 0, < 256 */ + const byte* b) /* in: pointer to byte */ + __attribute__((nonnull, pure)); +/*********************************************************** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lower address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /* in: pointer to two bytes where to store */ + ulint n); /* in: ulint integer to be stored, >= 0, < 64k */ +/************************************************************ +The following function is used to fetch data from two consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ulint +mach_read_from_2( +/*=============*/ + /* out: ulint integer, >= 0, < 64k */ + const byte* b) /* in: pointer to two bytes */ + __attribute__((nonnull, pure)); + +/************************************************************ +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. 
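A standalone illustration of the big-endian layout that mach_write_to_2() and mach_read_from_2() above promise, with the most significant byte at the lowest address; only standard C is used here:

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	unsigned char	b[2];
	unsigned long	n = 0x1234UL;

	b[0] = (unsigned char) (n >> 8);	/* what mach_write_to_2() stores */
	b[1] = (unsigned char) n;

	assert(b[0] == 0x12 && b[1] == 0x34);

	/* what mach_read_from_2() computes */
	printf("%lx\n", ((unsigned long) b[0] << 8) + b[1]);
	return(0);
}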
*/ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + /* out: 16-bit integer in canonical format */ + ulint n) /* in: integer in machine-dependent format */ + __attribute__((const)); +/************************************************************ +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + /* out: integer in machine-dependent format */ + uint16 n) /* in: 16-bit integer in canonical format */ + __attribute__((const)); +/*********************************************************** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /* in: pointer to 3 bytes where to store */ + ulint n); /* in: ulint integer to be stored */ +/************************************************************ +The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ulint +mach_read_from_3( +/*=============*/ + /* out: ulint integer */ + const byte* b) /* in: pointer to 3 bytes */ + __attribute__((nonnull, pure)); +/*********************************************************** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /* in: pointer to four bytes where to store */ + ulint n); /* in: ulint integer to be stored */ +/************************************************************ +The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ulint +mach_read_from_4( +/*=============*/ + /* out: ulint integer */ + const byte* b) /* in: pointer to four bytes */ + __attribute__((nonnull, pure)); +/************************************************************* +Writes a ulint in a compressed form (1..5 bytes). */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + /* out: stored size in bytes */ + byte* b, /* in: pointer to memory where to store */ + ulint n); /* in: ulint integer to be stored */ +/************************************************************* +Returns the size of an ulint when written in the compressed form. */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + /* out: compressed size in bytes */ + ulint n) /* in: ulint integer to be stored */ + __attribute__((const)); +/************************************************************* +Reads a ulint in a compressed form. */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + /* out: read integer */ + const byte* b) /* in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/*********************************************************** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /* in: pointer to 6 bytes where to store */ + dulint n); /* in: dulint integer to be stored */ +/************************************************************ +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. 
*/ +UNIV_INLINE +dulint +mach_read_from_6( +/*=============*/ + /* out: dulint integer */ + const byte* b) /* in: pointer to 6 bytes */ + __attribute__((nonnull, pure)); +/*********************************************************** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /* in: pointer to 7 bytes where to store */ + dulint n); /* in: dulint integer to be stored */ +/************************************************************ +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +dulint +mach_read_from_7( +/*=============*/ + /* out: dulint integer */ + const byte* b) /* in: pointer to 7 bytes */ + __attribute__((nonnull, pure)); +/*********************************************************** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + byte* b, /* in: pointer to 8 bytes where to store */ + dulint n); /* in: dulint integer to be stored */ +/*********************************************************** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_ull( +/*===========*/ + byte* b, /* in: pointer to 8 bytes where to store */ + ib_uint64_t n); /* in: 64-bit integer to be stored */ +/************************************************************ +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +dulint +mach_read_from_8( +/*=============*/ + /* out: dulint integer */ + const byte* b) /* in: pointer to 8 bytes */ + __attribute__((nonnull, pure)); +/************************************************************ +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ib_uint64_t +mach_read_ull( +/*==========*/ + /* out: 64-bit integer */ + const byte* b) /* in: pointer to 8 bytes */ + __attribute__((nonnull, pure)); +/************************************************************* +Writes a dulint in a compressed form (5..9 bytes). */ +UNIV_INLINE +ulint +mach_dulint_write_compressed( +/*=========================*/ + /* out: size in bytes */ + byte* b, /* in: pointer to memory where to store */ + dulint n); /* in: dulint integer to be stored */ +/************************************************************* +Returns the size of a dulint when written in the compressed form. */ +UNIV_INLINE +ulint +mach_dulint_get_compressed_size( +/*============================*/ + /* out: compressed size in bytes */ + dulint n); /* in: dulint integer to be stored */ +/************************************************************* +Reads a dulint in a compressed form. */ +UNIV_INLINE +dulint +mach_dulint_read_compressed( +/*========================*/ + /* out: read dulint */ + const byte* b) /* in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/************************************************************* +Writes a dulint in a compressed form (1..11 bytes). 
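The 8-byte accessors mach_write_ull() and mach_read_ull() declared above store the high 32 bits before the low 32 bits, i.e. plain big-endian; a standalone round-trip sketch in standard C:

#include <stdio.h>

typedef unsigned long long	u64;	/* stand-in for ib_uint64_t */

static void
write_ull(unsigned char* b, u64 n)
{
	int	i;

	for (i = 0; i < 8; i++) {
		b[i] = (unsigned char) (n >> (56 - 8 * i));
	}
}

static u64
read_ull(const unsigned char* b)
{
	u64	n = 0;
	int	i;

	for (i = 0; i < 8; i++) {
		n = (n << 8) | b[i];
	}

	return(n);
}

int
main(void)
{
	unsigned char	buf[8];

	write_ull(buf, 0x0102030405060708ULL);
	printf("%llx\n", read_ull(buf));	/* prints 102030405060708 */
	return(0);
}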
*/ +UNIV_INLINE +ulint +mach_dulint_write_much_compressed( +/*==============================*/ + /* out: size in bytes */ + byte* b, /* in: pointer to memory where to store */ + dulint n); /* in: dulint integer to be stored */ +/************************************************************* +Returns the size of a dulint when written in the compressed form. */ +UNIV_INLINE +ulint +mach_dulint_get_much_compressed_size( +/*=================================*/ + /* out: compressed size in bytes */ + dulint n) /* in: dulint integer to be stored */ + __attribute__((const)); +/************************************************************* +Reads a dulint in a compressed form. */ +UNIV_INLINE +dulint +mach_dulint_read_much_compressed( +/*=============================*/ + /* out: read dulint */ + const byte* b) /* in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/************************************************************* +Reads a ulint in a compressed form if the log record fully contains it. */ +UNIV_INTERN +byte* +mach_parse_compressed( +/*==================*/ + /* out: pointer to end of the stored field, NULL if + not complete */ + byte* ptr, /* in: pointer to buffer from where to read */ + byte* end_ptr,/* in: pointer to end of the buffer */ + ulint* val); /* out: read value */ +/************************************************************* +Reads a dulint in a compressed form if the log record fully contains it. */ +UNIV_INTERN +byte* +mach_dulint_parse_compressed( +/*=========================*/ + /* out: pointer to end of the stored field, NULL if + not complete */ + byte* ptr, /* in: pointer to buffer from where to read */ + byte* end_ptr,/* in: pointer to end of the buffer */ + dulint* val); /* out: read value */ +/************************************************************* +Reads a double. It is stored in a little-endian format. */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + /* out: double read */ + const byte* b) /* in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/************************************************************* +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /* in: pointer to memory where to write */ + double d); /* in: double */ +/************************************************************* +Reads a float. It is stored in a little-endian format. */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + /* out: float read */ + const byte* b) /* in: pointer to memory from where to read */ + __attribute__((nonnull, pure)); +/************************************************************* +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /* in: pointer to memory where to write */ + float d); /* in: float */ +/************************************************************* +Reads a ulint stored in the little-endian format. */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + /* out: unsigned long int */ + const byte* buf, /* in: from where to read */ + ulint buf_size) /* in: from how many bytes to read */ + __attribute__((nonnull, pure)); +/************************************************************* +Writes a ulint in the little-endian format. 
*/ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /* in: where to write */ + ulint dest_size, /* in: into how many bytes to write */ + ulint n); /* in: unsigned long int to write */ +/************************************************************* +Reads a ulint stored in the little-endian format. */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + /* out: unsigned long int */ + const byte* buf) /* in: from where to read */ + __attribute__((nonnull, pure)); +/************************************************************* +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /* in: where to write */ + ulint n); /* in: unsigned long int to write */ + +/************************************************************* +Convert integral type from storage byte order (big endian) to +host byte order. */ +UNIV_INLINE +ullint +mach_read_int_type( +/*===============*/ + /* out: integer value */ + const byte* src, /* in: where to read from */ + ulint len, /* in: length of src */ + ibool unsigned_type); /* in: signed or unsigned flag */ +#ifndef UNIV_NONINL +#include "mach0data.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mach0data.ic b/storage/xtradb/include/mach0data.ic new file mode 100644 index 00000000000..5dda9aece2f --- /dev/null +++ b/storage/xtradb/include/mach0data.ic @@ -0,0 +1,784 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "ut0mem.h" + +/*********************************************************** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /* in: pointer to byte where to store */ + ulint n) /* in: ulint integer to be stored, >= 0, < 256 */ +{ + ut_ad(b); + ut_ad(n <= 0xFFUL); + + b[0] = (byte)n; +} + +/************************************************************ +The following function is used to fetch data from one byte. */ +UNIV_INLINE +ulint +mach_read_from_1( +/*=============*/ + /* out: ulint integer, >= 0, < 256 */ + const byte* b) /* in: pointer to byte */ +{ + ut_ad(b); + return((ulint)(b[0])); +} + +/*********************************************************** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lowest address. 
*/ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /* in: pointer to two bytes where to store */ + ulint n) /* in: ulint integer to be stored */ +{ + ut_ad(b); + ut_ad(n <= 0xFFFFUL); + + b[0] = (byte)(n >> 8); + b[1] = (byte)(n); +} + +/************************************************************ +The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ulint +mach_read_from_2( +/*=============*/ + /* out: ulint integer */ + const byte* b) /* in: pointer to 2 bytes */ +{ + ut_ad(b); + return( ((ulint)(b[0]) << 8) + + (ulint)(b[1]) + ); +} + +/************************************************************ +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + /* out: 16-bit integer in canonical format */ + ulint n) /* in: integer in machine-dependent format */ +{ + uint16 ret; + ut_ad(2 == sizeof ret); + mach_write_to_2((byte*) &ret, n); + return(ret); +} +/************************************************************ +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + /* out: integer in machine-dependent format */ + uint16 n) /* in: 16-bit integer in canonical format */ +{ + ut_ad(2 == sizeof n); + return(mach_read_from_2((const byte*) &n)); +} + +/*********************************************************** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /* in: pointer to 3 bytes where to store */ + ulint n) /* in: ulint integer to be stored */ +{ + ut_ad(b); + ut_ad(n <= 0xFFFFFFUL); + + b[0] = (byte)(n >> 16); + b[1] = (byte)(n >> 8); + b[2] = (byte)(n); +} + +/************************************************************ +The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ulint +mach_read_from_3( +/*=============*/ + /* out: ulint integer */ + const byte* b) /* in: pointer to 3 bytes */ +{ + ut_ad(b); + return( ((ulint)(b[0]) << 16) + + ((ulint)(b[1]) << 8) + + (ulint)(b[2]) + ); +} + +/*********************************************************** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /* in: pointer to four bytes where to store */ + ulint n) /* in: ulint integer to be stored */ +{ + ut_ad(b); + + b[0] = (byte)(n >> 24); + b[1] = (byte)(n >> 16); + b[2] = (byte)(n >> 8); + b[3] = (byte)n; +} + +/************************************************************ +The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ulint +mach_read_from_4( +/*=============*/ + /* out: ulint integer */ + const byte* b) /* in: pointer to four bytes */ +{ + ut_ad(b); + return( ((ulint)(b[0]) << 24) + + ((ulint)(b[1]) << 16) + + ((ulint)(b[2]) << 8) + + (ulint)(b[3]) + ); +} + +/************************************************************* +Writes a ulint in a compressed form where the first byte codes the +length of the stored ulint. 
We look at the most significant bits of +the byte. If the most significant bit is zero, it means 1-byte storage, +else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0, +it means 3-byte storage, else if 4th is 0, it means 4-byte storage, +else the storage is 5-byte. */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + /* out: compressed size in bytes */ + byte* b, /* in: pointer to memory where to store */ + ulint n) /* in: ulint integer (< 2^32) to be stored */ +{ + ut_ad(b); + + if (n < 0x80UL) { + mach_write_to_1(b, n); + return(1); + } else if (n < 0x4000UL) { + mach_write_to_2(b, n | 0x8000UL); + return(2); + } else if (n < 0x200000UL) { + mach_write_to_3(b, n | 0xC00000UL); + return(3); + } else if (n < 0x10000000UL) { + mach_write_to_4(b, n | 0xE0000000UL); + return(4); + } else { + mach_write_to_1(b, 0xF0UL); + mach_write_to_4(b + 1, n); + return(5); + } +} + +/************************************************************* +Returns the size of a ulint when written in the compressed form. */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + /* out: compressed size in bytes */ + ulint n) /* in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80UL) { + return(1); + } else if (n < 0x4000UL) { + return(2); + } else if (n < 0x200000UL) { + return(3); + } else if (n < 0x10000000UL) { + return(4); + } else { + return(5); + } +} + +/************************************************************* +Reads a ulint in a compressed form. */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + /* out: read integer (< 2^32) */ + const byte* b) /* in: pointer to memory from where to read */ +{ + ulint flag; + + ut_ad(b); + + flag = mach_read_from_1(b); + + if (flag < 0x80UL) { + return(flag); + } else if (flag < 0xC0UL) { + return(mach_read_from_2(b) & 0x7FFFUL); + } else if (flag < 0xE0UL) { + return(mach_read_from_3(b) & 0x3FFFFFUL); + } else if (flag < 0xF0UL) { + return(mach_read_from_4(b) & 0x1FFFFFFFUL); + } else { + ut_ad(flag == 0xF0UL); + return(mach_read_from_4(b + 1)); + } +} + +/*********************************************************** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + byte* b, /* in: pointer to 8 bytes where to store */ + dulint n) /* in: dulint integer to be stored */ +{ + ut_ad(b); + + mach_write_to_4(b, ut_dulint_get_high(n)); + mach_write_to_4(b + 4, ut_dulint_get_low(n)); +} + +/*********************************************************** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_ull( +/*===========*/ + byte* b, /* in: pointer to 8 bytes where to store */ + ib_uint64_t n) /* in: 64-bit integer to be stored */ +{ + ut_ad(b); + + mach_write_to_4(b, (ulint) (n >> 32)); + mach_write_to_4(b + 4, (ulint) n); +} + +/************************************************************ +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. 
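A worked byte-level example of the flag scheme implemented by mach_write_compressed() and mach_read_compressed() above, for n = 200: since 0x80 <= 200 < 0x4000, the two-byte form is chosen and (200 | 0x8000) = 0x80C8 is stored; on read, the leading 0x80 flag selects the two-byte branch and the 0x7FFF mask strips the flag again. Standalone check in standard C:

#include <assert.h>

int
main(void)
{
	unsigned char	b[2];
	unsigned long	n = 200;

	/* encode: two-byte branch of mach_write_compressed() */
	b[0] = (unsigned char) ((n | 0x8000UL) >> 8);	/* 0x80 */
	b[1] = (unsigned char) (n | 0x8000UL);		/* 0xC8 */

	/* decode: mach_read_compressed() logic */
	assert(b[0] >= 0x80 && b[0] < 0xC0);		/* two-byte form */
	assert(((((unsigned long) b[0] << 8) | b[1]) & 0x7FFFUL) == 200);

	return(0);
}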
*/ +UNIV_INLINE +dulint +mach_read_from_8( +/*=============*/ + /* out: dulint integer */ + const byte* b) /* in: pointer to 8 bytes */ +{ + ulint high; + ulint low; + + ut_ad(b); + + high = mach_read_from_4(b); + low = mach_read_from_4(b + 4); + + return(ut_dulint_create(high, low)); +} + +/************************************************************ +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +ib_uint64_t +mach_read_ull( +/*==========*/ + /* out: 64-bit integer */ + const byte* b) /* in: pointer to 8 bytes */ +{ + ib_uint64_t ull; + + ull = ((ib_uint64_t) mach_read_from_4(b)) << 32; + ull |= (ib_uint64_t) mach_read_from_4(b + 4); + + return(ull); +} + +/*********************************************************** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /* in: pointer to 7 bytes where to store */ + dulint n) /* in: dulint integer to be stored */ +{ + ut_ad(b); + + mach_write_to_3(b, ut_dulint_get_high(n)); + mach_write_to_4(b + 3, ut_dulint_get_low(n)); +} + +/************************************************************ +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +dulint +mach_read_from_7( +/*=============*/ + /* out: dulint integer */ + const byte* b) /* in: pointer to 7 bytes */ +{ + ulint high; + ulint low; + + ut_ad(b); + + high = mach_read_from_3(b); + low = mach_read_from_4(b + 3); + + return(ut_dulint_create(high, low)); +} + +/*********************************************************** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /* in: pointer to 6 bytes where to store */ + dulint n) /* in: dulint integer to be stored */ +{ + ut_ad(b); + + mach_write_to_2(b, ut_dulint_get_high(n)); + mach_write_to_4(b + 2, ut_dulint_get_low(n)); +} + +/************************************************************ +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. */ +UNIV_INLINE +dulint +mach_read_from_6( +/*=============*/ + /* out: dulint integer */ + const byte* b) /* in: pointer to 6 bytes */ +{ + ulint high; + ulint low; + + ut_ad(b); + + high = mach_read_from_2(b); + low = mach_read_from_4(b + 2); + + return(ut_dulint_create(high, low)); +} + +/************************************************************* +Writes a dulint in a compressed form (5..9 bytes). */ +UNIV_INLINE +ulint +mach_dulint_write_compressed( +/*=========================*/ + /* out: size in bytes */ + byte* b, /* in: pointer to memory where to store */ + dulint n) /* in: dulint integer to be stored */ +{ + ulint size; + + ut_ad(b); + + size = mach_write_compressed(b, ut_dulint_get_high(n)); + mach_write_to_4(b + size, ut_dulint_get_low(n)); + + return(size + 4); +} + +/************************************************************* +Returns the size of a dulint when written in the compressed form. 
*/ +UNIV_INLINE +ulint +mach_dulint_get_compressed_size( +/*============================*/ + /* out: compressed size in bytes */ + dulint n) /* in: dulint integer to be stored */ +{ + return(4 + mach_get_compressed_size(ut_dulint_get_high(n))); +} + +/************************************************************* +Reads a dulint in a compressed form. */ +UNIV_INLINE +dulint +mach_dulint_read_compressed( +/*========================*/ + /* out: read dulint */ + const byte* b) /* in: pointer to memory from where to read */ +{ + ulint high; + ulint low; + ulint size; + + ut_ad(b); + + high = mach_read_compressed(b); + + size = mach_get_compressed_size(high); + + low = mach_read_from_4(b + size); + + return(ut_dulint_create(high, low)); +} + +/************************************************************* +Writes a dulint in a compressed form (1..11 bytes). */ +UNIV_INLINE +ulint +mach_dulint_write_much_compressed( +/*==============================*/ + /* out: size in bytes */ + byte* b, /* in: pointer to memory where to store */ + dulint n) /* in: dulint integer to be stored */ +{ + ulint size; + + ut_ad(b); + + if (ut_dulint_get_high(n) == 0) { + return(mach_write_compressed(b, ut_dulint_get_low(n))); + } + + *b = (byte)0xFF; + size = 1 + mach_write_compressed(b + 1, ut_dulint_get_high(n)); + + size += mach_write_compressed(b + size, ut_dulint_get_low(n)); + + return(size); +} + +/************************************************************* +Returns the size of a dulint when written in the compressed form. */ +UNIV_INLINE +ulint +mach_dulint_get_much_compressed_size( +/*=================================*/ + /* out: compressed size in bytes */ + dulint n) /* in: dulint integer to be stored */ +{ + if (0 == ut_dulint_get_high(n)) { + return(mach_get_compressed_size(ut_dulint_get_low(n))); + } + + return(1 + mach_get_compressed_size(ut_dulint_get_high(n)) + + mach_get_compressed_size(ut_dulint_get_low(n))); +} + +/************************************************************* +Reads a dulint in a compressed form. */ +UNIV_INLINE +dulint +mach_dulint_read_much_compressed( +/*=============================*/ + /* out: read dulint */ + const byte* b) /* in: pointer to memory from where to read */ +{ + ulint high; + ulint low; + ulint size; + + ut_ad(b); + + if (*b != (byte)0xFF) { + high = 0; + size = 0; + } else { + high = mach_read_compressed(b + 1); + + size = 1 + mach_get_compressed_size(high); + } + + low = mach_read_compressed(b + size); + + return(ut_dulint_create(high, low)); +} + +/************************************************************* +Reads a double. It is stored in a little-endian format. */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + /* out: double read */ + const byte* b) /* in: pointer to memory from where to read */ +{ + double d; + ulint i; + byte* ptr; + + ptr = (byte*)&d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(double) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/************************************************************* +Writes a double. It is stored in a little-endian format. 
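The "much compressed" routines above make a 64-bit value with a zero high word cost no more than a compressed 32-bit value; a standalone sketch of the size rule, using a plain high/low pair instead of dulint:

#include <assert.h>

/* mirrors mach_get_compressed_size() */
static unsigned
comp_size(unsigned long n)
{
	return(n < 0x80UL ? 1 : n < 0x4000UL ? 2 : n < 0x200000UL ? 3
	       : n < 0x10000000UL ? 4 : 5);
}

/* mirrors mach_dulint_get_much_compressed_size() */
static unsigned
much_comp_size(unsigned long high, unsigned long low)
{
	if (high == 0) {
		return(comp_size(low));		/* no 0xFF marker needed */
	}

	return(1 + comp_size(high) + comp_size(low));
}

int
main(void)
{
	assert(much_comp_size(0, 100) == 1);	/* one-byte form */
	assert(much_comp_size(1, 100) == 3);	/* 0xFF + 1 + 1 */
	return(0);
}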
*/ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /* in: pointer to memory where to write */ + double d) /* in: double */ +{ + ulint i; + byte* ptr; + + ptr = (byte*)&d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(double) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/************************************************************* +Reads a float. It is stored in a little-endian format. */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + /* out: float read */ + const byte* b) /* in: pointer to memory from where to read */ +{ + float d; + ulint i; + byte* ptr; + + ptr = (byte*)&d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(float) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/************************************************************* +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /* in: pointer to memory where to write */ + float d) /* in: float */ +{ + ulint i; + byte* ptr; + + ptr = (byte*)&d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(float) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/************************************************************* +Reads a ulint stored in the little-endian format. */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + /* out: unsigned long int */ + const byte* buf, /* in: from where to read */ + ulint buf_size) /* in: from how many bytes to read */ +{ + ulint n = 0; + const byte* ptr; + + ut_ad(buf_size <= sizeof(ulint)); + ut_ad(buf_size > 0); + + ptr = buf + buf_size; + + for (;;) { + ptr--; + + n = n << 8; + + n += (ulint)(*ptr); + + if (ptr == buf) { + break; + } + } + + return(n); +} + +/************************************************************* +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /* in: where to write */ + ulint dest_size, /* in: into how many bytes to write */ + ulint n) /* in: unsigned long int to write */ +{ + byte* end; + + ut_ad(dest_size <= sizeof(ulint)); + ut_ad(dest_size > 0); + + end = dest + dest_size; + + for (;;) { + *dest = (byte)(n & 0xFF); + + n = n >> 8; + + dest++; + + if (dest == end) { + break; + } + } + + ut_ad(n == 0); +} + +/************************************************************* +Reads a ulint stored in the little-endian format. */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + /* out: unsigned long int */ + const byte* buf) /* in: from where to read */ +{ + return((ulint)(*buf) + ((ulint)(*(buf + 1))) * 256); +} + +/************************************************************* +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /* in: where to write */ + ulint n) /* in: unsigned long int to write */ +{ + ut_ad(n < 256 * 256); + + *dest = (byte)(n & 0xFFUL); + + n = n >> 8; + dest++; + + *dest = (byte)(n & 0xFFUL); +} + +/************************************************************* +Convert integral type from storage byte order (big endian) to +host byte order. 
*/ +UNIV_INLINE +ullint +mach_read_int_type( +/*===============*/ + /* out: integer value */ + const byte* src, /* in: where to read from */ + ulint len, /* in: length of src */ + ibool unsigned_type) /* in: signed or unsigned flag */ +{ + /* XXX this can be optimized on big-endian machines */ + + ullint ret; + uint i; + + if (unsigned_type || (src[0] & 0x80)) { + + ret = 0x0000000000000000ULL; + } else { + + ret = 0xFFFFFFFFFFFFFF00ULL; + } + + if (unsigned_type) { + + ret |= src[0]; + } else { + + ret |= src[0] ^ 0x80; + } + + for (i = 1; i < len; i++) { + ret <<= 8; + ret |= src[i]; + } + + return(ret); +} diff --git a/storage/xtradb/include/mem0dbg.h b/storage/xtradb/include/mem0dbg.h new file mode 100644 index 00000000000..0568a595d06 --- /dev/null +++ b/storage/xtradb/include/mem0dbg.h @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The memory management: the debug code. This is not a compilation module, +but is included in mem0mem.*! + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +/* In the debug version each allocated field is surrounded with +check fields whose sizes are given below */ + +#ifdef UNIV_MEM_DEBUG +#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\ + UNIV_MEM_ALIGNMENT) +#define MEM_FIELD_TRAILER_SIZE sizeof(ulint) +#else +#define MEM_FIELD_HEADER_SIZE 0 +#endif + + +/* Space needed when allocating for a user a field of +length N. The space is allocated only in multiples of +UNIV_MEM_ALIGNMENT. In the debug version there are also +check fields at both ends of the field. */ +#ifdef UNIV_MEM_DEBUG +#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\ + + MEM_FIELD_TRAILER_SIZE, UNIV_MEM_ALIGNMENT) +#else +#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT) +#endif + +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG +/******************************************************************* +Checks a memory heap for consistency and prints the contents if requested. +Outputs the sum of sizes of buffers given to the user (only in +the debug version), the physical size of the heap and the number of +blocks in the heap. In case of error returns 0 as sizes and number +of blocks.
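Returning briefly to mach_read_int_type() above: signed column values are stored with the sign bit flipped, so that plain big-endian byte comparison orders negative values before positive ones, and the reader undoes the flip while sign-extending. A standalone check of the one-byte case in standard C:

#include <assert.h>

typedef unsigned long long	u64;	/* stand-in for ullint */

static long long
read_int1(unsigned char b, int is_unsigned)
{
	u64	ret;

	if (is_unsigned || (b & 0x80)) {
		ret = 0;			/* non-negative: no extension */
	} else {
		ret = 0xFFFFFFFFFFFFFF00ULL;	/* negative: sign-extend */
	}

	ret |= is_unsigned ? b : (unsigned char) (b ^ 0x80);

	return((long long) ret);
}

int
main(void)
{
	assert(read_int1(0x7F, 0) == -1);	/* stored 0x7F means -1 */
	assert(read_int1(0x81, 0) == +1);	/* stored 0x81 means +1 */
	assert(read_int1(0x80, 0) == 0);
	assert(read_int1(0xFF, 1) == 255);	/* unsigned passes through */
	return(0);
}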
*/ +UNIV_INTERN +void +mem_heap_validate_or_print( +/*=======================*/ + mem_heap_t* heap, /* in: memory heap */ + byte* top, /* in: calculate and validate only until + this top pointer in the heap is reached, + if this pointer is NULL, ignored */ + ibool print, /* in: if TRUE, prints the contents + of the heap; works only in + the debug version */ + ibool* error, /* out: TRUE if error */ + ulint* us_size,/* out: allocated memory + (for the user) in the heap, + if a NULL pointer is passed as this + argument, it is ignored; in the + non-debug version this is always -1 */ + ulint* ph_size,/* out: physical size of the heap, + if a NULL pointer is passed as this + argument, it is ignored */ + ulint* n_blocks); /* out: number of blocks in the heap, + if a NULL pointer is passed as this + argument, it is ignored */ +/****************************************************************** +Validates the contents of a memory heap. */ +UNIV_INTERN +ibool +mem_heap_validate( +/*==============*/ + /* out: TRUE if ok */ + mem_heap_t* heap); /* in: memory heap */ +#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */ +#ifdef UNIV_DEBUG +/****************************************************************** +Checks that an object is a memory heap (or a block of it). */ +UNIV_INTERN +ibool +mem_heap_check( +/*===========*/ + /* out: TRUE if ok */ + mem_heap_t* heap); /* in: memory heap */ +#endif /* UNIV_DEBUG */ +#ifdef UNIV_MEM_DEBUG +/********************************************************************* +TRUE if no memory is currently allocated. */ +UNIV_INTERN +ibool +mem_all_freed(void); +/*===============*/ + /* out: TRUE if no heaps exist */ +/********************************************************************* +Validates the dynamic memory */ +UNIV_INTERN +ibool +mem_validate_no_assert(void); +/*=========================*/ + /* out: TRUE if error */ +/**************************************************************** +Validates the dynamic memory */ +UNIV_INTERN +ibool +mem_validate(void); +/*===============*/ + /* out: TRUE if ok */ +#endif /* UNIV_MEM_DEBUG */ +/**************************************************************** +Tries to find neighboring memory allocation blocks and dumps to stderr +the neighborhood of a given pointer. */ +UNIV_INTERN +void +mem_analyze_corruption( +/*===================*/ + void* ptr); /* in: pointer to place of possible corruption */ +/********************************************************************* +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers. Can only be used in the debug version. */ +UNIV_INTERN +void +mem_print_info(void); +/*================*/ +/********************************************************************* +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers since the last ..._print_info or ..._print_new_info. */ +UNIV_INTERN +void +mem_print_new_info(void); +/*====================*/ diff --git a/storage/xtradb/include/mem0dbg.ic b/storage/xtradb/include/mem0dbg.ic new file mode 100644 index 00000000000..bf695fee785 --- /dev/null +++ b/storage/xtradb/include/mem0dbg.ic @@ -0,0 +1,109 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The memory management: the debug code. This is not an independent +compilation module but is included in mem0mem.*. + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef UNIV_MEM_DEBUG +extern mutex_t mem_hash_mutex; +extern ulint mem_current_allocated_memory; + +/********************************************************************** +Initializes an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_init( +/*===========*/ + byte* buf, /* in: memory field */ + ulint n); /* in: how many bytes the user requested */ +/********************************************************************** +Erases an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_erase( +/*============*/ + byte* buf, /* in: memory field */ + ulint n); /* in: how many bytes the user requested */ +/******************************************************************* +Initializes a buffer to a random combination of hex BA and BE. +Used to initialize allocated memory. */ +UNIV_INTERN +void +mem_init_buf( +/*=========*/ + byte* buf, /* in: pointer to buffer */ + ulint n); /* in: length of buffer */ +/******************************************************************* +Initializes a buffer to a random combination of hex DE and AD. +Used to erase freed memory.*/ +UNIV_INTERN +void +mem_erase_buf( +/*==========*/ + byte* buf, /* in: pointer to buffer */ + ulint n); /* in: length of buffer */ +/******************************************************************* +Inserts a created memory heap to the hash table of +current allocated memory heaps. +Initializes the hash table when first called. */ +UNIV_INTERN +void +mem_hash_insert( +/*============*/ + mem_heap_t* heap, /* in: the created heap */ + const char* file_name, /* in: file name of creation */ + ulint line); /* in: line where created */ +/******************************************************************* +Removes a memory heap (which is going to be freed by the caller) +from the list of live memory heaps. Returns the size of the heap +in terms of how much memory in bytes was allocated for the user of +the heap (not the total space occupied by the heap). +Also validates the heap. +NOTE: This function does not free the storage occupied by the +heap itself, only the node in the list of heaps. 
*/ +UNIV_INTERN +void +mem_hash_remove( +/*============*/ + mem_heap_t* heap, /* in: the heap to be freed */ + const char* file_name, /* in: file name of freeing */ + ulint line); /* in: line where freed */ + + +void +mem_field_header_set_len(byte* field, ulint len); + +ulint +mem_field_header_get_len(byte* field); + +void +mem_field_header_set_check(byte* field, ulint check); + +ulint +mem_field_header_get_check(byte* field); + +void +mem_field_trailer_set_check(byte* field, ulint check); + +ulint +mem_field_trailer_get_check(byte* field); +#endif /* UNIV_MEM_DEBUG */ diff --git a/storage/xtradb/include/mem0mem.h b/storage/xtradb/include/mem0mem.h new file mode 100644 index 00000000000..c20e7815001 --- /dev/null +++ b/storage/xtradb/include/mem0mem.h @@ -0,0 +1,406 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0mem_h +#define mem0mem_h + +#include "univ.i" +#include "ut0mem.h" +#include "ut0byte.h" +#include "ut0ut.h" +#include "ut0rnd.h" +#include "sync0sync.h" +#include "ut0lst.h" +#include "mach0data.h" + +/* -------------------- MEMORY HEAPS ----------------------------- */ + +/* The info structure stored at the beginning of a heap block */ +typedef struct mem_block_info_struct mem_block_info_t; + +/* A block of a memory heap consists of the info structure +followed by an area of memory */ +typedef mem_block_info_t mem_block_t; + +/* A memory heap is a nonempty linear list of memory blocks */ +typedef mem_block_t mem_heap_t; + +/* Types of allocation for memory heaps: DYNAMIC means allocation from the +dynamic memory pool of the C compiler, BUFFER means allocation from the +buffer pool; the latter method is used for very big heaps */ + +#define MEM_HEAP_DYNAMIC 0 /* the most common type */ +#define MEM_HEAP_BUFFER 1 +#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be + ORed to MEM_HEAP_BUFFER, in which + case heap->free_block is used in + some cases for memory allocations, + and if it's NULL, the memory + allocation functions can return + NULL. */ + +/* The following start size is used for the first block in the memory heap if +the size is not specified, i.e., 0 is given as the parameter in the call of +create. The standard size is the maximum (payload) size of the blocks used for +allocations of small buffers. */ + +#define MEM_BLOCK_START_SIZE 64 +#define MEM_BLOCK_STANDARD_SIZE \ + (UNIV_PAGE_SIZE >= 16384 ? 
8000 : MEM_MAX_ALLOC_IN_BUF) + +/* If a memory heap is allowed to grow into the buffer pool, the following +is the maximum size for a single allocated buffer: */ +#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200) + +/********************************************************************** +Initializes the memory system. */ +UNIV_INTERN +void +mem_init( +/*=====*/ + ulint size); /* in: common pool size in bytes */ +/****************************************************************** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +#define mem_heap_create(N) mem_heap_create_func(\ + (N), MEM_HEAP_DYNAMIC, __FILE__, __LINE__) +/****************************************************************** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +#define mem_heap_create_in_buffer(N) mem_heap_create_func(\ + (N), MEM_HEAP_BUFFER, __FILE__, __LINE__) +/****************************************************************** +Use this macro instead of the corresponding function! Macro for memory +heap creation. */ + +#define mem_heap_create_in_btr_search(N) mem_heap_create_func(\ + (N), MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER,\ + __FILE__, __LINE__) + +/****************************************************************** +Use this macro instead of the corresponding function! Macro for memory +heap freeing. */ + +#define mem_heap_free(heap) mem_heap_free_func(\ + (heap), __FILE__, __LINE__) +/********************************************************************* +NOTE: Use the corresponding macros instead of this function. Creates a +memory heap. For debugging purposes, takes also the file name and line as +arguments. */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( +/*=================*/ + /* out, own: memory heap, NULL if + did not succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps)*/ + ulint n, /* in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block */ + ulint type, /* in: heap type */ + const char* file_name, /* in: file name where created */ + ulint line); /* in: line where created */ +/********************************************************************* +NOTE: Use the corresponding macro instead of this function. Frees the space +occupied by a memory heap. In the debug version erases the heap memory +blocks. */ +UNIV_INLINE +void +mem_heap_free_func( +/*===============*/ + mem_heap_t* heap, /* in, own: heap to be freed */ + const char* file_name, /* in: file name where freed */ + ulint line); /* in: line where freed */ +/******************************************************************* +Allocates and zero-fills n bytes of memory from a memory heap. */ +UNIV_INLINE +void* +mem_heap_zalloc( +/*============*/ + /* out: allocated, zero-filled storage */ + mem_heap_t* heap, /* in: memory heap */ + ulint n); /* in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +/******************************************************************* +Allocates n bytes of memory from a memory heap. 
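+
+An illustrative usage sketch of the heap API declared in this header
+(not part of the original sources; the names and sizes are made up,
+and mem_heap_strdup() is declared further below):
+
+	mem_heap_t*	heap = mem_heap_create(1024);
+	byte*		buf = (byte*) mem_heap_alloc(heap, 100);
+	char*		name = mem_heap_strdup(heap, "t1");
+	mem_heap_free(heap);	(releases buf and name in one call)
+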
*/ +UNIV_INLINE +void* +mem_heap_alloc( +/*===========*/ + /* out: allocated storage, NULL if did not + succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps) */ + mem_heap_t* heap, /* in: memory heap */ + ulint n); /* in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +/********************************************************************* +Returns a pointer to the heap top. */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( +/*==================*/ + /* out: pointer to the heap top */ + mem_heap_t* heap); /* in: memory heap */ +/********************************************************************* +Frees the space in a memory heap exceeding the pointer given. The +pointer must have been acquired from mem_heap_get_heap_top. The first +memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_free_heap_top( +/*===================*/ + mem_heap_t* heap, /* in: heap from which to free */ + byte* old_top);/* in: pointer to old top of heap */ +/********************************************************************* +Empties a memory heap. The first memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_empty( +/*===========*/ + mem_heap_t* heap); /* in: heap to empty */ +/********************************************************************* +Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. */ +UNIV_INLINE +void* +mem_heap_get_top( +/*=============*/ + /* out: pointer to the topmost element */ + mem_heap_t* heap, /* in: memory heap */ + ulint n); /* in: size of the topmost element */ +/********************************************************************* +Frees the topmost element in a memory heap. +The size of the element must be given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /* in: memory heap */ + ulint n); /* in: size of the topmost element */ +/********************************************************************* +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap); /* in: heap */ +/****************************************************************** +Use this macro instead of the corresponding function! +Macro for memory buffer allocation */ + +#define mem_zalloc(N) memset(mem_alloc(N), 0, (N)); + +#define mem_alloc(N) mem_alloc_func((N), NULL, __FILE__, __LINE__) +#define mem_alloc2(N,S) mem_alloc_func((N), (S), __FILE__, __LINE__) +/******************************************************************* +NOTE: Use the corresponding macro instead of this function. +Allocates a single buffer of memory from the dynamic memory of +the C compiler. Is like malloc of C. The buffer must be freed +with mem_free. */ +UNIV_INLINE +void* +mem_alloc_func( +/*===========*/ + /* out, own: free storage */ + ulint n, /* in: requested size in bytes */ + ulint* size, /* out: allocated size in bytes, + or NULL */ + const char* file_name, /* in: file name where created */ + ulint line); /* in: line where created */ + +/****************************************************************** +Use this macro instead of the corresponding function! +Macro for memory buffer freeing */ + +#define mem_free(PTR) mem_free_func((PTR), __FILE__, __LINE__) +/******************************************************************* +NOTE: Use the corresponding macro instead of this function. +Frees a single buffer of storage from +the dynamic memory of C compiler. 
Similar to free of C. */ +UNIV_INLINE +void +mem_free_func( +/*==========*/ + void* ptr, /* in, own: buffer to be freed */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ +); + +/************************************************************************** +Duplicates a NUL-terminated string. */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + /* out, own: a copy of the string, + must be deallocated with mem_free */ + const char* str); /* in: string to be copied */ +/************************************************************************** +Makes a NUL-terminated copy of a nonterminated string. */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + /* out, own: a copy of the string, + must be deallocated with mem_free */ + const char* str, /* in: string to be copied */ + ulint len); /* in: length of str, in bytes */ + +/************************************************************************** +Duplicates a NUL-terminated string, allocated from a memory heap. */ +UNIV_INTERN +char* +mem_heap_strdup( +/*============*/ + /* out, own: a copy of the string */ + mem_heap_t* heap, /* in: memory heap where string is allocated */ + const char* str); /* in: string to be copied */ +/************************************************************************** +Makes a NUL-terminated copy of a nonterminated string, +allocated from a memory heap. */ +UNIV_INLINE +char* +mem_heap_strdupl( +/*=============*/ + /* out, own: a copy of the string */ + mem_heap_t* heap, /* in: memory heap where string is allocated */ + const char* str, /* in: string to be copied */ + ulint len); /* in: length of str, in bytes */ + +/************************************************************************** +Concatenate two strings and return the result, using a memory heap. */ +UNIV_INTERN +char* +mem_heap_strcat( +/*============*/ + /* out, own: the result */ + mem_heap_t* heap, /* in: memory heap where string is allocated */ + const char* s1, /* in: string 1 */ + const char* s2); /* in: string 2 */ + +/************************************************************************** +Duplicate a block of data, allocated from a memory heap. */ +UNIV_INTERN +void* +mem_heap_dup( +/*=========*/ + /* out, own: a copy of the data */ + mem_heap_t* heap, /* in: memory heap where copy is allocated */ + const void* data, /* in: data to be copied */ + ulint len); /* in: length of data, in bytes */ + +/************************************************************************** +Concatenate two memory blocks and return the result, using a memory heap. */ +UNIV_INTERN +void* +mem_heap_cat( +/*=========*/ + /* out, own: the result */ + mem_heap_t* heap, /* in: memory heap where result is allocated */ + const void* b1, /* in: block 1 */ + ulint len1, /* in: length of b1, in bytes */ + const void* b2, /* in: block 2 */ + ulint len2); /* in: length of b2, in bytes */ + +/******************************************************************** +A simple (s)printf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). */ +UNIV_INTERN +char* +mem_heap_printf( +/*============*/ + /* out: heap-allocated formatted string */ + mem_heap_t* heap, /* in: memory heap */ + const char* format, /* in: format string */ + ...) 
__attribute__ ((format (printf, 2, 3))); + +#ifdef MEM_PERIODIC_CHECK +/********************************************************************** +Goes through the list of all allocated mem blocks, checks their magic +numbers, and reports possible corruption. */ +UNIV_INTERN +void +mem_validate_all_blocks(void); +/*=========================*/ +#endif + +/*#######################################################################*/ + +/* The info header of a block in a memory heap */ + +struct mem_block_info_struct { + ulint magic_n;/* magic number for debugging */ + char file_name[8];/* file name where the mem heap was created */ + ulint line; /* line number where the mem heap was created */ + UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the + list this is the base node of the list of blocks; + in subsequent blocks this is undefined */ + UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next + and prev in the list. The first block allocated + to the heap is also the first block in this list, + though it also contains the base node of the list. */ + ulint len; /* physical length of this block in bytes */ + ulint type; /* type of heap: MEM_HEAP_DYNAMIC, or + MEM_HEAP_BUFFER possibly ORed to MEM_HEAP_BTR_SEARCH */ + ulint free; /* offset in bytes of the first free position for + user data in the block */ + ulint start; /* the value of the struct field 'free' at the + creation of the block */ + void* free_block; + /* if the MEM_HEAP_BTR_SEARCH bit is set in type, + and this is the heap root, this can contain an + allocated buffer frame, which can be appended as a + free block to the heap, if we need more space; + otherwise, this is NULL */ + void* buf_block; + /* if this block has been allocated from the buffer + pool, this contains the buf_block_t handle; + otherwise, this is NULL */ +#ifdef MEM_PERIODIC_CHECK + UT_LIST_NODE_T(mem_block_t) mem_block_list; + /* List of all mem blocks allocated; protected + by the mem_comm_pool mutex */ +#endif +}; + +#define MEM_BLOCK_MAGIC_N 764741555 +#define MEM_FREED_BLOCK_MAGIC_N 547711122 + +/* Header size for a memory heap block */ +#define MEM_BLOCK_HEADER_SIZE ut_calc_align(sizeof(mem_block_info_t),\ + UNIV_MEM_ALIGNMENT) +#include "mem0dbg.h" + +#ifndef UNIV_NONINL +#include "mem0mem.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mem0mem.ic b/storage/xtradb/include/mem0mem.ic new file mode 100644 index 00000000000..04b4234904a --- /dev/null +++ b/storage/xtradb/include/mem0mem.ic @@ -0,0 +1,646 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0dbg.ic" + +#include "mem0pool.h" + +/******************************************************************* +Creates a memory heap block where data can be allocated. */ +UNIV_INTERN +mem_block_t* +mem_heap_create_block( +/*==================*/ + /* out, own: memory heap block, NULL if + did not succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps) */ + mem_heap_t* heap, /* in: memory heap or NULL if first block + should be created */ + ulint n, /* in: number of bytes needed for user data */ + ulint type, /* in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ + const char* file_name,/* in: file name where created */ + ulint line); /* in: line where created */ +/********************************************************************** +Frees a block from a memory heap. */ +UNIV_INTERN +void +mem_heap_block_free( +/*================*/ + mem_heap_t* heap, /* in: heap */ + mem_block_t* block); /* in: block to free */ +/********************************************************************** +Frees the free_block field from a memory heap. */ +UNIV_INTERN +void +mem_heap_free_block_free( +/*=====================*/ + mem_heap_t* heap); /* in: heap */ +/******************************************************************* +Adds a new block to a memory heap. */ +UNIV_INTERN +mem_block_t* +mem_heap_add_block( +/*===============*/ + /* out: created block, NULL if did not + succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps)*/ + mem_heap_t* heap, /* in: memory heap */ + ulint n); /* in: number of bytes user needs */ + +UNIV_INLINE +void +mem_block_set_len(mem_block_t* block, ulint len) +{ + ut_ad(len > 0); + + block->len = len; +} + +UNIV_INLINE +ulint +mem_block_get_len(mem_block_t* block) +{ + return(block->len); +} + +UNIV_INLINE +void +mem_block_set_type(mem_block_t* block, ulint type) +{ + ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER) + || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH)); + + block->type = type; +} + +UNIV_INLINE +ulint +mem_block_get_type(mem_block_t* block) +{ + return(block->type); +} + +UNIV_INLINE +void +mem_block_set_free(mem_block_t* block, ulint free) +{ + ut_ad(free > 0); + ut_ad(free <= mem_block_get_len(block)); + + block->free = free; +} + +UNIV_INLINE +ulint +mem_block_get_free(mem_block_t* block) +{ + return(block->free); +} + +UNIV_INLINE +void +mem_block_set_start(mem_block_t* block, ulint start) +{ + ut_ad(start > 0); + + block->start = start; +} + +UNIV_INLINE +ulint +mem_block_get_start(mem_block_t* block) +{ + return(block->start); +} + +/******************************************************************* +Allocates and zero-fills n bytes of memory from a memory heap. 
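+
+As the function body below shows, this is mem_heap_alloc() followed by
+memset(); the debug assertion rejects MEM_HEAP_BTR_SEARCH heaps, for
+which mem_heap_alloc() may return NULL and would have to be checked
+before any memset(). An illustrative call (names made up):
+
+	ulint*	counters = (ulint*) mem_heap_zalloc(heap,
+						    8 * sizeof(ulint));
+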
*/ +UNIV_INLINE +void* +mem_heap_zalloc( +/*============*/ + /* out: allocated, zero-filled storage */ + mem_heap_t* heap, /* in: memory heap */ + ulint n) /* in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +{ + ut_ad(heap); + ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH)); + return(memset(mem_heap_alloc(heap, n), 0, n)); +} + +/******************************************************************* +Allocates n bytes of memory from a memory heap. */ +UNIV_INLINE +void* +mem_heap_alloc( +/*===========*/ + /* out: allocated storage, NULL if did not + succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps) */ + mem_heap_t* heap, /* in: memory heap */ + ulint n) /* in: number of bytes; if the heap is allowed + to grow into the buffer pool, this must be + <= MEM_MAX_ALLOC_IN_BUF */ +{ + mem_block_t* block; + void* buf; + ulint free; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF)); + + /* Check if there is enough space in block. If not, create a new + block to the heap */ + + if (mem_block_get_len(block) + < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) { + + block = mem_heap_add_block(heap, n); + + if (block == NULL) { + + return(NULL); + } + } + + free = mem_block_get_free(block); + + buf = (byte*)block + free; + + mem_block_set_free(block, free + MEM_SPACE_NEEDED(n)); + +#ifdef UNIV_MEM_DEBUG + UNIV_MEM_ALLOC(buf, + n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE); + + /* In the debug version write debugging info to the field */ + mem_field_init((byte*)buf, n); + + /* Advance buf to point at the storage which will be given to the + caller */ + buf = (byte*)buf + MEM_FIELD_HEADER_SIZE; + +#endif +#ifdef UNIV_SET_MEM_TO_ZERO + UNIV_MEM_ALLOC(buf, n); + memset(buf, '\0', n); +#endif + UNIV_MEM_ALLOC(buf, n); + return(buf); +} + +/********************************************************************* +Returns a pointer to the heap top. */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( +/*==================*/ + /* out: pointer to the heap top */ + mem_heap_t* heap) /* in: memory heap */ +{ + mem_block_t* block; + byte* buf; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*)block + mem_block_get_free(block); + + return(buf); +} + +/********************************************************************* +Frees the space in a memory heap exceeding the pointer given. The +pointer must have been acquired from mem_heap_get_heap_top. The first +memory block of the heap is not freed. 
*/ +UNIV_INLINE +void +mem_heap_free_heap_top( +/*===================*/ + mem_heap_t* heap, /* in: heap from which to free */ + byte* old_top)/* in: pointer to old top of heap */ +{ + mem_block_t* block; + mem_block_t* prev_block; +#ifdef UNIV_MEM_DEBUG + ibool error; + ulint total_size; + ulint size; +#endif + + ut_ad(mem_heap_check(heap)); + +#ifdef UNIV_MEM_DEBUG + + /* Validate the heap and get its total allocated size */ + mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size, + NULL, NULL); + ut_a(!error); + + /* Get the size below top pointer */ + mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL, + NULL); + ut_a(!error); + +#endif + + block = UT_LIST_GET_LAST(heap->base); + + while (block != NULL) { + if (((byte*)block + mem_block_get_free(block) >= old_top) + && ((byte*)block <= old_top)) { + /* Found the right block */ + + break; + } + + /* Store prev_block value before freeing the current block + (the current block will be erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } + + ut_ad(block); + + /* Set the free field of block */ + mem_block_set_free(block, old_top - (byte*)block); + +#ifdef UNIV_MEM_DEBUG + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + + /* In the debug version erase block from top up */ + mem_erase_buf(old_top, (byte*)block + block->len - old_top); + + /* Update allocated memory count */ + mutex_enter(&mem_hash_mutex); + mem_current_allocated_memory -= (total_size - size); + mutex_exit(&mem_hash_mutex); +#else /* UNIV_MEM_DEBUG */ + UNIV_MEM_ASSERT_W(old_top, (byte*)block + block->len - old_top); +#endif /* UNIV_MEM_DEBUG */ + UNIV_MEM_ALLOC(old_top, (byte*)block + block->len - old_top); + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } +} + +/********************************************************************* +Empties a memory heap. The first memory block of the heap is not freed. */ +UNIV_INLINE +void +mem_heap_empty( +/*===========*/ + mem_heap_t* heap) /* in: heap to empty */ +{ + mem_heap_free_heap_top(heap, (byte*)heap + mem_block_get_start(heap)); + + if (heap->free_block) { + mem_heap_free_block_free(heap); + } +} + +/********************************************************************* +Returns a pointer to the topmost element in a memory heap. The size of the +element must be given. */ +UNIV_INLINE +void* +mem_heap_get_top( +/*=============*/ + /* out: pointer to the topmost element */ + mem_heap_t* heap, /* in: memory heap */ + ulint n) /* in: size of the topmost element */ +{ + mem_block_t* block; + void* buf; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*)block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n); + +#ifdef UNIV_MEM_DEBUG + ut_ad(mem_block_get_start(block) <=(ulint)((byte*)buf - (byte*)block)); + + /* In the debug version, advance buf to point at the storage which + was given to the caller in the allocation*/ + + buf = (byte*)buf + MEM_FIELD_HEADER_SIZE; + + /* Check that the field lengths agree */ + ut_ad(n == (ulint)mem_field_header_get_len(buf)); +#endif + + return(buf); +} + +/********************************************************************* +Frees the topmost element in a memory heap. The size of the element must be +given. 
*/ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /* in: memory heap */ + ulint n) /* in: size of the topmost element */ +{ + mem_block_t* block; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + /* Subtract the free field of block */ + mem_block_set_free(block, mem_block_get_free(block) + - MEM_SPACE_NEEDED(n)); + UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n); +#ifdef UNIV_MEM_DEBUG + + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + + /* In the debug version check the consistency, and erase field */ + mem_field_erase((byte*)block + mem_block_get_free(block), n); +#endif + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } else { + /* Avoid a bogus UNIV_MEM_ASSERT_W() warning in a + subsequent invocation of mem_heap_free_top(). + Originally, this was UNIV_MEM_FREE(), to catch writes + to freed memory. */ + UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n); + } +} + +/********************************************************************* +NOTE: Use the corresponding macros instead of this function. Creates a +memory heap. For debugging purposes, takes also the file name and line as +argument. */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( +/*=================*/ + /* out, own: memory heap, NULL if + did not succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps)*/ + ulint n, /* in: desired start block size, + this means that a single user buffer + of size n will fit in the block, + 0 creates a default size block */ + ulint type, /* in: heap type */ + const char* file_name, /* in: file name where created */ + ulint line) /* in: line where created */ +{ + mem_block_t* block; + + if (!n) { + n = MEM_BLOCK_START_SIZE; + } + + block = mem_heap_create_block(NULL, n, type, file_name, line); + + if (block == NULL) { + + return(NULL); + } + + UT_LIST_INIT(block->base); + + /* Add the created block itself as the first block in the list */ + UT_LIST_ADD_FIRST(list, block->base, block); + +#ifdef UNIV_MEM_DEBUG + + mem_hash_insert(block, file_name, line); + +#endif + + return(block); +} + +/********************************************************************* +NOTE: Use the corresponding macro instead of this function. Frees the space +occupied by a memory heap. In the debug version erases the heap memory +blocks. */ +UNIV_INLINE +void +mem_heap_free_func( +/*===============*/ + mem_heap_t* heap, /* in, own: heap to be freed */ + const char* file_name __attribute__((unused)), + /* in: file name where freed */ + ulint line __attribute__((unused))) +{ + mem_block_t* block; + mem_block_t* prev_block; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + +#ifdef UNIV_MEM_DEBUG + + /* In the debug version remove the heap from the hash table of heaps + and check its consistency */ + + mem_hash_remove(heap, file_name, line); + +#endif + + if (heap->free_block) { + mem_heap_free_block_free(heap); + } + + while (block != NULL) { + /* Store the contents of info before freeing current block + (it is erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } +} + +/******************************************************************* +NOTE: Use the corresponding macro instead of this function. 
+Allocates a single buffer of memory from the dynamic memory of +the C compiler. Is like malloc of C. The buffer must be freed +with mem_free. */ +UNIV_INLINE +void* +mem_alloc_func( +/*===========*/ + /* out, own: free storage */ + ulint n, /* in: desired number of bytes */ + ulint* size, /* out: allocated size in bytes, + or NULL */ + const char* file_name, /* in: file name where created */ + ulint line) /* in: line where created */ +{ + mem_heap_t* heap; + void* buf; + + heap = mem_heap_create_func(n, MEM_HEAP_DYNAMIC, file_name, line); + + /* Note that as we created the first block in the heap big enough + for the buffer requested by the caller, the buffer will be in the + first block and thus we can calculate the pointer to the heap from + the pointer to the buffer when we free the memory buffer. */ + + if (UNIV_LIKELY_NULL(size)) { + /* Adjust the allocation to the actual size of the + memory block. */ + ulint m = mem_block_get_len(heap) + - mem_block_get_free(heap); +#ifdef UNIV_MEM_DEBUG + m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE; +#endif /* UNIV_MEM_DEBUG */ + ut_ad(m >= n); + *size = n = m; + } + + buf = mem_heap_alloc(heap, n); + + ut_a((byte*)heap == (byte*)buf - MEM_BLOCK_HEADER_SIZE + - MEM_FIELD_HEADER_SIZE); + return(buf); +} + +/******************************************************************* +NOTE: Use the corresponding macro instead of this function. Frees a single +buffer of storage from the dynamic memory of the C compiler. Similar to the +free of C. */ +UNIV_INLINE +void +mem_free_func( +/*==========*/ + void* ptr, /* in, own: buffer to be freed */ + const char* file_name, /* in: file name where created */ + ulint line /* in: line where created */ + ) +{ + mem_heap_t* heap; + + heap = (mem_heap_t*)((byte*)ptr - MEM_BLOCK_HEADER_SIZE + - MEM_FIELD_HEADER_SIZE); + mem_heap_free_func(heap, file_name, line); +} + +/********************************************************************* +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap) /* in: heap */ +{ + mem_block_t* block; + ulint size = 0; + + ut_ad(mem_heap_check(heap)); + + block = heap; + + while (block != NULL) { + + size += mem_block_get_len(block); + block = UT_LIST_GET_NEXT(list, block); + } + + if (heap->free_block) { + size += UNIV_PAGE_SIZE; + } + + return(size); +} + +/************************************************************************** +Duplicates a NUL-terminated string. */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + /* out, own: a copy of the string, + must be deallocated with mem_free */ + const char* str) /* in: string to be copied */ +{ + ulint len = strlen(str) + 1; + return((char*) memcpy(mem_alloc(len), str, len)); +} + +/************************************************************************** +Makes a NUL-terminated copy of a nonterminated string. */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + /* out, own: a copy of the string, + must be deallocated with mem_free */ + const char* str, /* in: string to be copied */ + ulint len) /* in: length of str, in bytes */ +{ + char* s = (char*) mem_alloc(len + 1); + s[len] = 0; + return((char*) memcpy(s, str, len)); +} + +/************************************************************************** +Makes a NUL-terminated copy of a nonterminated string, +allocated from a memory heap. 
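+
+For example (illustrative only), copying a 3-byte prefix of a longer
+string yields a NUL-terminated result:
+
+	char*	s = mem_heap_strdupl(heap, "abcdef", 3);
+	ut_ad(strlen(s) == 3);
+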
*/ +UNIV_INLINE +char* +mem_heap_strdupl( +/*=============*/ + /* out, own: a copy of the string */ + mem_heap_t* heap, /* in: memory heap where string is allocated */ + const char* str, /* in: string to be copied */ + ulint len) /* in: length of str, in bytes */ +{ + char* s = (char*) mem_heap_alloc(heap, len + 1); + s[len] = 0; + return((char*) memcpy(s, str, len)); +} diff --git a/storage/xtradb/include/mem0pool.h b/storage/xtradb/include/mem0pool.h new file mode 100644 index 00000000000..7e51b07bfe0 --- /dev/null +++ b/storage/xtradb/include/mem0pool.h @@ -0,0 +1,126 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The lowest-level memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0pool_h +#define mem0pool_h + +#include "univ.i" +#include "os0file.h" +#include "ut0lst.h" + +typedef struct mem_area_struct mem_area_t; +typedef struct mem_pool_struct mem_pool_t; + +/* The common memory pool */ +extern mem_pool_t* mem_comm_pool; + +/* Memory area header */ + +struct mem_area_struct{ + ulint size_and_free; /* memory area size is obtained by + anding with ~MEM_AREA_FREE; area in + a free list if ANDing with + MEM_AREA_FREE results in nonzero */ + UT_LIST_NODE_T(mem_area_t) + free_list; /* free list node */ +}; + +/* Each memory area takes this many extra bytes for control information */ +#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_struct),\ + UNIV_MEM_ALIGNMENT)) + +/************************************************************************ +Creates a memory pool. */ +UNIV_INTERN +mem_pool_t* +mem_pool_create( +/*============*/ + /* out: memory pool */ + ulint size); /* in: pool size in bytes */ +/************************************************************************ +Allocates memory from a pool. NOTE: This low-level function should only be +used in mem0mem.*! */ +UNIV_INTERN +void* +mem_area_alloc( +/*===========*/ + /* out, own: allocated memory buffer */ + ulint* psize, /* in: requested size in bytes; for optimum + space usage, the size should be a power of 2 + minus MEM_AREA_EXTRA_SIZE; + out: allocated size in bytes (greater than + or equal to the requested size) */ + mem_pool_t* pool); /* in: memory pool */ +/************************************************************************ +Frees memory to a pool. */ +UNIV_INTERN +void +mem_area_free( +/*==========*/ + void* ptr, /* in, own: pointer to allocated memory + buffer */ + mem_pool_t* pool); /* in: memory pool */ +/************************************************************************ +Returns the amount of reserved memory. 
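+
+An illustrative sketch of how the reserved amount relates to the area
+functions declared above (the size is made up; note that
+mem_area_alloc() may round the request up and returns the actual
+size through its psize argument):
+
+	ulint	size = 1024 - MEM_AREA_EXTRA_SIZE;
+	void*	p = mem_area_alloc(&size, mem_comm_pool);
+	ulint	reserved = mem_pool_get_reserved(mem_comm_pool);
+	(... reserved now includes the area just allocated ...)
+	mem_area_free(p, mem_comm_pool);
+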
*/ +UNIV_INTERN +ulint +mem_pool_get_reserved( +/*==================*/ + /* out: reserved memory in bytes */ + mem_pool_t* pool); /* in: memory pool */ +/************************************************************************ +Reserves the mem pool mutex. */ +UNIV_INTERN +void +mem_pool_mutex_enter(void); +/*======================*/ +/************************************************************************ +Releases the mem pool mutex. */ +UNIV_INTERN +void +mem_pool_mutex_exit(void); +/*=====================*/ +/************************************************************************ +Validates a memory pool. */ +UNIV_INTERN +ibool +mem_pool_validate( +/*==============*/ + /* out: TRUE if ok */ + mem_pool_t* pool); /* in: memory pool */ +/************************************************************************ +Prints info of a memory pool. */ +UNIV_INTERN +void +mem_pool_print_info( +/*================*/ + FILE* outfile,/* in: output file to write to */ + mem_pool_t* pool); /* in: memory pool */ + + +#ifndef UNIV_NONINL +#include "mem0pool.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mem0pool.ic b/storage/xtradb/include/mem0pool.ic new file mode 100644 index 00000000000..4cc65e754ce --- /dev/null +++ b/storage/xtradb/include/mem0pool.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The lowest-level memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ diff --git a/storage/xtradb/include/mtr0log.h b/storage/xtradb/include/mtr0log.h new file mode 100644 index 00000000000..44374cdf1a4 --- /dev/null +++ b/storage/xtradb/include/mtr0log.h @@ -0,0 +1,247 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mini-transaction logging routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0log_h +#define mtr0log_h + +#include "univ.i" +#include "mtr0mtr.h" +#include "dict0types.h" + +/************************************************************ +Writes 1 - 4 bytes to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_write_ulint( +/*=============*/ + byte* ptr, /* in: pointer where to write */ + ulint val, /* in: value to write */ + byte type, /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Writes 8 bytes to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_write_dulint( +/*==============*/ + byte* ptr, /* in: pointer where to write */ + dulint val, /* in: value to write */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Writes a string to a file page buffered in the buffer pool. Writes the +corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_write_string( +/*==============*/ + byte* ptr, /* in: pointer where to write */ + const byte* str, /* in: string to write */ + ulint len, /* in: string length */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Logs a write of a string to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_log_string( +/*============*/ + byte* ptr, /* in: pointer written to */ + ulint len, /* in: string length */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Writes initial part of a log record consisting of one-byte item +type and four-byte space and page numbers. */ +UNIV_INTERN +void +mlog_write_initial_log_record( +/*==========================*/ + const byte* ptr, /* in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /* in: log item type: MLOG_1BYTE, ... */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Writes a log record about an .ibd file create/delete/rename. */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + /* out: new value of log_ptr */ + ulint type, /* in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/* in: space id, if applicable */ + ulint page_no,/* in: page number (not relevant currently) */ + byte* log_ptr,/* in: pointer to mtr log which has been opened */ + mtr_t* mtr); /* in: mtr */ +/************************************************************ +Catenates 1 - 4 bytes to the mtr log. 
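+
+An illustrative sketch of building a multi-part log record with the
+catenate and open/close routines declared in this header (the log
+record type and the locals ptr, rec and len are examples only, not
+a prescribed call sequence):
+
+	byte*	log_ptr = mlog_open(mtr, 11 + 2 + len);
+	if (log_ptr) {
+		log_ptr = mlog_write_initial_log_record_fast(
+			ptr, MLOG_UNDO_INSERT, log_ptr, mtr);
+		mlog_close(mtr, log_ptr);
+		mlog_catenate_ulint(mtr, len, MLOG_2BYTES);
+		mlog_catenate_string(mtr, rec, len);
+	}
+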
*/ +UNIV_INLINE +void +mlog_catenate_ulint( +/*================*/ + mtr_t* mtr, /* in: mtr */ + ulint val, /* in: value to write */ + ulint type); /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ +/************************************************************ +Catenates n bytes to the mtr log. */ +UNIV_INTERN +void +mlog_catenate_string( +/*=================*/ + mtr_t* mtr, /* in: mtr */ + const byte* str, /* in: string to write */ + ulint len); /* in: string length */ +/************************************************************ +Catenates a compressed ulint to mlog. */ +UNIV_INLINE +void +mlog_catenate_ulint_compressed( +/*===========================*/ + mtr_t* mtr, /* in: mtr */ + ulint val); /* in: value to write */ +/************************************************************ +Catenates a compressed dulint to mlog. */ +UNIV_INLINE +void +mlog_catenate_dulint_compressed( +/*============================*/ + mtr_t* mtr, /* in: mtr */ + dulint val); /* in: value to write */ +/************************************************************ +Opens a buffer to mlog. It must be closed with mlog_close. */ +UNIV_INLINE +byte* +mlog_open( +/*======*/ + /* out: buffer, NULL if log mode MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + ulint size); /* in: buffer size in bytes; MUST be + smaller than DYN_ARRAY_DATA_SIZE! */ +/************************************************************ +Closes a buffer opened to mlog. */ +UNIV_INLINE +void +mlog_close( +/*=======*/ + mtr_t* mtr, /* in: mtr */ + byte* ptr); /* in: buffer space from ptr up was not used */ +/************************************************************ +Writes the initial part of a log record (3..11 bytes). +If the implementation of this function is changed, all +size parameters to mlog_open() should be adjusted accordingly! */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_fast( +/*===============================*/ + /* out: new value of log_ptr */ + const byte* ptr, /* in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /* in: log item type: MLOG_1BYTE, ... */ + byte* log_ptr,/* in: pointer to mtr log which has + been opened */ + mtr_t* mtr); /* in: mtr */ +/************************************************************ +Parses an initial log record written by mlog_write_initial_log_record. */ +UNIV_INTERN +byte* +mlog_parse_initial_log_record( +/*==========================*/ + /* out: parsed record end, NULL if not a complete + record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + byte* type, /* out: log record type: MLOG_1BYTE, ... */ + ulint* space, /* out: space id */ + ulint* page_no);/* out: page number */ +/************************************************************ +Parses a log record written by mlog_write_ulint or mlog_write_dulint. */ +UNIV_INTERN +byte* +mlog_parse_nbytes( +/*==============*/ + /* out: parsed record end, NULL if not a complete + record */ + ulint type, /* in: log record type: MLOG_1BYTE, ... */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + byte* page, /* in: page where to apply the log record, or NULL */ + void* page_zip);/* in/out: compressed page, or NULL */ +/************************************************************ +Parses a log record written by mlog_write_string. 
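+
+Like the other parse functions declared here, it returns NULL when
+the record is not complete in the buffer; an illustrative
+recovery-side sketch:
+
+	ptr = mlog_parse_string(ptr, end_ptr, page, page_zip);
+	if (ptr == NULL) {
+		(the record continues past end_ptr; wait for more log)
+	}
+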
*/ +UNIV_INTERN +byte* +mlog_parse_string( +/*==============*/ + /* out: parsed record end, NULL if not a complete + record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + byte* page, /* in: page where to apply the log record, or NULL */ + void* page_zip);/* in/out: compressed page, or NULL */ + + +/************************************************************ +Opens a buffer for mlog, writes the initial log record and, +if needed, the field lengths of an index. Reserves space +for further log entries. The log entry must be closed with +mtr_close(). */ +UNIV_INTERN +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size); /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. */ +UNIV_INTERN +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index); /* out, own: dummy index */ + +/* Insert, update, and maybe other functions may use this value to define an +extra mlog buffer size for variable size data */ +#define MLOG_BUF_MARGIN 256 + +#ifndef UNIV_NONINL +#include "mtr0log.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mtr0log.ic b/storage/xtradb/include/mtr0log.ic new file mode 100644 index 00000000000..5f05befb9cc --- /dev/null +++ b/storage/xtradb/include/mtr0log.ic @@ -0,0 +1,247 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mini-transaction logging routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "ut0lst.h" +#include "buf0buf.h" + +/************************************************************ +Opens a buffer to mlog. It must be closed with mlog_close. */ +UNIV_INLINE +byte* +mlog_open( +/*======*/ + /* out: buffer, NULL if log mode MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + ulint size) /* in: buffer size in bytes; MUST be + smaller than DYN_ARRAY_DATA_SIZE! 
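+			For example (illustrative only), a caller
+			writing just the initial part of a log record
+			would request
+
+				log_ptr = mlog_open(mtr, 11);
+
+			since mlog_write_initial_log_record_fast()
+			produces at most 11 bytes.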
*/ +{ + dyn_array_t* mlog; + + mtr->modifications = TRUE; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return(NULL); + } + + mlog = &(mtr->log); + + return(dyn_array_open(mlog, size)); +} + +/************************************************************ +Closes a buffer opened to mlog. */ +UNIV_INLINE +void +mlog_close( +/*=======*/ + mtr_t* mtr, /* in: mtr */ + byte* ptr) /* in: buffer space from ptr up was not used */ +{ + dyn_array_t* mlog; + + ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE); + + mlog = &(mtr->log); + + dyn_array_close(mlog, ptr); +} + +/************************************************************ +Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */ +UNIV_INLINE +void +mlog_catenate_ulint( +/*================*/ + mtr_t* mtr, /* in: mtr */ + ulint val, /* in: value to write */ + ulint type) /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ +{ + dyn_array_t* mlog; + byte* ptr; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return; + } + + mlog = &(mtr->log); + +#if MLOG_1BYTE != 1 +# error "MLOG_1BYTE != 1" +#endif +#if MLOG_2BYTES != 2 +# error "MLOG_2BYTES != 2" +#endif +#if MLOG_4BYTES != 4 +# error "MLOG_4BYTES != 4" +#endif +#if MLOG_8BYTES != 8 +# error "MLOG_8BYTES != 8" +#endif + ptr = (byte*) dyn_array_push(mlog, type); + + if (type == MLOG_4BYTES) { + mach_write_to_4(ptr, val); + } else if (type == MLOG_2BYTES) { + mach_write_to_2(ptr, val); + } else { + ut_ad(type == MLOG_1BYTE); + mach_write_to_1(ptr, val); + } +} + +/************************************************************ +Catenates a compressed ulint to mlog. */ +UNIV_INLINE +void +mlog_catenate_ulint_compressed( +/*===========================*/ + mtr_t* mtr, /* in: mtr */ + ulint val) /* in: value to write */ +{ + byte* log_ptr; + + log_ptr = mlog_open(mtr, 10); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr += mach_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); +} + +/************************************************************ +Catenates a compressed dulint to mlog. */ +UNIV_INLINE +void +mlog_catenate_dulint_compressed( +/*============================*/ + mtr_t* mtr, /* in: mtr */ + dulint val) /* in: value to write */ +{ + byte* log_ptr; + + log_ptr = mlog_open(mtr, 15); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr += mach_dulint_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); +} + +/************************************************************ +Writes the initial part of a log record (3..11 bytes). +If the implementation of this function is changed, all +size parameters to mlog_open() should be adjusted accordingly! */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_fast( +/*===============================*/ + /* out: new value of log_ptr */ + const byte* ptr, /* in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /* in: log item type: MLOG_1BYTE, ... 
*/ + byte* log_ptr,/* in: pointer to mtr log which has + been opened */ + mtr_t* mtr) /* in: mtr */ +{ +#ifdef UNIV_DEBUG + buf_block_t* block; +#endif + const byte* page; + ulint space; + ulint offset; + + ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX)); + ut_ad(type <= MLOG_BIGGEST_TYPE); + ut_ad(ptr && log_ptr); + + page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE); + space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + offset = mach_read_from_4(page + FIL_PAGE_OFFSET); + + mach_write_to_1(log_ptr, type); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, space); + log_ptr += mach_write_compressed(log_ptr, offset); + + mtr->n_log_recs++; + +#ifdef UNIV_LOG_DEBUG + fprintf(stderr, + "Adding to mtr log record type %lu space %lu page no %lu\n", + (ulong) type, space, offset); +#endif + +#ifdef UNIV_DEBUG + /* We now assume that all x-latched pages have been modified! */ + block = (buf_block_t*) buf_block_align(ptr); + + if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) { + + mtr_memo_push(mtr, block, MTR_MEMO_MODIFY); + } +#endif + return(log_ptr); +} + +/************************************************************ +Writes a log record about an .ibd file create/delete/rename. */ +UNIV_INLINE +byte* +mlog_write_initial_log_record_for_file_op( +/*======================================*/ + /* out: new value of log_ptr */ + ulint type, /* in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id,/* in: space id, if applicable */ + ulint page_no,/* in: page number (not relevant currently) */ + byte* log_ptr,/* in: pointer to mtr log which has been opened */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(log_ptr); + + mach_write_to_1(log_ptr, type); + log_ptr++; + + /* We write dummy space id and page number */ + log_ptr += mach_write_compressed(log_ptr, space_id); + log_ptr += mach_write_compressed(log_ptr, page_no); + + mtr->n_log_recs++; + + return(log_ptr); +} diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h new file mode 100644 index 00000000000..a29f6c73141 --- /dev/null +++ b/storage/xtradb/include/mtr0mtr.h @@ -0,0 +1,380 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0mtr_h +#define mtr0mtr_h + +#include "univ.i" +#include "mem0mem.h" +#include "dyn0dyn.h" +#include "buf0types.h" +#include "sync0rw.h" +#include "ut0byte.h" +#include "mtr0types.h" +#include "page0types.h" + +/* Logging modes for a mini-transaction */ +#define MTR_LOG_ALL 21 /* default mode: log all operations + modifying disk-based data */ +#define MTR_LOG_NONE 22 /* log no operations */ +/*#define MTR_LOG_SPACE 23 */ /* log only operations modifying + file space page allocation data + (operations in fsp0fsp.* ) */ +#define MTR_LOG_SHORT_INSERTS 24 /* inserts are logged in a shorter + form */ + +/* Types for the mlock objects to store in the mtr memo; NOTE that the +first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ +#define MTR_MEMO_PAGE_S_FIX RW_S_LATCH +#define MTR_MEMO_PAGE_X_FIX RW_X_LATCH +#define MTR_MEMO_BUF_FIX RW_NO_LATCH +#define MTR_MEMO_MODIFY 54 +#define MTR_MEMO_S_LOCK 55 +#define MTR_MEMO_X_LOCK 56 + +/* Log item types: we have made them to be of the type 'byte' +for the compiler to warn if val and type parameters are switched +in a call to mlog_write_ulint. NOTE! For 1 - 8 bytes, the +flag value must give the length also! */ +#define MLOG_SINGLE_REC_FLAG 128 /* if the mtr contains only + one log record for one page, + i.e., write_initial_log_record + has been called only once, + this flag is ORed to the type + of that first log record */ +#define MLOG_1BYTE (1) /* one byte is written */ +#define MLOG_2BYTES (2) /* 2 bytes ... */ +#define MLOG_4BYTES (4) /* 4 bytes ... */ +#define MLOG_8BYTES (8) /* 8 bytes ... 
*/ +#define MLOG_REC_INSERT ((byte)9) /* record insert */ +#define MLOG_REC_CLUST_DELETE_MARK ((byte)10) /* mark clustered index record + deleted */ +#define MLOG_REC_SEC_DELETE_MARK ((byte)11) /* mark secondary index record + deleted */ +#define MLOG_REC_UPDATE_IN_PLACE ((byte)13) /* update of a record, + preserves record field sizes */ +#define MLOG_REC_DELETE ((byte)14) /* delete a record from a + page */ +#define MLOG_LIST_END_DELETE ((byte)15) /* delete record list end on + index page */ +#define MLOG_LIST_START_DELETE ((byte)16) /* delete record list start on + index page */ +#define MLOG_LIST_END_COPY_CREATED ((byte)17) /* copy record list end to a + new created index page */ +#define MLOG_PAGE_REORGANIZE ((byte)18) /* reorganize an index page */ +#define MLOG_PAGE_CREATE ((byte)19) /* create an index page */ +#define MLOG_UNDO_INSERT ((byte)20) /* insert entry in an undo + log */ +#define MLOG_UNDO_ERASE_END ((byte)21) /* erase an undo log + page end */ +#define MLOG_UNDO_INIT ((byte)22) /* initialize a page in an + undo log */ +#define MLOG_UNDO_HDR_DISCARD ((byte)23) /* discard an update undo log + header */ +#define MLOG_UNDO_HDR_REUSE ((byte)24) /* reuse an insert undo log + header */ +#define MLOG_UNDO_HDR_CREATE ((byte)25) /* create an undo log header */ +#define MLOG_REC_MIN_MARK ((byte)26) /* mark an index record as the + predefined minimum record */ +#define MLOG_IBUF_BITMAP_INIT ((byte)27) /* initialize an ibuf bitmap + page */ +/*#define MLOG_FULL_PAGE ((byte)28) full contents of a page */ +#define MLOG_INIT_FILE_PAGE ((byte)29) /* this means that a file page + is taken into use and the prior + contents of the page should be + ignored: in recovery we must + not trust the lsn values stored + to the file page */ +#define MLOG_WRITE_STRING ((byte)30) /* write a string to a page */ +#define MLOG_MULTI_REC_END ((byte)31) /* if a single mtr writes + log records for several pages, + this log record ends the + sequence of these records */ +#define MLOG_DUMMY_RECORD ((byte)32) /* dummy log record used to + pad a log block full */ +#define MLOG_FILE_CREATE ((byte)33) /* log record about an .ibd + file creation */ +#define MLOG_FILE_RENAME ((byte)34) /* log record about an .ibd + file rename */ +#define MLOG_FILE_DELETE ((byte)35) /* log record about an .ibd + file deletion */ +#define MLOG_COMP_REC_MIN_MARK ((byte)36) /* mark a compact index record + as the predefined minimum + record */ +#define MLOG_COMP_PAGE_CREATE ((byte)37) /* create a compact + index page */ +#define MLOG_COMP_REC_INSERT ((byte)38) /* compact record insert */ +#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39) + /* mark compact clustered index + record deleted */ +#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/* mark compact secondary index + record deleted; this log + record type is redundant, as + MLOG_REC_SEC_DELETE_MARK is + independent of the record + format. 
*/ +#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/* update of a compact record, + preserves record field sizes */ +#define MLOG_COMP_REC_DELETE ((byte)42) /* delete a compact record + from a page */ +#define MLOG_COMP_LIST_END_DELETE ((byte)43) /* delete compact record list + end on index page */ +#define MLOG_COMP_LIST_START_DELETE ((byte)44) /* delete compact record list + start on index page */ +#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45) + /* copy compact record list end + to a new created index page */ +#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /* reorganize an index page */ +#define MLOG_FILE_CREATE2 ((byte)47) /* log record about creating + an .ibd file, with format */ +#define MLOG_ZIP_WRITE_NODE_PTR ((byte)48) /* write the node pointer of + a record on a compressed + non-leaf B-tree page */ +#define MLOG_ZIP_WRITE_BLOB_PTR ((byte)49) /* write the BLOB pointer + of an externally stored column + on a compressed page */ +#define MLOG_ZIP_WRITE_HEADER ((byte)50) /* write to compressed page + header */ +#define MLOG_ZIP_PAGE_COMPRESS ((byte)51) /* compress an index page */ +#define MLOG_BIGGEST_TYPE ((byte)51) /* biggest value (used in + asserts) */ + +/******************************************************************* +Starts a mini-transaction and creates a mini-transaction handle +and buffer in the memory buffer given by the caller. */ +UNIV_INLINE +mtr_t* +mtr_start( +/*======*/ + /* out: mtr buffer which also acts as + the mtr handle */ + mtr_t* mtr); /* in: memory buffer for the mtr buffer */ +/******************************************************************* +Commits a mini-transaction. */ +UNIV_INTERN +void +mtr_commit( +/*=======*/ + mtr_t* mtr); /* in: mini-transaction */ +/************************************************************** +Sets and returns a savepoint in mtr. */ +UNIV_INLINE +ulint +mtr_set_savepoint( +/*==============*/ + /* out: savepoint */ + mtr_t* mtr); /* in: mtr */ +/************************************************************** +Releases the latches stored in an mtr memo down to a savepoint. +NOTE! The mtr must not have made changes to buffer pages after the +savepoint, as these can be handled only by mtr_commit. */ +UNIV_INTERN +void +mtr_rollback_to_savepoint( +/*======================*/ + mtr_t* mtr, /* in: mtr */ + ulint savepoint); /* in: savepoint */ +/************************************************************** +Releases the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +UNIV_INLINE +void +mtr_release_s_latch_at_savepoint( +/*=============================*/ + mtr_t* mtr, /* in: mtr */ + ulint savepoint, /* in: savepoint */ + rw_lock_t* lock); /* in: latch to release */ +/******************************************************************* +Gets the logging mode of a mini-transaction. */ +UNIV_INLINE +ulint +mtr_get_log_mode( +/*=============*/ + /* out: logging mode: MTR_LOG_NONE, ... */ + mtr_t* mtr); /* in: mtr */ +/******************************************************************* +Changes the logging mode of a mini-transaction. */ +UNIV_INLINE +ulint +mtr_set_log_mode( +/*=============*/ + /* out: old mode */ + mtr_t* mtr, /* in: mtr */ + ulint mode); /* in: logging mode: MTR_LOG_NONE, ... */ +/************************************************************ +Reads 1 - 4 bytes from a file page buffered in the buffer pool. 
*/ +UNIV_INTERN +ulint +mtr_read_ulint( +/*===========*/ + /* out: value read */ + const byte* ptr, /* in: pointer from where to read */ + ulint type, /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************ +Reads 8 bytes from a file page buffered in the buffer pool. */ +UNIV_INTERN +dulint +mtr_read_dulint( +/*============*/ + /* out: value read */ + const byte* ptr, /* in: pointer from where to read */ + mtr_t* mtr); /* in: mini-transaction handle */ +/************************************************************************* +This macro locks an rw-lock in s-mode. */ +#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\ + (MTR)) +/************************************************************************* +This macro locks an rw-lock in x-mode. */ +#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\ + (MTR)) +/************************************************************************* +NOTE! Use the macro above! +Locks a lock in s-mode. */ +UNIV_INLINE +void +mtr_s_lock_func( +/*============*/ + rw_lock_t* lock, /* in: rw-lock */ + const char* file, /* in: file name */ + ulint line, /* in: line number */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************* +NOTE! Use the macro above! +Locks a lock in x-mode. */ +UNIV_INLINE +void +mtr_x_lock_func( +/*============*/ + rw_lock_t* lock, /* in: rw-lock */ + const char* file, /* in: file name */ + ulint line, /* in: line number */ + mtr_t* mtr); /* in: mtr */ + +/******************************************************* +Releases an object in the memo stack. */ +UNIV_INTERN +void +mtr_memo_release( +/*=============*/ + mtr_t* mtr, /* in: mtr */ + void* object, /* in: object */ + ulint type); /* in: object type: MTR_MEMO_S_LOCK, ... */ +#ifdef UNIV_DEBUG +/************************************************************** +Checks if memo contains the given item. */ +UNIV_INLINE +ibool +mtr_memo_contains( +/*==============*/ + /* out: TRUE if contains */ + mtr_t* mtr, /* in: mtr */ + const void* object, /* in: object to search */ + ulint type); /* in: type of object */ + +/************************************************************** +Checks if memo contains the given page. */ +UNIV_INTERN +ibool +mtr_memo_contains_page( +/*===================*/ + /* out: TRUE if contains */ + mtr_t* mtr, /* in: mtr */ + const byte* ptr, /* in: pointer to buffer frame */ + ulint type); /* in: type of object */ +/************************************************************* +Prints info of an mtr handle. */ +UNIV_INTERN +void +mtr_print( +/*======*/ + mtr_t* mtr); /* in: mtr */ +#endif /* UNIV_DEBUG */ +/*######################################################################*/ + +#define MTR_BUF_MEMO_SIZE 200 /* number of slots in memo */ + +/******************************************************************* +Returns the log object of a mini-transaction buffer. */ +UNIV_INLINE +dyn_array_t* +mtr_get_log( +/*========*/ + /* out: log */ + mtr_t* mtr); /* in: mini-transaction */ +/******************************************************* +Pushes an object to an mtr memo stack. */ +UNIV_INLINE +void +mtr_memo_push( +/*==========*/ + mtr_t* mtr, /* in: mtr */ + void* object, /* in: object */ + ulint type); /* in: object type: MTR_MEMO_S_LOCK, ... */ + + +/* Type definition of a mini-transaction memo stack slot. 
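
The mtr_s_lock()/mtr_x_lock() macros above exist so that __FILE__ and __LINE__ expand at the call site: the latch then records where it was taken, which helps latch-order and deadlock diagnostics. The pattern in isolation, with my_lock()/my_lock_func() as hypothetical stand-ins for the real pair:

#include <stdio.h>

static void
my_lock_func(const char* file, unsigned line)
{
	/* the real mtr_s_lock_func() would take the latch and push it
	on the mtr memo; here we only show where the call came from */
	printf("latch acquired at %s:%u\n", file, line);
}

/* expands at the call site, so __FILE__/__LINE__ name the caller */
#define my_lock()	my_lock_func(__FILE__, __LINE__)

int
main(void)
{
	my_lock();	/* prints this file name and this line number */
	return(0);
}
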
*/ +typedef struct mtr_memo_slot_struct mtr_memo_slot_t; +struct mtr_memo_slot_struct{ + ulint type; /* type of the stored object (MTR_MEMO_S_LOCK, ...) */ + void* object; /* pointer to the object */ +}; + +/* Mini-transaction handle and buffer */ +struct mtr_struct{ +#ifdef UNIV_DEBUG + ulint state; /* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */ +#endif + dyn_array_t memo; /* memo stack for locks etc. */ + dyn_array_t log; /* mini-transaction log */ + ibool modifications; + /* TRUE if the mtr made modifications to + buffer pool pages */ + ulint n_log_recs; + /* count of how many page initial log records + have been written to the mtr log */ + ulint log_mode; /* specifies which operations should be + logged; default value MTR_LOG_ALL */ + ib_uint64_t start_lsn;/* start lsn of the possible log entry for + this mtr */ + ib_uint64_t end_lsn;/* end lsn of the possible log entry for + this mtr */ +#ifdef UNIV_DEBUG + ulint magic_n; +#endif /* UNIV_DEBUG */ +}; + +#ifdef UNIV_DEBUG +# define MTR_MAGIC_N 54551 +#endif /* UNIV_DEBUG */ + +#define MTR_ACTIVE 12231 +#define MTR_COMMITTING 56456 +#define MTR_COMMITTED 34676 + +#ifndef UNIV_NONINL +#include "mtr0mtr.ic" +#endif + +#endif diff --git a/storage/xtradb/include/mtr0mtr.ic b/storage/xtradb/include/mtr0mtr.ic new file mode 100644 index 00000000000..7d6d99917b7 --- /dev/null +++ b/storage/xtradb/include/mtr0mtr.ic @@ -0,0 +1,266 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0sync.h" +#include "sync0rw.h" +#include "mach0data.h" + +/******************************************************************* +Starts a mini-transaction and creates a mini-transaction handle +and a buffer in the memory buffer given by the caller. */ +UNIV_INLINE +mtr_t* +mtr_start( +/*======*/ + /* out: mtr buffer which also acts as + the mtr handle */ + mtr_t* mtr) /* in: memory buffer for the mtr buffer */ +{ + dyn_array_create(&(mtr->memo)); + dyn_array_create(&(mtr->log)); + + mtr->log_mode = MTR_LOG_ALL; + mtr->modifications = FALSE; + mtr->n_log_recs = 0; + + ut_d(mtr->state = MTR_ACTIVE); + ut_d(mtr->magic_n = MTR_MAGIC_N); + + return(mtr); +} + +/******************************************************* +Pushes an object to an mtr memo stack. */ +UNIV_INLINE +void +mtr_memo_push( +/*==========*/ + mtr_t* mtr, /* in: mtr */ + void* object, /* in: object */ + ulint type) /* in: object type: MTR_MEMO_S_LOCK, ... 
*/ +{ + dyn_array_t* memo; + mtr_memo_slot_t* slot; + + ut_ad(object); + ut_ad(type >= MTR_MEMO_PAGE_S_FIX); + ut_ad(type <= MTR_MEMO_X_LOCK); + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + + memo = &(mtr->memo); + + slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot); + + slot->object = object; + slot->type = type; +} + +/************************************************************** +Sets and returns a savepoint in mtr. */ +UNIV_INLINE +ulint +mtr_set_savepoint( +/*==============*/ + /* out: savepoint */ + mtr_t* mtr) /* in: mtr */ +{ + dyn_array_t* memo; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + + memo = &(mtr->memo); + + return(dyn_array_get_data_size(memo)); +} + +/************************************************************** +Releases the (index tree) s-latch stored in an mtr memo after a +savepoint. */ +UNIV_INLINE +void +mtr_release_s_latch_at_savepoint( +/*=============================*/ + mtr_t* mtr, /* in: mtr */ + ulint savepoint, /* in: savepoint */ + rw_lock_t* lock) /* in: latch to release */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + memo = &(mtr->memo); + + ut_ad(dyn_array_get_data_size(memo) > savepoint); + + slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, savepoint); + + ut_ad(slot->object == lock); + ut_ad(slot->type == MTR_MEMO_S_LOCK); + + rw_lock_s_unlock(lock); + + slot->object = NULL; +} + +#ifdef UNIV_DEBUG +/************************************************************** +Checks if memo contains the given item. */ +UNIV_INLINE +ibool +mtr_memo_contains( +/*==============*/ + /* out: TRUE if contains */ + mtr_t* mtr, /* in: mtr */ + const void* object, /* in: object to search */ + ulint type) /* in: type of object */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + + while (offset > 0) { + offset -= sizeof(mtr_memo_slot_t); + + slot = dyn_array_get_element(memo, offset); + + if ((object == slot->object) && (type == slot->type)) { + + return(TRUE); + } + } + + return(FALSE); +} +#endif /* UNIV_DEBUG */ + +/******************************************************************* +Returns the log object of a mini-transaction buffer. */ +UNIV_INLINE +dyn_array_t* +mtr_get_log( +/*========*/ + /* out: log */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + + return(&(mtr->log)); +} + +/******************************************************************* +Gets the logging mode of a mini-transaction. */ +UNIV_INLINE +ulint +mtr_get_log_mode( +/*=============*/ + /* out: logging mode: MTR_LOG_NONE, ... */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mtr); + ut_ad(mtr->log_mode >= MTR_LOG_ALL); + ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS); + + return(mtr->log_mode); +} + +/******************************************************************* +Changes the logging mode of a mini-transaction. */ +UNIV_INLINE +ulint +mtr_set_log_mode( +/*=============*/ + /* out: old mode */ + mtr_t* mtr, /* in: mtr */ + ulint mode) /* in: logging mode: MTR_LOG_NONE, ... 
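
mtr_memo_contains() above walks the memo from the newest slot backwards in fixed-size steps, and mtr_set_savepoint() shows that a savepoint is nothing more than the memo's byte size at that moment. A standalone sketch of the same backward scan over a plain byte array; memo_contains() is illustrative, not the real dyn_array_t code:

#include <stddef.h>

struct slot {
	unsigned	type;	/* MTR_MEMO_S_LOCK, ... */
	void*		object;	/* latched object, or NULL if released */
};

static int
memo_contains(const char* memo, size_t size_bytes,
	      const void* object, unsigned type)
{
	size_t	offset = size_bytes;	/* start just above the newest slot */

	while (offset > 0) {
		const struct slot*	s;

		offset -= sizeof(struct slot);	/* step down one slot */
		s = (const struct slot*) (memo + offset);

		if (s->object == object && s->type == type) {
			return(1);
		}
	}

	return(0);
}

The LIFO order matters: releasing latches at commit or rollback-to-savepoint must happen in the reverse order of acquisition.
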
*/ +{ + ulint old_mode; + + ut_ad(mtr); + ut_ad(mode >= MTR_LOG_ALL); + ut_ad(mode <= MTR_LOG_SHORT_INSERTS); + + old_mode = mtr->log_mode; + + if ((mode == MTR_LOG_SHORT_INSERTS) && (old_mode == MTR_LOG_NONE)) { + /* Do nothing */ + } else { + mtr->log_mode = mode; + } + + ut_ad(old_mode >= MTR_LOG_ALL); + ut_ad(old_mode <= MTR_LOG_SHORT_INSERTS); + + return(old_mode); +} + +/************************************************************************* +Locks a lock in s-mode. */ +UNIV_INLINE +void +mtr_s_lock_func( +/*============*/ + rw_lock_t* lock, /* in: rw-lock */ + const char* file, /* in: file name */ + ulint line, /* in: line number */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mtr); + ut_ad(lock); + + rw_lock_s_lock_func(lock, 0, file, line); + + mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK); +} + +/************************************************************************* +Locks a lock in x-mode. */ +UNIV_INLINE +void +mtr_x_lock_func( +/*============*/ + rw_lock_t* lock, /* in: rw-lock */ + const char* file, /* in: file name */ + ulint line, /* in: line number */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mtr); + ut_ad(lock); + + rw_lock_x_lock_func(lock, 0, file, line); + + mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK); +} diff --git a/storage/xtradb/include/mtr0types.h b/storage/xtradb/include/mtr0types.h new file mode 100644 index 00000000000..23634c98827 --- /dev/null +++ b/storage/xtradb/include/mtr0types.h @@ -0,0 +1,30 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mini-transaction buffer global types + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#ifndef mtr0types_h +#define mtr0types_h + +typedef struct mtr_struct mtr_t; + +#endif diff --git a/storage/xtradb/include/mysql_addons.h b/storage/xtradb/include/mysql_addons.h new file mode 100644 index 00000000000..2e8c87f5962 --- /dev/null +++ b/storage/xtradb/include/mysql_addons.h @@ -0,0 +1,32 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
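
The mtr_set_log_mode() rule above (MTR_LOG_SHORT_INSERTS never overrides MTR_LOG_NONE) keeps the usual save/restore idiom safe. A sketch of that idiom, assuming mtr0mtr.h is included; do_unlogged_work() is a hypothetical caller:

static void
do_unlogged_work(mtr_t* mtr)
{
	ulint	old_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

	/* ... modify pages whose changes must not be redo-logged ... */

	mtr_set_log_mode(mtr, old_mode);	/* restore the caller's mode */
}
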
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+This file contains functions that need to be added to
+MySQL code but have not been added yet.
+
+Whenever you add a function here submit a MySQL bug
+report (feature request) with the implementation. Then
+write the bug number in the comment before the
+function in this file.
+
+When MySQL commits the function it can be deleted from
+here. In a perfect world this file exists but is empty.
+
+Created November 07, 2007 Vasil Dimov
+*******************************************************/
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
new file mode 100644
index 00000000000..26897226ff4
--- /dev/null
+++ b/storage/xtradb/include/os0file.h
@@ -0,0 +1,758 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "univ.i"
+
+#ifndef __WIN__
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif
+
+typedef struct fil_node_struct fil_node_t;
+
+#ifdef UNIV_DO_FLUSH
+extern ibool os_do_not_call_flush_at_each_write;
+#endif /* UNIV_DO_FLUSH */
+extern ibool os_has_said_disk_full;
+extern ibool os_aio_print_debug;
+
+extern ulint os_file_n_pending_preads;
+extern ulint os_file_n_pending_pwrites;
+
+extern ulint os_n_pending_reads;
+extern ulint os_n_pending_writes;
+
+#ifdef __WIN__
+
+/* We always define WIN_ASYNC_IO, and check at run-time whether
+the OS actually supports it: Win 95 does not, NT does. */
+#define WIN_ASYNC_IO
+
+#define UNIV_NON_BUFFERED_IO
+
+#endif
+
+#ifdef __WIN__
+#define os_file_t	HANDLE
+#define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd)
+#else
+typedef int	os_file_t;
+#define OS_FILE_FROM_FD(fd) fd
+#endif
+
+extern ulint	os_innodb_umask;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads */
+
+extern ibool	os_aio_use_native_aio;
+
+#define OS_FILE_SECTOR_SIZE	512
+
+/* The next value should be smaller than or equal to the smallest sector size
+used on any disk. A log block is required to be a portion of disk which is
+written so that if the start and the end of a block get written to disk, then
+the whole block gets written.
This should be true even in most cases of a crash: +if this fails for a log block, then it is equivalent to a media failure in the +log. */ + +#define OS_FILE_LOG_BLOCK_SIZE 512 + +/* Options for file_create */ +#define OS_FILE_OPEN 51 +#define OS_FILE_CREATE 52 +#define OS_FILE_OVERWRITE 53 +#define OS_FILE_OPEN_RAW 54 +#define OS_FILE_CREATE_PATH 55 +#define OS_FILE_OPEN_RETRY 56 /* for os_file_create() on + the first ibdata file */ + +#define OS_FILE_READ_ONLY 333 +#define OS_FILE_READ_WRITE 444 +#define OS_FILE_READ_ALLOW_DELETE 555 /* for ibbackup */ + +/* Options for file_create */ +#define OS_FILE_AIO 61 +#define OS_FILE_NORMAL 62 + +/* Types for file create */ +#define OS_DATA_FILE 100 +#define OS_LOG_FILE 101 + +/* Error codes from os_file_get_last_error */ +#define OS_FILE_NOT_FOUND 71 +#define OS_FILE_DISK_FULL 72 +#define OS_FILE_ALREADY_EXISTS 73 +#define OS_FILE_PATH_ERROR 74 +#define OS_FILE_AIO_RESOURCES_RESERVED 75 /* wait for OS aio resources + to become available again */ +#define OS_FILE_SHARING_VIOLATION 76 +#define OS_FILE_ERROR_NOT_SPECIFIED 77 + +/* Types for aio operations */ +#define OS_FILE_READ 10 +#define OS_FILE_WRITE 11 + +#define OS_FILE_LOG 256 /* This can be ORed to type */ + +#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /* Win NT does not allow more + than 64 */ + +/* Modes for aio operations */ +#define OS_AIO_NORMAL 21 /* Normal asynchronous i/o not for ibuf + pages or ibuf bitmap pages */ +#define OS_AIO_IBUF 22 /* Asynchronous i/o for ibuf pages or ibuf + bitmap pages */ +#define OS_AIO_LOG 23 /* Asynchronous i/o for the log */ +#define OS_AIO_SYNC 24 /* Asynchronous i/o where the calling thread + will itself wait for the i/o to complete, + doing also the job of the i/o-handler thread; + can be used for any pages, ibuf or non-ibuf. + This is used to save CPU time, as we can do + with fewer thread switches. Plain synchronous + i/o is not as good, because it must serialize + the file seek and read or write, causing a + bottleneck for parallelism. 
*/
+
+#define OS_AIO_SIMULATED_WAKE_LATER	512 /* This can be ORed to mode
+				in the call of os_aio(...),
+				if the caller wants to post several i/o
+				requests in a batch, and only after that
+				wake the i/o-handler thread; this has
+				effect only in simulated aio */
+#define OS_WIN31	1
+#define OS_WIN95	2
+#define OS_WINNT	3
+#define OS_WIN2000	4
+
+extern ulint	os_n_file_reads;
+extern ulint	os_n_file_writes;
+extern ulint	os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_enum{
+	OS_FILE_TYPE_UNKNOWN = 0,
+	OS_FILE_TYPE_FILE,	/* regular file */
+	OS_FILE_TYPE_DIR,	/* directory */
+	OS_FILE_TYPE_LINK	/* symbolic link */
+};
+typedef enum os_file_type_enum	  os_file_type_t;
+
+/* Maximum path string length in bytes when referring to tables within the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes */
+#define OS_FILE_MAX_PATH	4000
+
+/* Struct used in fetching information of a file in a directory */
+struct os_file_stat_struct{
+	char		name[OS_FILE_MAX_PATH];	/* path to a file */
+	os_file_type_t	type;			/* file type */
+	ib_int64_t	size;			/* file size */
+	time_t		ctime;			/* creation time */
+	time_t		mtime;			/* modification time */
+	time_t		atime;			/* access time */
+};
+typedef struct os_file_stat_struct	os_file_stat_t;
+
+#ifdef __WIN__
+typedef HANDLE	os_file_dir_t;	/* directory stream */
+#else
+typedef DIR*	os_file_dir_t;	/* directory stream */
+#endif
+
+/***************************************************************************
+Gets the operating system version. Currently works only on Windows. */
+UNIV_INTERN
+ulint
+os_get_os_version(void);
+/*===================*/
+		/* out: OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
+/********************************************************************
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void);
+/*===================*/
+/***************************************************************************
+Creates a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+On Netware, this function is like tmpfile(3), because the C run-time
+library of Netware does not expose the delete-on-close flag. */
+
+FILE*
+os_file_create_tmpfile(void);
+/*========================*/
+			/* out: temporary file handle, or NULL on error */
+/***************************************************************************
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing. */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+					/* out: directory stream, NULL if
+					error */
+	const char*	dirname,	/* in: directory name; it must not
+					contain a trailing '\' or '/' */
+	ibool		error_is_fatal);/* in: TRUE if we should treat an
+					error as a fatal error; if we try to
+					open symlinks then we do not wish a
+					fatal error if it happens not to be
+					a directory */
+/***************************************************************************
+Closes a directory stream.
*/
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+				/* out: 0 if success, -1 if failure */
+	os_file_dir_t	dir);	/* in: directory stream */
+/***************************************************************************
+This function returns information of the next file in the directory. We jump
+over the '.' and '..' entries in the directory. */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+				/* out: 0 if ok, -1 if error, 1 if at the end
+				of the directory */
+	const char*	dirname,/* in: directory name or path */
+	os_file_dir_t	dir,	/* in: directory stream */
+	os_file_stat_t*	info);	/* in/out: buffer where the info is returned */
+/*********************************************************************
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true. */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+					/* out: TRUE if call succeeds,
+					FALSE on error */
+	const char*	pathname,	/* in: directory name as
+					null-terminated string */
+	ibool		fail_if_exists);/* in: if TRUE, pre-existing directory
+					is treated as an error. */
+/********************************************************************
+A simple function to open or create a file. */
+UNIV_INTERN
+os_file_t
+os_file_create_simple(
+/*==================*/
+				/* out, own: handle to the file, not defined
+				if error, error number can be retrieved with
+				os_file_get_last_error */
+	const char*	name,	/* in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/* in: OS_FILE_OPEN if an existing file is
+				opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error), or
+				OS_FILE_CREATE_PATH if new file
+				(if exists, error) and subdirectories along
+				its path are created (if needed)*/
+	ulint		access_type,/* in: OS_FILE_READ_ONLY or
+				OS_FILE_READ_WRITE */
+	ibool*		success);/* out: TRUE if succeed, FALSE if error */
+/********************************************************************
+A simple function to open or create a file. */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling(
+/*====================================*/
+				/* out, own: handle to the file, not defined
+				if error, error number can be retrieved with
+				os_file_get_last_error */
+	const char*	name,	/* in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/* in: OS_FILE_OPEN if an existing file
+				is opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error) */
+	ulint		access_type,/* in: OS_FILE_READ_ONLY,
+				OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file */
+	ibool*		success);/* out: TRUE if succeed, FALSE if error */
+/********************************************************************
+Tries to disable OS caching on an opened file descriptor. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+	int		fd,	/* in: file descriptor to alter */
+	const char*	file_name,	/* in: file name, used in the
+					diagnostic message */
+	const char*	operation_name);/* in: "open" or "create"; used in the
+					diagnostic message */
+/********************************************************************
+Opens an existing file or creates a new one.
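
A hedged usage sketch of the directory-scan API above (os_file_opendir / os_file_readdir_next_file / os_file_closedir), assuming os0file.h is included; "/tmp" is an arbitrary example path and scan_dir_example() is hypothetical:

#include "os0file.h"

static void
scan_dir_example(void)
{
	os_file_stat_t	info;
	os_file_dir_t	dir = os_file_opendir("/tmp", FALSE);

	if (dir == NULL) {
		return;
	}

	/* 0 == ok, 1 == end of directory, -1 == error */
	while (os_file_readdir_next_file("/tmp", dir, &info) == 0) {
		/* info.name, info.type and info.size are now filled in;
		'.' and '..' have already been skipped for us */
	}

	os_file_closedir(dir);
}
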
*/ +UNIV_INTERN +os_file_t +os_file_create( +/*===========*/ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), + OS_FILE_OVERWRITE if a new file is created + or an old overwritten; + OS_FILE_OPEN_RAW, if a raw device or disk + partition should be opened */ + ulint purpose,/* in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success);/* out: TRUE if succeed, FALSE if error */ +/*************************************************************************** +Deletes a file. The file has to be closed before calling this. */ +UNIV_INTERN +ibool +os_file_delete( +/*===========*/ + /* out: TRUE if success */ + const char* name); /* in: file path as a null-terminated string */ + +/*************************************************************************** +Deletes a file if it exists. The file has to be closed before calling this. */ +UNIV_INTERN +ibool +os_file_delete_if_exists( +/*=====================*/ + /* out: TRUE if success */ + const char* name); /* in: file path as a null-terminated string */ +/*************************************************************************** +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. */ +UNIV_INTERN +ibool +os_file_rename( +/*===========*/ + /* out: TRUE if success */ + const char* oldpath, /* in: old file path as a + null-terminated string */ + const char* newpath); /* in: new file path */ +/*************************************************************************** +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. */ +UNIV_INTERN +ibool +os_file_close( +/*==========*/ + /* out: TRUE if success */ + os_file_t file); /* in, own: handle to a file */ +/*************************************************************************** +Closes a file handle. */ +UNIV_INTERN +ibool +os_file_close_no_error_handling( +/*============================*/ + /* out: TRUE if success */ + os_file_t file); /* in, own: handle to a file */ +/*************************************************************************** +Gets a file size. */ +UNIV_INTERN +ibool +os_file_get_size( +/*=============*/ + /* out: TRUE if success */ + os_file_t file, /* in: handle to a file */ + ulint* size, /* out: least significant 32 bits of file + size */ + ulint* size_high);/* out: most significant 32 bits of size */ +/*************************************************************************** +Gets file size as a 64-bit integer ib_int64_t. */ +UNIV_INTERN +ib_int64_t +os_file_get_size_as_iblonglong( +/*===========================*/ + /* out: size in bytes, -1 if error */ + os_file_t file); /* in: handle to a file */ +/*************************************************************************** +Write the specified number of zeros to a newly created file. 
*/ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + /* out: TRUE if success */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + os_file_t file, /* in: handle to a file */ + ulint size, /* in: least significant 32 bits of file + size */ + ulint size_high);/* in: most significant 32 bits of size */ +/*************************************************************************** +Truncates a file at its current position. */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + /* out: TRUE if success */ + FILE* file); /* in: file to be truncated */ +/*************************************************************************** +Flushes the write buffers of a given file to the disk. */ +UNIV_INTERN +ibool +os_file_flush( +/*==========*/ + /* out: TRUE if success */ + os_file_t file); /* in, own: handle to a file */ +/*************************************************************************** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + /* out: error number, or OS error + number + 100 */ + ibool report_all_errors); /* in: TRUE if we want an error message + printed of all errors */ +/*********************************************************************** +Requests a synchronous read operation. */ +UNIV_INTERN +ibool +os_file_read( +/*=========*/ + /* out: TRUE if request was + successful, FALSE if fail */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read */ + ulint offset, /* in: least significant 32 bits of file + offset where to read */ + ulint offset_high,/* in: most significant 32 bits of + offset */ + ulint n); /* in: number of bytes to read */ +/*********************************************************************** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. */ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /* in: file to read from */ + char* str, /* in: buffer where to read */ + ulint size); /* in: size of buffer */ +/*********************************************************************** +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. */ +UNIV_INTERN +ibool +os_file_read_no_error_handling( +/*===========================*/ + /* out: TRUE if request was + successful, FALSE if fail */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read */ + ulint offset, /* in: least significant 32 bits of file + offset where to read */ + ulint offset_high,/* in: most significant 32 bits of + offset */ + ulint n); /* in: number of bytes to read */ + +/*********************************************************************** +Requests a synchronous write operation. 
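
All of the synchronous read entry points above (and os_file_write below) take a 64-bit file offset as two 32-bit halves (offset, offset_high). A minimal caller-side sketch of the split, assuming os0file.h is included; read_at_offset() is a hypothetical wrapper, not part of this header:

static ibool
read_at_offset(os_file_t file, void* buf, ib_int64_t offset, ulint n)
{
	/* valid for non-negative offsets, which is all the server uses */
	ulint	low  = (ulint) (offset & 0xFFFFFFFFULL);
	ulint	high = (ulint) (offset >> 32);

	return(os_file_read(file, buf, low, high, n));
}
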
*/
+UNIV_INTERN
+ibool
+os_file_write(
+/*==========*/
+				/* out: TRUE if request was
+				successful, FALSE if fail */
+	const char*	name,	/* in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/* in: handle to a file */
+	const void*	buf,	/* in: buffer from which to write */
+	ulint		offset,	/* in: least significant 32 bits of file
+				offset where to write */
+	ulint		offset_high,/* in: most significant 32 bits of
+				offset */
+	ulint		n);	/* in: number of bytes to write */
+/***********************************************************************
+Check the existence and type of the given file. */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+				/* out: TRUE if call succeeded */
+	const char*	path,	/* in: pathname of the file */
+	ibool*		exists,	/* out: TRUE if file exists */
+	os_file_type_t* type);	/* out: type of the file (if it exists) */
+/********************************************************************
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+	path		dirname		basename
+	"/usr/lib"	"/usr"		"lib"
+	"/usr/"		"/"		"usr"
+	"usr"		"."		"usr"
+	"/"		"/"		"/"
+	"."		"."		"."
+	".."		"."		".."
+*/
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+				/* out, own: directory component of the
+				pathname */
+	const char*	path);	/* in: pathname */
+/********************************************************************
+Creates all missing subdirectories along the given path. */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+				/* out: TRUE if call succeeded
+				FALSE otherwise */
+	const char*	path);	/* in: path name */
+/****************************************************************************
+Initializes the asynchronous io system. Creates separate aio array for
+non-ibuf read and write, a third aio array for the ibuf i/o, with just one
+segment, two aio arrays for log reads and writes with one segment, and a
+synchronous aio array of the specified size. The combined number of segments
+in the three first aio arrays is the parameter n_segments given to the
+function. The caller must create an i/o handler thread for each segment in
+the four first arrays, but not for the sync aio array. */
+UNIV_INTERN
+void
+os_aio_init(
+/*========*/
+	ulint	n,		/* in: maximum number of pending aio operations
+				allowed; n must be divisible by n_segments */
+//	ulint	n_segments,	/* in: combined number of segments in the four
+//				first aio arrays; must be >= 4 */
+	ulint	n_read_threads,	/* n_segments == 2 + n_read_threads + n_write_threads */
+	ulint	n_write_threads, /**/
+	ulint	n_slots_sync);	/* in: number of slots in the sync aio array */
+/***********************************************************************
+Requests an asynchronous i/o operation.
*/
+UNIV_INTERN
+ibool
+os_aio(
+/*===*/
+				/* out: TRUE if request was queued
+				successfully, FALSE if fail */
+	ulint		type,	/* in: OS_FILE_READ or OS_FILE_WRITE */
+	ulint		mode,	/* in: OS_AIO_NORMAL, ..., possibly ORed
+				to OS_AIO_SIMULATED_WAKE_LATER: the
+				last flag advises this function not to wake
+				i/o-handler threads, but the caller will
+				do the waking explicitly later, in this
+				way the caller can post several requests in
+				a batch; NOTE that the batch must not be
+				so big that it exhausts the slots in aio
+				arrays! NOTE that a simulated batch
+				may introduce hidden chances of deadlocks,
+				because i/os are not actually handled until
+				all have been posted: use with great
+				caution! */
+	const char*	name,	/* in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/* in: handle to a file */
+	void*		buf,	/* in: buffer where to read or from which
+				to write */
+	ulint		offset,	/* in: least significant 32 bits of file
+				offset where to read or write */
+	ulint		offset_high, /* in: most significant 32 bits of
+				offset */
+	ulint		n,	/* in: number of bytes to read or write */
+	fil_node_t*	message1,/* in: messages for the aio handler (these
+				can be used to identify a completed aio
+				operation); if mode is OS_AIO_SYNC, these
+				are ignored */
+	void*		message2);
+/****************************************************************************
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void);
+/*=====================================*/
+/****************************************************************************
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void);
+/*=====================================*/
+/**************************************************************************
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void);
+/*=======================================*/
+/**************************************************************************
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void);
+/*============================================*/
+
+#ifdef WIN_ASYNC_IO
+/**************************************************************************
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
*/ +UNIV_INTERN +ibool +os_aio_windows_handle( +/*==================*/ + /* out: TRUE if the aio operation succeeded */ + ulint segment, /* in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads; if + this is ULINT_UNDEFINED, then it means that + sync aio is used, and this parameter is + ignored */ + ulint pos, /* this parameter is used only in sync aio: + wait for the aio slot at this position */ + fil_node_t**message1, /* out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type); /* out: OS_FILE_WRITE or ..._READ */ +#endif + +/************************************************************************** +Does simulated aio. This function should be called by an i/o-handler +thread. */ +UNIV_INTERN +ibool +os_aio_simulated_handle( +/*====================*/ + /* out: TRUE if the aio operation succeeded */ + ulint segment, /* in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads */ + fil_node_t**message1, /* out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type); /* out: OS_FILE_WRITE or ..._READ */ +/************************************************************************** +Validates the consistency of the aio system. */ +UNIV_INTERN +ibool +os_aio_validate(void); +/*=================*/ + /* out: TRUE if ok */ +/************************************************************************** +Prints info of the aio arrays. */ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file); /* in: file where to print */ +/************************************************************************** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +os_aio_refresh_stats(void); +/*======================*/ + +#ifdef UNIV_DEBUG +/************************************************************************** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. */ +UNIV_INTERN +ibool +os_aio_all_slots_free(void); +/*=======================*/ +#endif /* UNIV_DEBUG */ + +/*********************************************************************** +This function returns information about the specified file */ +UNIV_INTERN +ibool +os_file_get_status( +/*===============*/ + /* out: TRUE if stat + information found */ + const char* path, /* in: pathname of the file */ + os_file_stat_t* stat_info); /* information of a file in a + directory */ + +#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__) +/************************************************************************* +Creates a temporary file that will be deleted on close. +This function is defined in ha_innodb.cc. 
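
Both handler functions above use the same segment numbering: segment 0 is the ibuf i/o thread, segment 1 the log i/o thread, then the non-ibuf read threads, then the write threads (compare os_aio_init's "n_segments == 2 + n_read_threads + n_write_threads"). A hypothetical helper, not part of the real API, that makes the mapping explicit:

static const char*
aio_segment_role(ulint segment, ulint n_read_segments)
{
	if (segment == 0) {
		return("ibuf");		/* insert buffer i/o */
	} else if (segment == 1) {
		return("log");		/* redo log i/o */
	} else if (segment < 2 + n_read_segments) {
		return("read");		/* non-ibuf read threads */
	} else {
		return("write");	/* non-ibuf write threads */
	}
}
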
*/
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(void);
+/*========================*/
+		/* out: temporary file descriptor, or < 0 on error */
+#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
+
+#endif
diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h
new file mode 100644
index 00000000000..19b0b112638
--- /dev/null
+++ b/storage/xtradb/include/os0proc.h
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0proc_h
+#define os0proc_h
+
+#include "univ.i"
+
+#ifdef UNIV_LINUX
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+
+typedef void*			os_process_t;
+typedef unsigned long int	os_process_id_t;
+
+extern ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+extern ulint os_large_page_size;
+
+/********************************************************************
+Converts the current process id to a number. It is not guaranteed that the
+number is unique. In Linux returns the 'process number' of the current
+thread. That number is the same as one sees in 'top', for example. In Linux
+the thread id is not the same as one sees in 'top'. */
+UNIV_INTERN
+ulint
+os_proc_get_number(void);
+/*====================*/
+/********************************************************************
+Allocates large pages memory. */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+				/* out: allocated memory */
+	ulint*	n);		/* in/out: number of bytes */
+/********************************************************************
+Frees large pages memory. */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+	void	*ptr,		/* in: pointer returned by
+				os_mem_alloc_large() */
+	ulint	size);		/* in: size returned by
+				os_mem_alloc_large() */
+/********************************************************************
+Sets the priority boost for threads released from waiting within the current
+process. */
+UNIV_INTERN
+void
+os_process_set_priority_boost(
+/*==========================*/
+	ibool	do_boost);	/* in: TRUE if priority boost should be done,
+				FALSE if not */
+
+#ifndef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0proc.ic b/storage/xtradb/include/os0proc.ic
new file mode 100644
index 00000000000..9f1fb01866d
--- /dev/null
+++ b/storage/xtradb/include/os0proc.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The interface to the operating system +process control primitives + +Created 9/30/1995 Heikki Tuuri +*******************************************************/ + + diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h new file mode 100644 index 00000000000..7e058266762 --- /dev/null +++ b/storage/xtradb/include/os0sync.h @@ -0,0 +1,309 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The interface to the operating system +synchronization primitives. 
+ +Created 9/6/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0sync_h +#define os0sync_h + +#include "univ.i" +#include "ut0lst.h" + +#ifdef __WIN__ + +#define os_fast_mutex_t CRITICAL_SECTION + +typedef HANDLE os_native_event_t; + +typedef struct os_event_struct os_event_struct_t; +typedef os_event_struct_t* os_event_t; + +struct os_event_struct { + os_native_event_t handle; + /* Windows event */ + UT_LIST_NODE_T(os_event_struct_t) os_event_list; + /* list of all created events */ +}; +#else +typedef pthread_mutex_t os_fast_mutex_t; + +typedef struct os_event_struct os_event_struct_t; +typedef os_event_struct_t* os_event_t; + +struct os_event_struct { + os_fast_mutex_t os_mutex; /* this mutex protects the next + fields */ + ibool is_set; /* this is TRUE when the event is + in the signaled state, i.e., a thread + does not stop if it tries to wait for + this event */ + ib_int64_t signal_count; /* this is incremented each time + the event becomes signaled */ + pthread_cond_t cond_var; /* condition variable is used in + waiting for the event */ + UT_LIST_NODE_T(os_event_struct_t) os_event_list; + /* list of all created events */ +}; +#endif + +typedef struct os_mutex_struct os_mutex_str_t; +typedef os_mutex_str_t* os_mutex_t; + +#define OS_SYNC_INFINITE_TIME ((ulint)(-1)) + +#define OS_SYNC_TIME_EXCEEDED 1 + +/* Mutex protecting counts and the event and OS 'slow' mutex lists */ +extern os_mutex_t os_sync_mutex; + +/* This is incremented by 1 in os_thread_create and decremented by 1 in +os_thread_exit */ +extern ulint os_thread_count; + +extern ulint os_event_count; +extern ulint os_mutex_count; +extern ulint os_fast_mutex_count; + +/************************************************************* +Initializes global event and OS 'slow' mutex lists. */ +UNIV_INTERN +void +os_sync_init(void); +/*==============*/ +/************************************************************* +Frees created events and OS 'slow' mutexes. */ +UNIV_INTERN +void +os_sync_free(void); +/*==============*/ +/************************************************************* +Creates an event semaphore, i.e., a semaphore which may just have two states: +signaled and nonsignaled. The created event is manual reset: it must be reset +explicitly by calling sync_os_reset_event. */ +UNIV_INTERN +os_event_t +os_event_create( +/*============*/ + /* out: the event handle */ + const char* name); /* in: the name of the event, if NULL + the event is created without a name */ +#ifdef __WIN__ +/************************************************************* +Creates an auto-reset event semaphore, i.e., an event which is automatically +reset when a single thread is released. Works only in Windows. */ +UNIV_INTERN +os_event_t +os_event_create_auto( +/*=================*/ + /* out: the event handle */ + const char* name); /* in: the name of the event, if NULL + the event is created without a name */ +#endif +/************************************************************** +Sets an event semaphore to the signaled state: lets waiting threads +proceed. */ +UNIV_INTERN +void +os_event_set( +/*=========*/ + os_event_t event); /* in: event to set */ +/************************************************************** +Resets an event semaphore to the nonsignaled state. Waiting threads will +stop to wait for the event. 
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low(). */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+	os_event_t	event);	/* in: event to reset */
+/**************************************************************
+Frees an event object. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+	os_event_t	event);	/* in: event to free */
+
+/**************************************************************
+Waits for an event object until it is in the signaled state. If
+srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the
+waiting thread when the event becomes signaled (or immediately if the
+event is already in the signaled state).
+
+Typically, if the event has been signaled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set()   [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait()  [infinite wait!]
+thread C calls os_event_wait()  [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+	os_event_t	event,		/* in: event to wait */
+	ib_int64_t	reset_sig_count);/* in: zero or the value
+					returned by previous call of
+					os_event_reset(). */
+
+#define os_event_wait(event) os_event_wait_low(event, 0)
+
+/**************************************************************
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded. In Unix the timeout is always infinite. */
+UNIV_INTERN
+ulint
+os_event_wait_time(
+/*===============*/
+					/* out: 0 if success,
+					OS_SYNC_TIME_EXCEEDED if timeout
+					was exceeded */
+	os_event_t	event,		/* in: event to wait */
+	ulint		time);		/* in: timeout in microseconds, or
+					OS_SYNC_INFINITE_TIME */
+#ifdef __WIN__
+/**************************************************************
+Waits for any event in an OS native event array. Returns when at least
+one of the events is signaled or becomes signaled. */
+UNIV_INTERN
+ulint
+os_event_wait_multiple(
+/*===================*/
+					/* out: index of the event
+					which was signaled */
+	ulint			n,	/* in: number of events in the
+					array */
+	os_native_event_t*	native_event_array);
+					/* in: pointer to an array of event
+					handles */
+#endif
+/*************************************************************
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (mutex_t) should be used where possible. */
+UNIV_INTERN
+os_mutex_t
+os_mutex_create(
+/*============*/
+				/* out: the mutex handle */
+	const char*	name);	/* in: the name of the mutex, if NULL
+				the mutex is created without a name */
+/**************************************************************
+Acquires ownership of a mutex semaphore. */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+	os_mutex_t	mutex);	/* in: mutex to acquire */
+/**************************************************************
+Releases ownership of a mutex.
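+
+A minimal usage sketch of the enter/exit pair (illustrative only: the
+mutex name and the counter it protects are invented for this example):
+
+	os_mutex_t	mutex = os_mutex_create("example_mutex");
+	ulint		n_things = 0;
+
+	os_mutex_enter(mutex);		[blocks until ownership is acquired]
+	n_things++;			[critical section]
+	os_mutex_exit(mutex);		[lets the next waiter in]
+
+	os_mutex_free(mutex);		[only once no thread uses it anymore]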
*/
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+	os_mutex_t	mutex);	/* in: mutex to release */
+/**************************************************************
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+	os_mutex_t	mutex);	/* in: mutex to free */
+/**************************************************************
+Acquires ownership of a fast mutex. Currently in Windows this is the same
+as os_fast_mutex_lock! */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+					/* out: 0 if success, != 0 if
+					was reserved by another
+					thread */
+	os_fast_mutex_t*	fast_mutex);	/* in: mutex to acquire */
+/**************************************************************
+Releases ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock(
+/*=================*/
+	os_fast_mutex_t*	fast_mutex);	/* in: mutex to release */
+/*************************************************************
+Initializes an operating system fast mutex semaphore. */
+UNIV_INTERN
+void
+os_fast_mutex_init(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex);	/* in: fast mutex */
+/**************************************************************
+Acquires ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_lock(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex);	/* in: mutex to acquire */
+/**************************************************************
+Frees a fast mutex object. */
+UNIV_INTERN
+void
+os_fast_mutex_free(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex);	/* in: mutex to free */
+
+#ifdef HAVE_GCC_ATOMIC_BUILTINS
+/**************************************************************
+Atomic compare-and-swap for InnoDB. Currently requires GCC atomic builtins.
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+#define os_compare_and_swap(ptr, old_val, new_val) \
+	__sync_bool_compare_and_swap(ptr, old_val, new_val)
+
+/**************************************************************
+Atomic increment for InnoDB. Currently requires GCC atomic builtins.
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+#define os_atomic_increment(ptr, amount) \
+	__sync_add_and_fetch(ptr, amount)
+
+#endif /* HAVE_GCC_ATOMIC_BUILTINS */
+
+#ifndef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0sync.ic b/storage/xtradb/include/os0sync.ic
new file mode 100644
index 00000000000..5c03d184c7c
--- /dev/null
+++ b/storage/xtradb/include/os0sync.ic
@@ -0,0 +1,62 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The interface to the operating system synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifdef __WIN__
+#include <winbase.h>
+#endif
+
+/**************************************************************
+Acquires ownership of a fast mutex. Currently in Windows this is the same
+as os_fast_mutex_lock! */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+					/* out: 0 if success, != 0 if
+					was reserved by another
+					thread */
+	os_fast_mutex_t*	fast_mutex)	/* in: mutex to acquire */
+{
+#ifdef __WIN__
+	EnterCriticalSection(fast_mutex);
+
+	return(0);
+#else
+#if defined(UNIV_HOTBACKUP) && defined(UNIV_HPUX10)
+	/* Since the hot backup version is standalone, MySQL does not redefine
+	pthread_mutex_trylock for HP-UX-10.20, and consequently we must invert
+	the return value here */
+
+	return((ulint) (1 - pthread_mutex_trylock(fast_mutex)));
+#else
+	/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
+	so that it returns 0 on success. In the operating system
+	libraries, HP-UX-10.20 follows the old Posix 1003.4a Draft 4 and
+	returns 1 on success (but MySQL remaps that to 0), while Linux,
+	FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */
+
+	return((ulint) pthread_mutex_trylock(fast_mutex));
+#endif
+#endif
+}
diff --git a/storage/xtradb/include/os0thread.h b/storage/xtradb/include/os0thread.h
new file mode 100644
index 00000000000..863596bfa84
--- /dev/null
+++ b/storage/xtradb/include/os0thread.h
@@ -0,0 +1,158 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The interface to the operating system +process and thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ + +#ifndef os0thread_h +#define os0thread_h + +#include "univ.i" + +/* Maximum number of threads which can be created in the program; +this is also the size of the wait slot array for MySQL threads which +can wait inside InnoDB */ + +#define OS_THREAD_MAX_N srv_max_n_threads + + +/* Possible fixed priorities for threads */ +#define OS_THREAD_PRIORITY_NONE 100 +#define OS_THREAD_PRIORITY_BACKGROUND 1 +#define OS_THREAD_PRIORITY_NORMAL 2 +#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3 + +#ifdef __WIN__ +typedef void* os_thread_t; +typedef ulint os_thread_id_t; /* In Windows the thread id + is an unsigned long int */ +#else +typedef pthread_t os_thread_t; +typedef os_thread_t os_thread_id_t; /* In Unix we use the thread + handle itself as the id of + the thread */ +#endif + +/* Define a function pointer type to use in a typecast */ +typedef void* (*os_posix_f_t) (void*); + +/******************************************************************* +Compares two thread ids for equality. */ +UNIV_INTERN +ibool +os_thread_eq( +/*=========*/ + /* out: TRUE if equal */ + os_thread_id_t a, /* in: OS thread or thread id */ + os_thread_id_t b); /* in: OS thread or thread id */ +/******************************************************************** +Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is +unique for the thread though! */ +UNIV_INTERN +ulint +os_thread_pf( +/*=========*/ + /* out: unsigned long int */ + os_thread_id_t a); /* in: thread or thread id */ +/******************************************************************** +Creates a new thread of execution. The execution starts from +the function given. The start function takes a void* parameter +and returns a ulint. +NOTE: We count the number of threads in os_thread_exit(). A created +thread should always use that to exit and not use return() to exit. */ +UNIV_INTERN +os_thread_t +os_thread_create( +/*=============*/ + /* out: handle to the thread */ +#ifndef __WIN__ + os_posix_f_t start_f, +#else + ulint (*start_f)(void*), /* in: pointer to function + from which to start */ +#endif + void* arg, /* in: argument to start + function */ + os_thread_id_t* thread_id); /* out: id of the created + thread, or NULL */ + +/********************************************************************* +Exits the current thread. */ +UNIV_INTERN +void +os_thread_exit( +/*===========*/ + void* exit_value); /* in: exit value; in Windows this void* + is cast as a DWORD */ +/********************************************************************* +Returns the thread identifier of current thread. */ +UNIV_INTERN +os_thread_id_t +os_thread_get_curr_id(void); +/*========================*/ +/********************************************************************* +Returns handle to the current thread. */ +UNIV_INTERN +os_thread_t +os_thread_get_curr(void); +/*====================*/ +/********************************************************************* +Advises the os to give up remainder of the thread's time slice. 
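+
+An illustrative spin-wait sketch (invented for this example: 'work_ready'
+stands for any flag that another thread will set). A poller can yield
+between checks so that it does not burn its whole time slice, and fall
+back to os_thread_sleep() below once the wait gets long:
+
+	ulint	i;
+
+	for (i = 0; !work_ready; i++) {
+		if (i < 100) {
+			os_thread_yield();	[give the CPU away briefly]
+		} else {
+			os_thread_sleep(1000);	[wait at least 1000 usec]
+		}
+	}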
*/ +UNIV_INTERN +void +os_thread_yield(void); +/*=================*/ +/********************************************************************* +The thread sleeps at least the time given in microseconds. */ +UNIV_INTERN +void +os_thread_sleep( +/*============*/ + ulint tm); /* in: time in microseconds */ +/********************************************************************** +Gets a thread priority. */ +UNIV_INTERN +ulint +os_thread_get_priority( +/*===================*/ + /* out: priority */ + os_thread_t handle);/* in: OS handle to the thread */ +/********************************************************************** +Sets a thread priority. */ +UNIV_INTERN +void +os_thread_set_priority( +/*===================*/ + os_thread_t handle, /* in: OS handle to the thread */ + ulint pri); /* in: priority: one of OS_PRIORITY_... */ +/********************************************************************** +Gets the last operating system error code for the calling thread. */ +UNIV_INTERN +ulint +os_thread_get_last_error(void); +/*==========================*/ + +#ifndef UNIV_NONINL +#include "os0thread.ic" +#endif + +#endif diff --git a/storage/xtradb/include/os0thread.ic b/storage/xtradb/include/os0thread.ic new file mode 100644 index 00000000000..a86b203809c --- /dev/null +++ b/storage/xtradb/include/os0thread.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The interface to the operating system +process and thread control primitives + +Created 9/8/1995 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/page0cur.h b/storage/xtradb/include/page0cur.h new file mode 100644 index 00000000000..960ecdddf4e --- /dev/null +++ b/storage/xtradb/include/page0cur.h @@ -0,0 +1,346 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef page0cur_h +#define page0cur_h + +#include "univ.i" + +#include "buf0types.h" +#include "page0page.h" +#include "rem0rec.h" +#include "data0data.h" +#include "mtr0mtr.h" + + +#define PAGE_CUR_ADAPT + +/* Page cursor search modes; the values must be in this order! */ + +#define PAGE_CUR_UNSUPP 0 +#define PAGE_CUR_G 1 +#define PAGE_CUR_GE 2 +#define PAGE_CUR_L 3 +#define PAGE_CUR_LE 4 +/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in + "column LIKE 'abc%' ORDER BY column DESC"; + we have to find strings which are <= 'abc' or + which extend it */ +#ifdef UNIV_SEARCH_DEBUG +# define PAGE_CUR_DBG 6 /* As PAGE_CUR_LE, but skips search shortcut */ +#endif /* UNIV_SEARCH_DEBUG */ + +#ifdef UNIV_DEBUG +/************************************************************* +Gets pointer to the page frame where the cursor is positioned. */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + /* out: page */ + page_cur_t* cur); /* in: page cursor */ +/************************************************************* +Gets pointer to the buffer block where the cursor is positioned. */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + /* out: page */ + page_cur_t* cur); /* in: page cursor */ +/************************************************************* +Gets pointer to the page frame where the cursor is positioned. */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + /* out: page */ + page_cur_t* cur); /* in: page cursor */ +/************************************************************* +Gets the record where the cursor is positioned. */ +UNIV_INLINE +rec_t* +page_cur_get_rec( +/*=============*/ + /* out: record */ + page_cur_t* cur); /* in: page cursor */ +#else /* UNIV_DEBUG */ +# define page_cur_get_page(cur) page_align((cur)->rec) +# define page_cur_get_block(cur) (cur)->block +# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block) +# define page_cur_get_rec(cur) (cur)->rec +#endif /* UNIV_DEBUG */ +/************************************************************* +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /* in: index page */ + page_cur_t* cur); /* in: cursor */ +/************************************************************* +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /* in: index page */ + page_cur_t* cur); /* in: cursor */ +/************************************************************* +Returns TRUE if the cursor is before first user record on page. */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + /* out: TRUE if at start */ + const page_cur_t* cur); /* in: cursor */ +/************************************************************* +Returns TRUE if the cursor is after last user record. 
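+
+Together with page_cur_set_before_first() and page_cur_move_to_next()
+(declared below) this predicate yields the usual full-page scan idiom,
+sketched here under the assumption that 'block' is an index page the
+caller has properly latched:
+
+	page_cur_t	cur;
+
+	page_cur_set_before_first(block, &cur);
+	page_cur_move_to_next(&cur);		[step over the infimum]
+
+	while (!page_cur_is_after_last(&cur)) {
+		rec_t*	rec = page_cur_get_rec(&cur);
+
+		[... process the user record rec ...]
+
+		page_cur_move_to_next(&cur);
+	}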
*/ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + /* out: TRUE if at end */ + const page_cur_t* cur); /* in: cursor */ +/************************************************************** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /* in: record on a page */ + const buf_block_t* block, /* in: buffer block containing + the record */ + page_cur_t* cur); /* out: page cursor */ +/************************************************************** +Invalidates a page cursor by setting the record pointer NULL. */ +UNIV_INLINE +void +page_cur_invalidate( +/*================*/ + page_cur_t* cur); /* out: page cursor */ +/************************************************************** +Moves the cursor to the next record on page. */ +UNIV_INLINE +void +page_cur_move_to_next( +/*==================*/ + page_cur_t* cur); /* in/out: cursor; must not be after last */ +/************************************************************** +Moves the cursor to the previous record on page. */ +UNIV_INLINE +void +page_cur_move_to_prev( +/*==================*/ + page_cur_t* cur); /* in/out: cursor; not before first */ +/*************************************************************** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. */ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + page_cur_t* cursor, /* in/out: a page cursor */ + const dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ + ulint n_ext, /* in: number of externally stored columns */ + mtr_t* mtr); /* in: mini-transaction handle, or NULL */ +/*************************************************************** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. */ +UNIV_INLINE +rec_t* +page_cur_rec_insert( +/*================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + page_cur_t* cursor, /* in/out: a page cursor */ + const rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr); /* in: mini-transaction handle, or NULL */ +/*************************************************************** +Inserts a record next to page cursor on an uncompressed page. +Returns pointer to inserted record if succeed, i.e., enough +space available, NULL otherwise. The cursor stays at the same position. 
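+
+This is the uncompressed-page leg of the insert: page_cur_tuple_insert()
+and page_cur_rec_insert() above dispatch here when the block has no
+compressed page, and to page_cur_insert_rec_zip() below when it does.
+The caller-side pattern is the same in both cases (a sketch; 'cursor' is
+assumed to have been positioned, e.g. by page_cur_search()):
+
+	rec = page_cur_tuple_insert(&cursor, tuple, index, n_ext, mtr);
+
+	if (rec == NULL) {
+		[out of space: reorganize or split the page, then retry]
+	}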
*/ +UNIV_INTERN +rec_t* +page_cur_insert_rec_low( +/*====================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + rec_t* current_rec,/* in: pointer to current record after + which the new record is inserted */ + dict_index_t* index, /* in: record descriptor */ + const rec_t* rec, /* in: pointer to a physical record */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr); /* in: mini-transaction handle, or NULL */ +/*************************************************************** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + rec_t** current_rec,/* in/out: pointer to current record after + which the new record is inserted */ + buf_block_t* block, /* in: buffer block of *current_rec */ + dict_index_t* index, /* in: record descriptor */ + const rec_t* rec, /* in: pointer to a physical record */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr); /* in: mini-transaction handle, or NULL */ +/***************************************************************** +Copies records from page to a newly created page, from a given record onward, +including that record. Infimum and supremum records are not copied. */ +UNIV_INTERN +void +page_copy_rec_list_end_to_created_page( +/*===================================*/ + page_t* new_page, /* in/out: index page to copy to */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ +/*************************************************************** +Deletes a record at the page cursor. The cursor is moved to the +next record after the deleted one. */ +UNIV_INTERN +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /* in/out: a page cursor */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ + mtr_t* mtr); /* in: mini-transaction handle */ +/******************************************************************** +Searches the right position for a page cursor. */ +UNIV_INLINE +ulint +page_cur_search( +/*============*/ + /* out: number of matched + fields on the left */ + const buf_block_t* block, /* in: buffer block */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* tuple, /* in: data tuple */ + ulint mode, /* in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor);/* out: page cursor */ +/******************************************************************** +Searches the right position for a page cursor. 
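+
+The mode decides where the cursor stops relative to the search tuple. As
+a worked example, if a page holds user records with keys 3, 5 and 7, and
+the search tuple is 5, the cursor is positioned on:
+
+	PAGE_CUR_G	the first record > 5, that is, 7
+	PAGE_CUR_GE	the first record >= 5, that is, 5
+	PAGE_CUR_L	the last record < 5, that is, 3
+	PAGE_CUR_LE	the last record <= 5, that is, 5
+
+The matched-fields and matched-bytes parameters let a repeated search
+resume the record comparison from where the previous comparison ended,
+instead of comparing from the first field onward every time.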
*/ +UNIV_INTERN +void +page_cur_search_with_match( +/*=======================*/ + const buf_block_t* block, /* in: buffer block */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* tuple, /* in: data tuple */ + ulint mode, /* in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /* in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /* in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor);/* out: page cursor */ +/*************************************************************** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. */ +UNIV_INTERN +void +page_cur_open_on_rnd_user_rec( +/*==========================*/ + buf_block_t* block, /* in: page */ + page_cur_t* cursor);/* out: page cursor */ +/*************************************************************** +Parses a log record of a record insert on a page. */ +UNIV_INTERN +byte* +page_cur_parse_insert_rec( +/*======================*/ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr or NULL */ +/************************************************************** +Parses a log record of copying a record list end to a new created page. */ +UNIV_INTERN +byte* +page_parse_copy_rec_list_to_created_page( +/*=====================================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr or NULL */ +/*************************************************************** +Parses log record of a record delete on a page. */ +UNIV_INTERN +byte* +page_cur_parse_delete_rec( +/*======================*/ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr or NULL */ + +/* Index page cursor */ + +struct page_cur_struct{ + byte* rec; /* pointer to a record on page */ + buf_block_t* block; /* pointer to the block containing rec */ +}; + +#ifndef UNIV_NONINL +#include "page0cur.ic" +#endif + +#endif diff --git a/storage/xtradb/include/page0cur.ic b/storage/xtradb/include/page0cur.ic new file mode 100644 index 00000000000..9cf10ea5e3f --- /dev/null +++ b/storage/xtradb/include/page0cur.ic @@ -0,0 +1,300 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0page.h" +#include "buf0types.h" + +#ifdef UNIV_DEBUG +/************************************************************* +Gets pointer to the page frame where the cursor is positioned. */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + /* out: page */ + page_cur_t* cur) /* in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + + return(page_align(cur->rec)); +} + +/************************************************************* +Gets pointer to the buffer block where the cursor is positioned. */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + /* out: page */ + page_cur_t* cur) /* in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(cur->block); +} + +/************************************************************* +Gets pointer to the page frame where the cursor is positioned. */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + /* out: page */ + page_cur_t* cur) /* in: page cursor */ +{ + return(buf_block_get_page_zip(page_cur_get_block(cur))); +} + +/************************************************************* +Gets the record where the cursor is positioned. */ +UNIV_INLINE +rec_t* +page_cur_get_rec( +/*=============*/ + /* out: record */ + page_cur_t* cur) /* in: page cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + + return(cur->rec); +} +#endif /* UNIV_DEBUG */ + +/************************************************************* +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /* in: index page */ + page_cur_t* cur) /* in: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block)); +} + +/************************************************************* +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /* in: index page */ + page_cur_t* cur) /* in: cursor */ +{ + cur->block = (buf_block_t*) block; + cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block)); +} + +/************************************************************* +Returns TRUE if the cursor is before first user record on page. */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + /* out: TRUE if at start */ + const page_cur_t* cur) /* in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_infimum(cur->rec)); +} + +/************************************************************* +Returns TRUE if the cursor is after last user record. 
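+
+(As in the accessors above, debug builds assert that
+page_align(cur->rec) == cur->block->frame, i.e. that the cursor's record
+pointer and its buffer block have not gone out of sync.)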
*/ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + /* out: TRUE if at end */ + const page_cur_t* cur) /* in: cursor */ +{ + ut_ad(cur); + ut_ad(page_align(cur->rec) == cur->block->frame); + return(page_rec_is_supremum(cur->rec)); +} + +/************************************************************** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /* in: record on a page */ + const buf_block_t* block, /* in: buffer block containing + the record */ + page_cur_t* cur) /* out: page cursor */ +{ + ut_ad(rec && block && cur); + ut_ad(page_align(rec) == block->frame); + + cur->rec = (rec_t*) rec; + cur->block = (buf_block_t*) block; +} + +/************************************************************** +Invalidates a page cursor by setting the record pointer NULL. */ +UNIV_INLINE +void +page_cur_invalidate( +/*================*/ + page_cur_t* cur) /* out: page cursor */ +{ + ut_ad(cur); + + cur->rec = NULL; + cur->block = NULL; +} + +/************************************************************** +Moves the cursor to the next record on page. */ +UNIV_INLINE +void +page_cur_move_to_next( +/*==================*/ + page_cur_t* cur) /* in/out: cursor; must not be after last */ +{ + ut_ad(!page_cur_is_after_last(cur)); + + cur->rec = page_rec_get_next(cur->rec); +} + +/************************************************************** +Moves the cursor to the previous record on page. */ +UNIV_INLINE +void +page_cur_move_to_prev( +/*==================*/ + page_cur_t* cur) /* in/out: page cursor, not before first */ +{ + ut_ad(!page_cur_is_before_first(cur)); + + cur->rec = page_rec_get_prev(cur->rec); +} + +/******************************************************************** +Searches the right position for a page cursor. */ +UNIV_INLINE +ulint +page_cur_search( +/*============*/ + /* out: number of matched + fields on the left */ + const buf_block_t* block, /* in: buffer block */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* tuple, /* in: data tuple */ + ulint mode, /* in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor) /* out: page cursor */ +{ + ulint low_matched_fields = 0; + ulint low_matched_bytes = 0; + ulint up_matched_fields = 0; + ulint up_matched_bytes = 0; + + ut_ad(dtuple_check_typed(tuple)); + + page_cur_search_with_match(block, index, tuple, mode, + &up_matched_fields, + &up_matched_bytes, + &low_matched_fields, + &low_matched_bytes, + cursor); + return(low_matched_fields); +} + +/*************************************************************** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. 
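+
+The implementation below first materializes the tuple as a physical
+record in a temporary memory heap: the heap is sized to hold the
+converted record (rec_get_converted_size() bytes) plus the offsets array
+filled in by rec_get_offsets(), which needs one ulint slot per tuple
+field plus REC_OFFS_HEADER_SIZE header slots and a few spare slots. Both
+the temporary record and the offsets array are released again by the
+mem_heap_free() call at the end.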
*/ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + page_cur_t* cursor, /* in/out: a page cursor */ + const dtuple_t* tuple, /* in: pointer to a data tuple */ + dict_index_t* index, /* in: record descriptor */ + ulint n_ext, /* in: number of externally stored columns */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL */ +{ + mem_heap_t* heap; + ulint* offsets; + ulint size + = rec_get_converted_size(index, tuple, n_ext); + rec_t* rec; + + heap = mem_heap_create(size + + (4 + REC_OFFS_HEADER_SIZE + + dtuple_get_n_fields(tuple)) + * sizeof *offsets); + rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(heap, size), + index, tuple, n_ext); + offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); + + if (buf_block_get_page_zip(cursor->block)) { + rec = page_cur_insert_rec_zip(&cursor->rec, cursor->block, + index, rec, offsets, mtr); + } else { + rec = page_cur_insert_rec_low(cursor->rec, + index, rec, offsets, mtr); + } + + mem_heap_free(heap); + return(rec); +} + +/*************************************************************** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. */ +UNIV_INLINE +rec_t* +page_cur_rec_insert( +/*================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + page_cur_t* cursor, /* in/out: a page cursor */ + const rec_t* rec, /* in: record to insert */ + dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL */ +{ + if (buf_block_get_page_zip(cursor->block)) { + return(page_cur_insert_rec_zip(&cursor->rec, cursor->block, + index, rec, offsets, mtr)); + } else { + return(page_cur_insert_rec_low(cursor->rec, + index, rec, offsets, mtr)); + } +} + diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h new file mode 100644 index 00000000000..e3de6901ee1 --- /dev/null +++ b/storage/xtradb/include/page0page.h @@ -0,0 +1,1019 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0page_h +#define page0page_h + +#include "univ.i" + +#include "page0types.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "data0data.h" +#include "dict0dict.h" +#include "rem0rec.h" +#include "fsp0fsp.h" +#include "mtr0mtr.h" + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE +#endif + +/* PAGE HEADER + =========== + +Index page header starts at the first offset left free by the FIL-module */ + +typedef byte page_header_t; + +#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this + offset */ +/*-----------------------------*/ +#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ +#define PAGE_HEAP_TOP 2 /* pointer to record heap top */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ +#define PAGE_FREE 6 /* pointer to start of page free record list */ +#define PAGE_GARBAGE 8 /* number of bytes in deleted records */ +#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or + NULL if this info has been reset by a delete, + for example */ +#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ... */ +#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same + direction */ +#define PAGE_N_RECS 16 /* number of user records on the page */ +#define PAGE_MAX_TRX_ID 18 /* highest id of a trx which may have modified + a record on the page; a dulint; defined only + in secondary indexes; specifically, not in an + ibuf tree; NOTE: this may be modified only + when the thread has an x-latch to the page, + and ALSO an x-latch to btr_search_latch + if there is a hash index to the page! 
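+
+As a worked example of these offsets: PAGE_HEADER equals FSEG_PAGE_DATA,
+so the index page header starts right after the 38-byte FIL page header.
+PAGE_N_RECS, for instance, is thus stored at byte 38 + 16 = 54 of the
+page frame, and PAGE_MAX_TRX_ID occupies the 8 bytes starting at byte
+38 + 18 = 56, which is consistent with PAGE_HEADER_PRIV_END below being
+26 = 18 + 8.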
*/ +#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page + header which are set in a page create */ +/*----*/ +#define PAGE_LEVEL 26 /* level of the node in an index tree; the + leaf level is the level 0 */ +#define PAGE_INDEX_ID 28 /* index id where the page belongs */ +#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in + a B-tree: defined only on the root page of a + B-tree, but not in the root of an ibuf tree */ +#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF +#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF + /* in the place of PAGE_BTR_SEG_LEAF and _TOP + there is a free list base node if the page is + the root page of an ibuf tree, and at the same + place is the free list node if the page is in + a free list */ +#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE) + /* file segment header for the non-leaf pages + in a B-tree: defined only on the root page of + a B-tree, but not in the root of an ibuf + tree */ +/*----*/ +#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE) + /* start of data on the page */ + +#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES) + /* offset of the page infimum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8) + /* offset of the page supremum record on an + old-style page */ +#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9) + /* offset of the page supremum record end on + an old-style page */ +#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES) + /* offset of the page infimum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8) + /* offset of the page supremum record on a + new-style compact page */ +#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8) + /* offset of the page supremum record end on + a new-style compact page */ +/*-----------------------------*/ + +/* Heap numbers */ +#define PAGE_HEAP_NO_INFIMUM 0 /* page infimum */ +#define PAGE_HEAP_NO_SUPREMUM 1 /* page supremum */ +#define PAGE_HEAP_NO_USER_LOW 2 /* first user record in + creation (insertion) order, + not necessarily collation order; + this record may have been deleted */ + +/* Directions of cursor movement */ +#define PAGE_LEFT 1 +#define PAGE_RIGHT 2 +#define PAGE_SAME_REC 3 +#define PAGE_SAME_PAGE 4 +#define PAGE_NO_DIRECTION 5 + +/* PAGE DIRECTORY + ============== +*/ + +typedef byte page_dir_slot_t; +typedef page_dir_slot_t page_dir_t; + +/* Offset of the directory start down from the page end. We call the +slot with the highest file address directory start, as it points to +the first record in the list of records. */ +#define PAGE_DIR FIL_PAGE_DATA_END + +/* We define a slot in the page directory as two bytes */ +#define PAGE_DIR_SLOT_SIZE 2 + +/* The offset of the physically lower end of the directory, counted from +page end, when the page is empty */ +#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE) + +/* The maximum and minimum number of records owned by a directory slot. The +number may drop below the minimum in the first and the last slot in the +directory. */ +#define PAGE_DIR_SLOT_MAX_N_OWNED 8 +#define PAGE_DIR_SLOT_MIN_N_OWNED 4 + +/**************************************************************** +Gets the start of a page. 
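+
+Page frames are aligned to UNIV_PAGE_SIZE, so this merely rounds the
+pointer down to the frame boundary, while page_offset() below keeps the
+low-order bits. For example, with the default 16384-byte (2^14) pages, a
+pointer whose low 14 bits are 0xC35 is rounded down by 0xC35 bytes to
+the start of its frame, and page_offset() on it returns 0xC35.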
*/ +UNIV_INLINE +page_t* +page_align( +/*=======*/ + /* out: start of the page */ + const void* ptr) /* in: pointer to page frame */ + __attribute__((const)); +/**************************************************************** +Gets the offset within a page. */ +UNIV_INLINE +ulint +page_offset( +/*========*/ + /* out: offset from the start of the page */ + const void* ptr) /* in: pointer to page frame */ + __attribute__((const)); +/***************************************************************** +Returns the max trx id field value. */ +UNIV_INLINE +dulint +page_get_max_trx_id( +/*================*/ + const page_t* page); /* in: page */ +/***************************************************************** +Sets the max trx id field value. */ +UNIV_INTERN +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dulint trx_id);/* in: transaction id */ +/***************************************************************** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dulint trx_id);/* in: transaction id */ +/***************************************************************** +Reads the given header field. */ +UNIV_INLINE +ulint +page_header_get_field( +/*==================*/ + const page_t* page, /* in: page */ + ulint field); /* in: PAGE_N_DIR_SLOTS, ... */ +/***************************************************************** +Sets the given header field. */ +UNIV_INLINE +void +page_header_set_field( +/*==================*/ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /* in: PAGE_N_DIR_SLOTS, ... */ + ulint val); /* in: value */ +/***************************************************************** +Returns the offset stored in the given header field. */ +UNIV_INLINE +ulint +page_header_get_offs( +/*=================*/ + /* out: offset from the start of the page, + or 0 */ + const page_t* page, /* in: page */ + ulint field) /* in: PAGE_FREE, ... */ + __attribute__((nonnull, pure)); + +/***************************************************************** +Returns the pointer stored in the given header field, or NULL. */ +#define page_header_get_ptr(page, field) \ + (page_header_get_offs(page, field) \ + ? page + page_header_get_offs(page, field) : NULL) +/***************************************************************** +Sets the pointer stored in the given header field. */ +UNIV_INLINE +void +page_header_set_ptr( +/*================*/ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /* in/out: PAGE_FREE, ... */ + const byte* ptr); /* in: pointer or NULL*/ +/***************************************************************** +Resets the last insert info field in the page header. Writes to mlog +about this operation. 
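+
+Concretely, "reset" means storing 0 in PAGE_LAST_INSERT, the "no last
+insert" state described for that field above; the write is logged in the
+mini-transaction so that crash recovery redoes it.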
*/ +UNIV_INLINE +void +page_header_reset_last_insert( +/*==========================*/ + page_t* page, /* in: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + mtr_t* mtr); /* in: mtr */ +/**************************************************************** +Gets the offset of the first record on the page. */ +UNIV_INLINE +ulint +page_get_infimum_offset( +/*====================*/ + /* out: offset of the first record + in record list, relative from page */ + const page_t* page); /* in: page which must have record(s) */ +/**************************************************************** +Gets the offset of the last record on the page. */ +UNIV_INLINE +ulint +page_get_supremum_offset( +/*=====================*/ + /* out: offset of the last record in + record list, relative from page */ + const page_t* page); /* in: page which must have record(s) */ +#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page)) +#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page)) +/**************************************************************** +Returns the middle record of record list. If there are an even number +of records in the list, returns the first record of upper half-list. */ +UNIV_INTERN +rec_t* +page_get_middle_rec( +/*================*/ + /* out: middle record */ + page_t* page); /* in: page */ +/***************************************************************** +Compares a data tuple to a physical record. Differs from the function +cmp_dtuple_rec_with_match in the way that the record must reside on an +index page, and also page infimum and supremum records can be given in +the parameter rec. These are considered as the negative infinity and +the positive infinity in the alphabetical order. */ +UNIV_INLINE +int +page_cmp_dtuple_rec_with_match( +/*===========================*/ + /* out: 1, 0, -1, if dtuple is greater, equal, + less than rec, respectively, when only the + common first fields are compared */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record on a page; may also + be page infimum or supremum, in which case + matched-parameter values below are not + affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint* matched_fields, /* in/out: number of already completely + matched fields; when function returns + contains the value for current comparison */ + ulint* matched_bytes); /* in/out: number of already matched + bytes within the first field not completely + matched; when function returns contains the + value for current comparison */ +/***************************************************************** +Gets the page number. */ +UNIV_INLINE +ulint +page_get_page_no( +/*=============*/ + /* out: page number */ + const page_t* page); /* in: page */ +/***************************************************************** +Gets the tablespace identifier. */ +UNIV_INLINE +ulint +page_get_space_id( +/*==============*/ + /* out: space id */ + const page_t* page); /* in: page */ +/***************************************************************** +Gets the number of user records on page (the infimum and supremum records +are not user records). */ +UNIV_INLINE +ulint +page_get_n_recs( +/*============*/ + /* out: number of user records */ + const page_t* page); /* in: index page */ +/******************************************************************* +Returns the number of records before the given record in chain. 
+The number includes infimum and supremum records. */ +UNIV_INTERN +ulint +page_rec_get_n_recs_before( +/*=======================*/ + /* out: number of records */ + const rec_t* rec); /* in: the physical record */ +/***************************************************************** +Gets the number of records in the heap. */ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + const page_t* page); /* in: index page */ +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL. + Note that the size of the dense page directory + in the compressed page trailer is + n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */ + ulint n_heap);/* in: number of records */ +/***************************************************************** +Gets the number of dir slots in directory. */ +UNIV_INLINE +ulint +page_dir_get_n_slots( +/*=================*/ + /* out: number of slots */ + const page_t* page); /* in: index page */ +/***************************************************************** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint n_slots);/* in: number of slots */ +#ifdef UNIV_DEBUG +/***************************************************************** +Gets pointer to nth directory slot. */ +UNIV_INLINE +page_dir_slot_t* +page_dir_get_nth_slot( +/*==================*/ + /* out: pointer to dir slot */ + const page_t* page, /* in: index page */ + ulint n); /* in: position */ +#else /* UNIV_DEBUG */ +# define page_dir_get_nth_slot(page, n) \ + ((page) + UNIV_PAGE_SIZE - PAGE_DIR \ + - (n + 1) * PAGE_DIR_SLOT_SIZE) +#endif /* UNIV_DEBUG */ +/****************************************************************** +Used to check the consistency of a record on a page. */ +UNIV_INLINE +ibool +page_rec_check( +/*===========*/ + /* out: TRUE if succeed */ + const rec_t* rec); /* in: record */ +/******************************************************************* +Gets the record pointed to by a directory slot. */ +UNIV_INLINE +const rec_t* +page_dir_slot_get_rec( +/*==================*/ + /* out: pointer to record */ + const page_dir_slot_t* slot); /* in: directory slot */ +/******************************************************************* +This is used to set the record offset in a directory slot. */ +UNIV_INLINE +void +page_dir_slot_set_rec( +/*==================*/ + page_dir_slot_t* slot, /* in: directory slot */ + rec_t* rec); /* in: record on the page */ +/******************************************************************* +Gets the number of records owned by a directory slot. */ +UNIV_INLINE +ulint +page_dir_slot_get_n_owned( +/*======================*/ + /* out: number of records */ + const page_dir_slot_t* slot); /* in: page directory slot */ +/******************************************************************* +This is used to set the owned records field of a directory slot. 
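+
+The owned count is what keeps the directory balanced: except in the
+first and last slot, it must stay between PAGE_DIR_SLOT_MIN_N_OWNED and
+PAGE_DIR_SLOT_MAX_N_OWNED (4 and 8, as defined above). An insert that
+would push a slot past the maximum causes the slot to be split, and a
+delete that drains a slot below the minimum merges it with a neighbour.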
*/ +UNIV_INLINE +void +page_dir_slot_set_n_owned( +/*======================*/ + page_dir_slot_t*slot, /* in/out: directory slot */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n); /* in: number of records owned by the slot */ +/**************************************************************** +Calculates the space reserved for directory slots of a given +number of records. The exact value is a fraction number +n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is +rounded upwards to an integer. */ +UNIV_INLINE +ulint +page_dir_calc_reserved_space( +/*=========================*/ + ulint n_recs); /* in: number of records */ +/******************************************************************* +Looks for the directory slot which owns the given record. */ +UNIV_INTERN +ulint +page_dir_find_owner_slot( +/*=====================*/ + /* out: the directory slot number */ + const rec_t* rec); /* in: the physical record */ +/**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ulint +page_is_comp( +/*=========*/ + /* out: nonzero if the page is in compact + format, zero if it is in old-style format */ + const page_t* page); /* in: index page */ +/**************************************************************** +TRUE if the record is on a page in compact format. */ +UNIV_INLINE +ulint +page_rec_is_comp( +/*=============*/ + /* out: nonzero if in compact format */ + const rec_t* rec); /* in: record */ +/******************************************************************* +Returns the heap number of a record. */ +UNIV_INLINE +ulint +page_rec_get_heap_no( +/*=================*/ + /* out: heap number */ + const rec_t* rec); /* in: the physical record */ +/**************************************************************** +Determine whether the page is a B-tree leaf. */ +UNIV_INLINE +ibool +page_is_leaf( +/*=========*/ + /* out: TRUE if the page is a B-tree leaf */ + const page_t* page) /* in: page */ + __attribute__((nonnull, pure)); +/**************************************************************** +Gets the pointer to the next record on the page. */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + /* out: pointer to next record */ + const rec_t* rec, /* in: pointer to record */ + ulint comp); /* in: nonzero=compact page layout */ +/**************************************************************** +Gets the pointer to the next record on the page. */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + /* out: pointer to next record */ + rec_t* rec); /* in: pointer to record */ +/**************************************************************** +Gets the pointer to the next record on the page. */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + /* out: pointer to next record */ + const rec_t* rec); /* in: pointer to record */ +/**************************************************************** +Sets the pointer to the next record on the page. */ +UNIV_INLINE +void +page_rec_set_next( +/*==============*/ + rec_t* rec, /* in: pointer to record, + must not be page supremum */ + rec_t* next); /* in: pointer to next record, + must not be page infimum */ +/**************************************************************** +Gets the pointer to the previous record. 
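+
+Note that the records on a page are singly linked in ascending order, so
+unlike page_rec_get_next() this cannot just follow a pointer: it locates
+the directory slot owning the record and scans forward from there, which
+makes it clearly more expensive than moving to the next record.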
*/ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + /* out: pointer to previous record */ + const rec_t* rec); /* in: pointer to record, must not be page + infimum */ +/**************************************************************** +Gets the pointer to the previous record. */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + /* out: pointer to previous record */ + rec_t* rec); /* in: pointer to record, + must not be page infimum */ +/**************************************************************** +TRUE if the record is a user record on the page. */ +UNIV_INLINE +ibool +page_rec_is_user_rec_low( +/*=====================*/ + /* out: TRUE if a user record */ + ulint offset) /* in: record offset on page */ + __attribute__((const)); +/**************************************************************** +TRUE if the record is the supremum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_supremum_low( +/*=====================*/ + /* out: TRUE if the supremum record */ + ulint offset) /* in: record offset on page */ + __attribute__((const)); +/**************************************************************** +TRUE if the record is the infimum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_infimum_low( +/*====================*/ + /* out: TRUE if the infimum record */ + ulint offset) /* in: record offset on page */ + __attribute__((const)); + +/**************************************************************** +TRUE if the record is a user record on the page. */ +UNIV_INLINE +ibool +page_rec_is_user_rec( +/*=================*/ + /* out: TRUE if a user record */ + const rec_t* rec) /* in: record */ + __attribute__((const)); +/**************************************************************** +TRUE if the record is the supremum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_supremum( +/*=================*/ + /* out: TRUE if the supremum record */ + const rec_t* rec) /* in: record */ + __attribute__((const)); + +/**************************************************************** +TRUE if the record is the infimum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + /* out: TRUE if the infimum record */ + const rec_t* rec) /* in: record */ + __attribute__((const)); +/******************************************************************* +Looks for the record which owns the given record. */ +UNIV_INLINE +rec_t* +page_rec_find_owner_rec( +/*====================*/ + /* out: the owner record */ + rec_t* rec); /* in: the physical record */ +/*************************************************************************** +This is a low-level operation which is used in a database index creation +to update the page number of a created B-tree to a data dictionary +record. */ +UNIV_INTERN +void +page_rec_write_index_page_no( +/*=========================*/ + rec_t* rec, /* in: record to update */ + ulint i, /* in: index of the field to update */ + ulint page_no,/* in: value to write */ + mtr_t* mtr); /* in: mtr */ +/**************************************************************** +Returns the maximum combined size of records which can be inserted on top +of record heap. 
*/ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + /* out: maximum combined size for + inserted records */ + const page_t* page, /* in: index page */ + ulint n_recs);/* in: number of records */ +/**************************************************************** +Returns the maximum combined size of records which can be inserted on top +of record heap if page is first reorganized. */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + /* out: maximum combined size for + inserted records */ + const page_t* page, /* in: index page */ + ulint n_recs);/* in: number of records */ +/***************************************************************** +Calculates free space if a page is emptied. */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + /* out: free space */ + ulint comp) /* in: nonzero=compact page format */ + __attribute__((const)); +/************************************************************** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + /* out: REC_N_NEW_EXTRA_BYTES + or REC_N_OLD_EXTRA_BYTES */ + const rec_t* rec); /* in: physical record */ +/**************************************************************** +Returns the sum of the sizes of the records in the record list +excluding the infimum and supremum records. */ +UNIV_INLINE +ulint +page_get_data_size( +/*===============*/ + /* out: data in bytes */ + const page_t* page); /* in: index page */ +/**************************************************************** +Allocates a block of memory from the head of the free list +of an index page. */ +UNIV_INLINE +void +page_mem_alloc_free( +/*================*/ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with enough + space available for inserting the record, + or NULL */ + rec_t* next_rec,/* in: pointer to the new head of the + free record list */ + ulint need); /* in: number of bytes allocated */ +/**************************************************************** +Allocates a block of memory from the heap of an index page. */ +UNIV_INTERN +byte* +page_mem_alloc_heap( +/*================*/ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with enough + space available for inserting the record, + or NULL */ + ulint need, /* in: total number of bytes needed */ + ulint* heap_no);/* out: this contains the heap number + of the allocated record + if allocation succeeds */ +/**************************************************************** +Puts a record to free list. */ +UNIV_INLINE +void +page_mem_free( +/*==========*/ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + rec_t* rec, /* in: pointer to the (origin of) record */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +Create an uncompressed B-tree index page. 
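+A typical call, sketched under the assumption that block has already
+been latched in mtr and that index is the dict_index_t* of the tree
+(the comp argument is nonzero for the new-style compact record format):
+
+	page_t*	page = page_create(block, mtr,
+				   dict_table_is_comp(index->table));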
*/ +UNIV_INTERN +page_t* +page_create( +/*========*/ + /* out: pointer to the page */ + buf_block_t* block, /* in: a buffer block where the + page is created */ + mtr_t* mtr, /* in: mini-transaction handle */ + ulint comp); /* in: nonzero=compact page format */ +/************************************************************** +Create a compressed B-tree index page. */ +UNIV_INTERN +page_t* +page_create_zip( +/*============*/ + /* out: pointer to the page */ + buf_block_t* block, /* in/out: a buffer frame where the + page is created */ + dict_index_t* index, /* in: the index of the page */ + ulint level, /* in: the B-tree level of the page */ + mtr_t* mtr); /* in: mini-transaction handle */ + +/***************************************************************** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. */ +UNIV_INTERN +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /* in: index page to copy to */ + buf_block_t* block, /* in: index page of rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Copies records from page to new_page, from the given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. */ +UNIV_INTERN +rec_t* +page_copy_rec_list_end( +/*===================*/ + /* out: pointer to the original + successor of the infimum record + on new_page, or NULL on zip overflow + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page to copy to */ + buf_block_t* block, /* in: index page containing rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull)); +/***************************************************************** +Copies records from page to new_page, up to the given record, NOT +including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. */ +UNIV_INTERN +rec_t* +page_copy_rec_list_start( +/*=====================*/ + /* out: pointer to the original + predecessor of the supremum record + on new_page, or NULL on zip overflow + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page to copy to */ + buf_block_t* block, /* in: index page containing rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull)); +/***************************************************************** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. 
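+Callers that do not know the number or the combined size of the records
+to be deleted may pass ULINT_UNDEFINED for n_recs and size, as the
+parameter comments below state; for example:
+
+	page_delete_rec_list_end(rec, block, index,
+				 ULINT_UNDEFINED, ULINT_UNDEFINED, mtr);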
*/ +UNIV_INTERN +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /* in: pointer to record on page */ + buf_block_t* block, /* in: buffer block of the page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull)); +/***************************************************************** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. */ +UNIV_INTERN +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /* in: record on page */ + buf_block_t* block, /* in: buffer block of the page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull)); +/***************************************************************** +Moves record list end to another page. Moved records include +split_rec. */ +UNIV_INTERN +ibool +page_move_rec_list_end( +/*===================*/ + /* out: TRUE on success; FALSE on + compression failure + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page where to move */ + buf_block_t* block, /* in: index page from where to move */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull(1, 2, 4, 5))); +/***************************************************************** +Moves record list start to another page. Moved records do not include +split_rec. */ +UNIV_INTERN +ibool +page_move_rec_list_start( +/*=====================*/ + /* out: TRUE on success; FALSE on + compression failure */ + buf_block_t* new_block, /* in/out: index page where to move */ + buf_block_t* block, /* in/out: page containing split_rec */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull(1, 2, 4, 5))); +/******************************************************************** +Splits a directory slot which owns too many records. */ +UNIV_INTERN +void +page_dir_split_slot( +/*================*/ + page_t* page, /* in: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be written, or NULL */ + ulint slot_no)/* in: the directory slot */ + __attribute__((nonnull(1))); +/***************************************************************** +Tries to balance the given directory slot with too few records +with the upper neighbor, so that there are at least the minimum number +of records owned by the slot; this may result in the merging of +two slots. */ +UNIV_INTERN +void +page_dir_balance_slot( +/*==================*/ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint slot_no)/* in: the directory slot */ + __attribute__((nonnull(1))); +/************************************************************** +Parses a log record of a record list end or start deletion. 
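+As the parameter comments below indicate, block and mtr may be NULL;
+recovery code appears to use this to parse a log record (for example,
+to find where it ends) without applying it to any page, in which case
+only the end-of-record pointer is computed and returned.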
*/ +UNIV_INTERN +byte* +page_parse_delete_rec_list( +/*=======================*/ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in/out: buffer block or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr or NULL */ +/*************************************************************** +Parses a redo log record of creating a page. */ +UNIV_INTERN +byte* +page_parse_create( +/*==============*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ + buf_block_t* block, /* in: block or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ +/**************************************************************** +Prints record contents including the data relevant only in +the index page context. */ +UNIV_INTERN +void +page_rec_print( +/*===========*/ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: record descriptor */ +/******************************************************************* +This is used to print the contents of the directory for +debugging purposes. */ +UNIV_INTERN +void +page_dir_print( +/*===========*/ + page_t* page, /* in: index page */ + ulint pr_n); /* in: print n first and n last entries */ +/******************************************************************* +This is used to print the contents of the page record list for +debugging purposes. */ +UNIV_INTERN +void +page_print_list( +/*============*/ + buf_block_t* block, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n); /* in: print n first and n last entries */ +/******************************************************************* +Prints the info in a page header. */ +UNIV_INTERN +void +page_header_print( +/*==============*/ + const page_t* page); +/******************************************************************* +This is used to print the contents of the page for +debugging purposes. */ +UNIV_INTERN +void +page_print( +/*=======*/ + buf_block_t* block, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn); /* in: print rn first and last records + in directory */ +/******************************************************************* +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. */ +UNIV_INTERN +ibool +page_rec_validate( +/*==============*/ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +UNIV_INTERN +void +page_check_dir( +/*===========*/ + const page_t* page); /* in: index page */ +/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. 
*/ +UNIV_INTERN +ibool +page_simple_validate_old( +/*=====================*/ + /* out: TRUE if ok */ + page_t* page); /* in: old-style index page */ +/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ +UNIV_INTERN +ibool +page_simple_validate_new( +/*=====================*/ + /* out: TRUE if ok */ + page_t* block); /* in: new-style index page */ +/******************************************************************* +This function checks the consistency of an index page. */ +UNIV_INTERN +ibool +page_validate( +/*==========*/ + /* out: TRUE if ok */ + page_t* page, /* in: index page */ + dict_index_t* index); /* in: data dictionary index containing + the page record type definition */ +/******************************************************************* +Looks in the page record list for a record with the given heap number. */ + +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + /* out: record, NULL if not found */ + const page_t* page, /* in: index page */ + ulint heap_no);/* in: heap number */ + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif + +#ifndef UNIV_NONINL +#include "page0page.ic" +#endif + +#endif diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic new file mode 100644 index 00000000000..df0f6f8b360 --- /dev/null +++ b/storage/xtradb/include/page0page.ic @@ -0,0 +1,1060 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "rem0cmp.h" +#include "mtr0log.h" +#include "page0zip.h" + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE +#endif + +/**************************************************************** +Gets the start of a page. */ +UNIV_INLINE +page_t* +page_align( +/*=======*/ + /* out: start of the page */ + const void* ptr) /* in: pointer to page frame */ +{ + return((page_t*) ut_align_down(ptr, UNIV_PAGE_SIZE)); +} +/**************************************************************** +Gets the offset within a page. */ +UNIV_INLINE +ulint +page_offset( +/*========*/ + /* out: offset from the start of the page */ + const void* ptr) /* in: pointer to page frame */ +{ + return(ut_align_offset(ptr, UNIV_PAGE_SIZE)); +} +/***************************************************************** +Returns the max trx id field value. 
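+The value is stored as an 8-byte integer at PAGE_HEADER +
+PAGE_MAX_TRX_ID, as the definition below shows. Note that
+page_update_max_trx_id() further down only overwrites the field when
+the new trx_id compares greater, so the stored value never decreases.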
*/ +UNIV_INLINE +dulint +page_get_max_trx_id( +/*================*/ + const page_t* page) /* in: page */ +{ + ut_ad(page); + + return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID)); +} + +/***************************************************************** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dulint trx_id) /* in: transaction id */ +{ + ut_ad(block); + + if (ut_dulint_cmp(page_get_max_trx_id(buf_block_get_frame(block)), + trx_id) < 0) { + + page_set_max_trx_id(block, page_zip, trx_id); + } +} + +/***************************************************************** +Reads the given header field. */ +UNIV_INLINE +ulint +page_header_get_field( +/*==================*/ + const page_t* page, /* in: page */ + ulint field) /* in: PAGE_LEVEL, ... */ +{ + ut_ad(page); + ut_ad(field <= PAGE_INDEX_ID); + + return(mach_read_from_2(page + PAGE_HEADER + field)); +} + +/***************************************************************** +Sets the given header field. */ +UNIV_INLINE +void +page_header_set_field( +/*==================*/ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /* in: PAGE_N_DIR_SLOTS, ... */ + ulint val) /* in: value */ +{ + ut_ad(page); + ut_ad(field <= PAGE_N_RECS); + ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE); + ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE); + + mach_write_to_2(page + PAGE_HEADER + field, val); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_header(page_zip, + page + PAGE_HEADER + field, 2, NULL); + } +} + +/***************************************************************** +Returns the offset stored in the given header field. */ +UNIV_INLINE +ulint +page_header_get_offs( +/*=================*/ + /* out: offset from the start of the page, + or 0 */ + const page_t* page, /* in: page */ + ulint field) /* in: PAGE_FREE, ... */ +{ + ulint offs; + + ut_ad(page); + ut_ad((field == PAGE_FREE) + || (field == PAGE_LAST_INSERT) + || (field == PAGE_HEAP_TOP)); + + offs = page_header_get_field(page, field); + + ut_ad((field != PAGE_HEAP_TOP) || offs); + + return(offs); +} + +/***************************************************************** +Sets the pointer stored in the given header field. */ +UNIV_INLINE +void +page_header_set_ptr( +/*================*/ + page_t* page, /* in: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint field, /* in: PAGE_FREE, ... */ + const byte* ptr) /* in: pointer or NULL*/ +{ + ulint offs; + + ut_ad(page); + ut_ad((field == PAGE_FREE) + || (field == PAGE_LAST_INSERT) + || (field == PAGE_HEAP_TOP)); + + if (ptr == NULL) { + offs = 0; + } else { + offs = ptr - page; + } + + ut_ad((field != PAGE_HEAP_TOP) || offs); + + page_header_set_field(page, page_zip, field, offs); +} + +/***************************************************************** +Resets the last insert info field in the page header. Writes to mlog +about this operation. 
*/ +UNIV_INLINE +void +page_header_reset_last_insert( +/*==========================*/ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(page && mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0); + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_LAST_INSERT), + 2, mtr); + } else { + mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0, + MLOG_2BYTES, mtr); + } +} + +/**************************************************************** +Determine whether the page is in new-style compact format. */ +UNIV_INLINE +ulint +page_is_comp( +/*=========*/ + /* out: nonzero if the page is in compact + format, zero if it is in old-style format */ + const page_t* page) /* in: index page */ +{ + return(UNIV_EXPECT(page_header_get_field(page, PAGE_N_HEAP) & 0x8000, + 0x8000)); +} + +/**************************************************************** +TRUE if the record is on a page in compact format. */ +UNIV_INLINE +ulint +page_rec_is_comp( +/*=============*/ + /* out: nonzero if in compact format */ + const rec_t* rec) /* in: record */ +{ + return(page_is_comp(page_align(rec))); +} + +/******************************************************************* +Returns the heap number of a record. */ +UNIV_INLINE +ulint +page_rec_get_heap_no( +/*=================*/ + /* out: heap number */ + const rec_t* rec) /* in: the physical record */ +{ + if (page_rec_is_comp(rec)) { + return(rec_get_heap_no_new(rec)); + } else { + return(rec_get_heap_no_old(rec)); + } +} + +/**************************************************************** +Determine whether the page is a B-tree leaf. */ +UNIV_INLINE +ibool +page_is_leaf( +/*=========*/ + /* out: TRUE if the page is a B-tree leaf */ + const page_t* page) /* in: page */ +{ + return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL))); +} + +/**************************************************************** +Gets the offset of the first record on the page. */ +UNIV_INLINE +ulint +page_get_infimum_offset( +/*====================*/ + /* out: offset of the first record + in record list, relative from page */ + const page_t* page) /* in: page which must have record(s) */ +{ + ut_ad(page); + ut_ad(!page_offset(page)); + + if (page_is_comp(page)) { + return(PAGE_NEW_INFIMUM); + } else { + return(PAGE_OLD_INFIMUM); + } +} + +/**************************************************************** +Gets the offset of the last record on the page. */ +UNIV_INLINE +ulint +page_get_supremum_offset( +/*=====================*/ + /* out: offset of the last record in + record list, relative from page */ + const page_t* page) /* in: page which must have record(s) */ +{ + ut_ad(page); + ut_ad(!page_offset(page)); + + if (page_is_comp(page)) { + return(PAGE_NEW_SUPREMUM); + } else { + return(PAGE_OLD_SUPREMUM); + } +} + +/**************************************************************** +TRUE if the record is a user record on the page. 
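+Equivalently, an offset denotes a user record iff it differs from all
+four of PAGE_NEW_INFIMUM, PAGE_NEW_SUPREMUM, PAGE_OLD_INFIMUM and
+PAGE_OLD_SUPREMUM; the preprocessor checks in the definition below pin
+down the relative order of these constants so that the boundary
+assertions on offset remain valid.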
*/ +UNIV_INLINE +ibool +page_rec_is_user_rec_low( +/*=====================*/ + /* out: TRUE if a user record */ + ulint offset) /* in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); +#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM +# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM" +#endif +#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM +# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM +# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM" +#endif +#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM +# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM" +#endif +#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END +# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END" +#endif +#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END +# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END" +#endif + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(UNIV_LIKELY(offset != PAGE_NEW_SUPREMUM) + && UNIV_LIKELY(offset != PAGE_NEW_INFIMUM) + && UNIV_LIKELY(offset != PAGE_OLD_INFIMUM) + && UNIV_LIKELY(offset != PAGE_OLD_SUPREMUM)); +} + +/**************************************************************** +TRUE if the record is the supremum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_supremum_low( +/*=====================*/ + /* out: TRUE if the supremum record */ + ulint offset) /* in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM) + || UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM)); +} + +/**************************************************************** +TRUE if the record is the infimum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_infimum_low( +/*====================*/ + /* out: TRUE if the infimum record */ + ulint offset) /* in: record offset on page */ +{ + ut_ad(offset >= PAGE_NEW_INFIMUM); + ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START); + + return(UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM) + || UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM)); +} + +/**************************************************************** +TRUE if the record is a user record on the page. */ +UNIV_INLINE +ibool +page_rec_is_user_rec( +/*=================*/ + /* out: TRUE if a user record */ + const rec_t* rec) /* in: record */ +{ + return(page_rec_is_user_rec_low(page_offset(rec))); +} + +/**************************************************************** +TRUE if the record is the supremum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_supremum( +/*=================*/ + /* out: TRUE if the supremum record */ + const rec_t* rec) /* in: record */ +{ + return(page_rec_is_supremum_low(page_offset(rec))); +} + +/**************************************************************** +TRUE if the record is the infimum record on a page. */ +UNIV_INLINE +ibool +page_rec_is_infimum( +/*================*/ + /* out: TRUE if the infimum record */ + const rec_t* rec) /* in: record */ +{ + return(page_rec_is_infimum_low(page_offset(rec))); +} + +/***************************************************************** +Compares a data tuple to a physical record. Differs from the function +cmp_dtuple_rec_with_match in the way that the record must reside on an +index page, and also page infimum and supremum records can be given in +the parameter rec. These are considered as the negative infinity and +the positive infinity in the alphabetical order. 
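+Hence the return convention visible in the definition below: if rec is
+the page infimum, 1 is returned (any tuple is greater than negative
+infinity); if rec is the page supremum, -1 is returned; otherwise the
+comparison is delegated to cmp_dtuple_rec_with_match().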
*/ +UNIV_INLINE +int +page_cmp_dtuple_rec_with_match( +/*===========================*/ + /* out: 1, 0, -1, if dtuple is greater, equal, + less than rec, respectively, when only the + common first fields are compared */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record on a page; may also + be page infimum or supremum, in which case + matched-parameter values below are not + affected */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint* matched_fields, /* in/out: number of already completely + matched fields; when function returns + contains the value for current comparison */ + ulint* matched_bytes) /* in/out: number of already matched + bytes within the first field not completely + matched; when function returns contains the + value for current comparison */ +{ + ulint rec_offset; + + ut_ad(dtuple_check_typed(dtuple)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec)); + + rec_offset = page_offset(rec); + + if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_INFIMUM) + || UNIV_UNLIKELY(rec_offset == PAGE_OLD_INFIMUM)) { + return(1); + } + if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_SUPREMUM) + || UNIV_UNLIKELY(rec_offset == PAGE_OLD_SUPREMUM)) { + return(-1); + } + + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + matched_fields, + matched_bytes)); +} + +/***************************************************************** +Gets the page number. */ +UNIV_INLINE +ulint +page_get_page_no( +/*=============*/ + /* out: page number */ + const page_t* page) /* in: page */ +{ + ut_ad(page == page_align((page_t*) page)); + return(mach_read_from_4(page + FIL_PAGE_OFFSET)); +} + +/***************************************************************** +Gets the tablespace identifier. */ +UNIV_INLINE +ulint +page_get_space_id( +/*==============*/ + /* out: space id */ + const page_t* page) /* in: page */ +{ + ut_ad(page == page_align((page_t*) page)); + return(mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); +} + +/***************************************************************** +Gets the number of user records on page (infimum and supremum records +are not user records). */ +UNIV_INLINE +ulint +page_get_n_recs( +/*============*/ + /* out: number of user records */ + const page_t* page) /* in: index page */ +{ + return(page_header_get_field(page, PAGE_N_RECS)); +} + +/***************************************************************** +Gets the number of dir slots in directory. */ +UNIV_INLINE +ulint +page_dir_get_n_slots( +/*=================*/ + /* out: number of slots */ + const page_t* page) /* in: index page */ +{ + return(page_header_get_field(page, PAGE_N_DIR_SLOTS)); +} +/***************************************************************** +Sets the number of dir slots in directory. */ +UNIV_INLINE +void +page_dir_set_n_slots( +/*=================*/ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + ulint n_slots)/* in: number of slots */ +{ + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots); +} + +/***************************************************************** +Gets the number of records in the heap. 
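+Only the low 15 bits of the PAGE_N_HEAP field carry this count; the
+high bit (0x8000) is the compact-format flag tested by page_is_comp(),
+which is why the getter below masks with 0x7fff and
+page_dir_set_n_heap() preserves the bit. For example, a compact page
+holding only the infimum, the supremum and one user record stores
+0x8003 in the field.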
*/ +UNIV_INLINE +ulint +page_dir_get_n_heap( +/*================*/ + /* out: number of user records */ + const page_t* page) /* in: index page */ +{ + return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff); +} + +/***************************************************************** +Sets the number of records in the heap. */ +UNIV_INLINE +void +page_dir_set_n_heap( +/*================*/ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL. + Note that the size of the dense page directory + in the compressed page trailer is + n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */ + ulint n_heap) /* in: number of records */ +{ + ut_ad(n_heap < 0x8000); + ut_ad(!page_zip || n_heap + == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1); + + page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap + | (0x8000 + & page_header_get_field(page, PAGE_N_HEAP))); +} + +#ifdef UNIV_DEBUG +/***************************************************************** +Gets pointer to nth directory slot. */ +UNIV_INLINE +page_dir_slot_t* +page_dir_get_nth_slot( +/*==================*/ + /* out: pointer to dir slot */ + const page_t* page, /* in: index page */ + ulint n) /* in: position */ +{ + ut_ad(page_dir_get_n_slots(page) > n); + + return((page_dir_slot_t*) + page + UNIV_PAGE_SIZE - PAGE_DIR + - (n + 1) * PAGE_DIR_SLOT_SIZE); +} +#endif /* UNIV_DEBUG */ + +/****************************************************************** +Used to check the consistency of a record on a page. */ +UNIV_INLINE +ibool +page_rec_check( +/*===========*/ + /* out: TRUE if succeed */ + const rec_t* rec) /* in: record */ +{ + const page_t* page = page_align(rec); + + ut_a(rec); + + ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP)); + ut_a(page_offset(rec) >= PAGE_DATA); + + return(TRUE); +} + +/******************************************************************* +Gets the record pointed to by a directory slot. */ +UNIV_INLINE +const rec_t* +page_dir_slot_get_rec( +/*==================*/ + /* out: pointer to record */ + const page_dir_slot_t* slot) /* in: directory slot */ +{ + return(page_align(slot) + mach_read_from_2(slot)); +} + +/******************************************************************* +This is used to set the record offset in a directory slot. */ +UNIV_INLINE +void +page_dir_slot_set_rec( +/*==================*/ + page_dir_slot_t* slot, /* in: directory slot */ + rec_t* rec) /* in: record on the page */ +{ + ut_ad(page_rec_check(rec)); + + mach_write_to_2(slot, page_offset(rec)); +} + +/******************************************************************* +Gets the number of records owned by a directory slot. */ +UNIV_INLINE +ulint +page_dir_slot_get_n_owned( +/*======================*/ + /* out: number of records */ + const page_dir_slot_t* slot) /* in: page directory slot */ +{ + const rec_t* rec = page_dir_slot_get_rec(slot); + if (page_rec_is_comp(slot)) { + return(rec_get_n_owned_new(rec)); + } else { + return(rec_get_n_owned_old(rec)); + } +} + +/******************************************************************* +This is used to set the owned records field of a directory slot. 
*/ +UNIV_INLINE +void +page_dir_slot_set_n_owned( +/*======================*/ + page_dir_slot_t*slot, /* in/out: directory slot */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n) /* in: number of records owned by the slot */ +{ + rec_t* rec = (rec_t*) page_dir_slot_get_rec(slot); + if (page_rec_is_comp(slot)) { + rec_set_n_owned_new(rec, page_zip, n); + } else { + ut_ad(!page_zip); + rec_set_n_owned_old(rec, n); + } +} + +/**************************************************************** +Calculates the space reserved for directory slots of a given number of +records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE / +PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */ +UNIV_INLINE +ulint +page_dir_calc_reserved_space( +/*=========================*/ + ulint n_recs) /* in: number of records */ +{ + return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1) + / PAGE_DIR_SLOT_MIN_N_OWNED); +} + +/**************************************************************** +Gets the pointer to the next record on the page. */ +UNIV_INLINE +const rec_t* +page_rec_get_next_low( +/*==================*/ + /* out: pointer to next record */ + const rec_t* rec, /* in: pointer to record */ + ulint comp) /* in: nonzero=compact page layout */ +{ + ulint offs; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + offs = rec_get_next_offs(rec, comp); + + if (UNIV_UNLIKELY(offs >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset is nonsensical %lu" + " in record at offset %lu\n" + "InnoDB: rec address %p, space id %lu, page %lu\n", + (ulong)offs, (ulong) page_offset(rec), + (void*) rec, + (ulong) page_get_space_id(page), + (ulong) page_get_page_no(page)); + buf_page_print(page, 0); + + ut_error; + } + + if (UNIV_UNLIKELY(offs == 0)) { + + return(NULL); + } + + return(page + offs); +} + +/**************************************************************** +Gets the pointer to the next record on the page. */ +UNIV_INLINE +rec_t* +page_rec_get_next( +/*==============*/ + /* out: pointer to next record */ + rec_t* rec) /* in: pointer to record */ +{ + return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/**************************************************************** +Gets the pointer to the next record on the page. */ +UNIV_INLINE +const rec_t* +page_rec_get_next_const( +/*====================*/ + /* out: pointer to next record */ + const rec_t* rec) /* in: pointer to record */ +{ + return(page_rec_get_next_low(rec, page_rec_is_comp(rec))); +} + +/**************************************************************** +Sets the pointer to the next record on the page. */ +UNIV_INLINE +void +page_rec_set_next( +/*==============*/ + rec_t* rec, /* in: pointer to record, + must not be page supremum */ + rec_t* next) /* in: pointer to next record, + must not be page infimum */ +{ + ulint offs; + + ut_ad(page_rec_check(rec)); + ut_ad(!page_rec_is_supremum(rec)); + ut_ad(rec != next); + + ut_ad(!next || !page_rec_is_infimum(next)); + ut_ad(!next || page_align(rec) == page_align(next)); + + if (UNIV_LIKELY(next != NULL)) { + offs = page_offset(next); + } else { + offs = 0; + } + + if (page_rec_is_comp(rec)) { + rec_set_next_offs_new(rec, offs); + } else { + rec_set_next_offs_old(rec, offs); + } +} + +/**************************************************************** +Gets the pointer to the previous record. 
*/ +UNIV_INLINE +const rec_t* +page_rec_get_prev_const( +/*====================*/ + /* out: pointer to previous record */ + const rec_t* rec) /* in: pointer to record, must not be page + infimum */ +{ + const page_dir_slot_t* slot; + ulint slot_no; + const rec_t* rec2; + const rec_t* prev_rec = NULL; + const page_t* page; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + + ut_ad(!page_rec_is_infimum(rec)); + + slot_no = page_dir_find_owner_slot(rec); + + ut_a(slot_no != 0); + + slot = page_dir_get_nth_slot(page, slot_no - 1); + + rec2 = page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, TRUE); + } + } else { + while (rec != rec2) { + prev_rec = rec2; + rec2 = page_rec_get_next_low(rec2, FALSE); + } + } + + ut_a(prev_rec); + + return(prev_rec); +} + +/**************************************************************** +Gets the pointer to the previous record. */ +UNIV_INLINE +rec_t* +page_rec_get_prev( +/*==============*/ + /* out: pointer to previous record */ + rec_t* rec) /* in: pointer to record, must not be page + infimum */ +{ + return((rec_t*) page_rec_get_prev_const(rec)); +} + +/******************************************************************* +Looks for the record which owns the given record. */ +UNIV_INLINE +rec_t* +page_rec_find_owner_rec( +/*====================*/ + /* out: the owner record */ + rec_t* rec) /* in: the physical record */ +{ + ut_ad(page_rec_check(rec)); + + if (page_rec_is_comp(rec)) { + while (rec_get_n_owned_new(rec) == 0) { + rec = page_rec_get_next(rec); + } + } else { + while (rec_get_n_owned_old(rec) == 0) { + rec = page_rec_get_next(rec); + } + } + + return(rec); +} + +/************************************************************** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + /* out: REC_N_NEW_EXTRA_BYTES + or REC_N_OLD_EXTRA_BYTES */ + const rec_t* rec) /* in: physical record */ +{ +#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES +# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES" +#endif + return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec)); +} + +/**************************************************************** +Returns the sum of the sizes of the records in the record list, excluding +the infimum and supremum records. */ +UNIV_INLINE +ulint +page_get_data_size( +/*===============*/ + /* out: data in bytes */ + const page_t* page) /* in: index page */ +{ + ulint ret; + + ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP) + - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) + - page_header_get_field(page, PAGE_GARBAGE)); + + ut_ad(ret < UNIV_PAGE_SIZE); + + return(ret); +} + + +/**************************************************************** +Allocates a block of memory from the free list of an index page. 
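+The caller must already have determined next_rec as the successor of
+the current head of the free list; as the definition below shows, the
+function merely advances PAGE_FREE to next_rec and subtracts the
+allocated bytes from PAGE_GARBAGE (debug builds assert both of these
+preconditions).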
*/ +UNIV_INLINE +void +page_mem_alloc_free( +/*================*/ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with enough + space available for inserting the record, + or NULL */ + rec_t* next_rec,/* in: pointer to the new head of the + free record list */ + ulint need) /* in: number of bytes allocated */ +{ + ulint garbage; + +#ifdef UNIV_DEBUG + const rec_t* old_rec = page_header_get_ptr(page, PAGE_FREE); + ulint next_offs; + + ut_ad(old_rec); + next_offs = rec_get_next_offs(old_rec, page_is_comp(page)); + ut_ad(next_rec == (next_offs ? page + next_offs : NULL)); +#endif + + page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec); + + garbage = page_header_get_field(page, PAGE_GARBAGE); + ut_ad(garbage >= need); + + page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need); +} + +/***************************************************************** +Calculates free space if a page is emptied. */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + /* out: free space */ + ulint comp) /* in: nonzero=compact page layout */ +{ + if (UNIV_LIKELY(comp)) { + return((ulint)(UNIV_PAGE_SIZE + - PAGE_NEW_SUPREMUM_END + - PAGE_DIR + - 2 * PAGE_DIR_SLOT_SIZE)); + } + + return((ulint)(UNIV_PAGE_SIZE + - PAGE_OLD_SUPREMUM_END + - PAGE_DIR + - 2 * PAGE_DIR_SLOT_SIZE)); +} + +/**************************************************************** +Each user record on a page, and also the deleted user records in the heap +takes its size plus the fraction of the dir cell size / +PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the +value of page_get_free_space_of_empty, the insert is impossible, otherwise +it is allowed. This function returns the maximum combined size of records +which can be inserted on top of the record heap. */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + /* out: maximum combined size for + inserted records */ + const page_t* page, /* in: index page */ + ulint n_recs) /* in: number of records */ +{ + ulint occupied; + ulint free_space; + + if (page_is_comp(page)) { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_NEW_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(TRUE); + } else { + occupied = page_header_get_field(page, PAGE_HEAP_TOP) + - PAGE_OLD_SUPREMUM_END + + page_dir_calc_reserved_space( + n_recs + page_dir_get_n_heap(page) - 2); + + free_space = page_get_free_space_of_empty(FALSE); + } + + /* Above the 'n_recs +' part reserves directory space for the new + inserted records; the '- 2' excludes page infimum and supremum + records */ + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/**************************************************************** +Returns the maximum combined size of records which can be inserted on top +of the record heap if a page is first reorganized. 
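+Unlike page_get_max_insert_size(), this variant charges only the live
+data against the free space of an empty page, since reorganization
+would reclaim the garbage; in outline (see the definition below):
+
+	occupied = page_get_data_size(page)
+		 + page_dir_calc_reserved_space(n_recs + PAGE_N_RECS)
+	result	 = free space of an empty page - occupied,
+		   or 0 if occupied is larger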
*/ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + /* out: maximum combined size for + inserted records */ + const page_t* page, /* in: index page */ + ulint n_recs) /* in: number of records */ +{ + ulint occupied; + ulint free_space; + + occupied = page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page)); + + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + if (occupied > free_space) { + + return(0); + } + + return(free_space - occupied); +} + +/**************************************************************** +Puts a record to free list. */ +UNIV_INLINE +void +page_mem_free( +/*==========*/ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + rec_t* rec, /* in: pointer to the (origin of) record */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + rec_t* free; + ulint garbage; + + ut_ad(rec_offs_validate(rec, index, offsets)); + free = page_header_get_ptr(page, PAGE_FREE); + + page_rec_set_next(rec, free); + page_header_set_ptr(page, page_zip, PAGE_FREE, rec); + + garbage = page_header_get_field(page, PAGE_GARBAGE); + + page_header_set_field(page, page_zip, PAGE_GARBAGE, + garbage + rec_offs_size(offsets)); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_dir_delete(page_zip, rec, index, offsets, free); + } else { + page_header_set_field(page, page_zip, PAGE_N_RECS, + page_get_n_recs(page) - 1); + } +} + +#ifdef UNIV_MATERIALIZE +#undef UNIV_INLINE +#define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/xtradb/include/page0types.h b/storage/xtradb/include/page0types.h new file mode 100644 index 00000000000..06af7a63d58 --- /dev/null +++ b/storage/xtradb/include/page0types.h @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0types_h +#define page0types_h + +#include "univ.i" +#include "dict0types.h" +#include "mtr0types.h" + +/* Type of the index page */ +/* The following define eliminates a name collision on HP-UX */ +#define page_t ib_page_t +typedef byte page_t; +typedef struct page_search_struct page_search_t; +typedef struct page_cur_struct page_cur_t; + +typedef byte page_zip_t; +typedef struct page_zip_des_struct page_zip_des_t; + +/* The following definitions would better belong to page0zip.h, +but we cannot include page0zip.h from rem0rec.ic, because +page0*.h includes rem0rec.h and may include rem0rec.ic. 
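+In the ssize encoding defined below, 0 denotes an uncompressed page and
+a nonzero value s denotes a compressed page of
+PAGE_ZIP_MIN_SIZE << (s - 1) bytes; with PAGE_ZIP_MIN_SIZE_SHIFT == 10
+that is 1 KiB for s == 1, 2 KiB for s == 2, and so on up to
+UNIV_PAGE_SIZE.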
*/ + +#define PAGE_ZIP_SSIZE_BITS 3 + +#define PAGE_ZIP_MIN_SIZE_SHIFT 10 /* log2 of smallest compressed size */ +#define PAGE_ZIP_MIN_SIZE (1 << PAGE_ZIP_MIN_SIZE_SHIFT) + +#define PAGE_ZIP_NUM_SSIZE (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 2) +#if PAGE_ZIP_NUM_SSIZE > (1 << PAGE_ZIP_SSIZE_BITS) +# error "PAGE_ZIP_NUM_SSIZE > (1 << PAGE_ZIP_SSIZE_BITS)" +#endif + +/* Compressed page descriptor */ +struct page_zip_des_struct +{ + page_zip_t* data; /* compressed page data */ + +#ifdef UNIV_DEBUG + unsigned m_start:16; /* start offset of modification log */ +#endif /* UNIV_DEBUG */ + unsigned m_end:16; /* end offset of modification log */ + unsigned m_nonempty:1; /* TRUE if the modification log + is not empty */ + unsigned n_blobs:12; /* number of externally stored + columns on the page; the maximum + is 744 on a 16 KiB page */ + unsigned ssize:PAGE_ZIP_SSIZE_BITS; + /* 0 or compressed page size; + the size in bytes is + PAGE_ZIP_MIN_SIZE << (ssize - 1). */ +}; + +/** Compression statistics for a given page size */ +struct page_zip_stat_struct { + /** Number of page compressions */ + ulint compressed; + /** Number of successful page compressions */ + ulint compressed_ok; + /** Number of page decompressions */ + ulint decompressed; + /** Duration of page compressions in microseconds */ + ib_uint64_t compressed_usec; + /** Duration of page decompressions in microseconds */ + ib_uint64_t decompressed_usec; +}; + +typedef struct page_zip_stat_struct page_zip_stat_t; + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +extern page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE - 1]; + +/************************************************************************** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Shift the dense page directory when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: deleted record */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets,/* in: rec_get_offsets(rec) */ + const byte* free) /* in: previous start of the free list */ + __attribute__((nonnull(1,2,3,4))); + +/************************************************************************** +Add a slot to the dense page directory. 
*/ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /* in/out: compressed page */ + ulint is_clustered) /* in: nonzero for clustered index, + zero for others */ + __attribute__((nonnull)); +#endif diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h new file mode 100644 index 00000000000..0183e013d05 --- /dev/null +++ b/storage/xtradb/include/page0zip.h @@ -0,0 +1,455 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifndef page0zip_h +#define page0zip_h + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "mtr0types.h" +#include "page0types.h" +#include "buf0types.h" +#include "dict0types.h" +#include "mem0mem.h" + +/************************************************************************** +Determine the size of a compressed page in bytes. */ +UNIV_INLINE +ulint +page_zip_get_size( +/*==============*/ + /* out: size in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ + __attribute__((nonnull, pure)); +/************************************************************************** +Set the size of a compressed page in bytes. */ +UNIV_INLINE +void +page_zip_set_size( +/*==============*/ + page_zip_des_t* page_zip, /* in/out: compressed page */ + ulint size); /* in: size in bytes */ + +/************************************************************************** +Determine if a record is so big that it needs to be stored externally. */ +UNIV_INLINE +ibool +page_zip_rec_needs_ext( +/*===================*/ + /* out: FALSE if the entire record + can be stored locally on the page */ + ulint rec_size, /* in: length of the record in bytes */ + ulint comp, /* in: nonzero=compact format */ + ulint n_fields, /* in: number of fields in the record; + ignored if zip_size == 0 */ + ulint zip_size) /* in: compressed page size in bytes, or 0 */ + __attribute__((const)); + +/************************************************************************** +Determine the guaranteed free space on an empty page. */ +UNIV_INTERN +ulint +page_zip_empty_size( +/*================*/ + /* out: minimum payload size on the page */ + ulint n_fields, /* in: number of columns in the index */ + ulint zip_size) /* in: compressed page size in bytes */ + __attribute__((const)); + +/************************************************************************** +Initialize a compressed page descriptor. 
*/ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip); /* in/out: compressed page + descriptor */ + +/************************************************************************** +Configure the zlib allocator to use the given memory heap. */ +UNIV_INTERN +void +page_zip_set_alloc( +/*===============*/ + void* stream, /* in/out: zlib stream */ + mem_heap_t* heap); /* in: memory heap to use */ + +/************************************************************************** +Compress a page. */ +UNIV_INTERN +ibool +page_zip_compress( +/*==============*/ + /* out: TRUE on success, FALSE on failure; + page_zip will be left intact on failure. */ + page_zip_des_t* page_zip,/* in: size; out: data, n_blobs, + m_start, m_end, m_nonempty */ + const page_t* page, /* in: uncompressed page */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2,3))); + +/************************************************************************** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. */ +UNIV_INTERN +ibool +page_zip_decompress( +/*================*/ + /* out: TRUE on success, FALSE on failure */ + page_zip_des_t* page_zip,/* in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page) /* out: uncompressed page, may be trashed */ + __attribute__((nonnull)); + +#ifdef UNIV_DEBUG +/************************************************************************** +Validate a compressed page descriptor. */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + /* out: TRUE if ok */ + const page_zip_des_t* page_zip); /* in: compressed page + descriptor */ +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_ZIP_DEBUG +/************************************************************************** +Check that the compressed and decompressed pages match. */ +UNIV_INTERN +ibool +page_zip_validate_low( +/*==================*/ + /* out: TRUE if valid, FALSE if not */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page, /* in: uncompressed page */ + ibool sloppy) /* in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ + __attribute__((nonnull)); +/************************************************************************** +Check that the compressed and decompressed pages match. */ +UNIV_INTERN +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page) /* in: uncompressed page */ + __attribute__((nonnull)); +#endif /* UNIV_ZIP_DEBUG */ + +/************************************************************************** +Determine how big record can be inserted without recompressing the page. */ +UNIV_INLINE +lint +page_zip_max_ins_size( +/*==================*/ + /* out: a positive number + indicating the maximum size of + a record whose insertion is + guaranteed to succeed, or + zero or negative */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ibool is_clust)/* in: TRUE if clustered index */ + __attribute__((nonnull, pure)); + +/************************************************************************** +Determine if enough space is available in the modification log. 
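+A caller preparing an insert would typically consult this predicate
+before calling page_zip_write_rec(); a sketch, assuming index and the
+combined record size length are in scope:
+
+	if (!page_zip_available(page_zip, dict_index_is_clust(index),
+				length, 1)) {
+		(fall back to page_zip_reorganize() or a page split)
+	}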
*/ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + /* out: TRUE if page_zip_write_rec() + will succeed */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ibool is_clust,/* in: TRUE if clustered index */ + ulint length, /* in: combined size of the record */ + ulint create) /* in: nonzero=add the record to + the heap */ + __attribute__((nonnull, pure)); + +/************************************************************************** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length, /* in: length of the data */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/************************************************************************** +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record being written */ + dict_index_t* index, /* in: the index the record belongs to */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint create) /* in: nonzero=insert, zero=update */ + __attribute__((nonnull)); + +/*************************************************************** +Parses a log record of writing a BLOB pointer of a record. */ +UNIV_INTERN +byte* +page_zip_parse_write_blob_ptr( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page */ + page_zip_des_t* page_zip);/* in/out: compressed page */ + +/************************************************************************** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in/out: record whose data is being + written */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint n, /* in: column index */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ + __attribute__((nonnull(1,2,3,4))); + +/*************************************************************** +Parses a log record of writing the node pointer of a record. */ +UNIV_INTERN +byte* +page_zip_parse_write_node_ptr( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page */ + page_zip_des_t* page_zip);/* in/out: compressed page */ + +/************************************************************************** +Write the node pointer of a record on a non-leaf compressed page. 
*/ +UNIV_INTERN +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + ulint ptr, /* in: node pointer */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/************************************************************************** +Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */ +UNIV_INTERN +void +page_zip_write_trx_id_and_roll_ptr( +/*===============================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint trx_id_col,/* in: column number of TRX_ID in rec */ + dulint trx_id, /* in: transaction identifier */ + dulint roll_ptr)/* in: roll_ptr */ + __attribute__((nonnull)); + +/************************************************************************** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the deleted flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the owned flag (nonzero=TRUE) */ + __attribute__((nonnull)); + +/************************************************************************** +Insert a record to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_insert( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* prev_rec,/* in: record after which to insert */ + const byte* free_rec,/* in: record from which rec was + allocated, or NULL */ + byte* rec); /* in: record to insert */ + +/************************************************************************** +Shift the dense page directory and the array of BLOB pointers +when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: deleted record */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets,/* in: rec_get_offsets(rec) */ + const byte* free) /* in: previous start of the free list */ + __attribute__((nonnull(1,2,3,4))); + +/************************************************************************** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /* in/out: compressed page */ + ulint is_clustered) /* in: nonzero for clustered index, + zero for others */ + __attribute__((nonnull)); + +/*************************************************************** +Parses a log record of writing to the header of a page. 
*/ +UNIV_INTERN +byte* +page_zip_parse_write_header( +/*========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page */ + page_zip_des_t* page_zip);/* in/out: compressed page */ + +/************************************************************************** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_low(). */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length, /* in: length of the data */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ + __attribute__((nonnull(1,2))); + +/************************************************************************** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. */ +UNIV_INTERN +ibool +page_zip_reorganize( +/*================*/ + /* out: TRUE on success, FALSE on failure; + page and page_zip will be left intact + on failure. */ + buf_block_t* block, /* in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction */ + __attribute__((nonnull)); +/************************************************************************** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. */ +UNIV_INTERN +void +page_zip_copy_recs( +/*===============*/ + page_zip_des_t* page_zip, /* out: copy of src_zip + (n_blobs, m_start, m_end, + m_nonempty, data[0..size-1]) */ + page_t* page, /* out: copy of src */ + const page_zip_des_t* src_zip, /* in: compressed page */ + const page_t* src, /* in: page */ + dict_index_t* index, /* in: index of the B-tree */ + mtr_t* mtr) /* in: mini-transaction */ + __attribute__((nonnull(1,2,3,4))); + +/************************************************************************** +Parses a log record of compressing an index page. */ +UNIV_INTERN +byte* +page_zip_parse_compress( +/*====================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* out: uncompressed page */ + page_zip_des_t* page_zip)/* out: compressed page */ + __attribute__((nonnull(1,2))); + +/************************************************************************** +Calculate the compressed page checksum. 
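The contract spelled out above, where page_zip_compress() leaves page_zip intact on failure and page_zip_reorganize() is the recovery path, leads to the following retry pattern. A sketch only, loosely modelled on page_copy_rec_list_end() in page0page.c; the buf_block_get_frame()/buf_block_get_page_zip() accessors are from buf0buf.h:

    page_zip_des_t* page_zip = buf_block_get_page_zip(block);

    if (!page_zip_compress(page_zip, buf_block_get_frame(block),
                           index, mtr)) {
        /* The compressed image was left intact; reorganizing
        recompresses the page and logs MLOG_ZIP_PAGE_COMPRESS. */
        if (!page_zip_reorganize(block, index, mtr)) {
            /* Still does not fit: the caller has to split
            the B-tree page. */
        }
        /* On a non-clustered leaf page, the insert buffer free
        bits must then be updated in the same mini-transaction. */
    }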
*/ +UNIV_INTERN +ulint +page_zip_calc_checksum( +/*===================*/ + /* out: page checksum */ + const void* data, /* in: compressed page */ + ulint size) /* in: size of compressed page */ + __attribute__((nonnull)); + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif + +#ifndef UNIV_NONINL +# include "page0zip.ic" +#endif + +#endif /* page0zip_h */ diff --git a/storage/xtradb/include/page0zip.ic b/storage/xtradb/include/page0zip.ic new file mode 100644 index 00000000000..3db5f025c31 --- /dev/null +++ b/storage/xtradb/include/page0zip.ic @@ -0,0 +1,398 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "page0zip.h" +#include "page0page.h" + +/* The format of compressed pages is as follows. + +The header and trailer of the uncompressed pages, excluding the page +directory in the trailer, are copied as is to the header and trailer +of the compressed page. + +At the end of the compressed page, there is a dense page directory +pointing to every user record contained on the page, including deleted +records on the free list. The dense directory is indexed in the +collation order, i.e., in the order in which the record list is +linked on the uncompressed page. The infimum and supremum records are +excluded. The two most significant bits of the entries are allocated +for the delete-mark and an n_owned flag indicating the last record in +a chain of records pointed to from the sparse page directory on the +uncompressed page. + +The data between PAGE_ZIP_START and the last page directory entry will +be written in compressed format, starting at offset PAGE_DATA. +Infimum and supremum records are not stored. We exclude the +REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered +from the dense page directory stored at the end of the compressed +page. + +The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and +roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of +externally stored columns are stored separately, in ascending order of +heap_no and column index, starting backwards from the dense page +directory. + +The compressed data stream may be followed by a modification log +covering the compressed portion of the page, as follows. 
+ +MODIFICATION LOG ENTRY FORMAT +- write record: + - (heap_no - 1) << 1 (1..2 bytes) + - extra bytes backwards + - data bytes +- clear record: + - (heap_no - 1) << 1 | 1 (1..2 bytes) + +The integer values are stored in a variable-length format: +- 0xxxxxxx: 0..127 +- 1xxxxxxx xxxxxxxx: 0..32767 + +The end of the modification log is marked by a 0 byte. + +In summary, the compressed page looks like this: + +(1) Uncompressed page header (PAGE_DATA bytes) +(2) Compressed index information +(3) Compressed page data +(4) Page modification log (page_zip->m_start..page_zip->m_end) +(5) Empty zero-filled space +(6) BLOB pointers (on leaf pages) + - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column + - in descending collation order +(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes, + - indexed by heap_no + - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes + - REC_NODE_PTR_SIZE for non-leaf pages + - 0 otherwise +(8) dense page directory, stored backwards + - n_dense = n_heap - 2 + - existing records in ascending collation order + - deleted records (free list) in link order +*/ + +/* Start offset of the area that will be compressed */ +#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END +/* Size of a compressed page directory entry */ +#define PAGE_ZIP_DIR_SLOT_SIZE 2 +/* Mask of record offsets */ +#define PAGE_ZIP_DIR_SLOT_MASK 0x3fff +/* 'owned' flag */ +#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000 +/* 'deleted' flag */ +#define PAGE_ZIP_DIR_SLOT_DEL 0x8000 + +/************************************************************************** +Determine the size of a compressed page in bytes. */ +UNIV_INLINE +ulint +page_zip_get_size( +/*==============*/ + /* out: size in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ +{ + ulint size; + + if (UNIV_UNLIKELY(!page_zip->ssize)) { + return(0); + } + + size = (PAGE_ZIP_MIN_SIZE >> 1) << page_zip->ssize; + + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + ut_ad(size <= UNIV_PAGE_SIZE); + + return(size); +} +/************************************************************************** +Set the size of a compressed page in bytes. */ +UNIV_INLINE +void +page_zip_set_size( +/*==============*/ + page_zip_des_t* page_zip, /* in/out: compressed page */ + ulint size) /* in: size in bytes */ +{ + if (size) { + int ssize; + + ut_ad(ut_is_2pow(size)); + + for (ssize = 1; size > (ulint) (512 << ssize); ssize++) { + } + + page_zip->ssize = ssize; + } else { + page_zip->ssize = 0; + } + + ut_ad(page_zip_get_size(page_zip) == size); +} + +/************************************************************************** +Determine if a record is so big that it needs to be stored externally. */ +UNIV_INLINE +ibool +page_zip_rec_needs_ext( +/*===================*/ + /* out: FALSE if the entire record + can be stored locally on the page */ + ulint rec_size, /* in: length of the record in bytes */ + ulint comp, /* in: nonzero=compact format */ + ulint n_fields, /* in: number of fields in the record; + ignored if zip_size == 0 */ + ulint zip_size) /* in: compressed page size in bytes, or 0 */ +{ + ut_ad(rec_size > (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(comp || !zip_size); + +#if UNIV_PAGE_SIZE > REC_MAX_DATA_SIZE + if (UNIV_UNLIKELY(rec_size >= REC_MAX_DATA_SIZE)) { + return(TRUE); + } +#endif + + if (UNIV_UNLIKELY(zip_size)) { + ut_ad(comp); + /* On a compressed page, there is a two-byte entry in + the dense page directory for every record. But there + is no record header.
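As an aside, the 1..2 byte integers of the modification log format documented at the top of this file can be handled as in the sketch below (the helper names are invented; the real encoding lives in page0zip.c). Note that a first byte of 0 can never belong to an entry, because user records have heap_no >= 2 and so (heap_no - 1) << 1 is at least 2; that is why a plain 0 byte can terminate the log:

    /* Write v (0..32767) in the variable-length format above. */
    static byte*
    example_put_varint(byte* p, ulint v)
    {
        if (v < 128) {
            *p++ = (byte) v;                 /* 0xxxxxxx */
        } else {
            *p++ = (byte) (0x80 | (v >> 8)); /* 1xxxxxxx */
            *p++ = (byte) v;                 /* low 8 bits */
        }
        return(p);
    }

    /* Read a value back; returns the advanced pointer. */
    static const byte*
    example_get_varint(const byte* p, ulint* v)
    {
        if (*p & 0x80) {
            *v = (((ulint) p[0] & 0x7f) << 8) | p[1];
            return(p + 2);
        }
        *v = p[0];
        return(p + 1);
    }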
There should be enough room for + one record on an empty leaf page. Subtract 1 byte for + the encoded heap number. Check also the available space + on the uncompressed page. */ + return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2) + >= (page_zip_empty_size(n_fields, zip_size) - 1) + || rec_size >= page_get_free_space_of_empty(TRUE) / 2); + } + + return(rec_size >= page_get_free_space_of_empty(comp) / 2); +} + +#ifdef UNIV_DEBUG +/************************************************************************** +Validate a compressed page descriptor. */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + /* out: TRUE if ok */ + const page_zip_des_t* page_zip)/* in: compressed page descriptor */ +{ + ut_ad(page_zip); + ut_ad(page_zip->data); + ut_ad(page_zip->ssize < PAGE_ZIP_NUM_SSIZE); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE); + ut_ad(page_zip->m_start <= page_zip->m_end); + ut_ad(page_zip->m_end < page_zip_get_size(page_zip)); + ut_ad(page_zip->n_blobs + < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************** +Determine the length of the page trailer. */ +UNIV_INLINE +ulint +page_zip_get_trailer_len( +/*=====================*/ + /* out: length of the page trailer, + in bytes, not including the terminating + zero byte of the modification log */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ibool is_clust,/* in: TRUE if clustered index */ + ulint* entry_size)/* out: size of the uncompressed + portion of a user record */ +{ + ulint uncompressed_size; + + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + if (UNIV_UNLIKELY(!page_is_leaf(page_zip->data))) { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + REC_NODE_PTR_SIZE; + ut_ad(!page_zip->n_blobs); + } else if (UNIV_UNLIKELY(is_clust)) { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE; + ut_ad(!page_zip->n_blobs); + } + + if (entry_size) { + *entry_size = uncompressed_size; + } + + return((page_dir_get_n_heap(page_zip->data) - 2) + * uncompressed_size + + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE); +} + +/************************************************************************** +Determine how big a record can be inserted without recompressing the page. */ +UNIV_INLINE +lint +page_zip_max_ins_size( +/*==================*/ + /* out: a positive number + indicating the maximum size of + a record whose insertion is + guaranteed to succeed, or + zero or negative */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ibool is_clust)/* in: TRUE if clustered index */ +{ + ulint uncompressed_size; + ulint trailer_len; + + trailer_len = page_zip_get_trailer_len(page_zip, is_clust, + &uncompressed_size); + + /* When a record is created, a pointer may be added to + the dense directory. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. */ + + trailer_len += uncompressed_size; + + return((lint) page_zip_get_size(page_zip) + - trailer_len - page_zip->m_end + - (REC_N_NEW_EXTRA_BYTES - 2)); +} + +/************************************************************************** +Determine if enough space is available in the modification log.
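page_zip_max_ins_size() above gives a precomputed bound that a caller can cache, while page_zip_available(), implemented next, answers the question for one concrete record. A sketch of the caller's decision; the surrounding code is invented for illustration, rec_size would come from rec_get_converted_size(), and dict_index_is_clust() is from dict0dict.h:

    if (page_zip_available(page_zip, dict_index_is_clust(index),
                           rec_size, 1 /* create: insert, not update */)) {
        /* Safe: insert on the uncompressed page; the subsequent
        page_zip_write_rec() is guaranteed to succeed. */
    } else {
        /* Out of space in the modification log or trailer:
        recompress via page_zip_reorganize(), or split the
        page if even that fails. */
    }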
*/ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + /* out: TRUE if enough space + is available */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ibool is_clust,/* in: TRUE if clustered index */ + ulint length, /* in: combined size of the record */ + ulint create) /* in: nonzero=add the record to + the heap */ +{ + ulint uncompressed_size; + ulint trailer_len; + + ut_ad(length > REC_N_NEW_EXTRA_BYTES); + + trailer_len = page_zip_get_trailer_len(page_zip, is_clust, + &uncompressed_size); + + /* Subtract the fixed extra bytes and add the maximum + space needed for identifying the record (encoded heap_no). */ + length -= REC_N_NEW_EXTRA_BYTES - 2; + + if (UNIV_UNLIKELY(create)) { + /* When a record is created, a pointer may be added to + the dense directory. + Likewise, space for the columns that will not be + compressed will be allocated from the page trailer. + Also the BLOB pointers will be allocated from there, but + we may as well count them in the length of the record. */ + + trailer_len += uncompressed_size; + } + + return(UNIV_LIKELY(length + + trailer_len + + page_zip->m_end + < page_zip_get_size(page_zip))); +} + +/************************************************************************** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip) /* in/out: compressed page + descriptor */ +{ + memset(page_zip, 0, sizeof *page_zip); +} + +/************************************************************************** +Write a log record of writing to the uncompressed header portion of a page. */ +UNIV_INTERN +void +page_zip_write_header_log( +/*======================*/ + const byte* data,/* in: data on the uncompressed page */ + ulint length, /* in: length of the data */ + mtr_t* mtr); /* in: mini-transaction */ + +/************************************************************************** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. +However, the data portion of the uncompressed page may differ from +the compressed page when a record is being inserted in +page_cur_insert_rec_zip(). */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length, /* in: length of the data */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + ulint pos; + + ut_ad(buf_frame_get_page_zip(str) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + pos = page_offset(str); + + ut_ad(pos < PAGE_DATA); + + memcpy(page_zip->data + pos, str, length); + + /* The following would fail in page_cur_insert_rec_zip(). */ + /* ut_ad(page_zip_validate(page_zip, str - pos)); */ + + if (UNIV_LIKELY_NULL(mtr)) { + page_zip_write_header_log(str, length, mtr); + } +} + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/storage/xtradb/include/pars0grm.h b/storage/xtradb/include/pars0grm.h new file mode 100644 index 00000000000..3de233eed3a --- /dev/null +++ b/storage/xtradb/include/pars0grm.h @@ -0,0 +1,236 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software +Foundation, Inc. 
+ +As a special exception, when this file is copied by Bison into a +Bison output file, you may use that output file without restriction. +This special exception was added by the Free Software Foundation +in version 1.24 of Bison. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/* A Bison parser, made by GNU Bison 1.875d. */ + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + PARS_INT_LIT = 258, + PARS_FLOAT_LIT = 259, + PARS_STR_LIT = 260, + PARS_FIXBINARY_LIT = 261, + PARS_BLOB_LIT = 262, + PARS_NULL_LIT = 263, + PARS_ID_TOKEN = 264, + PARS_AND_TOKEN = 265, + PARS_OR_TOKEN = 266, + PARS_NOT_TOKEN = 267, + PARS_GE_TOKEN = 268, + PARS_LE_TOKEN = 269, + PARS_NE_TOKEN = 270, + PARS_PROCEDURE_TOKEN = 271, + PARS_IN_TOKEN = 272, + PARS_OUT_TOKEN = 273, + PARS_BINARY_TOKEN = 274, + PARS_BLOB_TOKEN = 275, + PARS_INT_TOKEN = 276, + PARS_INTEGER_TOKEN = 277, + PARS_FLOAT_TOKEN = 278, + PARS_CHAR_TOKEN = 279, + PARS_IS_TOKEN = 280, + PARS_BEGIN_TOKEN = 281, + PARS_END_TOKEN = 282, + PARS_IF_TOKEN = 283, + PARS_THEN_TOKEN = 284, + PARS_ELSE_TOKEN = 285, + PARS_ELSIF_TOKEN = 286, + PARS_LOOP_TOKEN = 287, + PARS_WHILE_TOKEN = 288, + PARS_RETURN_TOKEN = 289, + PARS_SELECT_TOKEN = 290, + PARS_SUM_TOKEN = 291, + PARS_COUNT_TOKEN = 292, + PARS_DISTINCT_TOKEN = 293, + PARS_FROM_TOKEN = 294, + PARS_WHERE_TOKEN = 295, + PARS_FOR_TOKEN = 296, + PARS_DDOT_TOKEN = 297, + PARS_READ_TOKEN = 298, + PARS_ORDER_TOKEN = 299, + PARS_BY_TOKEN = 300, + PARS_ASC_TOKEN = 301, + PARS_DESC_TOKEN = 302, + PARS_INSERT_TOKEN = 303, + PARS_INTO_TOKEN = 304, + PARS_VALUES_TOKEN = 305, + PARS_UPDATE_TOKEN = 306, + PARS_SET_TOKEN = 307, + PARS_DELETE_TOKEN = 308, + PARS_CURRENT_TOKEN = 309, + PARS_OF_TOKEN = 310, + PARS_CREATE_TOKEN = 311, + PARS_TABLE_TOKEN = 312, + PARS_INDEX_TOKEN = 313, + PARS_UNIQUE_TOKEN = 314, + PARS_CLUSTERED_TOKEN = 315, + PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316, + PARS_ON_TOKEN = 317, + PARS_ASSIGN_TOKEN = 318, + PARS_DECLARE_TOKEN = 319, + PARS_CURSOR_TOKEN = 320, + PARS_SQL_TOKEN = 321, + PARS_OPEN_TOKEN = 322, + PARS_FETCH_TOKEN = 323, + PARS_CLOSE_TOKEN = 324, + PARS_NOTFOUND_TOKEN = 325, + PARS_TO_CHAR_TOKEN = 326, + PARS_TO_NUMBER_TOKEN = 327, + PARS_TO_BINARY_TOKEN = 328, + PARS_BINARY_TO_NUMBER_TOKEN = 329, + PARS_SUBSTR_TOKEN = 330, + PARS_REPLSTR_TOKEN = 331, + PARS_CONCAT_TOKEN = 332, + PARS_INSTR_TOKEN = 333, + PARS_LENGTH_TOKEN = 334, + PARS_SYSDATE_TOKEN = 335, + PARS_PRINTF_TOKEN = 336, + PARS_ASSERT_TOKEN = 337, + PARS_RND_TOKEN = 338, + PARS_RND_STR_TOKEN = 339, + PARS_ROW_PRINTF_TOKEN = 340, + PARS_COMMIT_TOKEN = 341, + PARS_ROLLBACK_TOKEN = 342, + PARS_WORK_TOKEN = 343, + PARS_UNSIGNED_TOKEN = 344, + PARS_EXIT_TOKEN = 345, + PARS_FUNCTION_TOKEN = 346, + PARS_LOCK_TOKEN = 347, + PARS_SHARE_TOKEN = 
348, + PARS_MODE_TOKEN = 349, + NEG = 350 + }; +#endif +#define PARS_INT_LIT 258 +#define PARS_FLOAT_LIT 259 +#define PARS_STR_LIT 260 +#define PARS_FIXBINARY_LIT 261 +#define PARS_BLOB_LIT 262 +#define PARS_NULL_LIT 263 +#define PARS_ID_TOKEN 264 +#define PARS_AND_TOKEN 265 +#define PARS_OR_TOKEN 266 +#define PARS_NOT_TOKEN 267 +#define PARS_GE_TOKEN 268 +#define PARS_LE_TOKEN 269 +#define PARS_NE_TOKEN 270 +#define PARS_PROCEDURE_TOKEN 271 +#define PARS_IN_TOKEN 272 +#define PARS_OUT_TOKEN 273 +#define PARS_BINARY_TOKEN 274 +#define PARS_BLOB_TOKEN 275 +#define PARS_INT_TOKEN 276 +#define PARS_INTEGER_TOKEN 277 +#define PARS_FLOAT_TOKEN 278 +#define PARS_CHAR_TOKEN 279 +#define PARS_IS_TOKEN 280 +#define PARS_BEGIN_TOKEN 281 +#define PARS_END_TOKEN 282 +#define PARS_IF_TOKEN 283 +#define PARS_THEN_TOKEN 284 +#define PARS_ELSE_TOKEN 285 +#define PARS_ELSIF_TOKEN 286 +#define PARS_LOOP_TOKEN 287 +#define PARS_WHILE_TOKEN 288 +#define PARS_RETURN_TOKEN 289 +#define PARS_SELECT_TOKEN 290 +#define PARS_SUM_TOKEN 291 +#define PARS_COUNT_TOKEN 292 +#define PARS_DISTINCT_TOKEN 293 +#define PARS_FROM_TOKEN 294 +#define PARS_WHERE_TOKEN 295 +#define PARS_FOR_TOKEN 296 +#define PARS_DDOT_TOKEN 297 +#define PARS_READ_TOKEN 298 +#define PARS_ORDER_TOKEN 299 +#define PARS_BY_TOKEN 300 +#define PARS_ASC_TOKEN 301 +#define PARS_DESC_TOKEN 302 +#define PARS_INSERT_TOKEN 303 +#define PARS_INTO_TOKEN 304 +#define PARS_VALUES_TOKEN 305 +#define PARS_UPDATE_TOKEN 306 +#define PARS_SET_TOKEN 307 +#define PARS_DELETE_TOKEN 308 +#define PARS_CURRENT_TOKEN 309 +#define PARS_OF_TOKEN 310 +#define PARS_CREATE_TOKEN 311 +#define PARS_TABLE_TOKEN 312 +#define PARS_INDEX_TOKEN 313 +#define PARS_UNIQUE_TOKEN 314 +#define PARS_CLUSTERED_TOKEN 315 +#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316 +#define PARS_ON_TOKEN 317 +#define PARS_ASSIGN_TOKEN 318 +#define PARS_DECLARE_TOKEN 319 +#define PARS_CURSOR_TOKEN 320 +#define PARS_SQL_TOKEN 321 +#define PARS_OPEN_TOKEN 322 +#define PARS_FETCH_TOKEN 323 +#define PARS_CLOSE_TOKEN 324 +#define PARS_NOTFOUND_TOKEN 325 +#define PARS_TO_CHAR_TOKEN 326 +#define PARS_TO_NUMBER_TOKEN 327 +#define PARS_TO_BINARY_TOKEN 328 +#define PARS_BINARY_TO_NUMBER_TOKEN 329 +#define PARS_SUBSTR_TOKEN 330 +#define PARS_REPLSTR_TOKEN 331 +#define PARS_CONCAT_TOKEN 332 +#define PARS_INSTR_TOKEN 333 +#define PARS_LENGTH_TOKEN 334 +#define PARS_SYSDATE_TOKEN 335 +#define PARS_PRINTF_TOKEN 336 +#define PARS_ASSERT_TOKEN 337 +#define PARS_RND_TOKEN 338 +#define PARS_RND_STR_TOKEN 339 +#define PARS_ROW_PRINTF_TOKEN 340 +#define PARS_COMMIT_TOKEN 341 +#define PARS_ROLLBACK_TOKEN 342 +#define PARS_WORK_TOKEN 343 +#define PARS_UNSIGNED_TOKEN 344 +#define PARS_EXIT_TOKEN 345 +#define PARS_FUNCTION_TOKEN 346 +#define PARS_LOCK_TOKEN 347 +#define PARS_SHARE_TOKEN 348 +#define PARS_MODE_TOKEN 349 +#define NEG 350 + + + + +#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED) +typedef int YYSTYPE; +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +# define YYSTYPE_IS_TRIVIAL 1 +#endif + +extern YYSTYPE yylval; + + + diff --git a/storage/xtradb/include/pars0opt.h b/storage/xtradb/include/pars0opt.h new file mode 100644 index 00000000000..02524e9d893 --- /dev/null +++ b/storage/xtradb/include/pars0opt.h @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0opt_h +#define pars0opt_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "pars0sym.h" +#include "dict0types.h" +#include "row0sel.h" + +/*********************************************************************** +Optimizes a select. Decides which indexes of the tables to use. The tables +are accessed in the order that they were written to the FROM part in the +select statement. */ +UNIV_INTERN +void +opt_search_plan( +/*============*/ + sel_node_t* sel_node); /* in: parsed select node */ +/*********************************************************************** +Looks for occurrences of the columns of the table in the query subgraph and +adds them to the list of columns if an occurrence of the same column does not +already exist in the list. If the column is already in the list, puts a value +indirection to point to the occurrence in the column list, except if the +column occurrence we are looking at is in the column list, in which case +nothing is done. */ +UNIV_INTERN +void +opt_find_all_cols( +/*==============*/ + ibool copy_val, /* in: if TRUE, new found columns are + added as columns to copy */ + dict_index_t* index, /* in: index to use */ + sym_node_list_t* col_list, /* in: base node of a list where + to add new found columns */ + plan_t* plan, /* in: plan or NULL */ + que_node_t* exp); /* in: expression or condition */ +/************************************************************************ +Prints info of a query plan. */ +UNIV_INTERN +void +opt_print_query_plan( +/*=================*/ + sel_node_t* sel_node); /* in: select node */ + +#ifndef UNIV_NONINL +#include "pars0opt.ic" +#endif + +#endif diff --git a/storage/xtradb/include/pars0opt.ic b/storage/xtradb/include/pars0opt.ic new file mode 100644 index 00000000000..35653453b30 --- /dev/null +++ b/storage/xtradb/include/pars0opt.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/pars0pars.h b/storage/xtradb/include/pars0pars.h new file mode 100644 index 00000000000..e5693ee5575 --- /dev/null +++ b/storage/xtradb/include/pars0pars.h @@ -0,0 +1,747 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ + +#ifndef pars0pars_h +#define pars0pars_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "pars0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" + +/* Type of the user functions. The first argument is always InnoDB-supplied +and varies in type, while 'user_arg' is a user-supplied argument. The +meaning of the return type also varies. See the individual use cases, e.g. +the FETCH statement, for details on them. 
*/ +typedef void* (*pars_user_func_cb_t)(void* arg, void* user_arg); + +extern int yydebug; + +/* If the following is set TRUE, the lexer will print the SQL string +as it tokenizes it */ + +#ifdef UNIV_SQL_DEBUG +extern ibool pars_print_lexed; +#endif /* UNIV_SQL_DEBUG */ + +/* Global variable used while parsing a single procedure or query : the code is +NOT re-entrant */ +extern sym_tab_t* pars_sym_tab_global; + +extern pars_res_word_t pars_to_char_token; +extern pars_res_word_t pars_to_number_token; +extern pars_res_word_t pars_to_binary_token; +extern pars_res_word_t pars_binary_to_number_token; +extern pars_res_word_t pars_substr_token; +extern pars_res_word_t pars_replstr_token; +extern pars_res_word_t pars_concat_token; +extern pars_res_word_t pars_length_token; +extern pars_res_word_t pars_instr_token; +extern pars_res_word_t pars_sysdate_token; +extern pars_res_word_t pars_printf_token; +extern pars_res_word_t pars_assert_token; +extern pars_res_word_t pars_rnd_token; +extern pars_res_word_t pars_rnd_str_token; +extern pars_res_word_t pars_count_token; +extern pars_res_word_t pars_sum_token; +extern pars_res_word_t pars_distinct_token; +extern pars_res_word_t pars_binary_token; +extern pars_res_word_t pars_blob_token; +extern pars_res_word_t pars_int_token; +extern pars_res_word_t pars_char_token; +extern pars_res_word_t pars_float_token; +extern pars_res_word_t pars_update_token; +extern pars_res_word_t pars_asc_token; +extern pars_res_word_t pars_desc_token; +extern pars_res_word_t pars_open_token; +extern pars_res_word_t pars_close_token; +extern pars_res_word_t pars_share_token; +extern pars_res_word_t pars_unique_token; +extern pars_res_word_t pars_clustered_token; + +extern ulint pars_star_denoter; + +/* Procedure parameter types */ +#define PARS_INPUT 0 +#define PARS_OUTPUT 1 +#define PARS_NOT_PARAM 2 + +int +yyparse(void); + +/***************************************************************** +Parses an SQL string returning the query graph. */ +UNIV_INTERN +que_t* +pars_sql( +/*=====*/ + /* out, own: the query graph */ + pars_info_t* info, /* in: extra information, or NULL */ + const char* str); /* in: SQL string */ +/***************************************************************** +Retrieves characters to the lexical analyzer. */ +UNIV_INTERN +void +pars_get_lex_chars( +/*===============*/ + char* buf, /* in/out: buffer where to copy */ + int* result, /* out: number of characters copied or EOF */ + int max_size); /* in: maximum number of characters which fit + in the buffer */ +/***************************************************************** +Called by yyparse on error. */ +UNIV_INTERN +void +yyerror( +/*====*/ + const char* s); /* in: error message string */ +/************************************************************************* +Parses a variable declaration. */ +UNIV_INTERN +sym_node_t* +pars_variable_declaration( +/*======================*/ + /* out, own: symbol table node of type + SYM_VAR */ + sym_node_t* node, /* in: symbol table node allocated for the + id of the variable */ + pars_res_word_t* type); /* in: pointer to a type token */ +/************************************************************************* +Parses a function expression. 
*/ +UNIV_INTERN +func_node_t* +pars_func( +/*======*/ + /* out, own: function node in a query tree */ + que_node_t* res_word,/* in: function name reserved word */ + que_node_t* arg); /* in: first argument in the argument list */ +/************************************************************************* +Parses an operator expression. */ +UNIV_INTERN +func_node_t* +pars_op( +/*====*/ + /* out, own: function node in a query tree */ + int func, /* in: operator token code */ + que_node_t* arg1, /* in: first argument */ + que_node_t* arg2); /* in: second argument or NULL for a unary + operator */ +/************************************************************************* +Parses an ORDER BY clause. Order by a single column only is supported. */ +UNIV_INTERN +order_node_t* +pars_order_by( +/*==========*/ + /* out, own: order-by node in a query tree */ + sym_node_t* column, /* in: column name */ + pars_res_word_t* asc); /* in: &pars_asc_token or &pars_desc_token */ +/************************************************************************* +Parses a select list; creates a query graph node for the whole SELECT +statement. */ +UNIV_INTERN +sel_node_t* +pars_select_list( +/*=============*/ + /* out, own: select node in a query + tree */ + que_node_t* select_list, /* in: select list */ + sym_node_t* into_list); /* in: variables list or NULL */ +/************************************************************************* +Parses a cursor declaration. */ +UNIV_INTERN +que_node_t* +pars_cursor_declaration( +/*====================*/ + /* out: sym_node */ + sym_node_t* sym_node, /* in: cursor id node in the symbol + table */ + sel_node_t* select_node); /* in: select node */ +/************************************************************************* +Parses a function declaration. */ +UNIV_INTERN +que_node_t* +pars_function_declaration( +/*======================*/ + /* out: sym_node */ + sym_node_t* sym_node); /* in: function id node in the symbol + table */ +/************************************************************************* +Parses a select statement. */ +UNIV_INTERN +sel_node_t* +pars_select_statement( +/*==================*/ + /* out, own: select node in a query + tree */ + sel_node_t* select_node, /* in: select node already containing + the select list */ + sym_node_t* table_list, /* in: table list */ + que_node_t* search_cond, /* in: search condition or NULL */ + pars_res_word_t* for_update, /* in: NULL or &pars_update_token */ + pars_res_word_t* consistent_read,/* in: NULL or + &pars_consistent_token */ + order_node_t* order_by); /* in: NULL or an order-by node */ +/************************************************************************* +Parses a column assignment in an update. */ +UNIV_INTERN +col_assign_node_t* +pars_column_assignment( +/*===================*/ + /* out: column assignment node */ + sym_node_t* column, /* in: column to assign */ + que_node_t* exp); /* in: value to assign */ +/************************************************************************* +Parses a delete or update statement start. */ +UNIV_INTERN +upd_node_t* +pars_update_statement_start( +/*========================*/ + /* out, own: update node in a query + tree */ + ibool is_delete, /* in: TRUE if delete */ + sym_node_t* table_sym, /* in: table name node */ + col_assign_node_t* col_assign_list);/* in: column assignment list, NULL + if delete */ +/************************************************************************* +Parses an update or delete statement.
*/ +UNIV_INTERN +upd_node_t* +pars_update_statement( +/*==================*/ + /* out, own: update node in a query + tree */ + upd_node_t* node, /* in: update node */ + sym_node_t* cursor_sym, /* in: pointer to a cursor entry in + the symbol table or NULL */ + que_node_t* search_cond); /* in: search condition or NULL */ +/************************************************************************* +Parses an insert statement. */ +UNIV_INTERN +ins_node_t* +pars_insert_statement( +/*==================*/ + /* out, own: update node in a query + tree */ + sym_node_t* table_sym, /* in: table name node */ + que_node_t* values_list, /* in: value expression list or NULL */ + sel_node_t* select); /* in: select condition or NULL */ +/************************************************************************* +Parses a procedure parameter declaration. */ +UNIV_INTERN +sym_node_t* +pars_parameter_declaration( +/*=======================*/ + /* out, own: symbol table node of type + SYM_VAR */ + sym_node_t* node, /* in: symbol table node allocated for the + id of the parameter */ + ulint param_type, + /* in: PARS_INPUT or PARS_OUTPUT */ + pars_res_word_t* type); /* in: pointer to a type token */ +/************************************************************************* +Parses an elsif element. */ +UNIV_INTERN +elsif_node_t* +pars_elsif_element( +/*===============*/ + /* out: elsif node */ + que_node_t* cond, /* in: if-condition */ + que_node_t* stat_list); /* in: statement list */ +/************************************************************************* +Parses an if-statement. */ +UNIV_INTERN +if_node_t* +pars_if_statement( +/*==============*/ + /* out: if-statement node */ + que_node_t* cond, /* in: if-condition */ + que_node_t* stat_list, /* in: statement list */ + que_node_t* else_part); /* in: else-part statement list */ +/************************************************************************* +Parses a for-loop-statement. */ +UNIV_INTERN +for_node_t* +pars_for_statement( +/*===============*/ + /* out: for-statement node */ + sym_node_t* loop_var, /* in: loop variable */ + que_node_t* loop_start_limit,/* in: loop start expression */ + que_node_t* loop_end_limit, /* in: loop end expression */ + que_node_t* stat_list); /* in: statement list */ +/************************************************************************* +Parses a while-statement. */ +UNIV_INTERN +while_node_t* +pars_while_statement( +/*=================*/ + /* out: while-statement node */ + que_node_t* cond, /* in: while-condition */ + que_node_t* stat_list); /* in: statement list */ +/************************************************************************* +Parses an exit statement. */ +UNIV_INTERN +exit_node_t* +pars_exit_statement(void); +/*=====================*/ + /* out: exit statement node */ +/************************************************************************* +Parses a return-statement. */ +UNIV_INTERN +return_node_t* +pars_return_statement(void); +/*=======================*/ + /* out: return-statement node */ +/************************************************************************* +Parses a procedure call. */ +UNIV_INTERN +func_node_t* +pars_procedure_call( +/*================*/ + /* out: function node */ + que_node_t* res_word,/* in: procedure name reserved word */ + que_node_t* args); /* in: argument list */ +/************************************************************************* +Parses an assignment statement. 
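For orientation, this is the shape of the procedure language that the statement constructors above (and pars_assignment_statement(), next) build nodes for. An invented snippet, shown as it would be embedded in C; the control-flow keywords correspond to the PARS_*_TOKEN codes in pars0grm.h:

    static const char example_proc[] =
        "PROCEDURE EXAMPLE () IS\n"
        "        i INT;\n"                  /* variable declaration */
        "BEGIN\n"
        "        i := 0;\n"                 /* assignment */
        "        WHILE i < 5 LOOP\n"        /* while-statement */
        "                i := i + 1;\n"
        "        END LOOP;\n"
        "        IF i = 5 THEN\n"           /* if-statement */
        "                COMMIT WORK;\n"    /* commit statement */
        "        END IF;\n"
        "END;\n";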
*/ +UNIV_INTERN +assign_node_t* +pars_assignment_statement( +/*======================*/ + /* out: assignment statement node */ + sym_node_t* var, /* in: variable to assign */ + que_node_t* val); /* in: value to assign */ +/************************************************************************* +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. */ +UNIV_INTERN +fetch_node_t* +pars_fetch_statement( +/*=================*/ + /* out: fetch statement node */ + sym_node_t* cursor, /* in: cursor node */ + sym_node_t* into_list, /* in: variables to set, or NULL */ + sym_node_t* user_func); /* in: user function name, or NULL */ +/************************************************************************* +Parses an open or close cursor statement. */ +UNIV_INTERN +open_node_t* +pars_open_statement( +/*================*/ + /* out: open statement node */ + ulint type, /* in: ROW_SEL_OPEN_CURSOR + or ROW_SEL_CLOSE_CURSOR */ + sym_node_t* cursor); /* in: cursor node */ +/************************************************************************* +Parses a row_printf-statement. */ +UNIV_INTERN +row_printf_node_t* +pars_row_printf_statement( +/*======================*/ + /* out: row_printf-statement node */ + sel_node_t* sel_node); /* in: select node */ +/************************************************************************* +Parses a commit statement. */ +UNIV_INTERN +commit_node_t* +pars_commit_statement(void); +/*=======================*/ +/************************************************************************* +Parses a rollback statement. */ +UNIV_INTERN +roll_node_t* +pars_rollback_statement(void); +/*=========================*/ +/************************************************************************* +Parses a column definition at a table creation. */ +UNIV_INTERN +sym_node_t* +pars_column_def( +/*============*/ + /* out: column sym table + node */ + sym_node_t* sym_node, /* in: column node in the + symbol table */ + pars_res_word_t* type, /* in: data type */ + sym_node_t* len, /* in: length of column, or + NULL */ + void* is_unsigned, /* in: if not NULL, column + is of type UNSIGNED. */ + void* is_not_null); /* in: if not NULL, column + is of type NOT NULL. */ +/************************************************************************* +Parses a table creation operation. */ +UNIV_INTERN +tab_node_t* +pars_create_table( +/*==============*/ + /* out: table create subgraph */ + sym_node_t* table_sym, /* in: table name node in the symbol + table */ + sym_node_t* column_defs, /* in: list of column names */ + void* not_fit_in_memory);/* in: a non-NULL pointer means that + this is a table which in simulations + should be simulated as not fitting + in memory; thread is put to sleep + to simulate disk accesses; NOTE that + this flag is not stored to the data + dictionary on disk, and the database + will forget about non-NULL value if + it has to reload the table definition + from disk */ +/************************************************************************* +Parses an index creation operation.
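A fetch statement can deliver rows to a user function instead of into variables (see pars_fetch_statement() above and pars_info_add_function() further below). A sketch of the callback side, matching the pars_user_func_cb_t typedef; the names are invented, and the exact meaning of the first argument and of the return value is defined by the FETCH implementation in row0sel.c, not by this header:

    static void*
    example_fetch_cb(void* row, void* user_arg)
    {
        ulint* n_rows = (ulint*) user_arg;

        (*n_rows)++;        /* 'row' is supplied by FETCH */

        return((void*) 1);  /* non-NULL: keep fetching */
    }

    /* Registration, so that "FETCH c INTO my_func();" in the SQL
    string can resolve the name "my_func": */
    pars_info_add_function(info, "my_func", example_fetch_cb, &n_rows);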
*/ +UNIV_INTERN +ind_node_t* +pars_create_index( +/*==============*/ + /* out: index create subgraph */ + pars_res_word_t* unique_def, /* in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /* in: not NULL if a clustered index */ + sym_node_t* index_sym, /* in: index name node in the symbol + table */ + sym_node_t* table_sym, /* in: table name node in the symbol + table */ + sym_node_t* column_list); /* in: list of column names */ +/************************************************************************* +Parses a procedure definition. */ +UNIV_INTERN +que_fork_t* +pars_procedure_definition( +/*======================*/ + /* out: query fork node */ + sym_node_t* sym_node, /* in: procedure id node in the symbol + table */ + sym_node_t* param_list, /* in: parameter declaration list */ + que_node_t* stat_list); /* in: statement list */ + +/***************************************************************** +Parses a stored procedure call, when this is not within another stored +procedure, that is, the client issues a procedure call directly. +In MySQL/InnoDB, stored InnoDB procedures are invoked via the +parsed procedure tree, not via InnoDB SQL, so this function is not used. */ +UNIV_INTERN +que_fork_t* +pars_stored_procedure_call( +/*=======================*/ + /* out: query graph */ + sym_node_t* sym_node); /* in: stored procedure name */ +/********************************************************************** +Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. The fork created is of +type QUE_FORK_MYSQL_INTERFACE. */ +UNIV_INTERN +que_thr_t* +pars_complete_graph_for_exec( +/*=========================*/ + /* out: query thread node to run */ + que_node_t* node, /* in: root node for an incomplete + query graph */ + trx_t* trx, /* in: transaction handle */ + mem_heap_t* heap); /* in: memory heap from which allocated */ + +/******************************************************************** +Create parser info struct.*/ +UNIV_INTERN +pars_info_t* +pars_info_create(void); +/*==================*/ + /* out, own: info struct */ + +/******************************************************************** +Free info struct and everything it contains.*/ +UNIV_INTERN +void +pars_info_free( +/*===========*/ + pars_info_t* info); /* in: info struct */ + +/******************************************************************** +Add bound literal. */ +UNIV_INTERN +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /* in: precise type, e.g. + DATA_UNSIGNED */ + +/******************************************************************** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). */ +UNIV_INTERN +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const char* str); /* in: string */ + +/******************************************************************** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. 
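Putting the pieces together, the usual calling sequence for running a snippet of InnoDB SQL with bound literals is sketched below, modelled on que_eval_sql() in que0que.c. que_fork_start_command(), que_run_threads() and que_graph_free() are declared in que0que.h, not here; error handling, latching and the transaction setup are omitted:

    pars_info_t* info = pars_info_create();
    que_t*       graph;
    que_thr_t*   thr;

    pars_info_add_str_literal(info, "new_name", "test/t2");
    pars_info_add_str_literal(info, "old_name", "test/t1");

    graph = pars_sql(info,
                     "PROCEDURE RENAME_EXAMPLE () IS\n"
                     "BEGIN\n"
                     "UPDATE SYS_TABLES SET NAME = :new_name\n"
                     " WHERE NAME = :old_name;\n"
                     "END;\n");

    graph->trx = trx;            /* an active transaction */
    thr = que_fork_start_command(graph);
    que_run_threads(thr);
    que_graph_free(graph);       /* also frees 'info': see
                                 graph_owns_us below */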
*/ +UNIV_INTERN +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + lint val); /* in: value */ + +/******************************************************************** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_BINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_add_dulint_literal( +/*=========================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + dulint val); /* in: value */ +/******************************************************************** +Add user function. */ +UNIV_INTERN +void +pars_info_add_function( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: function name */ + pars_user_func_cb_t func, /* in: function address */ + void* arg); /* in: user-supplied argument */ + +/******************************************************************** +Add bound id. */ +UNIV_INTERN +void +pars_info_add_id( +/*=============*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const char* id); /* in: id */ + +/******************************************************************** +Get user function with the given name.*/ +UNIV_INTERN +pars_user_func_t* +pars_info_get_user_func( +/*====================*/ + /* out: user func, or NULL if not + found */ + pars_info_t* info, /* in: info struct */ + const char* name); /* in: function name to find */ + +/******************************************************************** +Get bound literal with the given name.*/ +UNIV_INTERN +pars_bound_lit_t* +pars_info_get_bound_lit( +/*====================*/ + /* out: bound literal, or NULL if + not found */ + pars_info_t* info, /* in: info struct */ + const char* name); /* in: bound literal name to find */ + +/******************************************************************** +Get bound id with the given name.*/ +UNIV_INTERN +pars_bound_id_t* +pars_info_get_bound_id( +/*===================*/ + /* out: bound id, or NULL if not + found */ + pars_info_t* info, /* in: info struct */ + const char* name); /* in: bound id name to find */ + + +/* Extra information supplied for pars_sql(). */ +struct pars_info_struct { + mem_heap_t* heap; /* our own memory heap */ + + ib_vector_t* funcs; /* user functions, or NULL + (pars_user_func_t*) */ + ib_vector_t* bound_lits; /* bound literals, or NULL + (pars_bound_lit_t*) */ + ib_vector_t* bound_ids; /* bound ids, or NULL + (pars_bound_id_t*) */ + + ibool graph_owns_us; /* if TRUE (which is the default), + que_graph_free() will free us */ +}; + +/* User-supplied function and argument. */ +struct pars_user_func_struct { + const char* name; /* function name */ + pars_user_func_cb_t func; /* function address */ + void* arg; /* user-supplied argument */ +}; + +/* Bound literal. */ +struct pars_bound_lit_struct { + const char* name; /* name */ + const void* address; /* address */ + ulint length; /* length of data */ + ulint type; /* type, e.g. DATA_FIXBINARY */ + ulint prtype; /* precise type, e.g. DATA_UNSIGNED */ +}; + +/* Bound id.
*/ +struct pars_bound_id_struct { + const char* name; /* name */ + const char* id; /* id */ +}; + +/* Struct used to denote a reserved word in a parsing tree */ +struct pars_res_word_struct{ + int code; /* the token code for the reserved word from + pars0grm.h */ +}; + +/* A predefined function or operator node in a parsing tree; this construct +is also used for some non-functions like the assignment ':=' */ +struct func_node_struct{ + que_common_t common; /* type: QUE_NODE_FUNC */ + int func; /* token code of the function name */ + ulint class; /* class of the function */ + que_node_t* args; /* argument(s) of the function */ + UT_LIST_NODE_T(func_node_t) cond_list; + /* list of comparison conditions; defined + only for comparison operator nodes except, + presently, for OPT_SCROLL_TYPE ones */ + UT_LIST_NODE_T(func_node_t) func_node_list; + /* list of function nodes in a parsed + query graph */ +}; + +/* An order-by node in a select */ +struct order_node_struct{ + que_common_t common; /* type: QUE_NODE_ORDER */ + sym_node_t* column; /* order-by column */ + ibool asc; /* TRUE if ascending, FALSE if descending */ +}; + +/* Procedure definition node */ +struct proc_node_struct{ + que_common_t common; /* type: QUE_NODE_PROC */ + sym_node_t* proc_id; /* procedure name symbol in the symbol + table of this same procedure */ + sym_node_t* param_list; /* input and output parameters */ + que_node_t* stat_list; /* statement list */ + sym_tab_t* sym_tab; /* symbol table of this procedure */ +}; + +/* elsif-element node */ +struct elsif_node_struct{ + que_common_t common; /* type: QUE_NODE_ELSIF */ + que_node_t* cond; /* if condition */ + que_node_t* stat_list; /* statement list */ +}; + +/* if-statement node */ +struct if_node_struct{ + que_common_t common; /* type: QUE_NODE_IF */ + que_node_t* cond; /* if condition */ + que_node_t* stat_list; /* statement list */ + que_node_t* else_part; /* else-part statement list */ + elsif_node_t* elsif_list; /* elsif element list */ +}; + +/* while-statement node */ +struct while_node_struct{ + que_common_t common; /* type: QUE_NODE_WHILE */ + que_node_t* cond; /* while condition */ + que_node_t* stat_list; /* statement list */ +}; + +/* for-loop-statement node */ +struct for_node_struct{ + que_common_t common; /* type: QUE_NODE_FOR */ + sym_node_t* loop_var; /* loop variable: this is the + dereferenced symbol from the + variable declarations, not the + symbol occurrence in the for loop + definition */ + que_node_t* loop_start_limit;/* initial value of loop variable */ + que_node_t* loop_end_limit; /* end value of loop variable */ + int loop_end_value; /* evaluated value for the end value: + it is calculated only when the loop + is entered, and will not change within + the loop */ + que_node_t* stat_list; /* statement list */ +}; + +/* exit statement node */ +struct exit_node_struct{ + que_common_t common; /* type: QUE_NODE_EXIT */ +}; + +/* return-statement node */ +struct return_node_struct{ + que_common_t common; /* type: QUE_NODE_RETURN */ +}; + +/* Assignment statement node */ +struct assign_node_struct{ + que_common_t common; /* type: QUE_NODE_ASSIGNMENT */ + sym_node_t* var; /* variable to set */ + que_node_t* val; /* value to assign */ +}; + +/* Column assignment node */ +struct col_assign_node_struct{ + que_common_t common; /* type: QUE_NODE_COL_ASSIGN */ + sym_node_t* col; /* column to set */ + que_node_t* val; /* value to assign */ +}; + +/* Classes of functions */ +#define PARS_FUNC_ARITH 1 /* +, -, *, / */ +#define PARS_FUNC_LOGICAL 2 +#define 
PARS_FUNC_CMP 3 +#define PARS_FUNC_PREDEFINED 4 /* TO_NUMBER, SUBSTR, ... */ +#define PARS_FUNC_AGGREGATE 5 /* COUNT, DISTINCT, SUM */ +#define PARS_FUNC_OTHER 6 /* these are not real functions, + e.g., := */ + +#ifndef UNIV_NONINL +#include "pars0pars.ic" +#endif + +#endif diff --git a/storage/xtradb/include/pars0pars.ic b/storage/xtradb/include/pars0pars.ic new file mode 100644 index 00000000000..3a55ad86f48 --- /dev/null +++ b/storage/xtradb/include/pars0pars.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/pars0sym.h b/storage/xtradb/include/pars0sym.h new file mode 100644 index 00000000000..69227a2917e --- /dev/null +++ b/storage/xtradb/include/pars0sym.h @@ -0,0 +1,239 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0sym_h +#define pars0sym_h + +#include "univ.i" +#include "que0types.h" +#include "usr0types.h" +#include "dict0types.h" +#include "pars0types.h" +#include "row0types.h" + +/********************************************************************** +Creates a symbol table for a single stored procedure or query. */ +UNIV_INTERN +sym_tab_t* +sym_tab_create( +/*===========*/ + /* out, own: symbol table */ + mem_heap_t* heap); /* in: memory heap where to create */ +/********************************************************************** +Frees the memory allocated dynamically AFTER parsing phase for variables +etc. in the symbol table. Does not free the mem heap where the table was +originally created. Frees also SQL explicit cursor definitions. 
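The life cycle of a symbol table, as driven internally by pars_sql() and que_graph_free(); an illustrative sketch only, with mem_heap_create()/mem_heap_free() from mem0mem.h:

    mem_heap_t* heap    = mem_heap_create(256);
    sym_tab_t*  sym_tab = sym_tab_create(heap);

    /* ... yyparse() fills the table through the sym_tab_add_*
    functions declared below ... */

    sym_tab_free_private(sym_tab);  /* dynamic values, cursor defs */
    mem_heap_free(heap);            /* the table itself */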
*/ +UNIV_INTERN +void +sym_tab_free_private( +/*=================*/ + sym_tab_t* sym_tab); /* in, own: symbol table */ +/********************************************************************** +Adds an integer literal to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_int_lit( +/*================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + ulint val); /* in: integer value */ +/********************************************************************** +Adds a string literal to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_str_lit( +/*================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + byte* str, /* in: string with no quotes around + it */ + ulint len); /* in: string length */ +/********************************************************************** +Adds a bound literal to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_lit( +/*==================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + const char* name, /* in: name of bound literal */ + ulint* lit_type); /* out: type of literal (PARS_*_LIT) */ +/********************************************************************** +Adds an SQL null literal to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_null_lit( +/*=================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab); /* in: symbol table */ +/********************************************************************** +Adds an identifier to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_id( +/*===========*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + byte* name, /* in: identifier name */ + ulint len); /* in: identifier length */ + +/********************************************************************** +Adds a bound identifier to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_id( +/*===========*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + const char* name); /* in: name of bound id */ + +#define SYM_CLUST_FIELD_NO 0 +#define SYM_SEC_FIELD_NO 1 + +struct sym_node_struct{ + que_common_t common; /* node type: + QUE_NODE_SYMBOL */ + /* NOTE: if the data field in 'common.val' is not NULL and the symbol + table node is not for a temporary column, the memory for the value has + been allocated from dynamic memory and it should be freed when the + symbol table is discarded */ + + /* 'alias' and 'indirection' are almost the same, but not quite. + 'alias' always points to the primary instance of the variable, while + 'indirection' does the same only if we should use the primary + instance's values for the node's data. This is usually the case, but + when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM + t WHERE id = x;"), we copy the values from the primary instance to + the cursor's instance so that they are fixed for the duration of the + cursor, and set 'indirection' to NULL. If we did not, the value of + 'x' could change between fetches and things would break horribly. + + TODO: It would be cleaner to make 'indirection' a boolean field and + always use 'alias' to refer to the primary node.
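+ + In effect, a reader that honours the indirection evaluates the node as + in this sketch (illustrative only, not a quote of the evaluator): + + dfield_t* val = que_node_get_val( + node->indirection ? node->indirection : node);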
*/ + + sym_node_t* indirection; /* pointer to + another symbol table + node which contains + the value for this + node, NULL otherwise */ + sym_node_t* alias; /* pointer to + another symbol table + node for which this + node is an alias, + NULL otherwise */ + UT_LIST_NODE_T(sym_node_t) col_var_list; /* list of table + columns or a list of + input variables for an + explicit cursor */ + ibool copy_val; /* TRUE if a column + and its value should + be copied to dynamic + memory when fetched */ + ulint field_nos[2]; /* if a column, in + the position + SYM_CLUST_FIELD_NO is + the field number in the + clustered index; in + the position + SYM_SEC_FIELD_NO + the field number in the + non-clustered index to + use first; if not found + from the index, then + ULINT_UNDEFINED */ + ibool resolved; /* TRUE if the + meaning of a variable + or a column has been + resolved; for literals + this is always TRUE */ + ulint token_type; /* SYM_VAR, SYM_COLUMN, + SYM_IMPLICIT_VAR, + SYM_LIT, SYM_TABLE, + SYM_CURSOR, ... */ + const char* name; /* name of an id */ + ulint name_len; /* id name length */ + dict_table_t* table; /* table definition + if a table id or a + column id */ + ulint col_no; /* column number if a + column */ + sel_buf_t* prefetch_buf; /* NULL, or a buffer + for cached column + values for prefetched + rows */ + sel_node_t* cursor_def; /* cursor definition + select node if a + named cursor */ + ulint param_type; /* PARS_INPUT, + PARS_OUTPUT, or + PARS_NOT_PARAM if not a + procedure parameter */ + sym_tab_t* sym_table; /* back pointer to + the symbol table */ + UT_LIST_NODE_T(sym_node_t) sym_list; /* list of symbol + nodes */ +}; + +struct sym_tab_struct{ + que_t* query_graph; + /* query graph generated by the + parser */ + const char* sql_string; + /* SQL string to parse */ + size_t string_len; + /* SQL string length */ + int next_char_pos; + /* position of the next character in + sql_string to give to the lexical + analyzer */ + pars_info_t* info; /* extra information, or NULL */ + sym_node_list_t sym_list; + /* list of symbol nodes in the symbol + table */ + UT_LIST_BASE_NODE_T(func_node_t) + func_node_list; + /* list of function nodes in the + parsed query graph */ + mem_heap_t* heap; /* memory heap from which we can + allocate space */ +}; + +/* Types of a symbol table entry */ +#define SYM_VAR 91 /* declared parameter or local + variable of a procedure */ +#define SYM_IMPLICIT_VAR 92 /* storage for an intermediate result + of a calculation */ +#define SYM_LIT 93 /* literal */ +#define SYM_TABLE 94 /* database table name */ +#define SYM_COLUMN 95 /* database table column */ +#define SYM_CURSOR 96 /* named cursor */ +#define SYM_PROCEDURE_NAME 97 /* stored procedure name */ +#define SYM_INDEX 98 /* database index name */ +#define SYM_FUNCTION 99 /* user function name */ + +#ifndef UNIV_NONINL +#include "pars0sym.ic" +#endif + +#endif diff --git a/storage/xtradb/include/pars0sym.ic b/storage/xtradb/include/pars0sym.ic new file mode 100644 index 00000000000..235d6819ae9 --- /dev/null +++ b/storage/xtradb/include/pars0sym.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/pars0types.h b/storage/xtradb/include/pars0types.h new file mode 100644 index 00000000000..e0902d0611a --- /dev/null +++ b/storage/xtradb/include/pars0types.h @@ -0,0 +1,49 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser global types + +Created 1/11/1998 Heikki Tuuri +*******************************************************/ + +#ifndef pars0types_h +#define pars0types_h + +typedef struct pars_info_struct pars_info_t; +typedef struct pars_user_func_struct pars_user_func_t; +typedef struct pars_bound_lit_struct pars_bound_lit_t; +typedef struct pars_bound_id_struct pars_bound_id_t; +typedef struct sym_node_struct sym_node_t; +typedef struct sym_tab_struct sym_tab_t; +typedef struct pars_res_word_struct pars_res_word_t; +typedef struct func_node_struct func_node_t; +typedef struct order_node_struct order_node_t; +typedef struct proc_node_struct proc_node_t; +typedef struct elsif_node_struct elsif_node_t; +typedef struct if_node_struct if_node_t; +typedef struct while_node_struct while_node_t; +typedef struct for_node_struct for_node_t; +typedef struct exit_node_struct exit_node_t; +typedef struct return_node_struct return_node_t; +typedef struct assign_node_struct assign_node_t; +typedef struct col_assign_node_struct col_assign_node_t; + +typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t; + +#endif diff --git a/storage/xtradb/include/que0que.h b/storage/xtradb/include/que0que.h new file mode 100644 index 00000000000..a534cb7e464 --- /dev/null +++ b/storage/xtradb/include/que0que.h @@ -0,0 +1,526 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0que_h +#define que0que_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "usr0types.h" +#include "que0types.h" +#include "row0types.h" +#include "pars0types.h" + +/* If the following flag is set TRUE, the module will print trace info +of SQL execution in the UNIV_SQL_DEBUG version */ +extern ibool que_trace_on; + +/*************************************************************************** +Adds a query graph to the session's list of graphs. */ +UNIV_INTERN +void +que_graph_publish( +/*==============*/ + que_t* graph, /* in: graph */ + sess_t* sess); /* in: session */ +/*************************************************************************** +Creates a query graph fork node. */ +UNIV_INTERN +que_fork_t* +que_fork_create( +/*============*/ + /* out, own: fork node */ + que_t* graph, /* in: graph, if NULL then this + fork node is assumed to be the + graph root */ + que_node_t* parent, /* in: parent node */ + ulint fork_type, /* in: fork type */ + mem_heap_t* heap); /* in: memory heap where created */ +/*************************************************************************** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork); /* in: query fork */ +/*************************************************************************** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork); /* in: query fork */ +/*************************************************************************** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /* in: graph node */ + que_node_t* parent);/* in: parent */ +/*************************************************************************** +Creates a query graph thread node. */ +UNIV_INTERN +que_thr_t* +que_thr_create( +/*===========*/ + /* out, own: query thread node */ + que_fork_t* parent, /* in: parent node, i.e., a fork node */ + mem_heap_t* heap); /* in: memory heap where created */ +/************************************************************************** +Checks if the query graph is in a state where it should be freed, and +frees it in that case. If the session is in a state where it should be +closed, this is also done. */ +UNIV_INTERN +ibool +que_graph_try_free( +/*===============*/ + /* out: TRUE if freed */ + que_t* graph); /* in: query graph */ +/************************************************************************** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations; they are freed in que_graph_free.
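+ +If the heap where the graph was created is not private to the graph, the +caller uses this function and disposes of the heap separately, as in this +sketch (graph_heap is a hypothetical name for that shared heap): + + que_graph_free_recursive(graph); + ... later, when nothing else lives in the heap ... + mem_heap_free(graph_heap);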
*/ +UNIV_INTERN +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node); /* in: query graph node */ +/************************************************************************** +Frees a query graph. */ +UNIV_INTERN +void +que_graph_free( +/*===========*/ + que_t* graph); /* in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ +/************************************************************************** +Stops a query thread if graph or trx is in a state requiring it. The +conditions are tested in the order (1) graph, (2) trx. The kernel mutex has +to be reserved. */ +UNIV_INTERN +ibool +que_thr_stop( +/*=========*/ + /* out: TRUE if stopped */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction. */ +UNIV_INTERN +void +que_thr_move_to_run_state_for_mysql( +/*================================*/ + que_thr_t* thr, /* in: a query thread */ + trx_t* trx); /* in: transaction */ +/************************************************************************** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL +select, when there is no error or lock wait. */ +UNIV_INTERN +void +que_thr_stop_for_mysql_no_error( +/*============================*/ + que_thr_t* thr, /* in: query thread */ + trx_t* trx); /* in: transaction */ +/************************************************************************** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The +query thread is stopped and made inactive, except in the case where +it was put to the lock wait state in lock0lock.c, but the lock has already +been granted or the transaction chosen as a victim in deadlock resolution. */ +UNIV_INTERN +void +que_thr_stop_for_mysql( +/*===================*/ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Runs a query thread. Handles lock waits. */ +UNIV_INTERN +void +que_run_threads( +/*============*/ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +After signal handling is finished, returns control to a query graph error +handling routine. (Currently, just returns the control to the root of the +graph so that the graph can communicate an error message to the client.) */ +UNIV_INTERN +void +que_fork_error_handle( +/*==================*/ + trx_t* trx, /* in: trx */ + que_t* fork); /* in: query graph which was run before signal + handling started, NULL not allowed */ +/************************************************************************** +Moves a suspended query thread to the QUE_THR_RUNNING state and releases +a single worker thread to execute it. This function should be used to end +the wait state of a query thread waiting for a lock or a stored procedure +completion.
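+ +A typical calling pattern, per the next_thr contract below (an +illustrative sketch only): + + que_thr_t* next_thr = NULL; + + que_thr_end_wait(thr, &next_thr); + + if (next_thr != NULL) { + que_run_threads(next_thr); + }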
*/ +UNIV_INTERN +void +que_thr_end_wait( +/*=============*/ + que_thr_t* thr, /* in: query thread in the + QUE_THR_LOCK_WAIT, + or QUE_THR_PROCEDURE_WAIT, or + QUE_THR_SIG_REPLY_WAIT state */ + que_thr_t** next_thr); /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +/************************************************************************** +Same as que_thr_end_wait, but without the next_thr parameter. */ +UNIV_INTERN +void +que_thr_end_wait_no_next_thr( +/*=========================*/ + que_thr_t* thr); /* in: query thread in the + QUE_THR_LOCK_WAIT, + or QUE_THR_PROCEDURE_WAIT, or + QUE_THR_SIG_REPLY_WAIT state */ +/************************************************************************** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. */ +UNIV_INTERN +que_thr_t* +que_fork_start_command( +/*===================*/ + /* out: a query thread of the graph moved to + QUE_THR_RUNNING state, or NULL; the query + thread should be executed by que_run_threads + by the caller */ + que_fork_t* fork); /* in: a query fork */ +/*************************************************************************** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr); /* in: query thread */ +/*************************************************************************** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + que_node_t* node); /* in: graph node */ +/*************************************************************************** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node); /* in: graph node */ +/*************************************************************************** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node); /* in: graph node */ +/*************************************************************************** +Gets the value buffer size of a graph node. */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + /* out: val buffer size, not defined if + val.data == NULL in node */ + que_node_t* node); /* in: graph node */ +/*************************************************************************** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /* in: graph node */ + ulint size); /* in: size */ +/************************************************************************* +Gets the next list node in a list of query graph nodes. */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node); /* in: node in a list */ +/************************************************************************* +Gets the parent node of a query graph node. */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + /* out: parent node or NULL */ + que_node_t* node); /* in: node */ +/******************************************************************** +Gets the first containing loop node (e.g.
while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. */ +UNIV_INTERN +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + /* out: containing loop node, or NULL. */ + que_node_t* node); /* in: node */ +/************************************************************************* +Catenates a query graph node to a possibly empty list of them. */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + /* out: one-way list of nodes */ + que_node_t* node_list, /* in: node list, or NULL */ + que_node_t* node); /* in: node */ +/************************************************************************* +Gets a query graph node list length. */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + /* out: length, for NULL list 0 */ + que_node_t* node_list); /* in: node list, or NULL */ +/************************************************************************** +Checks if graph, trx, or session is in a state where the query thread should +be stopped. */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + /* out: TRUE if should be stopped; NOTE that + if the peek is made without reserving the + kernel mutex, then another peek with the + mutex reserved is necessary before deciding + the actual stopping */ + que_thr_t* thr); /* in: query thread */ +/*************************************************************************** +Returns TRUE if the query graph is for a SELECT statement. */ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + /* out: TRUE if a select */ + que_t* graph); /* in: graph */ +/************************************************************************** +Prints info of an SQL query graph node. */ +UNIV_INTERN +void +que_node_print_info( +/*================*/ + que_node_t* node); /* in: query graph node */ +/************************************************************************* +Evaluates the given SQL. */ +UNIV_INTERN +ulint +que_eval_sql( +/*=========*/ + /* out: error code or DB_SUCCESS */ + pars_info_t* info, /* in: info struct, or NULL */ + const char* sql, /* in: SQL string */ + ibool reserve_dict_mutex, + /* in: if TRUE, acquire/release + dict_sys->mutex around call to pars_sql.
*/ + trx_t* trx); /* in: trx */ + +/* Query graph query thread node: the fields are protected by the kernel +mutex with the exceptions named below */ + +struct que_thr_struct{ + que_common_t common; /* type: QUE_NODE_THR */ + ulint magic_n; /* magic number to catch memory + corruption */ + que_node_t* child; /* graph child node */ + que_t* graph; /* graph where this node belongs */ + ibool is_active; /* TRUE if the thread has been set + to the run state in + que_thr_move_to_run_state, but not + deactivated in + que_thr_dec_refer_count */ + ulint state; /* state of the query thread */ + UT_LIST_NODE_T(que_thr_t) + thrs; /* list of thread nodes of the fork + node */ + UT_LIST_NODE_T(que_thr_t) + trx_thrs; /* list of threads in the wait list of + the trx */ + UT_LIST_NODE_T(que_thr_t) + queue; /* list of runnable thread nodes in + the server task queue */ + /*------------------------------*/ + /* The following fields are private to the OS thread executing the + query thread, and are not protected by the kernel mutex: */ + + que_node_t* run_node; /* pointer to the node where the + subgraph down from this node is + currently executed */ + que_node_t* prev_node; /* pointer to the node from which + the control came */ + ulint resource; /* resource usage of the query thread + thus far */ + ulint lock_state; /* lock state of thread (table or + row) */ +}; + +#define QUE_THR_MAGIC_N 8476583 +#define QUE_THR_MAGIC_FREED 123461526 + +/* Query graph fork node: its fields are protected by the kernel mutex */ +struct que_fork_struct{ + que_common_t common; /* type: QUE_NODE_FORK */ + que_t* graph; /* query graph of this node */ + ulint fork_type; /* fork type */ + ulint n_active_thrs; /* if this is the root of a graph, the + number of query threads that have been + started in que_thr_move_to_run_state + but for which que_thr_dec_refer_count + has not yet been called */ + trx_t* trx; /* transaction: this is set only in + the root node */ + ulint state; /* state of the fork node */ + que_thr_t* caller; /* pointer to a possible calling query + thread */ + UT_LIST_BASE_NODE_T(que_thr_t) + thrs; /* list of query threads */ + /*------------------------------*/ + /* The fields in this section are defined only in the root node */ + sym_tab_t* sym_tab; /* symbol table of the query, + generated by the parser, or NULL + if the graph was created 'by hand' */ + pars_info_t* info; /* info struct, or NULL */ + /* The following cur_...
fields are relevant only in a select graph */ + + ulint cur_end; /* QUE_CUR_NOT_DEFINED, QUE_CUR_START, + QUE_CUR_END */ + ulint cur_pos; /* if there are n rows in the result + set, values 0 and n + 1 mean before + first row, or after last row, depending + on cur_end; values 1...n mean a row + index */ + ibool cur_on_row; /* TRUE if cursor is on a row, i.e., + it is not before the first row or + after the last row */ + dulint n_inserts; /* number of rows inserted */ + dulint n_updates; /* number of rows updated */ + dulint n_deletes; /* number of rows deleted */ + sel_node_t* last_sel_node; /* last executed select node, or NULL + if none */ + UT_LIST_NODE_T(que_fork_t) + graphs; /* list of query graphs of a session + or a stored procedure */ + /*------------------------------*/ + mem_heap_t* heap; /* memory heap where the fork was + created */ + +}; + +/* Query fork (or graph) types */ +#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */ +#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */ +#define QUE_FORK_INSERT 3 +#define QUE_FORK_UPDATE 4 +#define QUE_FORK_ROLLBACK 5 + /* This is really the undo graph used in rollback, + no signal-sending roll_node in this graph */ +#define QUE_FORK_PURGE 6 +#define QUE_FORK_EXECUTE 7 +#define QUE_FORK_PROCEDURE 8 +#define QUE_FORK_PROCEDURE_CALL 9 +#define QUE_FORK_MYSQL_INTERFACE 10 +#define QUE_FORK_RECOVERY 11 + +/* Query fork (or graph) states */ +#define QUE_FORK_ACTIVE 1 +#define QUE_FORK_COMMAND_WAIT 2 +#define QUE_FORK_INVALID 3 +#define QUE_FORK_BEING_FREED 4 + +/* Flag which is ORed to control structure statement node types */ +#define QUE_NODE_CONTROL_STAT 1024 + +/* Query graph node types */ +#define QUE_NODE_LOCK 1 +#define QUE_NODE_INSERT 2 +#define QUE_NODE_UPDATE 4 +#define QUE_NODE_CURSOR 5 +#define QUE_NODE_SELECT 6 +#define QUE_NODE_AGGREGATE 7 +#define QUE_NODE_FORK 8 +#define QUE_NODE_THR 9 +#define QUE_NODE_UNDO 10 +#define QUE_NODE_COMMIT 11 +#define QUE_NODE_ROLLBACK 12 +#define QUE_NODE_PURGE 13 +#define QUE_NODE_CREATE_TABLE 14 +#define QUE_NODE_CREATE_INDEX 15 +#define QUE_NODE_SYMBOL 16 +#define QUE_NODE_RES_WORD 17 +#define QUE_NODE_FUNC 18 +#define QUE_NODE_ORDER 19 +#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_ASSIGNMENT 23 +#define QUE_NODE_FETCH 24 +#define QUE_NODE_OPEN 25 +#define QUE_NODE_COL_ASSIGNMENT 26 +#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_RETURN 28 +#define QUE_NODE_ROW_PRINTF 29 +#define QUE_NODE_ELSIF 30 +#define QUE_NODE_CALL 31 +#define QUE_NODE_EXIT 32 + +/* Query thread states */ +#define QUE_THR_RUNNING 1 +#define QUE_THR_PROCEDURE_WAIT 2 +#define QUE_THR_COMPLETED 3 /* in selects this means that the + thread is at the end of its result set + (or start, in case of a scroll cursor); + in other statements, this means the + thread has done its task */ +#define QUE_THR_COMMAND_WAIT 4 +#define QUE_THR_LOCK_WAIT 5 +#define QUE_THR_SIG_REPLY_WAIT 6 +#define QUE_THR_SUSPENDED 7 +#define QUE_THR_ERROR 8 + +/* Query thread lock states */ +#define QUE_THR_LOCK_NOLOCK 0 +#define QUE_THR_LOCK_ROW 1 +#define QUE_THR_LOCK_TABLE 2 + +/* From where the cursor position is counted */ +#define QUE_CUR_NOT_DEFINED 1 +#define QUE_CUR_START 2 +#define QUE_CUR_END 3 + + +#ifndef UNIV_NONINL +#include "que0que.ic" +#endif + +#endif diff --git a/storage/xtradb/include/que0que.ic b/storage/xtradb/include/que0que.ic new file mode 100644 index 
00000000000..e9a6b00b9ab --- /dev/null +++ b/storage/xtradb/include/que0que.ic @@ -0,0 +1,275 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#include "usr0sess.h" + +/*************************************************************************** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr) /* in: query thread */ +{ + ut_ad(thr); + + return(thr->graph->trx); +} + +/*************************************************************************** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork) /* in: query fork */ +{ + return(UT_LIST_GET_FIRST(fork->thrs)); +} + +/*************************************************************************** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork) /* in: query fork */ +{ + que_thr_t* thr; + + thr = UT_LIST_GET_FIRST(fork->thrs); + + return(thr->child); +} + +/*************************************************************************** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + que_node_t* node) /* in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*)node)->type); +} + +/*************************************************************************** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node) /* in: graph node */ +{ + ut_ad(node); + + return(&(((que_common_t*)node)->val)); +} + +/*************************************************************************** +Gets the value buffer size of a graph node. */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + /* out: val buffer size, not defined if + val.data == NULL in node */ + que_node_t* node) /* in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*)node)->val_buf_size); +} + +/*************************************************************************** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /* in: graph node */ + ulint size) /* in: size */ +{ + ut_ad(node); + + ((que_common_t*)node)->val_buf_size = size; +} + +/*************************************************************************** +Sets the parent of a graph node. 
*/ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /* in: graph node */ + que_node_t* parent) /* in: parent */ +{ + ut_ad(node); + + ((que_common_t*)node)->parent = parent; +} + +/*************************************************************************** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node) /* in: graph node */ +{ + ut_ad(node); + + return(dfield_get_type(&((que_common_t*) node)->val)); +} + +/************************************************************************* +Catenates a query graph node to a possibly empty list of them. */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + /* out: one-way list of nodes */ + que_node_t* node_list, /* in: node list, or NULL */ + que_node_t* node) /* in: node */ +{ + que_common_t* cnode; + que_common_t* cnode2; + + cnode = (que_common_t*) node; + + cnode->brother = NULL; + + if (node_list == NULL) { + + return(node); + } + + cnode2 = (que_common_t*) node_list; + + while (cnode2->brother != NULL) { + cnode2 = (que_common_t*) cnode2->brother; + } + + cnode2->brother = node; + + return(node_list); +} + +/************************************************************************* +Gets the next list node in a list of query graph nodes. */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + /* out: next node in a list of nodes */ + que_node_t* node) /* in: node in a list */ +{ + return(((que_common_t*)node)->brother); +} + +/************************************************************************* +Gets a query graph node list length. */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + /* out: length, for NULL list 0 */ + que_node_t* node_list) /* in: node list, or NULL */ +{ + const que_common_t* cnode; + ulint len; + + cnode = (const que_common_t*) node_list; + len = 0; + + while (cnode != NULL) { + len++; + cnode = (const que_common_t*) cnode->brother; + } + + return(len); +} + +/************************************************************************* +Gets the parent node of a query graph node. */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + /* out: parent node or NULL */ + que_node_t* node) /* in: node */ +{ + return(((que_common_t*)node)->parent); +} + +/************************************************************************** +Checks if graph, trx, or session is in a state where the query thread should +be stopped. */ +UNIV_INLINE +ibool +que_thr_peek_stop( +/*==============*/ + /* out: TRUE if should be stopped; NOTE that + if the peek is made without reserving the + kernel mutex, then another peek with the + mutex reserved is necessary before deciding + the actual stopping */ + que_thr_t* thr) /* in: query thread */ +{ + trx_t* trx; + que_t* graph; + + graph = thr->graph; + trx = graph->trx; + + if (graph->state != QUE_FORK_ACTIVE + || trx->que_state == TRX_QUE_LOCK_WAIT + || (UT_LIST_GET_LEN(trx->signals) > 0 + && trx->que_state == TRX_QUE_RUNNING)) { + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************************** +Returns TRUE if the query graph is for a SELECT statement.
*/ +UNIV_INLINE +ibool +que_graph_is_select( +/*================*/ + /* out: TRUE if a select */ + que_t* graph) /* in: graph */ +{ + if (graph->fork_type == QUE_FORK_SELECT_SCROLL + || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) { + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/xtradb/include/que0types.h b/storage/xtradb/include/que0types.h new file mode 100644 index 00000000000..1d3217fb491 --- /dev/null +++ b/storage/xtradb/include/que0types.h @@ -0,0 +1,59 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Query graph global types + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0types_h +#define que0types_h + +#include "data0data.h" +#include "dict0types.h" + +/* Pseudotype for all graph nodes */ +typedef void que_node_t; + +typedef struct que_fork_struct que_fork_t; + +/* Query graph root is a fork node */ +typedef que_fork_t que_t; + +typedef struct que_thr_struct que_thr_t; +typedef struct que_common_struct que_common_t; + +/* Common struct at the beginning of each query graph node; the name of this +substruct must be 'common' */ + +struct que_common_struct{ + ulint type; /* query node type */ + que_node_t* parent; /* back pointer to parent node, or NULL */ + que_node_t* brother;/* pointer to a possible brother node */ + dfield_t val; /* evaluated value for an expression */ + ulint val_buf_size; + /* buffer size for the evaluated value data, + if the buffer has been allocated dynamically: + if this field is != 0, and the node is a + symbol node or a function node, then we + have to free the data field in val + explicitly */ +}; + +#endif diff --git a/storage/xtradb/include/read0read.h b/storage/xtradb/include/read0read.h new file mode 100644 index 00000000000..7ea8bdaf8dd --- /dev/null +++ b/storage/xtradb/include/read0read.h @@ -0,0 +1,181 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#ifndef read0read_h +#define read0read_h + +#include "univ.i" + + +#include "ut0byte.h" +#include "ut0lst.h" +#include "trx0trx.h" +#include "read0types.h" + +/************************************************************************* +Opens a read view where exactly the transactions serialized before this +point in time are seen in the view. */ +UNIV_INTERN +read_view_t* +read_view_open_now( +/*===============*/ + /* out, own: read view struct */ + dulint cr_trx_id, /* in: trx_id of creating + transaction, or (0, 0) used in + purge */ + mem_heap_t* heap); /* in: memory heap from which + allocated */ +/************************************************************************* +Makes a copy of the oldest existing read view, or opens a new one. The view +must be closed with ..._close. */ +UNIV_INTERN +read_view_t* +read_view_oldest_copy_or_open_new( +/*==============================*/ + /* out, own: read view struct */ + dulint cr_trx_id, /* in: trx_id of creating + transaction, or (0, 0) used in + purge */ + mem_heap_t* heap); /* in: memory heap from which + allocated */ +/************************************************************************* +Closes a read view. */ +UNIV_INTERN +void +read_view_close( +/*============*/ + read_view_t* view); /* in: read view */ +/************************************************************************* +Closes a consistent read view for MySQL. This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ +UNIV_INTERN +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx); /* in: trx which has a read view */ +/************************************************************************* +Checks if a read view sees the specified transaction. */ +UNIV_INLINE +ibool +read_view_sees_trx_id( +/*==================*/ + /* out: TRUE if sees */ + read_view_t* view, /* in: read view */ + dulint trx_id);/* in: trx id */ +/************************************************************************* +Prints a read view to stderr. */ +UNIV_INTERN +void +read_view_print( +/*============*/ + read_view_t* view); /* in: read view */ +/************************************************************************* +Creates a consistent cursor view for MySQL to be used in cursors. In this +consistent read view, modifications done by the creating transaction or +future transactions are not visible. */ +UNIV_INTERN +cursor_view_t* +read_cursor_view_create_for_mysql( +/*==============================*/ + trx_t* cr_trx);/* in: trx where cursor view is created */ +/************************************************************************* +Closes a given consistent cursor view for MySQL and restores the global read +view back to the transaction's read view.
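+ +The create and close calls are used as a pair, as in this sketch (curview +is a hypothetical local name): + + cursor_view_t* curview = read_cursor_view_create_for_mysql(trx); + ... fetch rows through the cursor ... + read_cursor_view_close_for_mysql(trx, curview);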
*/ +UNIV_INTERN +void +read_cursor_view_close_for_mysql( +/*=============================*/ + trx_t* trx, /* in: trx */ + cursor_view_t* curview); /* in: cursor view to be closed */ +/************************************************************************* +This function sets a given consistent cursor view to a transaction +read view if the given consistent cursor view is not NULL. Otherwise, the +function restores a global read view to a transaction read view. */ +UNIV_INTERN +void +read_cursor_set_for_mysql( +/*======================*/ + trx_t* trx, /* in: transaction where cursor is set */ + cursor_view_t* curview);/* in: consistent cursor view to be set */ + +/* Read view lists the trx ids of those transactions for which a consistent +read should not see the modifications to the database. */ + +struct read_view_struct{ + ulint type; /* VIEW_NORMAL, VIEW_HIGH_GRANULARITY */ + dulint undo_no; /* (0, 0) or if type is VIEW_HIGH_GRANULARITY + transaction undo_no when this high-granularity + consistent read view was created */ + dulint low_limit_no; /* The view does not need to see the undo + logs for transactions whose transaction number + is strictly smaller (<) than this value: they + can be removed in purge if not needed by other + views */ + dulint low_limit_id; /* The read should not see any transaction + with trx id >= this value */ + dulint up_limit_id; /* The read should see all trx ids which + are strictly smaller (<) than this value */ + ulint n_trx_ids; /* Number of cells in the trx_ids array */ + dulint* trx_ids; /* Additional trx ids which the read should + not see: typically, these are the active + transactions at the time when the read is + serialized, except the reading transaction + itself; the trx ids in this array are in a + descending order */ + dulint creator_trx_id; /* trx id of creating transaction, or + (0, 0) used in purge */ + UT_LIST_NODE_T(read_view_t) view_list; + /* List of read views in trx_sys */ +}; + +/* Read view types */ +#define VIEW_NORMAL 1 /* Normal consistent read view + where transaction does not see changes + made by active transactions except + creating transaction. */ +#define VIEW_HIGH_GRANULARITY 2 /* High-granularity read view where + transaction does not see changes + made by active transactions and own + changes after a point in time when this + read view was created. */ + +/* This struct implements the InnoDB framework that supports consistent read +views in cursors. It holds both the heap where the consistent read view is +allocated and a pointer to the read view. */ + +struct cursor_view_struct{ + mem_heap_t* heap; + /* Memory heap for the cursor view */ + read_view_t* read_view; + /* Consistent read view of the cursor*/ + ulint n_mysql_tables_in_use; + /* number of Innobase tables used in the + processing of this cursor */ +}; + +#ifndef UNIV_NONINL +#include "read0read.ic" +#endif + +#endif diff --git a/storage/xtradb/include/read0read.ic b/storage/xtradb/include/read0read.ic new file mode 100644 index 00000000000..9fc6af04e88 --- /dev/null +++ b/storage/xtradb/include/read0read.ic @@ -0,0 +1,97 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +/************************************************************************* +Gets the nth trx id in a read view. */ +UNIV_INLINE +dulint +read_view_get_nth_trx_id( +/*=====================*/ + /* out: trx id */ + read_view_t* view, /* in: read view */ + ulint n) /* in: position */ +{ + ut_ad(n < view->n_trx_ids); + + return(*(view->trx_ids + n)); +} + +/************************************************************************* +Sets the nth trx id in a read view. */ +UNIV_INLINE +void +read_view_set_nth_trx_id( +/*=====================*/ + read_view_t* view, /* in: read view */ + ulint n, /* in: position */ + dulint trx_id) /* in: trx id to set */ +{ + ut_ad(n < view->n_trx_ids); + + *(view->trx_ids + n) = trx_id; +} + +/************************************************************************* +Checks if a read view sees the specified transaction. */ +UNIV_INLINE +ibool +read_view_sees_trx_id( +/*==================*/ + /* out: TRUE if sees */ + read_view_t* view, /* in: read view */ + dulint trx_id) /* in: trx id */ +{ + ulint n_ids; + int cmp; + ulint i; + + if (ut_dulint_cmp(trx_id, view->up_limit_id) < 0) { + + return(TRUE); + } + + if (ut_dulint_cmp(trx_id, view->low_limit_id) >= 0) { + + return(FALSE); + } + + /* We go through the trx ids in the array smallest first: this order + may save CPU time, because if there was a very long running + transaction in the trx id array, its trx id is looked at first, and + the first two comparisons may well decide the visibility of trx_id. */ + + n_ids = view->n_trx_ids; + + for (i = 0; i < n_ids; i++) { + + cmp = ut_dulint_cmp( + trx_id, + read_view_get_nth_trx_id(view, n_ids - i - 1)); + if (cmp <= 0) { + return(cmp < 0); + } + } + + return(TRUE); +} diff --git a/storage/xtradb/include/read0types.h b/storage/xtradb/include/read0types.h new file mode 100644 index 00000000000..44849cbb498 --- /dev/null +++ b/storage/xtradb/include/read0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#ifndef read0types_h +#define read0types_h + +typedef struct read_view_struct read_view_t; +typedef struct cursor_view_struct cursor_view_t; + +#endif diff --git a/storage/xtradb/include/rem0cmp.h b/storage/xtradb/include/rem0cmp.h new file mode 100644 index 00000000000..f32bae73a13 --- /dev/null +++ b/storage/xtradb/include/rem0cmp.h @@ -0,0 +1,205 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/*********************************************************************** +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +#ifndef rem0cmp_h +#define rem0cmp_h + +#include "univ.i" +#include "data0data.h" +#include "data0type.h" +#include "dict0dict.h" +#include "rem0rec.h" + +/***************************************************************** +Returns TRUE if two columns are equal for comparison purposes. */ +UNIV_INTERN +ibool +cmp_cols_are_equal( +/*===============*/ + /* out: TRUE if the columns are + considered equal in comparisons */ + const dict_col_t* col1, /* in: column 1 */ + const dict_col_t* col2, /* in: column 2 */ + ibool check_charsets); + /* in: whether to check charsets */ +/***************************************************************** +This function is used to compare two data fields for which we know the +data type. */ +UNIV_INLINE +int +cmp_data_data( +/*==========*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two data fields for which we know the +data type. 
*/ +UNIV_INTERN +int +cmp_data_data_slow( +/*===============*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2); /* in: data field length or UNIV_SQL_NULL */ +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield( +/*==============*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + const dfield_t* dfield1,/* in: data field; must have type field set */ + const dfield_t* dfield2);/* in: data field */ +/***************************************************************** +This function is used to compare a data tuple to a physical record. +Only the first dtuple->n_fields_cmp fields of the data tuple are taken +into account! If we denote by n = n_fields_cmp, then rec must +have either m >= n fields, or it must differ from dtuple in some of +the m fields rec has. If rec has an externally stored field, we do not +compare it but return 0 if such a comparison should be +made. */ +UNIV_INTERN +int +cmp_dtuple_rec_with_match( +/*======================*/ + /* out: 1, 0, -1, if dtuple is greater, equal, + less than rec, respectively, when only the + common first fields are compared, or + until the first externally stored field in + rec */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint* matched_fields, /* in/out: number of already completely + matched fields; when the function returns, + contains the value for the current comparison */ + ulint* matched_bytes); /* in/out: number of already matched + bytes within the first field not completely + matched; when the function returns, contains the + value for the current comparison */ +/****************************************************************** +Compares a data tuple to a physical record. */ +UNIV_INTERN +int +cmp_dtuple_rec( +/*===========*/ + /* out: 1, 0, -1, if dtuple is greater, equal, + less than rec, respectively; see the comments + for cmp_dtuple_rec_with_match */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/****************************************************************** +Checks if a dtuple is a prefix of a record. The last field in dtuple +is allowed to be a prefix of the corresponding field in the record. */ +UNIV_INTERN +ibool +cmp_dtuple_is_prefix_of_rec( +/*========================*/ + /* out: TRUE if prefix */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +#ifndef UNIV_HOTBACKUP +/***************************************************************** +Compares two physical records that contain the same number of columns, +none of which are stored externally.
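+ +The offsets arrays come from rec_get_offsets(), as in this sketch (heap +and offsets1 are hypothetical local names; passing NULL for the offsets +array makes rec_get_offsets() allocate one from the heap): + + mem_heap_t* heap = NULL; + ulint* offsets1 = rec_get_offsets( + rec1, index, NULL, ULINT_UNDEFINED, &heap); + ... likewise for rec2, then call cmp_rec_rec_simple() ... + + if (heap) { + mem_heap_free(heap); + }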
*/ +UNIV_INTERN +int +cmp_rec_rec_simple( +/*===============*/ + /* out: 1, 0, -1 if rec1 is greater, + equal, less, respectively, than rec2 */ + const rec_t* rec1, /* in: physical record */ + const rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ + const dict_index_t* index); /* in: data dictionary index */ +#endif /* !UNIV_HOTBACKUP */ +/***************************************************************** +This function is used to compare two physical records. Only the common +first fields are compared, and if an externally stored field is +encountered, then 0 is returned. */ +UNIV_INTERN +int +cmp_rec_rec_with_match( +/*===================*/ + /* out: 1, 0, -1 if rec1 is greater, equal, + less, respectively, than rec2; only the common + first fields are compared */ + const rec_t* rec1, /* in: physical record */ + const rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ + dict_index_t* index, /* in: data dictionary index */ + ulint* matched_fields, /* in/out: number of already completely + matched fields; when the function returns, + contains the value for the current + comparison */ + ulint* matched_bytes); /* in/out: number of already matched + bytes within the first field not completely + matched; when the function returns, contains + the value for the current comparison */ +/***************************************************************** +This function is used to compare two physical records. Only the common +first fields are compared. */ +UNIV_INLINE +int +cmp_rec_rec( +/*========*/ + /* out: 1, 0, -1 if rec1 is greater, equal, + less, respectively, than rec2; only the common + first fields are compared */ + const rec_t* rec1, /* in: physical record */ + const rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ + dict_index_t* index); /* in: data dictionary index */ + + +#ifndef UNIV_NONINL +#include "rem0cmp.ic" +#endif + +#endif diff --git a/storage/xtradb/include/rem0cmp.ic b/storage/xtradb/include/rem0cmp.ic new file mode 100644 index 00000000000..6c58d9e5a25 --- /dev/null +++ b/storage/xtradb/include/rem0cmp.ic @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/*********************************************************************** +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +/***************************************************************** +This function is used to compare two data fields for which we know the +data type. */ +UNIV_INLINE +int +cmp_data_data( +/*==========*/ + /* out: 1, 0, -1, if data1 is greater, equal, + less than data2, respectively */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + const byte* data1, /* in: data field (== a pointer to a memory + buffer) */ + ulint len1, /* in: data field length or UNIV_SQL_NULL */ + const byte* data2, /* in: data field (== a pointer to a memory + buffer) */ + ulint len2) /* in: data field length or UNIV_SQL_NULL */ +{ + return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2)); +} + +/***************************************************************** +This function is used to compare two dfields where at least the first +has its data type field set. */ +UNIV_INLINE +int +cmp_dfield_dfield( +/*==============*/ + /* out: 1, 0, -1, if dfield1 is greater, equal, + less than dfield2, respectively */ + const dfield_t* dfield1,/* in: data field; must have type field set */ + const dfield_t* dfield2)/* in: data field */ +{ + const dtype_t* type; + + ut_ad(dfield_check_typed(dfield1)); + + type = dfield_get_type(dfield1); + + return(cmp_data_data(type->mtype, type->prtype, + (const byte*) dfield_get_data(dfield1), + dfield_get_len(dfield1), + (const byte*) dfield_get_data(dfield2), + dfield_get_len(dfield2))); +} + +/***************************************************************** +This function is used to compare two physical records. Only the common +first fields are compared. */ +UNIV_INLINE +int +cmp_rec_rec( +/*========*/ + /* out: 1, 0 , -1 if rec1 is greater, equal, + less, respectively, than rec2; only the common + first fields are compared */ + const rec_t* rec1, /* in: physical record */ + const rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ + dict_index_t* index) /* in: data dictionary index */ +{ + ulint match_f = 0; + ulint match_b = 0; + + return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, + &match_f, &match_b)); +} diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h new file mode 100644 index 00000000000..cb72a5fa25b --- /dev/null +++ b/storage/xtradb/include/rem0rec.h @@ -0,0 +1,822 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0rec_h +#define rem0rec_h + +#include "univ.i" +#include "data0data.h" +#include "rem0types.h" +#include "mtr0types.h" +#include "page0types.h" + +/* Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/* The deleted flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the + record has been delete marked */ + +/* Number of extra bytes in an old-style record, +in addition to the data and the offsets */ +#define REC_N_OLD_EXTRA_BYTES 6 +/* Number of extra bytes in a new-style record, +in addition to the data and the offsets */ +#define REC_N_NEW_EXTRA_BYTES 5 + +/* Record status values */ +#define REC_STATUS_ORDINARY 0 +#define REC_STATUS_NODE_PTR 1 +#define REC_STATUS_INFIMUM 2 +#define REC_STATUS_SUPREMUM 3 + +/* The following four constants are needed in page0zip.c in order to +efficiently compress and decompress pages. */ + +/* The offset of heap_no in a compact record */ +#define REC_NEW_HEAP_NO 4 +/* The shift of heap_no in a compact record. +The status is stored in the low-order bits. */ +#define REC_HEAP_NO_SHIFT 3 + +/* Length of a B-tree node pointer, in bytes */ +#define REC_NODE_PTR_SIZE 4 + +#ifdef UNIV_DEBUG +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 4 +#else /* UNIV_DEBUG */ +/* Length of the rec_get_offsets() header */ +# define REC_OFFS_HEADER_SIZE 2 +#endif /* UNIV_DEBUG */ + +/* Number of elements that should be initially allocated for the +offsets[] array, first passed to rec_get_offsets() */ +#define REC_OFFS_NORMAL_SIZE 100 +#define REC_OFFS_SMALL_SIZE 10 + +/********************************************************** +The following function is used to get the pointer of the next chained record +on the same page. */ +UNIV_INLINE +const rec_t* +rec_get_next_ptr_const( +/*===================*/ + /* out: pointer to the next chained record, or + NULL if none */ + const rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ +/********************************************************** +The following function is used to get the pointer of the next chained record +on the same page. */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + /* out: pointer to the next chained record, or + NULL if none */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ +/********************************************************** +The following function is used to get the offset of the +next chained record on the same page. 
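+(An illustrative sketch, not part of the original interface:
+walking the singly linked record list of a compact page with
+rec_get_next_offs(); `page' is assumed to point to the start of
+the page frame that contains `rec':
+
+	ulint	offs = rec_get_next_offs(rec, TRUE);
+
+	while (offs != 0) {
+		rec = page + offs;
+		offs = rec_get_next_offs(rec, TRUE);
+	}
+
+A returned offset of 0 terminates the list.)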
*/ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + /* out: the page offset of the next + chained record, or 0 if none */ + const rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ +/********************************************************** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /* in: old-style physical record */ + ulint next); /* in: offset of the next record */ +/********************************************************** +The following function is used to set the next record offset field +of a new-style record. */ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + ulint next); /* in: offset of the next record */ +/********************************************************** +The following function is used to get the number of fields +in an old-style record. */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + /* out: number of data fields */ + const rec_t* rec); /* in: physical record */ +/********************************************************** +The following function is used to get the number of fields +in a record. */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + /* out: number of data fields */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index); /* in: record descriptor */ +/********************************************************** +The following function is used to get the number of records owned by the +previous directory record. */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + /* out: number of owned records */ + const rec_t* rec); /* in: old-style physical record */ +/********************************************************** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_old( +/*================*/ + /* out: TRUE on success */ + rec_t* rec, /* in: old-style physical record */ + ulint n_owned); /* in: the number of owned */ +/********************************************************** +The following function is used to get the number of records owned by the +previous directory record. */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + /* out: number of owned records */ + const rec_t* rec); /* in: new-style physical record */ +/********************************************************** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n_owned);/* in: the number of owned */ +/********************************************************** +The following function is used to retrieve the info bits of +a record. */ +UNIV_INLINE +ulint +rec_get_info_bits( +/*==============*/ + /* out: info bits */ + const rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ +/********************************************************** +The following function is used to set the info bits of a record. 
*/ +UNIV_INLINE +void +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /* in: old-style physical record */ + ulint bits); /* in: info bits */ +/********************************************************** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + ulint bits); /* in: info bits */ +/********************************************************** +The following function retrieves the status bits of a new-style record. */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + const rec_t* rec); /* in: physical record */ + +/********************************************************** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in/out: physical record */ + ulint bits); /* in: info bits */ + +/********************************************************** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + /* out: info bits */ + const rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ +/********************************************************** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /* in/out: compact physical record */ + ulint bits); /* in: info bits */ + +/********************************************************** +The following function tells if record is delete marked. */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + /* out: nonzero if delete marked */ + const rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ +/********************************************************** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /* in: old-style physical record */ + ulint flag); /* in: nonzero if delete marked */ +/********************************************************** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint flag); /* in: nonzero if delete marked */ +/********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*==================*/ + /* out: TRUE if node pointer */ + const rec_t* rec); /* in: physical record */ +/********************************************************** +The following function is used to get the order number +of an old-style record in the heap of the index page. */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + /* out: heap order number */ + const rec_t* rec); /* in: physical record */ +/********************************************************** +The following function is used to set the heap number +field in an old-style record. 
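+(An illustrative sketch, not part of the original interface, for
+the delete-flag functions declared above; page_zip may be NULL for
+an uncompressed page:
+
+	rec_set_deleted_flag_new(rec, page_zip, TRUE);
+	ut_ad(rec_get_deleted_flag(rec, TRUE));
+	rec_set_deleted_flag_new(rec, page_zip, FALSE);
+
+Old-style records are delete marked with rec_set_deleted_flag_old(),
+which takes no page_zip argument, because pages in the old format
+are never compressed.)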
*/ +UNIV_INLINE +void +rec_set_heap_no_old( +/*================*/ + rec_t* rec, /* in: physical record */ + ulint heap_no);/* in: the heap number */ +/********************************************************** +The following function is used to get the order number +of a new-style record in the heap of the index page. */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + /* out: heap order number */ + const rec_t* rec); /* in: physical record */ +/********************************************************** +The following function is used to set the heap number +field in a new-style record. */ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /* in/out: physical record */ + ulint heap_no);/* in: the heap number */ +/********************************************************** +The following function is used to test whether the data offsets +in the record are stored in one-byte or two-byte format. */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + /* out: TRUE if 1-byte form */ + const rec_t* rec); /* in: physical record */ + +/********************************************************** +Determine how many of the first n columns in a compact +physical record are stored externally. */ +UNIV_INTERN +ulint +rec_get_n_extern_new( +/*=================*/ + /* out: number of externally stored columns */ + const rec_t* rec, /* in: compact physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n); /* in: number of columns to scan */ + +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ +UNIV_INTERN +ulint* +rec_get_offsets_func( +/*=================*/ + /* out: the new offsets */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in/out: array consisting of + offsets[0] allocated elements, + or an array from rec_get_offsets(), + or NULL */ + ulint n_fields,/* in: maximum number of + initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t** heap, /* in/out: memory heap */ + const char* file, /* in: file name where called */ + ulint line); /* in: line number where called */ + +#define rec_get_offsets(rec,index,offsets,n,heap) \ + rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__) + +/********************************************************** +Determine the offset to each field in a leaf-page record +in ROW_FORMAT=COMPACT. This is a special case of +rec_init_offsets() and rec_get_offsets_func(). */ +UNIV_INTERN +void +rec_init_offsets_comp_ordinary( +/*===========================*/ + const rec_t* rec, /* in: physical record in + ROW_FORMAT=COMPACT */ + ulint extra, /* in: number of bytes to reserve + between the record header and + the data payload + (usually REC_N_NEW_EXTRA_BYTES) */ + const dict_index_t* index, /* in: record descriptor */ + ulint* offsets);/* in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ + +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. 
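+(A usage sketch for the rec_get_offsets() macro defined above;
+illustrative only, not part of the original interface.  The caller
+passes a stack-based array that is used as long as it is large
+enough; otherwise a heap is allocated transparently and must be
+freed afterwards:
+
+	mem_heap_t*	heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets	= offsets_;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	... use the offsets, e.g. with rec_get_nth_field() ...
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+rec_offs_init() is defined further below in this header.)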
*/ +UNIV_INTERN +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /* in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /* in: record descriptor */ + ulint node_ptr,/* in: nonzero=node pointer, + 0=leaf node */ + ulint* offsets);/* in/out: array consisting of + offsets[0] allocated elements */ + +/**************************************************************** +Validates offsets returned by rec_get_offsets(). */ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + const rec_t* rec, /* in: record or NULL */ + const dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets);/* in: array returned by + rec_get_offsets() */ +#ifdef UNIV_DEBUG +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec, /* in: record */ + const dict_index_t* index, /* in: record descriptor */ + ulint* offsets);/* in: array returned by + rec_get_offsets() */ +#else +# define rec_offs_make_valid(rec, index, offsets) ((void) 0) +#endif /* UNIV_DEBUG */ + +/**************************************************************** +The following function is used to get the offset to the nth +data field in an old-style record. */ +UNIV_INTERN +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + /* out: offset to the field */ + const rec_t* rec, /* in: record */ + ulint n, /* in: index of the field */ + ulint* len); /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +#define rec_get_nth_field_old(rec, n, len) \ +((rec) + rec_get_nth_field_offs_old(rec, n, len)) +/**************************************************************** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + /* out: field size in bytes */ + const rec_t* rec, /* in: record */ + ulint n); /* in: index of the field */ +/**************************************************************** +The following function is used to get an offset to the nth +data field in a record. */ +UNIV_INLINE +ulint +rec_get_nth_field_offs( +/*===================*/ + /* out: offset from the origin of rec */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len); /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +#define rec_get_nth_field(rec, offsets, n, len) \ +((rec) + rec_get_nth_field_offs(offsets, n, len)) +/********************************************************** +Determine if the offsets are for a record in the new +compact format. */ +UNIV_INLINE +ulint +rec_offs_comp( +/*==========*/ + /* out: nonzero if compact format */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/********************************************************** +Determine if the offsets are for a record containing +externally stored columns. */ +UNIV_INLINE +ulint +rec_offs_any_extern( +/*================*/ + /* out: nonzero if externally stored */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/********************************************************** +Returns nonzero if the extern bit is set in nth field of rec. 
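+(An illustrative sketch, not part of the original interface:
+reading field n of a record through the rec_get_nth_field() macro
+defined above, distinguishing SQL NULL from a zero-length value:
+
+	ulint		len;
+	const byte*	field = rec_get_nth_field(rec, offsets, n, &len);
+
+	if (len == UNIV_SQL_NULL) {
+		... the field is SQL NULL; the bytes at `field'
+		are meaningless ...
+	} else {
+		... `field' points to len bytes of data; len == 0
+		means an empty value, not NULL ...
+	}
+)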
*/ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + /* out: nonzero if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ +/********************************************************** +Returns nonzero if the SQL NULL bit is set in nth field of rec. */ +UNIV_INLINE +ulint +rec_offs_nth_sql_null( +/*==================*/ + /* out: nonzero if SQL NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ +/********************************************************** +Gets the physical size of a field. */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n); /* in: nth field */ + +/********************************************************** +Returns the number of extern bits set in a record. */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + /* out: number of externally stored fields */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/*************************************************************** +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null. +For records in ROW_FORMAT=COMPACT (new-style records), len must not be +UNIV_SQL_NULL unless the field already is SQL null. */ +UNIV_INLINE +void +rec_set_nth_field( +/*==============*/ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data if not SQL null */ + ulint len); /* in: length of the data or UNIV_SQL_NULL */ +/************************************************************** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + /* out: size */ + const rec_t* rec); /* in: physical record */ +/************************************************************** +The following function returns the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + /* out: number of elements */ + const ulint* offsets);/* in: array for rec_get_offsets() */ +/************************************************************** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /* out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc); /* in: number of elements */ +#define rec_offs_init(offsets) \ + rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) +/************************************************************** +The following function returns the number of fields in a record. */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*==============*/ + /* out: number of fields */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. 
SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +Returns the total size of record minus data size of record. +The value returned by the function is the distance from record +start to record origin in bytes. */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +Returns the total size of a physical record. */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +Returns a pointer to the start of the record. */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/************************************************************** +Returns a pointer to the end of the record. */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Copies a physical record to a buffer. */ +UNIV_INLINE +rec_t* +rec_copy( +/*=====*/ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/****************************************************************** +Copies the first n fields of a physical record to a new physical record in +a buffer. */ +UNIV_INTERN +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + /* out, own: copied record */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields + to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, + or NULL */ + ulint* buf_size); /* in/out: buffer size */ +/**************************************************************** +Folds a prefix of a physical record to a ulint. */ +UNIV_INLINE +ulint +rec_fold( +/*=====*/ + /* out: the folded value */ + const rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id) /* in: index tree id */ + __attribute__((pure)); +/************************************************************* +Builds a ROW_FORMAT=COMPACT record out of a data tuple. 
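+(An illustrative sketch, not part of the original interface:
+copying a record into heap-allocated memory with rec_copy()
+declared above.  The buffer must hold both the extra bytes and the
+data bytes, i.e. rec_offs_size(offsets) bytes, and the return value
+is the origin of the copy, not the start of the buffer:
+
+	buf  = mem_heap_alloc(heap, rec_offs_size(offsets));
+	copy = rec_copy(buf, rec, offsets);
+
+Here copy == (byte*) buf + rec_offs_extra_size(offsets).)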
*/ +UNIV_INTERN +void +rec_convert_dtuple_to_rec_comp( +/*===========================*/ + rec_t* rec, /* in: origin of record */ + ulint extra, /* in: number of bytes to + reserve between the record + header and the data payload + (normally REC_N_NEW_EXTRA_BYTES) */ + const dict_index_t* index, /* in: record descriptor */ + ulint status, /* in: status bits of the record */ + const dfield_t* fields, /* in: array of data fields */ + ulint n_fields);/* in: number of data fields */ +/************************************************************* +Builds a physical record out of a data tuple and +stores it into the given buffer. */ +UNIV_INTERN +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* dtuple, /* in: data tuple */ + ulint n_ext); /* in: number of + externally stored columns */ +/************************************************************** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + /* out: extra size */ + ulint data_size, /* in: data size */ + ulint n_fields, /* in: number of fields */ + ulint n_ext) /* in: number of externally stored columns */ + __attribute__((const)); +/************************************************************** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. */ +UNIV_INTERN +ulint +rec_get_converted_size_comp_prefix( +/*===============================*/ + /* out: total size */ + const dict_index_t* index, /* in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + const dfield_t* fields, /* in: array of data fields */ + ulint n_fields,/* in: number of data fields */ + ulint* extra); /* out: extra size */ +/************************************************************** +Determines the size of a data tuple in ROW_FORMAT=COMPACT. */ +UNIV_INTERN +ulint +rec_get_converted_size_comp( +/*========================*/ + /* out: total size */ + const dict_index_t* index, /* in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + ulint status, /* in: status bits of the record */ + const dfield_t* fields, /* in: array of data fields */ + ulint n_fields,/* in: number of data fields */ + ulint* extra); /* out: extra size */ +/************************************************************** +The following function returns the size of a data tuple when converted to +a physical record. */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + const dtuple_t* dtuple, /* in: data tuple */ + ulint n_ext); /* in: number of externally stored columns */ +/****************************************************************** +Copies the first n fields of a physical record to a data tuple. +The fields are copied to the memory heap. */ +UNIV_INTERN +void +rec_copy_prefix_to_dtuple( +/*======================*/ + dtuple_t* tuple, /* out: data tuple */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields + to copy */ + mem_heap_t* heap); /* in: memory heap */ +/******************************************************************* +Validates the consistency of a physical record. 
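+(An illustrative sketch, not part of the original interface:
+building a physical record from a data tuple.  The caller sizes
+the buffer with rec_get_converted_size() and then converts; n_ext
+is the number of externally stored columns in the tuple:
+
+	ulint	size = rec_get_converted_size(index, dtuple, n_ext);
+	byte*	buf  = mem_heap_alloc(heap, size);
+	rec_t*	rec  = rec_convert_dtuple_to_rec(buf, index,
+						 dtuple, n_ext);
+
+The return value is the record origin inside buf.)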
*/ +UNIV_INTERN +ibool +rec_validate( +/*=========*/ + /* out: TRUE if ok */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Prints an old-style physical record. */ +UNIV_INTERN +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + const rec_t* rec); /* in: physical record */ +/******************************************************************* +Prints a physical record in ROW_FORMAT=COMPACT. Ignores the +record header. */ +UNIV_INTERN +void +rec_print_comp( +/*===========*/ + FILE* file, /* in: file where to print */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Prints a physical record. */ +UNIV_INTERN +void +rec_print_new( +/*==========*/ + FILE* file, /* in: file where to print */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Prints a physical record. */ +UNIV_INTERN +void +rec_print( +/*======*/ + FILE* file, /* in: file where to print */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index); /* in: record descriptor */ + +#define REC_INFO_BITS 6 /* This is single byte bit-field */ + +/* Maximum lengths for the data in a physical record if the offsets +are given in one byte (resp. two byte) format. */ +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL + +/* The data size of record must be smaller than this because we reserve +two upmost bits in a two byte offset for special purposes */ +#define REC_MAX_DATA_SIZE (16 * 1024) + +#ifndef UNIV_NONINL +#include "rem0rec.ic" +#endif + +#endif diff --git a/storage/xtradb/include/rem0rec.ic b/storage/xtradb/include/rem0rec.ic new file mode 100644 index 00000000000..0b2b9f4a685 --- /dev/null +++ b/storage/xtradb/include/rem0rec.ic @@ -0,0 +1,1652 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/************************************************************************
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0dict.h"
+
+/* Compact flag ORed to the extra size returned by rec_get_offsets() */
+#define REC_OFFS_COMPACT	((ulint) 1 << 31)
+/* SQL NULL flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_SQL_NULL	((ulint) 1 << 31)
+/* External flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_EXTERNAL	((ulint) 1 << 30)
+/* Mask for offsets returned by rec_get_offsets() */
+#define REC_OFFS_MASK		(REC_OFFS_EXTERNAL - 1)
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+	(1) byte offset		(2) bit usage within byte
+	downward from
+	origin ->	1	8 bits pointer to next record
+			2	8 bits pointer to next record
+			3	1 bit short flag
+				7 bits number of fields
+			4	3 bits number of fields
+				5 bits heap number
+			5	8 bits heap number
+			6	4 bits n_owned
+				4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+	(1) byte offset		(2) bit usage within byte
+	downward from
+	origin ->	1	8 bits relative offset of next record
+			2	8 bits relative offset of next record
+				  the relative offset is an unsigned 16-bit
+				  integer:
+				  (offset_of_next_record
+				   - offset_of_this_record) mod 64Ki,
+				  where mod is the modulo as a non-negative
+				  number;
+				  we can calculate the offset of the next
+				  record with the formula:
+				  relative_offset + offset_of_this_record
+				  mod UNIV_PAGE_SIZE
+			3	3 bits status:
+				  000=conventional record
+				  001=node pointer record (inside B-tree)
+				  010=infimum record
+				  011=supremum record
+				  1xx=reserved
+				5 bits heap number
+			4	8 bits heap number
+			5	4 bits n_owned
+				4 bits info bits
+*/
+
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record.
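+(A worked example of the relative next-record offset stored at
+REC_NEXT, illustrative only: if this record is at page offset 0x70
+and the next record is at page offset 0x30, the stored 16-bit value
+is (0x30 - 0x70) mod 64Ki = 0xFFC0, and the reader recovers
+(0x70 + 0xFFC0) mod UNIV_PAGE_SIZE = 0x30.)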
*/ + +#define REC_NEXT 2 +#define REC_NEXT_MASK 0xFFFFUL +#define REC_NEXT_SHIFT 0 + +#define REC_OLD_SHORT 3 /* This is single byte bit-field */ +#define REC_OLD_SHORT_MASK 0x1UL +#define REC_OLD_SHORT_SHIFT 0 + +#define REC_OLD_N_FIELDS 4 +#define REC_OLD_N_FIELDS_MASK 0x7FEUL +#define REC_OLD_N_FIELDS_SHIFT 1 + +#define REC_NEW_STATUS 3 /* This is single byte bit-field */ +#define REC_NEW_STATUS_MASK 0x7UL +#define REC_NEW_STATUS_SHIFT 0 + +#define REC_OLD_HEAP_NO 5 +#define REC_HEAP_NO_MASK 0xFFF8UL +#if 0 /* defined in rem0rec.h for use of page0zip.c */ +#define REC_NEW_HEAP_NO 4 +#define REC_HEAP_NO_SHIFT 3 +#endif + +#define REC_OLD_N_OWNED 6 /* This is single byte bit-field */ +#define REC_NEW_N_OWNED 5 /* This is single byte bit-field */ +#define REC_N_OWNED_MASK 0xFUL +#define REC_N_OWNED_SHIFT 0 + +#define REC_OLD_INFO_BITS 6 /* This is single byte bit-field */ +#define REC_NEW_INFO_BITS 5 /* This is single byte bit-field */ +#define REC_INFO_BITS_MASK 0xF0UL +#define REC_INFO_BITS_SHIFT 0 + +/* The following masks are used to filter the SQL null bit from +one-byte and two-byte offsets */ + +#define REC_1BYTE_SQL_NULL_MASK 0x80UL +#define REC_2BYTE_SQL_NULL_MASK 0x8000UL + +/* In a 2-byte offset the second most significant bit denotes +a field stored to another page: */ + +#define REC_2BYTE_EXTERN_MASK 0x4000UL + +#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \ + ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \ + ^ 0xFFFFFFFFUL +# error "sum of old-style masks != 0xFFFFFFFFUL" +#endif +#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \ + ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \ + ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \ + ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \ + ^ 0xFFFFFFUL +# error "sum of new-style masks != 0xFFFFFFUL" +#endif + +/*************************************************************** +Sets the value of the ith field SQL null bit of an old-style record. */ +UNIV_INTERN +void +rec_set_nth_field_null_bit( +/*=======================*/ + rec_t* rec, /* in: record */ + ulint i, /* in: ith field */ + ibool val); /* in: value to set */ +/*************************************************************** +Sets an old-style record field to SQL null. +The physical size of the field is not changed. */ +UNIV_INTERN +void +rec_set_nth_field_sql_null( +/*=======================*/ + rec_t* rec, /* in: record */ + ulint n); /* in: index of the field */ + +/********************************************************** +Gets a bit field from within 1 byte. */ +UNIV_INLINE +ulint +rec_get_bit_field_1( +/*================*/ + const rec_t* rec, /* in: pointer to record origin */ + ulint offs, /* in: offset from the origin down */ + ulint mask, /* in: mask used to filter bits */ + ulint shift) /* in: shift right applied after masking */ +{ + ut_ad(rec); + + return((mach_read_from_1(rec - offs) & mask) >> shift); +} + +/********************************************************** +Sets a bit field within 1 byte. 
*/
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+	rec_t*	rec,	/* in: pointer to record origin */
+	ulint	val,	/* in: value to set */
+	ulint	offs,	/* in: offset from the origin down */
+	ulint	mask,	/* in: mask used to filter bits */
+	ulint	shift)	/* in: shift right applied after masking */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask);
+	ut_ad(mask <= 0xFFUL);
+	ut_ad(((mask >> shift) << shift) == mask);
+	ut_ad(((val << shift) & mask) == (val << shift));
+
+	mach_write_to_1(rec - offs,
+			(mach_read_from_1(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/**********************************************************
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+	const rec_t*	rec,	/* in: pointer to record origin */
+	ulint		offs,	/* in: offset from the origin down */
+	ulint		mask,	/* in: mask used to filter bits */
+	ulint		shift)	/* in: shift right applied after masking */
+{
+	ut_ad(rec);
+
+	return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/**********************************************************
+Sets a bit field within 2 bytes. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+	rec_t*	rec,	/* in: pointer to record origin */
+	ulint	val,	/* in: value to set */
+	ulint	offs,	/* in: offset from the origin down */
+	ulint	mask,	/* in: mask used to filter bits */
+	ulint	shift)	/* in: shift right applied after masking */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask > 0xFFUL);
+	ut_ad(mask <= 0xFFFFUL);
+	ut_ad((mask >> shift) & 1);
+	ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+	ut_ad(((mask >> shift) << shift) == mask);
+	ut_ad(((val << shift) & mask) == (val << shift));
+
+	mach_write_to_2(rec - offs,
+			(mach_read_from_2(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/**********************************************************
+The following function is used to get the pointer of the next chained record
+on the same page. */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+			/* out: pointer to the next chained record, or
+			NULL if none */
+	const rec_t*	rec,	/* in: physical record */
+	ulint		comp)	/* in: nonzero=compact page format */
+{
+	ulint	field_value;
+
+	ut_ad(REC_NEXT_MASK == 0xFFFFUL);
+	ut_ad(REC_NEXT_SHIFT == 0);
+
+	field_value = mach_read_from_2(rec - REC_NEXT);
+
+	if (UNIV_UNLIKELY(field_value == 0)) {
+
+		return(NULL);
+	}
+
+	if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+#if UNIV_PAGE_SIZE <= 32768
+		/* Note that for 64 KiB pages, field_value can 'wrap around'
+		and the debug assertion is not valid */
+
+		/* In the following assertion, field_value is interpreted
+		as a signed 16-bit integer in 2's complement arithmetic.
+		If all platforms defined int16_t in the standard headers,
+		the expression could be written more simply as
+		(int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+		*/
+		ut_ad((field_value >= 32768
+		       ? field_value - 65536
+		       : field_value)
+		      + ut_align_offset(rec, UNIV_PAGE_SIZE)
+		      < UNIV_PAGE_SIZE);
+#endif
+		/* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+		bytes between each record.
*/
+		ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+		       && field_value < 32768)
+		      || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+		return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+		       + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+	} else {
+		ut_ad(field_value < UNIV_PAGE_SIZE);
+
+		return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+		       + field_value);
+	}
+}
+
+/**********************************************************
+The following function is used to get the pointer of the next chained record
+on the same page. */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+			/* out: pointer to the next chained record, or
+			NULL if none */
+	rec_t*	rec,	/* in: physical record */
+	ulint	comp)	/* in: nonzero=compact page format */
+{
+	return((rec_t*) rec_get_next_ptr_const(rec, comp));
+}
+
+/**********************************************************
+The following function is used to get the offset of the next chained record
+on the same page. */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+			/* out: the page offset of the next
+			chained record, or 0 if none */
+	const rec_t*	rec,	/* in: physical record */
+	ulint		comp)	/* in: nonzero=compact page format */
+{
+	ulint	field_value;
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+	field_value = mach_read_from_2(rec - REC_NEXT);
+
+	if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) {
+#if UNIV_PAGE_SIZE <= 32768
+		/* Note that for 64 KiB pages, field_value can 'wrap around'
+		and the debug assertion is not valid */
+
+		/* In the following assertion, field_value is interpreted
+		as a signed 16-bit integer in 2's complement arithmetic.
+		If all platforms defined int16_t in the standard headers,
+		the expression could be written more simply as
+		(int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+		*/
+		ut_ad((field_value >= 32768
+		       ? field_value - 65536
+		       : field_value)
+		      + ut_align_offset(rec, UNIV_PAGE_SIZE)
+		      < UNIV_PAGE_SIZE);
+#endif
+		if (UNIV_UNLIKELY(field_value == 0)) {
+
+			return(0);
+		}
+
+		/* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+		bytes between each record. */
+		ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+		       && field_value < 32768)
+		      || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+		return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+	} else {
+		ut_ad(field_value < UNIV_PAGE_SIZE);
+
+		return(field_value);
+	}
+}
+
+/**********************************************************
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+	rec_t*	rec,	/* in: old-style physical record */
+	ulint	next)	/* in: offset of the next record */
+{
+	ut_ad(rec);
+	ut_ad(UNIV_PAGE_SIZE > next);
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+	mach_write_to_2(rec - REC_NEXT, next);
+}
+
+/**********************************************************
+The following function is used to set the next record offset field
+of a new-style record.
*/ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + ulint next) /* in: offset of the next record */ +{ + ulint field_value; + + ut_ad(rec); + ut_ad(UNIV_PAGE_SIZE > next); + + if (UNIV_UNLIKELY(!next)) { + field_value = 0; + } else { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint) + ((lint) next + - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE)); + field_value &= REC_NEXT_MASK; + } + + mach_write_to_2(rec - REC_NEXT, field_value); +} + +/********************************************************** +The following function is used to get the number of fields +in an old-style record. */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + /* out: number of data fields */ + const rec_t* rec) /* in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, + REC_OLD_N_FIELDS_SHIFT); + ut_ad(ret <= REC_MAX_N_FIELDS); + ut_ad(ret > 0); + + return(ret); +} + +/********************************************************** +The following function is used to set the number of fields +in an old-style record. */ +UNIV_INLINE +void +rec_set_n_fields_old( +/*=================*/ + rec_t* rec, /* in: physical record */ + ulint n_fields) /* in: the number of fields */ +{ + ut_ad(rec); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields > 0); + + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/********************************************************** +The following function retrieves the status bits of a new-style record. */ +UNIV_INLINE +ulint +rec_get_status( +/*===========*/ + /* out: status bits */ + const rec_t* rec) /* in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_1(rec, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); + ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0); + + return(ret); +} + +/********************************************************** +The following function is used to get the number of fields +in a record. */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + /* out: number of data fields */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index) /* in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + return(rec_get_n_fields_old(rec)); + } + + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + default: + ut_error; + return(ULINT_UNDEFINED); + } +} + +/********************************************************** +The following function is used to get the number of records owned by the +previous directory record. */ +UNIV_INLINE +ulint +rec_get_n_owned_old( +/*================*/ + /* out: number of owned records */ + const rec_t* rec) /* in: old-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/********************************************************** +The following function is used to set the number of owned records. 
*/ +UNIV_INLINE +void +rec_set_n_owned_old( +/*================*/ + /* out: TRUE on success */ + rec_t* rec, /* in: old-style physical record */ + ulint n_owned) /* in: the number of owned */ +{ + rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); +} + +/********************************************************** +The following function is used to get the number of records owned by the +previous directory record. */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + /* out: number of owned records */ + const rec_t* rec) /* in: new-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/********************************************************** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n_owned)/* in: the number of owned */ +{ + rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + if (UNIV_LIKELY_NULL(page_zip) + && UNIV_LIKELY(rec_get_status(rec) + != REC_STATUS_SUPREMUM)) { + page_zip_rec_set_owned(page_zip, rec, n_owned); + } +} + +/********************************************************** +The following function is used to retrieve the info bits of a record. */ +UNIV_INLINE +ulint +rec_get_info_bits( +/*==============*/ + /* out: info bits */ + const rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ +{ + return(rec_get_bit_field_1( + rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT)); +} + +/********************************************************** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /* in: old-style physical record */ + ulint bits) /* in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} +/********************************************************** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + ulint bits) /* in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); +} + +/********************************************************** +The following function is used to set the status bits of a new-style record. */ +UNIV_INLINE +void +rec_set_status( +/*===========*/ + rec_t* rec, /* in/out: physical record */ + ulint bits) /* in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, + REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); +} + +/********************************************************** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) 
*/ +UNIV_INLINE +ulint +rec_get_info_and_status_bits( +/*=========================*/ + /* out: info bits */ + const rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ +{ + ulint bits; +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) { + bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec); + } else { + bits = rec_get_info_bits(rec, FALSE); + ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); + } + return(bits); +} +/********************************************************** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /* in/out: physical record */ + ulint bits) /* in: info bits */ +{ +#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ +& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) +# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" +#endif + rec_set_status(rec, bits & REC_NEW_STATUS_MASK); + rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK); +} + +/********************************************************** +The following function tells if record is delete marked. */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + /* out: nonzero if delete marked */ + const rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ +{ + if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) { + return(UNIV_UNLIKELY( + rec_get_bit_field_1(rec, REC_NEW_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT))); + } else { + return(UNIV_UNLIKELY( + rec_get_bit_field_1(rec, REC_OLD_INFO_BITS, + REC_INFO_DELETED_FLAG, + REC_INFO_BITS_SHIFT))); + } +} + +/********************************************************** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /* in: old-style physical record */ + ulint flag) /* in: nonzero if delete marked */ +{ + ulint val; + + val = rec_get_info_bits(rec, FALSE); + + if (flag) { + val |= REC_INFO_DELETED_FLAG; + } else { + val &= ~REC_INFO_DELETED_FLAG; + } + + rec_set_info_bits_old(rec, val); +} + +/********************************************************** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint flag) /* in: nonzero if delete marked */ +{ + ulint val; + + val = rec_get_info_bits(rec, TRUE); + + if (flag) { + val |= REC_INFO_DELETED_FLAG; + } else { + val &= ~REC_INFO_DELETED_FLAG; + } + + rec_set_info_bits_new(rec, val); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_rec_set_deleted(page_zip, rec, flag); + } +} + +/********************************************************** +The following function tells if a new-style record is a node pointer. */ +UNIV_INLINE +ibool +rec_get_node_ptr_flag( +/*==================*/ + /* out: TRUE if node pointer */ + const rec_t* rec) /* in: physical record */ +{ + return(REC_STATUS_NODE_PTR == rec_get_status(rec)); +} + +/********************************************************** +The following function is used to get the order number +of an old-style record in the heap of the index page. 
*/ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + /* out: heap order number */ + const rec_t* rec) /* in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/********************************************************** +The following function is used to set the heap number +field in an old-style record. */ +UNIV_INLINE +void +rec_set_heap_no_old( +/*================*/ + rec_t* rec, /* in: physical record */ + ulint heap_no)/* in: the heap number */ +{ + rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); +} + +/********************************************************** +The following function is used to get the order number +of a new-style record in the heap of the index page. */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + /* out: heap order number */ + const rec_t* rec) /* in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/********************************************************** +The following function is used to set the heap number +field in a new-style record. */ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /* in/out: physical record */ + ulint heap_no)/* in: the heap number */ +{ + rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); +} + +/********************************************************** +The following function is used to test whether the data offsets in the record +are stored in one-byte or two-byte format. */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + /* out: TRUE if 1-byte form */ + const rec_t* rec) /* in: physical record */ +{ +#if TRUE != 1 +#error "TRUE != 1" +#endif + + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); +} + +/********************************************************** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /* in: physical record */ + ibool flag) /* in: TRUE if 1byte form */ +{ +#if TRUE != 1 +#error "TRUE != 1" +#endif + ut_ad(flag <= TRUE); + + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); +} + +/********************************************************** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. */ +UNIV_INLINE +ulint +rec_1_get_field_end_info( +/*=====================*/ + /* out: offset of the start of the + field, SQL null flag ORed */ + const rec_t* rec, /* in: record */ + ulint n) /* in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); +} + +/********************************************************** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. 
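+(A worked example, illustrative only: in the 2-byte form, a stored
+end-info value of 0x8005 means that the field ends at offset 5 and
+is SQL NULL (REC_2BYTE_SQL_NULL_MASK is set), while 0x4005 means
+that the field ends at offset 5 and is stored externally
+(REC_2BYTE_EXTERN_MASK is set).)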
*/ +UNIV_INLINE +ulint +rec_2_get_field_end_info( +/*=====================*/ + /* out: offset of the start of the + field, SQL null flag and extern + storage flag ORed */ + const rec_t* rec, /* in: record */ + ulint n) /* in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); +} + +/* Get the base address of offsets. The extra_size is stored at +this position, and following positions hold the end offsets of +the fields. */ +#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE) + +/************************************************************** +The following function returns the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +ulint +rec_offs_get_n_alloc( +/*=================*/ + /* out: number of elements */ + const ulint* offsets)/* in: array for rec_get_offsets() */ +{ + ulint n_alloc; + ut_ad(offsets); + n_alloc = offsets[0]; + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + UNIV_MEM_ASSERT_W(offsets, n_alloc * sizeof *offsets); + return(n_alloc); +} + +/************************************************************** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + ulint* offsets, /* out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /* in: number of elements */ +{ + ut_ad(offsets); + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof *offsets); + offsets[0] = n_alloc; +} + +/************************************************************** +The following function returns the number of fields in a record. */ +UNIV_INLINE +ulint +rec_offs_n_fields( +/*==============*/ + /* out: number of fields */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/**************************************************************** +Validates offsets returned by rec_get_offsets(). 
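The offsets array handled by these functions is a plain ulint[] with a small header: element 0 holds the allocated length, element 1 the field count, and rec_offs_base() skips REC_OFFS_HEADER_SIZE elements to reach the extra-size word followed by the cumulative field end offsets. A standalone sketch of the layout; REC_OFFS_HEADER_SIZE = 2 is an assumption for non-debug builds (debug builds reserve two more slots for the rec and index pointers checked by rec_offs_validate()):

#include <assert.h>

#define REC_OFFS_HEADER_SIZE 2 /* assumed non-debug value */
#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)

int main(void)
{
	/* offsets[0] = allocated length, offsets[1] = n_fields,
	base[0] = extra size | format flags,
	base[1 + i] = cumulative end offset of field i | flag bits */
	unsigned long offsets[REC_OFFS_HEADER_SIZE + 1 + 3] = {
		REC_OFFS_HEADER_SIZE + 1 + 3,	/* n_alloc */
		3,				/* n_fields */
		6,				/* extra size, no flags */
		4, 8, 20			/* field end offsets */
	};

	assert(rec_offs_base(offsets)[1 + 2] == 20);
	return 0;
}
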
*/ +UNIV_INLINE +ibool +rec_offs_validate( +/*==============*/ + /* out: TRUE if valid */ + const rec_t* rec, /* in: record or NULL */ + const dict_index_t* index, /* in: record descriptor or NULL */ + const ulint* offsets)/* in: array returned by + rec_get_offsets() */ +{ + ulint i = rec_offs_n_fields(offsets); + ulint last = ULINT_MAX; + ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT; + + if (rec) { + ut_ad((ulint) rec == offsets[2]); + if (!comp) { + ut_a(rec_get_n_fields_old(rec) >= i); + } + } + if (index) { + ulint max_n_fields; + ut_ad((ulint) index == offsets[3]); + max_n_fields = ut_max( + dict_index_get_n_fields(index), + dict_index_get_n_unique_in_tree(index) + 1); + if (comp && rec) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_NODE_PTR: + max_n_fields = dict_index_get_n_unique_in_tree( + index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + max_n_fields = 1; + break; + default: + ut_error; + } + } + /* index->n_def == 0 for dummy indexes if !comp */ + ut_a(!comp || index->n_def); + ut_a(!index->n_def || i <= max_n_fields); + } + while (i--) { + ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK; + ut_a(curr <= last); + last = curr; + } + return(TRUE); +} +#ifdef UNIV_DEBUG +/**************************************************************** +Updates debug data in offsets, in order to avoid bogus +rec_offs_validate() failures. */ +UNIV_INLINE +void +rec_offs_make_valid( +/*================*/ + const rec_t* rec, /* in: record */ + const dict_index_t* index, /* in: record descriptor */ + ulint* offsets)/* in: array returned by + rec_get_offsets() */ +{ + ut_ad(rec); + ut_ad(index); + ut_ad(offsets); + ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets)); + offsets[2] = (ulint) rec; + offsets[3] = (ulint) index; +} +#endif /* UNIV_DEBUG */ + +/**************************************************************** +The following function is used to get an offset to the nth +data field in a record. */ +UNIV_INLINE +ulint +rec_get_nth_field_offs( +/*===================*/ + /* out: offset from the origin of rec */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index of the field */ + ulint* len) /* out: length of the field; UNIV_SQL_NULL + if SQL null */ +{ + ulint offs; + ulint length; + ut_ad(n < rec_offs_n_fields(offsets)); + ut_ad(len); + + if (UNIV_UNLIKELY(n == 0)) { + offs = 0; + } else { + offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK; + } + + length = rec_offs_base(offsets)[1 + n]; + + if (length & REC_OFFS_SQL_NULL) { + length = UNIV_SQL_NULL; + } else { + length &= REC_OFFS_MASK; + length -= offs; + } + + *len = length; + return(offs); +} + +/********************************************************** +Determine if the offsets are for a record in the new +compact format. */ +UNIV_INLINE +ulint +rec_offs_comp( +/*==========*/ + /* out: nonzero if compact format */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return(*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} + +/********************************************************** +Determine if the offsets are for a record containing +externally stored columns. 
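A sketch of the usual calling pattern built on these accessors elsewhere in InnoDB: compute the offsets once per record, then address every field through them. It assumes rem0rec.h and mem0mem.h (rec_offs_init(), REC_OFFS_NORMAL_SIZE, rec_get_offsets(), mem_heap_free()) plus <stdio.h>, and is not a standalone program:

static void
print_field_lengths_sketch(const rec_t* rec, dict_index_t* index)
{
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	ulint		i;

	rec_offs_init(offsets_);
	offsets = rec_get_offsets(rec, index, offsets,
				  ULINT_UNDEFINED, &heap);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		ulint		len;
		const byte*	data
			= rec_get_nth_field(rec, offsets, i, &len);

		if (len == UNIV_SQL_NULL) {
			fprintf(stderr, "field %lu: NULL\n", (ulong) i);
		} else {
			fprintf(stderr, "field %lu: %lu bytes at %p\n",
				(ulong) i, (ulong) len, (const void*) data);
		}
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
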
*/ +UNIV_INLINE +ulint +rec_offs_any_extern( +/*================*/ + /* out: nonzero if externally stored */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL)); +} + +/********************************************************** +Returns nonzero if the extern bit is set in nth field of rec. */ +UNIV_INLINE +ulint +rec_offs_nth_extern( +/*================*/ + /* out: nonzero if externally stored */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n] + & REC_OFFS_EXTERNAL)); +} + +/********************************************************** +Returns nonzero if the SQL NULL bit is set in nth field of rec. */ +UNIV_INLINE +ulint +rec_offs_nth_sql_null( +/*==================*/ + /* out: nonzero if SQL NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n] + & REC_OFFS_SQL_NULL)); +} + +/********************************************************** +Gets the physical size of a field. */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + /* out: length of field */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: nth field */ +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + if (!n) { + return(rec_offs_base(offsets)[1 + n] & REC_OFFS_MASK); + } + return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n]) + & REC_OFFS_MASK); +} + +/********************************************************** +Returns the number of extern bits set in a record. */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + /* out: number of externally stored fields */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n = 0; + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + if (rec_offs_nth_extern(offsets, i)) { + n++; + } + } + } + + return(n); +} + +/********************************************************** +Returns the offset of n - 1th field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. This function and the 2-byte counterpart are defined here because the +C-compiler was not able to sum negative and positive constant offsets, and +warned of constant arithmetic overflow within the compiler. */ +UNIV_INLINE +ulint +rec_1_get_prev_field_end_info( +/*==========================*/ + /* out: offset of the start of the + PREVIOUS field, SQL null flag ORed */ + const rec_t* rec, /* in: record */ + ulint n) /* in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); +} + +/********************************************************** +Returns the offset of n - 1th field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. 
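The SQL null and extern flags live in the top bits of each cumulative end offset, so a field's size is just the masked difference of two neighbouring ends, exactly as rec_offs_nth_size() computes it. A standalone sketch; the flag bit positions are assumptions modelled on rem0rec.h:

#include <assert.h>

#define REC_OFFS_SQL_NULL ((unsigned long) 1 << 31) /* assumed */
#define REC_OFFS_EXTERNAL ((unsigned long) 1 << 30) /* assumed */
#define REC_OFFS_MASK     (REC_OFFS_EXTERNAL - 1)   /* assumed */

int main(void)
{
	/* ends[n] = cumulative end of field n, flags ORed in;
	field 1 is SQL NULL, so its end equals field 0's end. */
	unsigned long ends[3] = { 4, 4 | REC_OFFS_SQL_NULL, 10 };

	assert((ends[0] & REC_OFFS_MASK) == 4);		/* size 4  */
	assert(ends[1] & REC_OFFS_SQL_NULL);		/* is NULL */
	assert(((ends[2] - ends[1]) & REC_OFFS_MASK) == 6); /* size 6 */
	return 0;
}
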
*/ +UNIV_INLINE +ulint +rec_2_get_prev_field_end_info( +/*==========================*/ + /* out: offset of the start of the + PREVIOUS field, SQL null flag ORed */ + const rec_t* rec, /* in: record */ + ulint n) /* in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); +} + +/********************************************************** +Sets the field end info for the nth field if the record is stored in the +1-byte format. */ +UNIV_INLINE +void +rec_1_set_field_end_info( +/*=====================*/ + rec_t* rec, /* in: record */ + ulint n, /* in: field index */ + ulint info) /* in: value to set */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); +} + +/********************************************************** +Sets the field end info for the nth field if the record is stored in the +2-byte format. */ +UNIV_INLINE +void +rec_2_set_field_end_info( +/*=====================*/ + rec_t* rec, /* in: record */ + ulint n, /* in: field index */ + ulint info) /* in: value to set */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); +} + +/********************************************************** +Returns the offset of nth field start if the record is stored in the 1-byte +offsets form. */ +UNIV_INLINE +ulint +rec_1_get_field_start_offs( +/*=======================*/ + /* out: offset of the start of the field */ + const rec_t* rec, /* in: record */ + ulint n) /* in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_1_get_prev_field_end_info(rec, n) + & ~REC_1BYTE_SQL_NULL_MASK); +} + +/********************************************************** +Returns the offset of nth field start if the record is stored in the 2-byte +offsets form. */ +UNIV_INLINE +ulint +rec_2_get_field_start_offs( +/*=======================*/ + /* out: offset of the start of the field */ + const rec_t* rec, /* in: record */ + ulint n) /* in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_2_get_prev_field_end_info(rec, n) + & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK)); +} + +/********************************************************** +The following function is used to read the offset of the start of a data field +in the record. The start of an SQL null field is the end offset of the +previous non-null field, or 0, if none exists. If n is the number of the last +field + 1, then the end offset of the last field is returned. */ +UNIV_INLINE +ulint +rec_get_field_start_offs( +/*=====================*/ + /* out: offset of the start of the field */ + const rec_t* rec, /* in: record */ + ulint n) /* in: field index */ +{ + ut_ad(rec); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + if (rec_get_1byte_offs_flag(rec)) { + + return(rec_1_get_field_start_offs(rec, n)); + } + + return(rec_2_get_field_start_offs(rec, n)); +} + +/**************************************************************** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. 
*/ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + /* out: field size in bytes */ + const rec_t* rec, /* in: record */ + ulint n) /* in: index of the field */ +{ + ulint os; + ulint next_os; + + os = rec_get_field_start_offs(rec, n); + next_os = rec_get_field_start_offs(rec, n + 1); + + ut_ad(next_os - os < UNIV_PAGE_SIZE); + + return(next_os - os); +} + +/*************************************************************** +This is used to modify the value of an already existing field in a record. +The previous value must have exactly the same size as the new value. If len +is UNIV_SQL_NULL then the field is treated as an SQL null. +For records in ROW_FORMAT=COMPACT (new-style records), len must not be +UNIV_SQL_NULL unless the field already is SQL null. */ +UNIV_INLINE +void +rec_set_nth_field( +/*==============*/ + rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n, /* in: index number of the field */ + const void* data, /* in: pointer to the data + if not SQL null */ + ulint len) /* in: length of the data or UNIV_SQL_NULL */ +{ + byte* data2; + ulint len2; + + ut_ad(rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (UNIV_UNLIKELY(len == UNIV_SQL_NULL)) { + if (!rec_offs_nth_sql_null(offsets, n)) { + ut_a(!rec_offs_comp(offsets)); + rec_set_nth_field_sql_null(rec, n); + } + + return; + } + + data2 = rec_get_nth_field(rec, offsets, n, &len2); + if (len2 == UNIV_SQL_NULL) { + ut_ad(!rec_offs_comp(offsets)); + rec_set_nth_field_null_bit(rec, n, FALSE); + ut_ad(len == rec_get_nth_field_size(rec, n)); + } else { + ut_ad(len2 == len); + } + + ut_memcpy(data2, data, len); +} + +/************************************************************** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + /* out: size */ + const rec_t* rec) /* in: physical record */ +{ + ut_ad(rec); + + return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec))); +} + +/************************************************************** +The following function sets the number of fields in offsets. */ +UNIV_INLINE +void +rec_offs_set_n_fields( +/*==================*/ + ulint* offsets, /* in/out: array returned by + rec_get_offsets() */ + ulint n_fields) /* in: number of fields */ +{ + ut_ad(offsets); + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + offsets[1] = n_fields; +} + +/************************************************************** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint size; + + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)] + & REC_OFFS_MASK; + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} + +/************************************************************** +Returns the total size of record minus data size of record. 
The value +returned by the function is the distance from record start to record origin +in bytes. */ +UNIV_INLINE +ulint +rec_offs_extra_size( +/*================*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint size; + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + size = *rec_offs_base(offsets) & ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL); + ut_ad(size < UNIV_PAGE_SIZE); + return(size); +} + +/************************************************************** +Returns the total size of a physical record. */ +UNIV_INLINE +ulint +rec_offs_size( +/*==========*/ + /* out: size */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets)); +} + +/************************************************************** +Returns a pointer to the end of the record. */ +UNIV_INLINE +byte* +rec_get_end( +/*========*/ + /* out: pointer to end */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(rec + rec_offs_data_size(offsets)); +} + +/************************************************************** +Returns a pointer to the start of the record. */ +UNIV_INLINE +byte* +rec_get_start( +/*==========*/ + /* out: pointer to start */ + rec_t* rec, /* in: pointer to record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(rec - rec_offs_extra_size(offsets)); +} + +/******************************************************************* +Copies a physical record to a buffer. */ +UNIV_INLINE +rec_t* +rec_copy( +/*=====*/ + /* out: pointer to the origin of the copy */ + void* buf, /* in: buffer */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint extra_len; + ulint data_len; + + ut_ad(rec && buf); + ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + + extra_len = rec_offs_extra_size(offsets); + data_len = rec_offs_data_size(offsets); + + ut_memcpy(buf, rec - extra_len, extra_len + data_len); + + return((byte*)buf + extra_len); +} + +/************************************************************** +Returns the extra size of an old-style physical record if we know its +data size and number of fields. */ +UNIV_INLINE +ulint +rec_get_converted_extra_size( +/*=========================*/ + /* out: extra size */ + ulint data_size, /* in: data size */ + ulint n_fields, /* in: number of fields */ + ulint n_ext) /* in: number of externally stored columns */ +{ + if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) { + + return(REC_N_OLD_EXTRA_BYTES + n_fields); + } + + return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields); +} + +/************************************************************** +The following function returns the size of a data tuple when converted to +a physical record. */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + /* out: size */ + dict_index_t* index, /* in: record descriptor */ + const dtuple_t* dtuple, /* in: data tuple */ + ulint n_ext) /* in: number of externally stored columns */ +{ + ulint data_size; + ulint extra_size; + + ut_ad(index); + ut_ad(dtuple); + ut_ad(dtuple_check_typed(dtuple)); + + ut_ad(index->type & DICT_UNIVERSAL + || dtuple_get_n_fields(dtuple) + == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) + ? 
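rec_copy() relies on the record occupying the contiguous span [origin - extra_size, origin + data_size): one memcpy moves both parts, and the copy's origin is buf + extra_size, as rec_get_start() and rec_get_end() imply. A standalone sketch of that layout:

#include <assert.h>
#include <string.h>

int main(void)
{
	unsigned char	page[64];
	unsigned char	buf[64];
	unsigned	extra = 7;	 /* = rec_offs_extra_size() */
	unsigned	data = 20;	 /* = rec_offs_data_size()  */
	unsigned char*	rec = page + 30; /* record origin */
	unsigned char*	copy;
	unsigned	i;

	for (i = 0; i < sizeof page; i++) {
		page[i] = (unsigned char) i;
	}

	/* As in rec_copy(): copy extra bytes and data in one go... */
	memcpy(buf, rec - extra, extra + data);
	copy = buf + extra;		/* ...and return the new origin */

	assert(!memcmp(copy - extra, rec - extra, extra + data));
	return 0;
}
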
dict_index_get_n_unique_in_tree(index) + 1 + : dict_index_get_n_fields(index))); + + if (dict_table_is_comp(index->table)) { + return(rec_get_converted_size_comp(index, + dtuple_get_info_bits(dtuple) + & REC_NEW_STATUS_MASK, + dtuple->fields, + dtuple->n_fields, NULL)); + } + + data_size = dtuple_get_data_size(dtuple); + + extra_size = rec_get_converted_extra_size( + data_size, dtuple_get_n_fields(dtuple), n_ext); + + return(data_size + extra_size); +} + +/**************************************************************** +Folds a prefix of a physical record to a ulint. Folds only existing fields, +that is, checks that we do not run out of the record. */ +UNIV_INLINE +ulint +rec_fold( +/*=====*/ + /* out: the folded value */ + const rec_t* rec, /* in: the physical record */ + const ulint* offsets, /* in: array returned by + rec_get_offsets() */ + ulint n_fields, /* in: number of complete + fields to fold */ + ulint n_bytes, /* in: number of bytes to fold + in an incomplete last field */ + dulint tree_id) /* in: index tree id */ +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(n_fields + n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_dulint(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} diff --git a/storage/xtradb/include/rem0types.h b/storage/xtradb/include/rem0types.h new file mode 100644 index 00000000000..d0b11b92495 --- /dev/null +++ b/storage/xtradb/include/rem0types.h @@ -0,0 +1,45 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +Record manager global types + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0types_h +#define rem0types_h + +/* We define the physical record simply as an array of bytes */ +typedef byte rec_t; + +/* Maximum values for various fields (for non-blob tuples) */ +#define REC_MAX_N_FIELDS (1024 - 1) +#define REC_MAX_HEAP_NO (2 * 8192 - 1) +#define REC_MAX_N_OWNED (16 - 1) + +/* REC_MAX_INDEX_COL_LEN is measured in bytes and is the maximum +indexed column length (or indexed prefix length). It is set to 3*256, +so that one can create a column prefix index on 256 characters of a +TEXT or VARCHAR column also in the UTF-8 charset. In that charset, +a character may take at most 3 bytes. +This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ +#define REC_MAX_INDEX_COL_LEN 768 + +#endif diff --git a/storage/xtradb/include/row0ext.h b/storage/xtradb/include/row0ext.h new file mode 100644 index 00000000000..08ebafa4d98 --- /dev/null +++ b/storage/xtradb/include/row0ext.h @@ -0,0 +1,98 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#ifndef row0ext_h +#define row0ext_h + +#include "univ.i" +#include "row0types.h" +#include "data0types.h" +#include "mem0mem.h" + +/************************************************************************ +Creates a cache of column prefixes of externally stored columns. */ +UNIV_INTERN +row_ext_t* +row_ext_create( +/*===========*/ + /* out,own: column prefix cache */ + ulint n_ext, /* in: number of externally stored columns */ + const ulint* ext, /* in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + const dtuple_t* tuple, /* in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). 
*/ + ulint zip_size,/* compressed page size in bytes, or 0 */ + mem_heap_t* heap); /* in: heap where created */ + +/************************************************************************ +Looks up a column prefix of an externally stored column. */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + /* out: column prefix, or NULL if + the column is not stored externally, + or pointer to field_ref_zero + if the BLOB pointer is unset */ + const row_ext_t* ext, /* in/out: column prefix cache */ + ulint i, /* in: index of ext->ext[] */ + ulint* len); /* out: length of prefix, in bytes, + at most REC_MAX_INDEX_COL_LEN */ +/************************************************************************ +Looks up a column prefix of an externally stored column. */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + /* out: column prefix, or NULL if + the column is not stored externally, + or pointer to field_ref_zero + if the BLOB pointer is unset */ + const row_ext_t* ext, /* in: column prefix cache */ + ulint col, /* in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len); /* out: length of prefix, in bytes, + at most REC_MAX_INDEX_COL_LEN */ + +/* Prefixes of externally stored columns */ +struct row_ext_struct{ + ulint n_ext; /* number of externally stored columns */ + const ulint* ext; /* col_no's of externally stored columns */ + byte* buf; /* backing store of the column prefix cache */ + ulint len[1]; /* prefix lengths; 0 if not cached */ +}; + +#ifndef UNIV_NONINL +#include "row0ext.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0ext.ic b/storage/xtradb/include/row0ext.ic new file mode 100644 index 00000000000..e56fc175764 --- /dev/null +++ b/storage/xtradb/include/row0ext.ic @@ -0,0 +1,87 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "rem0types.h" +#include "btr0types.h" + +/************************************************************************ +Looks up a column prefix of an externally stored column. 
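The trailing len[1] member is the usual C89 flexible-array idiom: a single allocation holds the header, all n_ext prefix lengths, and the prefix buffer. The following sketch illustrates that idiom only; it is an assumption about the shape of the allocation, not the actual row_ext_create() body, and row_ext_alloc_sketch is a hypothetical name:

static row_ext_t*
row_ext_alloc_sketch(ulint n_ext, mem_heap_t* heap)
{
	/* One allocation covers the struct (whose len[1] is the first
	of n_ext length slots) and the prefix buffer behind it. */
	row_ext_t*	ret = mem_heap_alloc(
		heap,
		sizeof(row_ext_t) + (n_ext - 1) * sizeof ret->len[0]
		+ n_ext * REC_MAX_INDEX_COL_LEN);

	ret->n_ext = n_ext;
	ret->buf = (byte*) ret + sizeof(row_ext_t)
		+ (n_ext - 1) * sizeof ret->len[0];
	memset(ret->len, 0, n_ext * sizeof ret->len[0]); /* not cached */
	/* ret->ext and the prefix buffer would be filled in next. */
	return(ret);
}
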
*/ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + /* out: column prefix, or NULL if + the column is not stored externally, + or pointer to field_ref_zero + if the BLOB pointer is unset */ + const row_ext_t* ext, /* in/out: column prefix cache */ + ulint i, /* in: index of ext->ext[] */ + ulint* len) /* out: length of prefix, in bytes, + at most REC_MAX_INDEX_COL_LEN */ +{ + ut_ad(ext); + ut_ad(len); + ut_ad(i < ext->n_ext); + + *len = ext->len[i]; + + if (UNIV_UNLIKELY(*len == 0)) { + /* The BLOB could not be fetched to the cache. */ + return(field_ref_zero); + } else { + return(ext->buf + i * REC_MAX_INDEX_COL_LEN); + } +} + +/************************************************************************ +Looks up a column prefix of an externally stored column. */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + /* out: column prefix, or NULL if + the column is not stored externally, + or pointer to field_ref_zero + if the BLOB pointer is unset */ + const row_ext_t* ext, /* in: column prefix cache */ + ulint col, /* in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len) /* out: length of prefix, in bytes, + at most REC_MAX_INDEX_COL_LEN */ +{ + ulint i; + + ut_ad(ext); + ut_ad(len); + + for (i = 0; i < ext->n_ext; i++) { + if (col == ext->ext[i]) { + return(row_ext_lookup_ith(ext, i, len)); + } + } + + return(NULL); +} diff --git a/storage/xtradb/include/row0ins.h b/storage/xtradb/include/row0ins.h new file mode 100644 index 00000000000..6aa83bed0f6 --- /dev/null +++ b/storage/xtradb/include/row0ins.h @@ -0,0 +1,157 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0ins_h +#define row0ins_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" + +/******************************************************************* +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_foreign_key_check_lock. 
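A sketch of how a caller might distinguish the three possible results of row_ext_lookup(); it assumes the declarations above plus btr0types.h (field_ref_zero) and is not a standalone program:

static void
use_prefix_sketch(const row_ext_t* ext, ulint col_no)
{
	ulint		len;
	const byte*	prefix = row_ext_lookup(ext, col_no, &len);

	if (prefix == NULL) {
		/* The column is not stored externally. */
	} else if (prefix == field_ref_zero) {
		/* The BLOB pointer is unset: no prefix available. */
	} else {
		/* Up to REC_MAX_INDEX_COL_LEN bytes of the column. */
		ut_ad(len <= REC_MAX_INDEX_COL_LEN);
	}
}
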
*/ +UNIV_INTERN +ulint +row_ins_check_foreign_constraint( +/*=============================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_NO_REFERENCED_ROW, + or DB_ROW_IS_REFERENCED */ + ibool check_ref,/* in: TRUE if we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/* in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /* in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /* in: index entry for index */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Creates an insert node struct. */ +UNIV_INTERN +ins_node_t* +ins_node_create( +/*============*/ + /* out, own: insert node struct */ + ulint ins_type, /* in: INS_VALUES, ... */ + dict_table_t* table, /* in: table where to insert */ + mem_heap_t* heap); /* in: mem heap where created */ +/************************************************************************* +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. */ +UNIV_INTERN +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /* in: insert node */ + dtuple_t* row); /* in: new row (or first row) for the node */ +/******************************************************************* +Inserts an index entry into an index. Tries first an optimistic, then a pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. */ +UNIV_INTERN +ulint +row_ins_index_entry( +/*================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DUPLICATE_KEY, or some other error code */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + ulint n_ext, /* in: number of externally stored columns */ + ibool foreign,/* in: TRUE=check foreign key constraints */ + que_thr_t* thr); /* in: query thread */ +/*************************************************************** +Inserts a row into a table. This is a high-level function used in +SQL execution graphs. */ +UNIV_INTERN +que_thr_t* +row_ins_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/*************************************************************** +Creates an entry template for each index of a table. 
*/ +UNIV_INTERN +void +ins_node_create_entry_list( +/*=======================*/ + ins_node_t* node); /* in: row insert node */ + +/* Insert node structure */ + +struct ins_node_struct{ + que_common_t common; /* node type: QUE_NODE_INSERT */ + ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */ + dtuple_t* row; /* row to insert */ + dict_table_t* table; /* table where to insert */ + sel_node_t* select; /* select in searched insert */ + que_node_t* values_list;/* list of expressions to evaluate and + insert in an INS_VALUES insert */ + ulint state; /* node execution state */ + dict_index_t* index; /* NULL, or the next index where the index + entry should be inserted */ + dtuple_t* entry; /* NULL, or entry to insert in the index; + after a successful insert of the entry, + this should be reset to NULL */ + UT_LIST_BASE_NODE_T(dtuple_t) + entry_list;/* list of entries, one for each index */ + byte* row_id_buf;/* buffer for the row id sys field in row */ + dulint trx_id; /* trx id or the last trx which executed the + node */ + byte* trx_id_buf;/* buffer for the trx id sys field in row */ + mem_heap_t* entry_sys_heap; + /* memory heap used as auxiliary storage; + entry_list and sys fields are stored here; + if this is NULL, entry list should be created + and buffers for sys fields in row allocated */ + ulint magic_n; +}; + +#define INS_NODE_MAGIC_N 15849075 + +/* Insert node types */ +#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */ +#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */ +#define INS_DIRECT 2 /* this is for internal use in dict0crea: + insert the row directly */ + +/* Node execution states */ +#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */ +#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */ +#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and + inserted */ + +#ifndef UNIV_NONINL +#include "row0ins.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0ins.ic b/storage/xtradb/include/row0ins.ic new file mode 100644 index 00000000000..b7aeaf97834 --- /dev/null +++ b/storage/xtradb/include/row0ins.ic @@ -0,0 +1,25 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + + diff --git a/storage/xtradb/include/row0merge.h b/storage/xtradb/include/row0merge.h new file mode 100644 index 00000000000..9975497cbeb --- /dev/null +++ b/storage/xtradb/include/row0merge.h @@ -0,0 +1,198 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Index build routines using a merge sort + +Created 13/06/2005 Jan Lindstrom +*******************************************************/ + +#ifndef row0merge_h +#define row0merge_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "rem0rec.h" +#include "read0types.h" +#include "btr0types.h" +#include "row0mysql.h" +#include "lock0types.h" + +/* This structure holds index field definitions */ + +struct merge_index_field_struct { + ulint prefix_len; /* Prefix len */ + const char* field_name; /* Field name */ +}; + +typedef struct merge_index_field_struct merge_index_field_t; + +/* This structure holds index definitions */ + +struct merge_index_def_struct { + const char* name; /* Index name */ + ulint ind_type; /* 0, DICT_UNIQUE, + or DICT_CLUSTERED */ + ulint n_fields; /* Number of fields in index */ + merge_index_field_t* fields; /* Field definitions */ +}; + +typedef struct merge_index_def_struct merge_index_def_t; + +/************************************************************************* +Sets an exclusive lock on a table, for the duration of creating indexes. */ +UNIV_INTERN +ulint +row_merge_lock_table( +/*=================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in/out: transaction */ + dict_table_t* table, /* in: table to lock */ + enum lock_mode mode); /* in: LOCK_X or LOCK_S */ +/************************************************************************* +Drop an index from the InnoDB system tables. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. */ +UNIV_INTERN +void +row_merge_drop_index( +/*=================*/ + dict_index_t* index, /* in: index to be removed */ + dict_table_t* table, /* in: table */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* +Drop those indexes which were created before an error occurred when +building an index. The data dictionary must have been locked +exclusively by the caller, because the transaction will not be +committed. */ +UNIV_INTERN +void +row_merge_drop_indexes( +/*===================*/ + trx_t* trx, /* in: transaction */ + dict_table_t* table, /* in: table containing the indexes */ + dict_index_t** index, /* in: indexes to drop */ + ulint num_created); /* in: number of elements in index[] */ +/************************************************************************* +Drop all partially created indexes during crash recovery. 
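A sketch of filling in the two definition structs above for a hypothetical two-column unique index; the names and prefix lengths are illustrative only, and build_index_def_sketch is not a function in this patch (assumes dict0mem.h for DICT_UNIQUE and mem0mem.h):

static void
build_index_def_sketch(mem_heap_t* heap, merge_index_def_t* def)
{
	merge_index_field_t*	fields = mem_heap_alloc(
		heap, 2 * sizeof *fields);

	fields[0].prefix_len = 0;	/* index the whole column */
	fields[0].field_name = "a";
	fields[1].prefix_len = 100;	/* index a 100-byte prefix */
	fields[1].field_name = "b";

	def->name = "idx_a_b";
	def->ind_type = DICT_UNIQUE;
	def->n_fields = 2;
	def->fields = fields;
}
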
*/ +UNIV_INTERN +void +row_merge_drop_temp_indexes(void); +/*=============================*/ +/************************************************************************* +Rename the tables in the data dictionary. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. */ +UNIV_INTERN +ulint +row_merge_rename_tables( +/*====================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* old_table, /* in/out: old table, renamed to + tmp_name */ + dict_table_t* new_table, /* in/out: new table, renamed to + old_table->name */ + const char* tmp_name, /* in: new name for old_table */ + trx_t* trx); /* in: transaction handle */ + +/************************************************************************* +Create a temporary table for creating a primary key, using the definition +of an existing table. */ +UNIV_INTERN +dict_table_t* +row_merge_create_temporary_table( +/*=============================*/ + /* out: table, + or NULL on error */ + const char* table_name, /* in: new table name */ + const merge_index_def_t*index_def, /* in: the index definition + of the primary key */ + const dict_table_t* table, /* in: old table definition */ + trx_t* trx); /* in/out: transaction + (sets error_state) */ +/************************************************************************* +Rename the temporary indexes in the dictionary to permanent ones. The +data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +UNIV_INTERN +ulint +row_merge_rename_indexes( +/*=====================*/ + /* out: DB_SUCCESS if all OK */ + trx_t* trx, /* in/out: transaction */ + dict_table_t* table); /* in/out: table with new indexes */ +/************************************************************************* +Create the index and load it into the dictionary. */ +UNIV_INTERN +dict_index_t* +row_merge_create_index( +/*===================*/ + /* out: index, or NULL on error */ + trx_t* trx, /* in/out: trx (sets error_state) */ + dict_table_t* table, /* in: the index is on this table */ + const merge_index_def_t* /* in: the index definition */ + index_def); +#ifdef ROW_MERGE_IS_INDEX_USABLE +/************************************************************************* +Check if a transaction can use an index. */ +UNIV_INTERN +ibool +row_merge_is_index_usable( +/*======================*/ + /* out: TRUE if the index can be used by + the transaction, else FALSE */ + const trx_t* trx, /* in: transaction */ + const dict_index_t* index); /* in: index to check */ +#endif /* ROW_MERGE_IS_INDEX_USABLE */ +/************************************************************************* +If there are views that refer to the old table name, then we "attach" to +the new instance of the table; else we drop it immediately. */ +UNIV_INTERN +ulint +row_merge_drop_table( +/*=================*/ + /* out: DB_SUCCESS or error code */ + trx_t* trx, /* in: transaction */ + dict_table_t* table); /* in: table instance to drop */ + +/************************************************************************* +Build indexes on a table by reading a clustered index, +creating a temporary file containing index entries, merge sorting +these index entries and inserting sorted index entries into the indexes. 
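A rough outline of the call order these declarations imply when the MySQL layer adds a secondary index: lock the table, create the (temporary) index object, then build it by merge sort, dropping it again on failure. This is an illustration of the sequence only, not the actual handler code; error handling and the final row_merge_rename_indexes() step at commit are elided, and add_index_sketch is a hypothetical name:

static ulint
add_index_sketch(trx_t* trx, dict_table_t* table,
		 const merge_index_def_t* def, TABLE* mysql_table)
{
	dict_index_t*	index;
	ulint		err;

	err = row_merge_lock_table(trx, table, LOCK_S);
	if (err != DB_SUCCESS) {
		return(err);
	}

	index = row_merge_create_index(trx, table, def);
	if (!index) {
		return(trx->error_state);
	}

	/* old_table == new_table: no PRIMARY KEY rebuild needed */
	err = row_merge_build_indexes(trx, table, table,
				      &index, 1, mysql_table);
	if (err != DB_SUCCESS) {
		row_merge_drop_indexes(trx, table, &index, 1);
	}

	return(err);
}
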
*/ +UNIV_INTERN +ulint +row_merge_build_indexes( +/*====================*/ + /* out: DB_SUCCESS or error code */ + trx_t* trx, /* in: transaction */ + dict_table_t* old_table, /* in: table where rows are + read from */ + dict_table_t* new_table, /* in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + dict_index_t** indexes, /* in: indexes to be created */ + ulint n_indexes, /* in: size of indexes[] */ + TABLE* table); /* in/out: MySQL table, for + reporting erroneous key value + if applicable */ +#endif /* row0merge.h */ diff --git a/storage/xtradb/include/row0mysql.h b/storage/xtradb/include/row0mysql.h new file mode 100644 index 00000000000..c1e11124a5d --- /dev/null +++ b/storage/xtradb/include/row0mysql.h @@ -0,0 +1,769 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. + +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#ifndef row0mysql_h +#define row0mysql_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "btr0pcur.h" +#include "trx0types.h" + +extern ibool row_rollback_on_timeout; + +typedef struct row_prebuilt_struct row_prebuilt_t; + +/*********************************************************************** +Frees the blob heap in prebuilt when no longer needed. */ +UNIV_INTERN +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt); /* in: prebuilt struct of a + ha_innobase:: table handle */ +/*********************************************************************** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. */ +UNIV_INTERN +byte* +row_mysql_store_true_var_len( +/*=========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + byte* dest, /* in: where to store */ + ulint len, /* in: length, must fit in two bytes */ + ulint lenlen);/* in: storage length of len: either 1 or 2 bytes */ +/*********************************************************************** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. 
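A standalone mimic of the >= 5.0.3 true VARCHAR length prefix that the two functions above write and read: the length occupies 1 or 2 bytes before the data. The little-endian byte order shown is an assumption about the MySQL row format:

#include <assert.h>

static unsigned char*
store_true_var_len(unsigned char* dest, unsigned len, unsigned lenlen)
{
	dest[0] = (unsigned char) len;		/* low byte */
	if (lenlen == 2) {
		dest[1] = (unsigned char) (len >> 8);
	}
	return(dest + lenlen);	/* the payload data begins here */
}

int main(void)
{
	unsigned char	field[2 + 300];
	unsigned char*	data = store_true_var_len(field, 300, 2);

	assert(data == field + 2);
	assert(field[0] + (field[1] << 8) == 300);
	return 0;
}
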
*/ + +const byte* +row_mysql_read_true_varchar( +/*========================*/ + /* out: pointer to the data, we skip + the 1 or 2 bytes at the start that are + used to store the len */ + ulint* len, /* out: variable-length field length */ + const byte* field, /* in: field in the MySQL format */ + ulint lenlen);/* in: storage length of len: either 1 + or 2 bytes */ +/*********************************************************************** +Stores a reference to a BLOB in the MySQL format. */ +UNIV_INTERN +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /* in: where to store */ + ulint col_len,/* in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /* in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len); /* in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +/*********************************************************************** +Reads a reference to a BLOB in the MySQL format. */ + +const byte* +row_mysql_read_blob_ref( +/*====================*/ + /* out: pointer to BLOB data */ + ulint* len, /* out: BLOB length */ + const byte* ref, /* in: BLOB reference in the + MySQL format */ + ulint col_len); /* in: BLOB reference length + (not BLOB length) */ +/****************************************************************** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.c. */ +UNIV_INTERN +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + /* out: up to which byte we used + buf in the conversion */ + dfield_t* dfield, /* in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /* in/out: buffer for a converted + integer value; this must be at least + col_len long then! */ + ibool row_format_col, /* TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /* in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /* in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp); /* in: nonzero=compact format */ +/******************************************************************** +Handles user errors and lock waits detected by the database engine. */ +UNIV_INTERN +ibool +row_mysql_handle_errors( +/*====================*/ + /* out: TRUE if it was a lock wait and + we should continue running the query thread */ + ulint* new_err,/* out: possible new error encountered in + rollback, or the old error which was + during the function entry */ + trx_t* trx, /* in: transaction */ + que_thr_t* thr, /* in: query thread */ + trx_savept_t* savept);/* in: savepoint */ +/************************************************************************ +Create a prebuilt struct for a MySQL table handle. 
*/ +UNIV_INTERN +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + /* out, own: a prebuilt struct */ + dict_table_t* table); /* in: Innobase table handle */ +/************************************************************************ +Free a prebuilt struct for a MySQL table handle. */ +UNIV_INTERN +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt, /* in, own: prebuilt struct */ + ibool dict_locked); /* in: TRUE=data dictionary locked */ +/************************************************************************* +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +UNIV_INTERN +void +row_update_prebuilt_trx( +/*====================*/ + /* out: prebuilt dtuple */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* +Unlocks AUTO_INC type locks that were possibly reserved by a trx. */ +UNIV_INTERN +void +row_unlock_table_autoinc_for_mysql( +/*===============================*/ + trx_t* trx); /* in/out: transaction */ +/************************************************************************* +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. */ +UNIV_INTERN +int +row_lock_table_autoinc_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in the MySQL + table handle */ +/************************************************************************* +Sets a table lock on the table mentioned in prebuilt. */ +UNIV_INTERN +int +row_lock_table_for_mysql( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in the MySQL + table handle */ + dict_table_t* table, /* in: table to lock, or NULL + if prebuilt->table should be + locked as + prebuilt->select_lock_type */ + ulint mode); /* in: lock mode of table + (ignored if table==NULL) */ + +/************************************************************************* +Does an insert for MySQL. */ +UNIV_INTERN +int +row_insert_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: row in the MySQL format */ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL + handle */ +/************************************************************************* +Builds a dummy query graph used in selects. */ +UNIV_INTERN +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL + handle */ +/************************************************************************* +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. */ +UNIV_INTERN +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + /* out: prebuilt update vector */ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL + handle */ +/************************************************************************* +Checks if a table is such that we automatically created a clustered +index on it (on row id). 
*/ +UNIV_INTERN +ibool +row_table_got_default_clust_index( +/*==============================*/ + const dict_table_t* table); +/************************************************************************* +Calculates the key number used inside MySQL for an Innobase index. We have +to take into account if we generated a default clustered index for the table. */ +UNIV_INTERN +ulint +row_get_mysql_key_number_for_index( +/*===============================*/ + const dict_index_t* index); +/************************************************************************* +Does an update or delete of a row for MySQL. */ +UNIV_INTERN +int +row_update_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL + handle */ +/************************************************************************* +This can only be used when srv_locks_unsafe_for_binlog is TRUE or the +session is using a READ COMMITTED isolation level. Before +calling this function we must use trx_reset_new_rec_lock_info() and +trx_register_new_rec_lock() to store the information on which new record locks +really were set. This function removes a newly set lock under prebuilt->pcur, +and also under prebuilt->clust_pcur. Currently, this is only used and tested +in the case of an UPDATE or a DELETE statement, where the row lock is of the +LOCK_X type. +Thus, this implements a 'mini-rollback' that releases the latest record +locks we set. */ +UNIV_INTERN +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs);/* TRUE if called so that we have + the latches on the records under pcur + and clust_pcur, and we do not need to + reposition the cursors. */ +/************************************************************************* +Creates a query graph node of 'update' type to be used in the MySQL +interface. */ +UNIV_INTERN +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + /* out, own: update node */ + dict_table_t* table, /* in: table to update */ + mem_heap_t* heap); /* in: mem heap from which allocated */ +/************************************************************************** +Does a cascaded delete or set null in a foreign key operation. */ +UNIV_INTERN +ulint +row_update_cascade_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + que_thr_t* thr, /* in: query thread */ + upd_node_t* node, /* in: update node used in the cascade + or set null operation */ + dict_table_t* table); /* in: table where we do the operation */ +/************************************************************************* +Locks the data dictionary exclusively for performing a table create or other +data dictionary modification operation. */ +UNIV_INTERN +void +row_mysql_lock_data_dictionary_func( +/*================================*/ + trx_t* trx, /* in/out: transaction */ + const char* file, /* in: file name */ + ulint line); /* in: line number */ +#define row_mysql_lock_data_dictionary(trx) \ + row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__) +/************************************************************************* +Unlocks the data dictionary exclusive lock. 
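A sketch of the pairing expected of these dictionary latching functions around a DDL operation; illustrative only (ddl_sketch is a hypothetical name):

static void
ddl_sketch(trx_t* trx)
{
	row_mysql_lock_data_dictionary(trx);	/* exclusive lock */

	/* ... create or drop tables and indexes here; the data
	dictionary stays locked until the matching unlock call ... */

	row_mysql_unlock_data_dictionary(trx);
}
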
*/ +UNIV_INTERN +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx); /* in/out: transaction */ +/************************************************************************* +Locks the data dictionary in shared mode from modifications, for performing +foreign key check, rollback, or other operation invisible to MySQL. */ +UNIV_INTERN +void +row_mysql_freeze_data_dictionary_func( +/*==================================*/ + trx_t* trx, /* in/out: transaction */ + const char* file, /* in: file name */ + ulint line); /* in: line number */ +#define row_mysql_freeze_data_dictionary(trx) \ + row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__) +/************************************************************************* +Unlocks the data dictionary shared lock. */ +UNIV_INTERN +void +row_mysql_unfreeze_data_dictionary( +/*===============================*/ + trx_t* trx); /* in/out: transaction */ +#ifndef UNIV_HOTBACKUP +/************************************************************************* +Creates a table for MySQL. If the name of the table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also start the printing of monitor +output by the master thread. If the table name ends in "innodb_mem_validate", +InnoDB will try to invoke mem_validate(). */ +UNIV_INTERN +int +row_create_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in, own: table definition + (will be freed) */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. */ +UNIV_INTERN +int +row_create_index_for_mysql( +/*=======================*/ + /* out: error number or DB_SUCCESS */ + dict_index_t* index, /* in, own: index definition + (will be freed) */ + trx_t* trx, /* in: transaction handle */ + const ulint* field_lengths); /* in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ +/************************************************************************* +Scans a table create SQL string and adds to the data dictionary +the foreign key constraints declared in the string. This function +should be called after the indexes for a table have been created. +Each foreign key constraint must be accompanied by indexes in +both participating tables. The indexes are allowed to contain more +fields than mentioned in the constraint. */ +UNIV_INTERN +int +row_table_add_foreign_constraints( +/*==============================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES table2(c, d), + table2 can be written also with the + database name before it: test.table2 */ + const char* name, /* in: table full name in the + normalized form + database_name/table_name */ + ibool reject_fks); /* in: if TRUE, fail with error + code DB_CANNOT_ADD_CONSTRAINT if + any foreign keys are found.
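To make the locking discipline concrete, a hedged sketch of DDL under the exclusive dictionary lock, using only the macro and functions declared above (the table definition is owned, and freed, by the callee, as its comment states):

static int
create_table_sketch(dict_table_t* table_def, trx_t* trx)
{
	int	err;

	row_mysql_lock_data_dictionary(trx);

	err = row_create_table_for_mysql(table_def, trx);

	row_mysql_unlock_data_dictionary(trx);

	return(err);
}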
*/ + +/************************************************************************* +The master thread in srv0srv.c calls this regularly to drop tables which +we must drop in the background after queries to them have ended. Such lazy +dropping of tables is needed in ALTER TABLE on Unix. */ +UNIV_INTERN +ulint +row_drop_tables_for_mysql_in_background(void); +/*=========================================*/ + /* out: how many tables dropped + + remaining tables in list */ +/************************************************************************* +Gets the background drop list length. NOTE: the caller must own the kernel +mutex! */ +UNIV_INTERN +ulint +row_get_background_drop_list_len_low(void); +/*======================================*/ + /* out: how many tables in list */ +/************************************************************************* +Truncates a table for MySQL. */ +UNIV_INTERN +int +row_truncate_table_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table handle */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* +Drops a table for MySQL. If the name of the dropped table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also stop the printing of monitor +output by the master thread. If the data dictionary was not already locked +by the transaction, the transaction will be committed. Otherwise, the +data dictionary will remain locked. */ +UNIV_INTERN +int +row_drop_table_for_mysql( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx, /* in: transaction handle */ + ibool drop_db);/* in: TRUE=dropping whole database */ + +/************************************************************************* +Discards the tablespace of a table which is stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set to TRUE. */ +UNIV_INTERN +int +row_discard_tablespace_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx); /* in: transaction handle */ +/********************************************************************* +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. */ +UNIV_INTERN +int +row_import_tablespace_for_mysql( +/*============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* +Drops a database for MySQL. */ +UNIV_INTERN +int +row_drop_database_for_mysql( +/*========================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: database name which ends in '/' */ + trx_t* trx); /* in: transaction handle */ +/************************************************************************* +Renames a table for MySQL.
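A short usage sketch for the drop path above (the table name "test/t1" is purely illustrative, and error handling is trimmed):

static void
drop_table_sketch(trx_t* trx)
{
	ulint	n_left;

	(void) row_drop_table_for_mysql("test/t1", trx, FALSE);

	/* The master thread periodically drains tables whose drop had
	to be deferred because queries were still running on them: */
	n_left = row_drop_tables_for_mysql_in_background();
	(void) n_left;
}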
*/ +UNIV_INTERN +ulint +row_rename_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + const char* old_name, /* in: old table name */ + const char* new_name, /* in: new table name */ + trx_t* trx, /* in: transaction handle */ + ibool commit); /* in: if TRUE then commit trx */ +/************************************************************************* +Checks a table for corruption. */ +UNIV_INTERN +ulint +row_check_table_for_mysql( +/*======================*/ + /* out: DB_ERROR or DB_SUCCESS */ + row_prebuilt_t* prebuilt); /* in: prebuilt struct in MySQL + handle */ +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************************* +Determines if a table is a magic monitor table. */ +UNIV_INTERN +ibool +row_is_magic_monitor_table( +/*=======================*/ + /* out: TRUE if monitor table */ + const char* table_name); /* in: name of the table, in the + form database/table_name */ + +/* A struct describing a place for an individual column in the MySQL +row format which is presented to the table handler in ha_innobase. +This template struct is used to speed up row transformations between +Innobase and MySQL. */ + +typedef struct mysql_row_templ_struct mysql_row_templ_t; +struct mysql_row_templ_struct { + ulint col_no; /* column number of the column */ + ulint rec_field_no; /* field number of the column in an + Innobase record in the current index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint mysql_col_offset; /* offset of the column in the MySQL + row format */ + ulint mysql_col_len; /* length of the column in the MySQL + row format */ + ulint mysql_null_byte_offset; /* MySQL NULL bit byte offset in a + MySQL record */ + ulint mysql_null_bit_mask; /* bit mask to get the NULL bit, + zero if column cannot be NULL */ + ulint type; /* column type in Innobase mtype + numbers DATA_CHAR... */ + ulint mysql_type; /* MySQL type code; this is always + < 256 */ + ulint mysql_length_bytes; /* if mysql_type + == DATA_MYSQL_TRUE_VARCHAR, this tells + whether we should use 1 or 2 bytes to + store the MySQL true VARCHAR data + length at the start of row in the MySQL + format (NOTE that the MySQL key value + format always uses 2 bytes for the data + len) */ + ulint charset; /* MySQL charset-collation code + of the column, or zero */ + ulint mbminlen; /* minimum length of a char, in bytes, + or zero if not a char type */ + ulint mbmaxlen; /* maximum length of a char, in bytes, + or zero if not a char type */ + ulint is_unsigned; /* if a column type is an integer + type and this field is != 0, then + it is an unsigned integer type */ +}; + +#define MYSQL_FETCH_CACHE_SIZE 8 +/* After fetching this many rows, we start caching them in fetch_cache */ +#define MYSQL_FETCH_CACHE_THRESHOLD 4 + +#define ROW_PREBUILT_ALLOCATED 78540783 +#define ROW_PREBUILT_FREED 26423527 + +/* A struct for (sometimes lazily) prebuilt structures in an Innobase table +handle used within MySQL; these are used to save CPU time. */ + +struct row_prebuilt_struct { + ulint magic_n; /* this magic number is set to + ROW_PREBUILT_ALLOCATED when created, + or ROW_PREBUILT_FREED when the + struct has been freed */ + dict_table_t* table; /* Innobase table handle */ + trx_t* trx; /* current transaction handle */ + ibool sql_stat_start; /* TRUE when we start processing of + an SQL statement: we may have to set + an intention lock on the table, + create a consistent read view etc. 
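To make the template fields concrete, a small sketch (not patch code) of how a column's NULL flag would be tested in a MySQL-format row buffer, using only the mysql_null_byte_offset and mysql_null_bit_mask fields described above:

static ibool
templ_col_is_null(const mysql_row_templ_t* templ, const byte* mysql_rec)
{
	if (templ->mysql_null_bit_mask == 0) {

		return(FALSE);	/* column is declared NOT NULL */
	}

	return((mysql_rec[templ->mysql_null_byte_offset]
		& templ->mysql_null_bit_mask) != 0);
}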
*/ + ibool mysql_has_locked; /* this is set TRUE when MySQL + calls external_lock on this handle + with a lock flag, and set FALSE when + it is called with the F_UNLOCK flag */ + ibool clust_index_was_generated; + /* if the user did not define a + primary key in MySQL, then Innobase + automatically generated a clustered + index where the ordering column is + the row id: in this case this flag + is set to TRUE */ + dict_index_t* index; /* current index for a search, if + any */ + ulint read_just_key; /* set to 1 when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + ibool used_in_HANDLER;/* TRUE if we have been using this + handle in a MySQL HANDLER low level + index cursor command: then we must + store the pcur position even in a + unique search from a clustered index, + because HANDLER allows NEXT and PREV + in such a situation */ + ulint template_type; /* ROW_MYSQL_WHOLE_ROW, + ROW_MYSQL_REC_FIELDS, + ROW_MYSQL_DUMMY_TEMPLATE, or + ROW_MYSQL_NO_TEMPLATE */ + ulint n_template; /* number of elements in the + template */ + ulint null_bitmap_len;/* number of bytes in the SQL NULL + bitmap at the start of a row in the + MySQL format */ + ibool need_to_access_clustered; /* if we are fetching + columns through a secondary index + and at least one column is not in + the secondary index, then this is + set to TRUE */ + ibool templ_contains_blob;/* TRUE if the template contains + BLOB column(s) */ + mysql_row_templ_t* mysql_template;/* template used to transform + rows fast between MySQL and Innobase + formats; memory for this template + is not allocated from 'heap' */ + mem_heap_t* heap; /* memory heap from which + these auxiliary structures are + allocated when needed */ + ins_node_t* ins_node; /* Innobase SQL insert node + used to perform inserts + to the table */ + byte* ins_upd_rec_buff;/* buffer for storing data converted + to the Innobase format from the MySQL + format */ + const byte* default_rec; /* the default values of all columns + (a "default row") in MySQL format */ + ulint hint_need_to_fetch_extra_cols; + /* normally this is set to 0; if this + is set to ROW_RETRIEVE_PRIMARY_KEY, + then we should at least retrieve all + columns in the primary key; if this + is set to ROW_RETRIEVE_ALL_COLS, then + we must retrieve all columns in the + key (if read_just_key == 1), or all + columns in the table */ + upd_node_t* upd_node; /* Innobase SQL update node used + to perform updates and deletes */ + que_fork_t* ins_graph; /* Innobase SQL query graph used + in inserts */ + que_fork_t* upd_graph; /* Innobase SQL query graph used + in updates or deletes */ + btr_pcur_t* pcur; /* persistent cursor used in selects + and updates */ + btr_pcur_t* clust_pcur; /* persistent cursor used in + some selects and updates */ + que_fork_t* sel_graph; /* dummy query graph used in + selects */ + dtuple_t* search_tuple; /* prebuilt dtuple used in selects */ + byte row_id[DATA_ROW_ID_LEN]; + /* if the clustered index was + generated, the row id of the + last row fetched is stored + here */ + dtuple_t* clust_ref; /* prebuilt dtuple used in + sel/upd/del */ + ulint select_lock_type;/* LOCK_NONE, LOCK_S, or LOCK_X */ + ulint stored_select_lock_type;/* this field is used to + remember the original select_lock_type + that was decided in ha_innodb.cc, + ::store_lock(), ::external_lock(), + etc.
*/ + ulint row_read_type; /* ROW_READ_WITH_LOCKS if row locks + should be obtained for records + under an UPDATE or DELETE cursor. + If innodb_locks_unsafe_for_binlog + is TRUE, this can be set to + ROW_READ_TRY_SEMI_CONSISTENT, so that + if the row under an UPDATE or DELETE + cursor was locked by another + transaction, InnoDB will resort + to reading the last committed value + ('semi-consistent read'). Then, + this field will be set to + ROW_READ_DID_SEMI_CONSISTENT to + indicate that. If the row does not + match the WHERE condition, MySQL will + invoke handler::unlock_row() to + clear the flag back to + ROW_READ_TRY_SEMI_CONSISTENT and + to simply skip the row. If + the row matches, the next call to + row_search_for_mysql() will lock + the row. + This eliminates lock waits in some + cases; note that this breaks + serializability. */ + ulint mysql_prefix_len;/* byte offset of the end of + the last requested column */ + ulint mysql_row_len; /* length in bytes of a row in the + MySQL format */ + ulint n_rows_fetched; /* number of rows fetched after + positioning the current cursor */ + ulint fetch_direction;/* ROW_SEL_NEXT or ROW_SEL_PREV */ + byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE]; + /* a cache for fetched rows if we + fetch many rows from the same cursor: + it saves CPU time to fetch them in a + batch; we reserve mysql_row_len + bytes for each such row; these + pointers point 4 bytes past the + allocated mem buf start, because + there is a 4 byte magic number at the + start and at the end */ + ibool keep_other_fields_on_keyread; /* when using fetch + cache with HA_EXTRA_KEYREAD, don't + overwrite other fields in the MySQL + row buffer.*/ + ulint fetch_cache_first;/* position of the first not yet + fetched row in fetch_cache */ + ulint n_fetch_cached; /* number of not yet fetched rows + in fetch_cache */ + mem_heap_t* blob_heap; /* in SELECTs, BLOB fields are copied + to this heap */ + mem_heap_t* old_vers_heap; /* memory heap where a previous + version is built in consistent read */ + /*----------------------*/ + ulonglong autoinc_last_value;/* last value of AUTO-INC interval */ + ulonglong autoinc_increment;/* The increment step of the auto + increment column. Value must be + greater than or equal to 1. Required to + calculate the next value */ + ulonglong autoinc_offset; /* The offset passed to + get_auto_increment() by MySQL. Required + to calculate the next value */ + ulint autoinc_error; /* The actual error code encountered + while trying to init or read the + autoinc value from the table.
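The fetch cache layout described above (a 4-byte magic number before and after each mysql_row_len-byte row image, with the cached pointer aimed 4 bytes past the buffer start) corresponds to an allocation of roughly the following shape. This is an illustrative sketch assuming the standard InnoDB mem_alloc() and mach_write_to_4() helpers and the ROW_PREBUILT_FETCH_MAGIC_N constant defined further below, not the patch's own allocation code:

	byte*	buf = mem_alloc(prebuilt->mysql_row_len + 8);

	mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
	mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
			ROW_PREBUILT_FETCH_MAGIC_N);

	prebuilt->fetch_cache[i] = buf + 4;	/* point past the magic */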
We + store it here so that we can return + it to MySQL */ + /*----------------------*/ + UT_LIST_NODE_T(row_prebuilt_t) prebuilts; + /* list node of table->prebuilts */ + ulint magic_n2; /* this should be the same as + magic_n */ +}; + +#define ROW_PREBUILT_FETCH_MAGIC_N 465765687 + +#define ROW_MYSQL_WHOLE_ROW 0 +#define ROW_MYSQL_REC_FIELDS 1 +#define ROW_MYSQL_NO_TEMPLATE 2 +#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in + row_scan_and_check_index */ + +/* Values for hint_need_to_fetch_extra_cols */ +#define ROW_RETRIEVE_PRIMARY_KEY 1 +#define ROW_RETRIEVE_ALL_COLS 2 + +/* Values for row_read_type */ +#define ROW_READ_WITH_LOCKS 0 +#define ROW_READ_TRY_SEMI_CONSISTENT 1 +#define ROW_READ_DID_SEMI_CONSISTENT 2 + +#ifndef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0mysql.ic b/storage/xtradb/include/row0mysql.ic new file mode 100644 index 00000000000..5260ae17924 --- /dev/null +++ b/storage/xtradb/include/row0mysql.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 2001, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +MySQL interface for Innobase + +Created 1/23/2001 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0purge.h b/storage/xtradb/include/row0purge.h new file mode 100644 index 00000000000..fbc12f8d389 --- /dev/null +++ b/storage/xtradb/include/row0purge.h @@ -0,0 +1,95 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0purge_h +#define row0purge_h + +#include "univ.i" +#include "data0data.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" + +/************************************************************************ +Creates a purge node to a query graph. */ +UNIV_INTERN +purge_node_t* +row_purge_node_create( +/*==================*/ + /* out, own: purge node */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap); /* in: memory heap where created */ +/*************************************************************** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. */ +UNIV_INTERN +que_thr_t* +row_purge_step( +/*===========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ + +/* Purge node structure */ + +struct purge_node_struct{ + que_common_t common; /* node type: QUE_NODE_PURGE */ + /*----------------------*/ + /* Local storage for this graph node */ + dulint roll_ptr;/* roll pointer to undo log record */ + trx_undo_rec_t* undo_rec;/* undo log record */ + trx_undo_inf_t* reservation;/* reservation for the undo log record in + the purge array */ + dulint undo_no;/* undo number of the record */ + ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC, + ... */ + btr_pcur_t pcur; /* persistent cursor used in searching the + clustered index record */ + ibool found_clust;/* TRUE if the clustered index record + determined by ref was found in the clustered + index, and we were able to position pcur on + it */ + dict_table_t* table; /* table where purge is done */ + ulint cmpl_info;/* compiler analysis info of an update */ + upd_t* update; /* update vector for a clustered index + record */ + dtuple_t* ref; /* NULL, or row reference to the next row to + handle */ + dtuple_t* row; /* NULL, or a copy (also fields copied to + heap) of the indexed fields of the row to + handle */ + dict_index_t* index; /* NULL, or the next index whose record should + be handled */ + mem_heap_t* heap; /* memory heap used as auxiliary storage for + row; this must be emptied after a successful + purge of a row */ +}; + +#ifndef UNIV_NONINL +#include "row0purge.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0purge.ic b/storage/xtradb/include/row0purge.ic new file mode 100644 index 00000000000..5fc665e9d20 --- /dev/null +++ b/storage/xtradb/include/row0purge.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + + +/****************************************************** +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0row.h b/storage/xtradb/include/row0row.h new file mode 100644 index 00000000000..26c4b5e4e71 --- /dev/null +++ b/storage/xtradb/include/row0row.h @@ -0,0 +1,331 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0row_h +#define row0row_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "read0types.h" +#include "row0types.h" +#include "btr0types.h" + +/************************************************************************* +Gets the offset of the trx id field, in bytes relative to the origin of +a clustered index record. */ +UNIV_INTERN +ulint +row_get_trx_id_offset( +/*==================*/ + /* out: offset of DATA_TRX_ID */ + const rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ +/************************************************************************* +Reads the trx id field from a clustered index record. */ +UNIV_INLINE +dulint +row_get_rec_trx_id( +/*===============*/ + /* out: value of the field */ + const rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ +/************************************************************************* +Reads the roll pointer field from a clustered index record. */ +UNIV_INLINE +dulint +row_get_rec_roll_ptr( +/*=================*/ + /* out: value of the field */ + const rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ +/********************************************************************* +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. 
*/ +UNIV_INTERN +dtuple_t* +row_build_index_entry( +/*==================*/ + /* out: index entry which should be + inserted or purged, or NULL if the + externally stored columns in the + clustered index record are unavailable + and ext != NULL */ + const dtuple_t* row, /* in: row which should be + inserted or purged */ + row_ext_t* ext, /* in: externally stored column prefixes, + or NULL */ + dict_index_t* index, /* in: index on the table */ + mem_heap_t* heap); /* in: memory heap from which the memory for + the index entry is allocated */ +/*********************************************************************** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. */ +UNIV_INTERN +dtuple_t* +row_build( +/*======*/ + /* out, own: row built; + see the NOTE below! */ + ulint type, /* in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /* in: clustered index */ + const rec_t* rec, /* in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const ulint* offsets,/* in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /* in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead; the user + columns in this table should be + the same columns as in index->table */ + row_ext_t** ext, /* out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap); /* in: memory heap from which + the memory needed is allocated */ +/*********************************************************************** +Converts an index record to a typed data tuple. */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + /* out: index entry built; does not + set info_bits, and the data fields in + the entry will point directly to rec */ + const rec_t* rec, /* in: record in the index */ + const dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint* n_ext, /* out: number of externally + stored columns */ + mem_heap_t* heap); /* in: memory heap from which + the memory needed is allocated */ +/*********************************************************************** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + /* out, own: index entry + built; see the NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or + ROW_COPY_POINTERS: the former + copies also the data fields to + heap, whereas the latter only places + pointers to data fields on the + index page */ + const rec_t* rec, /* in: record in the index; + NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the dtuple is used!
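For example, purge-style code would combine the two functions above roughly as follows (a hedged sketch, not patch code; ROW_COPY_DATA keeps the dtuple self-contained so the page latch need not outlive it):

static dtuple_t*
build_sec_entry_sketch(
	const dict_index_t*	clust_index,	/* in: clustered index */
	dict_index_t*		sec_index,	/* in: secondary index */
	const rec_t*		rec,		/* in: clustered index record */
	mem_heap_t*		heap)		/* in: memory heap */
{
	row_ext_t*	ext;
	dtuple_t*	row;

	row = row_build(ROW_COPY_DATA, clust_index, rec,
			NULL,	/* let row_build compute the offsets */
			NULL,	/* consult clust_index->table */
			&ext, heap);

	return(row_build_index_entry(row, ext, sec_index, heap));
}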
*/ + const dict_index_t* index, /* in: index */ + ulint* offsets,/* in/out: rec_get_offsets(rec) */ + ulint* n_ext, /* out: number of externally + stored columns */ + mem_heap_t* heap); /* in: memory heap from which + the memory needed is allocated */ +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +dtuple_t* +row_build_row_ref( +/*==============*/ + /* out, own: row reference built; see the + NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /* in: secondary index */ + const rec_t* rec, /* in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap); /* in: memory heap from which the memory + needed is allocated */ +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /* in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! */ + const dict_index_t* index, /* in: secondary index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL */ + trx_t* trx); /* in: transaction */ +/*********************************************************************** +From a row build a row reference with which we can search the clustered +index record. */ +UNIV_INTERN +void +row_build_row_ref_from_row( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; + see the NOTE below! + ref must have the right number + of fields! */ + const dict_table_t* table, /* in: table */ + const dtuple_t* row); /* in: row + NOTE: the data fields in ref will point + directly into data of this row */ +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /* in/out: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets);/* in: array returned by rec_get_offsets() */ +/******************************************************************* +Searches the clustered index record for a row, if we have the row +reference. */ +UNIV_INTERN +ibool +row_search_on_row_ref( +/*==================*/ + /* out: TRUE if found */ + btr_pcur_t* pcur, /* out: persistent cursor, which must + be closed by the caller */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... 
*/ + const dict_table_t* table, /* in: table */ + const dtuple_t* ref, /* in: row reference */ + mtr_t* mtr); /* in/out: mtr */ +/************************************************************************* +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. */ +UNIV_INTERN +rec_t* +row_get_clust_rec( +/*==============*/ + /* out: record or NULL, if no record found */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index, /* in: secondary index */ + dict_index_t** clust_index,/* out: clustered index */ + mtr_t* mtr); /* in: mtr */ +/******************************************************************* +Searches an index record. */ +UNIV_INTERN +ibool +row_search_index_entry( +/*===================*/ + /* out: TRUE if found */ + dict_index_t* index, /* in: index */ + const dtuple_t* entry, /* in: index entry */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /* in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr); /* in: mtr */ + + +#define ROW_COPY_DATA 1 +#define ROW_COPY_POINTERS 2 + +/* The allowed latching order of index records is the following: +(1) a secondary index record -> +(2) the clustered index record -> +(3) rollback segment data for the clustered index record. + +No new latches may be obtained while the kernel mutex is reserved. +However, the kernel mutex can be reserved while latches are owned. */ + +/*********************************************************************** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). */ +UNIV_INTERN +ulint +row_raw_format( +/*===========*/ + /* out: number of bytes + that were written */ + const char* data, /* in: raw data */ + ulint data_len, /* in: raw data length + in bytes */ + const dict_field_t* dict_field, /* in: index field */ + char* buf, /* out: output buffer */ + ulint buf_size); /* in: output buffer size + in bytes */ + +#ifndef UNIV_NONINL +#include "row0row.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0row.ic b/storage/xtradb/include/row0row.ic new file mode 100644 index 00000000000..9947dd43257 --- /dev/null +++ b/storage/xtradb/include/row0row.ic @@ -0,0 +1,119 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0dict.h" +#include "rem0rec.h" +#include "trx0undo.h" + +/************************************************************************* +Reads the trx id field from a clustered index record. */ +UNIV_INLINE +dulint +row_get_rec_trx_id( +/*===============*/ + /* out: value of the field */ + const rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(rec, index, offsets); + } + + return(trx_read_trx_id(rec + offset)); +} + +/************************************************************************* +Reads the roll pointer field from a clustered index record. */ +UNIV_INLINE +dulint +row_get_rec_roll_ptr( +/*=================*/ + /* out: value of the field */ + const rec_t* rec, /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(rec, index, offsets); + } + + return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); +} + +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /* in/out: typed data tuple where the + reference is built */ + const ulint* map, /* in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /* in: record in the index; must be + preserved while ref is used, as we do + not copy field values to heap */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint field_no; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dtuple_get_n_fields(ref); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + field_no = *(map + i); + + if (field_no != ULINT_UNDEFINED) { + + field = rec_get_nth_field(rec, offsets, + field_no, &len); + dfield_set_data(dfield, field, len); + } + } +} diff --git a/storage/xtradb/include/row0sel.h b/storage/xtradb/include/row0sel.h new file mode 100644 index 00000000000..2f8574d0691 --- /dev/null +++ b/storage/xtradb/include/row0sel.h @@ -0,0 +1,401 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0sel_h +#define row0sel_h + +#include "univ.i" +#include "data0data.h" +#include "que0types.h" +#include "dict0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "pars0sym.h" +#include "btr0pcur.h" +#include "read0read.h" +#include "row0mysql.h" + +/************************************************************************* +Creates a select node struct. */ +UNIV_INTERN +sel_node_t* +sel_node_create( +/*============*/ + /* out, own: select node struct */ + mem_heap_t* heap); /* in: memory heap where created */ +/************************************************************************* +Frees the memory private to a select node when a query graph is freed; it +does not free the heap where the node was originally created. */ +UNIV_INTERN +void +sel_node_free_private( +/*==================*/ + sel_node_t* node); /* in: select node struct */ +/************************************************************************* +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +UNIV_INTERN +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf); /* in, own: prefetch buffer */ +/************************************************************************* +Gets the plan node for the nth table in a join. */ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + sel_node_t* node, + ulint i); +/************************************************************************** +Performs a select step. This is a high-level function used in SQL execution +graphs. */ +UNIV_INTERN +que_thr_t* +row_sel_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs an execution step of an open or close cursor statement node. */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Performs a fetch for a cursor. */ +UNIV_INTERN +que_thr_t* +fetch_step( +/*=======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/******************************************************************** +Sample callback function for fetch that prints each row. */ +UNIV_INTERN +void* +row_fetch_print( +/*============*/ + /* out: always returns non-NULL */ + void* row, /* in: sel_node_t* */ + void* user_arg); /* in: not used */ +/******************************************************************** +Callback function for fetch that stores an unsigned 4 byte integer to the +location pointed to. The column's type must be DATA_INT, DATA_UNSIGNED, length += 4.
*/ +UNIV_INTERN +void* +row_fetch_store_uint4( +/*==================*/ + /* out: always returns NULL */ + void* row, /* in: sel_node_t* */ + void* user_arg); /* in: data pointer */ +/*************************************************************** +Prints a row in a select result. */ +UNIV_INTERN +que_thr_t* +row_printf_step( +/*============*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/******************************************************************** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field length, and we print a warning if +such a key appears. */ +UNIV_INTERN +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /* in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /* in: buffer to use in field + conversions */ + ulint buf_len, /* in: buffer length */ + dict_index_t* index, /* in: index of the key value */ + const byte* key_ptr, /* in: MySQL key value */ + ulint key_len, /* in: MySQL key value length */ + trx_t* trx); /* in: transaction */ +/************************************************************************ +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be attempted on the cursor! */ +UNIV_INTERN +ulint +row_search_for_mysql( +/*=================*/ + /* out: DB_SUCCESS, + DB_RECORD_NOT_FOUND, + DB_END_OF_INDEX, DB_DEADLOCK, + DB_LOCK_TABLE_FULL, + or DB_TOO_BIG_RECORD */ + byte* buf, /* in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /* in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /* in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction); /* in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. */ +/*********************************************************************** +Checks if MySQL is currently allowed to retrieve a consistent read result +for this table, or to store one to the query cache. */ +UNIV_INTERN +ibool +row_search_check_if_query_cache_permitted( +/*======================================*/ + /* out: TRUE if storing or retrieving + from the query cache is permitted */ + trx_t* trx, /* in: transaction object */ + const char* norm_name); /* in: concatenation of database name, + '/' char, table name */ +/*********************************************************************** +Reads the max AUTOINC value from an index.
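The open-then-fetch pattern this comment describes looks roughly as follows in a caller. A hedged sketch only: buf is assumed to be a prebuilt->mysql_row_len byte MySQL-format buffer, PAGE_CUR_GE one of InnoDB's page cursor search modes, and match_mode 0 selects a plain range scan:

	ulint	err;

	/* First call: direction == 0 opens the cursor. */
	err = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt, 0, 0);

	while (err == DB_SUCCESS) {
		/* ... consume the MySQL-format row in buf ... */

		err = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt,
					   0, ROW_SEL_NEXT);
	}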
*/ +UNIV_INTERN +ulint +row_search_max_autoinc( +/*===================*/ + /* out: DB_SUCCESS if all OK else + error code */ + dict_index_t* index, /* in: index to search */ + const char* col_name, /* in: autoinc column name */ + ib_uint64_t* value); /* out: AUTOINC value read */ + +/* A structure for caching column values for prefetched rows */ +struct sel_buf_struct{ + byte* data; /* data, or NULL; if not NULL, this field + has allocated memory which must be explicitly + freed; can be != NULL even when len is + UNIV_SQL_NULL */ + ulint len; /* data length or UNIV_SQL_NULL */ + ulint val_buf_size; + /* size of memory buffer allocated for data: + this can be more than len; this is defined + when data != NULL */ +}; + +struct plan_struct{ + dict_table_t* table; /* table struct in the dictionary + cache */ + dict_index_t* index; /* table index used in the search */ + btr_pcur_t pcur; /* persistent cursor used to search + the index */ + ibool asc; /* TRUE if cursor traveling upwards */ + ibool pcur_is_open; /* TRUE if pcur has been positioned + and we can try to fetch new rows */ + ibool cursor_at_end; /* TRUE if the cursor is open but + we know that there are no more + qualifying rows left to retrieve from + the index tree; NOTE though, that + there may still be unprocessed rows in + the prefetch stack; always FALSE when + pcur_is_open is FALSE */ + ibool stored_cursor_rec_processed; + /* TRUE if the pcur position has been + stored and the record it is positioned + on has already been processed */ + que_node_t** tuple_exps; /* array of expressions which are used + to calculate the field values in the + search tuple: there is one expression + for each field in the search tuple */ + dtuple_t* tuple; /* search tuple */ + ulint mode; /* search mode: PAGE_CUR_G, ... 
*/ + ulint n_exact_match; /* number of first fields in the search + tuple which must be exactly matched */ + ibool unique_search; /* TRUE if we are searching an + index record with a unique key */ + ulint n_rows_fetched; /* number of rows fetched using pcur + after it was opened */ + ulint n_rows_prefetched;/* number of prefetched rows cached + for fetch: fetching several rows in + the same mtr saves CPU time */ + ulint first_prefetched;/* index of the first cached row in + select buffer arrays for each column */ + ibool no_prefetch; /* no prefetch for this table */ + sym_node_list_t columns; /* symbol table nodes for the columns + to retrieve from the table */ + UT_LIST_BASE_NODE_T(func_node_t) + end_conds; /* conditions which determine the + fetch limit of the index segment we + have to look at: when one of these + fails, the result set has been + exhausted for the cursor in this + index; these conditions are normalized + so that in a comparison the column + for this table is the first argument */ + UT_LIST_BASE_NODE_T(func_node_t) + other_conds; /* the rest of search conditions we can + test at this table in a join */ + ibool must_get_clust; /* TRUE if index is a non-clustered + index and we must also fetch the + clustered index record; this is the + case if the non-clustered record does + not contain all the needed columns, or + if this is a single-table explicit + cursor, or a searched update or + delete */ + ulint* clust_map; /* map telling how clust_ref is built + from the fields of a non-clustered + record */ + dtuple_t* clust_ref; /* the reference to the clustered + index entry is built here if index is + a non-clustered index */ + btr_pcur_t clust_pcur; /* if index is non-clustered, we use + this pcur to search the clustered + index */ + mem_heap_t* old_vers_heap; /* memory heap used in building an old + version of a row, or NULL */ +}; + +struct sel_node_struct{ + que_common_t common; /* node type: QUE_NODE_SELECT */ + ulint state; /* node state */ + que_node_t* select_list; /* select list */ + sym_node_t* into_list; /* variables list or NULL */ + sym_node_t* table_list; /* table list */ + ibool asc; /* TRUE if the rows should be fetched + in an ascending order */ + ibool set_x_locks; /* TRUE if the cursor is for update or + delete, which means that a row x-lock + should be placed on the cursor row */ + ulint row_lock_mode; /* LOCK_X or LOCK_S */ + ulint n_tables; /* number of tables */ + ulint fetch_table; /* number of the next table to access + in the join */ + plan_t* plans; /* array of n_tables many plan nodes + containing the search plan and the + search data structures */ + que_node_t* search_cond; /* search condition */ + read_view_t* read_view; /* if the query is a non-locking + consistent read, its read view is + placed here, otherwise NULL */ + ibool consistent_read;/* TRUE if the select is a consistent, + non-locking read */ + order_node_t* order_by; /* order by column definition, or + NULL */ + ibool is_aggregate; /* TRUE if the select list consists of + aggregate functions */ + ibool aggregate_already_fetched; + /* TRUE if the aggregate row has + already been fetched for the current + cursor */ + ibool can_get_updated;/* this is TRUE if the select + is in a single-table explicit + cursor which can get updated + within the stored procedure, + or in a searched update or + delete; NOTE that to determine + whether an explicit cursor + can get updated, the parser + checks whether the stored + procedure contains positioned + update or delete statements */ + sym_node_t*
explicit_cursor;/* not NULL if an explicit cursor */ + UT_LIST_BASE_NODE_T(sym_node_t) + copy_variables; /* variables whose values we have to + copy when an explicit cursor is opened, + so that they do not change between + fetches */ +}; + +/* Select node states */ +#define SEL_NODE_CLOSED 0 /* it is a declared cursor which is not + currently open */ +#define SEL_NODE_OPEN 1 /* intention locks not yet set on + tables */ +#define SEL_NODE_FETCH 2 /* intention locks have been set */ +#define SEL_NODE_NO_MORE_ROWS 3 /* cursor has reached the result set + end */ + +/* Fetch statement node */ +struct fetch_node_struct{ + que_common_t common; /* type: QUE_NODE_FETCH */ + sel_node_t* cursor_def; /* cursor definition */ + sym_node_t* into_list; /* variables to set */ + + pars_user_func_t* + func; /* User callback function or NULL. + The first argument to the function + is a sel_node_t*, containing the + results of the SELECT operation for + one row. If the function returns + NULL, it is not interested in + further rows and the cursor is + modified so (cursor % NOTFOUND) is + true. If it returns not-NULL, + continue normally. See + row_fetch_print() for an example + (and a useful debugging tool). */ +}; + +/* Open or close cursor statement node */ +struct open_node_struct{ + que_common_t common; /* type: QUE_NODE_OPEN */ + ulint op_type; /* ROW_SEL_OPEN_CURSOR or + ROW_SEL_CLOSE_CURSOR */ + sel_node_t* cursor_def; /* cursor definition */ +}; + +/* Row printf statement node */ +struct row_printf_node_struct{ + que_common_t common; /* type: QUE_NODE_ROW_PRINTF */ + sel_node_t* sel_node; /* select */ +}; + +#define ROW_SEL_OPEN_CURSOR 0 +#define ROW_SEL_CLOSE_CURSOR 1 + +/* Flags for the MySQL interface */ +#define ROW_SEL_NEXT 1 +#define ROW_SEL_PREV 2 + +#define ROW_SEL_EXACT 1 /* search using a complete key value */ +#define ROW_SEL_EXACT_PREFIX 2 /* search using a key prefix which + must match to rows: the prefix may + contain an incomplete field (the + last field in prefix may be just + a prefix of a fixed length column) */ + +#ifndef UNIV_NONINL +#include "row0sel.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0sel.ic b/storage/xtradb/include/row0sel.ic new file mode 100644 index 00000000000..a21181e3237 --- /dev/null +++ b/storage/xtradb/include/row0sel.ic @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" + +/************************************************************************* +Gets the plan node for the nth table in a join. 
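A fetch callback in the style documented for fetch_node_struct above would look like this (an illustrative sketch modelled on the stated contract of row_fetch_print(); the function name and counter argument are hypothetical):

static void*
row_fetch_count(void* row, void* user_arg)
{
	ulint*	n_rows = (ulint*) user_arg;

	(void) row;	/* the sel_node_t* holding the fetched row */

	(*n_rows)++;

	return(user_arg);	/* non-NULL: fetch the next row */
}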
*/ +UNIV_INLINE +plan_t* +sel_node_get_nth_plan( +/*==================*/ + /* out: plan node */ + sel_node_t* node, /* in: select node */ + ulint i) /* in: get ith plan node */ +{ + ut_ad(i < node->n_tables); + + return(node->plans + i); +} + +/************************************************************************* +Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means +that it will start fetching from the start of the result set again, regardless +of where it was before, and it will set intention locks on the tables. */ +UNIV_INLINE +void +sel_node_reset_cursor( +/*==================*/ + sel_node_t* node) /* in: select node */ +{ + node->state = SEL_NODE_OPEN; +} + +/************************************************************************** +Performs an execution step of an open or close cursor statement node. */ +UNIV_INLINE +que_thr_t* +open_step( +/*======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + sel_node_t* sel_node; + open_node_t* node; + ulint err; + + ut_ad(thr); + + node = (open_node_t*) thr->run_node; + ut_ad(que_node_get_type(node) == QUE_NODE_OPEN); + + sel_node = node->cursor_def; + + err = DB_SUCCESS; + + if (node->op_type == ROW_SEL_OPEN_CURSOR) { + + /* if (sel_node->state == SEL_NODE_CLOSED) { */ + + sel_node_reset_cursor(sel_node); + /* } else { + err = DB_ERROR; + } */ + } else { + if (sel_node->state != SEL_NODE_CLOSED) { + + sel_node->state = SEL_NODE_CLOSED; + } else { + err = DB_ERROR; + } + } + + if (UNIV_EXPECT(err, DB_SUCCESS) != DB_SUCCESS) { + /* SQL error detected */ + fprintf(stderr, "SQL error %lu\n", (ulong) err); + + ut_error; + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/xtradb/include/row0types.h b/storage/xtradb/include/row0types.h new file mode 100644 index 00000000000..f0af7c2bf53 --- /dev/null +++ b/storage/xtradb/include/row0types.h @@ -0,0 +1,58 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Row operation global types + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0types_h +#define row0types_h + +typedef struct plan_struct plan_t; + +typedef struct upd_struct upd_t; + +typedef struct upd_field_struct upd_field_t; + +typedef struct upd_node_struct upd_node_t; + +typedef struct del_node_struct del_node_t; + +typedef struct ins_node_struct ins_node_t; + +typedef struct sel_node_struct sel_node_t; + +typedef struct open_node_struct open_node_t; + +typedef struct fetch_node_struct fetch_node_t; + +typedef struct row_printf_node_struct row_printf_node_t; +typedef struct sel_buf_struct sel_buf_t; + +typedef struct undo_node_struct undo_node_t; + +typedef struct purge_node_struct purge_node_t; + +typedef struct row_ext_struct row_ext_t; + +/* MySQL data types */ +typedef struct st_table TABLE; + +#endif diff --git a/storage/xtradb/include/row0uins.h b/storage/xtradb/include/row0uins.h new file mode 100644 index 00000000000..16bbbbd0d12 --- /dev/null +++ b/storage/xtradb/include/row0uins.h @@ -0,0 +1,53 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0uins_h +#define row0uins_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/*************************************************************** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. 
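+
+As a hedged sketch, not from the original sources, the undo driver
+dispatches on the undo log record type stored in the undo node roughly
+as follows (node and thr come from the surrounding row undo step):
+
+	if (node->rec_type == TRX_UNDO_INSERT_REC) {
+		err = row_undo_ins(node);
+	} else {
+		err = row_undo_mod(node, thr);
+	}
+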
*/ +UNIV_INTERN +ulint +row_undo_ins( +/*=========*/ + /* out: DB_SUCCESS */ + undo_node_t* node); /* in: row undo node */ + +#ifndef UNIV_NONINL +#include "row0uins.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0uins.ic b/storage/xtradb/include/row0uins.ic new file mode 100644 index 00000000000..75bef8431eb --- /dev/null +++ b/storage/xtradb/include/row0uins.ic @@ -0,0 +1,24 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + diff --git a/storage/xtradb/include/row0umod.h b/storage/xtradb/include/row0umod.h new file mode 100644 index 00000000000..3a4e8c2f9a3 --- /dev/null +++ b/storage/xtradb/include/row0umod.h @@ -0,0 +1,51 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0umod_h +#define row0umod_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/*************************************************************** +Undoes a modify operation on a row of a table. 
*/ +UNIV_INTERN +ulint +row_undo_mod( +/*=========*/ + /* out: DB_SUCCESS or error code */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr); /* in: query thread */ + + +#ifndef UNIV_NONINL +#include "row0umod.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0umod.ic b/storage/xtradb/include/row0umod.ic new file mode 100644 index 00000000000..7ac7bc2fea7 --- /dev/null +++ b/storage/xtradb/include/row0umod.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0undo.h b/storage/xtradb/include/row0undo.h new file mode 100644 index 00000000000..a17cfb1babd --- /dev/null +++ b/storage/xtradb/include/row0undo.h @@ -0,0 +1,136 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0undo_h +#define row0undo_h + +#include "univ.i" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" + +/************************************************************************ +Creates a row undo node to a query graph. */ +UNIV_INTERN +undo_node_t* +row_undo_node_create( +/*=================*/ + /* out, own: undo node */ + trx_t* trx, /* in: transaction */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap); /* in: memory heap where created */ +/*************************************************************** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. 
If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case. */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+				/* out: TRUE if found; NOTE the node->pcur
+				must be closed by the caller, regardless of
+				the return value */
+	undo_node_t*	node);	/* in: row undo node */
+/***************************************************************
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs. */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+			/* out: query thread to run next or NULL */
+	que_thr_t*	thr);	/* in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution which is rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+	If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+only in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/* Undo node structure */
+
+struct undo_node_struct{
+	que_common_t	common;	/* node type: QUE_NODE_UNDO */
+	ulint		state;	/* node execution state */
+	trx_t*		trx;	/* trx for which undo is done */
+	dulint		roll_ptr;/* roll pointer to undo log record */
+	trx_undo_rec_t*	undo_rec;/* undo log record */
+	dulint		undo_no;/* undo number of the record */
+	ulint		rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
+				...
*/ + dulint new_roll_ptr; /* roll ptr to restore to clustered index + record */ + dulint new_trx_id; /* trx id to restore to clustered index + record */ + btr_pcur_t pcur; /* persistent cursor used in searching the + clustered index record */ + dict_table_t* table; /* table where undo is done */ + ulint cmpl_info;/* compiler analysis of an update */ + upd_t* update; /* update vector for a clustered index + record */ + dtuple_t* ref; /* row reference to the next row to handle */ + dtuple_t* row; /* a copy (also fields copied to heap) of the + row to handle */ + row_ext_t* ext; /* NULL, or prefixes of the externally + stored columns of the row */ + dtuple_t* undo_row;/* NULL, or the row after undo */ + row_ext_t* undo_ext;/* NULL, or prefixes of the externally + stored columns of undo_row */ + dict_index_t* index; /* the next index whose record should be + handled */ + mem_heap_t* heap; /* memory heap used as auxiliary storage for + row; this must be emptied after undo is tried + on a row */ +}; + +/* Execution states for an undo node */ +#define UNDO_NODE_FETCH_NEXT 1 /* we should fetch the next undo log + record */ +#define UNDO_NODE_PREV_VERS 2 /* the roll ptr to previous version of + a row is stored in node, and undo + should be done based on it */ +#define UNDO_NODE_INSERT 3 +#define UNDO_NODE_MODIFY 4 + + +#ifndef UNIV_NONINL +#include "row0undo.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0undo.ic b/storage/xtradb/include/row0undo.ic new file mode 100644 index 00000000000..921e3633b10 --- /dev/null +++ b/storage/xtradb/include/row0undo.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/row0upd.h b/storage/xtradb/include/row0upd.h new file mode 100644 index 00000000000..71aa20d158c --- /dev/null +++ b/storage/xtradb/include/row0upd.h @@ -0,0 +1,475 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0upd_h +#define row0upd_h + +#include "univ.i" +#include "data0data.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "pars0types.h" + +/************************************************************************* +Creates an update vector object. */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + /* out, own: update vector object */ + ulint n, /* in: number of fields */ + mem_heap_t* heap); /* in: heap from which memory allocated */ +/************************************************************************* +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. */ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + /* out: number of fields */ + const upd_t* update); /* in: update vector */ +#ifdef UNIV_DEBUG +/************************************************************************* +Returns the nth field of an update vector. */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + /* out: update vector field */ + const upd_t* update, /* in: update vector */ + ulint n); /* in: field position in update vector */ +#else +# define upd_get_nth_field(update, n) ((update)->fields + (n)) +#endif +/************************************************************************* +Sets an index field number to be updated by an update vector field. */ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /* in: update vector field */ + ulint field_no, /* in: field number in a clustered + index */ + dict_index_t* index, /* in: index */ + trx_t* trx); /* in: transaction */ +/************************************************************************* +Returns a field of an update vector by field_no. */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + /* out: update vector field, or NULL */ + const upd_t* update, /* in: update vector */ + ulint no) /* in: field_no */ + __attribute__((nonnull, pure)); +/************************************************************************* +Writes into the redo log the values of trx id and roll ptr and enough info +to determine their positions within a clustered index record. */ +UNIV_INTERN +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + /* out: new pointer to mlog */ + dict_index_t* index, /* in: clustered index */ + trx_t* trx, /* in: transaction */ + dulint roll_ptr,/* in: roll ptr of the undo log record */ + byte* log_ptr,/* pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************* +Updates the trx id and roll ptr field in a clustered index record when +a row is updated or marked deleted. 
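+
+A hedged usage sketch, not part of the original sources (rec, index,
+page_zip, trx, roll_ptr and heap are caller-provided): a caller that has
+positioned on the clustered index record would typically do
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	row_upd_rec_sys_fields(rec, page_zip, index, offsets,
+			       trx, roll_ptr);
+
+where page_zip is NULL for an uncompressed page.
+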
*/ +UNIV_INLINE +void +row_upd_rec_sys_fields( +/*===================*/ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + trx_t* trx, /* in: transaction */ + dulint roll_ptr);/* in: roll ptr of the undo log record */ +/************************************************************************* +Sets the trx id or roll ptr field of a clustered index entry. */ +UNIV_INTERN +void +row_upd_index_entry_sys_field( +/*==========================*/ + const dtuple_t* entry, /* in: index entry, where the memory buffers + for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /* in: clustered index */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + dulint val); /* in: value to write */ +/************************************************************************* +Creates an update node for a query graph. */ +UNIV_INTERN +upd_node_t* +upd_node_create( +/*============*/ + /* out, own: update node */ + mem_heap_t* heap); /* in: mem heap where created */ +/*************************************************************** +Writes to the redo log the new values of the fields occurring in the index. */ +UNIV_INTERN +void +row_upd_index_write_log( +/*====================*/ + const upd_t* update, /* in: update vector */ + byte* log_ptr,/* in: pointer to mlog buffer: must + contain at least MLOG_BUF_MARGIN bytes + of free space; the buffer is closed + within this function */ + mtr_t* mtr); /* in: mtr into whose log to write */ +/*************************************************************** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. */ +UNIV_INTERN +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + /* out: TRUE if the update changes the size of + some field in index or the field is external + in rec or update */ + dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + const upd_t* update);/* in: update vector */ +/*************************************************************** +Replaces the new column values stored in the update vector to the record +given. No field size changes are allowed. */ +UNIV_INTERN +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /* in/out: record where replaced */ + dict_index_t* index, /* in: the index the record belongs to */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + const upd_t* update, /* in: update vector */ + page_zip_des_t* page_zip);/* in: compressed page with enough space + available, or NULL */ +/******************************************************************* +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! 
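+
+A hedged sketch of the intended use, not from the original sources:
+when maintaining a secondary index record, a caller can build the
+vector of changed fields and skip the update when it is empty:
+
+	update = row_upd_build_sec_rec_difference_binary(
+		index, entry, rec, trx, heap);
+
+	if (upd_get_n_fields(update) == 0) {
+		...the secondary index record needs no change...
+	}
+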
*/ +UNIV_INTERN +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + /* out, own: update vector of differing + fields */ + dict_index_t* index, /* in: index */ + const dtuple_t* entry, /* in: entry to insert */ + const rec_t* rec, /* in: secondary index record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap); /* in: memory heap from which allocated */ +/******************************************************************* +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! */ +UNIV_INTERN +upd_t* +row_upd_build_difference_binary( +/*============================*/ + /* out, own: update vector of differing + fields, excluding roll ptr and trx id */ + dict_index_t* index, /* in: clustered index */ + const dtuple_t* entry, /* in: entry to insert */ + const rec_t* rec, /* in: clustered index record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap); /* in: memory heap from which allocated */ +/*************************************************************** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals_index_pos( +/*=========================================*/ + dtuple_t* entry, /* in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /* in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /* in: an update vector built for the index so + that the field number in an upd_field is the + index position */ + ibool order_only, + /* in: if TRUE, limit the replacement to + ordering fields of index; note that this + does not work for non-clustered indexes. */ + mem_heap_t* heap) /* in: memory heap for allocating and + copying the new values */ + __attribute__((nonnull)); +/*************************************************************** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /* in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /* in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /* in: an update vector built for the + CLUSTERED index so that the field number in + an upd_field is the clustered index position */ + mem_heap_t* heap) /* in: memory heap for allocating and + copying the new values */ + __attribute__((nonnull)); +/*************************************************************** +Replaces the new column values stored in the update vector. 
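+
+A hedged sketch, not part of the original sources: after the clustered
+index record has been updated, the cached row image can be brought up
+to date before the secondary indexes are maintained:
+
+	row_upd_replace(row, &ext, index, update, heap);
+
+Here row was built from the old clustered index record and ext receives
+prefixes of any externally stored columns.
+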
*/ +UNIV_INTERN +void +row_upd_replace( +/*============*/ + dtuple_t* row, /* in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /* out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /* in: clustered index */ + const upd_t* update, /* in: an update vector built for the + clustered index */ + mem_heap_t* heap); /* in: memory heap */ +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! */ +UNIV_INTERN +ibool +row_upd_changes_ord_field_binary( +/*=============================*/ + /* out: TRUE if update vector changes + an ordering field in the index record; + NOTE: the fields are compared as binary + strings */ + const dtuple_t* row, /* in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + dict_index_t* index, /* in: index of the record */ + const upd_t* update);/* in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! */ +UNIV_INTERN +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + /* out: TRUE if update vector + may change an ordering field + in an index record */ + const dict_table_t* table, /* in: table */ + const upd_t* update);/* in: update vector for the row */ +/*************************************************************** +Updates a row in a table. This is a high-level function used +in SQL execution graphs. */ +UNIV_INTERN +que_thr_t* +row_upd_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************* +Parses the log data of system field values. */ +UNIV_INTERN +byte* +row_upd_parse_sys_vals( +/*===================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint* pos, /* out: TRX_ID position in record */ + dulint* trx_id, /* out: trx id */ + dulint* roll_ptr);/* out: roll ptr */ +/************************************************************************* +Updates the trx id and roll ptr field in a clustered index record in database +recovery. */ +UNIV_INTERN +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr);/* in: roll ptr of the undo log record */ +/************************************************************************* +Parses the log data written by row_upd_index_write_log. 
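+
+A hedged recovery-side sketch, not from the original sources: a redo
+log parser consumes the record as below; a NULL return means the buffer
+ends before the record is complete and more log must be read first:
+
+	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
+
+	if (ptr == NULL) {
+		...incomplete record; retry with more log...
+	}
+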
*/ +UNIV_INTERN +byte* +row_upd_index_parse( +/*================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + mem_heap_t* heap, /* in: memory heap where update vector is + built */ + upd_t** update_out);/* out: update vector */ + + +/* Update vector field */ +struct upd_field_struct{ + unsigned field_no:16; /* field number in an index, usually + the clustered index, but in updating + a secondary index record in btr0cur.c + this is the position in the secondary + index */ + unsigned orig_len:16; /* original length of the locally + stored part of an externally stored + column, or 0 */ + que_node_t* exp; /* expression for calculating a new + value: it refers to column values and + constants in the symbol table of the + query graph */ + dfield_t new_val; /* new value for the column */ +}; + +/* Update vector structure */ +struct upd_struct{ + ulint info_bits; /* new value of info bits to record; + default is 0 */ + ulint n_fields; /* number of update fields */ + upd_field_t* fields; /* array of update fields */ +}; + +/* Update node structure which also implements the delete operation +of a row */ + +struct upd_node_struct{ + que_common_t common; /* node type: QUE_NODE_UPDATE */ + ibool is_delete;/* TRUE if delete, FALSE if update */ + ibool searched_update; + /* TRUE if searched update, FALSE if + positioned */ + ibool in_mysql_interface; + /* TRUE if the update node was created + for the MySQL interface */ + dict_foreign_t* foreign;/* NULL or pointer to a foreign key + constraint if this update node is used in + doing an ON DELETE or ON UPDATE operation */ + upd_node_t* cascade_node;/* NULL or an update node template which + is used to implement ON DELETE/UPDATE CASCADE + or ... SET NULL for foreign keys */ + mem_heap_t* cascade_heap;/* NULL or a mem heap where the cascade + node is created */ + sel_node_t* select; /* query graph subtree implementing a base + table cursor: the rows returned will be + updated */ + btr_pcur_t* pcur; /* persistent cursor placed on the clustered + index record which should be updated or + deleted; the cursor is stored in the graph + of 'select' field above, except in the case + of the MySQL interface */ + dict_table_t* table; /* table where updated */ + upd_t* update; /* update vector for the row */ + ulint update_n_fields; + /* when this struct is used to implement + a cascade operation for foreign keys, we store + here the size of the buffer allocated for use + as the update vector */ + sym_node_list_t columns;/* symbol table nodes for the columns + to retrieve from the table */ + ibool has_clust_rec_x_lock; + /* TRUE if the select which retrieves the + records to update already sets an x-lock on + the clustered record; note that it must always + set at least an s-lock */ + ulint cmpl_info;/* information extracted during query + compilation; speeds up execution: + UPD_NODE_NO_ORD_CHANGE and + UPD_NODE_NO_SIZE_CHANGE, ORed */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /* node execution state */ + dict_index_t* index; /* NULL, or the next index whose record should + be updated */ + dtuple_t* row; /* NULL, or a copy (also fields copied to + heap) of the row to update; this must be reset + to NULL after a successful update */ + row_ext_t* ext; /* NULL, or prefixes of the externally + stored columns in the old row */ + dtuple_t* upd_row;/* NULL, or a copy of the updated row */ + row_ext_t* upd_ext;/* NULL, or prefixes of the externally + stored columns in upd_row 
*/ + mem_heap_t* heap; /* memory heap used as auxiliary storage; + this must be emptied after a successful + update */ + /*----------------------*/ + sym_node_t* table_sym;/* table node in symbol table */ + que_node_t* col_assign_list; + /* column assignment list */ + ulint magic_n; +}; + +#define UPD_NODE_MAGIC_N 1579975 + +/* Node execution states */ +#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from + a node above and if the field + has_clust_rec_x_lock is FALSE, we + should set an intention x-lock on + the table */ +#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be + updated */ +#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be + inserted, old record is already delete + marked */ +#define UPD_NODE_UPDATE_ALL_SEC 4 /* an ordering field of the clustered + index record was changed, or this is + a delete operation: should update + all the secondary index records */ +#define UPD_NODE_UPDATE_SOME_SEC 5 /* secondary index entries should be + looked at and updated if an ordering + field changed */ + +/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */ +#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be + changed in the update and no ordering + field of the clustered index */ +#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be + changed in the update */ + +#ifndef UNIV_NONINL +#include "row0upd.ic" +#endif + +#endif diff --git a/storage/xtradb/include/row0upd.ic b/storage/xtradb/include/row0upd.ic new file mode 100644 index 00000000000..a0c23aa6b07 --- /dev/null +++ b/storage/xtradb/include/row0upd.ic @@ -0,0 +1,179 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#include "mtr0log.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "row0row.h" +#include "btr0sea.h" +#include "page0zip.h" + +/************************************************************************* +Creates an update vector object. */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + /* out, own: update vector object */ + ulint n, /* in: number of fields */ + mem_heap_t* heap) /* in: heap from which memory allocated */ +{ + upd_t* update; + + update = (upd_t*) mem_heap_alloc(heap, sizeof(upd_t)); + + update->info_bits = 0; + update->n_fields = n; + update->fields = (upd_field_t*) + mem_heap_alloc(heap, sizeof(upd_field_t) * n); + + return(update); +} + +/************************************************************************* +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. 
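+
+A hedged example of building a one-field vector with these helpers,
+not part of the original sources (heap, field_no, index and trx are
+caller-provided):
+
+	upd_t*		update = upd_create(1, heap);
+	upd_field_t*	ufield = upd_get_nth_field(update, 0);
+
+	upd_field_set_field_no(ufield, field_no, index, trx);
+	...assign the new value to ufield->new_val...
+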
*/ +UNIV_INLINE +ulint +upd_get_n_fields( +/*=============*/ + /* out: number of fields */ + const upd_t* update) /* in: update vector */ +{ + ut_ad(update); + + return(update->n_fields); +} + +#ifdef UNIV_DEBUG +/************************************************************************* +Returns the nth field of an update vector. */ +UNIV_INLINE +upd_field_t* +upd_get_nth_field( +/*==============*/ + /* out: update vector field */ + const upd_t* update, /* in: update vector */ + ulint n) /* in: field position in update vector */ +{ + ut_ad(update); + ut_ad(n < update->n_fields); + + return((upd_field_t*) update->fields + n); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Sets an index field number to be updated by an update vector field. */ +UNIV_INLINE +void +upd_field_set_field_no( +/*===================*/ + upd_field_t* upd_field, /* in: update vector field */ + ulint field_no, /* in: field number in a clustered + index */ + dict_index_t* index, /* in: index */ + trx_t* trx) /* in: transaction */ +{ + upd_field->field_no = field_no; + upd_field->orig_len = 0; + + if (UNIV_UNLIKELY(field_no >= dict_index_get_n_fields(index))) { + fprintf(stderr, + "InnoDB: Error: trying to access field %lu in ", + (ulong) field_no); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, "\n" + "InnoDB: but index only has %lu fields\n", + (ulong) dict_index_get_n_fields(index)); + } + + dict_col_copy_type(dict_index_get_nth_col(index, field_no), + dfield_get_type(&upd_field->new_val)); +} + +/************************************************************************* +Returns a field of an update vector by field_no. */ +UNIV_INLINE +const upd_field_t* +upd_get_field_by_field_no( +/*======================*/ + /* out: update vector field, or NULL */ + const upd_t* update, /* in: update vector */ + ulint no) /* in: field_no */ +{ + ulint i; + for (i = 0; i < upd_get_n_fields(update); i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + + if (uf->field_no == no) { + + return(uf); + } + } + + return(NULL); +} + +/************************************************************************* +Updates the trx id and roll ptr field in a clustered index record when +a row is updated or marked deleted. 
*/ +UNIV_INLINE +void +row_upd_rec_sys_fields( +/*===================*/ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page whose + uncompressed part will be updated, or NULL */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + trx_t* trx, /* in: transaction */ + dulint roll_ptr)/* in: roll ptr of the undo log record */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); +#ifdef UNIV_SYNC_DEBUG + if (!rw_lock_own(&btr_search_latch, RW_LOCK_EX)) { + ut_ad(!buf_block_align(rec)->is_hashed); + } +#endif /* UNIV_SYNC_DEBUG */ + + if (UNIV_LIKELY_NULL(page_zip)) { + ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets, + pos, trx->id, roll_ptr); + } else { + ulint offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(rec, index, offsets); + } + +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + trx_write_trx_id(rec + offset, trx->id); + trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); + } +} diff --git a/storage/xtradb/include/row0vers.h b/storage/xtradb/include/row0vers.h new file mode 100644 index 00000000000..0feae77e8b5 --- /dev/null +++ b/storage/xtradb/include/row0vers.h @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0vers_h +#define row0vers_h + +#include "univ.i" +#include "data0data.h" +#include "dict0types.h" +#include "trx0types.h" +#include "que0types.h" +#include "rem0types.h" +#include "mtr0mtr.h" +#include "read0types.h" + +/********************************************************************* +Finds out if an active transaction has inserted or modified a secondary +index record. NOTE: the kernel mutex is temporarily released in this +function! */ +UNIV_INTERN +trx_t* +row_vers_impl_x_locked_off_kernel( +/*==============================*/ + /* out: NULL if committed, else the active + transaction; NOTE that the kernel mutex is + temporarily released! */ + const rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets);/* in: rec_get_offsets(rec, index) */ +/********************************************************************* +Finds out if we must preserve a delete marked earlier version of a clustered +index record, because it is >= the purge view. 
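+
+A hedged purge-side sketch, not from the original sources: before
+physically removing a delete-marked record, purge would check
+
+	if (row_vers_must_preserve_del_marked(trx_id, &mtr)) {
+		...an older read view may still need this version;
+		leave the record in place...
+	}
+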
*/
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+			/* out: TRUE if earlier version should be preserved */
+	dulint	trx_id,	/* in: transaction id in the version */
+	mtr_t*	mtr);	/* in: mtr holding the latch on the clustered index
+			record; it will also hold the latch on purge_view */
+/*********************************************************************
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE. */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+				/* out: TRUE if earlier version should have */
+	ibool		also_curr,/* in: TRUE if also rec is included in the
+				versions to search; otherwise only versions
+				prior to it are searched */
+	const rec_t*	rec,	/* in: record in the clustered index; the
+				caller must have a latch on the page */
+	mtr_t*		mtr,	/* in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/* in: the secondary index */
+	const dtuple_t*	ientry);/* in: the secondary index entry */
+/*********************************************************************
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version. */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+				/* out: DB_SUCCESS or DB_MISSING_HISTORY */
+	const rec_t*	rec,	/* in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/* in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/* in: the clustered index */
+	ulint**		offsets,/* in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	read_view_t*	view,	/* in: the consistent read view */
+	mem_heap_t**	offset_heap,/* in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/* in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers);/* out, own: old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+
+/*********************************************************************
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read.
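+
+A hedged sketch of a caller, not part of the original sources:
+
+	err = row_vers_build_for_semi_consistent_read(
+		rec, &mtr, index, &offsets, &offset_heap,
+		in_heap, &old_vers);
+
+	if (err == DB_SUCCESS && old_vers == NULL) {
+		...no committed version is visible; skip the record...
+	}
+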
*/
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+				/* out: DB_SUCCESS or DB_MISSING_HISTORY */
+	const rec_t*	rec,	/* in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/* in: mtr holding the latch on rec */
+	dict_index_t*	index,	/* in: the clustered index */
+	ulint**		offsets,/* in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,/* in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/* in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	const rec_t**	old_vers);/* out: rec, old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+
+
+#ifndef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0vers.ic b/storage/xtradb/include/row0vers.ic
new file mode 100644
index 00000000000..aac95ea6593
--- /dev/null
+++ b/storage/xtradb/include/row0vers.ic
@@ -0,0 +1,29 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "dict0dict.h"
+#include "read0read.h"
+#include "page0page.h"
+#include "log0recv.h"
diff --git a/storage/xtradb/include/srv0que.h b/storage/xtradb/include/srv0que.h
new file mode 100644
index 00000000000..88db1a013f6
--- /dev/null
+++ b/storage/xtradb/include/srv0que.h
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Server query execution + +Created 6/5/1996 Heikki Tuuri +*******************************************************/ + +#ifndef srv0que_h +#define srv0que_h + +#include "univ.i" +#include "que0types.h" + +/************************************************************************** +Checks if there is work to do in the server task queue. If there is, the +thread starts processing a task. Before leaving, it again checks the task +queue and picks a new task if any exists. This is called by a SRV_WORKER +thread. */ +UNIV_INTERN +void +srv_que_task_queue_check(void); +/*==========================*/ +/************************************************************************** +Performs round-robin on the server tasks. This is called by a SRV_WORKER +thread every second or so. */ +UNIV_INTERN +que_thr_t* +srv_que_round_robin( +/*================*/ + /* out: the new (may be == thr) query thread + to run */ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Enqueues a task to server task queue and releases a worker thread, if +there exists one suspended. */ +UNIV_INTERN +void +srv_que_task_enqueue( +/*=================*/ + que_thr_t* thr); /* in: query thread */ +/************************************************************************** +Enqueues a task to server task queue and releases a worker thread, if +there exists one suspended. */ +UNIV_INTERN +void +srv_que_task_enqueue_low( +/*=====================*/ + que_thr_t* thr); /* in: query thread */ + +#endif + diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h new file mode 100644 index 00000000000..cb78d66da1b --- /dev/null +++ b/storage/xtradb/include/srv0srv.h @@ -0,0 +1,608 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0srv_h
+#define srv0srv_h
+
+#include "univ.i"
+#include "sync0sync.h"
+#include "os0sync.h"
+#include "que0types.h"
+#include "trx0types.h"
+
+extern const char*	srv_main_thread_op_info;
+
+/* Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char	srv_mysql50_table_name_prefix[9];
+
+/* When this event is set the lock timeout and InnoDB monitor
+thread starts running */
+extern os_event_t	srv_lock_timeout_thread_event;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT	\
+	(srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
+
+/* This is set to TRUE if the MySQL user has set it in MySQL */
+extern ibool	srv_lower_case_table_names;
+
+/* Mutex for locking srv_monitor_file */
+extern mutex_t	srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE*	srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+extern mutex_t	srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+extern FILE*	srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE*	srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char*	srv_data_home;
+#ifdef UNIV_LOG_ARCHIVE
+extern char*	srv_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* store to its own file each table created by a user; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool	srv_file_per_table;
+/* The file format to use on new *.ibd files. */
+extern ulint	srv_file_format;
+/* Whether to check file format during startup. */
+extern ulint	srv_check_file_format_at_startup;
+/* Place locks on records only, i.e.
do not use next-key locking except
+on duplicate key checking and foreign key checking */
+extern ibool	srv_locks_unsafe_for_binlog;
+
+extern ulint	srv_n_data_files;
+extern char**	srv_data_file_names;
+extern ulint*	srv_data_file_sizes;
+extern ulint*	srv_data_file_is_raw_partition;
+
+extern ibool	srv_extra_undoslots;
+
+extern ibool	srv_auto_extend_last_data_file;
+extern ulint	srv_last_file_size_max;
+extern ulong	srv_auto_extend_increment;
+
+extern ibool	srv_created_new_raw;
+
+#define SRV_NEW_RAW	1
+#define SRV_OLD_RAW	2
+
+extern char**	srv_log_group_home_dirs;
+
+extern ulint	srv_n_log_groups;
+extern ulint	srv_n_log_files;
+extern ulint	srv_log_file_size;
+extern ulint	srv_log_buffer_size;
+extern ulong	srv_flush_log_at_trx_commit;
+
+extern ulint	srv_show_locks_held;
+extern ulint	srv_show_verbose_locks;
+
+/* The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+extern const byte*	srv_latin1_ordering;
+extern my_bool	srv_use_sys_malloc;
+extern ulint	srv_buf_pool_size;	/* requested size in bytes */
+extern ulint	srv_buf_pool_old_size;	/* previously requested size */
+extern ulint	srv_buf_pool_curr_size;	/* current size in bytes */
+extern ulint	srv_mem_pool_size;
+extern ulint	srv_lock_table_size;
+
+extern ulint	srv_n_file_io_threads;
+extern ulint	srv_n_read_io_threads;
+extern ulint	srv_n_write_io_threads;
+
+#ifdef UNIV_LOG_ARCHIVE
+extern ibool	srv_log_archive_on;
+extern ibool	srv_archive_recovery;
+extern dulint	srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+extern char*	srv_file_flush_method_str;
+extern ulint	srv_unix_file_flush_method;
+extern ulint	srv_win_file_flush_method;
+
+extern ulint	srv_max_n_open_files;
+
+extern ulint	srv_max_dirty_pages_pct;
+
+extern ulint	srv_force_recovery;
+extern ulong	srv_thread_concurrency;
+extern ulong	srv_commit_concurrency;
+
+extern ulint	srv_max_n_threads;
+
+extern lint	srv_conc_n_threads;
+
+extern ulint	srv_fast_shutdown;	/* If this is 1, do not do a
+					purge and index buffer merge.
+					If this is 2, do not even flush the
+					buffer pool to data files at
+					shutdown: we effectively 'crash'
+					InnoDB (but lose no committed
+					transactions).
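+
+					A hedged illustration, not from
+					the original sources: shutdown
+					code can branch on this setting
+					roughly as
+
+					if (srv_fast_shutdown == 2) {
+						...write and flush the
+						log only; skip the
+						buffer pool flush...
+					}
+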
*/
+extern ibool	srv_innodb_status;
+
+extern unsigned long long	srv_stats_sample_pages;
+
+extern ibool	srv_use_doublewrite_buf;
+extern ibool	srv_use_checksums;
+
+extern ibool	srv_set_thread_priorities;
+extern int	srv_query_thread_priority;
+
+extern ulong	srv_max_buf_pool_modified_pct;
+extern ulong	srv_max_purge_lag;
+
+extern ulong	srv_replication_delay;
+
+extern ulint	srv_io_capacity;
+extern long long	srv_ibuf_max_size;
+extern ulint	srv_ibuf_active_contract;
+extern ulint	srv_ibuf_accel_rate;
+extern ulint	srv_flush_neighbor_pages;
+extern ulint	srv_enable_unsafe_group_commit;
+extern ulint	srv_read_ahead;
+extern ulint	srv_adaptive_checkpoint;
+
+extern ulint	srv_extra_rsegments;
+
+/*-------------------------------------------*/
+
+extern ulint	srv_n_rows_inserted;
+extern ulint	srv_n_rows_updated;
+extern ulint	srv_n_rows_deleted;
+extern ulint	srv_n_rows_read;
+
+extern ibool	srv_print_innodb_monitor;
+extern ibool	srv_print_innodb_lock_monitor;
+extern ibool	srv_print_innodb_tablespace_monitor;
+extern ibool	srv_print_verbose_log;
+extern ibool	srv_print_innodb_table_monitor;
+
+extern ibool	srv_lock_timeout_and_monitor_active;
+extern ibool	srv_error_monitor_active;
+
+extern ulong	srv_n_spin_wait_rounds;
+extern ulong	srv_n_free_tickets_to_enter;
+extern ulong	srv_thread_sleep_delay;
+extern ulint	srv_spin_wait_delay;
+extern ibool	srv_priority_boost;
+
+extern ulint	srv_mem_pool_size;
+extern ulint	srv_lock_table_size;
+
+#ifdef UNIV_DEBUG
+extern ibool	srv_print_thread_releases;
+extern ibool	srv_print_lock_waits;
+extern ibool	srv_print_buf_io;
+extern ibool	srv_print_log_io;
+extern ibool	srv_print_latch_waits;
+#else /* UNIV_DEBUG */
+# define srv_print_thread_releases	FALSE
+# define srv_print_lock_waits		FALSE
+# define srv_print_buf_io		FALSE
+# define srv_print_log_io		FALSE
+# define srv_print_latch_waits		FALSE
+#endif /* UNIV_DEBUG */
+
+extern ulint	srv_activity_count;
+extern ulint	srv_fatal_semaphore_wait_threshold;
+extern ulint	srv_dml_needed_delay;
+
+extern mutex_t*	kernel_mutex_temp;/* mutex protecting the server, trx structs,
+				query threads, and lock table: we allocate
+				it from dynamic memory to get it to the
+				same DRAM page as other hotspot semaphores */
+#define kernel_mutex (*kernel_mutex_temp)
+
+#define SRV_MAX_N_IO_THREADS	100
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+extern const char* srv_io_thread_op_info[];
+extern const char* srv_io_thread_function[];
+
+/* the number of log write requests done */
+extern ulint srv_log_write_requests;
+
+/* the number of physical writes to the log performed */
+extern ulint srv_log_writes;
+
+/* amount of data written to the log files in bytes */
+extern ulint srv_os_log_written;
+
+/* number of writes currently being done to the log files */
+extern ulint srv_os_log_pending_writes;
+
+/* we increase this counter when we do not have enough space in the
+log buffer and have to flush it */
+extern ulint srv_log_waits;
+
+/* variable that counts the amount of data read in total (in bytes) */
+extern ulint srv_data_read;
+
+/* here we count the amount of data written in total (in bytes) */
+extern ulint srv_data_written;
+
+/* this variable counts the number of times the doublewrite buffer
+was flushed */
+extern ulint srv_dblwr_writes;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+extern ulint srv_dblwr_pages_written;
+
+/* in this variable we store the number of write requests issued */
+extern ulint
srv_buf_pool_write_requests; + +/* here we store the number of times when we had to wait for a free page +in the buffer pool. It happens when the buffer pool is full and we need +to make a flush, in order to be able to read or create a page. */ +extern ulint srv_buf_pool_wait_free; + +/* variable to count the number of pages that were written from the +buffer pool to disk */ +extern ulint srv_buf_pool_flushed; + +/* variable to count the number of buffer pool reads that led to the +reading of a disk page */ +extern ulint srv_buf_pool_reads; + +/* variable to count the number of sequential read-aheads were done */ +extern ulint srv_read_ahead_seq; + +/* variable to count the number of random read-aheads were done */ +extern ulint srv_read_ahead_rnd; + +/* In this structure we store status variables to be passed to MySQL */ +typedef struct export_var_struct export_struc; + +extern export_struc export_vars; + +typedef struct srv_sys_struct srv_sys_t; + +/* The server system */ +extern srv_sys_t* srv_sys; + +/* Alternatives for the file flush option in Unix; see the InnoDB manual +about what these mean */ +#define SRV_UNIX_FSYNC 1 /* This is the default */ +#define SRV_UNIX_O_DSYNC 2 +#define SRV_UNIX_LITTLESYNC 3 +#define SRV_UNIX_NOSYNC 4 +#define SRV_UNIX_O_DIRECT 5 + +/* Alternatives for file i/o in Windows */ +#define SRV_WIN_IO_NORMAL 1 +#define SRV_WIN_IO_UNBUFFERED 2 /* This is the default */ + +/* Alternatives for srv_force_recovery. Non-zero values are intended +to help the user get a damaged database up so that he can dump intact +tables and rows with SELECT INTO OUTFILE. The database must not otherwise +be used with these options! A bigger number below means that all precautions +of lower numbers are included. */ + +#define SRV_FORCE_IGNORE_CORRUPT 1 /* let the server run even if it + detects a corrupt page */ +#define SRV_FORCE_NO_BACKGROUND 2 /* prevent the main thread from + running: if a crash would occur + in purge, this prevents it */ +#define SRV_FORCE_NO_TRX_UNDO 3 /* do not run trx rollback after + recovery */ +#define SRV_FORCE_NO_IBUF_MERGE 4 /* prevent also ibuf operations: + if they would cause a crash, better + not do them */ +#define SRV_FORCE_NO_UNDO_LOG_SCAN 5 /* do not look at undo logs when + starting the database: InnoDB will + treat even incomplete transactions + as committed */ +#define SRV_FORCE_NO_LOG_REDO 6 /* do not do the log roll-forward + in connection with recovery */ + +/** Types of threads existing in the system. */ +enum srv_thread_type { + SRV_COM = 1, /**< threads serving communication and queries */ + SRV_CONSOLE, /**< thread serving console */ + SRV_WORKER, /**< threads serving parallelized queries and + queries released from lock wait */ +#if 0 + /* Utility threads */ + SRV_BUFFER, /**< thread flushing dirty buffer blocks */ + SRV_RECOVERY, /**< threads finishing a recovery */ + SRV_INSERT, /**< thread flushing the insert buffer to disk */ +#endif + SRV_MASTER /**< the master thread, (whose type number must + be biggest) */ +}; + +/************************************************************************* +Boots Innobase server. */ +UNIV_INTERN +ulint +srv_boot(void); +/*==========*/ + /* out: DB_SUCCESS or error code */ +/************************************************************************* +Initializes the server. */ +UNIV_INTERN +void +srv_init(void); +/*==========*/ +/************************************************************************* +Frees the OS fast mutex created in srv_boot(). 
*/
+UNIV_INTERN
+void
+srv_free(void);
+/*==========*/
+/*************************************************************************
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void);
+/*==================*/
+/*************************************************************************
+Gets the number of threads in the system. */
+UNIV_INTERN
+ulint
+srv_get_n_threads(void);
+/*===================*/
+/*************************************************************************
+Returns the calling thread type. */
+
+enum srv_thread_type
+srv_get_thread_type(void);
+/*=====================*/
+			/* out: SRV_COM, ... */
+/*************************************************************************
+Sets the info describing an i/o thread's current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+	ulint		i,	/* in: the 'segment' of the i/o thread */
+	const char*	str);	/* in: constant char string describing the
+				state */
+/*************************************************************************
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller! */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+					/* out: number of threads
+					released: this may be < n if
+					not enough threads were
+					suspended at the moment */
+	enum srv_thread_type	type,	/* in: thread type */
+	ulint			n);	/* in: number of threads to release */
+/*************************************************************************
+The master thread controlling the server. */
+UNIV_INTERN
+os_thread_ret_t
+srv_master_thread(
+/*==============*/
+			/* out: a dummy parameter */
+	void*	arg);	/* in: a dummy parameter required by
+			os_thread_create */
+/***********************************************************************
+Tells the Innobase server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void);
+/*===============================*/
+/***********************************************************************
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void);
+/*========================*/
+/*************************************************************************
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+	trx_t*	trx);	/* in: transaction object associated with the
+			thread */
+/*************************************************************************
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+	trx_t*	trx);	/* in: transaction object associated with the
+			thread */
+/*************************************************************************
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement.
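+
+An illustrative sketch of the intended call protocol (an addition, not
+part of the original comment; trx is the transaction object of the
+calling thread):
+
+	srv_conc_enter_innodb(trx);	/* may queue if too many threads
+					are already inside InnoDB */
+	/* ... do work inside InnoDB ... */
+	srv_conc_exit_innodb(trx);	/* leave InnoDB */
+
+The force-exit variant below covers the lock-wait and end-of-statement
+cases.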
*/ +UNIV_INTERN +void +srv_conc_force_exit_innodb( +/*=======================*/ + trx_t* trx); /* in: transaction object associated with the + thread */ +/************************************************************************* +This must be called when a thread exits InnoDB. */ +UNIV_INTERN +void +srv_conc_exit_innodb( +/*=================*/ + trx_t* trx); /* in: transaction object associated with the + thread */ +/******************************************************************* +Puts a MySQL OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. */ +UNIV_INTERN +void +srv_suspend_mysql_thread( +/*=====================*/ + que_thr_t* thr); /* in: query thread associated with the MySQL + OS thread */ +/************************************************************************ +Releases a MySQL OS thread waiting for a lock to be released, if the +thread is already suspended. */ +UNIV_INTERN +void +srv_release_mysql_thread_if_suspended( +/*==================================*/ + que_thr_t* thr); /* in: query thread associated with the + MySQL OS thread */ +/************************************************************************* +A thread which wakes up threads whose lock wait may have lasted too long. +This also prints the info output by various InnoDB monitors. */ +UNIV_INTERN +os_thread_ret_t +srv_lock_timeout_and_monitor_thread( +/*================================*/ + /* out: a dummy parameter */ + void* arg); /* in: a dummy parameter required by + os_thread_create */ +/************************************************************************* +A thread which prints warnings about semaphore waits which have lasted +too long. These can be used to track bugs which cause hangs. */ +UNIV_INTERN +os_thread_ret_t +srv_error_monitor_thread( +/*=====================*/ + /* out: a dummy parameter */ + void* arg); /* in: a dummy parameter required by + os_thread_create */ +/********************************************************************** +Outputs to a file the output of the InnoDB Monitor. 
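+
+A minimal usage sketch (an addition; whether NULL is acceptable for the
+trx_start/trx_end output arguments is an assumption here, not something
+this header specifies):
+
+	srv_printf_innodb_monitor(stderr, NULL, NULL);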
*/ +UNIV_INTERN +void +srv_printf_innodb_monitor( +/*======================*/ + FILE* file, /* in: output stream */ + ulint* trx_start, /* out: file position of the start of + the list of active transactions */ + ulint* trx_end); /* out: file position of the end of + the list of active transactions */ + +/********************************************************************** +Function to pass InnoDB status variables to MySQL */ +UNIV_INTERN +void +srv_export_innodb_status(void); +/*=====================*/ + +/* Thread slot in the thread table */ +typedef struct srv_slot_struct srv_slot_t; + +/* Thread table is an array of slots */ +typedef srv_slot_t srv_table_t; + +/* In this structure we store status variables to be passed to MySQL */ +struct export_var_struct{ + ulint innodb_data_pending_reads; + ulint innodb_data_pending_writes; + ulint innodb_data_pending_fsyncs; + ulint innodb_data_fsyncs; + ulint innodb_data_read; + ulint innodb_data_writes; + ulint innodb_data_written; + ulint innodb_data_reads; + ulint innodb_buffer_pool_pages_total; + ulint innodb_buffer_pool_pages_data; + ulint innodb_buffer_pool_pages_dirty; + ulint innodb_buffer_pool_pages_misc; + ulint innodb_buffer_pool_pages_free; +#ifdef UNIV_DEBUG + ulint innodb_buffer_pool_pages_latched; +#endif /* UNIV_DEBUG */ + ulint innodb_buffer_pool_read_requests; + ulint innodb_buffer_pool_reads; + ulint innodb_buffer_pool_wait_free; + ulint innodb_buffer_pool_pages_flushed; + ulint innodb_buffer_pool_write_requests; + ulint innodb_buffer_pool_read_ahead_seq; + ulint innodb_buffer_pool_read_ahead_rnd; + ulint innodb_dblwr_pages_written; + ulint innodb_dblwr_writes; + ibool innodb_have_atomic_builtins; + ulint innodb_log_waits; + ulint innodb_log_write_requests; + ulint innodb_log_writes; + ulint innodb_os_log_written; + ulint innodb_os_log_fsyncs; + ulint innodb_os_log_pending_writes; + ulint innodb_os_log_pending_fsyncs; + ulint innodb_page_size; + ulint innodb_pages_created; + ulint innodb_pages_read; + ulint innodb_pages_written; + ulint innodb_row_lock_waits; + ulint innodb_row_lock_current_waits; + ib_int64_t innodb_row_lock_time; + ulint innodb_row_lock_time_avg; + ulint innodb_row_lock_time_max; + ulint innodb_rows_read; + ulint innodb_rows_inserted; + ulint innodb_rows_updated; + ulint innodb_rows_deleted; +}; + +/* The server system struct */ +struct srv_sys_struct{ + srv_table_t* threads; /* server thread table */ + UT_LIST_BASE_NODE_T(que_thr_t) + tasks; /* task queue */ + dict_index_t* dummy_ind1; /* dummy index for old-style + supremum and infimum records */ + dict_index_t* dummy_ind2; /* dummy index for new-style + supremum and infimum records */ +}; + +extern ulint srv_n_threads_active[]; + +#endif diff --git a/storage/xtradb/include/srv0srv.ic b/storage/xtradb/include/srv0srv.ic new file mode 100644 index 00000000000..93d675f1dca --- /dev/null +++ b/storage/xtradb/include/srv0srv.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Server main program + +Created 10/4/1995 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/srv0start.h b/storage/xtradb/include/srv0start.h new file mode 100644 index 00000000000..15fa3b8f95f --- /dev/null +++ b/storage/xtradb/include/srv0start.h @@ -0,0 +1,118 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Starts the Innobase database server + +Created 10/10/1995 Heikki Tuuri +*******************************************************/ + +#ifndef srv0start_h +#define srv0start_h + +#include "univ.i" +#include "ut0byte.h" + +/************************************************************************* +Normalizes a directory path for Windows: converts slashes to backslashes. */ +UNIV_INTERN +void +srv_normalize_path_for_win( +/*=======================*/ + char* str); /* in/out: null-terminated character string */ +/************************************************************************* +Reads the data files and their sizes from a character string given in +the .cnf file. */ +UNIV_INTERN +ibool +srv_parse_data_file_paths_and_sizes( +/*================================*/ + /* out: TRUE if ok, FALSE on parse error */ + char* str); /* in/out: the data file path string */ +/************************************************************************* +Reads log group home directories from a character string given in +the .cnf file. */ +UNIV_INTERN +ibool +srv_parse_log_group_home_dirs( +/*==========================*/ + /* out: TRUE if ok, FALSE on parse error */ + char* str); /* in/out: character string */ +/************************************************************************* +Frees the memory allocated by srv_parse_data_file_paths_and_sizes() +and srv_parse_log_group_home_dirs(). */ +UNIV_INTERN +void +srv_free_paths_and_sizes(void); +/*==========================*/ +/************************************************************************* +Adds a slash or a backslash to the end of a string if it is missing +and the string is not empty. 
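+
+For example (an illustrative note, not part of the original comment):
+passed "/var/lib/mysql" the function yields "/var/lib/mysql/", while
+"/var/lib/mysql/" and the empty string come back unchanged.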
*/
+UNIV_INTERN
+char*
+srv_add_path_separator_if_needed(
+/*=============================*/
+			/* out: string which has the separator if the
+			string is not empty */
+	char*	str);	/* in: null-terminated character string */
+/********************************************************************
+Starts Innobase and creates a new database if database files
+are not found and the user wants one to be created. */
+UNIV_INTERN
+int
+innobase_start_or_create_for_mysql(void);
+/*====================================*/
+				/* out: DB_SUCCESS or error code */
+/********************************************************************
+Shuts down the Innobase database. */
+UNIV_INTERN
+int
+innobase_shutdown_for_mysql(void);
+/*=============================*/
+				/* out: DB_SUCCESS or error code */
+extern	ib_uint64_t	srv_shutdown_lsn;
+extern	ib_uint64_t	srv_start_lsn;
+
+#ifdef __NETWARE__
+void set_panic_flag_for_netware(void);
+#endif
+
+#ifdef HAVE_DARWIN_THREADS
+extern	ibool	srv_have_fullfsync;
+#endif
+
+extern	ibool	srv_is_being_started;
+extern	ibool	srv_was_started;
+extern	ibool	srv_startup_is_before_trx_rollback_phase;
+extern	ibool	srv_is_being_shut_down;
+
+extern	ibool	srv_start_raw_disk_in_use;
+
+/* At a shutdown the value first climbs from 0 to SRV_SHUTDOWN_CLEANUP
+and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+
+extern	ulint	srv_shutdown_state;
+
+#define SRV_SHUTDOWN_CLEANUP	   1
+#define SRV_SHUTDOWN_LAST_PHASE	   2
+#define SRV_SHUTDOWN_EXIT_THREADS  3
+
+/* Log 'spaces' have ids >= this */
+#define SRV_LOG_SPACE_FIRST_ID		0xFFFFFFF0UL
+
+#endif
diff --git a/storage/xtradb/include/sync0arr.h b/storage/xtradb/include/sync0arr.h
new file mode 100644
index 00000000000..cc01c9ac5c8
--- /dev/null
+++ b/storage/xtradb/include/sync0arr.h
@@ -0,0 +1,138 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+
+typedef struct sync_cell_struct		sync_cell_t;
+typedef struct sync_array_struct	sync_array_t;
+
+#define SYNC_ARRAY_OS_MUTEX	1
+#define SYNC_ARRAY_MUTEX	2
+
+/***********************************************************************
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
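+
+An illustrative creation sketch (an addition; the cell count of 1000 is
+an arbitrary example value):
+
+	sync_array_t*	arr = sync_array_create(1000, SYNC_ARRAY_MUTEX);
+	/* ... reserve cells and wait on them ... */
+	sync_array_free(arr);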
*/
+UNIV_INTERN
+sync_array_t*
+sync_array_create(
+/*==============*/
+				/* out, own: created wait array */
+	ulint	n_cells,	/* in: number of cells in the array
+				to create */
+	ulint	protection);	/* in: either SYNC_ARRAY_OS_MUTEX or
+				SYNC_ARRAY_MUTEX: determines the type
+				of mutex protecting the data structure */
+/**********************************************************************
+Frees the resources in a wait array. */
+UNIV_INTERN
+void
+sync_array_free(
+/*============*/
+	sync_array_t*	arr);	/* in, own: sync wait array */
+/**********************************************************************
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state. */
+UNIV_INTERN
+void
+sync_array_reserve_cell(
+/*====================*/
+	sync_array_t*	arr,	/* in: wait array */
+	void*		object, /* in: pointer to the object to wait for */
+	ulint		type,	/* in: lock request type */
+	const char*	file,	/* in: file where requested */
+	ulint		line,	/* in: line where requested */
+	ulint*		index); /* out: index of the reserved cell */
+/**********************************************************************
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case it prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+	sync_array_t*	arr,	/* in: wait array */
+	ulint		index); /* in: index of the reserved cell */
+/**********************************************************************
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+	sync_array_t*	arr,	/* in: wait array */
+	ulint		index);	/* in: index of the cell in array */
+/**************************************************************************
+Note that one of the wait objects was signalled. */
+UNIV_INTERN
+void
+sync_array_object_signalled(
+/*========================*/
+	sync_array_t*	arr);	/* in: wait array */
+/**************************************************************************
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about once a second in the server. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void);
+/*====================================*/
+/**************************************************************************
+Prints warnings of long semaphore waits to stderr. */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(void);
+/*=============================*/
+			/* out: TRUE if fatal semaphore wait threshold
+			was exceeded */
+/************************************************************************
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+	sync_array_t*	arr);	/* in: sync wait array */
+/**************************************************************************
+Prints info of the wait array.
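+
+For example (an addition; sync_primary_wait_array is assumed to be the
+global wait array declared elsewhere in the sync subsystem):
+
+	sync_array_print_info(stderr, sync_primary_wait_array);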
*/ +UNIV_INTERN +void +sync_array_print_info( +/*==================*/ + FILE* file, /* in: file where to print */ + sync_array_t* arr); /* in: wait array */ + + +#ifndef UNIV_NONINL +#include "sync0arr.ic" +#endif + +#endif diff --git a/storage/xtradb/include/sync0arr.ic b/storage/xtradb/include/sync0arr.ic new file mode 100644 index 00000000000..09a562a4723 --- /dev/null +++ b/storage/xtradb/include/sync0arr.ic @@ -0,0 +1,26 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The wait array for synchronization primitives + +Inline code + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + diff --git a/storage/xtradb/include/sync0rw.h b/storage/xtradb/include/sync0rw.h new file mode 100644 index 00000000000..e3fe0dc9ccc --- /dev/null +++ b/storage/xtradb/include/sync0rw.h @@ -0,0 +1,567 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The read-write lock (for threads, not for database transactions) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0rw_h +#define sync0rw_h + +#include "univ.i" +#include "ut0lst.h" +#include "sync0sync.h" +#include "os0sync.h" + +/* The following undef is to prevent a name conflict with a macro +in MySQL: */ +#undef rw_lock_t + +/* Latch types; these are used also in btr0btr.h: keep the numerical values +smaller than 30 and the order of the numerical values like below! 
*/
+#define RW_S_LATCH	1
+#define RW_X_LATCH	2
+#define RW_NO_LATCH	3
+
+/* We decrement lock_word by this amount for each x_lock. It is also the
+start value for the lock_word, meaning that it limits the maximum number
+of concurrent read locks before the rw_lock breaks. The current value of
+0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
+#define X_LOCK_DECR		0x00100000
+
+typedef struct rw_lock_struct		rw_lock_t;
+#ifdef UNIV_SYNC_DEBUG
+typedef struct rw_lock_debug_struct	rw_lock_debug_t;
+#endif /* UNIV_SYNC_DEBUG */
+
+typedef UT_LIST_BASE_NODE_T(rw_lock_t)	rw_lock_list_t;
+
+extern rw_lock_list_t	rw_lock_list;
+extern mutex_t		rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+extern mutex_t		rw_lock_debug_mutex;
+extern os_event_t	rw_lock_debug_event;	/* If deadlock detection does
+					not get immediately the mutex it
+					may wait for this event */
+extern ibool		rw_lock_debug_waiters;	/* This is set to TRUE, if
+					there may be waiters for the event */
+#endif /* UNIV_SYNC_DEBUG */
+
+extern	ib_int64_t	rw_s_spin_wait_count;
+extern	ib_int64_t	rw_s_spin_round_count;
+extern	ib_int64_t	rw_s_exit_count;
+extern	ib_int64_t	rw_s_os_wait_count;
+extern	ib_int64_t	rw_x_spin_wait_count;
+extern	ib_int64_t	rw_x_spin_round_count;
+extern	ib_int64_t	rw_x_os_wait_count;
+extern	ib_int64_t	rw_x_exit_count;
+
+/**********************************************************************
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(L, level)				\
+	rw_lock_create_func((L), (level), #L, __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(L, level)				\
+	rw_lock_create_func((L), #L, __FILE__, __LINE__)
+# endif /* UNIV_SYNC_DEBUG */
+#else /* UNIV_DEBUG */
+# define rw_lock_create(L, level)				\
+	rw_lock_create_func((L), __FILE__, __LINE__)
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+	rw_lock_t*	lock,		/* in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/* in: level */
+# endif /* UNIV_SYNC_DEBUG */
+	const char*	cmutex_name,	/* in: mutex name */
+#endif /* UNIV_DEBUG */
+	const char*	cfile_name,	/* in: file name where created */
+	ulint		cline);		/* in: file line where created */
+/**********************************************************************
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state.
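+
+An illustrative lifecycle (an addition; SYNC_LEVEL_VARYING is assumed to
+be a latching-order level from sync0sync.h, and the level argument is
+only used in UNIV_SYNC_DEBUG builds):
+
+	rw_lock_t	latch;
+
+	rw_lock_create(&latch, SYNC_LEVEL_VARYING);
+	rw_lock_s_lock(&latch);
+	/* ... read the protected structure ... */
+	rw_lock_s_unlock(&latch);
+	rw_lock_free(&latch);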
*/ +UNIV_INTERN +void +rw_lock_free( +/*=========*/ + rw_lock_t* lock); /* in: rw-lock */ +#ifdef UNIV_DEBUG +/********************************************************************** +Checks that the rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. */ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + rw_lock_t* lock); +#endif /* UNIV_DEBUG */ +/****************************************************************** +NOTE! The following macros should be used in rw s-locking, not the +corresponding function. */ + +#define rw_lock_s_lock(M) rw_lock_s_lock_func(\ + (M), 0, __FILE__, __LINE__) +/****************************************************************** +NOTE! The following macros should be used in rw s-locking, not the +corresponding function. */ + +#define rw_lock_s_lock_gen(M, P) rw_lock_s_lock_func(\ + (M), (P), __FILE__, __LINE__) +/****************************************************************** +NOTE! The following macros should be used in rw s-locking, not the +corresponding function. */ + +#define rw_lock_s_lock_nowait(M, F, L) rw_lock_s_lock_low(\ + (M), 0, (F), (L)) +/********************************************************************** +Low-level function which tries to lock an rw-lock in s-mode. Performs no +spinning. */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + /* out: TRUE if success */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass __attribute__((unused)), + /* in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /* in: file name where lock requested */ + ulint line); /* in: line where requested */ +/********************************************************************** +NOTE! Use the corresponding macro, not directly this function, except if +you supply the file name and line number. Lock an rw-lock in shared mode +for the current thread. If the rw-lock is locked in exclusive mode, or +there is an exclusive lock request waiting, the function spins a preset +time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before +suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/* in: file name where lock requested */ + ulint line); /* in: line where requested */ +/********************************************************************** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. */ +UNIV_INLINE +ibool +rw_lock_x_lock_func_nowait( +/*=======================*/ + /* out: TRUE if success */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name,/* in: file name where lock requested */ + ulint line); /* in: line where requested */ +/********************************************************************** +Releases a shared mode lock. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ + rw_lock_t* lock /* in: rw-lock */ +#ifdef UNIV_SYNC_DEBUG + ,ulint pass /* in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + ); +/*********************************************************************** +Releases a shared mode lock. 
*/
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_s_unlock(L)	rw_lock_s_unlock_func(L, 0)
+#else
+#define rw_lock_s_unlock(L)	rw_lock_s_unlock_func(L)
+#endif
+/***********************************************************************
+Releases a shared mode lock. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_s_unlock_gen(L, P)	rw_lock_s_unlock_func(L, P)
+#else
+#define rw_lock_s_unlock_gen(L, P)	rw_lock_s_unlock_func(L)
+#endif
+/******************************************************************
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock(M)	rw_lock_x_lock_func(\
+		(M), 0, __FILE__, __LINE__)
+/******************************************************************
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_gen(M, P)	rw_lock_x_lock_func(\
+		(M), (P), __FILE__, __LINE__)
+/******************************************************************
+NOTE! The following macros should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_nowait(M)	rw_lock_x_lock_func_nowait(\
+		(M), __FILE__, __LINE__)
+/**********************************************************************
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+	rw_lock_t*	lock,	/* in: pointer to rw-lock */
+	ulint		pass,	/* in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/* in: file name where lock requested */
+	ulint		line);	/* in: line where requested */
+/**********************************************************************
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+	rw_lock_t*	lock	/* in: rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+	,ulint		pass	/* in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	);
+/***********************************************************************
+Releases an exclusive mode lock. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_x_unlock(L)	rw_lock_x_unlock_func(L, 0)
+#else
+#define rw_lock_x_unlock(L)	rw_lock_x_unlock_func(L)
+#endif
+/***********************************************************************
+Releases an exclusive mode lock. */
+
+#ifdef UNIV_SYNC_DEBUG
+#define rw_lock_x_unlock_gen(L, P)	rw_lock_x_unlock_func(L, P)
+#else
+#define rw_lock_x_unlock_gen(L, P)	rw_lock_x_unlock_func(L)
+#endif
+/**********************************************************************
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and none else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex.
*/
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+	rw_lock_t*	lock,		/* in: pointer to rw-lock */
+	const char*	file_name,	/* in: file name where requested */
+	ulint		line		/* in: line where lock requested */
+);
+/**********************************************************************
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and none else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+	rw_lock_t*	lock,		/* in: pointer to rw-lock */
+	const char*	file_name,	/* in: file name where requested */
+	ulint		line		/* in: line where lock requested */
+);
+/**********************************************************************
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want the current thread
+to be able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+	rw_lock_t*	lock);	/* in: lock which was x-locked in the
+				buffer read */
+/**********************************************************************
+Releases a shared mode lock when we know there are no waiters and none
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+	rw_lock_t*	lock);	/* in: rw-lock */
+/**********************************************************************
+Releases an exclusive mode lock when we know there are no waiters, and
+none else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+	rw_lock_t*	lock);	/* in: rw-lock */
+/**********************************************************************
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call. */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+				/* out: value of writer_count */
+	rw_lock_t*	lock);	/* in: rw-lock */
+/************************************************************************
+Accessor functions for rw lock. */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+	rw_lock_t*	lock);
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+	rw_lock_t*	lock);
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+	rw_lock_t*	lock);
+/**********************************************************************
+Decrements lock_word the specified amount if it is greater than 0.
+This is used by both s_lock and x_lock operations. */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+				/* out: TRUE if decr occurs */
+	rw_lock_t*	lock,	/* in: rw-lock */
+	ulint		amount);	/* in: amount to decrement */
+/**********************************************************************
+Increments lock_word the specified amount and returns new value.
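+
+A worked example of the lock_word arithmetic (an addition, derived from
+the X_LOCK_DECR comment above): starting from the unlocked value
+X_LOCK_DECR (0x00100000), an s-lock decrements lock_word by 1, giving
+0x000FFFFF; an x-lock decrements it by X_LOCK_DECR, giving 0; each
+unlock increments it back by the same amount.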
*/
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+				/* out: lock->lock_word after increment */
+	rw_lock_t*	lock,	/* in: rw-lock */
+	ulint		amount);	/* in: amount to increment */
+/**********************************************************************
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+	rw_lock_t*	lock,		/* in/out: lock to work on */
+	ibool		recursive);	/* in: TRUE if recursion
+					allowed */
+#ifdef UNIV_SYNC_DEBUG
+/**********************************************************************
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+	rw_lock_t*	lock,		/* in: rw-lock */
+	ulint		lock_type);	/* in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+#endif /* UNIV_SYNC_DEBUG */
+/**********************************************************************
+Checks if somebody has locked the rw-lock in the specified mode. */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+	rw_lock_t*	lock,		/* in: rw-lock */
+	ulint		lock_type);	/* in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+#ifdef UNIV_SYNC_DEBUG
+/*******************************************************************
+Prints debug info of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+	rw_lock_t*	lock);	/* in: rw-lock */
+/*******************************************************************
+Prints debug info of currently locked rw-locks. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+	FILE*	file);		/* in: file where to print */
+/*******************************************************************
+Returns the number of currently locked rw-locks.
+Works only in the debug version. */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void);
+/*==================*/
+
+/*#####################################################################*/
+
+/**********************************************************************
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void);
+/*==========================*/
+/**********************************************************************
+Releases the debug mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void);
+/*==========================*/
+/*************************************************************************
+Prints info of a debug struct. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+	rw_lock_debug_t*	info);	/* in: debug struct */
+#endif /* UNIV_SYNC_DEBUG */
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! The structure used in the spin lock
+implementation of a read-write lock.
Several threads may have a shared lock +simultaneously in this lock, but only one writer may have an exclusive lock, +in which case no shared locks are allowed. To prevent starving of a writer +blocked by readers, a writer may queue for x-lock by decrementing lock_word: +no new readers will be let in while the thread waits for readers to exit. */ + +struct rw_lock_struct { + volatile lint lock_word; + /* Holds the state of the lock. */ + volatile ulint waiters;/* 1: there are waiters */ + volatile ibool recursive;/* Default value FALSE which means the lock + is non-recursive. The value is typically set + to TRUE making normal rw_locks recursive. In + case of asynchronous IO, when a non-zero + value of 'pass' is passed then we keep the + lock non-recursive. + This flag also tells us about the state of + writer_thread field. If this flag is set + then writer_thread MUST contain the thread + id of the current x-holder or wait-x thread. + This flag must be reset in x_unlock + functions before incrementing the lock_word */ + volatile os_thread_id_t writer_thread; + /* Thread id of writer thread. Is only + guaranteed to have sane and non-stale + value iff recursive flag is set. */ + os_event_t event; /* Used by sync0arr.c for thread queueing */ + os_event_t wait_ex_event; + /* Event for next-writer to wait on. A thread + must decrement lock_word before waiting. */ +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_t mutex; /* The mutex protecting rw_lock_struct */ +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + UT_LIST_NODE_T(rw_lock_t) list; + /* All allocated rw locks are put into a + list */ +#ifdef UNIV_SYNC_DEBUG + UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; + /* In the debug version: pointer to the debug + info list of the lock */ + ulint level; /* Level in the global latching order. */ +#endif /* UNIV_SYNC_DEBUG */ + ulint count_os_wait; /* Count of os_waits. 
May not be accurate */ + const char* cfile_name;/* File name where lock created */ + /* last s-lock file/line is not guaranteed to be correct */ + const char* last_s_file_name;/* File name where last s-locked */ + const char* last_x_file_name;/* File name where last x-locked */ + ibool writer_is_wait_ex; + /* This is TRUE if the writer field is + RW_LOCK_WAIT_EX; this field is located far + from the memory update hotspot fields which + are at the start of this struct, thus we can + peek this field without causing much memory + bus traffic */ + unsigned cline:14; /* Line where created */ + unsigned last_s_line:14; /* Line number where last time s-locked */ + unsigned last_x_line:14; /* Line number where last time x-locked */ + ulint magic_n; +}; + +#define RW_LOCK_MAGIC_N 22643 + +#ifdef UNIV_SYNC_DEBUG +/* The structure for storing debug info of an rw-lock */ +struct rw_lock_debug_struct { + + os_thread_id_t thread_id; /* The thread id of the thread which + locked the rw-lock */ + ulint pass; /* Pass value given in the lock operation */ + ulint lock_type; /* Type of the lock: RW_LOCK_EX, + RW_LOCK_SHARED, RW_LOCK_WAIT_EX */ + const char* file_name;/* File name where the lock was obtained */ + ulint line; /* Line where the rw-lock was locked */ + UT_LIST_NODE_T(rw_lock_debug_t) list; + /* Debug structs are linked in a two-way + list */ +}; +#endif /* UNIV_SYNC_DEBUG */ + +#ifndef UNIV_NONINL +#include "sync0rw.ic" +#endif + +#endif diff --git a/storage/xtradb/include/sync0rw.ic b/storage/xtradb/include/sync0rw.ic new file mode 100644 index 00000000000..9e7e4dc9bd8 --- /dev/null +++ b/storage/xtradb/include/sync0rw.ic @@ -0,0 +1,635 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The read-write lock (for threads) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +/********************************************************************** +Lock an rw-lock in shared mode for the current thread. If the rw-lock is +locked in exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), +waiting for the lock before suspending the thread. 
*/ +UNIV_INTERN +void +rw_lock_s_lock_spin( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/* in: file name where lock requested */ + ulint line); /* in: line where requested */ +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Inserts the debug information for an rw-lock. */ +UNIV_INTERN +void +rw_lock_add_debug_info( +/*===================*/ + rw_lock_t* lock, /* in: rw-lock */ + ulint pass, /* in: pass value */ + ulint lock_type, /* in: lock type */ + const char* file_name, /* in: file where requested */ + ulint line); /* in: line where requested */ +/********************************************************************** +Removes a debug information struct for an rw-lock. */ +UNIV_INTERN +void +rw_lock_remove_debug_info( +/*======================*/ + rw_lock_t* lock, /* in: rw-lock */ + ulint pass, /* in: pass value */ + ulint lock_type); /* in: lock type */ +#endif /* UNIV_SYNC_DEBUG */ + +/************************************************************************ +Accessor functions for rw lock. */ +UNIV_INLINE +ulint +rw_lock_get_waiters( +/*================*/ + /* out: 1 if waiters, 0 otherwise */ + rw_lock_t* lock) /* in: rw-lock */ +{ + return(lock->waiters); +} + +/************************************************************************ +Sets lock->waiters to 1. It is not an error if lock->waiters is already +1. On platforms where ATOMIC builtins are used this function enforces a +memory barrier. */ +UNIV_INLINE +void +rw_lock_set_waiter_flag( +/*====================*/ + rw_lock_t* lock) /* in: rw-lock */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + os_compare_and_swap(&lock->waiters, 0, 1); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lock->waiters = 1; +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/************************************************************************ +Resets lock->waiters to 0. It is not an error if lock->waiters is already +0. On platforms where ATOMIC builtins are used this function enforces a +memory barrier. */ +UNIV_INLINE +void +rw_lock_reset_waiter_flag( +/*======================*/ + rw_lock_t* lock) /* in: rw-lock */ +{ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + os_compare_and_swap(&lock->waiters, 1, 0); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + lock->waiters = 0; +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/********************************************************************** +Returns the write-status of the lock - this function made more sense +with the old rw_lock implementation. */ +UNIV_INLINE +ulint +rw_lock_get_writer( +/*===============*/ + rw_lock_t* lock) +{ + lint lock_word = lock->lock_word; + if(lock_word > 0) { + /* return NOT_LOCKED in s-lock state, like the writer + member of the old lock implementation. */ + return(RW_LOCK_NOT_LOCKED); + } else if (((-lock_word) % X_LOCK_DECR) == 0) { + return(RW_LOCK_EX); + } else { + ut_ad(lock_word > -X_LOCK_DECR); + return(RW_LOCK_WAIT_EX); + } +} + +/********************************************************************** +Returns number of readers. 
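+
+For example (an illustrative addition): with X_LOCK_DECR = 0x00100000,
+a lock_word of 0x000FFFFD means X_LOCK_DECR - lock_word = 3 concurrent
+readers and no x-waiters.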
*/
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+	rw_lock_t*	lock)
+{
+	lint lock_word = lock->lock_word;
+	if(lock_word > 0) {
+		/* s-locked, no x-waiters */
+		return(X_LOCK_DECR - lock_word);
+	} else if (lock_word < 0 && lock_word > -X_LOCK_DECR) {
+		/* s-locked, with x-waiters */
+		return((ulint)(-lock_word));
+	}
+	return(0);
+}
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+UNIV_INLINE
+mutex_t*
+rw_lock_get_mutex(
+/*==============*/
+	rw_lock_t*	lock)
+{
+	return(&(lock->mutex));
+}
+#endif
+
+/**********************************************************************
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call. */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+				/* out: value of writer_count */
+	rw_lock_t*	lock)	/* in: rw-lock */
+{
+	lint lock_copy = lock->lock_word;
+	/* If there is a reader, lock_word is not divisible by X_LOCK_DECR */
+	if(lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) {
+		return(0);
+	}
+	return(((-lock_copy) / X_LOCK_DECR) + 1);
+}
+
+/**********************************************************************
+Two different implementations for decrementing the lock_word of a rw_lock:
+one for systems supporting atomic operations, one for others. This does
+not support recursive x-locks: they should be handled by the caller and
+need not be atomic since they are performed by the current lock holder.
+Returns true if the decrement was made, false if not. */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+				/* out: TRUE if decr occurs */
+	rw_lock_t*	lock,	/* in: rw-lock */
+	ulint		amount)	/* in: amount of decrement */
+{
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+
+	lint local_lock_word = lock->lock_word;
+	while (local_lock_word > 0) {
+		if(os_compare_and_swap(&(lock->lock_word),
+				       local_lock_word,
+				       local_lock_word - amount)) {
+			return(TRUE);
+		}
+		local_lock_word = lock->lock_word;
+	}
+	return(FALSE);
+
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+	ibool success = FALSE;
+	mutex_enter(&(lock->mutex));
+	if(lock->lock_word > 0) {
+		lock->lock_word -= amount;
+		success = TRUE;
+	}
+	mutex_exit(&(lock->mutex));
+	return(success);
+
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/**********************************************************************
+Two different implementations for incrementing the lock_word of a rw_lock:
+one for systems supporting atomic operations, one for others.
+Returns the value of lock_word after increment. */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+				/* out: lock->lock_word after increment */
+	rw_lock_t*	lock,	/* in: rw-lock */
+	ulint		amount)	/* in: amount of increment */
+{
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+
+	return(os_atomic_increment(&(lock->lock_word), amount));
+
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+	lint local_lock_word;
+
+	mutex_enter(&(lock->mutex));
+
+	lock->lock_word += amount;
+	local_lock_word = lock->lock_word;
+
+	mutex_exit(&(lock->mutex));
+
+	return(local_lock_word);
+
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/**********************************************************************
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering.
Note that it is assumed that the caller of this function +effectively owns the lock i.e.: nobody else is allowed to modify +lock->writer_thread at this point in time. +The protocol is that lock->writer_thread MUST be updated BEFORE the +lock->recursive flag is set. */ +UNIV_INLINE +void +rw_lock_set_writer_id_and_recursion_flag( +/*=====================================*/ + rw_lock_t* lock, /* in/out: lock to work on */ + ibool recursive) /* in: TRUE if recursion + allowed */ +{ + os_thread_id_t curr_thread = os_thread_get_curr_id(); + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + os_thread_id_t local_thread; + ibool success; + + /* Prevent Valgrind warnings about writer_thread being + uninitialized. It does not matter if writer_thread is + uninitialized, because we are comparing writer_thread against + itself, and the operation should always succeed. */ + UNIV_MEM_VALID(&lock->writer_thread, sizeof lock->writer_thread); + + local_thread = lock->writer_thread; + success = os_compare_and_swap(&lock->writer_thread, + local_thread, curr_thread); + ut_a(success); + lock->recursive = recursive; + +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + + mutex_enter(&lock->mutex); + lock->writer_thread = curr_thread; + lock->recursive = recursive; + mutex_exit(&lock->mutex); + +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +} + +/********************************************************************** +Low-level function which tries to lock an rw-lock in s-mode. Performs no +spinning. */ +UNIV_INLINE +ibool +rw_lock_s_lock_low( +/*===============*/ + /* out: TRUE if success */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass __attribute__((unused)), + /* in: pass value; != 0, if the lock will be + passed to another thread to unlock */ + const char* file_name, /* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + /* TODO: study performance of UNIV_LIKELY branch prediction hints. */ + if (!rw_lock_lock_word_decr(lock, 1)) { + /* Locking did not succeed */ + return(FALSE); + } + +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line); +#endif + /* These debugging values are not set safely: they may be incorrect + or even refer to a line that is invalid for the file name. */ + lock->last_s_file_name = file_name; + lock->last_s_line = line; + + return(TRUE); /* locking succeeded */ +} + +/********************************************************************** +Low-level function which locks an rw-lock in s-mode when we know that it +is possible and none else is currently accessing the rw-lock structure. +Then we can do the locking without reserving the mutex. */ +UNIV_INLINE +void +rw_lock_s_lock_direct( +/*==================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ +{ + ut_ad(lock->lock_word == X_LOCK_DECR); + + /* Indicate there is a new reader by decrementing lock_word */ + lock->lock_word--; + + lock->last_s_file_name = file_name; + lock->last_s_line = line; + +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, line); +#endif +} + +/********************************************************************** +Low-level function which locks an rw-lock in x-mode when we know that it +is not locked and none else is currently accessing the rw-lock structure. +Then we can do the locking without reserving the mutex. 
*/ +UNIV_INLINE +void +rw_lock_x_lock_direct( +/*==================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ +{ + ut_ad(rw_lock_validate(lock)); + ut_ad(lock->lock_word == X_LOCK_DECR); + + lock->lock_word -= X_LOCK_DECR; + lock->writer_thread = os_thread_get_curr_id(); + lock->recursive = TRUE; + + lock->last_x_file_name = file_name; + lock->last_x_line = line; + +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); +#endif +} + +/********************************************************************** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in shared mode for the current thread. If the rw-lock is locked +in exclusive mode, or there is an exclusive lock request waiting, the +function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for +the lock, before suspending the thread. */ +UNIV_INLINE +void +rw_lock_s_lock_func( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + /* NOTE: As we do not know the thread ids for threads which have + s-locked a latch, and s-lockers will be served only after waiting + x-lock requests have been fulfilled, then if this thread already + owns an s-lock here, it may end up in a deadlock with another thread + which requests an x-lock here. Therefore, we will forbid recursive + s-locking of a latch: the following assert will warn the programmer + of the possibility of this kind of a deadlock. If we want to implement + safe recursive s-locking, we should keep in a list the thread ids of + the threads which have s-locked a latch. This would use some CPU + time. */ + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ +#endif /* UNIV_SYNC_DEBUG */ + + /* TODO: study performance of UNIV_LIKELY branch prediction hints. */ + if (rw_lock_s_lock_low(lock, pass, file_name, line)) { + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ + + rw_lock_s_lock_spin(lock, pass, file_name, line); + + return; + } +} + +/********************************************************************** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread if the lock can be +obtained immediately. 
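+A hedged usage sketch; the rw_lock_x_lock_nowait() macro, which supplies
+__FILE__ and __LINE__, is assumed to be the normal entry point, as with
+the other locking macros:
+
+	if (rw_lock_x_lock_nowait(lock)) {
+		... short critical section ...
+		rw_lock_x_unlock(lock);
+	} else {
+		... lock not available; retry or fall back ...
+	}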
*/ +UNIV_INLINE +ibool +rw_lock_x_lock_func_nowait( +/*=======================*/ + /* out: TRUE if success */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + os_thread_id_t curr_thread = os_thread_get_curr_id(); + + ibool success; + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + success = os_compare_and_swap(&(lock->lock_word), X_LOCK_DECR, 0); +#else + + success = FALSE; + mutex_enter(&(lock->mutex)); + if (lock->lock_word == X_LOCK_DECR) { + lock->lock_word = 0; + success = TRUE; + } + mutex_exit(&(lock->mutex)); + +#endif + if (success) { + rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); + + } else if (lock->recursive + && os_thread_eq(lock->writer_thread, curr_thread)) { + /* Relock: this lock_word modification is safe since no other + threads can modify (lock, unlock, or reserve) lock_word while + there is an exclusive writer and this is the writer thread. */ + lock->lock_word -= X_LOCK_DECR; + + ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0); + + } else { + /* Failure */ + return(FALSE); + } +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); +#endif + + lock->last_x_file_name = file_name; + lock->last_x_line = line; + + ut_ad(rw_lock_validate(lock)); + + return(TRUE); +} + +/********************************************************************** +Releases a shared mode lock. */ +UNIV_INLINE +void +rw_lock_s_unlock_func( +/*==================*/ + rw_lock_t* lock /* in: rw-lock */ +#ifdef UNIV_SYNC_DEBUG + ,ulint pass /* in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + ) +{ + ut_ad((lock->lock_word % X_LOCK_DECR) != 0); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); +#endif + + /* Increment lock_word to indicate 1 less reader */ + if (rw_lock_lock_word_incr(lock, 1) == 0) { + + /* wait_ex waiter exists. It may not be asleep, but we signal + anyway. We do not wake other waiters, because they can't + exist without wait_ex waiter and wait_ex waiter goes first.*/ + os_event_set(lock->wait_ex_event); + sync_array_object_signalled(sync_primary_wait_array); + + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_s_exit_count++; +#endif +} + +/********************************************************************** +Releases a shared mode lock when we know there are no waiters and none +else will access the lock during the time this function is executed. */ +UNIV_INLINE +void +rw_lock_s_unlock_direct( +/*====================*/ + rw_lock_t* lock) /* in: rw-lock */ +{ + ut_ad(lock->lock_word < X_LOCK_DECR); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED); +#endif + + /* Decrease reader count by incrementing lock_word */ + lock->lock_word++; + + ut_ad(!lock->waiters); + ut_ad(rw_lock_validate(lock)); +#ifdef UNIV_SYNC_PERF_STAT + rw_s_exit_count++; +#endif +} + +/********************************************************************** +Releases an exclusive mode lock. */ +UNIV_INLINE +void +rw_lock_x_unlock_func( +/*==================*/ + rw_lock_t* lock /* in: rw-lock */ +#ifdef UNIV_SYNC_DEBUG + ,ulint pass /* in: pass value; != 0, if the lock may have + been passed to another thread to unlock */ +#endif + ) +{ + ut_ad((lock->lock_word % X_LOCK_DECR) == 0); + + /* lock->recursive flag also indicates if lock->writer_thread is + valid or stale. 
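+Therefore writer_thread may only be trusted while recursive is TRUE; the
+required store ordering is established by
+rw_lock_set_writer_id_and_recursion_flag() above.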
If we are the last of the recursive callers + then we must unset lock->recursive flag to indicate that the + lock->writer_thread is now stale. + Note that since we still hold the x-lock we can safely read the + lock_word. */ + if (lock->lock_word == 0) { + /* Last caller in a possible recursive chain. */ + lock->recursive = FALSE; + UNIV_MEM_INVALID(&lock->writer_thread, + sizeof lock->writer_thread); + } + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); +#endif + + if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) { + /* Lock is now free. May have to signal read/write waiters. + We do not need to signal wait_ex waiters, since they cannot + exist when there is a writer. */ + if (lock->waiters) { + rw_lock_reset_waiter_flag(lock); + os_event_set(lock->event); + sync_array_object_signalled(sync_primary_wait_array); + } + } + + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_x_exit_count++; +#endif +} + +/********************************************************************** +Releases an exclusive mode lock when we know there are no waiters, and +none else will access the lock during the time this function is executed. */ +UNIV_INLINE +void +rw_lock_x_unlock_direct( +/*====================*/ + rw_lock_t* lock) /* in: rw-lock */ +{ + /* Reset the exclusive lock if this thread no longer has an x-mode + lock */ + + ut_ad((lock->lock_word % X_LOCK_DECR) == 0); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); +#endif + + if (lock->lock_word == 0) { + lock->recursive = FALSE; + UNIV_MEM_INVALID(&lock->writer_thread, + sizeof lock->writer_thread); + } + + lock->lock_word += X_LOCK_DECR; + + ut_ad(!lock->waiters); + ut_ad(rw_lock_validate(lock)); + +#ifdef UNIV_SYNC_PERF_STAT + rw_x_exit_count++; +#endif +} diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h new file mode 100644 index 00000000000..ea4abddbbf4 --- /dev/null +++ b/storage/xtradb/include/sync0sync.h @@ -0,0 +1,569 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0sync_h +#define sync0sync_h + +#include "univ.i" +#include "sync0types.h" +#include "ut0lst.h" +#include "ut0mem.h" +#include "os0thread.h" +#include "os0sync.h" +#include "sync0arr.h" + +#ifndef UNIV_HOTBACKUP +extern my_bool timed_mutexes; +#endif /* UNIV_HOTBACKUP */ + +/********************************************************************** +Initializes the synchronization data structures. */ +UNIV_INTERN +void +sync_init(void); +/*===========*/ +/********************************************************************** +Frees the resources in synchronization data structures. */ +UNIV_INTERN +void +sync_close(void); +/*===========*/ +/********************************************************************** +Creates, or rather, initializes a mutex object to a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. */ + +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG +# define mutex_create(M, level) \ + mutex_create_func((M), #M, (level), __FILE__, __LINE__) +# else +# define mutex_create(M, level) \ + mutex_create_func((M), #M, __FILE__, __LINE__) +# endif +#else +# define mutex_create(M, level) \ + mutex_create_func((M), __FILE__, __LINE__) +#endif + +/********************************************************************** +Creates, or rather, initializes a mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + mutex_t* mutex, /* in: pointer to memory */ +#ifdef UNIV_DEBUG + const char* cmutex_name, /* in: mutex name */ +# ifdef UNIV_SYNC_DEBUG + ulint level, /* in: level */ +# endif /* UNIV_SYNC_DEBUG */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /* in: file name where created */ + ulint cline); /* in: file line where created */ + +#undef mutex_free /* Fix for MacOS X */ + +/********************************************************************** +Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a mutex object from the mutex list. The mutex +is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free( +/*=======*/ + mutex_t* mutex); /* in: mutex */ +/****************************************************************** +NOTE! The following macro should be used in mutex locking, not the +corresponding function. */ + +#define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__) +/****************************************************************** +NOTE! The following macro should be used in mutex locking, not the +corresponding function. */ + +/* NOTE! currently same as mutex_enter! */ + +#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__) +/********************************************************************** +NOTE! 
Use the corresponding macro in the header file, not this function +directly. Locks a mutex for the current thread. If the mutex is reserved +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting +for the mutex before suspending the thread. */ +UNIV_INLINE +void +mutex_enter_func( +/*=============*/ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where locked */ + ulint line); /* in: line where locked */ +/****************************************************************** +NOTE! The following macro should be used in mutex locking, not the +corresponding function. */ + +#define mutex_enter_nowait(M) \ + mutex_enter_nowait_func((M), __FILE__, __LINE__) +/************************************************************************ +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. */ +UNIV_INTERN +ulint +mutex_enter_nowait_func( +/*====================*/ + /* out: 0 if succeed, 1 if not */ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where mutex + requested */ + ulint line); /* in: line where requested */ +/********************************************************************** +Unlocks a mutex owned by the current thread. */ +UNIV_INLINE +void +mutex_exit( +/*=======*/ + mutex_t* mutex); /* in: pointer to mutex */ +/********************************************************************** +Returns TRUE if no mutex or rw-lock is currently locked. +Works only in the debug version. */ +UNIV_INTERN +ibool +sync_all_freed(void); +/*================*/ +/*##################################################################### +FUNCTION PROTOTYPES FOR DEBUGGING */ +/*********************************************************************** +Prints wait info of the sync system. */ +UNIV_INTERN +void +sync_print_wait_info( +/*=================*/ + FILE* file); /* in: file where to print */ +/*********************************************************************** +Prints info of the sync system. */ +UNIV_INTERN +void +sync_print( +/*=======*/ + FILE* file); /* in: file where to print */ +#ifdef UNIV_DEBUG +/********************************************************************** +Checks that the mutex has been initialized. */ +UNIV_INTERN +ibool +mutex_validate( +/*===========*/ + const mutex_t* mutex); +/********************************************************************** +Checks that the current thread owns the mutex. Works only +in the debug version. */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + /* out: TRUE if owns */ + const mutex_t* mutex); /* in: mutex */ +#endif /* UNIV_DEBUG */ +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Adds a latch and its level in the thread level array. Allocates the memory +for the array if called first time for this OS thread. Makes the checks +against other latch levels stored in the array for this thread. */ +UNIV_INTERN +void +sync_thread_add_level( +/*==================*/ + void* latch, /* in: pointer to a mutex or an rw-lock */ + ulint level); /* in: level in the latching order; if + SYNC_LEVEL_VARYING, nothing is done */ +/********************************************************************** +Removes a latch from the thread level array if it is found there. 
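+It is the debug-build counterpart of sync_thread_add_level() above.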
*/ +UNIV_INTERN +ibool +sync_thread_reset_level( +/*====================*/ + /* out: TRUE if found in the array; it is no error + if the latch is not found, as we presently are not + able to determine the level for every latch + reservation the program does */ + void* latch); /* in: pointer to a mutex or an rw-lock */ +/********************************************************************** +Checks that the level array for the current thread is empty. */ +UNIV_INTERN +ibool +sync_thread_levels_empty(void); +/*==========================*/ + /* out: TRUE if empty */ +/********************************************************************** +Checks that the level array for the current thread is empty, except for +the allowed exceptions. */ +UNIV_INTERN +ibool +sync_thread_levels_empty_gen( +/*=========================*/ + /* out: TRUE if empty except the + exceptions specified below */ + ibool dict_mutex_allowed); /* in: TRUE if dictionary mutex is + allowed to be owned by the thread, + also purge_is_running mutex is + allowed */ +/********************************************************************** +Gets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_get_debug_info( +/*=================*/ + mutex_t* mutex, /* in: mutex */ + const char** file_name, /* out: file where requested */ + ulint* line, /* out: line where requested */ + os_thread_id_t* thread_id); /* out: id of the thread which owns + the mutex */ +/********************************************************************** +Counts currently reserved mutexes. Works only in the debug version. */ +UNIV_INTERN +ulint +mutex_n_reserved(void); +/*==================*/ +#endif /* UNIV_SYNC_DEBUG */ +/********************************************************************** +NOT to be used outside this module except in debugging! Gets the value +of the lock word. */ +UNIV_INLINE +byte +mutex_get_lock_word( +/*================*/ + const mutex_t* mutex); /* in: mutex */ +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +NOT to be used outside this module except in debugging! Gets the waiters +field in a mutex. */ +UNIV_INLINE +ulint +mutex_get_waiters( +/*==============*/ + /* out: value of the waiters field */ + const mutex_t* mutex); /* in: mutex */ +#endif /* UNIV_SYNC_DEBUG */ + +/* + LATCHING ORDER WITHIN THE DATABASE + ================================== + +The mutex or latch in the central memory object, for instance, a rollback +segment object, must be acquired before acquiring the latch or latches to +the corresponding file data structure. In the latching order below, these +file page object latches are placed immediately below the corresponding +central memory object latch or mutex. + +Synchronization object Notes +---------------------- ----- + +Dictionary mutex If we have a pointer to a dictionary +| object, e.g., a table, it can be +| accessed without reserving the +| dictionary mutex. We must have a +| reservation, a memoryfix, to the +| appropriate table object in this case, +| and the table must be explicitly +| released later. +V +Dictionary header +| +V +Secondary index tree latch The tree latch protects also all +| the B-tree non-leaf pages. These +V can be read with the page only +Secondary index non-leaf bufferfixed to save CPU time, +| no s-latch is needed on the page. +| Modification of a page requires an +| x-latch on the page, however. If a +| thread owns an x-latch to the tree, +| it is allowed to latch non-leaf pages +| even after it has acquired the fsp +| latch.
+V +Secondary index leaf The latch on the secondary index leaf +| can be kept while accessing the +| clustered index, to save CPU time. +V +Clustered index tree latch To increase concurrency, the tree +| latch is usually released when the +| leaf page latch has been acquired. +V +Clustered index non-leaf +| +V +Clustered index leaf +| +V +Transaction system header +| +V +Transaction undo mutex The undo log entry must be written +| before any index page is modified. +| Transaction undo mutex is for the undo +| logs the analogue of the tree latch +| for a B-tree. If a thread has the +| trx undo mutex reserved, it is allowed +| to latch the undo log pages in any +| order, and also after it has acquired +| the fsp latch. +V +Rollback segment mutex The rollback segment mutex must be +| reserved, if, e.g., a new page must +| be added to an undo log. The rollback +| segment and the undo logs in its +| history list can be seen as an +| analogue of a B-tree, and the latches +| reserved similarly, using a version of +| lock-coupling. If an undo log must be +| extended by a page when inserting an +| undo log record, this corresponds to +| a pessimistic insert in a B-tree. +V +Rollback segment header +| +V +Purge system latch +| +V +Undo log pages If a thread owns the trx undo mutex, +| or for a log in the history list, the +| rseg mutex, it is allowed to latch +| undo log pages in any order, and even +| after it has acquired the fsp latch. +| If a thread does not have the +| appropriate mutex, it is allowed to +| latch only a single undo log page in +| a mini-transaction. +V +File space management latch If a mini-transaction must allocate +| several file pages, it can do that, +| because it keeps the x-latch to the +| file space management in its memo. +V +File system pages +| +V +Kernel mutex If a kernel operation needs a file +| page allocation, it must reserve the +| fsp x-latch before acquiring the kernel +| mutex. +V +Search system mutex +| +V +Buffer pool mutex +| +V +Log mutex +| +Any other latch +| +V +Memory pool mutex */ + +/* Latching order levels */ + +/* User transaction locks are higher than any of the latch levels below: +no latches are allowed when a thread goes to wait for a normal table +or row lock! */ +#define SYNC_USER_TRX_LOCK 9999 +#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress + latching order checking */ +#define SYNC_LEVEL_VARYING 2000 /* Level is varying. Only used with + buffer pool page locks, which do not + have a fixed level, but instead have + their level set after the page is + locked; see e.g. + ibuf_bitmap_get_map_page(). */ +#define SYNC_TRX_I_S_RWLOCK 1910 /* Used for + trx_i_s_cache_t::rw_lock */ +#define SYNC_TRX_I_S_LAST_READ 1900 /* Used for + trx_i_s_cache_t::last_read_mutex */ +#define SYNC_FILE_FORMAT_TAG 1200 /* Used to serialize access to the + file format tag */ +#define SYNC_DICT_OPERATION 1001 /* table create, drop, etc. 
reserve + this in X-mode, implicit or background + operations (purge, rollback, foreign + key checks) reserve this in S-mode */ +#define SYNC_DICT 1000 +#define SYNC_DICT_AUTOINC_MUTEX 999 +#define SYNC_DICT_HEADER 995 +#define SYNC_IBUF_HEADER 914 +#define SYNC_IBUF_PESS_INSERT_MUTEX 912 +#define SYNC_IBUF_MUTEX 910 /* ibuf mutex is really below + SYNC_FSP_PAGE: we assign a value this + high only to make the program pass + the debug checks */ +/*-------------------------------*/ +#define SYNC_INDEX_TREE 900 +#define SYNC_TREE_NODE_NEW 892 +#define SYNC_TREE_NODE_FROM_HASH 891 +#define SYNC_TREE_NODE 890 +#define SYNC_PURGE_SYS 810 +#define SYNC_PURGE_LATCH 800 +#define SYNC_TRX_UNDO 700 +#define SYNC_RSEG 600 +#define SYNC_RSEG_HEADER_NEW 591 +#define SYNC_RSEG_HEADER 590 +#define SYNC_TRX_UNDO_PAGE 570 +#define SYNC_EXTERN_STORAGE 500 +#define SYNC_FSP 400 +#define SYNC_FSP_PAGE 395 +/*------------------------------------- Insert buffer headers */ +/*------------------------------------- ibuf_mutex */ +/*------------------------------------- Insert buffer tree */ +#define SYNC_IBUF_BITMAP_MUTEX 351 +#define SYNC_IBUF_BITMAP 350 +/*------------------------------------- MySQL query cache mutex */ +/*------------------------------------- MySQL binlog mutex */ +/*-------------------------------*/ +#define SYNC_KERNEL 300 +#define SYNC_REC_LOCK 299 +#define SYNC_TRX_LOCK_HEAP 298 +#define SYNC_TRX_SYS_HEADER 290 +#define SYNC_LOG 170 +#define SYNC_RECV 168 +#define SYNC_WORK_QUEUE 162 +#define SYNC_SEARCH_SYS_CONF 161 /* for assigning btr_search_enabled */ +#define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory + heap that can be extended to the + buffer pool, its logical level is + SYNC_SEARCH_SYS, as memory allocation + can call routines there! Otherwise + the level is SYNC_MEM_HASH. */ +#define SYNC_BUF_POOL 150 +#define SYNC_BUF_BLOCK 149 +#define SYNC_DOUBLEWRITE 140 +#define SYNC_ANY_LATCH 135 +#define SYNC_THR_LOCAL 133 +#define SYNC_MEM_HASH 131 +#define SYNC_MEM_POOL 130 + +/* Codes used to designate lock operations */ +#define RW_LOCK_NOT_LOCKED 350 +#define RW_LOCK_EX 351 +#define RW_LOCK_EXCLUSIVE 351 +#define RW_LOCK_SHARED 352 +#define RW_LOCK_WAIT_EX 353 +#define SYNC_MUTEX 354 + +/* NOTE! The structure appears here only for the compiler to know its size. +Do not use its fields directly! The structure used in the spin lock +implementation of a mutual exclusion semaphore. */ + +struct mutex_struct { + os_event_t event; /* Used by sync0arr.c for the wait queue */ + byte lock_word; /* This byte is the target of the atomic + test-and-set instruction in Win32 and + x86 32/64 with GCC 4.1.0 or later + versions */ +#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) +#else + os_fast_mutex_t + os_fast_mutex; /* In other systems we use this OS mutex + in place of lock_word */ +#endif + ulint waiters; /* This ulint is set to 1 if there are (or + may be) threads waiting in the global wait + array for this mutex to be released. + Otherwise, this is 0. */ + UT_LIST_NODE_T(mutex_t) list; /* All allocated mutexes are put into + a list. Pointers to the next and prev.
*/ +#ifdef UNIV_SYNC_DEBUG + const char* file_name; /* File where the mutex was locked */ + ulint line; /* Line where the mutex was locked */ + ulint level; /* Level in the global latching order */ +#endif /* UNIV_SYNC_DEBUG */ + const char* cfile_name;/* File name where mutex created */ + ulint cline; /* Line where created */ +#ifdef UNIV_DEBUG + os_thread_id_t thread_id; /* The thread id of the thread + which locked the mutex. */ + ulint magic_n; +# define MUTEX_MAGIC_N (ulint)979585 +#endif /* UNIV_DEBUG */ +#ifndef UNIV_HOTBACKUP + ulong count_os_wait; /* count of os_wait */ +# ifdef UNIV_DEBUG + ulong count_using; /* count of times mutex used */ + ulong count_spin_loop; /* count of spin loops */ + ulong count_spin_rounds; /* count of spin rounds */ + ulong count_os_yield; /* count of os yields */ + ulonglong lspent_time; /* mutex os_wait timer msec */ + ulonglong lmax_spent_time; /* max mutex os_wait timer msec */ + const char* cmutex_name;/* mutex name */ + ulint mutex_type;/* 0 - usual mutex 1 - rw_lock mutex */ +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ +}; + +/* The global array of wait cells for implementation of the database's own +mutexes and read-write locks. Appears here for debugging purposes only! */ + +extern sync_array_t* sync_primary_wait_array; + +/* Constant determining how long spin wait is continued before suspending +the thread. A value of 600 rounds on a 1995 100 MHz Pentium seems to +correspond to 20 microseconds. */ + +#define SYNC_SPIN_ROUNDS srv_n_spin_wait_rounds + +/* The number of system calls made in this module. Intended for performance +monitoring. */ + +extern ib_int64_t mutex_exit_count; + +#ifdef UNIV_SYNC_DEBUG +/* Latching order checks start when this is set TRUE */ +extern ibool sync_order_checks_on; +#endif /* UNIV_SYNC_DEBUG */ + +/* This variable is set to TRUE when sync_init is called */ +extern ibool sync_initialized; + +/* Global list of database mutexes (not OS mutexes) created. */ +typedef UT_LIST_BASE_NODE_T(mutex_t) ut_list_base_node_t; +extern ut_list_base_node_t mutex_list; + +/* Mutex protecting the mutex_list variable */ +extern mutex_t mutex_list_mutex; + + +#ifndef UNIV_NONINL +#include "sync0sync.ic" +#endif + +#endif diff --git a/storage/xtradb/include/sync0sync.ic b/storage/xtradb/include/sync0sync.ic new file mode 100644 index 00000000000..c43121ebd0b --- /dev/null +++ b/storage/xtradb/include/sync0sync.ic @@ -0,0 +1,270 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +/********************************************************************** +Sets the waiters field in a mutex. */ +UNIV_INTERN +void +mutex_set_waiters( +/*==============*/ + mutex_t* mutex, /* in: mutex */ + ulint n); /* in: value to set */ +/********************************************************************** +Reserves a mutex for the current thread. If the mutex is reserved, the +function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting +for the mutex before suspending the thread. */ +UNIV_INTERN +void +mutex_spin_wait( +/*============*/ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where mutex + requested */ + ulint line); /* in: line where requested */ +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Sets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_set_debug_info( +/*=================*/ + mutex_t* mutex, /* in: mutex */ + const char* file_name, /* in: file where requested */ + ulint line); /* in: line where requested */ +#endif /* UNIV_SYNC_DEBUG */ +/********************************************************************** +Releases the threads waiting in the primary wait array for this mutex. */ +UNIV_INTERN +void +mutex_signal_object( +/*================*/ + mutex_t* mutex); /* in: mutex */ + +/********************************************************************** +Performs an atomic test-and-set instruction to the lock_word field of a +mutex. */ +UNIV_INLINE +byte +mutex_test_and_set( +/*===============*/ + /* out: the previous value of lock_word: 0 or + 1 */ + mutex_t* mutex) /* in: mutex */ +{ +#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) + byte res; + byte* lw; /* assembler code is used to ensure that + lock_word is loaded from memory */ + ut_ad(mutex); + ut_ad(sizeof(byte) == 1); + + lw = &(mutex->lock_word); + + __asm MOV ECX, lw + __asm MOV EDX, 1 + __asm XCHG DL, BYTE PTR [ECX] + __asm MOV res, DL + + /* The fence below would prevent this thread from + reading the data structure protected by the mutex + before the test-and-set operation is committed, but + the fence is apparently not needed: + + In a posting to comp.arch newsgroup (August 10, 1997) + Andy Glew said that in P6 a LOCKed instruction like + XCHG establishes a fence with respect to memory reads + and writes and thus an explicit fence is not + needed. In P5 he seemed to agree with a previous + newsgroup poster that LOCKed instructions serialize + all instruction execution, and, consequently, also + memory operations. This is confirmed in Intel Software + Dev. Manual, Vol. 3. 
*/ + + /* mutex_fence(); */ + + return(res); +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + return __sync_lock_test_and_set(&(mutex->lock_word), 1); +#else + ibool ret; + + ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex)); + + if (ret == 0) { + /* We check that os_fast_mutex_trylock does not leak + or allow race conditions */ + ut_a(mutex->lock_word == 0); + + mutex->lock_word = 1; + } + + return((byte)ret); +#endif +} + +/********************************************************************** +Performs a reset instruction to the lock_word field of a mutex. This +instruction also serializes memory operations to the program order. */ +UNIV_INLINE +void +mutex_reset_lock_word( +/*==================*/ + mutex_t* mutex) /* in: mutex */ +{ +#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) + byte* lw; /* assembler code is used to ensure that + lock_word is loaded from memory */ + ut_ad(mutex); + + lw = &(mutex->lock_word); + + __asm MOV EDX, 0 + __asm MOV ECX, lw + __asm XCHG DL, BYTE PTR [ECX] +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + /* In theory __sync_lock_release should be used to release the lock. + Unfortunately, it does not work properly alone. The workaround is + that the more conservative __sync_lock_test_and_set is used instead. */ + __sync_lock_test_and_set(&(mutex->lock_word), 0); +#else + mutex->lock_word = 0; + + os_fast_mutex_unlock(&(mutex->os_fast_mutex)); +#endif +} + +/********************************************************************** +Gets the value of the lock word. */ +UNIV_INLINE +byte +mutex_get_lock_word( +/*================*/ + const mutex_t* mutex) /* in: mutex */ +{ + const volatile byte* ptr; /* declared volatile to ensure that + lock_word is loaded from memory */ + ut_ad(mutex); + + ptr = &(mutex->lock_word); + + return(*ptr); +} + +/********************************************************************** +Gets the waiters field in a mutex. */ +UNIV_INLINE +ulint +mutex_get_waiters( +/*==============*/ + /* out: value of the waiters field */ + const mutex_t* mutex) /* in: mutex */ +{ + const volatile ulint* ptr; /* declared volatile to ensure that + the value is read from memory */ + ut_ad(mutex); + + ptr = &(mutex->waiters); + + return(*ptr); /* Here we assume that the read of a single + word from memory is atomic */ +} + +/********************************************************************** +Unlocks a mutex owned by the current thread. */ +UNIV_INLINE +void +mutex_exit( +/*=======*/ + mutex_t* mutex) /* in: pointer to mutex */ +{ + ut_ad(mutex_own(mutex)); + + ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED); + +#ifdef UNIV_SYNC_DEBUG + sync_thread_reset_level(mutex); +#endif + mutex_reset_lock_word(mutex); + + /* A problem: we assume that mutex_reset_lock_word + is a memory barrier, that is when we read the waiters + field next, the read must be serialized in memory + after the reset. A speculative processor might + perform the read first, which could leave a waiting + thread hanging indefinitely. + + Our current solution is to call + sync_arr_wake_threads_if_sema_free() + every second to wake up possible hanging threads if + they were missed in mutex_signal_object. */ + + if (mutex_get_waiters(mutex) != 0) { + + mutex_signal_object(mutex); + } + +#ifdef UNIV_SYNC_PERF_STAT + mutex_exit_count++; +#endif +} + +/********************************************************************** +Locks a mutex for the current thread. If the mutex is reserved, the function +spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex +before suspending the thread.
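+A minimal usage sketch; callers are expected to go through the
+mutex_enter() macro in sync0sync.h, which supplies __FILE__ and __LINE__
+(some_mutex is illustrative):
+
+	mutex_enter(&some_mutex);
+	... critical section ...
+	mutex_exit(&some_mutex);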
*/ +UNIV_INLINE +void +mutex_enter_func( +/*=============*/ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where locked */ + ulint line) /* in: line where locked */ +{ + ut_ad(mutex_validate(mutex)); + ut_ad(!mutex_own(mutex)); + + /* Note that we do not peek at the value of lock_word before trying + the atomic test_and_set; we could peek, and possibly save time. */ + +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + mutex->count_using++; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + + if (!mutex_test_and_set(mutex)) { + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + return; /* Succeeded! */ + } + + mutex_spin_wait(mutex, file_name, line); +} diff --git a/storage/xtradb/include/sync0types.h b/storage/xtradb/include/sync0types.h new file mode 100644 index 00000000000..3c1021b1a30 --- /dev/null +++ b/storage/xtradb/include/sync0types.h @@ -0,0 +1,31 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Global types for sync + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef sync0types_h +#define sync0types_h + +#define mutex_t ib_mutex_t +typedef struct mutex_struct mutex_t; + +#endif diff --git a/storage/xtradb/include/thr0loc.h b/storage/xtradb/include/thr0loc.h new file mode 100644 index 00000000000..96ec13cc8e4 --- /dev/null +++ b/storage/xtradb/include/thr0loc.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The thread local storage + +Created 10/5/1995 Heikki Tuuri +*******************************************************/ + +/* This module implements storage private to each thread, +a capability useful in some situations like storing the +OS handle to the current thread, or its priority. */ + +#ifndef thr0loc_h +#define thr0loc_h + +#include "univ.i" +#include "os0thread.h" + +/******************************************************************** +Initializes the thread local storage module. */ +UNIV_INTERN +void +thr_local_init(void); +/*================*/ +/*********************************************************************** +Creates a local storage struct for the calling new thread. */ +UNIV_INTERN +void +thr_local_create(void); +/*==================*/ +/*********************************************************************** +Frees the local storage struct for the specified thread. */ +UNIV_INTERN +void +thr_local_free( +/*===========*/ + os_thread_id_t id); /* in: thread id */ +/*********************************************************************** +Gets the slot number in the thread table of a thread. */ +UNIV_INTERN +ulint +thr_local_get_slot_no( +/*==================*/ + /* out: slot number */ + os_thread_id_t id); /* in: thread id of the thread */ +/*********************************************************************** +Sets in the local storage the slot number in the thread table of a thread. */ +UNIV_INTERN +void +thr_local_set_slot_no( +/*==================*/ + os_thread_id_t id, /* in: thread id of the thread */ + ulint slot_no);/* in: slot number */ +/*********************************************************************** +Returns pointer to the 'in_ibuf' field within the current thread local +storage. */ +UNIV_INTERN +ibool* +thr_local_get_in_ibuf_field(void); +/*=============================*/ + /* out: pointer to the in_ibuf field */ + +/************************************************************************* +Returns local hash table information. */ + +ulint +thr_local_hash_cells(void); +/*=======================*/ + +ulint +thr_local_hash_nodes(void); +/*=======================*/ + +#ifndef UNIV_NONINL +#include "thr0loc.ic" +#endif + +#endif diff --git a/storage/xtradb/include/thr0loc.ic b/storage/xtradb/include/thr0loc.ic new file mode 100644 index 00000000000..6de183fd857 --- /dev/null +++ b/storage/xtradb/include/thr0loc.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Thread local storage + +Created 10/4/1995 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/trx0i_s.h b/storage/xtradb/include/trx0i_s.h new file mode 100644 index 00000000000..cf2865af127 --- /dev/null +++ b/storage/xtradb/include/trx0i_s.h @@ -0,0 +1,212 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +INFORMATION SCHEMA innodb_trx, innodb_locks and +innodb_lock_waits tables cache structures and public +functions. + +Created July 17, 2007 Vasil Dimov +*******************************************************/ + +#ifndef trx0i_s_h +#define trx0i_s_h + +#include "univ.i" +#include "trx0types.h" +#include "ut0ut.h" + +/* the maximum amount of memory that can be consumed by innodb_trx, +innodb_locks and innodb_lock_waits information schema tables. 
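+(16777216 bytes = 16 MiB; when the limit is reached the cached data is
+truncated, which trx_i_s_cache_is_truncated() below reports.)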
*/ +#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */ + +/* the maximum length of a string that can be stored in +i_s_locks_row_t::lock_data */ +#define TRX_I_S_LOCK_DATA_MAX_LEN 8192 + +/* the maximum length of a string that can be stored in +i_s_trx_row_t::trx_query */ +#define TRX_I_S_TRX_QUERY_MAX_LEN 1024 + +typedef struct i_s_locks_row_struct i_s_locks_row_t; +typedef struct i_s_hash_chain_struct i_s_hash_chain_t; + +/* Objects of this type are added to the hash table +trx_i_s_cache_t::locks_hash */ +struct i_s_hash_chain_struct { + i_s_locks_row_t* value; + i_s_hash_chain_t* next; +}; + +/* This structure represents INFORMATION_SCHEMA.innodb_locks row */ +struct i_s_locks_row_struct { + ullint lock_trx_id; + const char* lock_mode; + const char* lock_type; + const char* lock_table; + const char* lock_index; + ulint lock_space; + ulint lock_page; + ulint lock_rec; + const char* lock_data; + + /* The following are auxiliary and not included in the table */ + ullint lock_table_id; + i_s_hash_chain_t hash_chain; /* this object is added to the hash + table + trx_i_s_cache_t::locks_hash */ +}; + +/* This structure represents INFORMATION_SCHEMA.innodb_trx row */ +typedef struct i_s_trx_row_struct { + ullint trx_id; + const char* trx_state; + ib_time_t trx_started; + const i_s_locks_row_t* requested_lock_row; + ib_time_t trx_wait_started; + ullint trx_weight; + ulint trx_mysql_thread_id; + const char* trx_query; +} i_s_trx_row_t; + +/* This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */ +typedef struct i_s_lock_waits_row_struct { + const i_s_locks_row_t* requested_lock_row; + const i_s_locks_row_t* blocking_lock_row; +} i_s_lock_waits_row_t; + +/* This type is opaque and is defined in trx/trx0i_s.c */ +typedef struct trx_i_s_cache_struct trx_i_s_cache_t; + +/* Auxiliary enum used by functions that need to select one of the +INFORMATION_SCHEMA tables */ +enum i_s_table { + I_S_INNODB_TRX, + I_S_INNODB_LOCKS, + I_S_INNODB_LOCK_WAITS +}; + +/* This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +extern trx_i_s_cache_t* trx_i_s_cache; + +/*********************************************************************** +Initialize INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_init( +/*===============*/ + trx_i_s_cache_t* cache); /* out: cache to init */ + +/*********************************************************************** +Issue a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_read( +/*=====================*/ + trx_i_s_cache_t* cache); /* in: cache */ + +/*********************************************************************** +Release a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_read( +/*===================*/ + trx_i_s_cache_t* cache); /* in: cache */ + +/*********************************************************************** +Issue an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_write( +/*======================*/ + trx_i_s_cache_t* cache); /* in: cache */ + +/*********************************************************************** +Release an exclusive/write lock on the tables cache. 
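+The read side pairs in the same way; a hedged sketch of a reader, as the
+handler code in i_s.cc is expected to behave (rows and i are
+illustrative):
+
+	trx_i_s_cache_start_read(cache);
+	rows = trx_i_s_cache_get_rows_used(cache, I_S_INNODB_TRX);
+	for (i = 0; i < rows; i++) {
+		row = trx_i_s_cache_get_nth_row(cache, I_S_INNODB_TRX, i);
+		... copy the row out ...
+	}
+	trx_i_s_cache_end_read(cache);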
*/ +UNIV_INTERN +void +trx_i_s_cache_end_write( +/*====================*/ + trx_i_s_cache_t* cache); /* in: cache */ + + +/*********************************************************************** +Retrieves the number of used rows in the cache for a given +INFORMATION SCHEMA table. */ +UNIV_INTERN +ulint +trx_i_s_cache_get_rows_used( +/*========================*/ + /* out: number of rows */ + trx_i_s_cache_t* cache, /* in: cache */ + enum i_s_table table); /* in: which table */ + +/*********************************************************************** +Retrieves the nth row in the cache for a given INFORMATION SCHEMA +table. */ +UNIV_INTERN +void* +trx_i_s_cache_get_nth_row( +/*======================*/ + /* out: row */ + trx_i_s_cache_t* cache, /* in: cache */ + enum i_s_table table, /* in: which table */ + ulint n); /* in: row number */ + +/*********************************************************************** +Update the transactions cache if it has not been read for some time. */ +UNIV_INTERN +int +trx_i_s_possibly_fetch_data_into_cache( +/*===================================*/ + /* out: 0 - fetched, 1 - not */ + trx_i_s_cache_t* cache); /* in/out: cache */ + +/*********************************************************************** +Returns TRUE if the data in the cache is truncated due to the memory +limit posed by TRX_I_S_MEM_LIMIT. */ +UNIV_INTERN +ibool +trx_i_s_cache_is_truncated( +/*=======================*/ + /* out: TRUE if truncated */ + trx_i_s_cache_t* cache); /* in: cache */ + +/* The maximum length of a resulting lock_id_size in +trx_i_s_create_lock_id(), not including the terminating '\0'. +":%lu:%lu:%lu" -> 63 chars */ +#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63) + +/*********************************************************************** +Crafts a lock id string from a i_s_locks_row_t object. Returns its +second argument. This function aborts if there is not enough space in +lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you +want to be 100% sure that it will not abort. */ +UNIV_INTERN +char* +trx_i_s_create_lock_id( +/*===================*/ + /* out: resulting lock id */ + const i_s_locks_row_t* row, /* in: innodb_locks row */ + char* lock_id,/* out: resulting lock_id */ + ulint lock_id_size);/* in: size of the lock id + buffer */ + +#endif /* trx0i_s_h */ diff --git a/storage/xtradb/include/trx0purge.h b/storage/xtradb/include/trx0purge.h new file mode 100644 index 00000000000..4921b860485 --- /dev/null +++ b/storage/xtradb/include/trx0purge.h @@ -0,0 +1,185 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0purge_h +#define trx0purge_h + +#include "univ.i" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "que0types.h" +#include "page0page.h" +#include "usr0sess.h" +#include "fil0fil.h" + +/* The global data structure coordinating a purge */ +extern trx_purge_t* purge_sys; + +/* A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +extern trx_undo_rec_t trx_purge_dummy_rec; + +/************************************************************************ +Calculates the file address of an undo log header when we have the file +address of its history list node. */ +UNIV_INLINE +fil_addr_t +trx_purge_get_log_from_hist( +/*========================*/ + /* out: file address of the log */ + fil_addr_t node_addr); /* in: file address of the history + list node of the log */ +/********************************************************************* +Checks if trx_id is >= purge_view: then it is guaranteed that its update +undo log still exists in the system. */ +UNIV_INTERN +ibool +trx_purge_update_undo_must_exist( +/*=============================*/ + /* out: TRUE if it is certain that it is + preserved; even if the function returns FALSE, + it is possible that the undo log still exists + in the system */ + dulint trx_id);/* in: transaction id */ +/************************************************************************ +Creates the global purge system control structure and inits the history +mutex. */ +UNIV_INTERN +void +trx_purge_sys_create(void); +/*======================*/ +/************************************************************************ +Adds the update undo log as the first log in the history list. Removes the +update undo log segment from the rseg slot if it is too big for reuse. */ +UNIV_INTERN +void +trx_purge_add_update_undo_to_history( +/*=================================*/ + trx_t* trx, /* in: transaction */ + page_t* undo_page, /* in: update undo log header page, + x-latched */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************ +Fetches the next undo log record from the history list to purge. It must be +released with the corresponding release function. */ +UNIV_INTERN +trx_undo_rec_t* +trx_purge_fetch_next_rec( +/*=====================*/ + /* out: copy of an undo log record, or + pointer to the dummy undo log record + &trx_purge_dummy_rec if the whole undo log + can be skipped in purge; NULL if none left */ + dulint* roll_ptr,/* out: roll pointer to undo record */ + trx_undo_inf_t** cell, /* out: storage cell for the record in the + purge array */ + mem_heap_t* heap); /* in: memory heap where copied */ +/*********************************************************************** +Releases a reserved purge undo record. */ +UNIV_INTERN +void +trx_purge_rec_release( +/*==================*/ + trx_undo_inf_t* cell); /* in: storage cell */ +/*********************************************************************** +This function runs a purge batch.
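+A hedged sketch of a driver loop (the server master thread is assumed to
+do something similar; shutdown_requested is illustrative):
+
+	ulint	n_pages_purged;
+
+	do {
+		n_pages_purged = trx_purge();
+	} while (n_pages_purged > 0 && !shutdown_requested);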
*/ +UNIV_INTERN +ulint +trx_purge(void); +/*===========*/ + /* out: number of undo log pages handled in + the batch */ +/********************************************************************** +Prints information of the purge system to stderr. */ +UNIV_INTERN +void +trx_purge_sys_print(void); +/*======================*/ + +/* The control structure used in the purge operation */ +struct trx_purge_struct{ + ulint state; /* Purge system state */ + sess_t* sess; /* System session running the purge + query */ + trx_t* trx; /* System transaction running the purge + query: this trx is not in the trx list + of the trx system and it never ends */ + que_t* query; /* The query graph which will do the + parallelized purge operation */ + rw_lock_t latch; /* The latch protecting the purge view. + A purge operation must acquire an + x-latch here for the instant at which + it changes the purge view: an undo + log operation can prevent this by + obtaining an s-latch here. */ + read_view_t* view; /* The purge will not remove undo logs + which are >= this view (purge view) */ + mutex_t mutex; /* Mutex protecting the fields below */ + ulint n_pages_handled;/* Approximate number of undo log + pages processed in purge */ + ulint handle_limit; /* Target of how many pages to get + processed in the current purge */ + /*------------------------------*/ + /* The following two fields form the 'purge pointer' which advances + during a purge, and which is used in history list truncation */ + + dulint purge_trx_no; /* Purge has advanced past all + transactions whose number is less + than this */ + dulint purge_undo_no; /* Purge has advanced past all records + whose undo number is less than this */ + /*-----------------------------*/ + ibool next_stored; /* TRUE if the info of the next record + to purge is stored below: if yes, then + the transaction number and the undo + number of the record are stored in + purge_trx_no and purge_undo_no above */ + trx_rseg_t* rseg; /* Rollback segment for the next undo + record to purge */ + ulint page_no; /* Page number for the next undo + record to purge, page number of the + log header, if dummy record */ + ulint offset; /* Page offset for the next undo + record to purge, 0 if the dummy + record */ + ulint hdr_page_no; /* Header page of the undo log where + the next record to purge belongs */ + ulint hdr_offset; /* Header byte offset on the page */ + /*-----------------------------*/ + trx_undo_arr_t* arr; /* Array of transaction numbers and + undo numbers of the undo records + currently under processing in purge */ + mem_heap_t* heap; /* Temporary storage used during a + purge: can be emptied after purge + completes */ +}; + +#define TRX_PURGE_ON 1 /* purge operation is running */ +#define TRX_STOP_PURGE 2 /* purge operation is stopped, or + it should be stopped */ +#ifndef UNIV_NONINL +#include "trx0purge.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0purge.ic b/storage/xtradb/include/trx0purge.ic new file mode 100644 index 00000000000..2c1d2ac75af --- /dev/null +++ b/storage/xtradb/include/trx0purge.ic @@ -0,0 +1,42 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0undo.h" + +/************************************************************************ +Calculates the file address of an undo log header when we have the file +address of its history list node. */ +UNIV_INLINE +fil_addr_t +trx_purge_get_log_from_hist( +/*========================*/ + /* out: file address of the log */ + fil_addr_t node_addr) /* in: file address of the history + list node of the log */ +{ + node_addr.boffset -= TRX_UNDO_HISTORY_NODE; + + return(node_addr); +} + diff --git a/storage/xtradb/include/trx0rec.h b/storage/xtradb/include/trx0rec.h new file mode 100644 index 00000000000..444d39e39db --- /dev/null +++ b/storage/xtradb/include/trx0rec.h @@ -0,0 +1,333 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0rec_h +#define trx0rec_h + +#include "univ.i" +#include "trx0types.h" +#include "row0types.h" +#include "mtr0mtr.h" +#include "dict0types.h" +#include "que0types.h" +#include "data0data.h" +#include "rem0types.h" + +/*************************************************************************** +Copies the undo record to the heap. */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_rec_copy( +/*==============*/ + /* out, own: copy of undo log record */ + trx_undo_rec_t* undo_rec, /* in: undo log record */ + mem_heap_t* heap); /* in: heap where copied */ +/************************************************************************** +Reads the undo log record type. */ +UNIV_INLINE +ulint +trx_undo_rec_get_type( +/*==================*/ + /* out: record type */ + trx_undo_rec_t* undo_rec); /* in: undo log record */ +/************************************************************************** +Reads from an undo log record the record compiler info. 
*/
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ /* out: compiler info */
+ trx_undo_rec_t* undo_rec); /* in: undo log record */
+/**************************************************************************
+Returns TRUE if an undo log record contains an extern storage field. */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ /* out: TRUE if extern */
+ trx_undo_rec_t* undo_rec); /* in: undo log record */
+/**************************************************************************
+Reads the undo log record number. */
+UNIV_INLINE
+dulint
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ /* out: undo no */
+ trx_undo_rec_t* undo_rec); /* in: undo log record */
+/**************************************************************************
+Returns the start of the undo record data area. */
+UNIV_INLINE
+byte*
+trx_undo_rec_get_ptr(
+/*==================*/
+ /* out: pointer to the data area */
+ trx_undo_rec_t* undo_rec, /* in: undo log record */
+ dulint undo_no); /* in: undo no read from node */
+
+/**************************************************************************
+Reads from an undo log record the general parameters. */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ /* out: remaining part of undo log
+ record after reading these values */
+ trx_undo_rec_t* undo_rec, /* in: undo log record */
+ ulint* type, /* out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /* out: compiler info, relevant only
+ for update type records */
+ ibool* updated_extern, /* out: TRUE if we updated an
+ externally stored field */
+ dulint* undo_no, /* out: undo log record number */
+ dulint* table_id); /* out: table id */
+/***********************************************************************
+Builds a row reference from an undo log record. */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ /* out: pointer to remaining part of undo
+ record */
+ byte* ptr, /* in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t** ref, /* out, own: row reference */
+ mem_heap_t* heap); /* in: memory heap from which the memory
+ needed is allocated */
+/***********************************************************************
+Skips a row reference from an undo log record. */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ /* out: pointer to remaining part of undo
+ record */
+ byte* ptr, /* in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index); /* in: clustered index */
+/**************************************************************************
+Reads from an undo log update record the system field values of the old
+version. */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ /* out: remaining part of undo log
+ record after reading these values */
+ byte* ptr, /* in: remaining part of undo log
+ record after reading general
+ parameters */
+ dulint* trx_id, /* out: trx id */
+ dulint* roll_ptr, /* out: roll ptr */
+ ulint* info_bits); /* out: info bits state */
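trx_undo_rec_get_type(), trx_undo_rec_get_cmpl_info() and trx_undo_rec_get_extern_storage() above all decode the same record header byte; the layout follows the TRX_UNDO_* constants defined near the end of this header. A standalone sketch of that packing, assuming the extern-storage flag bit is stripped before the split (trx_undo_rec_get_pars() reports it separately):

#include <assert.h>

/* mirrors the constants defined at the end of trx0rec.h */
#define TRX_UNDO_UPD_EXIST_REC	12
#define TRX_UNDO_CMPL_INFO_MULT	16
#define TRX_UNDO_UPD_EXTERN	128

int
main(void)
{
	/* pack: the compilation info is multiplied by 16 and ORed to the
	type; the extern-storage flag may be ORed on top of both */
	unsigned	type_cmpl = TRX_UNDO_UPD_EXIST_REC
		| (3 * TRX_UNDO_CMPL_INFO_MULT)
		| TRX_UNDO_UPD_EXTERN;

	/* unpack: strip the flag bit first, then split */
	unsigned	masked = type_cmpl & ~TRX_UNDO_UPD_EXTERN;
	unsigned	type = masked & (TRX_UNDO_CMPL_INFO_MULT - 1);
	unsigned	cmpl_info = masked / TRX_UNDO_CMPL_INFO_MULT;
	int		updated_extern = (type_cmpl & TRX_UNDO_UPD_EXTERN) != 0;

	assert(type == TRX_UNDO_UPD_EXIST_REC);
	assert(cmpl_info == 3);
	assert(updated_extern);
	return(0);
}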
+/***********************************************************************
+Builds an update vector based on a remaining part of an undo log record. */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ /* out: remaining part of the record,
+ NULL if an error detected, which means that
+ the record is corrupted */
+ byte* ptr, /* in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /* in: clustered index */
+ ulint type, /* in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ dulint trx_id, /* in: transaction id from this undo record */
+ dulint roll_ptr,/* in: roll pointer from this undo record */
+ ulint info_bits,/* in: info bits from this undo record */
+ trx_t* trx, /* in: transaction */
+ mem_heap_t* heap, /* in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd); /* out, own: update vector */
+/***********************************************************************
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering in any index of the table. */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ /* out: pointer to remaining part of undo
+ record */
+ byte* ptr, /* in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /* in: clustered index */
+ dtuple_t** row, /* out, own: partial row */
+ ibool ignore_prefix, /* in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap); /* in: memory heap from which the memory
+ needed is allocated */
+/***************************************************************************
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction. */
+UNIV_INTERN
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+ /* out: DB_SUCCESS or error code */
+ ulint flags, /* in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /* in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /* in: query thread */
+ dict_index_t* index, /* in: clustered index */
+ const dtuple_t* clust_entry, /* in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ const upd_t* update, /* in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /* in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /* in: case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ dulint* roll_ptr); /* out: rollback pointer to the
+ inserted undo log record,
+ ut_dulint_zero if BTR_NO_UNDO_LOG
+ flag was specified */
+/**********************************************************************
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
*/ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_undo_rec_low( +/*======================*/ + /* out, own: copy of the record */ + dulint roll_ptr, /* in: roll pointer to record */ + mem_heap_t* heap); /* in: memory heap where copied */ +/********************************************************************** +Copies an undo record to heap. */ +UNIV_INTERN +ulint +trx_undo_get_undo_rec( +/*==================*/ + /* out: DB_SUCCESS, or + DB_MISSING_HISTORY if the undo log + has been truncated and we cannot + fetch the old version; NOTE: the + caller must have latches on the + clustered index page and purge_view */ + dulint roll_ptr, /* in: roll pointer to record */ + dulint trx_id, /* in: id of the trx that generated + the roll pointer: it points to an + undo log of this transaction */ + trx_undo_rec_t** undo_rec, /* out, own: copy of the record */ + mem_heap_t* heap); /* in: memory heap where copied */ +/*********************************************************************** +Build a previous version of a clustered index record. This function checks +that the caller has a latch on the index page of the clustered index record +and an s-latch on the purge_view. This guarantees that the stack of versions +is locked. */ +UNIV_INTERN +ulint +trx_undo_prev_version_build( +/*========================*/ + /* out: DB_SUCCESS, or DB_MISSING_HISTORY if + the previous version is not >= purge_view, + which means that it may have been removed, + DB_ERROR if corrupted record */ + const rec_t* index_rec,/* in: clustered index record in the + index tree */ + mtr_t* index_mtr,/* in: mtr which contains the latch to + index_rec page and purge_view */ + const rec_t* rec, /* in: version of a clustered index record */ + dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /* in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers);/* out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted */ +/*************************************************************** +Parses a redo log record of adding an undo log record. */ +UNIV_INTERN +byte* +trx_undo_parse_add_undo_rec( +/*========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page); /* in: page or NULL */ +/*************************************************************** +Parses a redo log record of erasing of an undo page end. 
*/ +UNIV_INTERN +byte* +trx_undo_parse_erase_page_end( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr); /* in: mtr or NULL */ + +/* Types of an undo log record: these have to be smaller than 16, as the +compilation info multiplied by 16 is ORed to this value in an undo log +record */ + +#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */ +#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked + record */ +#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to + a not delete marked record; also the + fields of the record can change */ +#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields + do not change */ +#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by + this and ORed to the type above */ +#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl + to denote that we updated external + storage fields: used by purge to + free the external storage */ + +/* Operation type flags used in trx_undo_report_row_operation */ +#define TRX_UNDO_INSERT_OP 1 +#define TRX_UNDO_MODIFY_OP 2 + +#ifndef UNIV_NONINL +#include "trx0rec.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0rec.ic b/storage/xtradb/include/trx0rec.ic new file mode 100644 index 00000000000..bfd74eb9dfb --- /dev/null +++ b/storage/xtradb/include/trx0rec.ic @@ -0,0 +1,116 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/************************************************************************** +Reads from an undo log record the record type. */ +UNIV_INLINE +ulint +trx_undo_rec_get_type( +/*==================*/ + /* out: record type */ + trx_undo_rec_t* undo_rec) /* in: undo log record */ +{ + return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1)); +} + +/************************************************************************** +Reads from an undo log record the record compiler info. */ +UNIV_INLINE +ulint +trx_undo_rec_get_cmpl_info( +/*=======================*/ + /* out: compiler info */ + trx_undo_rec_t* undo_rec) /* in: undo log record */ +{ + return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT); +} + +/************************************************************************** +Returns TRUE if an undo log record contains an extern storage field. 
*/
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ /* out: TRUE if extern */
+ trx_undo_rec_t* undo_rec) /* in: undo log record */
+{
+ if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************************
+Reads the undo log record number. */
+UNIV_INLINE
+dulint
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ /* out: undo no */
+ trx_undo_rec_t* undo_rec) /* in: undo log record */
+{
+ byte* ptr;
+
+ ptr = undo_rec + 3;
+
+ return(mach_dulint_read_much_compressed(ptr));
+}
+
+/**************************************************************************
+Returns the start of the undo record data area. */
+UNIV_INLINE
+byte*
+trx_undo_rec_get_ptr(
+/*=================*/
+ /* out: pointer to the data area */
+ trx_undo_rec_t* undo_rec, /* in: undo log record */
+ dulint undo_no) /* in: undo no read from node */
+{
+ return (((byte*) undo_rec) + 3
+ + mach_dulint_get_much_compressed_size(undo_no));
+}
+
+/***************************************************************************
+Copies the undo record to the heap. */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ /* out, own: copy of undo log record */
+ trx_undo_rec_t* undo_rec, /* in: undo log record */
+ mem_heap_t* heap) /* in: heap where copied */
+{
+ ulint len;
+ trx_undo_rec_t* rec_copy;
+
+ len = mach_read_from_2(undo_rec)
+ - ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
+ rec_copy = mem_heap_alloc(heap, len);
+
+ ut_memcpy(rec_copy, undo_rec, len);
+
+ return(rec_copy);
+}
diff --git a/storage/xtradb/include/trx0roll.h b/storage/xtradb/include/trx0roll.h
new file mode 100644
index 00000000000..3318a5985d7
--- /dev/null
+++ b/storage/xtradb/include/trx0roll.h
@@ -0,0 +1,340 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "univ.i"
+#include "trx0trx.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+#define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL)
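A note on the length computation in trx_undo_rec_copy() above: the first two bytes of an undo record hold the page offset at which the record ends (the start of the next record), so the length is that end offset minus the record's own offset within its page. A standalone sketch, where ut_align_offset() is replaced by an explicit offset into a toy buffer (real page frames are page-aligned, so taking the pointer modulo UNIV_PAGE_SIZE works there):

#include <assert.h>

#define PAGE_SIZE	16384	/* stands in for UNIV_PAGE_SIZE */

/* big-endian 2-byte read, as mach_read_from_2() does */
static unsigned
read2(const unsigned char* p)
{
	return(((unsigned) p[0] << 8) | p[1]);
}

int
main(void)
{
	static unsigned char	page[PAGE_SIZE];
	unsigned char*		rec = page + 100;	/* record at offset 100 */
	unsigned		next_off = 160;		/* next record at 160 */

	/* store the end offset in the record's first two bytes */
	rec[0] = (unsigned char) (next_off >> 8);
	rec[1] = (unsigned char) (next_off & 0xff);

	/* len = mach_read_from_2(rec) - ut_align_offset(rec, UNIV_PAGE_SIZE) */
	{
		unsigned	len = read2(rec)
			- (unsigned) ((rec - page) % PAGE_SIZE);

		assert(len == 60);
	}
	return(0);
}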
+/***********************************************************************
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery. */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+ /* out: TRUE if trx is an incomplete
+ transaction that is being rolled back
+ in crash recovery */
+ const trx_t* trx); /* in: transaction */
+/***********************************************************************
+Returns a transaction savepoint taken at this point in time. */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+ /* out: savepoint */
+ trx_t* trx); /* in: transaction */
+/***********************************************************************
+Creates an undo number array. */
+UNIV_INTERN
+trx_undo_arr_t*
+trx_undo_arr_create(void);
+/*=====================*/
+/***********************************************************************
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr); /* in: undo number array */
+/***********************************************************************
+Returns pointer to nth element in an undo number array. */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ /* out: pointer to the nth element */
+ trx_undo_arr_t* arr, /* in: undo number array */
+ ulint n); /* in: position */
+/***************************************************************************
+Tries to truncate the undo logs. */
+UNIV_INTERN
+void
+trx_roll_try_truncate(
+/*==================*/
+ trx_t* trx); /* in: transaction */
+/************************************************************************
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release. */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ /* out: undo log record copied to heap, NULL
+ if none left, or if the undo number of the
+ top record would be less than the limit */
+ trx_t* trx, /* in: transaction */
+ dulint limit, /* in: least undo number we need */
+ dulint* roll_ptr,/* out: roll pointer to undo record */
+ mem_heap_t* heap); /* in: memory heap where copied */
+/************************************************************************
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread gets the undo log record not using the pop
+function above. */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ /* out: TRUE if succeeded */
+ trx_t* trx, /* in: transaction */
+ dulint undo_no);/* in: undo number of the record */
+/***********************************************************************
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /* in: transaction */
+ dulint undo_no);/* in: undo number */
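trx_undo_rec_reserve() and trx_undo_rec_release() above bracket the processing of a single undo record; the bookkeeping behind them is an array of (undo_no, in_use) cells, see trx_undo_arr_struct at the end of this header. A toy standalone model of that protocol (the real code additionally serializes access with a mutex, and a fixed-size array is an assumption of the sketch):

#include <assert.h>

#define N_CELLS	8

/* toy version of trx_undo_inf_t: an undo number being processed */
struct undo_inf {
	unsigned long	undo_no;
	int		in_use;
};

static struct undo_inf	arr[N_CELLS];

/* reserve a cell for undo_no; fails if it is already reserved
or if every cell is in use (this toy array does not grow) */
static int
undo_rec_reserve(unsigned long undo_no)
{
	int	i;
	int	free_cell = -1;

	for (i = 0; i < N_CELLS; i++) {
		if (arr[i].in_use) {
			if (arr[i].undo_no == undo_no) {
				return(0);	/* already being processed */
			}
		} else if (free_cell < 0) {
			free_cell = i;
		}
	}

	if (free_cell < 0) {
		return(0);
	}

	arr[free_cell].in_use = 1;
	arr[free_cell].undo_no = undo_no;
	return(1);
}

/* release the cell that holds undo_no */
static void
undo_rec_release(unsigned long undo_no)
{
	int	i;

	for (i = 0; i < N_CELLS; i++) {
		if (arr[i].in_use && arr[i].undo_no == undo_no) {
			arr[i].in_use = 0;
			return;
		}
	}
	assert(0);	/* released something that was never reserved */
}

int
main(void)
{
	assert(undo_rec_reserve(42));
	assert(!undo_rec_reserve(42));	/* second reservation must fail */
	undo_rec_release(42);
	assert(undo_rec_reserve(42));	/* free again after release */
	return(0);
}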
+/*************************************************************************
+Starts a rollback operation. */
+UNIV_INTERN
+void
+trx_rollback(
+/*=========*/
+ trx_t* trx, /* in: transaction */
+ trx_sig_t* sig, /* in: signal starting the rollback */
+ que_thr_t** next_thr);/* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/***********************************************************************
+Rolls back or cleans up any incomplete transactions which were
+encountered in crash recovery. If the transaction was already
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread. */
+UNIV_INTERN
+os_thread_ret_t
+trx_rollback_or_clean_all_recovered(
+/*================================*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)));
+ /* in: a dummy parameter required by
+ os_thread_create */
+/********************************************************************
+Finishes a transaction rollback. */
+UNIV_INTERN
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+ que_t* graph, /* in: undo graph which can now be freed */
+ trx_t* trx, /* in: transaction */
+ que_thr_t** next_thr);/* in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if this parameter is
+ NULL, it is ignored */
+/********************************************************************
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph. */
+UNIV_INTERN
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ /* out, own: the query graph */
+ trx_t* trx); /* in: trx handle */
+/*************************************************************************
+Creates a rollback command node struct. */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+ /* out, own: rollback node struct */
+ mem_heap_t* heap); /* in: mem heap where created */
+/***************************************************************
+Performs an execution step for a rollback command node in a query graph. */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ /* out: query thread to run next, or NULL */
+ que_thr_t* thr); /* in: query thread */
+/***********************************************************************
+Rolls back a transaction used in MySQL. */
+UNIV_INTERN
+int
+trx_rollback_for_mysql(
+/*===================*/
+ /* out: error code or DB_SUCCESS */
+ trx_t* trx); /* in: transaction handle */
+/***********************************************************************
+Rolls back the latest SQL statement for MySQL. */
+UNIV_INTERN
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ /* out: error code or DB_SUCCESS */
+ trx_t* trx); /* in: transaction handle */
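Conceptually, a savepoint is just the transaction's undo number at the moment it was taken (trx_savept_t, stored in trx_named_savept_struct below, holds exactly that), and a partial rollback pops and undoes every record whose undo number is >= that limit, which is the 'limit' parameter of trx_roll_pop_top_rec_of_trx() above. A standalone toy of that rule:

#include <assert.h>

int
main(void)
{
	unsigned long	next_undo_no = 0;
	unsigned long	savept;

	next_undo_no = 3;	/* work: undo records 0, 1, 2 written */

	savept = next_undo_no;	/* SAVEPOINT s: remember the undo number */

	next_undo_no = 5;	/* more work: undo records 3, 4 written */

	/* ROLLBACK TO s: pop and undo every record whose undo number
	is >= the savepoint, from the top of the stack downwards */
	while (next_undo_no > savept) {
		next_undo_no--;	/* pop and undo this record */
	}

	assert(next_undo_no == 3);
	return(0);
}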
+/***********************************************************************
+Rolls back a transaction used in MySQL. */
+UNIV_INTERN
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+ /* out: error code or DB_SUCCESS */
+ trx_t* trx, /* in: transaction handle */
+ ibool partial,/* in: TRUE if partial rollback requested */
+ trx_savept_t* savept);/* in: pointer to savepoint undo number, if
+ partial rollback requested */
+/***********************************************************************
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted. */
+UNIV_INTERN
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ /* out: if no savepoint
+ of the name found then
+ DB_NO_SAVEPOINT,
+ otherwise DB_SUCCESS */
+ trx_t* trx, /* in: transaction handle */
+ const char* savepoint_name, /* in: savepoint name */
+ ib_int64_t* mysql_binlog_cache_pos);/* out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+/***********************************************************************
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted in a
+transaction commit or rollback. */
+UNIV_INTERN
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+ /* out: always DB_SUCCESS */
+ trx_t* trx, /* in: transaction handle */
+ const char* savepoint_name, /* in: savepoint name */
+ ib_int64_t binlog_cache_pos); /* in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+
+/***********************************************************************
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted. */
+UNIV_INTERN
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+ /* out: if no savepoint
+ of the name found then
+ DB_NO_SAVEPOINT,
+ otherwise DB_SUCCESS */
+ trx_t* trx, /* in: transaction handle */
+ const char* savepoint_name); /* in: savepoint name */
+
+/***********************************************************************
+Frees a single savepoint struct. */
+UNIV_INTERN
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /* in: transaction handle */
+ trx_named_savept_t* savep); /* in: savepoint to free */
+
+/***********************************************************************
+Frees savepoint structs starting from savep; if savep == NULL, then
+frees all savepoints.
*/ + +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /* in: transaction handle */ + trx_named_savept_t* savep); /* in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ + +/* A cell in the array used during a rollback and a purge */ +struct trx_undo_inf_struct{ + dulint trx_no; /* transaction number: not defined during + a rollback */ + dulint undo_no; /* undo number of an undo record */ + ibool in_use; /* TRUE if the cell is in use */ +}; + +/* During a rollback and a purge, undo numbers of undo records currently being +processed are stored in this array */ + +struct trx_undo_arr_struct{ + ulint n_cells; /* number of cells in the array */ + ulint n_used; /* number of cells currently in use */ + trx_undo_inf_t* infos; /* the array of undo infos */ + mem_heap_t* heap; /* memory heap from which allocated */ +}; + +/* Rollback command node in a query graph */ +struct roll_node_struct{ + que_common_t common; /* node type: QUE_NODE_ROLLBACK */ + ulint state; /* node execution state */ + ibool partial;/* TRUE if we want a partial rollback */ + trx_savept_t savept; /* savepoint to which to roll back, in the + case of a partial rollback */ +}; + +/* A savepoint set with SQL's "SAVEPOINT savepoint_id" command */ +struct trx_named_savept_struct{ + char* name; /* savepoint name */ + trx_savept_t savept; /* the undo number corresponding to + the savepoint */ + ib_int64_t mysql_binlog_cache_pos; + /* the MySQL binlog cache position + corresponding to this savepoint, not + defined if the MySQL binlogging is not + enabled */ + UT_LIST_NODE_T(trx_named_savept_t) + trx_savepoints; /* the list of savepoints of a + transaction */ +}; + +/* Rollback node states */ +#define ROLL_NODE_SEND 1 +#define ROLL_NODE_WAIT 2 + +#ifndef UNIV_NONINL +#include "trx0roll.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0roll.ic b/storage/xtradb/include/trx0roll.ic new file mode 100644 index 00000000000..513b8b44847 --- /dev/null +++ b/storage/xtradb/include/trx0roll.ic @@ -0,0 +1,39 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/*********************************************************************** +Returns pointer to nth element in an undo number array. 
*/ +UNIV_INLINE +trx_undo_inf_t* +trx_undo_arr_get_nth_info( +/*======================*/ + /* out: pointer to the nth element */ + trx_undo_arr_t* arr, /* in: undo number array */ + ulint n) /* in: position */ +{ + ut_ad(arr); + ut_ad(n < arr->n_cells); + + return(arr->infos + n); +} diff --git a/storage/xtradb/include/trx0rseg.h b/storage/xtradb/include/trx0rseg.h new file mode 100644 index 00000000000..f3aa736f788 --- /dev/null +++ b/storage/xtradb/include/trx0rseg.h @@ -0,0 +1,220 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0rseg_h +#define trx0rseg_h + +#include "univ.i" +#include "trx0types.h" +#include "trx0sys.h" + +/********************************************************************** +Gets a rollback segment header. */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get( +/*==========*/ + /* out: rollback segment header, page + x-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number of the header */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Gets a newly created rollback segment header. */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get_new( +/*==============*/ + /* out: rollback segment header, page + x-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number of the header */ + mtr_t* mtr); /* in: mtr */ +/******************************************************************* +Gets the file page number of the nth undo log slot. */ +UNIV_INLINE +ulint +trx_rsegf_get_nth_undo( +/*===================*/ + /* out: page number of the undo log segment */ + trx_rsegf_t* rsegf, /* in: rollback segment header */ + ulint n, /* in: index of slot */ + mtr_t* mtr); /* in: mtr */ +/******************************************************************* +Sets the file page number of the nth undo log slot. */ +UNIV_INLINE +void +trx_rsegf_set_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /* in: rollback segment header */ + ulint n, /* in: index of slot */ + ulint page_no,/* in: page number of the undo log segment */ + mtr_t* mtr); /* in: mtr */ +/******************************************************************** +Looks for a free slot for an undo log segment. 
*/
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ /* out: slot index or ULINT_UNDEFINED if not
+ found */
+ trx_rsegf_t* rsegf, /* in: rollback segment header */
+ mtr_t* mtr); /* in: mtr */
+/**********************************************************************
+Looks for a rollback segment, based on the rollback segment id. */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ /* out: rollback segment */
+ ulint id); /* in: rollback segment id */
+/********************************************************************
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database. */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+ /* out: page number of the created segment,
+ FIL_NULL if fail */
+ ulint space, /* in: space id */
+ ulint zip_size, /* in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint max_size, /* in: max size in pages */
+ ulint* slot_no, /* out: rseg id == slot number in trx sys */
+ mtr_t* mtr); /* in: mtr */
+/*************************************************************************
+Creates the memory copies for rollback segments and initializes the
+rseg list and array in trx_sys at a database startup. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+ trx_sysf_t* sys_header, /* in: trx system header */
+ mtr_t* mtr); /* in: mtr */
+/********************************************************************
+Creates a new rollback segment in the database. */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+ /* out: the created segment object, NULL if
+ fail */
+ ulint space, /* in: space id */
+ ulint max_size, /* in: max size in pages */
+ ulint* id, /* out: rseg id */
+ mtr_t* mtr); /* in: mtr */
+
+
+/* The real maximum is about 4076 slots, but we reserve 4 slots for safety. */
+#define TRX_RSEG_N_EXTRA_SLOTS (((UNIV_PAGE_SIZE - (FIL_PAGE_DATA + FIL_PAGE_DATA_END + TRX_RSEG_UNDO_SLOTS)) / TRX_RSEG_SLOT_SIZE) - 4)
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS (srv_extra_undoslots ? TRX_RSEG_N_EXTRA_SLOTS : (UNIV_PAGE_SIZE / 16))
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
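The 4076 mentioned above falls out of the formula with the stock 16kB page and the usual InnoDB header sizes; the values used below (FIL_PAGE_DATA = 38, FIL_PAGE_DATA_END = 8, FLST_BASE_NODE_SIZE = 16, FSEG_HEADER_SIZE = 10) are assumptions stated for this standalone check, not definitions from this patch:

#include <assert.h>

/* assumed standard InnoDB sizes, not defined in this header */
#define UNIV_PAGE_SIZE		16384
#define FIL_PAGE_DATA		38
#define FIL_PAGE_DATA_END	8
#define FLST_BASE_NODE_SIZE	16
#define FSEG_HEADER_SIZE	10

#define TRX_RSEG_SLOT_SIZE	4
#define TRX_RSEG_UNDO_SLOTS	(8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)

int
main(void)
{
	int	raw = (UNIV_PAGE_SIZE
		       - (FIL_PAGE_DATA + FIL_PAGE_DATA_END
			  + TRX_RSEG_UNDO_SLOTS))
		/ TRX_RSEG_SLOT_SIZE;		/* 4076: all remaining bytes
						of the page used for slots */
	int	n_extra = raw - 4;		/* 4072: 4 slots held back */
	int	n_default = UNIV_PAGE_SIZE / 16;/* 1024: stock InnoDB limit */

	assert(raw == 4076 && n_extra == 4072 && n_default == 1024);
	return(0);
}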
+
+/* The rollback segment memory object */
+struct trx_rseg_struct{
+ /*--------------------------------------------------------*/
+ ulint id; /* rollback segment id == the index of
+ its slot in the trx system file copy */
+ mutex_t mutex; /* mutex protecting the fields in this
+ struct except id; NOTE that the latching
+ order must always be kernel mutex ->
+ rseg mutex */
+ ulint space; /* space where the rollback segment
+ header is placed */
+ ulint zip_size;/* compressed page size of space
+ in bytes, or 0 for uncompressed spaces */
+ ulint page_no;/* page number of the rollback segment
+ header */
+ ulint max_size;/* maximum allowed size in pages */
+ ulint curr_size;/* current size in pages */
+ /*--------------------------------------------------------*/
+ /* Fields for update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list;
+ /* List of update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached;
+ /* List of update undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ /* Fields for insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list;
+ /* List of insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached;
+ /* List of insert undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ ulint last_page_no; /* Page number of the last not yet
+ purged log header in the history list;
+ FIL_NULL if all list purged */
+ ulint last_offset; /* Byte offset of the last not yet
+ purged log header */
+ dulint last_trx_no; /* Transaction number of the last not
+ yet purged log */
+ ibool last_del_marks; /* TRUE if the last not yet purged log
+ needs purging */
+ /*--------------------------------------------------------*/
+ UT_LIST_NODE_T(trx_rseg_t) rseg_list;
+ /* the list of the rollback segment
+ memory objects */
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
+ an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE 4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback
+ segment in pages */
+#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied
+ by the logs in the history list */
+#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed
+ transactions */
+#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE)
+ /* Header for the file segment where
+ this page is placed */
+#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+ /* Undo log segment slots */
+/*-------------------------------------------------------------*/
+
+#ifndef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0rseg.ic b/storage/xtradb/include/trx0rseg.ic
new file mode 100644
index 00000000000..e665a40fa8b
--- /dev/null
+++ b/storage/xtradb/include/trx0rseg.ic
@@ -0,0 +1,146 @@
+/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "srv0srv.h" + +/********************************************************************** +Gets a rollback segment header. */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get( +/*==========*/ + /* out: rollback segment header, page + x-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number of the header */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + trx_rsegf_t* header; + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER); + + header = TRX_RSEG + buf_block_get_frame(block); + + return(header); +} + +/********************************************************************** +Gets a newly created rollback segment header. */ +UNIV_INLINE +trx_rsegf_t* +trx_rsegf_get_new( +/*==============*/ + /* out: rollback segment header, page + x-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number of the header */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + trx_rsegf_t* header; + + block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); + + header = TRX_RSEG + buf_block_get_frame(block); + + return(header); +} + +/******************************************************************* +Gets the file page number of the nth undo log slot. */ +UNIV_INLINE +ulint +trx_rsegf_get_nth_undo( +/*===================*/ + /* out: page number of the undo log segment */ + trx_rsegf_t* rsegf, /* in: rollback segment header */ + ulint n, /* in: index of slot */ + mtr_t* mtr) /* in: mtr */ +{ + if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) { + fprintf(stderr, + "InnoDB: Error: trying to get slot %lu of rseg\n", + (ulong) n); + ut_error; + } + + return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr)); +} + +/******************************************************************* +Sets the file page number of the nth undo log slot. 
*/ +UNIV_INLINE +void +trx_rsegf_set_nth_undo( +/*===================*/ + trx_rsegf_t* rsegf, /* in: rollback segment header */ + ulint n, /* in: index of slot */ + ulint page_no,/* in: page number of the undo log segment */ + mtr_t* mtr) /* in: mtr */ +{ + if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) { + fprintf(stderr, + "InnoDB: Error: trying to set slot %lu of rseg\n", + (ulong) n); + ut_error; + } + + mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE, + page_no, MLOG_4BYTES, mtr); +} + +/******************************************************************** +Looks for a free slot for an undo log segment. */ +UNIV_INLINE +ulint +trx_rsegf_undo_find_free( +/*=====================*/ + /* out: slot index or ULINT_UNDEFINED if not + found */ + trx_rsegf_t* rsegf, /* in: rollback segment header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint i; + ulint page_no; + + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + + page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h new file mode 100644 index 00000000000..f7e7e082278 --- /dev/null +++ b/storage/xtradb/include/trx0sys.h @@ -0,0 +1,560 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0sys_h +#define trx0sys_h + +#include "univ.i" + +#include "trx0types.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "mem0mem.h" +#include "sync0sync.h" +#include "ut0lst.h" +#include "buf0buf.h" +#include "fil0fil.h" +#include "fut0lst.h" +#include "fsp0fsp.h" +#include "read0types.h" +#include "page0types.h" + +/* In a MySQL replication slave, in crash recovery we store the master log +file name and position here. We have successfully got the updates to InnoDB +up to this position. If .._pos is -1, it means no crash recovery was needed, +or there was no master log position info inside InnoDB. */ + +extern char trx_sys_mysql_master_log_name[]; +extern ib_int64_t trx_sys_mysql_master_log_pos; + +extern char trx_sys_mysql_relay_log_name[]; +extern ib_int64_t trx_sys_mysql_relay_log_pos; + +/* If this MySQL server uses binary logging, after InnoDB has been inited +and if it has done a crash recovery, we store the binlog file name and position +here. If .._pos is -1, it means there was no binlog position info inside +InnoDB. 
*/
+
+extern char trx_sys_mysql_bin_log_name[];
+extern ib_int64_t trx_sys_mysql_bin_log_pos;
+
+/* The transaction system */
+extern trx_sys_t* trx_sys;
+
+/* Doublewrite system */
+extern trx_doublewrite_t* trx_doublewrite;
+extern ibool trx_doublewrite_must_reset_space_ids;
+extern ibool trx_sys_multiple_tablespace_format;
+
+/********************************************************************
+Creates the doublewrite buffer for a new InnoDB installation. The header of
+the doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+trx_sys_create_doublewrite_buf(void);
+/*================================*/
+/********************************************************************
+At database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages);
+/********************************************************************
+Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
+multiple tablespace format. */
+UNIV_INTERN
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void);
+/*===============================================*/
+/********************************************************************
+Determines if a page number is located inside the doublewrite buffer. */
+UNIV_INTERN
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+ /* out: TRUE if the location is inside
+ the two blocks of the doublewrite buffer */
+ ulint page_no); /* in: page number */
+/*******************************************************************
+Checks if a page address is the trx sys header page. */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ /* out: TRUE if trx sys header page */
+ ulint space, /* in: space */
+ ulint page_no);/* in: page number */
+/*********************************************************************
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+UNIV_INTERN
+void
+trx_sys_init_at_db_start(void);
+/*==========================*/
+/*********************************************************************
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create(void);
+/*================*/
+/*********************************************************************
+Creates extra rollback segments when a new database is created
+(create_new_db). */
+UNIV_INTERN
+void
+trx_sys_create_extra_rseg(
+/*======================*/
+ ulint num); /* in: number of extra user rollback segments */
+/********************************************************************
+Looks for a free slot for a rollback segment in the trx system file copy. */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ /* out: slot index or ULINT_UNDEFINED
+ if not found */
+ mtr_t* mtr); /* in: mtr */
+/*******************************************************************
+Gets the pointer in the nth slot of the rseg array.
*/ +UNIV_INLINE +trx_rseg_t* +trx_sys_get_nth_rseg( +/*=================*/ + /* out: pointer to rseg object, NULL if slot + not in use */ + trx_sys_t* sys, /* in: trx system */ + ulint n); /* in: index of slot */ +/******************************************************************* +Sets the pointer in the nth slot of the rseg array. */ +UNIV_INLINE +void +trx_sys_set_nth_rseg( +/*=================*/ + trx_sys_t* sys, /* in: trx system */ + ulint n, /* in: index of slot */ + trx_rseg_t* rseg); /* in: pointer to rseg object, NULL if slot + not in use */ +/************************************************************************** +Gets a pointer to the transaction system file copy and x-locks its page. */ +UNIV_INLINE +trx_sysf_t* +trx_sysf_get( +/*=========*/ + /* out: pointer to system file copy, page x-locked */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************* +Gets the space of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_space( +/*====================*/ + /* out: space id */ + trx_sysf_t* sys_header, /* in: trx sys file copy */ + ulint i, /* in: slot index == rseg id */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************* +Gets the page number of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_page_no( +/*======================*/ + /* out: page number, FIL_NULL + if slot unused */ + trx_sysf_t* sys_header, /* in: trx sys file copy */ + ulint i, /* in: slot index == rseg id */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************* +Sets the space id of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_space( +/*====================*/ + trx_sysf_t* sys_header, /* in: trx sys file copy */ + ulint i, /* in: slot index == rseg id */ + ulint space, /* in: space id */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************* +Sets the page number of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_page_no( +/*======================*/ + trx_sysf_t* sys_header, /* in: trx sys file copy */ + ulint i, /* in: slot index == rseg id */ + ulint page_no, /* in: page number, FIL_NULL if + the slot is reset to unused */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************* +Allocates a new transaction id. */ +UNIV_INLINE +dulint +trx_sys_get_new_trx_id(void); +/*========================*/ + /* out: new, allocated trx id */ +/********************************************************************* +Allocates a new transaction number. */ +UNIV_INLINE +dulint +trx_sys_get_new_trx_no(void); +/*========================*/ + /* out: new, allocated trx number */ +/********************************************************************* +Writes a trx id to an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_trx_id( +/*=============*/ + byte* ptr, /* in: pointer to memory where written */ + dulint id); /* in: id */ +/********************************************************************* +Reads a trx id from an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_read_... 
*/
+UNIV_INLINE
+dulint
+trx_read_trx_id(
+/*============*/
+ /* out: id */
+ const byte* ptr); /* in: pointer to memory from where to read */
+/********************************************************************
+Looks for the trx handle with the given id in trx_list. */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+ /* out: the trx handle or NULL if not found */
+ dulint trx_id); /* in: trx id to search for */
+/********************************************************************
+Returns the minimum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.) */
+UNIV_INLINE
+dulint
+trx_list_get_min_trx_id(void);
+/*=========================*/
+ /* out: the minimum trx id, or trx_sys->max_trx_id
+ if the trx list is empty */
+/********************************************************************
+Checks if a transaction with the given id is active. */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+ /* out: TRUE if active */
+ dulint trx_id);/* in: trx id of the transaction */
+/********************************************************************
+Checks that trx is in the trx list. */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+ /* out: TRUE if is in */
+ trx_t* in_trx);/* in: trx */
+/*********************************************************************
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ const char* file_name_in,/* in: MySQL log file name */
+ ib_int64_t offset, /* in: position in that log file */
+ ulint field, /* in: offset of the MySQL log info field in
+ the trx sys header */
+ mtr_t* mtr); /* in: mtr */
+/*********************************************************************
+Prints to stderr the MySQL binlog offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void);
+/*===================================*/
+#ifdef UNIV_HOTBACKUP
+/*********************************************************************
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+ const byte* page); /* in: buffer containing the trx
+ system header page, i.e., page number
+ TRX_SYS_PAGE_NO in the tablespace */
+#endif /* UNIV_HOTBACKUP */
+/*********************************************************************
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void);
+/*====================================*/
+/*********************************************************************
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void);
+/*==========================*/
+/*********************************************************************
+Closes the tablespace tag system.
*/ +UNIV_INTERN +void +trx_sys_file_format_close(void); +/*===========================*/ +/************************************************************************ +Tags the system table space with minimum format id if it has not been +tagged yet. +WARNING: This function is only called during the startup and AFTER the +redo log application during recovery has finished. */ +UNIV_INTERN +void +trx_sys_file_format_tag_init(void); +/*==============================*/ +/********************************************************************* +Get the name representation of the file format from its id. */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + /* out: pointer to the name */ + const ulint id); /* in: id of the file format */ +/********************************************************************* +Set the file format id unconditionally except if it's already the +same value. */ +UNIV_INTERN +ibool +trx_sys_file_format_max_set( +/*========================*/ + /* out: TRUE if value updated */ + ulint format_id, /* in: file format id */ + const char** name); /* out: max file format name or + NULL if not needed. */ +/********************************************************************* +Get the name representation of the file format from its id. */ +UNIV_INTERN +const char* +trx_sys_file_format_max_get(void); +/*=============================*/ + /* out: pointer to the max format name */ +/********************************************************************* +Check for the max file format tag stored on disk. */ +UNIV_INTERN +ulint +trx_sys_file_format_max_check( +/*==========================*/ + /* out: DB_SUCCESS or error code */ + ulint max_format_id); /* in: the max format id to check */ +/************************************************************************ +Update the file format tag in the system tablespace only if the given +format id is greater than the known max id. 
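
The contract of trx_sys_file_format_max_upgrade(), declared next, amounts to a monotonic high-water mark: the persisted tag only ever moves up. A minimal sketch of that policy, with a plain variable standing in for the on-disk tag:

/* Hypothetical in-memory stand-in for the persisted max format tag. */
static unsigned long	file_format_max = 0;

/* Raise the mark only when the new id exceeds it; returns 1 if the
mark moved (the real function would also flush the new value to the
trx sys header page under a mini-transaction). */
static int
file_format_max_upgrade(unsigned long format_id)
{
	if (format_id > file_format_max) {
		file_format_max = format_id;
		return(1);
	}

	return(0);
}
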
*/ +UNIV_INTERN +ibool +trx_sys_file_format_max_upgrade( +/*============================*/ + /* out: TRUE if format_id was + bigger than the known max id */ + const char** name, /* out: max file format name */ + ulint format_id); /* in: file format identifier */ +/* The automatically created system rollback segment has this id */ +#define TRX_SYS_SYSTEM_RSEG_ID 0 + +/* Space id and page no where the trx system file copy resides */ +#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */ +#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO + +/* The offset of the transaction system header on the page */ +#define TRX_SYS FSEG_PAGE_DATA + +/* Transaction system header */ +/*-------------------------------------------------------------*/ +#define TRX_SYS_TRX_ID_STORE 0 /* the maximum trx id or trx number + modulo TRX_SYS_TRX_ID_WRITE_MARGIN + written to a file page by any + transaction; the assignment of + transaction ids continues from this + number rounded up by .._MARGIN plus + .._MARGIN when the database is + started */ +#define TRX_SYS_FSEG_HEADER 8 /* segment header for the tablespace + segment the trx system is created + into */ +#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE) + /* the start of the array of rollback + segment specification slots */ +/*-------------------------------------------------------------*/ + +/* Max number of rollback segments: the number of segment specification slots +in the transaction system array; rollback segment id must fit in one byte, +therefore 256; each slot is currently 8 bytes in size */ +#define TRX_SYS_N_RSEGS 256 + +#define TRX_SYS_MYSQL_LOG_NAME_LEN 512 +#define TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN 480 /* (500 - 12) is the hard limit. */ +#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 + +#if UNIV_PAGE_SIZE < 4096 +# error "UNIV_PAGE_SIZE < 4096" +#endif +/* The offset of the MySQL replication info in the trx system header; +this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */ +#define TRX_SYS_MYSQL_MASTER_LOG_INFO (UNIV_PAGE_SIZE - 2000) +#define TRX_SYS_MYSQL_RELAY_LOG_INFO (UNIV_PAGE_SIZE - 1500) + +/* The offset of the MySQL binlog offset info in the trx system header */ +#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000) +#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /* magic number which shows + if we have valid data in the + MySQL binlog info; the value + is ..._MAGIC_N if yes */ +#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH 4 /* high 4 bytes of the offset + within that file */ +#define TRX_SYS_MYSQL_LOG_OFFSET_LOW 8 /* low 4 bytes of the offset + within that file */ +#define TRX_SYS_MYSQL_LOG_NAME 12 /* MySQL log file name */ + +/* The offset of the doublewrite buffer header on the trx system header page */ +#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200) +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_FSEG 0 /* fseg header of the fseg + containing the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE + /* 4-byte magic number which + shows if we already have + created the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE) + /* page number of the + first page in the first + sequence of 64 + (= FSP_EXTENT_SIZE) consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE) + /* page number of the + first page in the second + sequence of 64 consecutive + pages in the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /* we repeat the above 3 + numbers so that if the trx + sys header is half-written 
+ to disk, we still may be able + to recover the information */ +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE) + /* If this is not yet set to + .._N, we must reset the + doublewrite buffer, because + starting from 4.1.x the space + id of a data page is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO */ +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386 + + +#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE + +/* The offset of the file format tag on the trx system header page */ +#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16) + +/* We use these random constants to reduce the probability of reading +garbage (from previous versions) that maps to an actual format id. We +use these as bit masks at the time of reading and writing from/to disk. */ +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL +#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL + +/* Doublewrite control struct */ +struct trx_doublewrite_struct{ + mutex_t mutex; /* mutex protecting the first_free field and + write_buf */ + ulint block1; /* the page number of the first + doublewrite block (64 pages) */ + ulint block2; /* page number of the second block */ + ulint first_free; /* first free position in write_buf measured + in units of UNIV_PAGE_SIZE */ + byte* write_buf; /* write buffer used in writing to the + doublewrite buffer, aligned to an + address divisible by UNIV_PAGE_SIZE + (which is required by Windows aio) */ + byte* write_buf_unaligned; /* pointer to write_buf, but unaligned */ + buf_page_t** + buf_block_arr; /* array to store pointers to the buffer + blocks which have been cached to write_buf */ +}; + +/* The transaction system central memory data structure; protected by the +kernel mutex */ +struct trx_sys_struct{ + dulint max_trx_id; /* The smallest number not yet + assigned as a transaction id or + transaction number */ + UT_LIST_BASE_NODE_T(trx_t) trx_list; + /* List of active and committed in + memory transactions, sorted on trx id, + biggest first */ + UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list; + /* List of transactions created + for MySQL */ + UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list; + /* List of rollback segment objects */ + trx_rseg_t* latest_rseg; /* Latest rollback segment in the + round-robin assignment of rollback + segments to transactions */ + trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS]; + /* Pointer array to rollback segments; + NULL if slot not in use */ + ulint rseg_history_len;/* Length of the TRX_RSEG_HISTORY + list (update undo logs for committed + transactions), protected by + rseg->mutex */ + UT_LIST_BASE_NODE_T(read_view_t) view_list; + /* List of read views sorted on trx no, + biggest first */ +}; + +/* When a trx id which is zero modulo this number (which must be a power of +two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system +page is updated */ +#define TRX_SYS_TRX_ID_WRITE_MARGIN 256 + +#ifndef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic new file mode 100644 index 00000000000..4437133188f --- /dev/null +++ b/storage/xtradb/include/trx0sys.ic @@ -0,0 +1,383 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "srv0srv.h" +#include "trx0trx.h" + +/* The typedef for rseg slot in the file copy */ +typedef byte trx_sysf_rseg_t; + +/* Rollback segment specification slot offsets */ +/*-------------------------------------------------------------*/ +#define TRX_SYS_RSEG_SPACE 0 /* space where the segment + header is placed; starting with + MySQL/InnoDB 5.1.7, this is + UNIV_UNDEFINED if the slot is unused */ +#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the segment + header is placed; this is FIL_NULL + if the slot is unused */ +/*-------------------------------------------------------------*/ +/* Size of a rollback segment specification slot */ +#define TRX_SYS_RSEG_SLOT_SIZE 8 + +/********************************************************************* +Writes the value of max_trx_id to the file based trx system header. */ +UNIV_INTERN +void +trx_sys_flush_max_trx_id(void); +/*==========================*/ + +/******************************************************************* +Checks if a page address is the trx sys header page. */ +UNIV_INLINE +ibool +trx_sys_hdr_page( +/*=============*/ + /* out: TRUE if trx sys header page */ + ulint space, /* in: space */ + ulint page_no)/* in: page number */ +{ + if ((space == TRX_SYS_SPACE) && (page_no == TRX_SYS_PAGE_NO)) { + + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************* +Gets the pointer in the nth slot of the rseg array. */ +UNIV_INLINE +trx_rseg_t* +trx_sys_get_nth_rseg( +/*=================*/ + /* out: pointer to rseg object, NULL if slot + not in use */ + trx_sys_t* sys, /* in: trx system */ + ulint n) /* in: index of slot */ +{ + ut_ad(mutex_own(&(kernel_mutex))); + ut_ad(n < TRX_SYS_N_RSEGS); + + return(sys->rseg_array[n]); +} + +/******************************************************************* +Sets the pointer in the nth slot of the rseg array. */ +UNIV_INLINE +void +trx_sys_set_nth_rseg( +/*=================*/ + trx_sys_t* sys, /* in: trx system */ + ulint n, /* in: index of slot */ + trx_rseg_t* rseg) /* in: pointer to rseg object, NULL if slot + not in use */ +{ + ut_ad(n < TRX_SYS_N_RSEGS); + + sys->rseg_array[n] = rseg; +} + +/************************************************************************** +Gets a pointer to the transaction system header and x-latches its page. */ +UNIV_INLINE +trx_sysf_t* +trx_sysf_get( +/*=========*/ + /* out: pointer to system header, page x-latched. 
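
The four slot accessors that follow are pure offset arithmetic over the 8-byte rollback segment slots defined above. A sketch of the address computation they share; the FSEG_HEADER_SIZE value (10 bytes, from fsp0fsp.h) is inlined as a literal only to keep the sketch self-contained, and the names are illustrative:

enum {
	RSEG_SLOT_START	= 8 + 10,	/* TRX_SYS_RSEGS */
	RSEG_SLOT_SIZE	= 8,		/* TRX_SYS_RSEG_SLOT_SIZE */
	RSEG_SPACE	= 0,		/* TRX_SYS_RSEG_SPACE */
	RSEG_PAGE_NO	= 4		/* TRX_SYS_RSEG_PAGE_NO */
};

/* Byte offset, relative to the trx sys header, of slot i's space id
field; pass RSEG_PAGE_NO instead of RSEG_SPACE for the page number. */
static unsigned long
rseg_slot_field_offset(unsigned long i, unsigned long field)
{
	return(RSEG_SLOT_START + i * RSEG_SLOT_SIZE + field);
}

trx_sysf_rseg_get_space() below is essentially mtr_read_ulint() applied at this computed offset, and the two setters are the matching mlog_write_ulint() calls.
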
*/ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block; + trx_sysf_t* header; + + ut_ad(mtr); + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + header = TRX_SYS + buf_block_get_frame(block); + + return(header); +} + +/********************************************************************* +Gets the space of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_space( +/*====================*/ + /* out: space id */ + trx_sysf_t* sys_header, /* in: trx sys header */ + ulint i, /* in: slot index == rseg id */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mutex_own(&(kernel_mutex))); + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr)); +} + +/********************************************************************* +Gets the page number of the nth rollback segment slot in the trx system +header. */ +UNIV_INLINE +ulint +trx_sysf_rseg_get_page_no( +/*======================*/ + /* out: page number, FIL_NULL + if slot unused */ + trx_sysf_t* sys_header, /* in: trx system header */ + ulint i, /* in: slot index == rseg id */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(sys_header); + ut_ad(mutex_own(&(kernel_mutex))); + ut_ad(i < TRX_SYS_N_RSEGS); + + return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr)); +} + +/********************************************************************* +Sets the space id of the nth rollback segment slot in the trx system +file copy. */ +UNIV_INLINE +void +trx_sysf_rseg_set_space( +/*====================*/ + trx_sysf_t* sys_header, /* in: trx sys file copy */ + ulint i, /* in: slot index == rseg id */ + ulint space, /* in: space id */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mutex_own(&(kernel_mutex))); + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + mlog_write_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE, + space, + MLOG_4BYTES, mtr); +} + +/********************************************************************* +Sets the page number of the nth rollback segment slot in the trx system +header. */ +UNIV_INLINE +void +trx_sysf_rseg_set_page_no( +/*======================*/ + trx_sysf_t* sys_header, /* in: trx sys header */ + ulint i, /* in: slot index == rseg id */ + ulint page_no, /* in: page number, FIL_NULL if the + slot is reset to unused */ + mtr_t* mtr) /* in: mtr */ +{ + ut_ad(mutex_own(&(kernel_mutex))); + ut_ad(sys_header); + ut_ad(i < TRX_SYS_N_RSEGS); + + mlog_write_ulint(sys_header + TRX_SYS_RSEGS + + i * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_PAGE_NO, + page_no, + MLOG_4BYTES, mtr); +} + +/********************************************************************* +Writes a trx id to an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_trx_id( +/*=============*/ + byte* ptr, /* in: pointer to memory where written */ + dulint id) /* in: id */ +{ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(ptr, id); +} + +/********************************************************************* +Reads a trx id from an index page. In case that the id size changes in +some future version, this function should be used instead of +mach_read_... 
*/ +UNIV_INLINE +dulint +trx_read_trx_id( +/*============*/ + /* out: id */ + const byte* ptr) /* in: pointer to memory from where to read */ +{ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + return(mach_read_from_6(ptr)); +} + +/******************************************************************** +Looks for the trx handle with the given id in trx_list. */ +UNIV_INLINE +trx_t* +trx_get_on_id( +/*==========*/ + /* out: the trx handle or NULL if not found */ + dulint trx_id) /* in: trx id to search for */ +{ + trx_t* trx; + + ut_ad(mutex_own(&(kernel_mutex))); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx != NULL) { + if (0 == ut_dulint_cmp(trx_id, trx->id)) { + + return(trx); + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + return(NULL); +} + +/******************************************************************** +Returns the minimum trx id in trx list. This is the smallest id for which +the trx can possibly be active. (But, you must look at the trx->conc_state to +find out if the minimum trx id transaction itself is active, or already +committed.) */ +UNIV_INLINE +dulint +trx_list_get_min_trx_id(void) +/*=========================*/ + /* out: the minimum trx id, or trx_sys->max_trx_id + if the trx list is empty */ +{ + trx_t* trx; + + ut_ad(mutex_own(&(kernel_mutex))); + + trx = UT_LIST_GET_LAST(trx_sys->trx_list); + + if (trx == NULL) { + + return(trx_sys->max_trx_id); + } + + return(trx->id); +} + +/******************************************************************** +Checks if a transaction with the given id is active. */ +UNIV_INLINE +ibool +trx_is_active( +/*==========*/ + /* out: TRUE if active */ + dulint trx_id) /* in: trx id of the transaction */ +{ + trx_t* trx; + + ut_ad(mutex_own(&(kernel_mutex))); + + if (ut_dulint_cmp(trx_id, trx_list_get_min_trx_id()) < 0) { + + return(FALSE); + } + + if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) { + + /* There must be corruption: we return TRUE because this + function is only called by lock_clust_rec_some_has_impl() + and row_vers_impl_x_locked_off_kernel() and they have + diagnostic prints in this case */ + + return(TRUE); + } + + trx = trx_get_on_id(trx_id); + if (trx && (trx->conc_state == TRX_ACTIVE + || trx->conc_state == TRX_PREPARED)) { + + return(TRUE); + } + + return(FALSE); +} + +/********************************************************************* +Allocates a new transaction id. */ +UNIV_INLINE +dulint +trx_sys_get_new_trx_id(void) +/*========================*/ + /* out: new, allocated trx id */ +{ + dulint id; + + ut_ad(mutex_own(&kernel_mutex)); + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if + will evaluate to TRUE the first time this function is called, + and the value for the trx id will be written to the disk-based header! + Thus trx id values will not overlap when the database is + repeatedly started! */ + + if (ut_dulint_get_low(trx_sys->max_trx_id) + % TRX_SYS_TRX_ID_WRITE_MARGIN == 0) { + + trx_sys_flush_max_trx_id(); + } + + id = trx_sys->max_trx_id; + + UT_DULINT_INC(trx_sys->max_trx_id); + + return(id); +} + +/********************************************************************* +Allocates a new transaction number. 
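
The flush-margin scheme in trx_sys_get_new_trx_id() above writes TRX_SYS_TRX_ID_STORE only once every TRX_SYS_TRX_ID_WRITE_MARGIN allocations, and startup resumes well past the stored value so ids stay unique even after a crash loses unflushed allocations. A compact sketch of both halves, with a plain 64-bit counter in place of the dulint arithmetic:

#include <stdint.h>

#define ID_WRITE_MARGIN	256	/* TRX_SYS_TRX_ID_WRITE_MARGIN */

static uint64_t	max_trx_id;	/* in-memory counter */
static uint64_t	stored_id;	/* stand-in for the value on the page */

/* Allocate the next id, flushing the counter to the "page" at every
margin boundary, as trx_sys_get_new_trx_id() does. */
static uint64_t
new_trx_id(void)
{
	if (max_trx_id % ID_WRITE_MARGIN == 0) {
		stored_id = max_trx_id;	/* trx_sys_flush_max_trx_id() */
	}

	return(max_trx_id++);
}

/* At startup: resume past anything that might have been handed out
but not flushed ("rounded up by .._MARGIN plus .._MARGIN"). */
static void
init_max_trx_id(void)
{
	max_trx_id = stored_id - (stored_id % ID_WRITE_MARGIN)
		+ 2 * ID_WRITE_MARGIN;
}
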
*/ +UNIV_INLINE +dulint +trx_sys_get_new_trx_no(void) +/*========================*/ + /* out: new, allocated trx number */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + return(trx_sys_get_new_trx_id()); +} diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h new file mode 100644 index 00000000000..e7f7539f9ee --- /dev/null +++ b/storage/xtradb/include/trx0trx.h @@ -0,0 +1,860 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0trx_h +#define trx0trx_h + +#include "univ.i" +#include "trx0types.h" +#include "lock0types.h" +#include "usr0types.h" +#include "que0types.h" +#include "mem0mem.h" +#include "read0types.h" +#include "dict0types.h" +#include "trx0xa.h" +#include "ut0vec.h" + +/* Dummy session used currently in MySQL interface */ +extern sess_t* trx_dummy_sess; + +/* Number of transactions currently allocated for MySQL: protected by +the kernel mutex */ +extern ulint trx_n_mysql_transactions; + +/***************************************************************** +Resets the new record lock info in a transaction struct. */ +UNIV_INLINE +void +trx_reset_new_rec_lock_info( +/*========================*/ + trx_t* trx); /* in: transaction struct */ +/***************************************************************** +Registers that we have set a new record lock on an index. We only have space +to store 2 indexes! If this is called to store more than 2 indexes after +trx_reset_new_rec_lock_info(), then this function does nothing. */ +UNIV_INLINE +void +trx_register_new_rec_lock( +/*======================*/ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index); /* in: trx sets a new record lock on this + index */ +/***************************************************************** +Checks if trx has set a new record lock on an index. */ +UNIV_INLINE +ibool +trx_new_rec_locks_contain( +/*======================*/ + /* out: TRUE if trx has set a new record lock + on index */ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index); /* in: index */ +/************************************************************************ +Releases the search latch if trx has reserved it. */ +UNIV_INTERN +void +trx_search_latch_release_if_reserved( +/*=================================*/ + trx_t* trx); /* in: transaction */ +/********************************************************************** +Set detailed error message for the transaction. 
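
trx_set_detailed_error_from_file(), declared a little further down, fills the transaction's fixed 256-byte detailed_error buffer from a scratch file, and its comment is explicit that the file is rewound first. A standalone sketch of that pattern; the function name and buffer handling here are simplified assumptions, not the real implementation:

#include <stdio.h>

/* Read the first line of a scratch file into a fixed error buffer,
rewinding the file first, as the header comment requires. */
static void
set_detailed_error_from_file(char* errbuf, size_t errbuf_size, FILE* file)
{
	rewind(file);

	if (fgets(errbuf, (int) errbuf_size, file) == NULL) {
		errbuf[0] = '\0';	/* empty message on read failure */
	}
}
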
*/ +UNIV_INTERN +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /* in: transaction struct */ + const char* msg); /* in: detailed error message */ +/***************************************************************** +Set detailed error message for the transaction from a file. Note that the +file is rewound before reading from it. */ +UNIV_INTERN +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /* in: transaction struct */ + FILE* file); /* in: file to read message from */ +/******************************************************************** +Retrieves the error_info field from a trx. */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + /* out: the error info */ + const trx_t* trx); /* in: trx object */ +/******************************************************************** +Creates and initializes a transaction object. */ +UNIV_INTERN +trx_t* +trx_create( +/*=======*/ + /* out, own: the transaction */ + sess_t* sess) /* in: session */ + __attribute__((nonnull)); +/************************************************************************ +Creates a transaction object for MySQL. */ +UNIV_INTERN +trx_t* +trx_allocate_for_mysql(void); +/*========================*/ + /* out, own: transaction object */ +/************************************************************************ +Creates a transaction object for background operations by the master thread. */ +UNIV_INTERN +trx_t* +trx_allocate_for_background(void); +/*=============================*/ + /* out, own: transaction object */ +/************************************************************************ +Frees a transaction object. */ +UNIV_INTERN +void +trx_free( +/*=====*/ + trx_t* trx); /* in, own: trx object */ +/************************************************************************ +Frees a transaction object for MySQL. */ +UNIV_INTERN +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx); /* in, own: trx object */ +/************************************************************************ +Frees a transaction object of a background operation of the master thread. */ +UNIV_INTERN +void +trx_free_for_background( +/*====================*/ + trx_t* trx); /* in, own: trx object */ +/******************************************************************** +Creates trx objects for transactions and initializes the trx list of +trx_sys at database start. Rollback segment and undo log lists must +already exist when this function is called, because the lists of +transactions to be rolled back or cleaned up are built based on the +undo log lists. */ +UNIV_INTERN +void +trx_lists_init_at_db_start(void); +/*============================*/ +/******************************************************************** +Starts a new transaction. */ +UNIV_INTERN +ibool +trx_start( +/*======*/ + /* out: TRUE if success, FALSE if the rollback + segment could not support this many transactions */ + trx_t* trx, /* in: transaction */ + ulint rseg_id);/* in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +/******************************************************************** +Starts a new transaction. 
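
When ULINT_UNDEFINED is passed as rseg_id to trx_start() above, the rollback segment is chosen round-robin; trx_sys_t (earlier in this patch) keeps the cursor in its latest_rseg field. A sketch of that selection over the slot array, skipping unused slots; the names and the void* slot type are illustrative:

#define N_RSEGS	256	/* TRX_SYS_N_RSEGS */

static void*	rseg_array[N_RSEGS];	/* NULL if slot not in use */
static unsigned	latest_rseg;		/* round-robin cursor */

/* Return the next in-use rollback segment after the cursor, wrapping
around the array; NULL if no slot is in use at all. */
static void*
assign_rseg_round_robin(void)
{
	unsigned	i;

	for (i = 1; i <= N_RSEGS; i++) {
		unsigned	slot = (latest_rseg + i) % N_RSEGS;

		if (rseg_array[slot] != NULL) {
			latest_rseg = slot;
			return(rseg_array[slot]);
		}
	}

	return(NULL);
}
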
*/ +UNIV_INTERN +ibool +trx_start_low( +/*==========*/ + /* out: TRUE */ + trx_t* trx, /* in: transaction */ + ulint rseg_id);/* in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +/***************************************************************** +Starts the transaction if it is not yet started. */ +UNIV_INLINE +void +trx_start_if_not_started( +/*=====================*/ + trx_t* trx); /* in: transaction */ +/***************************************************************** +Starts the transaction if it is not yet started. Assumes we have reserved +the kernel mutex! */ +UNIV_INLINE +void +trx_start_if_not_started_low( +/*=========================*/ + trx_t* trx); /* in: transaction */ +/******************************************************************** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit_off_kernel( +/*==================*/ + trx_t* trx); /* in: transaction */ +/******************************************************************** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, and we cannot roll it back. */ +UNIV_INTERN +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx); /* in: transaction */ +/************************************************************************** +Does the transaction commit for MySQL. */ +UNIV_INTERN +ulint +trx_commit_for_mysql( +/*=================*/ + /* out: DB_SUCCESS or error number */ + trx_t* trx); /* in: trx handle */ +/************************************************************************** +Does the transaction prepare for MySQL. */ +UNIV_INTERN +ulint +trx_prepare_for_mysql( +/*==================*/ + /* out: 0 or error number */ + trx_t* trx); /* in: trx handle */ +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. */ +UNIV_INTERN +int +trx_recover_for_mysql( +/*==================*/ + /* out: number of prepared transactions */ + XID* xid_list, /* in/out: prepared transactions */ + ulint len); /* in: number of slots in xid_list */ +/*********************************************************************** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state */ +UNIV_INTERN +trx_t * +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid); /* in: X/Open XA transaction identification */ +/************************************************************************** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. */ +UNIV_INTERN +ulint +trx_commit_complete_for_mysql( +/*==========================*/ + /* out: 0 or error number */ + trx_t* trx); /* in: trx handle */ +/************************************************************************** +Marks the latest SQL statement ended. */ +UNIV_INTERN +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx); /* in: trx handle */ +/************************************************************************ +Assigns a read view for a consistent read query. All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a newly started transaction. 
*/ +UNIV_INTERN +read_view_t* +trx_assign_read_view( +/*=================*/ + /* out: consistent read view */ + trx_t* trx); /* in: active transaction */ +/*************************************************************** +The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to +the TRX_QUE_RUNNING state and releases query threads which were +waiting for a lock in the wait_thrs list. */ +UNIV_INTERN +void +trx_end_lock_wait( +/*==============*/ + trx_t* trx); /* in: transaction */ +/******************************************************************** +Sends a signal to a trx object. */ +UNIV_INTERN +void +trx_sig_send( +/*=========*/ + trx_t* trx, /* in: trx handle */ + ulint type, /* in: signal type */ + ulint sender, /* in: TRX_SIG_SELF or + TRX_SIG_OTHER_SESS */ + que_thr_t* receiver_thr, /* in: query thread which wants the + reply, or NULL; if type is + TRX_SIG_END_WAIT, this must be NULL */ + trx_savept_t* savept, /* in: possible rollback savepoint, or + NULL */ + que_thr_t** next_thr); /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the parameter + is NULL, it is ignored */ +/******************************************************************** +Send the reply message when a signal in the queue of the trx has +been handled. */ +UNIV_INTERN +void +trx_sig_reply( +/*==========*/ + trx_sig_t* sig, /* in: signal */ + que_thr_t** next_thr); /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +/******************************************************************** +Removes the signal object from a trx signal queue. */ +UNIV_INTERN +void +trx_sig_remove( +/*===========*/ + trx_t* trx, /* in: trx handle */ + trx_sig_t* sig); /* in, own: signal */ +/******************************************************************** +Starts handling of a trx signal. */ +UNIV_INTERN +void +trx_sig_start_handle( +/*=================*/ + trx_t* trx, /* in: trx handle */ + que_thr_t** next_thr); /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +/******************************************************************** +Ends signal handling. If the session is in the error state, and +trx->graph_before_signal_handling != NULL, returns control to the error +handling routine of the graph (currently only returns the control to the +graph root which then sends an error message to the client). */ +UNIV_INTERN +void +trx_end_signal_handling( +/*====================*/ + trx_t* trx); /* in: trx */ +/************************************************************************* +Creates a commit command node struct. */ +UNIV_INTERN +commit_node_t* +commit_node_create( +/*===============*/ + /* out, own: commit node struct */ + mem_heap_t* heap); /* in: mem heap where created */ +/*************************************************************** +Performs an execution step for a commit type node in a query graph. */ +UNIV_INTERN +que_thr_t* +trx_commit_step( +/*============*/ + /* out: query thread to run next, or NULL */ + que_thr_t* thr); /* in: query thread */ + +/************************************************************************** +Prints info about a transaction to the given file. 
The caller must own the +kernel mutex and must have called +innobase_mysql_prepare_print_arbitrary_thd(), unless he knows that MySQL +or InnoDB cannot meanwhile change the info printed here. */ +UNIV_INTERN +void +trx_print( +/*======*/ + FILE* f, /* in: output stream */ + trx_t* trx, /* in: transaction */ + ulint max_query_len); /* in: max query length to print, or 0 to + use the default max length */ + +/** Type of data dictionary operation */ +enum trx_dict_op { + /** The transaction is not modifying the data dictionary. */ + TRX_DICT_OP_NONE = 0, + /** The transaction is creating a table or an index, or + dropping a table. The table must be dropped in crash + recovery. This and TRX_DICT_OP_NONE are the only possible + operation modes in crash recovery. */ + TRX_DICT_OP_TABLE = 1, + /** The transaction is creating or dropping an index in an + existing table. In crash recovery, the data dictionary + must be locked, but the table must not be dropped. */ + TRX_DICT_OP_INDEX = 2 +}; + +/************************************************************************** +Determine if a transaction is a dictionary operation. */ +UNIV_INLINE +enum trx_dict_op +trx_get_dict_operation( +/*===================*/ + /* out: dictionary operation mode */ + const trx_t* trx) /* in: transaction */ + __attribute__((pure)); +/************************************************************************** +Flag a transaction a dictionary operation. */ +UNIV_INLINE +void +trx_set_dict_operation( +/*===================*/ + trx_t* trx, /* in/out: transaction */ + enum trx_dict_op op); /* in: operation, not + TRX_DICT_OP_NONE */ + +#ifndef UNIV_HOTBACKUP +/************************************************************************** +Determines if the currently running transaction has been interrupted. */ +UNIV_INTERN +ibool +trx_is_interrupted( +/*===============*/ + /* out: TRUE if interrupted */ + trx_t* trx); /* in: transaction */ +#else /* !UNIV_HOTBACKUP */ +#define trx_is_interrupted(trx) FALSE +#endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************** +Calculates the "weight" of a transaction. The weight of one transaction +is estimated as the number of altered rows + the number of locked rows. +*/ + +#define TRX_WEIGHT(t) \ + ut_dulint_add((t)->undo_no, UT_LIST_GET_LEN((t)->trx_locks)) + +/*********************************************************************** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. */ +UNIV_INTERN +int +trx_weight_cmp( +/*===========*/ + /* out: <0, 0 or >0; similar to strcmp(3) */ + const trx_t* a, /* in: the first transaction to be compared */ + const trx_t* b); /* in: the second transaction to be compared */ + +/*********************************************************************** +Retrieves the transaction's id, represented as unsigned long long. */ +UNIV_INLINE +ullint +trx_get_id( +/*=======*/ + /* out: transaction's id */ + const trx_t* trx); /* in: transaction */ + +/* Maximum length of a string that can be returned by +trx_get_que_state_str(). */ +#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */ + +/*********************************************************************** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. 
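
TRX_WEIGHT() above is simply altered rows (undo_no) plus held locks, and the comment on trx_weight_cmp() adds a dominating rule: transactions that edited non-transactional tables always weigh more. A sketch of that comparison policy, useful for picking the lighter deadlock victim; the struct and field names are illustrative, not the real trx_t:

#include <stdint.h>

struct trx_sketch {
	uint64_t	undo_no;	/* rows altered */
	uint64_t	n_locks;	/* rows locked */
	int		edited_non_transactional;
};

/* <0, 0 or >0, strcmp(3)-style: non-transactional edits dominate,
then the row-plus-lock weight decides. */
static int
weight_cmp(const struct trx_sketch* a, const struct trx_sketch* b)
{
	uint64_t	wa;
	uint64_t	wb;

	if (a->edited_non_transactional != b->edited_non_transactional) {
		return(a->edited_non_transactional ? 1 : -1);
	}

	wa = a->undo_no + a->n_locks;
	wb = b->undo_no + b->n_locks;

	return(wa < wb ? -1 : (wa > wb ? 1 : 0));
}
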
*/ +UNIV_INLINE +const char* +trx_get_que_state_str( +/*==================*/ + /* out: string in the data segment */ + const trx_t* trx); /* in: transaction */ + +/* Signal to a transaction */ +struct trx_sig_struct{ + unsigned type:3; /* signal type */ + unsigned sender:1; /* TRX_SIG_SELF or + TRX_SIG_OTHER_SESS */ + que_thr_t* receiver; /* non-NULL if the sender of the signal + wants reply after the operation induced + by the signal is completed */ + trx_savept_t savept; /* possible rollback savepoint */ + UT_LIST_NODE_T(trx_sig_t) + signals; /* queue of pending signals to the + transaction */ + UT_LIST_NODE_T(trx_sig_t) + reply_signals; /* list of signals for which the sender + transaction is waiting a reply */ +}; + +#define TRX_MAGIC_N 91118598 + +/* The transaction handle; every session has a trx object which is freed only +when the session is freed; in addition there may be session-less transactions +rolling back after a database recovery */ + +struct trx_struct{ + ulint magic_n; + /* All the next fields are protected by the kernel mutex, except the + undo logs which are protected by undo_mutex */ + const char* op_info; /* English text describing the + current operation, or an empty + string */ + unsigned is_purge:1; /* 0=user transaction, 1=purge */ + unsigned is_recovered:1; /* 0=normal transaction, + 1=recovered, must be rolled back */ + unsigned conc_state:2; /* state of the trx from the point + of view of concurrency control: + TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY, + ... */ + unsigned que_state:2; /* valid when conc_state == TRX_ACTIVE: + TRX_QUE_RUNNING, TRX_QUE_LOCK_WAIT, + ... */ + unsigned isolation_level:2;/* TRX_ISO_REPEATABLE_READ, ... */ + unsigned check_foreigns:1;/* normally TRUE, but if the user + wants to suppress foreign key checks, + (in table imports, for example) we + set this FALSE */ + unsigned check_unique_secondary:1; + /* normally TRUE, but if the user + wants to speed up inserts by + suppressing unique key checks + for secondary indexes when we decide + if we can use the insert buffer for + them, we set this FALSE */ + unsigned support_xa:1; /* normally we do the XA two-phase + commit steps, but by setting this to + FALSE, one can save CPU time and about + 150 bytes in the undo log size as then + we skip XA steps */ + unsigned flush_log_later:1;/* when we commit the transaction + in MySQL's binlog write, we will + flush the log to disk later in + a separate call */ + unsigned must_flush_log_later:1;/* this flag is set to TRUE in + trx_commit_off_kernel() if + flush_log_later was TRUE, and there + were modifications by the transaction; + in that case we must flush the log + in trx_commit_complete_for_mysql() */ + unsigned dict_operation:2;/**< @see enum trx_dict_op */ + unsigned duplicates:2; /* TRX_DUP_IGNORE | TRX_DUP_REPLACE */ + unsigned active_trans:2; /* 1 - if a transaction in MySQL + is active. 
2 - if prepare_commit_mutex + was taken */ + unsigned has_search_latch:1; + /* TRUE if this trx has latched the + search system latch in S-mode */ + unsigned declared_to_be_inside_innodb:1; + /* this is TRUE if we have declared + this transaction in + srv_conc_enter_innodb to be inside the + InnoDB engine */ + unsigned handling_signals:1;/* this is TRUE as long as the trx + is handling signals */ + unsigned dict_operation_lock_mode:2; + /* 0, RW_S_LATCH, or RW_X_LATCH: + the latch mode trx currently holds + on dict_operation_lock */ + time_t start_time; /* time the trx object was created + or the state last time became + TRX_ACTIVE */ + dulint id; /* transaction id */ + XID xid; /* X/Open XA transaction + identification to identify a + transaction branch */ + dulint no; /* transaction serialization number == + max trx id when the transaction is + moved to COMMITTED_IN_MEMORY state */ + ib_uint64_t commit_lsn; /* lsn at the time of the commit */ + dulint table_id; /* Table to drop iff dict_operation + is TRUE, or ut_dulint_zero. */ + /*------------------------------*/ + void* mysql_thd; /* MySQL thread handle corresponding + to this trx, or NULL */ + char** mysql_query_str;/* pointer to the field in mysqld_thd + which contains the pointer to the + current SQL query string */ + const char* mysql_log_file_name; + /* if MySQL binlog is used, this field + contains a pointer to the latest file + name; this is NULL if binlog is not + used */ + ib_int64_t mysql_log_offset;/* if MySQL binlog is used, this field + contains the end offset of the binlog + entry */ + const char* mysql_master_log_file_name; + /* if the database server is a MySQL + replication slave, we have here the + master binlog name up to which + replication has processed; otherwise + this is a pointer to a null + character */ + ib_int64_t mysql_master_log_pos; + /* if the database server is a MySQL + replication slave, this is the + position in the log file up to which + replication has processed */ + const char* mysql_relay_log_file_name; + ib_int64_t mysql_relay_log_pos; + + os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated + with this transaction object */ + ulint mysql_process_no;/* since in Linux, 'top' reports + process id's and not thread id's, we + store the process number too */ + /*------------------------------*/ + ulint n_mysql_tables_in_use; /* number of Innobase tables + used in the processing of the current + SQL statement in MySQL */ + ulint mysql_n_tables_locked; + /* how many tables the current SQL + statement uses, except those + in consistent read */ + ulint search_latch_timeout; + /* If we notice that someone is + waiting for our S-lock on the search + latch to be released, we wait in + row0sel.c for BTR_SEA_TIMEOUT new + searches until we try to keep + the search latch again over + calls from MySQL; this is intended + to reduce contention on the search + latch */ + /*------------------------------*/ + ulint n_tickets_to_enter_innodb; + /* this can be > 0 only when + declared_to_... 
is TRUE; when we come + to srv_conc_innodb_enter, if the value + here is > 0, we decrement this by 1 */ + /*------------------------------*/ + dict_index_t* new_rec_locks[2];/* these are normally NULL; if + srv_locks_unsafe_for_binlog is TRUE + or session is using READ COMMITTED + isolation level, + in a cursor search, if we set a new + record lock on an index, this is set + to point to the index; this is + used in releasing the locks under the + cursors if we are performing an UPDATE + and we determine after retrieving + the row that it does not need to be + locked; thus, these can be used to + implement a 'mini-rollback' that + releases the latest record locks */ + UT_LIST_NODE_T(trx_t) + trx_list; /* list of transactions */ + UT_LIST_NODE_T(trx_t) + mysql_trx_list; /* list of transactions created for + MySQL */ + /*------------------------------*/ + ulint error_state; /* 0 if no error, otherwise error + number; NOTE That ONLY the thread + doing the transaction is allowed to + set this field: this is NOT protected + by the kernel mutex */ + const dict_index_t*error_info; /* if the error number indicates a + duplicate key error, a pointer to + the problematic index is stored here */ + ulint error_key_num; /* if the index creation fails to a + duplicate key error, a mysql key + number of that index is stored here */ + sess_t* sess; /* session of the trx, NULL if none */ + que_t* graph; /* query currently run in the session, + or NULL if none; NOTE that the query + belongs to the session, and it can + survive over a transaction commit, if + it is a stored procedure with a COMMIT + WORK statement, for instance */ + ulint n_active_thrs; /* number of active query threads */ + que_t* graph_before_signal_handling; + /* value of graph when signal handling + for this trx started: this is used to + return control to the original query + graph for error processing */ + trx_sig_t sig; /* one signal object can be allocated + in this space, avoiding mem_alloc */ + UT_LIST_BASE_NODE_T(trx_sig_t) + signals; /* queue of processed or pending + signals to the trx */ + UT_LIST_BASE_NODE_T(trx_sig_t) + reply_signals; /* list of signals sent by the query + threads of this trx for which a thread + is waiting for a reply; if this trx is + killed, the reply requests in the list + must be canceled */ + /*------------------------------*/ + lock_t* wait_lock; /* if trx execution state is + TRX_QUE_LOCK_WAIT, this points to + the lock request, otherwise this is + NULL */ + ibool was_chosen_as_deadlock_victim; + /* when the transaction decides to wait + for a lock, it sets this to FALSE; + if another transaction chooses this + transaction as a victim in deadlock + resolution, it sets this to TRUE */ + time_t wait_started; /* lock wait started at this time */ + UT_LIST_BASE_NODE_T(que_thr_t) + wait_thrs; /* query threads belonging to this + trx that are in the QUE_THR_LOCK_WAIT + state */ + ulint deadlock_mark; /* a mark field used in deadlock + checking algorithm. This must be + in its own machine word, because + it can be changed by other + threads while holding kernel_mutex. 
*/ + /*------------------------------*/ + mem_heap_t* lock_heap; /* memory heap for the locks of the + transaction */ + UT_LIST_BASE_NODE_T(lock_t) + trx_locks; /* locks reserved by the transaction */ + /*------------------------------*/ + mem_heap_t* global_read_view_heap; + /* memory heap for the global read + view */ + read_view_t* global_read_view; + /* consistent read view associated + to a transaction or NULL */ + read_view_t* read_view; /* consistent read view used in the + transaction or NULL, this read view + if defined can be normal read view + associated to a transaction (i.e. + same as global_read_view) or read view + associated to a cursor */ + /*------------------------------*/ + UT_LIST_BASE_NODE_T(trx_named_savept_t) + trx_savepoints; /* savepoints set with SAVEPOINT ..., + oldest first */ + /*------------------------------*/ + mutex_t undo_mutex; /* mutex protecting the fields in this + section (down to undo_no_arr), EXCEPT + last_sql_stat_start, which can be + accessed only when we know that there + cannot be any activity in the undo + logs! */ + dulint undo_no; /* next undo log record number to + assign; since the undo log is + private for a transaction, this + is a simple ascending sequence + with no gaps; thus it represents + the number of modified/inserted + rows in a transaction */ + trx_savept_t last_sql_stat_start; + /* undo_no when the last sql statement + was started: in case of an error, trx + is rolled back down to this undo + number; see note at undo_mutex! */ + trx_rseg_t* rseg; /* rollback segment assigned to the + transaction, or NULL if not assigned + yet */ + trx_undo_t* insert_undo; /* pointer to the insert undo log, or + NULL if no inserts performed yet */ + trx_undo_t* update_undo; /* pointer to the update undo log, or + NULL if no update performed yet */ + dulint roll_limit; /* least undo number to undo during + a rollback */ + ulint pages_undone; /* number of undo log pages undone + since the last undo log truncation */ + trx_undo_arr_t* undo_no_arr; /* array of undo numbers of undo log + records which are currently processed + by a rollback operation */ + /*------------------------------*/ + ulint n_autoinc_rows; /* no. of AUTO-INC rows required for + an SQL statement. This is useful for + multi-row INSERTs */ + ib_vector_t* autoinc_locks; /* AUTOINC locks held by this + transaction. Note that these are + also in the lock list trx_locks. This + vector needs to be freed explicitly + when the trx_t instance is desrtoyed */ + /*------------------------------*/ + char detailed_error[256]; /* detailed error message for last + error, or empty. 
*/ +}; + +#define TRX_MAX_N_THREADS 32 /* maximum number of + concurrent threads running a + single operation of a + transaction, e.g., a parallel + query */ +/* Transaction concurrency states (trx->conc_state) */ +#define TRX_NOT_STARTED 0 +#define TRX_ACTIVE 1 +#define TRX_COMMITTED_IN_MEMORY 2 +#define TRX_PREPARED 3 /* Support for 2PC/XA */ + +/* Transaction execution states when trx->conc_state == TRX_ACTIVE */ +#define TRX_QUE_RUNNING 0 /* transaction is running */ +#define TRX_QUE_LOCK_WAIT 1 /* transaction is waiting for a lock */ +#define TRX_QUE_ROLLING_BACK 2 /* transaction is rolling back */ +#define TRX_QUE_COMMITTING 3 /* transaction is committing */ + +/* Transaction isolation levels (trx->isolation_level) */ +#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking + SELECTs are performed so that + we do not look at a possible + earlier version of a record; + thus they are not 'consistent' + reads under this isolation + level; otherwise like level + 2 */ + +#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like + isolation, except that in + range UPDATE and DELETE we + must block phantom rows + with next-key locks; + SELECT ... FOR UPDATE and ... + LOCK IN SHARE MODE only lock + the index records, NOT the + gaps before them, and thus + allow free inserting; + each consistent read reads its + own snapshot */ + +#define TRX_ISO_REPEATABLE_READ 2 /* this is the default; + all consistent reads in the + same trx read the same + snapshot; + full next-key locking used + in locking reads to block + insertions into gaps */ + +#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are + converted to LOCK IN SHARE + MODE reads */ + +/* Treatment of duplicate values (trx->duplicates; for example, in inserts). +Multiple flags can be combined with bitwise OR. */ +#define TRX_DUP_IGNORE 1 /* duplicate rows are to be updated */ +#define TRX_DUP_REPLACE 2 /* duplicate rows are to be replaced */ + + +/* Types of a trx signal */ +#define TRX_SIG_NO_SIGNAL 0 +#define TRX_SIG_TOTAL_ROLLBACK 1 +#define TRX_SIG_ROLLBACK_TO_SAVEPT 2 +#define TRX_SIG_COMMIT 3 +#define TRX_SIG_ERROR_OCCURRED 4 +#define TRX_SIG_BREAK_EXECUTION 5 + +/* Sender types of a signal */ +#define TRX_SIG_SELF 0 /* sent by the session itself, or + by an error occurring within this + session */ +#define TRX_SIG_OTHER_SESS 1 /* sent by another session (which + must hold rights to this) */ + +/* Commit command node in a query graph */ +struct commit_node_struct{ + que_common_t common; /* node type: QUE_NODE_COMMIT */ + ulint state; /* node execution state */ +}; + +/* Commit node states */ +#define COMMIT_NODE_SEND 1 +#define COMMIT_NODE_WAIT 2 + + +#ifndef UNIV_NONINL +#include "trx0trx.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0trx.ic b/storage/xtradb/include/trx0trx.ic new file mode 100644 index 00000000000..6da89f002fe --- /dev/null +++ b/storage/xtradb/include/trx0trx.ic @@ -0,0 +1,221 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/***************************************************************** +Starts the transaction if it is not yet started. */ +UNIV_INLINE +void +trx_start_if_not_started( +/*=====================*/ + trx_t* trx) /* in: transaction */ +{ + ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY); + + if (trx->conc_state == TRX_NOT_STARTED) { + + trx_start(trx, ULINT_UNDEFINED); + } +} + +/***************************************************************** +Starts the transaction if it is not yet started. Assumes we have reserved +the kernel mutex! */ +UNIV_INLINE +void +trx_start_if_not_started_low( +/*=========================*/ + trx_t* trx) /* in: transaction */ +{ + ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY); + + if (trx->conc_state == TRX_NOT_STARTED) { + + trx_start_low(trx, ULINT_UNDEFINED); + } +} + +/***************************************************************** +Resets the new record lock info in a transaction struct. */ +UNIV_INLINE +void +trx_reset_new_rec_lock_info( +/*========================*/ + trx_t* trx) /* in: transaction struct */ +{ + trx->new_rec_locks[0] = NULL; + trx->new_rec_locks[1] = NULL; +} + +/***************************************************************** +Registers that we have set a new record lock on an index. We only have space +to store 2 indexes! If this is called to store more than 2 indexes after +trx_reset_new_rec_lock_info(), then this function does nothing. */ +UNIV_INLINE +void +trx_register_new_rec_lock( +/*======================*/ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index) /* in: trx sets a new record lock on this + index */ +{ + if (trx->new_rec_locks[0] == NULL) { + trx->new_rec_locks[0] = index; + + return; + } + + if (trx->new_rec_locks[0] == index) { + + return; + } + + if (trx->new_rec_locks[1] != NULL) { + + return; + } + + trx->new_rec_locks[1] = index; +} + +/***************************************************************** +Checks if trx has set a new record lock on an index. */ +UNIV_INLINE +ibool +trx_new_rec_locks_contain( +/*======================*/ + /* out: TRUE if trx has set a new record lock + on index */ + trx_t* trx, /* in: transaction struct */ + dict_index_t* index) /* in: index */ +{ + return(trx->new_rec_locks[0] == index + || trx->new_rec_locks[1] == index); +} + +/******************************************************************** +Retrieves the error_info field from a trx. */ +UNIV_INLINE +const dict_index_t* +trx_get_error_info( +/*===============*/ + /* out: the error info */ + const trx_t* trx) /* in: trx object */ +{ + return(trx->error_info); +} + +/*********************************************************************** +Retrieves the transaction's id, represented as unsigned long long. */ +UNIV_INLINE +ullint +trx_get_id( +/*=======*/ + /* out: transaction's id */ + const trx_t* trx) /* in: transaction */ +{ + return((ullint)ut_conv_dulint_to_longlong(trx->id)); +} + +/*********************************************************************** +Retrieves transaction's que state in a human readable string. The string +should not be free()'d or modified. 
*/ +UNIV_INLINE +const char* +trx_get_que_state_str( +/*==================*/ + /* out: string in the data segment */ + const trx_t* trx) /* in: transaction */ +{ + /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */ + switch (trx->que_state) { + case TRX_QUE_RUNNING: + return("RUNNING"); + case TRX_QUE_LOCK_WAIT: + return("LOCK WAIT"); + case TRX_QUE_ROLLING_BACK: + return("ROLLING BACK"); + case TRX_QUE_COMMITTING: + return("COMMITTING"); + default: + return("UNKNOWN"); + } +} + +/************************************************************************** +Determine if a transaction is a dictionary operation. */ +UNIV_INLINE +enum trx_dict_op +trx_get_dict_operation( +/*===================*/ + /* out: dictionary operation mode */ + const trx_t* trx) /* in: transaction */ +{ + enum trx_dict_op op = (enum trx_dict_op) trx->dict_operation; + +#ifdef UNIV_DEBUG + switch (op) { + case TRX_DICT_OP_NONE: + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + return(op); + } + ut_error; +#endif /* UNIV_DEBUG */ + return((enum trx_dict_op) UNIV_EXPECT(op, TRX_DICT_OP_NONE)); +} +/************************************************************************** +Flag a transaction a dictionary operation. */ +UNIV_INLINE +void +trx_set_dict_operation( +/*===================*/ + trx_t* trx, /* in/out: transaction */ + enum trx_dict_op op) /* in: operation, not + TRX_DICT_OP_NONE */ +{ +#ifdef UNIV_DEBUG + enum trx_dict_op old_op = trx_get_dict_operation(trx); + + switch (op) { + case TRX_DICT_OP_NONE: + ut_error; + break; + case TRX_DICT_OP_TABLE: + switch (old_op) { + case TRX_DICT_OP_NONE: + case TRX_DICT_OP_INDEX: + case TRX_DICT_OP_TABLE: + goto ok; + } + ut_error; + break; + case TRX_DICT_OP_INDEX: + ut_ad(old_op == TRX_DICT_OP_NONE); + break; + } +ok: +#endif /* UNIV_DEBUG */ + + trx->dict_operation = op; +} diff --git a/storage/xtradb/include/trx0types.h b/storage/xtradb/include/trx0types.h new file mode 100644 index 00000000000..896f4e8c0a2 --- /dev/null +++ b/storage/xtradb/include/trx0types.h @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction system global type definitions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0types_h +#define trx0types_h + +#include "ut0byte.h" + +/* prepare trx_t::id for being printed via printf(3) */ +#define TRX_ID_PREP_PRINTF(id) (ullint) ut_conv_dulint_to_longlong(id) + +/* printf(3) format used for printing TRX_ID_PREP_PRINTF() */ +#define TRX_ID_FMT "%llX" + +/* maximum length that a formatted trx_t::id could take, not including +the terminating '\0'. 
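
TRX_ID_FMT above and the TRX_ID_MAX_LEN constant defined immediately below pair up: a 64-bit id printed as %llX takes at most 16 hex digits, so sizing the buffer from the constant plus one byte for the NUL keeps callers safe. A usage sketch with local stand-in macros:

#include <stdio.h>

#define SKETCH_TRX_ID_FMT	"%llX"	/* TRX_ID_FMT */
#define SKETCH_TRX_ID_MAX_LEN	17	/* TRX_ID_MAX_LEN */

/* Format a transaction id the way TRX_ID_PREP_PRINTF() feeds it to
printf(3); buf must hold at least SKETCH_TRX_ID_MAX_LEN + 1 bytes. */
static void
format_trx_id(char* buf, unsigned long long id)
{
	snprintf(buf, SKETCH_TRX_ID_MAX_LEN + 1, SKETCH_TRX_ID_FMT, id);
}
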
*/ +#define TRX_ID_MAX_LEN 17 + +/* Memory objects */ +typedef struct trx_struct trx_t; +typedef struct trx_sys_struct trx_sys_t; +typedef struct trx_doublewrite_struct trx_doublewrite_t; +typedef struct trx_sig_struct trx_sig_t; +typedef struct trx_rseg_struct trx_rseg_t; +typedef struct trx_undo_struct trx_undo_t; +typedef struct trx_undo_arr_struct trx_undo_arr_t; +typedef struct trx_undo_inf_struct trx_undo_inf_t; +typedef struct trx_purge_struct trx_purge_t; +typedef struct roll_node_struct roll_node_t; +typedef struct commit_node_struct commit_node_t; +typedef struct trx_named_savept_struct trx_named_savept_t; + +/* Rollback contexts */ +enum trx_rb_ctx { + RB_NONE = 0, /* no rollback */ + RB_NORMAL, /* normal rollback */ + RB_RECOVERY, /* rolling back an incomplete transaction, + in crash recovery */ +}; + +/* Transaction savepoint */ +typedef struct trx_savept_struct trx_savept_t; +struct trx_savept_struct{ + dulint least_undo_no; /* least undo number to undo */ +}; + +/* File objects */ +typedef byte trx_sysf_t; +typedef byte trx_rsegf_t; +typedef byte trx_usegf_t; +typedef byte trx_ulogf_t; +typedef byte trx_upagef_t; + +/* Undo log record */ +typedef byte trx_undo_rec_t; + +#endif diff --git a/storage/xtradb/include/trx0undo.h b/storage/xtradb/include/trx0undo.h new file mode 100644 index 00000000000..7f7408931da --- /dev/null +++ b/storage/xtradb/include/trx0undo.h @@ -0,0 +1,526 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0undo_h +#define trx0undo_h + +#include "univ.i" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "trx0sys.h" +#include "page0types.h" +#include "trx0xa.h" + +/*************************************************************************** +Builds a roll pointer dulint. */ +UNIV_INLINE +dulint +trx_undo_build_roll_ptr( +/*====================*/ + /* out: roll pointer */ + ibool is_insert, /* in: TRUE if insert undo log */ + ulint rseg_id, /* in: rollback segment id */ + ulint page_no, /* in: page number */ + ulint offset); /* in: offset of the undo entry within page */ +/*************************************************************************** +Decodes a roll pointer dulint. 
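+
+A round-trip sketch, purely for illustration: decoding recovers exactly
+what trx_undo_build_roll_ptr() encoded, e.g.
+
+	dulint	roll_ptr = trx_undo_build_roll_ptr(TRUE, 0, 5, 120);
+	ibool	is_insert;
+	ulint	rseg_id;
+	ulint	page_no;
+	ulint	offset;
+
+	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id,
+				 &page_no, &offset);
+	ut_a(is_insert && rseg_id == 0 && page_no == 5 && offset == 120);
+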
*/ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + dulint roll_ptr, /* in: roll pointer */ + ibool* is_insert, /* out: TRUE if insert undo log */ + ulint* rseg_id, /* out: rollback segment id */ + ulint* page_no, /* out: page number */ + ulint* offset); /* out: offset of the undo entry within page */ +/*************************************************************************** +Returns TRUE if the roll pointer is of the insert type. */ +UNIV_INLINE +ibool +trx_undo_roll_ptr_is_insert( +/*========================*/ + /* out: TRUE if insert undo log */ + dulint roll_ptr); /* in: roll pointer */ +/********************************************************************* +Writes a roll ptr to an index page. In case that the size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_roll_ptr( +/*===============*/ + byte* ptr, /* in: pointer to memory where written */ + dulint roll_ptr); /* in: roll ptr */ +/********************************************************************* +Reads a roll ptr from an index page. In case that the roll ptr size +changes in some future version, this function should be used instead of +mach_read_... */ +UNIV_INLINE +dulint +trx_read_roll_ptr( +/*==============*/ + /* out: roll ptr */ + const byte* ptr); /* in: pointer to memory from where to read */ +/********************************************************************** +Gets an undo log page and x-latches it. */ +UNIV_INLINE +page_t* +trx_undo_page_get( +/*==============*/ + /* out: pointer to page x-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Gets an undo log page and s-latches it. */ +UNIV_INLINE +page_t* +trx_undo_page_get_s_latched( +/*========================*/ + /* out: pointer to page s-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Returns the previous undo record on the page in the specified log, or +NULL if none exists. */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_prev_rec( +/*=======================*/ + /* out: pointer to record, NULL if none */ + trx_undo_rec_t* rec, /* in: undo log record */ + ulint page_no,/* in: undo log header page number */ + ulint offset);/* in: undo log header offset on page */ +/********************************************************************** +Returns the next undo log record on the page in the specified log, or +NULL if none exists. */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_next_rec( +/*=======================*/ + /* out: pointer to record, NULL if none */ + trx_undo_rec_t* rec, /* in: undo log record */ + ulint page_no,/* in: undo log header page number */ + ulint offset);/* in: undo log header offset on page */ +/********************************************************************** +Returns the last undo record on the page in the specified undo log, or +NULL if none exists. 
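+
+Purely illustrative sketch of the page-local iteration these accessors
+support, with undo_page, page_no and offset as in the parameters below:
+
+	trx_undo_rec_t*	rec = trx_undo_page_get_first_rec(
+		undo_page, page_no, offset);
+
+	while (rec != NULL) {
+		... process rec ...
+		rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+	}
+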
*/ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_last_rec( +/*=======================*/ + /* out: pointer to record, NULL if none */ + page_t* undo_page,/* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset); /* in: undo log header offset on page */ +/********************************************************************** +Returns the first undo record on the page in the specified undo log, or +NULL if none exists. */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_first_rec( +/*========================*/ + /* out: pointer to record, NULL if none */ + page_t* undo_page,/* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset);/* in: undo log header offset on page */ +/*************************************************************************** +Gets the previous record in an undo log. */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_prev_rec( +/*==================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr); /* in: mtr */ +/*************************************************************************** +Gets the next record in an undo log. */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_next_rec( +/*==================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr); /* in: mtr */ +/*************************************************************************** +Gets the first record in an undo log. */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_first_rec( +/*===================*/ + /* out: undo log record, the page latched, NULL if + none */ + ulint space, /* in: undo log header space */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + ulint mode, /* in: latching mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************ +Tries to add a page to the undo log segment where the undo log is placed. */ +UNIV_INTERN +ulint +trx_undo_add_page( +/*==============*/ + /* out: page number if success, else + FIL_NULL */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory object */ + mtr_t* mtr); /* in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +/*************************************************************************** +Truncates an undo log from the end. This function is used during a rollback +to free space from an undo log. */ +UNIV_INTERN +void +trx_undo_truncate_end( +/*==================*/ + trx_t* trx, /* in: transaction whose undo log it is */ + trx_undo_t* undo, /* in: undo log */ + dulint limit); /* in: all undo records with undo number + >= this value should be truncated */ +/*************************************************************************** +Truncates an undo log from the start. This function is used during a purge +operation. 
*/ +UNIV_INTERN +void +trx_undo_truncate_start( +/*====================*/ + trx_rseg_t* rseg, /* in: rollback segment */ + ulint space, /* in: space id of the log */ + ulint hdr_page_no, /* in: header page number */ + ulint hdr_offset, /* in: header offset on the page */ + dulint limit); /* in: all undo pages with undo numbers < + this value should be truncated; NOTE that + the function only frees whole pages; the + header page is not freed, but emptied, if + all the records there are < limit */ +/************************************************************************ +Initializes the undo log lists for a rollback segment memory copy. +This function is only called when the database is started or a new +rollback segment created. */ +UNIV_INTERN +ulint +trx_undo_lists_init( +/*================*/ + /* out: the combined size of undo log segments + in pages */ + trx_rseg_t* rseg); /* in: rollback segment memory object */ +/************************************************************************** +Assigns an undo log for a transaction. A new undo log is created or a cached +undo log reused. */ +UNIV_INTERN +ulint +trx_undo_assign_undo( +/*=================*/ + /* out: DB_SUCCESS if undo log assign + successful, possible error codes are: + DB_TOO_MANY_CONCURRENT_TRXS + DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY*/ + trx_t* trx, /* in: transaction */ + ulint type); /* in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ +/********************************************************************** +Sets the state of the undo log segment at a transaction finish. */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_finish( +/*=========================*/ + /* out: undo log segment header page, + x-latched */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr); /* in: mtr */ + +/************************************************************************** +Adds the update undo log header as the first in the history list, and +frees the memory object, or puts it to the list of cached update undo log +segments. */ +UNIV_INTERN +void +trx_undo_update_cleanup( +/*====================*/ + trx_t* trx, /* in: trx owning the update undo log */ + page_t* undo_page, /* in: update undo log header page, + x-latched */ + mtr_t* mtr); /* in: mtr */ +/********************************************************************** +Frees or caches an insert undo log after a transaction commit or rollback. +Knowledge of inserts is not needed after a commit or rollback, therefore +the data can be discarded. */ +UNIV_INTERN +void +trx_undo_insert_cleanup( +/*====================*/ + trx_t* trx); /* in: transaction handle */ +/*************************************************************** +Parses the redo log entry of an undo log page initialization. 
*/
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+			/* out: end of log record or NULL */
+	byte*	ptr,	/* in: buffer */
+	byte*	end_ptr,/* in: buffer end */
+	page_t*	page,	/* in: page or NULL */
+	mtr_t*	mtr);	/* in: mtr or NULL */
+/***************************************************************
+Parses the redo log entry of an undo log page header create or reuse. */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+			/* out: end of log record or NULL */
+	ulint	type,	/* in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+	byte*	ptr,	/* in: buffer */
+	byte*	end_ptr,/* in: buffer end */
+	page_t*	page,	/* in: page or NULL */
+	mtr_t*	mtr);	/* in: mtr or NULL */
+/***************************************************************
+Parses the redo log entry of an undo log page header discard. */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+			/* out: end of log record or NULL */
+	byte*	ptr,	/* in: buffer */
+	byte*	end_ptr,/* in: buffer end */
+	page_t*	page,	/* in: page or NULL */
+	mtr_t*	mtr);	/* in: mtr or NULL */
+
+/* Types of an undo log segment */
+#define	TRX_UNDO_INSERT		1	/* contains undo entries for inserts */
+#define	TRX_UNDO_UPDATE		2	/* contains undo entries for updates
+					and delete markings: in short,
+					modifies (the name 'UPDATE' is a
+					historical relic) */
+/* States of an undo log segment */
+#define	TRX_UNDO_ACTIVE		1	/* contains an undo log of an active
+					transaction */
+#define	TRX_UNDO_CACHED		2	/* cached for quick reuse */
+#define	TRX_UNDO_TO_FREE	3	/* insert undo segment can be freed */
+#define	TRX_UNDO_TO_PURGE	4	/* update undo segment will not be
+					reused: it can be freed in purge when
+					all undo data in it is removed */
+#define	TRX_UNDO_PREPARED	5	/* contains an undo log of a
+					prepared transaction */
+
+/* Transaction undo log memory object; this is protected by the undo_mutex
+in the corresponding transaction object */
+
+struct trx_undo_struct{
+	/*-----------------------------*/
+	ulint		id;		/* undo log slot number within the
+					rollback segment */
+	ulint		type;		/* TRX_UNDO_INSERT or
+					TRX_UNDO_UPDATE */
+	ulint		state;		/* state of the corresponding undo log
+					segment */
+	ibool		del_marks;	/* relevant only in an update undo log:
+					this is TRUE if the transaction may
+					have delete marked records, because of
+					a delete of a row or an update of an
+					indexed field; purge is then
+					necessary; also TRUE if the transaction
+					has updated an externally stored
+					field */
+	dulint		trx_id;		/* id of the trx assigned to the undo
+					log */
+	XID		xid;		/* X/Open XA transaction
+					identification */
+	ibool		dict_operation;	/* TRUE if a dict operation trx */
+	dulint		table_id;	/* if a dict operation, then the table
+					id */
+	trx_rseg_t*	rseg;		/* rseg where the undo log belongs */
+	/*-----------------------------*/
+	ulint		space;		/* space id where the undo log
+					placed */
+	ulint		zip_size;	/* in: compressed page size of space
+					in bytes, or 0 for uncompressed */
+	ulint		hdr_page_no;	/* page number of the header page in
+					the undo log */
+	ulint		hdr_offset;	/* header offset of the undo log on the
+					page */
+	ulint		last_page_no;	/* page number of the last page in the
+					undo log; this may differ from
+					top_page_no during a rollback */
+	ulint		size;		/* current size in pages */
+	/*-----------------------------*/
+	ulint		empty;		/* TRUE if the stack of undo log
+					records is currently empty */
+	ulint		top_page_no;	/* page number where the latest undo
+					log record was catenated; during
+					rollback the
page from which the latest + undo record was chosen */ + ulint top_offset; /* offset of the latest undo record, + i.e., the topmost element in the undo + log if we think of it as a stack */ + dulint top_undo_no; /* undo number of the latest record */ + buf_block_t* guess_block; /* guess for the buffer block where + the top page might reside */ + /*-----------------------------*/ + UT_LIST_NODE_T(trx_undo_t) undo_list; + /* undo log objects in the rollback + segment are chained into lists */ +}; + +/* The offset of the undo log page header on pages of the undo log */ +#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA +/*-------------------------------------------------------------*/ +/* Transaction undo log page header offsets */ +#define TRX_UNDO_PAGE_TYPE 0 /* TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ +#define TRX_UNDO_PAGE_START 2 /* Byte offset where the undo log + records for the LATEST transaction + start on this page (remember that + in an update undo log, the first page + can contain several undo logs) */ +#define TRX_UNDO_PAGE_FREE 4 /* On each page of the undo log this + field contains the byte offset of the + first free byte on the page */ +#define TRX_UNDO_PAGE_NODE 6 /* The file list node in the chain + of undo log pages */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE) + +/* An update undo segment with just one page can be reused if it has +< this number bytes used; we must leave space at least for one new undo +log header on the page */ + +#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4) + +/* An update undo log segment may contain several undo logs on its first page +if the undo logs took so little space that the segment could be cached and +reused. All the undo log headers are then on the first page, and the last one +owns the undo log records on subsequent pages if the segment is bigger than +one page. If an undo log is stored in a segment, then on the first page it is +allowed to have zero undo records, but if the segment extends to several +pages, then all the rest of the pages must contain at least one undo log +record. */ + +/* The offset of the undo log segment header on the first page of the undo +log segment */ + +#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) +/*-------------------------------------------------------------*/ +#define TRX_UNDO_STATE 0 /* TRX_UNDO_ACTIVE, ... */ +#define TRX_UNDO_LAST_LOG 2 /* Offset of the last undo log header + on the segment header page, 0 if + none */ +#define TRX_UNDO_FSEG_HEADER 4 /* Header for the file segment which + the undo log segment occupies */ +#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE) + /* Base node for the list of pages in + the undo log segment; defined only on + the undo log segment's first page */ +/*-------------------------------------------------------------*/ +/* Size of the undo log segment header */ +#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE) + + +/* The undo log header. There can be several undo log headers on the first +page of an update undo log segment. 
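+
+Illustration only: given the byte offset of an undo log header within
+its page, a field is read with the mach_ routines, e.g. the offset of
+the next header in the chain (TRX_UNDO_NEXT_LOG below):
+
+	trx_ulogf_t*	log_hdr = undo_page + offset;
+	ulint		next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+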
*/ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_TRX_ID 0 /* Transaction id */ +#define TRX_UNDO_TRX_NO 8 /* Transaction number of the + transaction; defined only if the log + is in a history list */ +#define TRX_UNDO_DEL_MARKS 16 /* Defined only in an update undo + log: TRUE if the transaction may have + done delete markings of records, and + thus purge is necessary */ +#define TRX_UNDO_LOG_START 18 /* Offset of the first undo log record + of this log on the header page; purge + may remove undo log record from the + log start, and therefore this is not + necessarily the same as this log + header end offset */ +#define TRX_UNDO_XID_EXISTS 20 /* TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /* TRUE if the transaction is a table + create, index create, or drop + transaction: in recovery + the transaction cannot be rolled back + in the usual way: a 'rollback' rather + means dropping the created or dropped + table, if it still exists */ +#define TRX_UNDO_TABLE_ID 22 /* Id of the table if the preceding + field is TRUE */ +#define TRX_UNDO_NEXT_LOG 30 /* Offset of the next undo log header + on this page, 0 if none */ +#define TRX_UNDO_PREV_LOG 32 /* Offset of the previous undo log + header on this page, 0 if none */ +#define TRX_UNDO_HISTORY_NODE 34 /* If the log is put to the history + list, the file list node is here */ +/*-------------------------------------------------------------*/ +#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) + +/* Note: the writing of the undo log old header is coded by a log record +MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the +header is logged separately. In this sense, the XID is not really a member +of the undo log header. TODO: do not append the XID to the log header if XA +is not needed by the user. The XID wastes about 150 bytes of space in every +undo log. In the history list we may have millions of undo logs, which means +quite a large overhead. */ + +/* X/Open XA Transaction Identification (XID) */ + +#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE) +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +/*--------------------------------------------------------------*/ +#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE) + /* Total size of the header with the XA XID */ + +#ifndef UNIV_NONINL +#include "trx0undo.ic" +#endif + +#endif diff --git a/storage/xtradb/include/trx0undo.ic b/storage/xtradb/include/trx0undo.ic new file mode 100644 index 00000000000..0bd8b79414b --- /dev/null +++ b/storage/xtradb/include/trx0undo.ic @@ -0,0 +1,344 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" +#include "page0page.h" + +/*************************************************************************** +Builds a roll pointer dulint. */ +UNIV_INLINE +dulint +trx_undo_build_roll_ptr( +/*====================*/ + /* out: roll pointer */ + ibool is_insert, /* in: TRUE if insert undo log */ + ulint rseg_id, /* in: rollback segment id */ + ulint page_no, /* in: page number */ + ulint offset) /* in: offset of the undo entry within page */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + ut_ad(rseg_id < 128); + + return(ut_dulint_create(is_insert * 128 * 256 * 256 + + rseg_id * 256 * 256 + + (page_no / 256) / 256, + (page_no % (256 * 256)) * 256 * 256 + + offset)); +} + +/*************************************************************************** +Decodes a roll pointer dulint. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + dulint roll_ptr, /* in: roll pointer */ + ibool* is_insert, /* out: TRUE if insert undo log */ + ulint* rseg_id, /* out: rollback segment id */ + ulint* page_no, /* out: page number */ + ulint* offset) /* out: offset of the undo entry within page */ +{ + ulint low; + ulint high; +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif +#if TRUE != 1 +# error "TRUE != 1" +#endif + high = ut_dulint_get_high(roll_ptr); + low = ut_dulint_get_low(roll_ptr); + + *offset = low % (256 * 256); + + *is_insert = high / (256 * 256 * 128); /* TRUE == 1 */ + *rseg_id = (high / (256 * 256)) % 128; + + *page_no = (high % (256 * 256)) * 256 * 256 + + (low / 256) / 256; +} + +/*************************************************************************** +Returns TRUE if the roll pointer is of the insert type. */ +UNIV_INLINE +ibool +trx_undo_roll_ptr_is_insert( +/*========================*/ + /* out: TRUE if insert undo log */ + dulint roll_ptr) /* in: roll pointer */ +{ + ulint high; +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif +#if TRUE != 1 +# error "TRUE != 1" +#endif + high = ut_dulint_get_high(roll_ptr); + + return(high / (256 * 256 * 128)); +} + +/********************************************************************* +Writes a roll ptr to an index page. In case that the size changes in +some future version, this function should be used instead of +mach_write_... */ +UNIV_INLINE +void +trx_write_roll_ptr( +/*===============*/ + byte* ptr, /* in: pointer to memory where written */ + dulint roll_ptr) /* in: roll ptr */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(ptr, roll_ptr); +} + +/********************************************************************* +Reads a roll ptr from an index page. In case that the roll ptr size +changes in some future version, this function should be used instead of +mach_read_... 
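+
+Round-trip sketch, for illustration only (assumes a dulint roll_ptr in
+scope):
+
+	byte	buf[DATA_ROLL_PTR_LEN];
+
+	trx_write_roll_ptr(buf, roll_ptr);
+	ut_ad(ut_dulint_cmp(trx_read_roll_ptr(buf), roll_ptr) == 0);
+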
*/ +UNIV_INLINE +dulint +trx_read_roll_ptr( +/*==============*/ + /* out: roll ptr */ + const byte* ptr) /* in: pointer to memory from where to read */ +{ +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + return(mach_read_from_7(ptr)); +} + +/********************************************************************** +Gets an undo log page and x-latches it. */ +UNIV_INLINE +page_t* +trx_undo_page_get( +/*==============*/ + /* out: pointer to page x-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block = buf_page_get(space, zip_size, page_no, + RW_X_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + return(buf_block_get_frame(block)); +} + +/********************************************************************** +Gets an undo log page and s-latches it. */ +UNIV_INLINE +page_t* +trx_undo_page_get_s_latched( +/*========================*/ + /* out: pointer to page s-latched */ + ulint space, /* in: space where placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number */ + mtr_t* mtr) /* in: mtr */ +{ + buf_block_t* block = buf_page_get(space, zip_size, page_no, + RW_S_LATCH, mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + return(buf_block_get_frame(block)); +} + +/********************************************************************** +Returns the start offset of the undo log records of the specified undo +log on the page. */ +UNIV_INLINE +ulint +trx_undo_page_get_start( +/*====================*/ + /* out: start offset */ + page_t* undo_page,/* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header offset on page */ +{ + ulint start; + + if (page_no == page_get_page_no(undo_page)) { + + start = mach_read_from_2(offset + undo_page + + TRX_UNDO_LOG_START); + } else { + start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; + } + + return(start); +} + +/********************************************************************** +Returns the end offset of the undo log records of the specified undo +log on the page. */ +UNIV_INLINE +ulint +trx_undo_page_get_end( +/*==================*/ + /* out: end offset */ + page_t* undo_page,/* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header offset on page */ +{ + trx_ulogf_t* log_hdr; + ulint end; + + if (page_no == page_get_page_no(undo_page)) { + + log_hdr = undo_page + offset; + + end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); + + if (end == 0) { + end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + } + } else { + end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + } + + return(end); +} + +/********************************************************************** +Returns the previous undo record on the page in the specified log, or +NULL if none exists. 
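+
+Illustration: the offsets returned by trx_undo_page_get_start() and
+trx_undo_page_get_end() above bound all records of one log on the page,
+so a caller can assert
+
+	ut_ad(trx_undo_page_get_start(undo_page, page_no, offset)
+	      <= trx_undo_page_get_end(undo_page, page_no, offset));
+
+and an empty log has start == end.
+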
*/ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_prev_rec( +/*=======================*/ + /* out: pointer to record, NULL if none */ + trx_undo_rec_t* rec, /* in: undo log record */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header offset on page */ +{ + page_t* undo_page; + ulint start; + + undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); + + start = trx_undo_page_get_start(undo_page, page_no, offset); + + if (start + undo_page == rec) { + + return(NULL); + } + + return(undo_page + mach_read_from_2(rec - 2)); +} + +/********************************************************************** +Returns the next undo log record on the page in the specified log, or +NULL if none exists. */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_next_rec( +/*=======================*/ + /* out: pointer to record, NULL if none */ + trx_undo_rec_t* rec, /* in: undo log record */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header offset on page */ +{ + page_t* undo_page; + ulint end; + ulint next; + + undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE); + + end = trx_undo_page_get_end(undo_page, page_no, offset); + + next = mach_read_from_2(rec); + + if (next == end) { + + return(NULL); + } + + return(undo_page + next); +} + +/********************************************************************** +Returns the last undo record on the page in the specified undo log, or +NULL if none exists. */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_last_rec( +/*=======================*/ + /* out: pointer to record, NULL if none */ + page_t* undo_page,/* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header offset on page */ +{ + ulint start; + ulint end; + + start = trx_undo_page_get_start(undo_page, page_no, offset); + end = trx_undo_page_get_end(undo_page, page_no, offset); + + if (start == end) { + + return(NULL); + } + + return(undo_page + mach_read_from_2(undo_page + end - 2)); +} + +/********************************************************************** +Returns the first undo record on the page in the specified undo log, or +NULL if none exists. */ +UNIV_INLINE +trx_undo_rec_t* +trx_undo_page_get_first_rec( +/*========================*/ + /* out: pointer to record, NULL if none */ + page_t* undo_page,/* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header offset on page */ +{ + ulint start; + ulint end; + + start = trx_undo_page_get_start(undo_page, page_no, offset); + end = trx_undo_page_get_end(undo_page, page_no, offset); + + if (start == end) { + + return(NULL); + } + + return(undo_page + start); +} diff --git a/storage/xtradb/include/trx0xa.h b/storage/xtradb/include/trx0xa.h new file mode 100644 index 00000000000..0e040b8d8e5 --- /dev/null +++ b/storage/xtradb/include/trx0xa.h @@ -0,0 +1,61 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +#define XIDDATASIZE 128 /* size in bytes */ +#define MAXGTRIDSIZE 64 /* maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /* maximum size in bytes of bqual */ + +struct xid_t { + long formatID; /* format identifier; -1 + means that the XID is null */ + long gtrid_length; /* value from 1 through 64 */ + long bqual_length; /* value from 1 through 64 */ + char data[XIDDATASIZE]; +}; +typedef struct xid_t XID; +#endif +#define XA_OK 0 /* normal execution */ +#define XAER_ASYNC -2 /* asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /* a resource manager error occurred in + the transaction branch */ +#define XAER_NOTA -4 /* the XID is not valid */ +#define XAER_INVAL -5 /* invalid arguments were given */ +#define XAER_PROTO -6 /* routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /* resource manager unavailable */ +#define XAER_DUPID -8 /* the XID already exists */ +#define XAER_OUTSIDE -9 /* resource manager doing work outside + transaction */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */ diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i new file mode 100644 index 00000000000..3b01401a0b9 --- /dev/null +++ b/storage/xtradb/include/univ.i @@ -0,0 +1,466 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************************************
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#define INNODB_VERSION_MAJOR	1
+#define INNODB_VERSION_MINOR	0
+#define INNODB_VERSION_BUGFIX	3
+#define PERCONA_INNODB_VERSION 5a
+
+/* The following is the InnoDB version as shown in
+SELECT plugin_version FROM information_schema.plugins;
+calculated in make_version_string() in sql/sql_show.cc like this:
+"version >> 8" . "version & 0xff"
+because the version is shown with only one dot, we skip the last
+component, i.e. we show M.N.P as M.N */
+#define INNODB_VERSION_SHORT	\
+	(INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR)
+
+/* auxiliary macros to help creating the version as string */
+#define __INNODB_VERSION(a, b, c, d) (#a "." #b "." #c "-" #d)
+#define _INNODB_VERSION(a, b, c, d) __INNODB_VERSION(a, b, c, d)
+
+#define INNODB_VERSION_STR			\
+	_INNODB_VERSION(INNODB_VERSION_MAJOR,	\
+			INNODB_VERSION_MINOR,	\
+			INNODB_VERSION_BUGFIX,	\
+			PERCONA_INNODB_VERSION)
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+/* In the dynamic plugin, redefine some externally visible symbols
+in order not to conflict with the symbols of a builtin InnoDB. */
+
+/* Rename all C++ classes that contain virtual functions, because we
+have not figured out how to apply the visibility=hidden attribute to
+the virtual method table (vtable) in GCC 3. */
+# define ha_innobase ha_innodb
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER) && !defined(__WIN__)
+# undef __WIN__
+# define __WIN__
+
+# include <windows.h>
+
+# if !defined(WIN64) && !defined(_WIN64)
+# define UNIV_CAN_USE_X86_ASSEMBLER
+# endif
+
+# ifdef _NT_
+# define __NT__
+# endif
+
+#else
+/* The defines used with MySQL */
+
+/* Include two header files from MySQL to make the Unix flavor used
+in compiling more Posix-compatible. These headers also define __WIN__
+if we are compiling on Windows. */
+
+# include <my_global.h>
+# include <my_pthread.h>
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
+# include <sys/stat.h>
+# if !defined(__NETWARE__) && !defined(__WIN__)
+# include <sys/mman.h> /* mmap() for os0proc.c */
+# endif
+
+# undef PACKAGE
+# undef VERSION
+
+/* Include the header file generated by GNU autoconf */
+# ifndef __WIN__
+# include "config.h"
+# endif
+
+# ifdef HAVE_SCHED_H
+# include <sched.h>
+# endif
+
+/* When compiling for Itanium IA64, undefine the flag below to prevent use
+of the 32-bit x86 assembler in mutex operations. */
+
+# if defined(__WIN__) && !defined(WIN64) && !defined(_WIN64)
+# define UNIV_CAN_USE_X86_ASSEMBLER
+# endif
+
+/* For InnoDB rw_locks to work with atomics we need the thread_id
+to be no more than machine word wide. The following enables using
+atomics for InnoDB rw_locks where these conditions are met.
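+
+(Illustrative note, not code in this file: the configure-time probe
+behind HAVE_ATOMIC_PTHREAD_T effectively checks that something like
+
+	pthread_t	t = 0;
+
+	__sync_bool_compare_and_swap(&t, t, t);
+
+compiles and runs; the exact test program is an assumption sketched
+here for clarity.)
+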
*/
+#ifdef HAVE_GCC_ATOMIC_BUILTINS
+/* if HAVE_ATOMIC_PTHREAD_T is defined at this point that means that
+the code from plug.in has defined it and we do not need to include
+ut0auxconf.h which would either define HAVE_ATOMIC_PTHREAD_T or will
+be empty */
+# ifndef HAVE_ATOMIC_PTHREAD_T
+# include "ut0auxconf.h"
+# endif /* HAVE_ATOMIC_PTHREAD_T */
+/* now HAVE_ATOMIC_PTHREAD_T is eventually defined either by plug.in or
+from Makefile.in->ut0auxconf.h */
+# ifdef HAVE_ATOMIC_PTHREAD_T
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# endif /* HAVE_ATOMIC_PTHREAD_T */
+#endif /* HAVE_GCC_ATOMIC_BUILTINS */
+
+/* We only try to do explicit inlining of functions with gcc and
+Microsoft Visual C++ */
+
+# if !defined(__GNUC__)
+# undef UNIV_MUST_NOT_INLINE /* Remove compiler warning */
+# define UNIV_MUST_NOT_INLINE
+# endif
+
+# ifdef HAVE_PREAD
+# define HAVE_PWRITE
+# endif
+
+#endif /* #if (defined(WIN32) || ... */
+
+/*			DEBUG VERSION CONTROL
+			===================== */
+
+/* The following flag will make InnoDB initialize
+all memory it allocates to zero. It hides Purify
+warnings about reading unallocated memory unless
+memory is read outside the allocated blocks. */
+/*
+#define UNIV_INIT_MEM_TO_ZERO
+*/
+
+/* When this macro is defined then additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have "test_" prefix. These functions are not called from anywhere in
+the code, they can be called from gdb after
+innobase_start_or_create_for_mysql() has executed using the call
+command. Not tested on Windows. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+*/
+
+#if 0
+#define UNIV_DEBUG_VALGRIND		/* Enable extra
+					Valgrind instrumentation */
+#define UNIV_DEBUG_PRINT		/* Enable the compilation of
+					some debug print functions */
+#define UNIV_AHI_DEBUG			/* Enable adaptive hash index
+					debugging without UNIV_DEBUG */
+#define UNIV_BUF_DEBUG			/* Enable buffer pool
+					debugging without UNIV_DEBUG */
+#define UNIV_DEBUG			/* Enable ut_ad() assertions
+					and disable UNIV_INLINE */
+#define UNIV_DEBUG_FILE_ACCESSES	/* Debug .ibd file access
+					(field file_page_was_freed
+					in buf_page_t) */
+#define UNIV_LRU_DEBUG			/* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG			/* debug HASH_ macros */
+#define UNIV_LIST_DEBUG			/* debug UT_LIST_ macros */
+#define UNIV_MEM_DEBUG			/* detect memory leaks etc */
+#define UNIV_IBUF_DEBUG			/* debug the insert buffer */
+#define UNIV_IBUF_COUNT_DEBUG		/* debug the insert buffer;
+this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
+and the insert buffer must be empty when the database is started */
+#define UNIV_SYNC_DEBUG			/* debug mutex and latch
+operations (very slow); also UNIV_DEBUG must be defined */
+#define UNIV_SEARCH_DEBUG		/* debug B-tree comparisons */
+#define UNIV_SYNC_PERF_STAT		/* operation counts for
+					rw-locks and mutexes */
+#define UNIV_SEARCH_PERF_STAT		/* statistics for the
+					adaptive hash index */
+#define UNIV_SRV_PRINT_LATCH_WAITS	/* enable diagnostic output
+					in sync0sync.c */
+#define UNIV_BTR_PRINT			/* enable functions for
+					printing B-trees */
+#define UNIV_ZIP_DEBUG			/* extensive consistency checks
+					for compressed pages */
+#define UNIV_ZIP_COPY			/* call page_zip_copy_recs()
+					more often */
+#endif
+
+#define UNIV_BTR_DEBUG			/* check B-tree links */
+#define UNIV_LIGHT_MEM_DEBUG		/* light memory debugging */
+
+#ifdef HAVE_valgrind
+/* The following sets all new allocated memory to zero before use:
+this can be used to eliminate unnecessary Purify warnings, but note that
+it also masks many
bugs Purify could detect. For detailed Purify analysis it
+is best to remove the define below and look through the warnings one
+by one. */
+#define UNIV_SET_MEM_TO_ZERO
+#endif
+
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+			/* the above option prevents forcing of log to disk
+			at a buffer page write: it should be tested with this
+			option off; also some ibuf tests are suppressed */
+/*
+#define UNIV_BASIC_LOG_DEBUG
+*/
+			/* the above option enables basic recovery debugging:
+			new allocated file pages are reset */
+
+/* Linkage specifier for non-static InnoDB symbols (variables and functions)
+that are only referenced from within InnoDB, not from MySQL */
+#ifdef __WIN__
+# define UNIV_INTERN
+#else
+# define UNIV_INTERN __attribute__((visibility ("hidden")))
+#endif
+
+#if (!defined(UNIV_DEBUG) && !defined(UNIV_MUST_NOT_INLINE))
+/* Definition for inline version */
+
+#ifdef __WIN__
+#define UNIV_INLINE	__inline
+#else
+#define UNIV_INLINE	static __inline__
+#endif
+
+#else
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE	UNIV_INTERN
+
+#endif /* UNIV_DEBUG */
+
+#ifdef _WIN32
+#define UNIV_WORD_SIZE		4
+#elif defined(_WIN64)
+#define UNIV_WORD_SIZE		8
+#else
+/* MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
+#define UNIV_WORD_SIZE		SIZEOF_LONG
+#endif
+
+/* The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT	8
+
+/* The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT	UNIV_WORD_SIZE
+
+/*
+			DATABASE VERSION CONTROL
+			========================
+*/
+
+/* The 2-logarithm of UNIV_PAGE_SIZE: */
+#define UNIV_PAGE_SIZE_SHIFT	14
+/* The universal page size of the database */
+#define UNIV_PAGE_SIZE		(1 << UNIV_PAGE_SIZE_SHIFT)
+
+/* Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM	32
+
+/*
+			UNIVERSAL TYPE DEFINITIONS
+			==========================
+*/
+
+/* Note that inside MySQL 'byte' is defined as char on Linux! */
+#define byte			unsigned char
+
+/* Define an unsigned integer type that is exactly 32 bits. */
+
+#if SIZEOF_INT == 4
+typedef unsigned int		ib_uint32_t;
+#elif SIZEOF_LONG == 4
+typedef unsigned long		ib_uint32_t;
+#else
+#error "Neither int nor long is 4 bytes"
+#endif
+
+/* Another basic type we use is unsigned long integer which should be equal to
+the word size of the machine, that is on a 32-bit platform 32 bits, and on a
+64-bit platform 64 bits. We also give the printf format for the type as a
+macro ULINTPF. */
+
+#ifdef _WIN64
+typedef unsigned __int64	ulint;
+#define ULINTPF			"%I64u"
+typedef __int64			lint;
+#else
+typedef unsigned long int	ulint;
+#define ULINTPF			"%lu"
+typedef long int		lint;
+#endif
+
+#ifdef __WIN__
+typedef __int64			ib_int64_t;
+typedef unsigned __int64	ib_uint64_t;
+#else
+/* Note: longlong and ulonglong come from MySQL headers.
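+
+(Illustration of the ULINTPF macro defined above, assuming a ulint
+n_pages in scope:
+
+	fprintf(stderr, "n_pages=" ULINTPF "\n", n_pages);
+
+keeps the format string correct under both definitions of ulint.)
+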
*/ +typedef longlong ib_int64_t; +typedef ulonglong ib_uint64_t; +#endif + +typedef unsigned long long int ullint; + +#ifndef __WIN__ +#if SIZEOF_LONG != SIZEOF_VOIDP +#error "Error: InnoDB's ulint must be of the same size as void*" +#endif +#endif + +/* The 'undefined' value for a ulint */ +#define ULINT_UNDEFINED ((ulint)(-1)) + +/* The undefined 32-bit unsigned integer */ +#define ULINT32_UNDEFINED 0xFFFFFFFF + +/* Maximum value for a ulint */ +#define ULINT_MAX ((ulint)(-2)) + +/* Maximum value for ib_uint64_t */ +#define IB_ULONGLONG_MAX ((ib_uint64_t) (~0ULL)) + +/* This 'ibool' type is used within Innobase. Remember that different included +headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */ +#define ibool ulint + +#ifndef TRUE + +#define TRUE 1 +#define FALSE 0 + +#endif + +/* The following number as the length of a logical field means that the field +has the SQL NULL as its value. NOTE that because we assume that the length +of a field is a 32-bit integer when we store it, for example, to an undo log +on disk, we must have also this number fit in 32 bits, also in 64-bit +computers! */ + +#define UNIV_SQL_NULL ULINT32_UNDEFINED + +/* Lengths which are not UNIV_SQL_NULL, but bigger than the following +number indicate that a field contains a reference to an externally +stored part of the field in the tablespace. The length field then +contains the sum of the following flag and the locally stored len. */ + +#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE) + +/* Some macros to improve branch prediction and reduce cache misses */ +#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER) +/* Tell the compiler that 'expr' probably evaluates to 'constant'. */ +# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant) +/* Tell the compiler that a pointer is likely to be NULL */ +# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) ptr, 0) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read. */ +# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read or written. */ +# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3) +#else +/* Dummy versions of the macros */ +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +#endif +/* Tell the compiler that cond is likely to hold */ +#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE) +/* Tell the compiler that cond is unlikely to hold */ +#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE) + +/* Compile-time constant of the given array's size. */ +#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0])) + +/* The return type from a thread's start function differs between Unix and +Windows, so define a typedef for it and a macro to use at the end of such +functions. 
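+
+Usage sketch with a hypothetical thread function (illustration only):
+
+	os_thread_ret_t
+	my_worker_thread(void* arg)
+	{
+		(void) arg;
+
+		OS_THREAD_DUMMY_RETURN;
+	}
+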
*/
+
+#ifdef __WIN__
+typedef ulint		os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN	return(0)
+#else
+typedef void*		os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN	return(NULL)
+#endif
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+#ifdef UNIV_DEBUG_VALGRIND
+# include <valgrind/memcheck.h>
+# define UNIV_MEM_VALID(addr, size) VALGRIND_MAKE_MEM_DEFINED(addr, size)
+# define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size)
+# define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_DESC(addr, size, b) VALGRIND_CREATE_BLOCK(addr, size, b)
+# define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b)
+# define UNIV_MEM_ASSERT_RW(addr, size) do {				\
+	const void* _p = (const void*) (ulint)				\
+		VALGRIND_CHECK_MEM_IS_DEFINED(addr, size);		\
+	if (UNIV_LIKELY_NULL(_p))					\
+		fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n",	\
+			__FILE__, __LINE__,				\
+			(const void*) (addr), (unsigned) (size), (long)	\
+			(((const char*) _p) - ((const char*) (addr))));	\
+	} while (0)
+# define UNIV_MEM_ASSERT_W(addr, size) do {				\
+	const void* _p = (const void*) (ulint)				\
+		VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size);		\
+	if (UNIV_LIKELY_NULL(_p))					\
+		fprintf(stderr, "%s:%d: %p[%u] unwritable at %ld\n",	\
+			__FILE__, __LINE__,				\
+			(const void*) (addr), (unsigned) (size), (long)	\
+			(((const char*) _p) - ((const char*) (addr))));	\
+	} while (0)
+#else
+# define UNIV_MEM_VALID(addr, size) do {} while(0)
+# define UNIV_MEM_INVALID(addr, size) do {} while(0)
+# define UNIV_MEM_FREE(addr, size) do {} while(0)
+# define UNIV_MEM_ALLOC(addr, size) do {} while(0)
+# define UNIV_MEM_DESC(addr, size, b) do {} while(0)
+# define UNIV_MEM_UNDESC(b) do {} while(0)
+# define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0)
+# define UNIV_MEM_ASSERT_W(addr, size) do {} while(0)
+#endif
+#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do {	\
+	UNIV_MEM_ASSERT_W(addr, size);			\
+	UNIV_MEM_FREE(addr, size);			\
+} while (0)
+#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do {	\
+	UNIV_MEM_ASSERT_W(addr, size);			\
+	UNIV_MEM_ALLOC(addr, size);			\
+} while (0)
+
+#endif
diff --git a/storage/xtradb/include/usr0sess.h b/storage/xtradb/include/usr0sess.h
new file mode 100644
index 00000000000..08c6c70066f
--- /dev/null
+++ b/storage/xtradb/include/usr0sess.h
@@ -0,0 +1,77 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ + +#ifndef usr0sess_h +#define usr0sess_h + +#include "univ.i" +#include "ut0byte.h" +#include "trx0types.h" +#include "srv0srv.h" +#include "trx0types.h" +#include "usr0types.h" +#include "que0types.h" +#include "data0data.h" +#include "rem0rec.h" + +/************************************************************************* +Opens a session. */ +UNIV_INTERN +sess_t* +sess_open(void); +/*============*/ + /* out, own: session object */ +/************************************************************************* +Closes a session, freeing the memory occupied by it, if it is in a state +where it should be closed. */ +UNIV_INTERN +ibool +sess_try_close( +/*===========*/ + /* out: TRUE if closed */ + sess_t* sess); /* in, own: session object */ + +/* The session handle. All fields are protected by the kernel mutex */ +struct sess_struct{ + ulint state; /* state of the session */ + trx_t* trx; /* transaction object permanently + assigned for the session: the + transaction instance designated by the + trx id changes, but the memory + structure is preserved */ + UT_LIST_BASE_NODE_T(que_t) + graphs; /* query graphs belonging to this + session */ +}; + +/* Session states */ +#define SESS_ACTIVE 1 +#define SESS_ERROR 2 /* session contains an error message + which has not yet been communicated + to the client */ +#ifndef UNIV_NONINL +#include "usr0sess.ic" +#endif + +#endif diff --git a/storage/xtradb/include/usr0sess.ic b/storage/xtradb/include/usr0sess.ic new file mode 100644 index 00000000000..5eefed382da --- /dev/null +++ b/storage/xtradb/include/usr0sess.ic @@ -0,0 +1,23 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ diff --git a/storage/xtradb/include/usr0types.h b/storage/xtradb/include/usr0types.h new file mode 100644 index 00000000000..7f7d12f7bf5 --- /dev/null +++ b/storage/xtradb/include/usr0types.h @@ -0,0 +1,30 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Users and sessions global types
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0types_h
+#define usr0types_h
+
+typedef struct sess_struct	sess_t;
+
+#endif
diff --git a/storage/xtradb/include/ut0auxconf.h b/storage/xtradb/include/ut0auxconf.h
new file mode 100644
index 00000000000..6362b7ca412
--- /dev/null
+++ b/storage/xtradb/include/ut0auxconf.h
@@ -0,0 +1,14 @@
+/* Do not remove this file even though it is empty.
+This file is included in univ.i and will cause compilation failure
+if not present.
+A custom check has been added in the generated
+storage/innobase/Makefile.in that is shipped with the InnoDB Plugin
+source archive. This check tries to compile a test program and if
+successful then adds "#define HAVE_ATOMIC_PTHREAD_T" to this file.
+This is a hack that has been developed in order to check for pthread_t
+atomicity without the need to regenerate the ./configure script that is
+distributed in the MySQL 5.1 official source archives.
+If by any chance Makefile.in and ./configure are regenerated and thus
+the hack from Makefile.in wiped away then the "real" check from plug.in
+will take over.
+*/
diff --git a/storage/xtradb/include/ut0byte.h b/storage/xtradb/include/ut0byte.h
new file mode 100644
index 00000000000..24aac1678b3
--- /dev/null
+++ b/storage/xtradb/include/ut0byte.h
@@ -0,0 +1,268 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**********************************************************************
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+
+#include "univ.i"
+
+/* Type definition for a 64-bit unsigned integer, which works also
+in 32-bit machines. NOTE! Access the fields only with the accessor
+functions. This definition appears here only for the compiler to
+know the size of a dulint.
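+
+Accessor usage sketch (illustration only):
+
+	dulint	d = ut_dulint_create(1, 2);
+
+	ut_ad(ut_dulint_get_high(d) == 1);
+	ut_ad(ut_dulint_get_low(d) == 2);
+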
*/
+
+typedef struct dulint_struct	dulint;
+struct dulint_struct{
+	ulint	high;	/* most significant 32 bits */
+	ulint	low;	/* least significant 32 bits */
+};
+
+/* Zero value for a dulint */
+extern const dulint	ut_dulint_zero;
+
+/* Maximum value for a dulint */
+extern const dulint	ut_dulint_max;
+
+/***********************************************************
+Creates a 64-bit dulint out of two ulints. */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+			/* out: created dulint */
+	ulint	high,	/* in: high-order 32 bits */
+	ulint	low);	/* in: low-order 32 bits */
+/***********************************************************
+Gets the high-order 32 bits of a dulint. */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+			/* out: 32 bits in ulint */
+	dulint	d);	/* in: dulint */
+/***********************************************************
+Gets the low-order 32 bits of a dulint. */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+			/* out: 32 bits in ulint */
+	dulint	d);	/* in: dulint */
+/***********************************************************
+Converts a dulint (a struct of 2 ulints) to ib_int64_t, which is a 64-bit
+integer type. */
+UNIV_INLINE
+ib_int64_t
+ut_conv_dulint_to_longlong(
+/*=======================*/
+			/* out: value in ib_int64_t type */
+	dulint	d);	/* in: dulint */
+/***********************************************************
+Tests if a dulint is zero. */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+			/* out: TRUE if zero */
+	dulint	a);	/* in: dulint */
+/***********************************************************
+Compares two dulints. */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+			/* out: -1 if a < b, 0 if a == b,
+			1 if a > b */
+	dulint	a,	/* in: dulint */
+	dulint	b);	/* in: dulint */
+/***********************************************************
+Calculates the max of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+			/* out: max(a, b) */
+	dulint	a,	/* in: dulint */
+	dulint	b);	/* in: dulint */
+/***********************************************************
+Calculates the min of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+			/* out: min(a, b) */
+	dulint	a,	/* in: dulint */
+	dulint	b);	/* in: dulint */
+/***********************************************************
+Adds a ulint to a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+			/* out: sum a + b */
+	dulint	a,	/* in: dulint */
+	ulint	b);	/* in: ulint */
+/***********************************************************
+Subtracts a ulint from a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+			/* out: a - b */
+	dulint	a,	/* in: dulint */
+	ulint	b);	/* in: ulint, b <= a */
+/***********************************************************
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller than 4G. */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+			/* out: a - b */
+	dulint	a,	/* in: dulint; NOTE a must be >= b and at most
+			2 to power 32 - 1 greater */
+	dulint	b);	/* in: dulint */
+/************************************************************
+Rounds a dulint downward to a multiple of a power of 2.
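+
+Example: for n == (0, 1000) and align_no == 256 the result is (0, 768),
+since 768 is the largest multiple of 256 not exceeding 1000.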
*/
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+				/* out: rounded value */
+	dulint	n,		/* in: number to be rounded */
+	ulint	align_no);	/* in: align by this number which must be a
+				power of 2 */
+/************************************************************
+Rounds a dulint upward to a multiple of a power of 2. */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+				/* out: rounded value */
+	dulint	n,		/* in: number to be rounded */
+	ulint	align_no);	/* in: align by this number which must be a
+				power of 2 */
+/************************************************************
+Rounds ib_uint64_t downward to a multiple of a power of 2. */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+					/* out: rounded value */
+	ib_uint64_t	n,		/* in: number to be rounded */
+	ulint		align_no);	/* in: align by this number
+					which must be a power of 2 */
+/************************************************************
+Rounds ib_uint64_t upward to a multiple of a power of 2. */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+					/* out: rounded value */
+	ib_uint64_t	n,		/* in: number to be rounded */
+	ulint		align_no);	/* in: align by this number
+					which must be a power of 2 */
+/***********************************************************
+Increments a dulint variable by 1. */
+#define UT_DULINT_INC(D)\
+{\
+	if ((D).low == 0xFFFFFFFFUL) {\
+		(D).high = (D).high + 1;\
+		(D).low = 0;\
+	} else {\
+		(D).low = (D).low + 1;\
+	}\
+}
+/***********************************************************
+Tests if two dulints are equal. */
+#define UT_DULINT_EQ(D1, D2)	(((D1).low == (D2).low)\
+				&& ((D1).high == (D2).high))
+#ifdef notdefined
+/****************************************************************
+Sort function for dulint arrays. */
+UNIV_INTERN
+void
+ut_dulint_sort(dulint* arr, dulint* aux_arr, ulint low, ulint high);
+/*===============================================================*/
+#endif /* notdefined */
+
+/*************************************************************
+The following function rounds up a pointer to the nearest aligned address. */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+				/* out: aligned pointer */
+	void*	ptr,		/* in: pointer */
+	ulint	align_no);	/* in: align by this number */
+/*************************************************************
+The following function rounds down a pointer to the nearest
+aligned address. */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+					/* out: aligned pointer */
+	const void*	ptr,		/* in: pointer */
+	ulint		align_no)	/* in: align by this number */
+	__attribute__((const));
+/*************************************************************
+The following function computes the offset of a pointer from the nearest
+aligned address. */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+					/* out: distance from aligned
+					pointer */
+	const void*	ptr,		/* in: pointer */
+	ulint		align_no)	/* in: align by this number */
+	__attribute__((const));
+/*********************************************************************
+Gets the nth bit of a ulint. */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+			/* out: TRUE if nth bit is 1; 0th bit is defined to
+			be the least significant */
+	ulint	a,	/* in: ulint */
+	ulint	n);	/* in: nth bit requested */
+/*********************************************************************
+Sets the nth bit of a ulint.
*/ +UNIV_INLINE +ulint +ut_bit_set_nth( +/*===========*/ + /* out: the ulint with the bit set as requested */ + ulint a, /* in: ulint */ + ulint n, /* in: nth bit requested */ + ibool val); /* in: value for the bit to set */ + +#ifndef UNIV_NONINL +#include "ut0byte.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0byte.ic b/storage/xtradb/include/ut0byte.ic new file mode 100644 index 00000000000..021a3a15009 --- /dev/null +++ b/storage/xtradb/include/ut0byte.ic @@ -0,0 +1,413 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************************** +Utilities for byte operations + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/*********************************************************** +Creates a 64-bit dulint out of two ulints. */ +UNIV_INLINE +dulint +ut_dulint_create( +/*=============*/ + /* out: created dulint */ + ulint high, /* in: high-order 32 bits */ + ulint low) /* in: low-order 32 bits */ +{ + dulint res; + + ut_ad(high <= 0xFFFFFFFF); + ut_ad(low <= 0xFFFFFFFF); + + res.high = high; + res.low = low; + + return(res); +} + +/*********************************************************** +Gets the high-order 32 bits of a dulint. */ +UNIV_INLINE +ulint +ut_dulint_get_high( +/*===============*/ + /* out: 32 bits in ulint */ + dulint d) /* in: dulint */ +{ + return(d.high); +} + +/*********************************************************** +Gets the low-order 32 bits of a dulint. */ +UNIV_INLINE +ulint +ut_dulint_get_low( +/*==============*/ + /* out: 32 bits in ulint */ + dulint d) /* in: dulint */ +{ + return(d.low); +} + +/*********************************************************** +Converts a dulint (a struct of 2 ulints) to ib_int64_t, which is a 64-bit +integer type. */ +UNIV_INLINE +ib_int64_t +ut_conv_dulint_to_longlong( +/*=======================*/ + /* out: value in ib_int64_t type */ + dulint d) /* in: dulint */ +{ + return((ib_int64_t)d.low + + (((ib_int64_t)d.high) << 32)); +} + +/*********************************************************** +Tests if a dulint is zero. */ +UNIV_INLINE +ibool +ut_dulint_is_zero( +/*==============*/ + /* out: TRUE if zero */ + dulint a) /* in: dulint */ +{ + if ((a.low == 0) && (a.high == 0)) { + + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************** +Compares two dulints. 
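+
+Example: ut_dulint_cmp(a, b) returns 1 for a == (1, 0) and
+b == (0, 0xFFFFFFFF), because the high-order words are compared first.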
*/
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+			/* out: -1 if a < b, 0 if a == b,
+			1 if a > b */
+	dulint	a,	/* in: dulint */
+	dulint	b)	/* in: dulint */
+{
+	if (a.high > b.high) {
+		return(1);
+	} else if (a.high < b.high) {
+		return(-1);
+	} else if (a.low > b.low) {
+		return(1);
+	} else if (a.low < b.low) {
+		return(-1);
+	} else {
+		return(0);
+	}
+}
+
+/***********************************************************
+Calculates the max of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+			/* out: max(a, b) */
+	dulint	a,	/* in: dulint */
+	dulint	b)	/* in: dulint */
+{
+	if (ut_dulint_cmp(a, b) > 0) {
+
+		return(a);
+	}
+
+	return(b);
+}
+
+/***********************************************************
+Calculates the min of two dulints. */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+			/* out: min(a, b) */
+	dulint	a,	/* in: dulint */
+	dulint	b)	/* in: dulint */
+{
+	if (ut_dulint_cmp(a, b) > 0) {
+
+		return(b);
+	}
+
+	return(a);
+}
+
+/***********************************************************
+Adds a ulint to a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+			/* out: sum a + b */
+	dulint	a,	/* in: dulint */
+	ulint	b)	/* in: ulint */
+{
+	if (0xFFFFFFFFUL - b >= a.low) {
+		a.low += b;
+
+		return(a);
+	}
+
+	a.low = a.low - (0xFFFFFFFFUL - b) - 1;
+
+	a.high++;
+
+	return(a);
+}
+
+/***********************************************************
+Subtracts a ulint from a dulint. */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+			/* out: a - b */
+	dulint	a,	/* in: dulint */
+	ulint	b)	/* in: ulint, b <= a */
+{
+	if (a.low >= b) {
+		a.low -= b;
+
+		return(a);
+	}
+
+	b -= a.low + 1;
+
+	a.low = 0xFFFFFFFFUL - b;
+
+	ut_ad(a.high > 0);
+
+	a.high--;
+
+	return(a);
+}
+
+/***********************************************************
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller than 4G. */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+			/* out: a - b */
+	dulint	a,	/* in: dulint; NOTE a must be >= b and at most
+			2 to power 32 - 1 greater */
+	dulint	b)	/* in: dulint */
+{
+	ulint	diff;
+
+	if (a.high == b.high) {
+		ut_ad(a.low >= b.low);
+
+		return(a.low - b.low);
+	}
+
+	ut_ad(a.high == b.high + 1);
+
+	diff = (ulint)(0xFFFFFFFFUL - b.low);
+	diff += 1 + a.low;
+
+	ut_ad(diff > a.low);
+
+	return(diff);
+}
+
+/************************************************************
+Rounds a dulint downward to a multiple of a power of 2. */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+				/* out: rounded value */
+	dulint	n,		/* in: number to be rounded */
+	ulint	align_no)	/* in: align by this number which must be a
+				power of 2 */
+{
+	ulint	low, high;
+
+	ut_ad(align_no > 0);
+	ut_ad(((align_no - 1) & align_no) == 0);
+
+	low = ut_dulint_get_low(n);
+	high = ut_dulint_get_high(n);
+
+	low = low & ~(align_no - 1);
+
+	return(ut_dulint_create(high, low));
+}
+
+/************************************************************
+Rounds a dulint upward to a multiple of a power of 2. */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+				/* out: rounded value */
+	dulint	n,		/* in: number to be rounded */
+	ulint	align_no)	/* in: align by this number which must be a
+				power of 2 */
+{
+	return(ut_dulint_align_down(ut_dulint_add(n, align_no - 1), align_no));
}
+
+/************************************************************
+Rounds ib_uint64_t downward to a multiple of a power of 2.
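+
+Example: ut_uint64_align_down(1000, 256) == 768, and correspondingly
+ut_uint64_align_up(1000, 256) == 1024.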
*/ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + /* out: rounded value */ + ib_uint64_t n, /* in: number to be rounded */ + ulint align_no) /* in: align by this number + which must be a power of 2 */ +{ + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return(n & ~((ib_uint64_t) align_no - 1)); +} + +/************************************************************ +Rounds ib_uint64_t upward to a multiple of a power of 2. */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + /* out: rounded value */ + ib_uint64_t n, /* in: number to be rounded */ + ulint align_no) /* in: align by this number + which must be a power of 2 */ +{ + ib_uint64_t align_1 = (ib_uint64_t) align_no - 1; + + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return((n + align_1) & ~align_1); +} + +/************************************************************* +The following function rounds up a pointer to the nearest aligned address. */ +UNIV_INLINE +void* +ut_align( +/*=====*/ + /* out: aligned pointer */ + void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return((void*)((((ulint)ptr) + align_no - 1) & ~(align_no - 1))); +} + +/************************************************************* +The following function rounds down a pointer to the nearest +aligned address. */ +UNIV_INLINE +void* +ut_align_down( +/*==========*/ + /* out: aligned pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return((void*)((((ulint)ptr)) & ~(align_no - 1))); +} + +/************************************************************* +The following function computes the offset of a pointer from the nearest +aligned address. */ +UNIV_INLINE +ulint +ut_align_offset( +/*============*/ + /* out: distance from + aligned pointer */ + const void* ptr, /* in: pointer */ + ulint align_no) /* in: align by this number */ +{ + ut_ad(align_no > 0); + ut_ad(((align_no - 1) & align_no) == 0); + ut_ad(ptr); + + ut_ad(sizeof(void*) == sizeof(ulint)); + + return(((ulint)ptr) & (align_no - 1)); +} + +/********************************************************************* +Gets the nth bit of a ulint. */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + /* out: TRUE if nth bit is 1; 0th bit is defined to + be the least significant */ + ulint a, /* in: ulint */ + ulint n) /* in: nth bit requested */ +{ + ut_ad(n < 8 * sizeof(ulint)); +#if TRUE != 1 +# error "TRUE != 1" +#endif + return(1 & (a >> n)); +} + +/********************************************************************* +Sets the nth bit of a ulint. 
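+
+Example: ut_bit_set_nth(8, 1, TRUE) == 10 (binary 1000 becomes 1010),
+and ut_bit_set_nth(10, 1, FALSE) == 8 clears the same bit again.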
*/
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+			/* out: the ulint with the bit set as requested */
+	ulint	a,	/* in: ulint */
+	ulint	n,	/* in: nth bit requested */
+	ibool	val)	/* in: value for the bit to set */
+{
+	ut_ad(n < 8 * sizeof(ulint));
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+	if (val) {
+		return(((ulint) 1 << n) | a);
+	} else {
+		return(~((ulint) 1 << n) & a);
+	}
+}
diff --git a/storage/xtradb/include/ut0dbg.h b/storage/xtradb/include/ut0dbg.h
new file mode 100644
index 00000000000..a206789fd4c
--- /dev/null
+++ b/storage/xtradb/include/ut0dbg.h
@@ -0,0 +1,160 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*********************************************************************
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#include "univ.i"
+#include <stdlib.h>
+#include "os0thread.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR)))
+#else
+extern ulint	ut_dbg_zero;	/* This is used to eliminate
+				compiler warnings */
+# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero)
+#endif
+
+/*****************************************************************
+Report a failed assertion. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+	const char* expr,	/* in: the failed assertion */
+	const char* file,	/* in: source file containing the assertion */
+	ulint line);		/* in: line number of the assertion */
+
+#ifdef __NETWARE__
+/* Flag for ignoring further assertion failures.
+On NetWare, have a graceful exit rather than a segfault to avoid abends. */
+extern ibool	panic_shutdown;
+/* Abort the execution. */
+void ut_dbg_panic(void);
+# define UT_DBG_PANIC ut_dbg_panic()
+/* Stop threads in ut_a(). */
+# define UT_DBG_STOP do {} while (0)	/* We do not do this on NetWare */
+#else /* __NETWARE__ */
+# if defined(__WIN__) || defined(__INTEL_COMPILER)
+# undef UT_DBG_USE_ABORT
+# elif defined(__GNUC__) && (__GNUC__ > 2)
+# define UT_DBG_USE_ABORT
+# endif
+
+# ifndef UT_DBG_USE_ABORT
+/* A null pointer that will be dereferenced to trigger a memory trap */
+extern ulint*	ut_dbg_null_ptr;
+# endif
+
+# if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/* Flag for indicating that all threads should stop. This will be set
+by ut_dbg_assertion_failed(). */
+extern ibool	ut_dbg_stop_threads;
+
+/*****************************************************************
+Stop a thread after assertion failure. */
+UNIV_INTERN
+void
+ut_dbg_stop_thread(
+/*===============*/
+	const char*	file,
+	ulint		line);
+# endif
+
+# ifdef UT_DBG_USE_ABORT
+/* Abort the execution.
*/
+# define UT_DBG_PANIC abort()
+/* Stop threads (null operation) */
+# define UT_DBG_STOP do {} while (0)
+# else /* UT_DBG_USE_ABORT */
+/* Abort the execution. */
+# define UT_DBG_PANIC \
+	if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL
+/* Stop threads in ut_a(). */
+# define UT_DBG_STOP do \
+	if (UNIV_UNLIKELY(ut_dbg_stop_threads)) { \
+		ut_dbg_stop_thread(__FILE__, (ulint) __LINE__); \
+	} while (0)
+# endif /* UT_DBG_USE_ABORT */
+#endif /* __NETWARE__ */
+
+/* Abort execution if EXPR does not evaluate to nonzero. */
+#define ut_a(EXPR) do { \
+	if (UT_DBG_FAIL(EXPR)) { \
+		ut_dbg_assertion_failed(#EXPR, \
+				__FILE__, (ulint) __LINE__); \
+		UT_DBG_PANIC; \
+	} \
+	UT_DBG_STOP; \
+} while (0)
+
+/* Abort execution. */
+#define ut_error do { \
+	ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__); \
+	UT_DBG_PANIC; \
+} while (0)
+
+#ifdef UNIV_DEBUG
+#define ut_ad(EXPR)	ut_a(EXPR)
+#define ut_d(EXPR)	do {EXPR;} while (0)
+#else
+#define ut_ad(EXPR)
+#define ut_d(EXPR)
+#endif
+
+#define UT_NOT_USED(A)	A = A
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/* structure used for recording usage statistics */
+typedef struct speedo_struct {
+	struct rusage	ru;
+	struct timeval	tv;
+} speedo_t;
+
+/***********************************************************************
+Resets a speedo (records the current time in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+	speedo_t*	speedo);	/* out: speedo */
+
+/***********************************************************************
+Shows the time elapsed and usage statistics since the last reset of a
+speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+	const speedo_t*	speedo);	/* in: speedo */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+#endif
diff --git a/storage/xtradb/include/ut0list.h b/storage/xtradb/include/ut0list.h
new file mode 100644
index 00000000000..034aa400af9
--- /dev/null
+++ b/storage/xtradb/include/ut0list.h
@@ -0,0 +1,165 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***********************************************************************
+A double-linked list. This differs from the one in ut0lst.h in that in this
+one, each list node contains a pointer to the data, whereas the one in
+ut0lst.h uses a strategy where the list pointers are embedded in the data
+items themselves.
+
+Use this one when you need to store arbitrary data in the list where you
+can't embed the list pointers in the data, if a data item needs to be
+stored in multiple lists, etc.
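+
+A minimal usage sketch (the names data, heap, list and node here are
+illustrative only):
+
+	ib_list_t*	list = ib_list_create();
+	ib_list_node_t*	node = ib_list_add_last(list, data, heap);
+	...
+	ib_list_remove(list, node);
+	ib_list_free(list);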
+ +Note about the memory management: ib_list_t is a fixed-size struct whose +allocation/deallocation is done through ib_list_create/ib_list_free, but the +memory for the list nodes is allocated through a user-given memory heap, +which can either be the same for all nodes or vary per node. Most users will +probably want to create a memory heap to store the item-specific data, and +pass in this same heap to the list node creation functions, thus +automatically freeing the list node when the item's heap is freed. + +************************************************************************/ + +#ifndef IB_LIST_H +#define IB_LIST_H + +#include "mem0mem.h" + +typedef struct ib_list_struct ib_list_t; +typedef struct ib_list_node_struct ib_list_node_t; +typedef struct ib_list_helper_struct ib_list_helper_t; + +/******************************************************************** +Create a new list using mem_alloc. Lists created with this function must be +freed with ib_list_free. */ +UNIV_INTERN +ib_list_t* +ib_list_create(void); +/*=================*/ + /* out: list */ + + +/******************************************************************** +Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for +lists created with this function. */ +UNIV_INTERN +ib_list_t* +ib_list_create_heap( +/*================*/ + /* out: list */ + mem_heap_t* heap); /* in: memory heap to use */ + +/******************************************************************** +Free a list. */ +UNIV_INTERN +void +ib_list_free( +/*=========*/ + ib_list_t* list); /* in: list */ + +/******************************************************************** +Add the data to the start of the list. */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_first( +/*==============*/ + /* out: new list node*/ + ib_list_t* list, /* in: list */ + void* data, /* in: data */ + mem_heap_t* heap); /* in: memory heap to use */ + +/******************************************************************** +Add the data to the end of the list. */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_last( +/*=============*/ + /* out: new list node*/ + ib_list_t* list, /* in: list */ + void* data, /* in: data */ + mem_heap_t* heap); /* in: memory heap to use */ + +/******************************************************************** +Add the data after the indicated node. */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_after( +/*==============*/ + /* out: new list node*/ + ib_list_t* list, /* in: list */ + ib_list_node_t* prev_node, /* in: node preceding new node (can + be NULL) */ + void* data, /* in: data */ + mem_heap_t* heap); /* in: memory heap to use */ + +/******************************************************************** +Remove the node from the list. */ +UNIV_INTERN +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /* in: list */ + ib_list_node_t* node); /* in: node to remove */ + +/******************************************************************** +Get the first node in the list. */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + /* out: first node, or NULL */ + ib_list_t* list); /* in: list */ + +/******************************************************************** +Get the last node in the list. */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + /* out: last node, or NULL */ + ib_list_t* list); /* in: list */ + +/* List. 
*/ +struct ib_list_struct { + ib_list_node_t* first; /* first node */ + ib_list_node_t* last; /* last node */ + ibool is_heap_list; /* TRUE if this list was + allocated through a heap */ +}; + +/* A list node. */ +struct ib_list_node_struct { + ib_list_node_t* prev; /* previous node */ + ib_list_node_t* next; /* next node */ + void* data; /* user data */ +}; + +/* Quite often, the only additional piece of data you need is the per-item +memory heap, so we have this generic struct available to use in those +cases. */ +struct ib_list_helper_struct { + mem_heap_t* heap; /* memory heap */ + void* data; /* user data */ +}; + +#ifndef UNIV_NONINL +#include "ut0list.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0list.ic b/storage/xtradb/include/ut0list.ic new file mode 100644 index 00000000000..c79a0cf18dc --- /dev/null +++ b/storage/xtradb/include/ut0list.ic @@ -0,0 +1,41 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************** +Get the first node in the list. */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + /* out: first node, or NULL */ + ib_list_t* list) /* in: list */ +{ + return(list->first); +} + +/******************************************************************** +Get the last node in the list. */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + /* out: last node, or NULL */ + ib_list_t* list) /* in: list */ +{ + return(list->last); +} diff --git a/storage/xtradb/include/ut0lst.h b/storage/xtradb/include/ut0lst.h new file mode 100644 index 00000000000..46ee23a2538 --- /dev/null +++ b/storage/xtradb/include/ut0lst.h @@ -0,0 +1,243 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**********************************************************************
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0lst_h
+#define ut0lst_h
+
+#include "univ.i"
+
+/* This module implements the two-way linear list which should be used
+if a list is used in the database. Note that a single struct may belong
+to two or more lists, provided that the lists are given different names.
+An example of the usage of the lists can be found in fil0fil.c. */
+
+/***********************************************************************
+This macro expands to the unnamed type definition of a struct which acts
+as the two-way list base node. The base node contains pointers
+to both ends of the list and a count of nodes in the list (excluding
+the base node from the count). TYPE should be the list node type name. */
+
+#define UT_LIST_BASE_NODE_T(TYPE)\
+struct {\
+	ulint	count;	/* count of nodes in list */\
+	TYPE *	start;	/* pointer to list start, NULL if empty */\
+	TYPE *	end;	/* pointer to list end, NULL if empty */\
+}\
+
+/***********************************************************************
+This macro expands to the unnamed type definition of a struct which
+should be embedded in the nodes of the list, the node type must be a struct.
+This struct contains the pointers to next and previous nodes in the list.
+The name of the field in the node struct should be the name given
+to the list. TYPE should be the list node type name. Example of usage:
+
+typedef struct LRU_node_struct	LRU_node_t;
+struct LRU_node_struct {
+	UT_LIST_NODE_T(LRU_node_t)	LRU_list;
+	...
+}
+The example implements an LRU list of name LRU_list. Its nodes are of type
+LRU_node_t.
+*/
+
+#define UT_LIST_NODE_T(TYPE)\
+struct {\
+	TYPE *	prev;	/* pointer to the previous node,\
+			NULL if start of list */\
+	TYPE *	next;	/* pointer to next node, NULL if end of list */\
+}\
+
+/***********************************************************************
+Initializes the base node of a two-way list. */
+
+#define UT_LIST_INIT(BASE)\
+{\
+	(BASE).count = 0;\
+	(BASE).start = NULL;\
+	(BASE).end = NULL;\
+}\
+
+/***********************************************************************
+Adds the node as the first element in a two-way linked list.
+BASE has to be the base node (not a pointer to it). N has to be
+the pointer to the node to be added to the list. NAME is the list name. */
+
+#define UT_LIST_ADD_FIRST(NAME, BASE, N)\
+{\
+	ut_ad(N);\
+	((BASE).count)++;\
+	((N)->NAME).next = (BASE).start;\
+	((N)->NAME).prev = NULL;\
+	if (UNIV_LIKELY((BASE).start != NULL)) {\
+		ut_ad((BASE).start != (N));\
+		(((BASE).start)->NAME).prev = (N);\
+	}\
+	(BASE).start = (N);\
+	if (UNIV_UNLIKELY((BASE).end == NULL)) {\
+		(BASE).end = (N);\
+	}\
+}\
+
+/***********************************************************************
+Adds the node as the last element in a two-way linked list.
+BASE has to be the base node (not a pointer to it). N has to be
+the pointer to the node to be added to the list. NAME is the list name.
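+
+Example: with the LRU_node_t type from the UT_LIST_NODE_T example above,
+UT_LIST_ADD_LAST(LRU_list, lru_base, node) appends node to the list,
+lru_base being a UT_LIST_BASE_NODE_T(LRU_node_t) base node and node an
+LRU_node_t* (both names illustrative only).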
*/ + +#define UT_LIST_ADD_LAST(NAME, BASE, N)\ +{\ + ut_ad(N);\ + ((BASE).count)++;\ + ((N)->NAME).prev = (BASE).end;\ + ((N)->NAME).next = NULL;\ + if ((BASE).end != NULL) {\ + ut_ad((BASE).end != (N));\ + (((BASE).end)->NAME).next = (N);\ + }\ + (BASE).end = (N);\ + if ((BASE).start == NULL) {\ + (BASE).start = (N);\ + }\ +}\ + +/*********************************************************************** +Inserts a NODE2 after NODE1 in a list. +BASE has to be the base node (not a pointer to it). NAME is the list +name, NODE1 and NODE2 are pointers to nodes. */ + +#define UT_LIST_INSERT_AFTER(NAME, BASE, NODE1, NODE2)\ +{\ + ut_ad(NODE1);\ + ut_ad(NODE2);\ + ut_ad((NODE1) != (NODE2));\ + ((BASE).count)++;\ + ((NODE2)->NAME).prev = (NODE1);\ + ((NODE2)->NAME).next = ((NODE1)->NAME).next;\ + if (((NODE1)->NAME).next != NULL) {\ + ((((NODE1)->NAME).next)->NAME).prev = (NODE2);\ + }\ + ((NODE1)->NAME).next = (NODE2);\ + if ((BASE).end == (NODE1)) {\ + (BASE).end = (NODE2);\ + }\ +}\ + +/* Invalidate the pointers in a list node. */ +#ifdef UNIV_LIST_DEBUG +# define UT_LIST_REMOVE_CLEAR(NAME, N) \ +((N)->NAME.prev = (N)->NAME.next = (void*) -1) +#else +# define UT_LIST_REMOVE_CLEAR(NAME, N) while (0) +#endif + +/*********************************************************************** +Removes a node from a two-way linked list. BASE has to be the base node +(not a pointer to it). N has to be the pointer to the node to be removed +from the list. NAME is the list name. */ + +#define UT_LIST_REMOVE(NAME, BASE, N) \ +do { \ + ut_ad(N); \ + ut_a((BASE).count > 0); \ + ((BASE).count)--; \ + if (((N)->NAME).next != NULL) { \ + ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev; \ + } else { \ + (BASE).end = ((N)->NAME).prev; \ + } \ + if (((N)->NAME).prev != NULL) { \ + ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next; \ + } else { \ + (BASE).start = ((N)->NAME).next; \ + } \ + UT_LIST_REMOVE_CLEAR(NAME, N); \ +} while (0) + +/************************************************************************ +Gets the next node in a two-way list. NAME is the name of the list +and N is pointer to a node. */ + +#define UT_LIST_GET_NEXT(NAME, N)\ + (((N)->NAME).next) + +/************************************************************************ +Gets the previous node in a two-way list. NAME is the name of the list +and N is pointer to a node. */ + +#define UT_LIST_GET_PREV(NAME, N)\ + (((N)->NAME).prev) + +/************************************************************************ +Alternative macro to get the number of nodes in a two-way list, i.e., +its length. BASE is the base node (not a pointer to it). */ + +#define UT_LIST_GET_LEN(BASE)\ + (BASE).count + +/************************************************************************ +Gets the first node in a two-way list, or returns NULL, +if the list is empty. BASE is the base node (not a pointer to it). */ + +#define UT_LIST_GET_FIRST(BASE)\ + (BASE).start + +/************************************************************************ +Gets the last node in a two-way list, or returns NULL, +if the list is empty. BASE is the base node (not a pointer to it). */ + +#define UT_LIST_GET_LAST(BASE)\ + (BASE).end + +/************************************************************************ +Checks the consistency of a two-way list. NAME is the name of the list, +TYPE is the node type, and BASE is the base node (not a pointer to it). 
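+
+Example: UT_LIST_VALIDATE(LRU_list, LRU_node_t, lru_base) walks the
+list from both ends and asserts that exactly count nodes are linked in
+each direction (lru_base as in the UT_LIST_ADD_LAST example above).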
*/
+
+#define UT_LIST_VALIDATE(NAME, TYPE, BASE)\
+{\
+	ulint	ut_list_i_313;\
+	TYPE *	ut_list_node_313;\
+\
+	ut_list_node_313 = (BASE).start;\
+\
+	for (ut_list_i_313 = 0; ut_list_i_313 < (BASE).count;\
+	     ut_list_i_313++) {\
+		ut_a(ut_list_node_313);\
+		ut_list_node_313 = (ut_list_node_313->NAME).next;\
+	}\
+\
+	ut_a(ut_list_node_313 == NULL);\
+\
+	ut_list_node_313 = (BASE).end;\
+\
+	for (ut_list_i_313 = 0; ut_list_i_313 < (BASE).count;\
+	     ut_list_i_313++) {\
+		ut_a(ut_list_node_313);\
+		ut_list_node_313 = (ut_list_node_313->NAME).prev;\
+	}\
+\
+	ut_a(ut_list_node_313 == NULL);\
+}\
+
+
+#endif
+
diff --git a/storage/xtradb/include/ut0mem.h b/storage/xtradb/include/ut0mem.h
new file mode 100644
index 00000000000..f8dec99ed4a
--- /dev/null
+++ b/storage/xtradb/include/ut0mem.h
@@ -0,0 +1,271 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***********************************************************************
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+#include "os0sync.h"
+#include <string.h>
+
+/* The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc(). Does not count malloc()
+if srv_use_sys_malloc is set. Protected by ut_list_mutex. */
+extern ulint		ut_total_allocated_memory;
+
+/* Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+extern os_fast_mutex_t	ut_list_mutex;
+
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n);
+
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n);
+
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n);
+
+/**************************************************************************
+Initializes the mem block list at database startup. */
+UNIV_INTERN
+void
+ut_mem_init(void);
+/*=============*/
+
+/**************************************************************************
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined and set_to_zero is TRUE. */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+					/* out, own: allocated memory */
+	ulint	n,			/* in: number of bytes to allocate */
+	ibool	set_to_zero,		/* in: TRUE if allocated memory
+					should be set to zero if
+					UNIV_SET_MEM_TO_ZERO is defined */
+	ibool	assert_on_error);	/* in: if TRUE, we crash mysqld if
+					the memory cannot be allocated */
+/**************************************************************************
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined.
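+
+Example: byte* buf = ut_malloc(100); ... ut_free(buf);
+No NULL check is needed: ut_malloc() asserts if memory runs out (use
+ut_test_malloc() below to probe for memory without asserting).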
*/
+UNIV_INTERN
+void*
+ut_malloc(
+/*======*/
+			/* out, own: allocated memory */
+	ulint	n);	/* in: number of bytes to allocate */
+/**************************************************************************
+Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs
+out. It cannot be used if we want to return an error message. Prints to
+stderr a message if fails. */
+UNIV_INTERN
+ibool
+ut_test_malloc(
+/*===========*/
+			/* out: TRUE if succeeded */
+	ulint	n);	/* in: try to allocate this many bytes */
+/**************************************************************************
+Frees a memory block allocated with ut_malloc. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+	void* ptr);	/* in, own: memory block */
+/**************************************************************************
+Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+use this function because the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+	realloc() changes the size of the memory block pointed to
+	by ptr to size bytes. The contents will be unchanged to
+	the minimum of the old and new sizes; newly allocated memory
+	will be uninitialized. If ptr is NULL, the call is
+	equivalent to malloc(size); if size is equal to zero, the
+	call is equivalent to free(ptr). Unless ptr is NULL, it
+	must have been returned by an earlier call to malloc(),
+	calloc() or realloc().
+
+RETURN VALUE
+	realloc() returns a pointer to the newly allocated memory,
+	which is suitably aligned for any kind of variable and may
+	be different from ptr, or NULL if the request fails. If
+	size was equal to 0, either NULL or a pointer suitable to
+	be passed to free() is returned. If realloc() fails the
+	original block is left untouched - it is not freed or
+	moved. */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+			/* out, own: pointer to new mem block or NULL */
+	void*	ptr,	/* in: pointer to old block or NULL */
+	ulint	size);	/* in: desired size */
+/**************************************************************************
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void);
+/*=================*/
+
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour);
+
+UNIV_INLINE
+ulint
+ut_strlen(const char* str);
+
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2);
+
+/**************************************************************************
+Copies up to size - 1 characters from the NUL-terminated string src to
+dst, NUL-terminating the result. Returns strlen(src), so truncation
+occurred if the return value >= size. */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+				/* out: strlen(src) */
+	char*		dst,	/* in: destination buffer */
+	const char*	src,	/* in: source buffer */
+	ulint		size);	/* in: size of destination buffer */
+
+/**************************************************************************
+Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
+(size - 1) bytes of src, not the first. */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+				/* out: strlen(src) */
+	char*		dst,	/* in: destination buffer */
+	const char*	src,	/* in: source buffer */
+	ulint		size);	/* in: size of destination buffer */
+
+/**************************************************************************
+Compute strlen(ut_strcpyq(str, q)).
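+
+Example: ut_strlenq("don't", '\'') == 6, since the embedded quote is
+counted twice to account for its escaping (doubled) copy.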
*/ +UNIV_INLINE +ulint +ut_strlenq( +/*=======*/ + /* out: length of the string when quoted */ + const char* str, /* in: null-terminated string */ + char q); /* in: the quote character */ + +/************************************************************************** +Make a quoted copy of a NUL-terminated string. Leading and trailing +quotes will not be included; only embedded quotes will be escaped. +See also ut_strlenq() and ut_memcpyq(). */ +UNIV_INTERN +char* +ut_strcpyq( +/*=======*/ + /* out: pointer to end of dest */ + char* dest, /* in: output buffer */ + char q, /* in: the quote character */ + const char* src); /* in: null-terminated string */ + +/************************************************************************** +Make a quoted copy of a fixed-length string. Leading and trailing +quotes will not be included; only embedded quotes will be escaped. +See also ut_strlenq() and ut_strcpyq(). */ +UNIV_INTERN +char* +ut_memcpyq( +/*=======*/ + /* out: pointer to end of dest */ + char* dest, /* in: output buffer */ + char q, /* in: the quote character */ + const char* src, /* in: string to be quoted */ + ulint len); /* in: length of src */ + +/************************************************************************** +Return the number of times s2 occurs in s1. Overlapping instances of s2 +are only counted once. */ +UNIV_INTERN +ulint +ut_strcount( +/*========*/ + /* out: the number of times s2 occurs in s1 */ + const char* s1, /* in: string to search in */ + const char* s2); /* in: string to search for */ + +/************************************************************************** +Replace every occurrence of s1 in str with s2. Overlapping instances of s1 +are only replaced once. */ +UNIV_INTERN +char* +ut_strreplace( +/*==========*/ + /* out, own: modified string, must be + freed with mem_free() */ + const char* str, /* in: string to operate on */ + const char* s1, /* in: string to replace */ + const char* s2); /* in: string to replace s1 with */ + +/************************************************************************** +Converts a raw binary data to a '\0'-terminated hex string. The output is +truncated if there is not enough space in "hex", make sure "hex_size" is at +least (2 * raw_size + 1) if you do not want this to happen. Returns the +actual number of characters written to "hex" (including the '\0'). */ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + /* out: number of chars written */ + const void* raw, /* in: raw data */ + ulint raw_size, /* in: "raw" length in bytes */ + char* hex, /* out: hex string */ + ulint hex_size); /* in: "hex" size in bytes */ + +/*********************************************************************** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. Returns the number of bytes that were written to "buf" +(including the terminating '\0'). If buf_size is too small then the +trailing bytes from "str" are discarded. 
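+
+Example: formatting the 4-byte string it's with a buffer of at least
+8 bytes yields 'it''s' and returns 8 (seven characters plus the
+trailing NUL).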
*/ +UNIV_INLINE +ulint +ut_str_sql_format( +/*==============*/ + /* out: number of bytes + that were written */ + const char* str, /* in: string */ + ulint str_len, /* in: string length in bytes */ + char* buf, /* out: output buffer */ + ulint buf_size); /* in: output buffer size + in bytes */ + +#ifndef UNIV_NONINL +#include "ut0mem.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0mem.ic b/storage/xtradb/include/ut0mem.ic new file mode 100644 index 00000000000..5078c721706 --- /dev/null +++ b/storage/xtradb/include/ut0mem.ic @@ -0,0 +1,308 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/*********************************************************************** +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#include "ut0byte.h" +#include "mach0data.h" + +UNIV_INLINE +void* +ut_memcpy(void* dest, const void* sour, ulint n) +{ + return(memcpy(dest, sour, n)); +} + +UNIV_INLINE +void* +ut_memmove(void* dest, const void* sour, ulint n) +{ + return(memmove(dest, sour, n)); +} + +UNIV_INLINE +int +ut_memcmp(const void* str1, const void* str2, ulint n) +{ + return(memcmp(str1, str2, n)); +} + +UNIV_INLINE +char* +ut_strcpy(char* dest, const char* sour) +{ + return(strcpy(dest, sour)); +} + +UNIV_INLINE +ulint +ut_strlen(const char* str) +{ + return(strlen(str)); +} + +UNIV_INLINE +int +ut_strcmp(const char* str1, const char* str2) +{ + return(strcmp(str1, str2)); +} + +/************************************************************************** +Compute strlen(ut_strcpyq(str, q)). */ +UNIV_INLINE +ulint +ut_strlenq( +/*=======*/ + /* out: length of the string when quoted */ + const char* str, /* in: null-terminated string */ + char q) /* in: the quote character */ +{ + ulint len; + + for (len = 0; *str; len++, str++) { + if (*str == q) { + len++; + } + } + + return(len); +} + +/************************************************************************** +Converts a raw binary data to a '\0'-terminated hex string. The output is +truncated if there is not enough space in "hex", make sure "hex_size" is at +least (2 * raw_size + 1) if you do not want this to happen. Returns the +actual number of characters written to "hex" (including the '\0'). 
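+
+Example: for the two raw bytes 0xAB and 0x01 and hex_size >= 5 the
+output is the string AB01 and the return value is 5 (four hex digits
+plus the trailing NUL).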
*/ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + /* out: number of chars written */ + const void* raw, /* in: raw data */ + ulint raw_size, /* in: "raw" length in bytes */ + char* hex, /* out: hex string */ + ulint hex_size) /* in: "hex" size in bytes */ +{ + +#ifdef WORDS_BIGENDIAN + +#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b)) + +#define UINT16_GET_A(u) ((unsigned char) ((u) >> 8)) +#define UINT16_GET_B(u) ((unsigned char) ((u) & 0xFF)) + +#else /* WORDS_BIGENDIAN */ + +#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a)) + +#define UINT16_GET_A(u) ((unsigned char) ((u) & 0xFF)) +#define UINT16_GET_B(u) ((unsigned char) ((u) >> 8)) + +#endif /* WORDS_BIGENDIAN */ + +#define MK_ALL_UINT16_WITH_A(a) \ + MK_UINT16(a, '0'), \ + MK_UINT16(a, '1'), \ + MK_UINT16(a, '2'), \ + MK_UINT16(a, '3'), \ + MK_UINT16(a, '4'), \ + MK_UINT16(a, '5'), \ + MK_UINT16(a, '6'), \ + MK_UINT16(a, '7'), \ + MK_UINT16(a, '8'), \ + MK_UINT16(a, '9'), \ + MK_UINT16(a, 'A'), \ + MK_UINT16(a, 'B'), \ + MK_UINT16(a, 'C'), \ + MK_UINT16(a, 'D'), \ + MK_UINT16(a, 'E'), \ + MK_UINT16(a, 'F') + + static const uint16 hex_map[256] = { + MK_ALL_UINT16_WITH_A('0'), + MK_ALL_UINT16_WITH_A('1'), + MK_ALL_UINT16_WITH_A('2'), + MK_ALL_UINT16_WITH_A('3'), + MK_ALL_UINT16_WITH_A('4'), + MK_ALL_UINT16_WITH_A('5'), + MK_ALL_UINT16_WITH_A('6'), + MK_ALL_UINT16_WITH_A('7'), + MK_ALL_UINT16_WITH_A('8'), + MK_ALL_UINT16_WITH_A('9'), + MK_ALL_UINT16_WITH_A('A'), + MK_ALL_UINT16_WITH_A('B'), + MK_ALL_UINT16_WITH_A('C'), + MK_ALL_UINT16_WITH_A('D'), + MK_ALL_UINT16_WITH_A('E'), + MK_ALL_UINT16_WITH_A('F') + }; + const unsigned char* rawc; + ulint read_bytes; + ulint write_bytes; + ulint i; + + rawc = (const unsigned char*) raw; + + if (hex_size == 0) { + + return(0); + } + + if (hex_size <= 2 * raw_size) { + + read_bytes = hex_size / 2; + write_bytes = hex_size; + } else { + + read_bytes = raw_size; + write_bytes = 2 * raw_size + 1; + } + +#define LOOP_READ_BYTES(ASSIGN) \ + for (i = 0; i < read_bytes; i++) { \ + ASSIGN; \ + hex += 2; \ + rawc++; \ + } + + if (ut_align_offset(hex, 2) == 0) { + + LOOP_READ_BYTES( + *(uint16*) hex = hex_map[*rawc] + ); + } else { + + LOOP_READ_BYTES( + *hex = UINT16_GET_A(hex_map[*rawc]); + *(hex + 1) = UINT16_GET_B(hex_map[*rawc]) + ); + } + + if (hex_size <= 2 * raw_size && hex_size % 2 == 0) { + + hex--; + } + + *hex = '\0'; + + return(write_bytes); +} + +/*********************************************************************** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. Returns the number of bytes that were written to "buf" +(including the terminating '\0'). If buf_size is too small then the +trailing bytes from "str" are discarded. 
*/
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+					/* out: number of bytes
+					that were written */
+	const char*	str,		/* in: string */
+	ulint		str_len,	/* in: string length in bytes */
+	char*		buf,		/* out: output buffer */
+	ulint		buf_size)	/* in: output buffer size
+					in bytes */
+{
+	ulint	str_i;
+	ulint	buf_i;
+
+	buf_i = 0;
+
+	switch (buf_size) {
+	case 3:
+
+		if (str_len == 0) {
+
+			buf[buf_i] = '\'';
+			buf_i++;
+			buf[buf_i] = '\'';
+			buf_i++;
+		}
+		/* FALLTHROUGH */
+	case 2:
+	case 1:
+
+		buf[buf_i] = '\0';
+		buf_i++;
+		/* FALLTHROUGH */
+	case 0:
+
+		return(buf_i);
+	}
+
+	/* buf_size >= 4 */
+
+	buf[0] = '\'';
+	buf_i = 1;
+
+	for (str_i = 0; str_i < str_len; str_i++) {
+
+		char	ch;
+
+		if (buf_size - buf_i == 2) {
+
+			break;
+		}
+
+		ch = str[str_i];
+
+		switch (ch) {
+		case '\0':
+
+			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+				goto func_exit;
+			}
+			buf[buf_i] = '\\';
+			buf_i++;
+			buf[buf_i] = '0';
+			buf_i++;
+			break;
+		case '\'':
+		case '\\':
+
+			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+				goto func_exit;
+			}
+			buf[buf_i] = ch;
+			buf_i++;
+			/* FALLTHROUGH */
+		default:
+
+			buf[buf_i] = ch;
+			buf_i++;
+		}
+	}
+
+func_exit:
+
+	buf[buf_i] = '\'';
+	buf_i++;
+	buf[buf_i] = '\0';
+	buf_i++;
+
+	return(buf_i);
+}
diff --git a/storage/xtradb/include/ut0rnd.h b/storage/xtradb/include/ut0rnd.h
new file mode 100644
index 00000000000..b9e23d7cd14
--- /dev/null
+++ b/storage/xtradb/include/ut0rnd.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**********************************************************************
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "univ.i"
+
+#include "ut0byte.h"
+
+/* The 'character code' for end of field or string (used
+in folding records) */
+#define UT_END_OF_FIELD		257
+
+/************************************************************
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+	ulint	seed);		/* in: seed */
+/************************************************************
+The following function generates a series of 'random' ulint integers. */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+			/* out: the next 'random' number */
+	ulint	rnd);	/* in: the previous random number value */
+/*************************************************************
+The following function generates 'random' ulint integers which
+enumerate the value space (let there be N of them) of ulint integers
+in a pseudo-random fashion. Note that the same integer is repeated
+always after N calls to the generator.
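+
+For example, if ulint is 32 bits wide, N == 2^32, so a given value
+reappears only after the whole 32-bit value space has been enumerated.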
*/ +UNIV_INLINE +ulint +ut_rnd_gen_ulint(void); +/*==================*/ + /* out: the 'random' number */ +/************************************************************ +Generates a random integer from a given interval. */ +UNIV_INLINE +ulint +ut_rnd_interval( +/*============*/ + /* out: the 'random' number */ + ulint low, /* in: low limit; can generate also this value */ + ulint high); /* in: high limit; can generate also this value */ +/************************************************************* +Generates a random iboolean value. */ +UNIV_INLINE +ibool +ut_rnd_gen_ibool(void); +/*=================*/ + /* out: the random value */ +/*********************************************************** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime or some +random number to work reliably. */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + /* out: hash value */ + ulint key, /* in: value to be hashed */ + ulint table_size); /* in: hash table size */ +/***************************************************************** +Folds a pair of ulints. */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + /* out: folded value */ + ulint n1, /* in: ulint */ + ulint n2) /* in: ulint */ + __attribute__((const)); +/***************************************************************** +Folds a dulint. */ +UNIV_INLINE +ulint +ut_fold_dulint( +/*===========*/ + /* out: folded value */ + dulint d) /* in: dulint */ + __attribute__((const)); +/***************************************************************** +Folds a character string ending in the null character. */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + /* out: folded value */ + const char* str) /* in: null-terminated string */ + __attribute__((pure)); +/***************************************************************** +Folds a binary string. */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + /* out: folded value */ + const byte* str, /* in: string of bytes */ + ulint len) /* in: length */ + __attribute__((pure)); +/*************************************************************** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. */ +UNIV_INTERN +ulint +ut_find_prime( +/*==========*/ + /* out: prime */ + ulint n) /* in: positive number > 100 */ + __attribute__((const)); + + +#ifndef UNIV_NONINL +#include "ut0rnd.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0rnd.ic b/storage/xtradb/include/ut0rnd.ic new file mode 100644 index 00000000000..d72100d16a1 --- /dev/null +++ b/storage/xtradb/include/ut0rnd.ic @@ -0,0 +1,228 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************************** +Random numbers and hashing + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#define UT_HASH_RANDOM_MASK 1463735687 +#define UT_HASH_RANDOM_MASK2 1653893711 +#define UT_RND1 151117737 +#define UT_RND2 119785373 +#define UT_RND3 85689495 +#define UT_RND4 76595339 +#define UT_SUM_RND2 98781234 +#define UT_SUM_RND3 126792457 +#define UT_SUM_RND4 63498502 +#define UT_XOR_RND1 187678878 +#define UT_XOR_RND2 143537923 + +extern ulint ut_rnd_ulint_counter; + +/************************************************************ +This is used to set the random number seed. */ +UNIV_INLINE +void +ut_rnd_set_seed( +/*============*/ + ulint seed) /* in: seed */ +{ + ut_rnd_ulint_counter = seed; +} + +/************************************************************ +The following function generates a series of 'random' ulint integers. */ +UNIV_INLINE +ulint +ut_rnd_gen_next_ulint( +/*==================*/ + /* out: the next 'random' number */ + ulint rnd) /* in: the previous random number value */ +{ + ulint n_bits; + + n_bits = 8 * sizeof(ulint); + + rnd = UT_RND2 * rnd + UT_SUM_RND3; + rnd = UT_XOR_RND1 ^ rnd; + rnd = (rnd << 20) + (rnd >> (n_bits - 20)); + rnd = UT_RND3 * rnd + UT_SUM_RND4; + rnd = UT_XOR_RND2 ^ rnd; + rnd = (rnd << 20) + (rnd >> (n_bits - 20)); + rnd = UT_RND1 * rnd + UT_SUM_RND2; + + return(rnd); +} + +/************************************************************ +The following function generates 'random' ulint integers which +enumerate the value space of ulint integers in a pseudo random +fashion. Note that the same integer is repeated always after +2 to power 32 calls to the generator (if ulint is 32-bit). */ +UNIV_INLINE +ulint +ut_rnd_gen_ulint(void) +/*==================*/ + /* out: the 'random' number */ +{ + ulint rnd; + ulint n_bits; + + n_bits = 8 * sizeof(ulint); + + ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2; + + rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter); + + return(rnd); +} + +/************************************************************ +Generates a random integer from a given interval. */ +UNIV_INLINE +ulint +ut_rnd_interval( +/*============*/ + /* out: the 'random' number */ + ulint low, /* in: low limit; can generate also this value */ + ulint high) /* in: high limit; can generate also this value */ +{ + ulint rnd; + + ut_ad(high >= low); + + if (low == high) { + + return(low); + } + + rnd = ut_rnd_gen_ulint(); + + return(low + (rnd % (high - low + 1))); +} + +/************************************************************* +Generates a random iboolean value. */ +UNIV_INLINE +ibool +ut_rnd_gen_ibool(void) +/*=================*/ + /* out: the random value */ +{ + ulint x; + + x = ut_rnd_gen_ulint(); + + if (((x >> 20) + (x >> 15)) & 1) { + + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime +or some random number for the hash table to work reliably. 
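[A side note on the (rnd << 20) + (rnd >> (n_bits - 20)) steps in ut_rnd_gen_next_ulint() just above: each is a 20-bit left rotation. The two shifted parts occupy disjoint bit ranges, so '+' behaves exactly like '|'. In isolation, assuming a 32-bit ulint:

    static ulint
    rotl20(ulint x)
    {
            return((x << 20) | (x >> 12));  /* == (x << 20) + (x >> 12) */
    }

The multiply-add and XOR constants (UT_RND*, UT_SUM_RND*, UT_XOR_RND*) then mix the rotated bits further.]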
*/ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + /* out: hash value */ + ulint key, /* in: value to be hashed */ + ulint table_size) /* in: hash table size */ +{ + key = key ^ UT_HASH_RANDOM_MASK2; + + return(key % table_size); +} + +/***************************************************************** +Folds a pair of ulints. */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + /* out: folded value */ + ulint n1, /* in: ulint */ + ulint n2) /* in: ulint */ +{ + return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1) + ^ UT_HASH_RANDOM_MASK) + n2); +} + +/***************************************************************** +Folds a dulint. */ +UNIV_INLINE +ulint +ut_fold_dulint( +/*===========*/ + /* out: folded value */ + dulint d) /* in: dulint */ +{ + return(ut_fold_ulint_pair(ut_dulint_get_low(d), + ut_dulint_get_high(d))); +} + +/***************************************************************** +Folds a character string ending in the null character. */ +UNIV_INLINE +ulint +ut_fold_string( +/*===========*/ + /* out: folded value */ + const char* str) /* in: null-terminated string */ +{ + ulint fold = 0; + + ut_ad(str); + + while (*str != '\0') { + fold = ut_fold_ulint_pair(fold, (ulint)(*str)); + str++; + } + + return(fold); +} + +/***************************************************************** +Folds a binary string. */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + /* out: folded value */ + const byte* str, /* in: string of bytes */ + ulint len) /* in: length */ +{ + const byte* str_end = str + len; + ulint fold = 0; + + ut_ad(str || !len); + + while (str < str_end) { + fold = ut_fold_ulint_pair(fold, (ulint)(*str)); + + str++; + } + + return(fold); +} diff --git a/storage/xtradb/include/ut0sort.h b/storage/xtradb/include/ut0sort.h new file mode 100644 index 00000000000..5fd5db54832 --- /dev/null +++ b/storage/xtradb/include/ut0sort.h @@ -0,0 +1,105 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Sort utility + +Created 11/9/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0sort_h +#define ut0sort_h + +#include "univ.i" + +/* This module gives a macro definition of the body of +a standard sort function for an array of elements of any +type. The comparison function is given as a parameter to +the macro. The sort algorithm is mergesort which has logarithmic +worst case. +*/ + +/*********************************************************************** +This macro expands to the body of a standard sort function. +The sort function uses mergesort and must be defined separately +for each type of array. 
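[Looking back at ut0rnd.ic for a moment: the folding helpers there are meant to feed ut_hash_ulint(). A sketch, where the name is hypothetical and 997 is an arbitrary prime table size:

    ulint   fold = ut_fold_string("db_name/table_name");
    ulint   cell = ut_hash_ulint(fold, 997);    /* bucket index */
]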
+Also the comparison function has to be defined individually +for each array cell type. SORT_FUN is the sort function name. +The function takes the array to be sorted (ARR), +the array of auxiliary space (AUX_ARR) of same size, +and the low (LOW), inclusive, and high (HIGH), noninclusive, +limits for the sort interval as arguments. +CMP_FUN is the comparison function name. It takes as arguments +two elements from the array and returns 1, if the first is bigger, +0 if equal, and -1 if the second bigger. */ + +#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\ +{\ + ulint ut_sort_mid77;\ + ulint ut_sort_i77;\ + ulint ut_sort_low77;\ + ulint ut_sort_high77;\ +\ + ut_ad((LOW) < (HIGH));\ + ut_ad(ARR);\ + ut_ad(AUX_ARR);\ +\ + if ((LOW) == (HIGH) - 1) {\ + return;\ + } else if ((LOW) == (HIGH) - 2) {\ + if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\ + (AUX_ARR)[LOW] = (ARR)[LOW];\ + (ARR)[LOW] = (ARR)[(HIGH) - 1];\ + (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\ + }\ + return;\ + }\ +\ + ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\ +\ + SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\ + SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\ +\ + ut_sort_low77 = (LOW);\ + ut_sort_high77 = ut_sort_mid77;\ +\ + for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\ +\ + if (ut_sort_low77 >= ut_sort_mid77) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else if (ut_sort_high77 >= (HIGH)) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + } else if (CMP_FUN((ARR)[ut_sort_low77],\ + (ARR)[ut_sort_high77]) > 0) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + }\ + }\ +\ + memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\ + ((HIGH) - (LOW)) * sizeof *(ARR));\ +}\ + + +#endif + diff --git a/storage/xtradb/include/ut0ut.h b/storage/xtradb/include/ut0ut.h new file mode 100644 index 00000000000..3ca14acd2ef --- /dev/null +++ b/storage/xtradb/include/ut0ut.h @@ -0,0 +1,328 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
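[To make the macro above concrete, here is how a sort function for a plain ulint array could be instantiated; the function names are hypothetical, and ut_ad()/memcpy come from ut0dbg.h and <string.h> as in the rest of this code base.

    static int
    ulint_cmp(ulint a, ulint b)
    {
            return(a > b ? 1 : a < b ? -1 : 0);
    }

    static void
    ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high)
    {
            UT_SORT_FUNCTION_BODY(ulint_sort, arr, aux_arr, low, high,
                                  ulint_cmp);
    }

The macro recurses by calling the function it is embedded in (ulint_sort here), which is why SORT_FUN must name the enclosing function.]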
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Various utilities + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0ut_h +#define ut0ut_h + +#include "univ.i" +#include +#ifndef MYSQL_SERVER +#include +#endif + +#define TEMP_INDEX_PREFIX '\377' /* Index name prefix in fast index + creation */ + +typedef time_t ib_time_t; + +/************************************************************************* +Delays execution for at most max_wait_us microseconds or returns earlier +if cond becomes true; cond is evaluated every 2 ms. */ + +#define UT_WAIT_FOR(cond, max_wait_us) \ +do { \ + ullint start_us; \ + start_us = ut_time_us(NULL); \ + while (!(cond) \ + && ut_time_us(NULL) - start_us < (max_wait_us)) {\ + \ + os_thread_sleep(2000 /* 2 ms */); \ + } \ +} while (0) + +/************************************************************ +Gets the high 32 bits in a ulint. That is makes a shift >> 32, +but since there seem to be compiler bugs in both gcc and Visual C++, +we do this by a special conversion. */ +UNIV_INTERN +ulint +ut_get_high32( +/*==========*/ + /* out: a >> 32 */ + ulint a); /* in: ulint */ +/********************************************************** +Calculates the minimum of two ulints. */ +UNIV_INLINE +ulint +ut_min( +/*===*/ + /* out: minimum */ + ulint n1, /* in: first number */ + ulint n2); /* in: second number */ +/********************************************************** +Calculates the maximum of two ulints. */ +UNIV_INLINE +ulint +ut_max( +/*===*/ + /* out: maximum */ + ulint n1, /* in: first number */ + ulint n2); /* in: second number */ +/******************************************************************** +Calculates minimum of two ulint-pairs. */ +UNIV_INLINE +void +ut_pair_min( +/*========*/ + ulint* a, /* out: more significant part of minimum */ + ulint* b, /* out: less significant part of minimum */ + ulint a1, /* in: more significant part of first pair */ + ulint b1, /* in: less significant part of first pair */ + ulint a2, /* in: more significant part of second pair */ + ulint b2); /* in: less significant part of second pair */ +/********************************************************** +Compares two ulints. */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + /* out: 1 if a > b, 0 if a == b, -1 if a < b */ + ulint a, /* in: ulint */ + ulint b); /* in: ulint */ +/*********************************************************** +Compares two pairs of ulints. */ +UNIV_INLINE +int +ut_pair_cmp( +/*========*/ + /* out: -1 if a < b, 0 if a == b, + 1 if a > b */ + ulint a1, /* in: more significant part of first pair */ + ulint a2, /* in: less significant part of first pair */ + ulint b1, /* in: more significant part of second pair */ + ulint b2); /* in: less significant part of second pair */ +/***************************************************************** +Determines if a number is zero or a power of two. */ +#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1))) +/***************************************************************** +Calculates fast the remainder of n/m when m is a power of two. 
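[A quick check of the bit trick just described, exercising the define that follows: with m = 8, m - 1 = 7 masks in exactly the low three bits of n.

    ut_a(ut_2pow_remainder(29, 8) == 29 % 8);   /* 29 & 7 == 5 */
]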
*/ +#define ut_2pow_remainder(n, m) ((n) & ((m) - 1)) +/***************************************************************** +Calculates the biggest multiple of m that is not bigger than n +when m is a power of two. In other words, rounds n down to m * k. */ +#define ut_2pow_round(n, m) ((n) & ~((m) - 1)) +#define ut_calc_align_down(n, m) ut_2pow_round(n, m) +/************************************************************ +Calculates the smallest multiple of m that is not smaller than n +when m is a power of two. In other words, rounds n up to m * k. */ +#define ut_calc_align(n, m) (((n) + ((m) - 1)) & ~((m) - 1)) +/***************************************************************** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + /* out: logarithm in the base 2, rounded upward */ + ulint n); /* in: number */ +/***************************************************************** +Calculates 2 to power n. */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + /* out: 2 to power n */ + ulint n); /* in: number */ +/***************************************************************** +Calculates fast the number rounded up to the nearest power of 2. */ +UNIV_INTERN +ulint +ut_2_power_up( +/*==========*/ + /* out: first power of 2 which is >= n */ + ulint n) /* in: number != 0 */ + __attribute__((const)); + +/* Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8) + +/************************************************************** +Returns system time. We do not specify the format of the time returned: +the only way to manipulate it is to use the function ut_difftime. */ +UNIV_INTERN +ib_time_t +ut_time(void); +/*=========*/ +/************************************************************** +Returns system time. +Upon successful completion, the value 0 is returned; otherwise the +value -1 is returned and the global variable errno is set to indicate the +error. */ +UNIV_INTERN +int +ut_usectime( +/*========*/ + /* out: 0 on success, -1 otherwise */ + ulint* sec, /* out: seconds since the Epoch */ + ulint* ms); /* out: microseconds since the Epoch+*sec */ + +/************************************************************** +Returns the number of microseconds since epoch. Similar to +time(3), the return value is also stored in *tloc, provided +that tloc is non-NULL. */ +UNIV_INTERN +ullint +ut_time_us( +/*=======*/ + /* out: us since epoch */ + ullint* tloc); /* out: us since epoch, if non-NULL */ + +/************************************************************** +Returns the difference of two times in seconds. */ +UNIV_INTERN +double +ut_difftime( +/*========*/ + /* out: time2 - time1 expressed in seconds */ + ib_time_t time2, /* in: time */ + ib_time_t time1); /* in: time */ +/************************************************************** +Prints a timestamp to a file. */ +UNIV_INTERN +void +ut_print_timestamp( +/*===============*/ + FILE* file); /* in: file where to print */ +/************************************************************** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */ +UNIV_INTERN +void +ut_sprintf_timestamp( +/*=================*/ + char* buf); /* in: buffer where to sprintf */ +#ifdef UNIV_HOTBACKUP +/************************************************************** +Sprintfs a timestamp to a buffer with no spaces and with ':' characters +replaced by '_'. 
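[A few worked values for the rounding macros above, taking a hypothetical power-of-two block size of 512:

    ut_calc_align(1000, 512)    /* == 1024, rounded up   */
    ut_2pow_round(1000, 512)    /* ==  512, rounded down */
    UT_BITS_IN_BYTES(13)        /* ==    2, 13 bits fit in 2 bytes */
]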
*/ +UNIV_INTERN +void +ut_sprintf_timestamp_without_extra_chars( +/*=====================================*/ + char* buf); /* in: buffer where to sprintf */ +/************************************************************** +Returns current year, month, day. */ +UNIV_INTERN +void +ut_get_year_month_day( +/*==================*/ + ulint* year, /* out: current year */ + ulint* month, /* out: month */ + ulint* day); /* out: day */ +#endif /* UNIV_HOTBACKUP */ +/***************************************************************** +Runs an idle loop on CPU. The argument gives the desired delay +in microseconds on 100 MHz Pentium + Visual C++. */ +UNIV_INTERN +ulint +ut_delay( +/*=====*/ + /* out: dummy value */ + ulint delay); /* in: delay in microseconds on 100 MHz Pentium */ +/***************************************************************** +Prints the contents of a memory buffer in hex and ascii. */ +UNIV_INTERN +void +ut_print_buf( +/*=========*/ + FILE* file, /* in: file where to print */ + const void* buf, /* in: memory buffer */ + ulint len); /* in: length of the buffer */ + +/************************************************************************** +Outputs a NUL-terminated file name, quoted with apostrophes. */ +UNIV_INTERN +void +ut_print_filename( +/*==============*/ + FILE* f, /* in: output stream */ + const char* name); /* in: name to print */ + +/* Forward declaration of transaction handle */ +struct trx_struct; + +/************************************************************************** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_name( +/*==========*/ + FILE* f, /* in: output stream */ + struct trx_struct*trx, /* in: transaction */ + ibool table_id,/* in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name); /* in: name to print */ + +/************************************************************************** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_namel( +/*===========*/ + FILE* f, /* in: output stream */ + struct trx_struct*trx, /* in: transaction (NULL=no quotes) */ + ibool table_id,/* in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name, /* in: name to print */ + ulint namelen);/* in: length of name */ + +/************************************************************************** +Catenate files. */ +UNIV_INTERN +void +ut_copy_file( +/*=========*/ + FILE* dest, /* in: output file */ + FILE* src); /* in: input file to be appended to output */ + +/************************************************************************** +snprintf(). */ + +#ifdef __WIN__ +int +ut_snprintf( + /* out: number of characters that would + have been printed if the size were + unlimited, not including the terminating + '\0'. 
*/ + char* str, /* out: string */ + size_t size, /* in: str size */ + const char* fmt, /* in: format */ + ...); /* in: format values */ +#else +#define ut_snprintf snprintf +#endif /* __WIN__ */ + +#ifndef UNIV_NONINL +#include "ut0ut.ic" +#endif + +#endif + diff --git a/storage/xtradb/include/ut0ut.ic b/storage/xtradb/include/ut0ut.ic new file mode 100644 index 00000000000..e4e0a2acce6 --- /dev/null +++ b/storage/xtradb/include/ut0ut.ic @@ -0,0 +1,161 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************************** +Various utilities + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/********************************************************** +Calculates the minimum of two ulints. */ +UNIV_INLINE +ulint +ut_min( +/*===*/ + /* out: minimum */ + ulint n1, /* in: first number */ + ulint n2) /* in: second number */ +{ + return((n1 <= n2) ? n1 : n2); +} + +/********************************************************** +Calculates the maximum of two ulints. */ +UNIV_INLINE +ulint +ut_max( +/*===*/ + /* out: maximum */ + ulint n1, /* in: first number */ + ulint n2) /* in: second number */ +{ + return((n1 <= n2) ? n2 : n1); +} + +/******************************************************************** +Calculates minimum of two ulint-pairs. */ +UNIV_INLINE +void +ut_pair_min( +/*========*/ + ulint* a, /* out: more significant part of minimum */ + ulint* b, /* out: less significant part of minimum */ + ulint a1, /* in: more significant part of first pair */ + ulint b1, /* in: less significant part of first pair */ + ulint a2, /* in: more significant part of second pair */ + ulint b2) /* in: less significant part of second pair */ +{ + if (a1 == a2) { + *a = a1; + *b = ut_min(b1, b2); + } else if (a1 < a2) { + *a = a1; + *b = b1; + } else { + *a = a2; + *b = b2; + } +} + +/********************************************************** +Compares two ulints. */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + /* out: 1 if a > b, 0 if a == b, -1 if a < b */ + ulint a, /* in: ulint */ + ulint b) /* in: ulint */ +{ + if (a < b) { + return(-1); + } else if (a == b) { + return(0); + } else { + return(1); + } +} + +/*********************************************************** +Compares two pairs of ulints. 
*/ +UNIV_INLINE +int +ut_pair_cmp( +/*========*/ + /* out: -1 if a < b, 0 if a == b, 1 if a > b */ + ulint a1, /* in: more significant part of first pair */ + ulint a2, /* in: less significant part of first pair */ + ulint b1, /* in: more significant part of second pair */ + ulint b2) /* in: less significant part of second pair */ +{ + if (a1 > b1) { + return(1); + } else if (a1 < b1) { + return(-1); + } else if (a2 > b2) { + return(1); + } else if (a2 < b2) { + return(-1); + } else { + return(0); + } +} + +/***************************************************************** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + /* out: logarithm in the base 2, rounded upward */ + ulint n) /* in: number != 0 */ +{ + ulint res; + + res = 0; + + ut_ad(n > 0); + + n = n - 1; + + for (;;) { + n = n / 2; + + if (n == 0) { + break; + } + + res++; + } + + return(res + 1); +} + +/***************************************************************** +Calculates 2 to power n. */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + /* out: 2 to power n */ + ulint n) /* in: number */ +{ + return((ulint) 1 << n); +} diff --git a/storage/xtradb/include/ut0vec.h b/storage/xtradb/include/ut0vec.h new file mode 100644 index 00000000000..aeb7e168dc6 --- /dev/null +++ b/storage/xtradb/include/ut0vec.h @@ -0,0 +1,117 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +#ifndef IB_VECTOR_H +#define IB_VECTOR_H + +#include "univ.i" +#include "mem0mem.h" + +typedef struct ib_vector_struct ib_vector_t; + +/* An automatically resizing vector datatype with the following properties: + + -Contains void* items. + + -The items are owned by the caller. + + -All memory allocation is done through a heap owned by the caller, who is + responsible for freeing it when done with the vector. + + -When the vector is resized, the old memory area is left allocated since it + uses the same heap as the new memory area, so this is best used for + relatively small or short-lived uses. +*/ + +/******************************************************************** +Create a new vector with the given initial size. */ +UNIV_INTERN +ib_vector_t* +ib_vector_create( +/*=============*/ + /* out: vector */ + mem_heap_t* heap, /* in: heap */ + ulint size); /* in: initial size */ + +/******************************************************************** +Push a new element to the vector, increasing its size if necessary. */ +UNIV_INTERN +void +ib_vector_push( +/*===========*/ + ib_vector_t* vec, /* in: vector */ + void* elem); /* in: data element */ + +/******************************************************************** +Get the number of elements in the vector. 
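[For the ut_2_log()/ut_2_exp() pair implemented just above in ut0ut.ic, the upward rounding means ut_2_exp(ut_2_log(n)) is the first power of two >= n for n > 1:

    ut_a(ut_2_log(5) == 3);     /* 2^3 == 8 is the first power >= 5 */
    ut_a(ut_2_exp(3) == 8);
]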
*/ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Test whether a vector is empty or not. */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + /* out: TRUE if empty */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Get the n'th element. */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + /* out: n'th element */ + ib_vector_t* vec, /* in: vector */ + ulint n); /* in: element index to get */ + +/******************************************************************** +Remove the last element from the vector. */ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Free the underlying heap of the vector. Note that vec is invalid +after this call. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec); /* in,own: vector */ + +/* See comment at beginning of file. */ +struct ib_vector_struct { + mem_heap_t* heap; /* heap */ + void** data; /* data elements */ + ulint used; /* number of elements currently used */ + ulint total; /* number of elements allocated */ +}; + +#ifndef UNIV_NONINL +#include "ut0vec.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0vec.ic b/storage/xtradb/include/ut0vec.ic new file mode 100644 index 00000000000..b0e853717e3 --- /dev/null +++ b/storage/xtradb/include/ut0vec.ic @@ -0,0 +1,88 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************** +Get number of elements in vector. */ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec) /* in: vector */ +{ + return(vec->used); +} + +/******************************************************************** +Get n'th element. */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + /* out: n'th element */ + ib_vector_t* vec, /* in: vector */ + ulint n) /* in: element index to get */ +{ + ut_a(n < vec->used); + + return(vec->data[n]); +} + +/******************************************************************** +Remove the last element from the vector. 
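[A usage sketch of the vector API above; the heap size, initial capacity and element are placeholders.

    mem_heap_t*     heap = mem_heap_create(1024);
    ib_vector_t*    vec = ib_vector_create(heap, 4);
    void*           elem = NULL;    /* any caller-owned pointer */

    ib_vector_push(vec, elem);      /* grows past 4 if needed */
    ut_a(ib_vector_size(vec) == 1);
    elem = ib_vector_pop(vec);
    ib_vector_free(vec);            /* releases the whole heap */
]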
*/ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: last vector element */ + ib_vector_t* vec) /* in/out: vector */ +{ + void* elem; + + ut_a(vec->used > 0); + --vec->used; + elem = vec->data[vec->used]; + + ut_d(vec->data[vec->used] = NULL); + UNIV_MEM_INVALID(&vec->data[vec->used], sizeof(*vec->data)); + + return(elem); +} + +/******************************************************************** +Free the underlying heap of the vector. Note that vec is invalid +after this call. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec) /* in, own: vector */ +{ + mem_heap_free(vec->heap); +} + +/******************************************************************** +Test whether a vector is empty or not. */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ /* out: TRUE if empty else FALSE */ + const ib_vector_t* vec) /* in vector to test */ +{ + return(ib_vector_size(vec) == 0); +} diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h new file mode 100644 index 00000000000..6bb80dad532 --- /dev/null +++ b/storage/xtradb/include/ut0wqueue.h @@ -0,0 +1,77 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/*********************************************************************** +A Work queue. Threads can add work items to the queue and other threads can +wait for work items to be available and take them off the queue for +processing. +************************************************************************/ + +#ifndef IB_WORK_QUEUE_H +#define IB_WORK_QUEUE_H + +#include "ut0list.h" +#include "mem0mem.h" +#include "os0sync.h" +#include "sync0types.h" + +typedef struct ib_wqueue_struct ib_wqueue_t; + +/******************************************************************** +Create a new work queue. */ +UNIV_INTERN +ib_wqueue_t* +ib_wqueue_create(void); +/*===================*/ + /* out: work queue */ + +/******************************************************************** +Free a work queue. */ +UNIV_INTERN +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq); /* in: work queue */ + +/******************************************************************** +Add a work item to the queue. */ +UNIV_INTERN +void +ib_wqueue_add( +/*==========*/ + ib_wqueue_t* wq, /* in: work queue */ + void* item, /* in: work item */ + mem_heap_t* heap); /* in: memory heap to use for allocating the + list node */ + +/******************************************************************** +Wait for a work item to appear in the queue. */ +UNIV_INTERN +void* +ib_wqueue_wait( + /* out: work item */ + ib_wqueue_t* wq); /* in: work queue */ + +/* Work queue. 
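[A sketch of the producer/consumer pattern the queue API above is designed for; wq, item and heap are placeholders.

    /* Producer side: */
    ib_wqueue_add(wq, item, heap);          /* wakes a waiter */

    /* Consumer side: */
    for (;;) {
            void*   work = ib_wqueue_wait(wq);  /* blocks until an
                                                item is available */
            /* ... process work ... */
    }
]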
*/ +struct ib_wqueue_struct { + mutex_t mutex; /* mutex protecting everything */ + ib_list_t* items; /* work item list */ + os_event_t event; /* event we use to signal additions to list */ +}; + +#endif diff --git a/storage/xtradb/lock/lock0iter.c b/storage/xtradb/lock/lock0iter.c new file mode 100644 index 00000000000..e7a128d0db3 --- /dev/null +++ b/storage/xtradb/lock/lock0iter.c @@ -0,0 +1,113 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Lock queue iterator. Can iterate over table and record +lock queues. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "univ.i" +#include "lock0iter.h" +#include "lock0lock.h" +#include "lock0priv.h" +#include "ut0dbg.h" +#include "ut0lst.h" +#ifdef UNIV_DEBUG +# include "srv0srv.h" /* kernel_mutex */ +#endif /* UNIV_DEBUG */ + +/*********************************************************************** +Initialize lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). There is exactly one bit set in the bitmap + of a wait lock. */ +UNIV_INTERN +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /* out: iterator */ + const lock_t* lock, /* in: lock to start from */ + ulint bit_no) /* in: record number in the + heap */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + iter->current_lock = lock; + + if (bit_no != ULINT_UNDEFINED) { + + iter->bit_no = bit_no; + } else { + + switch (lock_get_type_low(lock)) { + case LOCK_TABLE: + iter->bit_no = ULINT_UNDEFINED; + break; + case LOCK_REC: + iter->bit_no = lock_rec_find_set_bit(lock); + ut_a(iter->bit_no != ULINT_UNDEFINED); + break; + default: + ut_error; + } + } +} + +/*********************************************************************** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). 
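[Typical traversal with the iterator functions above and below; the caller must hold kernel_mutex, as the assertions require, and in_lock is a placeholder for the queue entry to start from.

    lock_queue_iterator_t   iter;
    const lock_t*           lock;

    lock_queue_iterator_reset(&iter, in_lock, ULINT_UNDEFINED);

    for (lock = lock_queue_iterator_get_prev(&iter);
         lock != NULL;
         lock = lock_queue_iterator_get_prev(&iter)) {

            /* ... examine each lock ahead of in_lock in the queue ... */
    }
]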
*/ +UNIV_INTERN +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + /* out: previous lock or NULL */ + lock_queue_iterator_t* iter) /* in/out: iterator */ +{ + const lock_t* prev_lock; + + ut_ad(mutex_own(&kernel_mutex)); + + switch (lock_get_type_low(iter->current_lock)) { + case LOCK_REC: + prev_lock = lock_rec_get_prev( + iter->current_lock, iter->bit_no); + break; + case LOCK_TABLE: + prev_lock = UT_LIST_GET_PREV( + un_member.tab_lock.locks, iter->current_lock); + break; + default: + ut_error; + } + + if (prev_lock != NULL) { + + iter->current_lock = prev_lock; + } + + return(prev_lock); +} diff --git a/storage/xtradb/lock/lock0lock.c b/storage/xtradb/lock/lock0lock.c new file mode 100644 index 00000000000..3730c66313d --- /dev/null +++ b/storage/xtradb/lock/lock0lock.c @@ -0,0 +1,5680 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "lock0lock.h" +#include "lock0priv.h" + +#ifdef UNIV_NONINL +#include "lock0lock.ic" +#include "lock0priv.ic" +#endif + +#include "ha_prototypes.h" +#include "usr0sess.h" +#include "trx0purge.h" +#include "dict0mem.h" +#include "trx0sys.h" + +/* Restricts the length of search we will do in the waits-for +graph of transactions */ +#define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000 + +/* Restricts the recursion depth of the search we will do in the waits-for +graph of transactions */ +#define LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK 200 + +/* When releasing transaction locks, this specifies how often we release +the kernel mutex for a moment to give also others access to it */ + +#define LOCK_RELEASE_KERNEL_INTERVAL 1000 + +/* Safety margin when creating a new record lock: this many extra records +can be inserted to the page without need to create a lock with a bigger +bitmap */ + +#define LOCK_PAGE_BITMAP_MARGIN 64 + +/* An explicit record lock affects both the record and the gap before it. +An implicit x-lock does not affect the gap, it only locks the index +record from read or update. + +If a transaction has modified or inserted an index record, then +it owns an implicit x-lock on the record. On a secondary index record, +a transaction has an implicit x-lock also if it has modified the +clustered index record, the max trx id of the page where the secondary +index record resides is >= trx id of the transaction (or database recovery +is running), and there are no explicit non-gap lock requests on the +secondary index record. 
+ +This complicated definition for a secondary index comes from the +implementation: we want to be able to determine if a secondary index +record has an implicit x-lock, just by looking at the present clustered +index record, not at the historical versions of the record. The +complicated definition can be explained to the user so that there is +nondeterminism in the access path when a query is answered: we may, +or may not, access the clustered index record and thus may, or may not, +bump into an x-lock set there. + +Different transaction can have conflicting locks set on the gap at the +same time. The locks on the gap are purely inhibitive: an insert cannot +be made, or a select cursor may have to wait if a different transaction +has a conflicting lock on the gap. An x-lock on the gap does not give +the right to insert into the gap. + +An explicit lock can be placed on a user record or the supremum record of +a page. The locks on the supremum record are always thought to be of the gap +type, though the gap bit is not set. When we perform an update of a record +where the size of the record changes, we may temporarily store its explicit +locks on the infimum record of the page, though the infimum otherwise never +carries locks. + +A waiting record lock can also be of the gap type. A waiting lock request +can be granted when there is no conflicting mode lock request by another +transaction ahead of it in the explicit lock queue. + +In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP. +It only locks the record it is placed on, not the gap before the record. +This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation +level. + +------------------------------------------------------------------------- +RULE 1: If there is an implicit x-lock on a record, and there are non-gap +------- +lock requests waiting in the queue, then the transaction holding the implicit +x-lock also has an explicit non-gap record x-lock. Therefore, as locks are +released, we can grant locks to waiting lock requests purely by looking at +the explicit lock requests in the queue. + +RULE 3: Different transactions cannot have conflicting granted non-gap locks +------- +on a record at the same time. However, they can have conflicting granted gap +locks. +RULE 4: If a there is a waiting lock request in a queue, no lock request, +------- +gap or not, can be inserted ahead of it in the queue. In record deletes +and page splits new gap type locks can be created by the database manager +for a transaction, and without rule 4, the waits-for graph of transactions +might become cyclic without the database noticing it, as the deadlock check +is only performed when a transaction itself requests a lock! +------------------------------------------------------------------------- + +An insert is allowed to a gap if there are no explicit lock requests by +other transactions on the next record. It does not matter if these lock +requests are granted or waiting, gap bit set or not, with the exception +that a gap type request set by another transaction to wait for +its turn to do an insert is ignored. On the other hand, an +implicit x-lock by another transaction does not prevent an insert, which +allows for more concurrency when using an Oracle-style sequence number +generator for the primary key with many transactions doing inserts +concurrently. 
+ +A modify of a record is allowed if the transaction has an x-lock on the +record, or if other transactions do not have any non-gap lock requests on the +record. + +A read of a single user record with a cursor is allowed if the transaction +has a non-gap explicit, or an implicit lock on the record, or if the other +transactions have no x-lock requests on the record. At a page supremum a +read is always allowed. + +In summary, an implicit lock is seen as a granted x-lock only on the +record, not on the gap. An explicit lock with no gap bit set is a lock +both on the record and the gap. If the gap bit is set, the lock is only +on the gap. Different transaction cannot own conflicting locks on the +record at the same time, but they may own conflicting locks on the gap. +Granted locks on a record give an access right to the record, but gap type +locks just inhibit operations. + +NOTE: Finding out if some transaction has an implicit x-lock on a secondary +index record can be cumbersome. We may have to look at previous versions of +the corresponding clustered index record to find out if a delete marked +secondary index record was delete marked by an active transaction, not by +a committed one. + +FACT A: If a transaction has inserted a row, it can delete it any time +without need to wait for locks. + +PROOF: The transaction has an implicit x-lock on every index record inserted +for the row, and can thus modify each record without the need to wait. Q.E.D. + +FACT B: If a transaction has read some result set with a cursor, it can read +it again, and retrieves the same result set, if it has not modified the +result set in the meantime. Hence, there is no phantom problem. If the +biggest record, in the alphabetical order, touched by the cursor is removed, +a lock wait may occur, otherwise not. + +PROOF: When a read cursor proceeds, it sets an s-lock on each user record +it passes, and a gap type s-lock on each page supremum. The cursor must +wait until it has these locks granted. Then no other transaction can +have a granted x-lock on any of the user records, and therefore cannot +modify the user records. Neither can any other transaction insert into +the gaps which were passed over by the cursor. Page splits and merges, +and removal of obsolete versions of records do not affect this, because +when a user record or a page supremum is removed, the next record inherits +its locks as gap type locks, and therefore blocks inserts to the same gap. +Also, if a page supremum is inserted, it inherits its locks from the successor +record. When the cursor is positioned again at the start of the result set, +the records it will touch on its course are either records it touched +during the last pass or new inserted page supremums. It can immediately +access all these records, and when it arrives at the biggest record, it +notices that the result set is complete. If the biggest record was removed, +lock wait can occur because the next record only inherits a gap type lock, +and a wait may be needed. Q.E.D. */ + +/* If an index record should be changed or a new inserted, we must check +the lock on the record or the next. When a read cursor starts reading, +we will set a record level s-lock on each record it passes, except on the +initial record on which the cursor is positioned before we start to fetch +records. Our index tree search has the convention that the B-tree +cursor is positioned BEFORE the first possibly matching record in +the search. 
Optimizations are possible here: if the record is searched +on an equality condition to a unique key, we could actually set a special +lock on the record, a lock which would not prevent any insert before +this record. In the next key locking an x-lock set on a record also +prevents inserts just before that record. + There are special infimum and supremum records on each page. +A supremum record can be locked by a read cursor. This records cannot be +updated but the lock prevents insert of a user record to the end of +the page. + Next key locks will prevent the phantom problem where new rows +could appear to SELECT result sets after the select operation has been +performed. Prevention of phantoms ensures the serilizability of +transactions. + What should we check if an insert of a new record is wanted? +Only the lock on the next record on the same page, because also the +supremum record can carry a lock. An s-lock prevents insertion, but +what about an x-lock? If it was set by a searched update, then there +is implicitly an s-lock, too, and the insert should be prevented. +What if our transaction owns an x-lock to the next record, but there is +a waiting s-lock request on the next record? If this s-lock was placed +by a read cursor moving in the ascending order in the index, we cannot +do the insert immediately, because when we finally commit our transaction, +the read cursor should see also the new inserted record. So we should +move the read cursor backward from the the next record for it to pass over +the new inserted record. This move backward may be too cumbersome to +implement. If we in this situation just enqueue a second x-lock request +for our transaction on the next record, then the deadlock mechanism +notices a deadlock between our transaction and the s-lock request +transaction. This seems to be an ok solution. + We could have the convention that granted explicit record locks, +lock the corresponding records from changing, and also lock the gaps +before them from inserting. A waiting explicit lock request locks the gap +before from inserting. Implicit record x-locks, which we derive from the +transaction id in the clustered index record, only lock the record itself +from modification, not the gap before it from inserting. + How should we store update locks? If the search is done by a unique +key, we could just modify the record trx id. Otherwise, we could put a record +x-lock on the record. If the update changes ordering fields of the +clustered index record, the inserted new record needs no record lock in +lock table, the trx id is enough. The same holds for a secondary index +record. Searched delete is similar to update. + +PROBLEM: +What about waiting lock requests? If a transaction is waiting to make an +update to a record which another modified, how does the other transaction +know to send the end-lock-wait signal to the waiting transaction? If we have +the convention that a transaction may wait for just one lock at a time, how +do we preserve it if lock wait ends? + +PROBLEM: +Checking the trx id label of a secondary index record. In the case of a +modification, not an insert, is this necessary? A secondary index record +is modified only by setting or resetting its deleted flag. A secondary index +record contains fields to uniquely determine the corresponding clustered +index record. 
A secondary index record is therefore only modified if we +also modify the clustered index record, and the trx id checking is done +on the clustered index record, before we come to modify the secondary index +record. So, in the case of delete marking or unmarking a secondary index +record, we do not have to care about trx ids, only the locks in the lock +table must be checked. In the case of a select from a secondary index, the +trx id is relevant, and in this case we may have to search the clustered +index record. + +PROBLEM: How to update record locks when page is split or merged, or +-------------------------------------------------------------------- +a record is deleted or updated? +If the size of fields in a record changes, we perform the update by +a delete followed by an insert. How can we retain the locks set or +waiting on the record? Because a record lock is indexed in the bitmap +by the heap number of the record, when we remove the record from the +record list, it is possible still to keep the lock bits. If the page +is reorganized, we could make a table of old and new heap numbers, +and permute the bitmaps in the locks accordingly. We can add to the +table a row telling where the updated record ended. If the update does +not require a reorganization of the page, we can simply move the lock +bits for the updated record to the position determined by its new heap +number (we may have to allocate a new lock, if we run out of the bitmap +in the old one). + A more complicated case is the one where the reinsertion of the +updated record is done pessimistically, because the structure of the +tree may change. + +PROBLEM: If a supremum record is removed in a page merge, or a record +--------------------------------------------------------------------- +removed in a purge, what to do to the waiting lock requests? In a split to +the right, we just move the lock requests to the new supremum. If a record +is removed, we could move the waiting lock request to its inheritor, the +next record in the index. But, the next record may already have lock +requests on its own queue. A new deadlock check should be made then. Maybe +it is easier just to release the waiting transactions. They can then enqueue +new lock requests on appropriate records. + +PROBLEM: When a record is inserted, what locks should it inherit from the +------------------------------------------------------------------------- +upper neighbor? An insert of a new supremum record in a page split is +always possible, but an insert of a new user record requires that the upper +neighbor does not have any lock requests by other transactions, granted or +waiting, in its lock queue. Solution: We can copy the locks as gap type +locks, so that also the waiting locks are transformed to granted gap type +locks on the inserted record. */ + +/* LOCK COMPATIBILITY MATRIX + * IS IX S X AI + * IS + + + - + + * IX + + - - + + * S + - + - - + * X - - - - - + * AI + + - - - + * + * Note that for rows, InnoDB only acquires S or X locks. + * For tables, InnoDB normally acquires IS or IX locks. + * S or X table locks are only acquired for LOCK TABLES. + * Auto-increment (AI) locks are needed because of + * statement-level MySQL binlog. + * See also lock_mode_compatible(). + */ +#define LK(a,b) (1 << ((a) * LOCK_NUM + (b))) +#define LKS(a,b) LK(a,b) | LK(b,a) + +/* Define the lock compatibility matrix in a ulint. The first line below +defines the diagonal entries. 
The following lines define the compatibility +for LOCK_IX, LOCK_S, and LOCK_AUTO_INC using LKS(), since the matrix +is symmetric. */ +#define LOCK_MODE_COMPATIBILITY 0 \ + | LK(LOCK_IS, LOCK_IS) | LK(LOCK_IX, LOCK_IX) | LK(LOCK_S, LOCK_S) \ + | LKS(LOCK_IX, LOCK_IS) | LKS(LOCK_IS, LOCK_AUTO_INC) \ + | LKS(LOCK_S, LOCK_IS) \ + | LKS(LOCK_AUTO_INC, LOCK_IS) | LKS(LOCK_AUTO_INC, LOCK_IX) + +/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column) + * IS IX S X AI + * IS + - - - - + * IX + + - - - + * S + - + - - + * X + + + + + + * AI - - - - + + * See lock_mode_stronger_or_eq(). + */ + +/* Define the stronger-or-equal lock relation in a ulint. This relation +contains all pairs LK(mode1, mode2) where mode1 is stronger than or +equal to mode2. */ +#define LOCK_MODE_STRONGER_OR_EQ 0 \ + | LK(LOCK_IS, LOCK_IS) \ + | LK(LOCK_IX, LOCK_IS) | LK(LOCK_IX, LOCK_IX) \ + | LK(LOCK_S, LOCK_IS) | LK(LOCK_S, LOCK_S) \ + | LK(LOCK_AUTO_INC, LOCK_AUTO_INC) \ + | LK(LOCK_X, LOCK_IS) | LK(LOCK_X, LOCK_IX) | LK(LOCK_X, LOCK_S) \ + | LK(LOCK_X, LOCK_AUTO_INC) | LK(LOCK_X, LOCK_X) + +#ifdef UNIV_DEBUG +UNIV_INTERN ibool lock_print_waits = FALSE; + +/************************************************************************* +Validates the lock system. */ +static +ibool +lock_validate(void); +/*===============*/ + /* out: TRUE if ok */ + +/************************************************************************* +Validates the record lock queues on a page. */ +static +ibool +lock_rec_validate_page( +/*===================*/ + /* out: TRUE if ok */ + ulint space, /* in: space id */ + ulint page_no);/* in: page number */ + +/* Define the following in order to enable lock_rec_validate_page() checks. */ +# undef UNIV_DEBUG_LOCK_VALIDATE +#endif /* UNIV_DEBUG */ + +/* The lock system */ +UNIV_INTERN lock_sys_t* lock_sys = NULL; + +/* We store info on the latest deadlock error to this buffer. InnoDB +Monitor will then fetch it and print */ +UNIV_INTERN ibool lock_deadlock_found = FALSE; +UNIV_INTERN FILE* lock_latest_err_file; + +/* Flags for recursive deadlock search */ +#define LOCK_VICTIM_IS_START 1 +#define LOCK_VICTIM_IS_OTHER 2 + +/************************************************************************ +Checks if a lock request results in a deadlock. */ +static +ibool +lock_deadlock_occurs( +/*=================*/ + /* out: TRUE if a deadlock was detected and we + chose trx as a victim; FALSE if no deadlock, or + there was a deadlock, but we chose other + transaction(s) as victim(s) */ + lock_t* lock, /* in: lock the transaction is requesting */ + trx_t* trx); /* in: transaction */ +/************************************************************************ +Looks recursively for a deadlock. */ +static +ulint +lock_deadlock_recursive( +/*====================*/ + /* out: 0 if no deadlock found, + LOCK_VICTIM_IS_START if there was a deadlock + and we chose 'start' as the victim, + LOCK_VICTIM_IS_OTHER if a deadlock + was found and we chose some other trx as a + victim: we must do the search again in this + last case because there may be another + deadlock! */ + trx_t* start, /* in: recursion starting point */ + trx_t* trx, /* in: a transaction waiting for a lock */ + lock_t* wait_lock, /* in: the lock trx is waiting to be granted */ + ulint* cost, /* in/out: number of calculation steps thus + far: if this exceeds LOCK_MAX_N_STEPS_... 
+ we return LOCK_VICTIM_IS_START */ + ulint depth); /* in: recursion depth: if this exceeds + LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we + return LOCK_VICTIM_IS_START */ + +/************************************************************************* +Gets the nth bit of a record lock. */ +UNIV_INLINE +ibool +lock_rec_get_nth_bit( +/*=================*/ + /* out: TRUE if bit set */ + const lock_t* lock, /* in: record lock */ + ulint i) /* in: index of the bit */ +{ + ulint byte_index; + ulint bit_index; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + if (i >= lock->un_member.rec_lock.n_bits) { + + return(FALSE); + } + + byte_index = i / 8; + bit_index = i % 8; + + return(1 & ((const byte*) &lock[1])[byte_index] >> bit_index); +} + +/*************************************************************************/ + +#define lock_mutex_enter_kernel() mutex_enter(&kernel_mutex) +#define lock_mutex_exit_kernel() mutex_exit(&kernel_mutex) + +/************************************************************************* +Checks that a transaction id is sensible, i.e., not in the future. */ +UNIV_INTERN +ibool +lock_check_trx_id_sanity( +/*=====================*/ + /* out: TRUE if ok */ + dulint trx_id, /* in: trx id */ + const rec_t* rec, /* in: user record */ + dict_index_t* index, /* in: index */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + ibool has_kernel_mutex)/* in: TRUE if the caller owns the + kernel mutex */ +{ + ibool is_ok = TRUE; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!has_kernel_mutex) { + mutex_enter(&kernel_mutex); + } + + /* A sanity check: the trx_id in rec must be smaller than the global + trx id counter */ + + if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: transaction id associated" + " with record\n", + stderr); + rec_print_new(stderr, rec, offsets); + fputs("InnoDB: in ", stderr); + dict_index_name_print(stderr, NULL, index); + fprintf(stderr, "\n" + "InnoDB: is " TRX_ID_FMT " which is higher than the" + " global trx id counter " TRX_ID_FMT "!\n" + "InnoDB: The table is corrupt. You have to do" + " dump + drop + reimport.\n", + TRX_ID_PREP_PRINTF(trx_id), + TRX_ID_PREP_PRINTF(trx_sys->max_trx_id)); + + is_ok = FALSE; + } + + if (!has_kernel_mutex) { + mutex_exit(&kernel_mutex); + } + + return(is_ok); +} + +/************************************************************************* +Checks that a record is seen in a consistent read. */ +UNIV_INTERN +ibool +lock_clust_rec_cons_read_sees( +/*==========================*/ + /* out: TRUE if sees, or FALSE if an earlier + version of the record should be retrieved */ + const rec_t* rec, /* in: user record which should be read or + passed over by a read cursor */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + read_view_t* view) /* in: consistent read view */ +{ + dulint trx_id; + + ut_ad(dict_index_is_clust(index)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* NOTE that we call this function while holding the search + system latch. To obey the latching order we must NOT reserve the + kernel mutex here! */ + + trx_id = row_get_rec_trx_id(rec, index, offsets); + + return(read_view_sees_trx_id(view, trx_id)); +} + +/************************************************************************* +Checks that a non-clustered index record is seen in a consistent read. 
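[Referring back to the LK()/LOCK_MODE_COMPATIBILITY bit matrix defined earlier in this file: checking whether two lock modes are compatible reduces to testing a single bit. A sketch of such a predicate, under a hypothetical name (the file's own lock_mode_compatible() serves this role):

    UNIV_INLINE
    ibool
    my_lock_mode_compatible(enum lock_mode mode1, enum lock_mode mode2)
    {
            return((LOCK_MODE_COMPATIBILITY & LK(mode1, mode2)) != 0);
    }
]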
*/ +UNIV_INTERN +ulint +lock_sec_rec_cons_read_sees( +/*========================*/ + /* out: TRUE if certainly + sees, or FALSE if an earlier + version of the clustered index + record might be needed: NOTE + that a non-clustered index + page contains so little + information on its + modifications that also in the + case FALSE, the present + version of rec may be the + right one, but we must check this + from the clustered index + record */ + const rec_t* rec, /* in: user record which + should be read or passed over + by a read cursor */ + const read_view_t* view) /* in: consistent read view */ +{ + dulint max_trx_id; + + ut_ad(page_rec_is_user_rec(rec)); + + /* NOTE that we might call this function while holding the search + system latch. To obey the latching order we must NOT reserve the + kernel mutex here! */ + + if (recv_recovery_is_on()) { + + return(FALSE); + } + + max_trx_id = page_get_max_trx_id(page_align(rec)); + + return(ut_dulint_cmp(max_trx_id, view->up_limit_id) < 0); +} + +/************************************************************************* +Creates the lock system at database start. */ +UNIV_INTERN +void +lock_sys_create( +/*============*/ + ulint n_cells) /* in: number of slots in lock hash table */ +{ + lock_sys = mem_alloc(sizeof(lock_sys_t)); + + lock_sys->rec_hash = hash_create(n_cells); + + /* hash_create_mutexes(lock_sys->rec_hash, 2, SYNC_REC_LOCK); */ + + lock_latest_err_file = os_file_create_tmpfile(); + ut_a(lock_latest_err_file); +} + +/************************************************************************* +Gets the size of a lock struct. */ +UNIV_INTERN +ulint +lock_get_size(void) +/*===============*/ + /* out: size in bytes */ +{ + return((ulint)sizeof(lock_t)); +} + +/************************************************************************* +Gets the mode of a lock. */ +UNIV_INLINE +enum lock_mode +lock_get_mode( +/*==========*/ + /* out: mode */ + const lock_t* lock) /* in: lock */ +{ + ut_ad(lock); + + return(lock->type_mode & LOCK_MODE_MASK); +} + +/************************************************************************* +Gets the wait flag of a lock. */ +UNIV_INLINE +ibool +lock_get_wait( +/*==========*/ + /* out: TRUE if waiting */ + const lock_t* lock) /* in: lock */ +{ + ut_ad(lock); + + if (UNIV_UNLIKELY(lock->type_mode & LOCK_WAIT)) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Gets the source table of an ALTER TABLE transaction. The table must be +covered by an IX or IS table lock. */ +UNIV_INTERN +dict_table_t* +lock_get_src_table( +/*===============*/ + /* out: the source table of transaction, + if it is covered by an IX or IS table lock; + dest if there is no source table, and + NULL if the transaction is locking more than + two tables or an inconsistency is found */ + trx_t* trx, /* in: transaction */ + dict_table_t* dest, /* in: destination of ALTER TABLE */ + enum lock_mode* mode) /* out: lock mode of the source table */ +{ + dict_table_t* src; + lock_t* lock; + + src = NULL; + *mode = LOCK_NONE; + + for (lock = UT_LIST_GET_FIRST(trx->trx_locks); + lock; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + lock_table_t* tab_lock; + enum lock_mode lock_mode; + if (!(lock_get_type_low(lock) & LOCK_TABLE)) { + /* We are only interested in table locks. */ + continue; + } + tab_lock = &lock->un_member.tab_lock; + if (dest == tab_lock->table) { + /* We are not interested in the destination table.
*/ + continue; + } else if (!src) { + /* This presumably is the source table. */ + src = tab_lock->table; + if (UT_LIST_GET_LEN(src->locks) != 1 + || UT_LIST_GET_FIRST(src->locks) != lock) { + /* We only support the case when + there is only one lock on this table. */ + return(NULL); + } + } else if (src != tab_lock->table) { + /* The transaction is locking more than + two tables (src and dest): abort */ + return(NULL); + } + + /* Check that the source table is locked by + LOCK_IX or LOCK_IS. */ + lock_mode = lock_get_mode(lock); + if (lock_mode == LOCK_IX || lock_mode == LOCK_IS) { + if (*mode != LOCK_NONE && *mode != lock_mode) { + /* There are multiple locks on src. */ + return(NULL); + } + *mode = lock_mode; + } + } + + if (!src) { + /* No source table lock found: flag the situation to caller */ + src = dest; + } + + return(src); +} + +/************************************************************************* +Determine if the given table is exclusively "owned" by the given +transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC +on the table. */ +UNIV_INTERN +ibool +lock_is_table_exclusive( +/*====================*/ + /* out: TRUE if table is only locked by trx, + with LOCK_IX, and possibly LOCK_AUTO_INC */ + dict_table_t* table, /* in: table */ + trx_t* trx) /* in: transaction */ +{ + const lock_t* lock; + ibool ok = FALSE; + + ut_ad(table); + ut_ad(trx); + + lock_mutex_enter_kernel(); + + for (lock = UT_LIST_GET_FIRST(table->locks); + lock; + lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) { + if (lock->trx != trx) { + /* A lock on the table is held + by some other transaction. */ + goto not_ok; + } + + if (!(lock_get_type_low(lock) & LOCK_TABLE)) { + /* We are interested in table locks only. */ + continue; + } + + switch (lock_get_mode(lock)) { + case LOCK_IX: + ok = TRUE; + break; + case LOCK_AUTO_INC: + /* It is allowed for trx to hold an + auto_increment lock. */ + break; + default: +not_ok: + /* Other table locks than LOCK_IX are not allowed. */ + ok = FALSE; + goto func_exit; + } + } + +func_exit: + lock_mutex_exit_kernel(); + + return(ok); +} + +/************************************************************************* +Sets the wait flag of a lock and the back pointer in trx to lock. */ +UNIV_INLINE +void +lock_set_lock_and_trx_wait( +/*=======================*/ + lock_t* lock, /* in: lock */ + trx_t* trx) /* in: trx */ +{ + ut_ad(lock); + ut_ad(trx->wait_lock == NULL); + + trx->wait_lock = lock; + lock->type_mode |= LOCK_WAIT; +} + +/************************************************************************** +The back pointer to a waiting lock request in the transaction is set to NULL +and the wait bit in lock type_mode is reset. */ +UNIV_INLINE +void +lock_reset_lock_and_trx_wait( +/*=========================*/ + lock_t* lock) /* in: record lock */ +{ + ut_ad((lock->trx)->wait_lock == lock); + ut_ad(lock_get_wait(lock)); + + /* Reset the back pointer in trx to this waiting lock request */ + + (lock->trx)->wait_lock = NULL; + lock->type_mode &= ~LOCK_WAIT; +} + +/************************************************************************* +Gets the gap flag of a record lock. 
*/ +UNIV_INLINE +ibool +lock_rec_get_gap( +/*=============*/ + /* out: TRUE if gap flag set */ + const lock_t* lock) /* in: record lock */ +{ + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + if (lock->type_mode & LOCK_GAP) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Gets the LOCK_REC_NOT_GAP flag of a record lock. */ +UNIV_INLINE +ibool +lock_rec_get_rec_not_gap( +/*=====================*/ + /* out: TRUE if LOCK_REC_NOT_GAP flag set */ + const lock_t* lock) /* in: record lock */ +{ + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + if (lock->type_mode & LOCK_REC_NOT_GAP) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Gets the waiting insert flag of a record lock. */ +UNIV_INLINE +ibool +lock_rec_get_insert_intention( +/*==========================*/ + /* out: TRUE if gap flag set */ + const lock_t* lock) /* in: record lock */ +{ + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + if (lock->type_mode & LOCK_INSERT_INTENTION) { + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Calculates if lock mode 1 is stronger or equal to lock mode 2. */ +UNIV_INLINE +ulint +lock_mode_stronger_or_eq( +/*=====================*/ + /* out: nonzero + if mode1 stronger or equal to mode2 */ + enum lock_mode mode1, /* in: lock mode */ + enum lock_mode mode2) /* in: lock mode */ +{ + ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX + || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC); + ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX + || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC); + + return((LOCK_MODE_STRONGER_OR_EQ) & LK(mode1, mode2)); +} + +/************************************************************************* +Calculates if lock mode 1 is compatible with lock mode 2. */ +UNIV_INLINE +ulint +lock_mode_compatible( +/*=================*/ + /* out: nonzero if mode1 compatible with mode2 */ + enum lock_mode mode1, /* in: lock mode */ + enum lock_mode mode2) /* in: lock mode */ +{ + ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX + || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC); + ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX + || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC); + + return((LOCK_MODE_COMPATIBILITY) & LK(mode1, mode2)); +} + +/************************************************************************* +Checks if a lock request for a new lock has to wait for request lock2. 
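+A hypothetical illustration of the rules below: if trx A holds +LOCK_X | LOCK_GAP on a record and trx B requests +LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION on the same gap, the modes conflict +and none of the gap exceptions applies, so B has to wait; a plain +LOCK_X | LOCK_GAP request by B would not wait, because gap type locks without +LOCK_INSERT_INTENTION never have to wait for anything.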
*/ +UNIV_INLINE +ibool +lock_rec_has_to_wait( +/*=================*/ + /* out: TRUE if new lock has to wait + for lock2 to be removed */ + const trx_t* trx, /* in: trx of new lock */ + ulint type_mode,/* in: precise mode of the new lock + to set: LOCK_S or LOCK_X, possibly + ORed to LOCK_GAP or LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION */ + const lock_t* lock2, /* in: another record lock; NOTE that + it is assumed that this has a lock bit + set on the same record as in the new + lock we are setting */ + ibool lock_is_on_supremum) /* in: TRUE if we are setting the + lock on the 'supremum' record of an + index page: we know then that the lock + request is really for a 'gap' type lock */ +{ + ut_ad(trx && lock2); + ut_ad(lock_get_type_low(lock2) == LOCK_REC); + + if (trx != lock2->trx + && !lock_mode_compatible(LOCK_MODE_MASK & type_mode, + lock_get_mode(lock2))) { + + /* We have somewhat complex rules when gap type record locks + cause waits */ + + if ((lock_is_on_supremum || (type_mode & LOCK_GAP)) + && !(type_mode & LOCK_INSERT_INTENTION)) { + + /* Gap type locks without LOCK_INSERT_INTENTION flag + do not need to wait for anything. This is because + different users can have conflicting lock types + on gaps. */ + + return(FALSE); + } + + if (!(type_mode & LOCK_INSERT_INTENTION) + && lock_rec_get_gap(lock2)) { + + /* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP) + does not need to wait for a gap type lock */ + + return(FALSE); + } + + if ((type_mode & LOCK_GAP) + && lock_rec_get_rec_not_gap(lock2)) { + + /* Lock on gap does not need to wait for + a LOCK_REC_NOT_GAP type lock */ + + return(FALSE); + } + + if (lock_rec_get_insert_intention(lock2)) { + + /* No lock request needs to wait for an insert + intention lock to be removed. This is ok since our + rules allow conflicting locks on gaps. This eliminates + a spurious deadlock caused by a next-key lock waiting + for an insert intention lock; when the insert + intention lock was granted, the insert deadlocked on + the waiting next-key lock. + + Also, insert intention locks do not disturb each + other. */ + + return(FALSE); + } + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Checks if a lock request lock1 has to wait for request lock2. */ +UNIV_INTERN +ibool +lock_has_to_wait( +/*=============*/ + /* out: TRUE if lock1 has to wait for + lock2 to be removed */ + const lock_t* lock1, /* in: waiting lock */ + const lock_t* lock2) /* in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +{ + ut_ad(lock1 && lock2); + + if (lock1->trx != lock2->trx + && !lock_mode_compatible(lock_get_mode(lock1), + lock_get_mode(lock2))) { + if (lock_get_type_low(lock1) == LOCK_REC) { + ut_ad(lock_get_type_low(lock2) == LOCK_REC); + + /* If this lock request is for a supremum record + then the second bit on the lock bitmap is set */ + + return(lock_rec_has_to_wait(lock1->trx, + lock1->type_mode, lock2, + lock_rec_get_nth_bit( + lock1, 1))); + } + + return(TRUE); + } + + return(FALSE); +} + +/*============== RECORD LOCK BASIC FUNCTIONS ============================*/ + +/************************************************************************* +Gets the number of bits in a record lock bitmap.
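+The bitmap resides immediately after the lock_t struct, and bit i stands for +the record with heap number i on the page, exactly as addressed in +lock_rec_get_nth_bit() above: + + byte_index = heap_no / 8; + bit_index = heap_no % 8; + +so that, to take a hypothetical example, heap_no 11 maps to bit 3 of byte 1 +of ((byte*) &lock[1]).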
*/ +UNIV_INLINE +ulint +lock_rec_get_n_bits( +/*================*/ + /* out: number of bits */ + const lock_t* lock) /* in: record lock */ +{ + return(lock->un_member.rec_lock.n_bits); +} + +/************************************************************************** +Sets the nth bit of a record lock to TRUE. */ +UNIV_INLINE +void +lock_rec_set_nth_bit( +/*=================*/ + lock_t* lock, /* in: record lock */ + ulint i) /* in: index of the bit */ +{ + ulint byte_index; + ulint bit_index; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(i < lock->un_member.rec_lock.n_bits); + + byte_index = i / 8; + bit_index = i % 8; + + ((byte*) &lock[1])[byte_index] |= 1 << bit_index; +} + +/************************************************************************** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. */ +UNIV_INTERN +ulint +lock_rec_find_set_bit( +/*==================*/ + /* out: bit index == heap number of + the record, or ULINT_UNDEFINED if none found */ + const lock_t* lock) /* in: record lock with at least one bit set */ +{ + ulint i; + + for (i = 0; i < lock_rec_get_n_bits(lock); i++) { + + if (lock_rec_get_nth_bit(lock, i)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/************************************************************************** +Resets the nth bit of a record lock. */ +UNIV_INLINE +void +lock_rec_reset_nth_bit( +/*===================*/ + lock_t* lock, /* in: record lock */ + ulint i) /* in: index of the bit which must be set to TRUE + when this function is called */ +{ + ulint byte_index; + ulint bit_index; + + ut_ad(lock); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_ad(i < lock->un_member.rec_lock.n_bits); + + byte_index = i / 8; + bit_index = i % 8; + + ((byte*) &lock[1])[byte_index] &= ~(1 << bit_index); +} + +/************************************************************************* +Gets the first or next record lock on a page. */ +UNIV_INLINE +lock_t* +lock_rec_get_next_on_page( +/*======================*/ + /* out: next lock, NULL if none exists */ + lock_t* lock) /* in: a record lock */ +{ + ulint space; + ulint page_no; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + space = lock->un_member.rec_lock.space; + page_no = lock->un_member.rec_lock.page_no; + + for (;;) { + lock = HASH_GET_NEXT(hash, lock); + + if (!lock) { + + break; + } + + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no)) { + + break; + } + } + + return(lock); +} + +/************************************************************************* +Gets the first record lock on a page, where the page is identified by its +file address. */ +UNIV_INLINE +lock_t* +lock_rec_get_first_on_page_addr( +/*============================*/ + /* out: first lock, NULL if none exists */ + ulint space, /* in: space */ + ulint page_no)/* in: page number */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = HASH_GET_FIRST(lock_sys->rec_hash, + lock_rec_hash(space, page_no)); + while (lock) { + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no)) { + + break; + } + + lock = HASH_GET_NEXT(hash, lock); + } + + return(lock); +} + +/************************************************************************* +Returns TRUE if there are explicit record locks on a page. 
*/ +UNIV_INTERN +ibool +lock_rec_expl_exist_on_page( +/*========================*/ + /* out: TRUE if there are explicit record locks on + the page */ + ulint space, /* in: space id */ + ulint page_no)/* in: page number */ +{ + ibool ret; + + mutex_enter(&kernel_mutex); + + if (lock_rec_get_first_on_page_addr(space, page_no)) { + ret = TRUE; + } else { + ret = FALSE; + } + + mutex_exit(&kernel_mutex); + + return(ret); +} + +/************************************************************************* +Gets the first record lock on a page, where the page is identified by a +pointer to it. */ +UNIV_INLINE +lock_t* +lock_rec_get_first_on_page( +/*=======================*/ + /* out: first lock, NULL if + none exists */ + const buf_block_t* block) /* in: buffer block */ +{ + ulint hash; + lock_t* lock; + ulint space = buf_block_get_space(block); + ulint page_no = buf_block_get_page_no(block); + + ut_ad(mutex_own(&kernel_mutex)); + + hash = buf_block_get_lock_hash_val(block); + + lock = HASH_GET_FIRST(lock_sys->rec_hash, hash); + + while (lock) { + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no)) { + + break; + } + + lock = HASH_GET_NEXT(hash, lock); + } + + return(lock); +} + +/************************************************************************* +Gets the next explicit lock request on a record. */ +UNIV_INLINE +lock_t* +lock_rec_get_next( +/*==============*/ + /* out: next lock, NULL if none exists */ + ulint heap_no,/* in: heap number of the record */ + lock_t* lock) /* in: lock */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + do { + ut_ad(lock_get_type_low(lock) == LOCK_REC); + lock = lock_rec_get_next_on_page(lock); + } while (lock && !lock_rec_get_nth_bit(lock, heap_no)); + + return(lock); +} + +/************************************************************************* +Gets the first explicit lock request on a record. */ +UNIV_INLINE +lock_t* +lock_rec_get_first( +/*===============*/ + /* out: first lock, NULL if + none exists */ + const buf_block_t* block, /* in: block containing the record */ + ulint heap_no)/* in: heap number of the record */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + for (lock = lock_rec_get_first_on_page(block); lock; + lock = lock_rec_get_next_on_page(lock)) { + if (lock_rec_get_nth_bit(lock, heap_no)) { + break; + } + } + + return(lock); +} + +/************************************************************************* +Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock +pointer in the transaction! This function is used in lock object creation +and resetting. */ +static +void +lock_rec_bitmap_reset( +/*==================*/ + lock_t* lock) /* in: record lock */ +{ + ulint n_bytes; + + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + /* Reset to zero the bitmap which resides immediately after the lock + struct */ + + n_bytes = lock_rec_get_n_bits(lock) / 8; + + ut_ad((lock_rec_get_n_bits(lock) % 8) == 0); + + memset(&lock[1], 0, n_bytes); +} + +/************************************************************************* +Copies a record lock to heap. 
*/ +static +lock_t* +lock_rec_copy( +/*==========*/ + /* out: copy of lock */ + const lock_t* lock, /* in: record lock */ + mem_heap_t* heap) /* in: memory heap */ +{ + ulint size; + + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8; + + return(mem_heap_dup(heap, lock, size)); +} + +/************************************************************************* +Gets the previous record lock set on a record. */ +UNIV_INTERN +const lock_t* +lock_rec_get_prev( +/*==============*/ + /* out: previous lock on the same + record, NULL if none exists */ + const lock_t* in_lock,/* in: record lock */ + ulint heap_no)/* in: heap number of the record */ +{ + lock_t* lock; + ulint space; + ulint page_no; + lock_t* found_lock = NULL; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_get_type_low(in_lock) == LOCK_REC); + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + + lock = lock_rec_get_first_on_page_addr(space, page_no); + + for (;;) { + ut_ad(lock); + + if (lock == in_lock) { + + return(found_lock); + } + + if (lock_rec_get_nth_bit(lock, heap_no)) { + + found_lock = lock; + } + + lock = lock_rec_get_next_on_page(lock); + } +} + +/*============= FUNCTIONS FOR ANALYZING TABLE LOCK QUEUE ================*/ + +/************************************************************************* +Checks if a transaction has the specified table lock, or stronger. */ +UNIV_INLINE +lock_t* +lock_table_has( +/*===========*/ + /* out: lock or NULL */ + trx_t* trx, /* in: transaction */ + dict_table_t* table, /* in: table */ + enum lock_mode mode) /* in: lock mode */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + /* Look for stronger locks the same trx already has on the table */ + + lock = UT_LIST_GET_LAST(table->locks); + + while (lock != NULL) { + + if (lock->trx == trx + && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) { + + /* The same trx already has locked the table in + a mode stronger or equal to the mode given */ + + ut_ad(!lock_get_wait(lock)); + + return(lock); + } + + lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock); + } + + return(NULL); +} + +/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/ + +/************************************************************************* +Checks if a transaction has a GRANTED explicit lock on rec stronger or equal +to precise_mode. 
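+For illustration, a hypothetical caller could do + + if (lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, + block, heap_no, trx)) { + return(DB_SUCCESS); + } + +A granted LOCK_X next-key lock held by trx satisfies this request, since +LOCK_X is stronger than LOCK_S and covers both the record and the gap; a +granted LOCK_S | LOCK_GAP lock does not, as it covers the gap only.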
*/ +UNIV_INLINE +lock_t* +lock_rec_has_expl( +/*==============*/ + /* out: lock or NULL */ + ulint precise_mode,/* in: LOCK_S or LOCK_X + possibly ORed to LOCK_GAP or + LOCK_REC_NOT_GAP, for a + supremum record we regard this + always a gap type request */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of the record */ + trx_t* trx) /* in: transaction */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S + || (precise_mode & LOCK_MODE_MASK) == LOCK_X); + ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); + + lock = lock_rec_get_first(block, heap_no); + + while (lock) { + if (lock->trx == trx + && lock_mode_stronger_or_eq(lock_get_mode(lock), + precise_mode & LOCK_MODE_MASK) + && !lock_get_wait(lock) + && (!lock_rec_get_rec_not_gap(lock) + || (precise_mode & LOCK_REC_NOT_GAP) + || heap_no == PAGE_HEAP_NO_SUPREMUM) + && (!lock_rec_get_gap(lock) + || (precise_mode & LOCK_GAP) + || heap_no == PAGE_HEAP_NO_SUPREMUM) + && (!lock_rec_get_insert_intention(lock))) { + + return(lock); + } + + lock = lock_rec_get_next(heap_no, lock); + } + + return(NULL); +} + +#ifdef UNIV_DEBUG +# ifndef UNIV_HOTBACKUP +/************************************************************************* +Checks if some other transaction has a lock request in the queue. */ +static +lock_t* +lock_rec_other_has_expl_req( +/*========================*/ + /* out: lock or NULL */ + enum lock_mode mode, /* in: LOCK_S or LOCK_X */ + ulint gap, /* in: LOCK_GAP if also gap + locks are taken into account, + or 0 if not */ + ulint wait, /* in: LOCK_WAIT if also + waiting locks are taken into + account, or 0 if not */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of the record */ + const trx_t* trx) /* in: transaction, or NULL if + requests by all transactions + are taken into account */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(mode == LOCK_X || mode == LOCK_S); + ut_ad(gap == 0 || gap == LOCK_GAP); + ut_ad(wait == 0 || wait == LOCK_WAIT); + + lock = lock_rec_get_first(block, heap_no); + + while (lock) { + if (lock->trx != trx + && (gap + || !(lock_rec_get_gap(lock) + || heap_no == PAGE_HEAP_NO_SUPREMUM)) + && (wait || !lock_get_wait(lock)) + && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) { + + return(lock); + } + + lock = lock_rec_get_next(heap_no, lock); + } + + return(NULL); +} +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_DEBUG */ + +/************************************************************************* +Checks if some other transaction has a conflicting explicit lock request +in the queue, so that we have to wait. 
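+For example (hypothetical queue contents): if another transaction holds a +granted LOCK_X on the record, a LOCK_S request returns that lock and the +requester must enqueue a waiting request; if the only other lock is an +insert-intention lock, NULL is returned, because by the rules in +lock_rec_has_to_wait() no request ever waits for an insert-intention lock.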
*/ +static +lock_t* +lock_rec_other_has_conflicting( +/*===========================*/ + /* out: lock or NULL */ + enum lock_mode mode, /* in: LOCK_S or LOCK_X, + possibly ORed to LOCK_GAP or + LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of the record */ + trx_t* trx) /* in: our transaction */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = lock_rec_get_first(block, heap_no); + + if (UNIV_LIKELY_NULL(lock)) { + if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) { + + do { + if (lock_rec_has_to_wait(trx, mode, lock, + TRUE)) { + return(lock); + } + + lock = lock_rec_get_next(heap_no, lock); + } while (lock); + } else { + + do { + if (lock_rec_has_to_wait(trx, mode, lock, + FALSE)) { + return(lock); + } + + lock = lock_rec_get_next(heap_no, lock); + } while (lock); + } + } + + return(NULL); +} + +/************************************************************************* +Looks for a suitable type record lock struct by the same trx on the same page. +This can be used to save space when a new record lock should be set on a page: +no new struct is needed if a suitable old one is found. */ +UNIV_INLINE +lock_t* +lock_rec_find_similar_on_page( +/*==========================*/ + /* out: lock or NULL */ + ulint type_mode, /* in: lock type_mode field */ + ulint heap_no, /* in: heap number of the record */ + lock_t* lock, /* in: lock_rec_get_first_on_page() */ + const trx_t* trx) /* in: transaction */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + while (lock != NULL) { + if (lock->trx == trx + && lock->type_mode == type_mode + && lock_rec_get_n_bits(lock) > heap_no) { + + return(lock); + } + + lock = lock_rec_get_next_on_page(lock); + } + + return(NULL); +} + +/************************************************************************* +Checks if some transaction has an implicit x-lock on a record in a secondary +index. */ +static +trx_t* +lock_sec_rec_some_has_impl_off_kernel( +/*==================================*/ + /* out: transaction which has the x-lock, or + NULL */ + const rec_t* rec, /* in: user record */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + const page_t* page = page_align(rec); + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(!dict_index_is_clust(index)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* Some transaction may have an implicit x-lock on the record only + if the max trx id for the page >= min trx id for the trx list, or + database recovery is running. We do not write the changes of a page + max trx id to the log, and therefore during recovery, this value + for a page may be incorrect. */ + + if (!(ut_dulint_cmp(page_get_max_trx_id(page), + trx_list_get_min_trx_id()) >= 0) + && !recv_recovery_is_on()) { + + return(NULL); + } + + /* Ok, in this case it is possible that some transaction has an + implicit x-lock. We have to look in the clustered index. */ + + if (!lock_check_trx_id_sanity(page_get_max_trx_id(page), + rec, index, offsets, TRUE)) { + buf_page_print(page, 0); + + /* The page is corrupt: try to avoid a crash by returning + NULL */ + return(NULL); + } + + return(row_vers_impl_x_locked_off_kernel(rec, index, offsets)); +} + +/************************************************************************* +Return approximate number of record locks (bits set in the bitmap) for +this transaction.
Since delete-marked records may be removed, the +record count will not be precise. */ +UNIV_INTERN +ulint +lock_number_of_rows_locked( +/*=======================*/ + trx_t* trx) /* in: transaction */ +{ + lock_t* lock; + ulint n_records = 0; + ulint n_bits; + ulint n_bit; + + lock = UT_LIST_GET_FIRST(trx->trx_locks); + + while (lock) { + if (lock_get_type_low(lock) == LOCK_REC) { + n_bits = lock_rec_get_n_bits(lock); + + for (n_bit = 0; n_bit < n_bits; n_bit++) { + if (lock_rec_get_nth_bit(lock, n_bit)) { + n_records++; + } + } + } + + lock = UT_LIST_GET_NEXT(trx_locks, lock); + } + + return (n_records); +} + +/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ + +/************************************************************************* +Creates a new record lock and inserts it to the lock queue. Does NOT check +for deadlocks or lock compatibility! */ +static +lock_t* +lock_rec_create( +/*============*/ + /* out: created lock */ + ulint type_mode,/* in: lock mode and wait + flag, type is ignored and + replaced by LOCK_REC */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of the record */ + dict_index_t* index, /* in: index of record */ + trx_t* trx) /* in: transaction */ +{ + lock_t* lock; + ulint page_no; + ulint space; + ulint n_bits; + ulint n_bytes; + const page_t* page; + + ut_ad(mutex_own(&kernel_mutex)); + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + page = block->frame; + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + /* If rec is the supremum record, then we reset the gap and + LOCK_REC_NOT_GAP bits, as all locks on the supremum are + automatically of the gap type */ + + if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + /* Make lock bitmap bigger by a safety margin */ + n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN; + n_bytes = 1 + n_bits / 8; + + lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes); + + UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock); + + lock->trx = trx; + + lock->type_mode = (type_mode & ~LOCK_TYPE_MASK) | LOCK_REC; + lock->index = index; + + lock->un_member.rec_lock.space = space; + lock->un_member.rec_lock.page_no = page_no; + lock->un_member.rec_lock.n_bits = n_bytes * 8; + + /* Reset to zero the bitmap which resides immediately after the + lock struct */ + + lock_rec_bitmap_reset(lock); + + /* Set the bit corresponding to rec */ + lock_rec_set_nth_bit(lock, heap_no); + + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + + lock_set_lock_and_trx_wait(lock, trx); + } + + return(lock); +} + +/************************************************************************* +Enqueues a waiting request for a lock which cannot be granted immediately. +Checks for deadlocks. 
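+A sketch of the typical (hypothetical) call site: + + err = lock_rec_enqueue_waiting(LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, index, thr); + +after which the caller suspends the query thread on DB_LOCK_WAIT, rolls the +transaction back on DB_DEADLOCK, and simply proceeds on DB_SUCCESS, which +means the deadlock was resolved by choosing some other victim and the lock +is already granted.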
*/ +static +ulint +lock_rec_enqueue_waiting( +/*=====================*/ + /* out: DB_LOCK_WAIT, + DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED, or + DB_SUCCESS; DB_SUCCESS means + that there was a deadlock, but + another transaction was chosen + as a victim, and we got the + lock immediately: no need to + wait then */ + ulint type_mode,/* in: lock mode this + transaction is requesting: + LOCK_S or LOCK_X, possibly + ORed with LOCK_GAP or + LOCK_REC_NOT_GAP, ORed with + LOCK_INSERT_INTENTION if this + waiting lock request is set + when performing an insert of + an index record */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of the record */ + dict_index_t* index, /* in: index of record */ + que_thr_t* thr) /* in: query thread */ +{ + lock_t* lock; + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + + /* Test if there already is some other reason to suspend thread: + we do not enqueue a lock request if the query thread should be + stopped anyway */ + + if (UNIV_UNLIKELY(que_thr_stop(thr))) { + + ut_error; + + return(DB_QUE_THR_SUSPENDED); + } + + trx = thr_get_trx(thr); + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + break; + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: a record lock wait happens" + " in a dictionary operation!\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs(".\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + stderr); + } + + /* Enqueue the lock request that will wait to be granted */ + lock = lock_rec_create(type_mode | LOCK_WAIT, + block, heap_no, index, trx); + + /* Check if a deadlock occurs: if yes, remove the lock request and + return an error code */ + + if (UNIV_UNLIKELY(lock_deadlock_occurs(lock, trx))) { + + lock_reset_lock_and_trx_wait(lock); + lock_rec_reset_nth_bit(lock, heap_no); + + return(DB_DEADLOCK); + } + + /* If there was a deadlock but we chose another transaction as a + victim, it is possible that we already have the lock now granted! */ + + if (trx->wait_lock == NULL) { + + return(DB_SUCCESS); + } + + trx->que_state = TRX_QUE_LOCK_WAIT; + trx->was_chosen_as_deadlock_victim = FALSE; + trx->wait_started = time(NULL); + + ut_a(que_thr_stop(thr)); + +#ifdef UNIV_DEBUG + if (lock_print_waits) { + fprintf(stderr, "Lock wait for trx %lu in index ", + (ulong) ut_dulint_get_low(trx->id)); + ut_print_name(stderr, trx, FALSE, index->name); + } +#endif /* UNIV_DEBUG */ + + return(DB_LOCK_WAIT); +} + +/************************************************************************* +Adds a record lock request in the record queue. The request is normally +added as the last in the queue, but if there are no waiting lock requests +on the record, and the request to be added is not a waiting request, we +can reuse a suitable record lock object already existing on the same page, +just setting the appropriate bit in its bitmap. This is a low-level function +which does NOT check for deadlocks or lock compatibility! */ +static +lock_t* +lock_rec_add_to_queue( +/*==================*/ + /* out: lock where the bit was set */ + ulint type_mode,/* in: lock mode, wait, gap + etc. 
flags; type is ignored + and replaced by LOCK_REC */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of the record */ + dict_index_t* index, /* in: index of record */ + trx_t* trx) /* in: transaction */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); +#ifdef UNIV_DEBUG + switch (type_mode & LOCK_MODE_MASK) { + case LOCK_X: + case LOCK_S: + break; + default: + ut_error; + } + + if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) { + enum lock_mode mode = (type_mode & LOCK_MODE_MASK) == LOCK_S + ? LOCK_X + : LOCK_S; + lock_t* other_lock + = lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT, + block, heap_no, trx); + ut_a(!other_lock); + } +#endif /* UNIV_DEBUG */ + + type_mode |= LOCK_REC; + + /* If rec is the supremum record, then we can reset the gap bit, as + all locks on the supremum are automatically of the gap type, and we + try to avoid unnecessary memory consumption of a new record lock + struct for a gap type lock */ + + if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + + /* There should never be LOCK_REC_NOT_GAP on a supremum + record, but let us play safe */ + + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + /* Look for a waiting lock request on the same record or on a gap */ + + lock = lock_rec_get_first_on_page(block); + + while (lock != NULL) { + if (lock_get_wait(lock) + && (lock_rec_get_nth_bit(lock, heap_no))) { + + goto somebody_waits; + } + + lock = lock_rec_get_next_on_page(lock); + } + + if (UNIV_LIKELY(!(type_mode & LOCK_WAIT))) { + + /* Look for a similar record lock on the same page: + if one is found and there are no waiting lock requests, + we can just set the bit */ + + lock = lock_rec_find_similar_on_page( + type_mode, heap_no, + lock_rec_get_first_on_page(block), trx); + + if (lock) { + + lock_rec_set_nth_bit(lock, heap_no); + + return(lock); + } + } + +somebody_waits: + return(lock_rec_create(type_mode, block, heap_no, index, trx)); +} + +/************************************************************************* +This is a fast routine for locking a record in the most common cases: +there are no explicit locks on the page, or there is just one lock, owned +by this transaction, and of the right type_mode. This is a low-level function +which does NOT look at implicit locks! Checks lock compatibility within +explicit locks. This function sets a normal next-key lock, or in the case of +a page supremum record, a gap type lock. 
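+The fast path succeeds in exactly two situations, described here for +illustration: the page carries no locks at all, so a fresh lock struct can be +created outright, or the page carries a single lock that belongs to this trx, +has precisely the requested type_mode, and has a bitmap wide enough for +heap_no, so that setting one bit is enough; every other case falls back to +lock_rec_lock_slow().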
*/ +UNIV_INLINE +ibool +lock_rec_lock_fast( +/*===============*/ + /* out: TRUE if locking succeeded */ + ibool impl, /* in: if TRUE, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + ulint mode, /* in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of record */ + dict_index_t* index, /* in: index of record */ + que_thr_t* thr) /* in: query thread */ +{ + lock_t* lock; + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + + lock = lock_rec_get_first_on_page(block); + + trx = thr_get_trx(thr); + + if (lock == NULL) { + if (!impl) { + lock_rec_create(mode, block, heap_no, index, trx); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + == TRX_ISO_READ_COMMITTED) { + trx_register_new_rec_lock(trx, index); + } + } + + return(TRUE); + } + + if (lock_rec_get_next_on_page(lock)) { + + return(FALSE); + } + + if (lock->trx != trx + || lock->type_mode != (mode | LOCK_REC) + || lock_rec_get_n_bits(lock) <= heap_no) { + + return(FALSE); + } + + if (!impl) { + /* If the nth bit of the record lock is already set then we + do not set a new lock bit, otherwise we do set */ + + if (!lock_rec_get_nth_bit(lock, heap_no)) { + lock_rec_set_nth_bit(lock, heap_no); + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + == TRX_ISO_READ_COMMITTED) { + trx_register_new_rec_lock(trx, index); + } + } + } + + return(TRUE); +} + +/************************************************************************* +This is the general, and slower, routine for locking a record. This is a +low-level function which does NOT look at implicit locks! Checks lock +compatibility within explicit locks. This function sets a normal next-key +lock, or in the case of a page supremum record, a gap type lock. 
*/ +static +ulint +lock_rec_lock_slow( +/*===============*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, or error code */ + ibool impl, /* in: if TRUE, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + ulint mode, /* in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of record */ + dict_index_t* index, /* in: index of record */ + que_thr_t* thr) /* in: query thread */ +{ + trx_t* trx; + ulint err; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + + trx = thr_get_trx(thr); + + if (lock_rec_has_expl(mode, block, heap_no, trx)) { + /* The trx already has a strong enough lock on rec: do + nothing */ + + err = DB_SUCCESS; + } else if (lock_rec_other_has_conflicting(mode, block, heap_no, trx)) { + + /* If another transaction has a non-gap conflicting request in + the queue, as this transaction does not have a lock strong + enough already granted on the record, we have to wait. */ + + err = lock_rec_enqueue_waiting(mode, block, heap_no, + index, thr); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) { + trx_register_new_rec_lock(trx, index); + } + } else { + if (!impl) { + /* Set the requested lock on the record */ + + lock_rec_add_to_queue(LOCK_REC | mode, block, + heap_no, index, trx); + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + == TRX_ISO_READ_COMMITTED) { + trx_register_new_rec_lock(trx, index); + } + } + + err = DB_SUCCESS; + } + + return(err); +} + +/************************************************************************* +Tries to lock the specified record in the mode requested. If not immediately +possible, enqueues a waiting lock request. This is a low-level function +which does NOT look at implicit locks! Checks lock compatibility within +explicit locks. This function sets a normal next-key lock, or in the case +of a page supremum record, a gap type lock. 
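+A typical (hypothetical) invocation, with the kernel mutex held: + + err = lock_rec_lock(FALSE, LOCK_S | LOCK_REC_NOT_GAP, + block, heap_no, index, thr); + +where DB_SUCCESS means the lock is granted, DB_LOCK_WAIT means a waiting +request was enqueued, and DB_DEADLOCK means this transaction was chosen as a +deadlock victim.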
*/ +static +ulint +lock_rec_lock( +/*==========*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, or error code */ + ibool impl, /* in: if TRUE, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + ulint mode, /* in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no,/* in: heap number of record */ + dict_index_t* index, /* in: index of record */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP + || mode - (LOCK_MODE_MASK & mode) == 0); + + if (lock_rec_lock_fast(impl, mode, block, heap_no, index, thr)) { + + /* We try a simplified and faster subroutine for the most + common cases */ + + err = DB_SUCCESS; + } else { + err = lock_rec_lock_slow(impl, mode, block, + heap_no, index, thr); + } + + return(err); +} + +/************************************************************************* +Checks if a waiting record lock request still has to wait in a queue. */ +static +ibool +lock_rec_has_to_wait_in_queue( +/*==========================*/ + /* out: TRUE if still has to wait */ + lock_t* wait_lock) /* in: waiting record lock */ +{ + lock_t* lock; + ulint space; + ulint page_no; + ulint heap_no; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_get_wait(wait_lock)); + ut_ad(lock_get_type_low(wait_lock) == LOCK_REC); + + space = wait_lock->un_member.rec_lock.space; + page_no = wait_lock->un_member.rec_lock.page_no; + heap_no = lock_rec_find_set_bit(wait_lock); + + lock = lock_rec_get_first_on_page_addr(space, page_no); + + while (lock != wait_lock) { + + if (lock_rec_get_nth_bit(lock, heap_no) + && lock_has_to_wait(wait_lock, lock)) { + + return(TRUE); + } + + lock = lock_rec_get_next_on_page(lock); + } + + return(FALSE); +} + +/***************************************************************** +Grants a lock to a waiting lock request and releases the waiting +transaction. 
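+This is invoked, for instance, from lock_rec_dequeue_from_page() below: when +a released lock leaves a waiting request with no conflicting locks ahead of +it in the queue, the request's LOCK_WAIT flag is cleared here and the waiting +transaction is released from its lock wait.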
*/ +static +void +lock_grant( +/*=======*/ + lock_t* lock) /* in/out: waiting lock request */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + lock_reset_lock_and_trx_wait(lock); + + if (lock_get_mode(lock) == LOCK_AUTO_INC) { + trx_t* trx = lock->trx; + dict_table_t* table = lock->un_member.tab_lock.table; + + if (table->autoinc_trx == trx) { + fprintf(stderr, + "InnoDB: Error: trx already had" + " an AUTO-INC lock!\n"); + } else { + table->autoinc_trx = trx; + + ib_vector_push(trx->autoinc_locks, lock); + } + } + +#ifdef UNIV_DEBUG + if (lock_print_waits) { + fprintf(stderr, "Lock wait for trx %lu ends\n", + (ulong) ut_dulint_get_low(lock->trx->id)); + } +#endif /* UNIV_DEBUG */ + + /* If we are resolving a deadlock by choosing another transaction + as a victim, then our original transaction may not be in the + TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait + for it */ + + if (lock->trx->que_state == TRX_QUE_LOCK_WAIT) { + trx_end_lock_wait(lock->trx); + } +} + +/***************************************************************** +Cancels a waiting record lock request and releases the waiting transaction +that requested it. NOTE: does NOT check if waiting lock requests behind this +one can now be granted! */ +static +void +lock_rec_cancel( +/*============*/ + lock_t* lock) /* in: waiting record lock request */ +{ + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_get_type_low(lock) == LOCK_REC); + + /* Reset the bit (there can be only one set bit) in the lock bitmap */ + lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock)); + + /* Reset the wait flag and the back pointer to lock in trx */ + + lock_reset_lock_and_trx_wait(lock); + + /* The following function releases the trx from lock wait */ + + trx_end_lock_wait(lock->trx); +} + +/***************************************************************** +Removes a record lock request, waiting or granted, from the queue and +grants locks to other transactions in the queue if they now are entitled +to a lock. NOTE: all record locks contained in in_lock are removed. */ +static +void +lock_rec_dequeue_from_page( +/*=======================*/ + lock_t* in_lock)/* in: record lock object: all record locks which + are contained in this lock object are removed; + transactions waiting behind will get their lock + requests granted, if they are now qualified to it */ +{ + ulint space; + ulint page_no; + lock_t* lock; + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_get_type_low(in_lock) == LOCK_REC); + + trx = in_lock->trx; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + + HASH_DELETE(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), in_lock); + + UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock); + + /* Check if waiting locks in the queue can now be granted: grant + locks if there are no conflicting locks ahead. */ + + lock = lock_rec_get_first_on_page_addr(space, page_no); + + while (lock != NULL) { + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + lock_grant(lock); + } + + lock = lock_rec_get_next_on_page(lock); + } +} + +/***************************************************************** +Removes a record lock request, waiting or granted, from the queue. 
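+Note the contrast with lock_rec_dequeue_from_page() above: this function does +NOT grant waiting requests behind the removed lock, so it is only safe where +all locks on the page are being thrown away anyway, as in +lock_rec_free_all_from_discard_page() below.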
*/ +static +void +lock_rec_discard( +/*=============*/ + lock_t* in_lock)/* in: record lock object: all record locks which + are contained in this lock object are removed */ +{ + ulint space; + ulint page_no; + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_get_type_low(in_lock) == LOCK_REC); + + trx = in_lock->trx; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + + HASH_DELETE(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), in_lock); + + UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock); +} + +/***************************************************************** +Removes record lock objects set on an index page which is discarded. This +function does not move locks, or check for waiting locks, therefore the +lock bitmaps must already be reset when this function is called. */ +static +void +lock_rec_free_all_from_discard_page( +/*================================*/ + const buf_block_t* block) /* in: page to be discarded */ +{ + ulint space; + ulint page_no; + lock_t* lock; + lock_t* next_lock; + + ut_ad(mutex_own(&kernel_mutex)); + + space = buf_block_get_space(block); + page_no = buf_block_get_page_no(block); + + lock = lock_rec_get_first_on_page_addr(space, page_no); + + while (lock != NULL) { + ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED); + ut_ad(!lock_get_wait(lock)); + + next_lock = lock_rec_get_next_on_page(lock); + + lock_rec_discard(lock); + + lock = next_lock; + } +} + +/*============= RECORD LOCK MOVING AND INHERITING ===================*/ + +/***************************************************************** +Resets the lock bits for a single record. Releases transactions waiting for +lock requests here. */ +static +void +lock_rec_reset_and_release_wait( +/*============================*/ + const buf_block_t* block, /* in: buffer block containing + the record */ + ulint heap_no)/* in: heap number of record */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = lock_rec_get_first(block, heap_no); + + while (lock != NULL) { + if (lock_get_wait(lock)) { + lock_rec_cancel(lock); + } else { + lock_rec_reset_nth_bit(lock, heap_no); + } + + lock = lock_rec_get_next(heap_no, lock); + } +} + +/***************************************************************** +Makes a record inherit the locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of +the other record. Also waiting lock requests on rec are inherited as +GRANTED gap locks. */ +static +void +lock_rec_inherit_to_gap( +/*====================*/ + const buf_block_t* heir_block, /* in: block containing the + record which inherits */ + const buf_block_t* block, /* in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /* in: heap_no of the + inheriting record */ + ulint heap_no) /* in: heap_no of the + donating record */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = lock_rec_get_first(block, heap_no); + + /* If srv_locks_unsafe_for_binlog is TRUE or session is using + READ COMMITTED isolation level, we do not want locks set + by an UPDATE or a DELETE to be inherited as gap type locks. But we + DO want S-locks set by a consistency constraint to be inherited even + then.
*/ + while (lock != NULL) { + if (!lock_rec_get_insert_intention(lock) + && !((srv_locks_unsafe_for_binlog + || lock->trx->isolation_level + == TRX_ISO_READ_COMMITTED) + && lock_get_mode(lock) == LOCK_X)) { + + lock_rec_add_to_queue(LOCK_REC | LOCK_GAP + | lock_get_mode(lock), + heir_block, heir_heap_no, + lock->index, lock->trx); + } + + lock = lock_rec_get_next(heap_no, lock); + } +} + +/***************************************************************** +Makes a record inherit the gap locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of the +other record. Also waiting lock requests are inherited as GRANTED gap locks. */ +static +void +lock_rec_inherit_to_gap_if_gap_lock( +/*================================*/ + const buf_block_t* block, /* in: buffer block */ + ulint heir_heap_no, /* in: heap_no of + record which inherits */ + ulint heap_no) /* in: heap_no of record + from which inherited; + does NOT reset the locks + on this record */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = lock_rec_get_first(block, heap_no); + + while (lock != NULL) { + if (!lock_rec_get_insert_intention(lock) + && (heap_no == PAGE_HEAP_NO_SUPREMUM + || !lock_rec_get_rec_not_gap(lock))) { + + lock_rec_add_to_queue(LOCK_REC | LOCK_GAP + | lock_get_mode(lock), + block, heir_heap_no, + lock->index, lock->trx); + } + + lock = lock_rec_get_next(heap_no, lock); + } +} + +/***************************************************************** +Moves the locks of a record to another record and resets the lock bits of +the donating record. */ +static +void +lock_rec_move( +/*==========*/ + const buf_block_t* receiver, /* in: buffer block containing + the receiving record */ + const buf_block_t* donator, /* in: buffer block containing + the donating record */ + ulint receiver_heap_no,/* in: heap_no of the record + which gets the locks; there + must be no lock requests + on it! */ + ulint donator_heap_no)/* in: heap_no of the record + which gives the locks */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = lock_rec_get_first(donator, donator_heap_no); + + ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL); + + while (lock != NULL) { + const ulint type_mode = lock->type_mode; + + lock_rec_reset_nth_bit(lock, donator_heap_no); + + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + lock_reset_lock_and_trx_wait(lock); + } + + /* Note that we FIRST reset the bit, and then set the lock: + the function works also if donator == receiver */ + + lock_rec_add_to_queue(type_mode, receiver, receiver_heap_no, + lock->index, lock->trx); + lock = lock_rec_get_next(donator_heap_no, lock); + } + + ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL); +} + +/***************************************************************** +Updates the lock table when we have reorganized a page. NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum.
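+As a hypothetical example of the remapping done below: if a record had heap +number 5 before the reorganization and heap number 3 afterwards, bit 5 of +each saved lock bitmap is transferred to bit 3 of a lock on the reorganized +page, by walking the old and the new page in parallel with two page cursors.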
*/ +UNIV_INTERN +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /* in: old index page, now + reorganized */ + const buf_block_t* oblock) /* in: copy of the old, not + reorganized page */ +{ + lock_t* lock; + UT_LIST_BASE_NODE_T(lock_t) old_locks; + mem_heap_t* heap = NULL; + ulint comp; + + lock_mutex_enter_kernel(); + + lock = lock_rec_get_first_on_page(block); + + if (lock == NULL) { + lock_mutex_exit_kernel(); + + return; + } + + heap = mem_heap_create(256); + + /* Copy first all the locks on the page to heap and reset the + bitmaps in the original locks; chain the copies of the locks + using the trx_locks field in them. */ + + UT_LIST_INIT(old_locks); + + do { + /* Make a copy of the lock */ + lock_t* old_lock = lock_rec_copy(lock, heap); + + UT_LIST_ADD_LAST(trx_locks, old_locks, old_lock); + + /* Reset bitmap of lock */ + lock_rec_bitmap_reset(lock); + + if (lock_get_wait(lock)) { + lock_reset_lock_and_trx_wait(lock); + } + + lock = lock_rec_get_next_on_page(lock); + } while (lock != NULL); + + comp = page_is_comp(block->frame); + ut_ad(comp == page_is_comp(oblock->frame)); + + for (lock = UT_LIST_GET_FIRST(old_locks); lock; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + /* NOTE: we copy also the locks set on the infimum and + supremum of the page; the infimum may carry locks if an + update of a record is occurring on the page, and its locks + were temporarily stored on the infimum */ + page_cur_t cur1; + page_cur_t cur2; + + page_cur_set_before_first(block, &cur1); + page_cur_set_before_first(oblock, &cur2); + + /* Set locks according to old locks */ + for (;;) { + ulint old_heap_no; + ulint new_heap_no; + + ut_ad(comp || !memcmp(page_cur_get_rec(&cur1), + page_cur_get_rec(&cur2), + rec_get_data_size_old( + page_cur_get_rec( + &cur2)))); + if (UNIV_LIKELY(comp)) { + old_heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + new_heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + old_heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + new_heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + } + + if (lock_rec_get_nth_bit(lock, old_heap_no)) { + + /* Clear the bit in old_lock. */ + ut_d(lock_rec_reset_nth_bit(lock, + old_heap_no)); + + /* NOTE that the old lock bitmap could be too + small for the new heap number! */ + + lock_rec_add_to_queue(lock->type_mode, block, + new_heap_no, + lock->index, lock->trx); + + /* if (new_heap_no == PAGE_HEAP_NO_SUPREMUM + && lock_get_wait(lock)) { + fprintf(stderr, + "---\n--\n!!!Lock reorg: supr type %lu\n", + lock->type_mode); + } */ + } + + if (UNIV_UNLIKELY + (new_heap_no == PAGE_HEAP_NO_SUPREMUM)) { + + ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + + page_cur_move_to_next(&cur1); + page_cur_move_to_next(&cur2); + } + +#ifdef UNIV_DEBUG + { + ulint i = lock_rec_find_set_bit(lock); + + /* Check that all locks were moved. */ + if (UNIV_UNLIKELY(i != ULINT_UNDEFINED)) { + fprintf(stderr, + "lock_move_reorganize_page():" + " %lu not moved in %p\n", + (ulong) i, (void*) lock); + ut_error; + } + } +#endif /* UNIV_DEBUG */ + } + + lock_mutex_exit_kernel(); + + mem_heap_free(heap); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(buf_block_get_space(block), + buf_block_get_page_no(block))); +#endif +} + +/***************************************************************** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. 
*/ +UNIV_INTERN +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /* in: index page to move to */ + const buf_block_t* block, /* in: index page */ + const rec_t* rec) /* in: record on page: this + is the first record moved */ +{ + lock_t* lock; + const ulint comp = page_rec_is_comp(rec); + + lock_mutex_enter_kernel(); + + /* Note: when we move locks from record to record, waiting locks + and possible granted gap type locks behind them are enqueued in + the original order, because new elements are inserted to a hash + table to the end of the hash chain, and lock_rec_add_to_queue + does not reuse locks if there are waiters in the queue. */ + + for (lock = lock_rec_get_first_on_page(block); lock; + lock = lock_rec_get_next_on_page(lock)) { + page_cur_t cur1; + page_cur_t cur2; + const ulint type_mode = lock->type_mode; + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1)) { + page_cur_move_to_next(&cur1); + } + + page_cur_set_before_first(new_block, &cur2); + page_cur_move_to_next(&cur2); + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + while (!page_cur_is_after_last(&cur1)) { + ulint heap_no; + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + ut_ad(!memcmp(page_cur_get_rec(&cur1), + page_cur_get_rec(&cur2), + rec_get_data_size_old( + page_cur_get_rec(&cur2)))); + } + + if (lock_rec_get_nth_bit(lock, heap_no)) { + lock_rec_reset_nth_bit(lock, heap_no); + + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + lock_reset_lock_and_trx_wait(lock); + } + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + } + + lock_rec_add_to_queue(type_mode, + new_block, heap_no, + lock->index, lock->trx); + } + + page_cur_move_to_next(&cur1); + page_cur_move_to_next(&cur2); + } + } + + lock_mutex_exit_kernel(); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(buf_block_get_space(block), + buf_block_get_page_no(block))); + ut_ad(lock_rec_validate_page(buf_block_get_space(new_block), + buf_block_get_page_no(new_block))); +#endif +} + +/***************************************************************** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. 
*/ +UNIV_INTERN +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /* in: index page to move to */ + const buf_block_t* block, /* in: index page */ + const rec_t* rec, /* in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end) /* in: old + previous-to-last + record on new_page + before the records + were copied */ +{ + lock_t* lock; + const ulint comp = page_rec_is_comp(rec); + + ut_ad(block->frame == page_align(rec)); + ut_ad(new_block->frame == page_align(old_end)); + + lock_mutex_enter_kernel(); + + for (lock = lock_rec_get_first_on_page(block); lock; + lock = lock_rec_get_next_on_page(lock)) { + page_cur_t cur1; + page_cur_t cur2; + const ulint type_mode = lock->type_mode; + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + page_cur_position(old_end, new_block, &cur2); + page_cur_move_to_next(&cur2); + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + while (page_cur_get_rec(&cur1) != rec) { + ulint heap_no; + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + ut_ad(!memcmp(page_cur_get_rec(&cur1), + page_cur_get_rec(&cur2), + rec_get_data_size_old( + page_cur_get_rec( + &cur2)))); + } + + if (lock_rec_get_nth_bit(lock, heap_no)) { + lock_rec_reset_nth_bit(lock, heap_no); + + if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { + lock_reset_lock_and_trx_wait(lock); + } + + if (comp) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + } + + lock_rec_add_to_queue(type_mode, + new_block, heap_no, + lock->index, lock->trx); + } + + page_cur_move_to_next(&cur1); + page_cur_move_to_next(&cur2); + } + +#ifdef UNIV_DEBUG + if (page_rec_is_supremum(rec)) { + ulint i; + + for (i = PAGE_HEAP_NO_USER_LOW; + i < lock_rec_get_n_bits(lock); i++) { + if (UNIV_UNLIKELY + (lock_rec_get_nth_bit(lock, i))) { + + fprintf(stderr, + "lock_move_rec_list_start():" + " %lu not moved in %p\n", + (ulong) i, (void*) lock); + ut_error; + } + } + } +#endif /* UNIV_DEBUG */ + } + + lock_mutex_exit_kernel(); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(buf_block_get_space(block), + buf_block_get_page_no(block))); +#endif +} + +/***************************************************************** +Updates the lock table when a page is split to the right. */ +UNIV_INTERN +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /* in: right page */ + const buf_block_t* left_block) /* in: left page */ +{ + ulint heap_no = lock_get_min_heap_no(right_block); + + lock_mutex_enter_kernel(); + + /* Move the locks on the supremum of the left page to the supremum + of the right page */ + + lock_rec_move(right_block, left_block, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of left page from the successor + of the infimum on right page */ + + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, heap_no); + + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Updates the lock table when a page is merged to the right. 
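+The gap protected by the locks on the supremum of the left page now
+precedes the original successor of the infimum on the right page, so
+those locks are inherited by that record before the left page is
+discarded.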
*/ +UNIV_INTERN +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /* in: right page to + which merged */ + const rec_t* orig_succ, /* in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block) /* in: merged index + page which will be + discarded */ +{ + lock_mutex_enter_kernel(); + + /* Inherit the locks from the supremum of the left page to the + original successor of infimum on the right page, to which the left + page was merged */ + + lock_rec_inherit_to_gap(right_block, left_block, + page_rec_get_heap_no(orig_succ), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, releasing + waiting transactions */ + + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + + lock_rec_free_all_from_discard_page(left_block); + + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Updates the lock table when the root page is copied to another in +btr_root_raise_and_insert. Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +UNIV_INTERN +void +lock_update_root_raise( +/*===================*/ + const buf_block_t* block, /* in: index page to which copied */ + const buf_block_t* root) /* in: root page */ +{ + lock_mutex_enter_kernel(); + + /* Move the locks on the supremum of the root to the supremum + of block */ + + lock_rec_move(block, root, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Updates the lock table when a page is copied to another and the original page +is removed from the chain of leaf pages, except if page is the root! */ +UNIV_INTERN +void +lock_update_copy_and_discard( +/*=========================*/ + const buf_block_t* new_block, /* in: index page to + which copied */ + const buf_block_t* block) /* in: index page; + NOT the root! */ +{ + lock_mutex_enter_kernel(); + + /* Move the locks on the supremum of the old page to the supremum + of new_page */ + + lock_rec_move(new_block, block, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(block); + + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Updates the lock table when a page is split to the left. */ +UNIV_INTERN +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /* in: right page */ + const buf_block_t* left_block) /* in: left page */ +{ + ulint heap_no = lock_get_min_heap_no(right_block); + + lock_mutex_enter_kernel(); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, heap_no); + + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Updates the lock table when a page is merged to the left. 
*/ +UNIV_INTERN +void +lock_update_merge_left( +/*===================*/ + const buf_block_t* left_block, /* in: left page to + which merged */ + const rec_t* orig_pred, /* in: original predecessor + of supremum on the left page + before merge */ + const buf_block_t* right_block) /* in: merged index page + which will be discarded */ +{ + const rec_t* left_next_rec; + + ut_ad(left_block->frame == page_align(orig_pred)); + + lock_mutex_enter_kernel(); + + left_next_rec = page_rec_get_next_const(orig_pred); + + if (!page_rec_is_supremum(left_next_rec)) { + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + + lock_rec_inherit_to_gap(left_block, left_block, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + } + + /* Move the locks from the supremum of right page to the supremum + of the left page */ + + lock_rec_move(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + + lock_rec_free_all_from_discard_page(right_block); + + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec. */ +UNIV_INTERN +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t* heir_block, /* in: block containing the + record which inherits */ + const buf_block_t* block, /* in: block containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /* in: heap_no of the + inheriting record */ + ulint heap_no) /* in: heap_no of the + donating record */ +{ + mutex_enter(&kernel_mutex); + + lock_rec_reset_and_release_wait(heir_block, heir_heap_no); + + lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no); + + mutex_exit(&kernel_mutex); +} + +/***************************************************************** +Updates the lock table when a page is discarded. 
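+Every record on the discarded page donates its locks, in gap mode, to
+the heir record, after which all lock structs on the page are freed.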
*/ +UNIV_INTERN +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /* in: index page + which will inherit the locks */ + ulint heir_heap_no, /* in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block) /* in: index page + which will be discarded */ +{ + const page_t* page = block->frame; + const rec_t* rec; + ulint heap_no; + + lock_mutex_enter_kernel(); + + if (!lock_rec_get_first_on_page(block)) { + /* No locks exist on page, nothing to do */ + + lock_mutex_exit_kernel(); + + return; + } + + /* Inherit all the locks on the page to the record and reset all + the locks on the page */ + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + do { + heap_no = rec_get_heap_no_new(rec); + + lock_rec_inherit_to_gap(heir_block, block, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait(block, heap_no); + + rec = page + rec_get_next_offs(rec, TRUE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } else { + rec = page + PAGE_OLD_INFIMUM; + + do { + heap_no = rec_get_heap_no_old(rec); + + lock_rec_inherit_to_gap(heir_block, block, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait(block, heap_no); + + rec = page + rec_get_next_offs(rec, FALSE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } + + lock_rec_free_all_from_discard_page(block); + + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Updates the lock table when a new user record is inserted. */ +UNIV_INTERN +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec) /* in: the inserted record */ +{ + ulint receiver_heap_no; + ulint donator_heap_no; + + ut_ad(block->frame == page_align(rec)); + + /* Inherit the gap-locking locks for rec, in gap mode, from the next + record */ + + if (page_rec_is_comp(rec)) { + receiver_heap_no = rec_get_heap_no_new(rec); + donator_heap_no = rec_get_heap_no_new( + page_rec_get_next_low(rec, TRUE)); + } else { + receiver_heap_no = rec_get_heap_no_old(rec); + donator_heap_no = rec_get_heap_no_old( + page_rec_get_next_low(rec, FALSE)); + } + + lock_mutex_enter_kernel(); + lock_rec_inherit_to_gap_if_gap_lock(block, + receiver_heap_no, donator_heap_no); + lock_mutex_exit_kernel(); +} + +/***************************************************************** +Updates the lock table when a record is removed. */ +UNIV_INTERN +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec) /* in: the record to be removed */ +{ + const page_t* page = block->frame; + ulint heap_no; + ulint next_heap_no; + + ut_ad(page == page_align(rec)); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(rec); + next_heap_no = rec_get_heap_no_new(page + + rec_get_next_offs(rec, + TRUE)); + } else { + heap_no = rec_get_heap_no_old(rec); + next_heap_no = rec_get_heap_no_old(page + + rec_get_next_offs(rec, + FALSE)); + } + + lock_mutex_enter_kernel(); + + /* Let the next record inherit the locks from rec, in gap mode */ + + lock_rec_inherit_to_gap(block, block, next_heap_no, heap_no); + + /* Reset the lock bits on rec and release waiting transactions */ + + lock_rec_reset_and_release_wait(block, heap_no); + + lock_mutex_exit_kernel(); +} + +/************************************************************************* +Stores on the page infimum record the explicit locks of another record. 
+This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is moved in such an update, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +UNIV_INTERN +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec) /* in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +{ + ulint heap_no = page_rec_get_heap_no(rec); + + ut_ad(block->frame == page_align(rec)); + + lock_mutex_enter_kernel(); + + lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no); + + lock_mutex_exit_kernel(); +} + +/************************************************************************* +Restores the state of explicit lock requests on a single record, where the +state was stored on the infimum of the page. */ +UNIV_INTERN +void +lock_rec_restore_from_page_infimum( +/*===============================*/ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec, /* in: record whose lock state + is restored */ + const buf_block_t* donator)/* in: page (rec is not + necessarily on this page) + whose infimum stored the lock + state; lock bits are reset on + the infimum */ +{ + ulint heap_no = page_rec_get_heap_no(rec); + + lock_mutex_enter_kernel(); + + lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM); + + lock_mutex_exit_kernel(); +} + +/*=========== DEADLOCK CHECKING ======================================*/ + +/************************************************************************ +Checks if a lock request results in a deadlock. */ +static +ibool +lock_deadlock_occurs( +/*=================*/ + /* out: TRUE if a deadlock was detected and we + chose trx as a victim; FALSE if no deadlock, or + there was a deadlock, but we chose other + transaction(s) as victim(s) */ + lock_t* lock, /* in: lock the transaction is requesting */ + trx_t* trx) /* in: transaction */ +{ + dict_table_t* table; + dict_index_t* index; + trx_t* mark_trx; + ulint ret; + ulint cost = 0; + + ut_ad(trx); + ut_ad(lock); + ut_ad(mutex_own(&kernel_mutex)); +retry: + /* We check that adding this trx to the waits-for graph + does not produce a cycle. First mark all active transactions + with 0: */ + + mark_trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (mark_trx) { + mark_trx->deadlock_mark = 0; + mark_trx = UT_LIST_GET_NEXT(trx_list, mark_trx); + } + + ret = lock_deadlock_recursive(trx, trx, lock, &cost, 0); + + if (ret == LOCK_VICTIM_IS_OTHER) { + /* We chose some other trx as a victim: retry if there still + is a deadlock */ + + goto retry; + } + + if (UNIV_UNLIKELY(ret == LOCK_VICTIM_IS_START)) { + if (lock_get_type_low(lock) & LOCK_TABLE) { + table = lock->un_member.tab_lock.table; + index = NULL; + } else { + index = lock->index; + table = index->table; + } + + lock_deadlock_found = TRUE; + + fputs("*** WE ROLL BACK TRANSACTION (2)\n", + lock_latest_err_file); + + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************ +Looks recursively for a deadlock. 
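+The search is a depth-first traversal of the waits-for graph: for each
+lock that the waiting lock must wait behind, the owning transaction is
+visited, and if that transaction is itself waiting, the recursion
+continues from its wait lock. Subtrees that have been searched
+exhaustively are marked with trx->deadlock_mark so that they are not
+visited twice. For example, if trx A waits for a lock held by trx B
+while B waits for a lock held by A, the search started from A reaches A
+again and a deadlock is reported. The search is bounded by
+LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK and
+LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK to keep deadlock detection cheap.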
*/ +static +ulint +lock_deadlock_recursive( +/*====================*/ + /* out: 0 if no deadlock found, + LOCK_VICTIM_IS_START if there was a deadlock + and we chose 'start' as the victim, + LOCK_VICTIM_IS_OTHER if a deadlock + was found and we chose some other trx as a + victim: we must do the search again in this + last case because there may be another + deadlock! */ + trx_t* start, /* in: recursion starting point */ + trx_t* trx, /* in: a transaction waiting for a lock */ + lock_t* wait_lock, /* in: the lock trx is waiting to be granted */ + ulint* cost, /* in/out: number of calculation steps thus + far: if this exceeds LOCK_MAX_N_STEPS_... + we return LOCK_VICTIM_IS_START */ + ulint depth) /* in: recursion depth: if this exceeds + LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we + return LOCK_VICTIM_IS_START */ +{ + lock_t* lock; + ulint bit_no = ULINT_UNDEFINED; + trx_t* lock_trx; + ulint ret; + + ut_a(trx); + ut_a(start); + ut_a(wait_lock); + ut_ad(mutex_own(&kernel_mutex)); + + if (trx->deadlock_mark == 1) { + /* We have already exhaustively searched the subtree starting + from this trx */ + + return(0); + } + + *cost = *cost + 1; + + lock = wait_lock; + + if (lock_get_type_low(wait_lock) == LOCK_REC) { + + bit_no = lock_rec_find_set_bit(wait_lock); + + ut_a(bit_no != ULINT_UNDEFINED); + } + + /* Look at the locks ahead of wait_lock in the lock queue */ + + for (;;) { + if (lock_get_type_low(lock) & LOCK_TABLE) { + + lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, + lock); + } else { + ut_ad(lock_get_type_low(lock) == LOCK_REC); + ut_a(bit_no != ULINT_UNDEFINED); + + lock = (lock_t*) lock_rec_get_prev(lock, bit_no); + } + + if (lock == NULL) { + /* We can mark this subtree as searched */ + trx->deadlock_mark = 1; + + return(FALSE); + } + + if (lock_has_to_wait(wait_lock, lock)) { + + ibool too_far + = depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK + || *cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK; + + lock_trx = lock->trx; + + if (lock_trx == start || too_far) { + + /* We came back to the recursion starting + point: a deadlock detected; or we have + searched the waits-for graph too long */ + + FILE* ef = lock_latest_err_file; + + rewind(ef); + ut_print_timestamp(ef); + + fputs("\n*** (1) TRANSACTION:\n", ef); + + trx_print(ef, wait_lock->trx, 3000); + + fputs("*** (1) WAITING FOR THIS LOCK" + " TO BE GRANTED:\n", ef); + + if (lock_get_type_low(wait_lock) == LOCK_REC) { + lock_rec_print(ef, wait_lock); + } else { + lock_table_print(ef, wait_lock); + } + + fputs("*** (2) TRANSACTION:\n", ef); + + trx_print(ef, lock->trx, 3000); + + fputs("*** (2) HOLDS THE LOCK(S):\n", ef); + + if (lock_get_type_low(lock) == LOCK_REC) { + lock_rec_print(ef, lock); + } else { + lock_table_print(ef, lock); + } + + fputs("*** (2) WAITING FOR THIS LOCK" + " TO BE GRANTED:\n", ef); + + if (lock_get_type_low(start->wait_lock) + == LOCK_REC) { + lock_rec_print(ef, start->wait_lock); + } else { + lock_table_print(ef, start->wait_lock); + } +#ifdef UNIV_DEBUG + if (lock_print_waits) { + fputs("Deadlock detected" + " or too long search\n", + stderr); + } +#endif /* UNIV_DEBUG */ + if (too_far) { + + fputs("TOO DEEP OR LONG SEARCH" + " IN THE LOCK TABLE" + " WAITS-FOR GRAPH\n", ef); + + return(LOCK_VICTIM_IS_START); + } + + if (trx_weight_cmp(wait_lock->trx, + start) >= 0) { + /* Our recursion starting point + transaction is 'smaller', let us + choose 'start' as the victim and roll + back it */ + + return(LOCK_VICTIM_IS_START); + } + + lock_deadlock_found = TRUE; + + /* Let us choose the transaction of wait_lock + as a victim to 
try to avoid deadlocking our
+				recursion starting point transaction */
+
+				fputs("*** WE ROLL BACK TRANSACTION (1)\n",
+				      ef);
+
+				wait_lock->trx->was_chosen_as_deadlock_victim
+					= TRUE;
+
+				lock_cancel_waiting_and_release(wait_lock);
+
+				/* Since trx and wait_lock are no longer
+				in the waits-for graph, we can return FALSE;
+				note that our selective algorithm can choose
+				several transactions as victims, but still
+				we may end up also rolling back the recursion
+				starting point transaction! */
+
+				return(LOCK_VICTIM_IS_OTHER);
+			}
+
+			if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+				/* Another trx ahead has requested a lock in
+				an incompatible mode, and is itself waiting
+				for a lock */
+
+				ret = lock_deadlock_recursive(
+					start, lock_trx,
+					lock_trx->wait_lock, cost, depth + 1);
+				if (ret != 0) {
+
+					return(ret);
+				}
+			}
+		}
+	}/* end of the 'for (;;)'-loop */
+}
+
+/*========================= TABLE LOCKS ==============================*/
+
+/*************************************************************************
+Creates a table lock object and adds it as the last in the lock queue
+of the table. Does NOT check for deadlocks or lock compatibility. */
+UNIV_INLINE
+lock_t*
+lock_table_create(
+/*==============*/
+				/* out, own: new lock object */
+	dict_table_t*	table,	/* in: database table in dictionary cache */
+	ulint		type_mode,/* in: lock mode possibly ORed with
+				LOCK_WAIT */
+	trx_t*		trx)	/* in: trx */
+{
+	lock_t*	lock;
+
+	ut_ad(table && trx);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
+		++table->n_waiting_or_granted_auto_inc_locks;
+	}
+
+	/* For AUTOINC locking we reuse the lock instance only if
+	there is no wait involved; otherwise we allocate the waiting
+	lock from the transaction lock heap. */
+	if (type_mode == LOCK_AUTO_INC) {
+
+		lock = table->autoinc_lock;
+
+		table->autoinc_trx = trx;
+
+		ib_vector_push(trx->autoinc_locks, lock);
+	} else {
+		lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t));
+	}
+
+	UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
+
+	lock->type_mode = type_mode | LOCK_TABLE;
+	lock->trx = trx;
+
+	lock->un_member.tab_lock.table = table;
+
+	UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
+
+	if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+
+		lock_set_lock_and_trx_wait(lock, trx);
+	}
+
+	return(lock);
+}
+
+/*****************************************************************
+Removes a table lock request from the queue and the trx list of locks;
+this is a low-level function which does NOT check if waiting requests
+can now be granted. */
+UNIV_INLINE
+void
+lock_table_remove_low(
+/*==================*/
+	lock_t*	lock)	/* in: table lock */
+{
+	trx_t*		trx;
+	dict_table_t*	table;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	trx = lock->trx;
+	table = lock->un_member.tab_lock.table;
+
+	/* Remove the lock from the transaction's AUTOINC vector, if
+	the lock that is being released is an AUTOINC lock. */
+	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+
+		/* The table's AUTOINC lock can get transferred to
+		another transaction before we get here. */
+		if (table->autoinc_trx == trx) {
+			table->autoinc_trx = NULL;
+		}
+
+		/* The locks must be freed in the reverse order from
+		the one in which they were acquired. This is to avoid
+		traversing the AUTOINC lock vector unnecessarily.
+
+		We only store locks that were granted in the
+		trx->autoinc_locks vector (see lock_table_create()
+		and lock_grant()). Therefore it can be empty and we
+		need to check for that.
*/ + + if (!ib_vector_is_empty(trx->autoinc_locks)) { + lock_t* autoinc_lock; + + autoinc_lock = ib_vector_pop(trx->autoinc_locks); + ut_a(autoinc_lock == lock); + } + + ut_a(table->n_waiting_or_granted_auto_inc_locks > 0); + --table->n_waiting_or_granted_auto_inc_locks; + } + + UT_LIST_REMOVE(trx_locks, trx->trx_locks, lock); + UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock); +} + +/************************************************************************* +Enqueues a waiting request for a table lock which cannot be granted +immediately. Checks for deadlocks. */ +static +ulint +lock_table_enqueue_waiting( +/*=======================*/ + /* out: DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED, or DB_SUCCESS; + DB_SUCCESS means that there was a deadlock, + but another transaction was chosen as a + victim, and we got the lock immediately: + no need to wait then */ + ulint mode, /* in: lock mode this transaction is + requesting */ + dict_table_t* table, /* in: table */ + que_thr_t* thr) /* in: query thread */ +{ + lock_t* lock; + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + + /* Test if there already is some other reason to suspend thread: + we do not enqueue a lock request if the query thread should be + stopped anyway */ + + if (que_thr_stop(thr)) { + ut_error; + + return(DB_QUE_THR_SUSPENDED); + } + + trx = thr_get_trx(thr); + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + break; + case TRX_DICT_OP_TABLE: + case TRX_DICT_OP_INDEX: + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: a table lock wait happens" + " in a dictionary operation!\n" + "InnoDB: Table name ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(".\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", + stderr); + } + + /* Enqueue the lock request that will wait to be granted */ + + lock = lock_table_create(table, mode | LOCK_WAIT, trx); + + /* Check if a deadlock occurs: if yes, remove the lock request and + return an error code */ + + if (lock_deadlock_occurs(lock, trx)) { + + lock_reset_lock_and_trx_wait(lock); + lock_table_remove_low(lock); + + return(DB_DEADLOCK); + } + + if (trx->wait_lock == NULL) { + /* Deadlock resolution chose another transaction as a victim, + and we accidentally got our lock granted! */ + + return(DB_SUCCESS); + } + + trx->que_state = TRX_QUE_LOCK_WAIT; + trx->was_chosen_as_deadlock_victim = FALSE; + trx->wait_started = time(NULL); + + ut_a(que_thr_stop(thr)); + + return(DB_LOCK_WAIT); +} + +/************************************************************************* +Checks if other transactions have an incompatible mode lock request in +the lock queue. */ +UNIV_INLINE +ibool +lock_table_other_has_incompatible( +/*==============================*/ + trx_t* trx, /* in: transaction, or NULL if all + transactions should be included */ + ulint wait, /* in: LOCK_WAIT if also waiting locks are + taken into account, or 0 if not */ + dict_table_t* table, /* in: table */ + enum lock_mode mode) /* in: lock mode */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = UT_LIST_GET_LAST(table->locks); + + while (lock != NULL) { + + if ((lock->trx != trx) + && (!lock_mode_compatible(lock_get_mode(lock), mode)) + && (wait || !(lock_get_wait(lock)))) { + + return(TRUE); + } + + lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock); + } + + return(FALSE); +} + +/************************************************************************* +Locks the specified database table in the mode given. 
If the lock cannot +be granted immediately, the query thread is put to wait. */ +UNIV_INTERN +ulint +lock_table( +/*=======*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, + does nothing */ + dict_table_t* table, /* in: database table in dictionary cache */ + enum lock_mode mode, /* in: lock mode */ + que_thr_t* thr) /* in: query thread */ +{ + trx_t* trx; + ulint err; + + ut_ad(table && thr); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + ut_a(flags == 0); + + trx = thr_get_trx(thr); + + lock_mutex_enter_kernel(); + + /* Look for stronger locks the same trx already has on the table */ + + if (lock_table_has(trx, table, mode)) { + + lock_mutex_exit_kernel(); + + return(DB_SUCCESS); + } + + /* We have to check if the new lock is compatible with any locks + other transactions have in the table lock queue. */ + + if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) { + + /* Another trx has a request on the table in an incompatible + mode: this trx may have to wait */ + + err = lock_table_enqueue_waiting(mode | flags, table, thr); + + lock_mutex_exit_kernel(); + + return(err); + } + + lock_table_create(table, mode | flags, trx); + + ut_a(!flags || mode == LOCK_S || mode == LOCK_X); + + lock_mutex_exit_kernel(); + + return(DB_SUCCESS); +} + +/************************************************************************* +Checks if there are any locks set on the table. */ +UNIV_INTERN +ibool +lock_is_on_table( +/*=============*/ + /* out: TRUE if there are lock(s) */ + dict_table_t* table) /* in: database table in dictionary cache */ +{ + ibool ret; + + ut_ad(table); + + lock_mutex_enter_kernel(); + + if (UT_LIST_GET_LAST(table->locks)) { + ret = TRUE; + } else { + ret = FALSE; + } + + lock_mutex_exit_kernel(); + + return(ret); +} + +/************************************************************************* +Checks if a waiting table lock request still has to wait in a queue. */ +static +ibool +lock_table_has_to_wait_in_queue( +/*============================*/ + /* out: TRUE if still has to wait */ + lock_t* wait_lock) /* in: waiting table lock */ +{ + dict_table_t* table; + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(lock_get_wait(wait_lock)); + + table = wait_lock->un_member.tab_lock.table; + + lock = UT_LIST_GET_FIRST(table->locks); + + while (lock != wait_lock) { + + if (lock_has_to_wait(wait_lock, lock)) { + + return(TRUE); + } + + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock); + } + + return(FALSE); +} + +/***************************************************************** +Removes a table lock request, waiting or granted, from the queue and grants +locks to other transactions in the queue, if they now are entitled to a +lock. */ +static +void +lock_table_dequeue( +/*===============*/ + lock_t* in_lock)/* in: table lock object; transactions waiting + behind will get their lock requests granted, if + they are now qualified to it */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + ut_a(lock_get_type_low(in_lock) == LOCK_TABLE); + + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock); + + lock_table_remove_low(in_lock); + + /* Check if waiting locks in the queue can now be granted: grant + locks if there are no conflicting locks ahead. 
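+	The scan starts from the lock that followed the removed one in
+	the queue, since locks ahead of the removed lock cannot have
+	been waiting behind it.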
*/ + + while (lock != NULL) { + + if (lock_get_wait(lock) + && !lock_table_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + lock_grant(lock); + } + + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock); + } +} + +/*=========================== LOCK RELEASE ==============================*/ + +/***************************************************************** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. */ +UNIV_INTERN +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /* in: transaction that has + set a record lock */ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec, /* in: record */ + enum lock_mode lock_mode)/* in: LOCK_S or LOCK_X */ +{ + lock_t* lock; + lock_t* release_lock = NULL; + ulint heap_no; + + ut_ad(trx && rec); + ut_ad(block->frame == page_align(rec)); + + heap_no = page_rec_get_heap_no(rec); + + mutex_enter(&kernel_mutex); + + lock = lock_rec_get_first(block, heap_no); + + /* Find the last lock with the same lock_mode and transaction + from the record. */ + + while (lock != NULL) { + if (lock->trx == trx && lock_get_mode(lock) == lock_mode) { + release_lock = lock; + ut_a(!lock_get_wait(lock)); + } + + lock = lock_rec_get_next(heap_no, lock); + } + + /* If a record lock is found, release the record lock */ + + if (UNIV_LIKELY(release_lock != NULL)) { + lock_rec_reset_nth_bit(release_lock, heap_no); + } else { + mutex_exit(&kernel_mutex); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: unlock row could not" + " find a %lu mode lock on the record\n", + (ulong) lock_mode); + + return; + } + + /* Check if we can now grant waiting lock requests */ + + lock = lock_rec_get_first(block, heap_no); + + while (lock != NULL) { + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + lock_grant(lock); + } + + lock = lock_rec_get_next(heap_no, lock); + } + + mutex_exit(&kernel_mutex); +} + +/************************************************************************* +Releases a table lock. +Releases possible other transactions waiting for this lock. */ +UNIV_INTERN +void +lock_table_unlock( +/*==============*/ + lock_t* lock) /* in: lock */ +{ + mutex_enter(&kernel_mutex); + + lock_table_dequeue(lock); + + mutex_exit(&kernel_mutex); +} + +/************************************************************************* +Releases transaction locks, and releases possible other transactions waiting +because of these locks. */ +UNIV_INTERN +void +lock_release_off_kernel( +/*====================*/ + trx_t* trx) /* in: transaction */ +{ + dict_table_t* table; + ulint count; + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = UT_LIST_GET_LAST(trx->trx_locks); + + count = 0; + + while (lock != NULL) { + + count++; + + if (lock_get_type_low(lock) == LOCK_REC) { + + lock_rec_dequeue_from_page(lock); + } else { + ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + + if (lock_get_mode(lock) != LOCK_IS + && !ut_dulint_is_zero(trx->undo_no)) { + + /* The trx may have modified the table. We + block the use of the MySQL query cache for + all currently active transactions. 
*/

+			table = lock->un_member.tab_lock.table;
+
+			table->query_cache_inv_trx_id
+				= trx_sys->max_trx_id;
+		}
+
+		lock_table_dequeue(lock);
+	}
+
+	if (count == LOCK_RELEASE_KERNEL_INTERVAL) {
+		/* Release the kernel mutex for a while, so that we
+		do not monopolize it */
+
+		lock_mutex_exit_kernel();
+
+		lock_mutex_enter_kernel();
+
+		count = 0;
+	}
+
+	lock = UT_LIST_GET_LAST(trx->trx_locks);
+	}
+
+	ut_a(ib_vector_size(trx->autoinc_locks) == 0);
+
+	mem_heap_empty(trx->lock_heap);
+}
+
+/*************************************************************************
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+	lock_t*	lock)	/* in: waiting lock request */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+
+		lock_rec_dequeue_from_page(lock);
+	} else {
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+		if (lock->trx->autoinc_locks != NULL) {
+			/* Release the transaction's AUTOINC locks. */
+			lock_release_autoinc_locks(lock->trx);
+		}
+
+		lock_table_dequeue(lock);
+	}
+
+	/* Reset the wait flag and the back pointer to lock in trx */
+
+	lock_reset_lock_and_trx_wait(lock);
+
+	/* The following function releases the trx from lock wait */
+
+	trx_end_lock_wait(lock->trx);
+}
+
+/* True if a lock mode is S or X */
+#define IS_LOCK_S_OR_X(lock) \
+	(lock_get_mode(lock) == LOCK_S \
+	 || lock_get_mode(lock) == LOCK_X)
+
+
+/*************************************************************************
+Removes the locks of a transaction on a table to be dropped.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock. */
+static
+void
+lock_remove_all_on_table_for_trx(
+/*=============================*/
+	dict_table_t*	table,			/* in: table to be dropped */
+	trx_t*		trx,			/* in: a transaction */
+	ibool		remove_also_table_sx_locks)/* in: also removes
+						table S and X locks */
+{
+	lock_t*	lock;
+	lock_t*	prev_lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	lock = UT_LIST_GET_LAST(trx->trx_locks);
+
+	while (lock != NULL) {
+		prev_lock = UT_LIST_GET_PREV(trx_locks, lock);
+
+		if (lock_get_type_low(lock) == LOCK_REC
+		    && lock->index->table == table) {
+			ut_a(!lock_get_wait(lock));
+
+			lock_rec_discard(lock);
+		} else if (lock_get_type_low(lock) & LOCK_TABLE
+			   && lock->un_member.tab_lock.table == table
+			   && (remove_also_table_sx_locks
+			       || !IS_LOCK_S_OR_X(lock))) {
+
+			ut_a(!lock_get_wait(lock));
+
+			lock_table_remove_low(lock);
+		}
+
+		lock = prev_lock;
+	}
+}
+
+/*************************************************************************
+Removes the locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock.
*/ +UNIV_INTERN +void +lock_remove_all_on_table( +/*=====================*/ + dict_table_t* table, /* in: table to be dropped + or truncated */ + ibool remove_also_table_sx_locks)/* in: also removes + table S and X locks */ +{ + lock_t* lock; + lock_t* prev_lock; + + mutex_enter(&kernel_mutex); + + lock = UT_LIST_GET_FIRST(table->locks); + + while (lock != NULL) { + + prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, + lock); + + /* If we should remove all locks (remove_also_table_sx_locks + is TRUE), or if the lock is not table-level S or X lock, + then check we are not going to remove a wait lock. */ + if (remove_also_table_sx_locks + || !(lock_get_type(lock) == LOCK_TABLE + && IS_LOCK_S_OR_X(lock))) { + + ut_a(!lock_get_wait(lock)); + } + + lock_remove_all_on_table_for_trx(table, lock->trx, + remove_also_table_sx_locks); + + if (prev_lock == NULL) { + if (lock == UT_LIST_GET_FIRST(table->locks)) { + /* lock was not removed, pick its successor */ + lock = UT_LIST_GET_NEXT( + un_member.tab_lock.locks, lock); + } else { + /* lock was removed, pick the first one */ + lock = UT_LIST_GET_FIRST(table->locks); + } + } else if (UT_LIST_GET_NEXT(un_member.tab_lock.locks, + prev_lock) != lock) { + /* If lock was removed by + lock_remove_all_on_table_for_trx() then pick the + successor of prev_lock ... */ + lock = UT_LIST_GET_NEXT( + un_member.tab_lock.locks, prev_lock); + } else { + /* ... otherwise pick the successor of lock. */ + lock = UT_LIST_GET_NEXT( + un_member.tab_lock.locks, lock); + } + } + + mutex_exit(&kernel_mutex); +} + +/*===================== VALIDATION AND DEBUGGING ====================*/ + +/************************************************************************* +Prints info of a table lock. */ +UNIV_INTERN +void +lock_table_print( +/*=============*/ + FILE* file, /* in: file where to print */ + const lock_t* lock) /* in: table type lock */ +{ + ut_ad(mutex_own(&kernel_mutex)); + ut_a(lock_get_type_low(lock) == LOCK_TABLE); + + fputs("TABLE LOCK table ", file); + ut_print_name(file, lock->trx, TRUE, + lock->un_member.tab_lock.table->name); + fprintf(file, " trx id " TRX_ID_FMT, + TRX_ID_PREP_PRINTF(lock->trx->id)); + + if (lock_get_mode(lock) == LOCK_S) { + fputs(" lock mode S", file); + } else if (lock_get_mode(lock) == LOCK_X) { + fputs(" lock mode X", file); + } else if (lock_get_mode(lock) == LOCK_IS) { + fputs(" lock mode IS", file); + } else if (lock_get_mode(lock) == LOCK_IX) { + fputs(" lock mode IX", file); + } else if (lock_get_mode(lock) == LOCK_AUTO_INC) { + fputs(" lock mode AUTO-INC", file); + } else { + fprintf(file, " unknown lock mode %lu", + (ulong) lock_get_mode(lock)); + } + + if (lock_get_wait(lock)) { + fputs(" waiting", file); + } + + putc('\n', file); +} + +/************************************************************************* +Prints info of a record lock. 
*/ +UNIV_INTERN +void +lock_rec_print( +/*===========*/ + FILE* file, /* in: file where to print */ + const lock_t* lock) /* in: record type lock */ +{ + const buf_block_t* block; + ulint space; + ulint page_no; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(mutex_own(&kernel_mutex)); + ut_a(lock_get_type_low(lock) == LOCK_REC); + + space = lock->un_member.rec_lock.space; + page_no = lock->un_member.rec_lock.page_no; + + fprintf(file, "RECORD LOCKS space id %lu page no %lu n bits %lu ", + (ulong) space, (ulong) page_no, + (ulong) lock_rec_get_n_bits(lock)); + dict_index_name_print(file, lock->trx, lock->index); + fprintf(file, " trx id " TRX_ID_FMT, + TRX_ID_PREP_PRINTF(lock->trx->id)); + + if (lock_get_mode(lock) == LOCK_S) { + fputs(" lock mode S", file); + } else if (lock_get_mode(lock) == LOCK_X) { + fputs(" lock_mode X", file); + } else { + ut_error; + } + + if (lock_rec_get_gap(lock)) { + fputs(" locks gap before rec", file); + } + + if (lock_rec_get_rec_not_gap(lock)) { + fputs(" locks rec but not gap", file); + } + + if (lock_rec_get_insert_intention(lock)) { + fputs(" insert intention", file); + } + + if (lock_get_wait(lock)) { + fputs(" waiting", file); + } + + mtr_start(&mtr); + + putc('\n', file); + + if ( srv_show_verbose_locks ) { + block = buf_page_try_get(space, page_no, &mtr); + if (block) { + for (i = 0; i < lock_rec_get_n_bits(lock); i++) { + + if (lock_rec_get_nth_bit(lock, i)) { + + const rec_t* rec + = page_find_rec_with_heap_no( + buf_block_get_frame(block), i); + offsets = rec_get_offsets( + rec, lock->index, offsets, + ULINT_UNDEFINED, &heap); + + fprintf(file, "Record lock, heap no %lu ", + (ulong) i); + rec_print_new(file, rec, offsets); + putc('\n', file); + } + } + } else { + for (i = 0; i < lock_rec_get_n_bits(lock); i++) { + fprintf(file, "Record lock, heap no %lu\n", (ulong) i); + } + } + } + mtr_commit(&mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +#ifndef UNIV_HOTBACKUP + +#ifdef UNIV_DEBUG +/* Print the number of lock structs from lock_print_info_summary() only +in non-production builds for performance reasons, see +http://bugs.mysql.com/36942 */ +#define PRINT_NUM_OF_LOCK_STRUCTS +#endif /* UNIV_DEBUG */ + +#ifdef PRINT_NUM_OF_LOCK_STRUCTS +/************************************************************************* +Calculates the number of record lock structs in the record lock hash table. */ +static +ulint +lock_get_n_rec_locks(void) +/*======================*/ +{ + lock_t* lock; + ulint n_locks = 0; + ulint i; + + ut_ad(mutex_own(&kernel_mutex)); + + for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { + + lock = HASH_GET_FIRST(lock_sys->rec_hash, i); + + while (lock) { + n_locks++; + + lock = HASH_GET_NEXT(hash, lock); + } + } + + return(n_locks); +} +#endif /* PRINT_NUM_OF_LOCK_STRUCTS */ + +/************************************************************************* +Prints info of locks for all transactions. */ +UNIV_INTERN +void +lock_print_info_summary( +/*====================*/ + FILE* file) /* in: file where to print */ +{ + /* We must protect the MySQL thd->query field with a MySQL mutex, and + because the MySQL mutex must be reserved before the kernel_mutex of + InnoDB, we call innobase_mysql_prepare_print_arbitrary_thd() here. 
*/ + + innobase_mysql_prepare_print_arbitrary_thd(); + lock_mutex_enter_kernel(); + + if (lock_deadlock_found) { + fputs("------------------------\n" + "LATEST DETECTED DEADLOCK\n" + "------------------------\n", file); + + ut_copy_file(file, lock_latest_err_file); + } + + fputs("------------\n" + "TRANSACTIONS\n" + "------------\n", file); + + fprintf(file, "Trx id counter " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF(trx_sys->max_trx_id)); + + fprintf(file, + "Purge done for trx's n:o < " TRX_ID_FMT + " undo n:o < " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF(purge_sys->purge_trx_no), + TRX_ID_PREP_PRINTF(purge_sys->purge_undo_no)); + + fprintf(file, + "History list length %lu\n", + (ulong) trx_sys->rseg_history_len); + +#ifdef PRINT_NUM_OF_LOCK_STRUCTS + fprintf(file, + "Total number of lock structs in row lock hash table %lu\n", + (ulong) lock_get_n_rec_locks()); +#endif /* PRINT_NUM_OF_LOCK_STRUCTS */ +} + +/************************************************************************* +Prints info of locks for each transaction. */ +UNIV_INTERN +void +lock_print_info_all_transactions( +/*=============================*/ + FILE* file) /* in: file where to print */ +{ + lock_t* lock; + ibool load_page_first = TRUE; + ulint nth_trx = 0; + ulint nth_lock = 0; + ulint i; + mtr_t mtr; + trx_t* trx; + + fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n"); + + /* First print info on non-active transactions */ + + trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); + + while (trx) { + if (trx->conc_state == TRX_NOT_STARTED) { + fputs("---", file); + trx_print(file, trx, 600); + } + + trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); + } + +loop: + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + i = 0; + + /* Since we temporarily release the kernel mutex when + reading a database page in below, variable trx may be + obsolete now and we must loop through the trx list to + get probably the same trx, or some other trx. */ + + while (trx && (i < nth_trx)) { + trx = UT_LIST_GET_NEXT(trx_list, trx); + i++; + } + + if (trx == NULL) { + lock_mutex_exit_kernel(); + innobase_mysql_end_print_arbitrary_thd(); + + ut_ad(lock_validate()); + + return; + } + + if (nth_lock == 0) { + fputs("---", file); + trx_print(file, trx, 600); + + if (trx->read_view) { + fprintf(file, + "Trx read view will not see trx with" + " id >= " TRX_ID_FMT + ", sees < " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF( + trx->read_view->low_limit_id), + TRX_ID_PREP_PRINTF( + trx->read_view->up_limit_id)); + } + + if (trx->que_state == TRX_QUE_LOCK_WAIT) { + fprintf(file, + "------- TRX HAS BEEN WAITING %lu SEC" + " FOR THIS LOCK TO BE GRANTED:\n", + (ulong) difftime(time(NULL), + trx->wait_started)); + + if (lock_get_type_low(trx->wait_lock) == LOCK_REC) { + lock_rec_print(file, trx->wait_lock); + } else { + lock_table_print(file, trx->wait_lock); + } + + fputs("------------------\n", file); + } + } + + if (!srv_print_innodb_lock_monitor && !srv_show_locks_held) { + nth_trx++; + goto loop; + } + + i = 0; + + /* Look at the note about the trx loop above why we loop here: + lock may be an obsolete pointer now. 
*/ + + lock = UT_LIST_GET_FIRST(trx->trx_locks); + + while (lock && (i < nth_lock)) { + lock = UT_LIST_GET_NEXT(trx_locks, lock); + i++; + } + + if (lock == NULL) { + nth_trx++; + nth_lock = 0; + + goto loop; + } + + if (lock_get_type_low(lock) == LOCK_REC) { + if (load_page_first) { + ulint space = lock->un_member.rec_lock.space; + ulint zip_size= fil_space_get_zip_size(space); + ulint page_no = lock->un_member.rec_lock.page_no; + + lock_mutex_exit_kernel(); + innobase_mysql_end_print_arbitrary_thd(); + + mtr_start(&mtr); + + buf_page_get_with_no_latch(space, zip_size, + page_no, &mtr); + + mtr_commit(&mtr); + + load_page_first = FALSE; + + innobase_mysql_prepare_print_arbitrary_thd(); + lock_mutex_enter_kernel(); + + goto loop; + } + + lock_rec_print(file, lock); + } else { + ut_ad(lock_get_type_low(lock) & LOCK_TABLE); + + lock_table_print(file, lock); + } + + load_page_first = TRUE; + + nth_lock++; + + if (nth_lock >= srv_show_locks_held) { + fputs("TOO LOCKS PRINTED FOR THIS TRX:" + " SUPPRESSING FURTHER PRINTS\n", + file); + + nth_trx++; + nth_lock = 0; + + goto loop; + } + + goto loop; +} + +# ifdef UNIV_DEBUG +/************************************************************************* +Validates the lock queue on a table. */ +static +ibool +lock_table_queue_validate( +/*======================*/ + /* out: TRUE if ok */ + dict_table_t* table) /* in: table */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = UT_LIST_GET_FIRST(table->locks); + + while (lock) { + ut_a(((lock->trx)->conc_state == TRX_ACTIVE) + || ((lock->trx)->conc_state == TRX_PREPARED) + || ((lock->trx)->conc_state == TRX_COMMITTED_IN_MEMORY)); + + if (!lock_get_wait(lock)) { + + ut_a(!lock_table_other_has_incompatible( + lock->trx, 0, table, + lock_get_mode(lock))); + } else { + + ut_a(lock_table_has_to_wait_in_queue(lock)); + } + + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock); + } + + return(TRUE); +} + +/************************************************************************* +Validates the lock queue on a single record. 
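+Checks, among other things, that a granted non-gap lock has no
+conflicting explicit lock requests granted to other transactions, and
+that every waiting lock in the queue indeed has a reason to wait.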
*/ +static +ibool +lock_rec_queue_validate( +/*====================*/ + /* out: TRUE if ok */ + const buf_block_t* block, /* in: buffer block containing rec */ + const rec_t* rec, /* in: record to look at */ + dict_index_t* index, /* in: index, or NULL if not known */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + trx_t* impl_trx; + lock_t* lock; + ulint heap_no; + + ut_a(rec); + ut_a(block->frame == page_align(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + + heap_no = page_rec_get_heap_no(rec); + + lock_mutex_enter_kernel(); + + if (!page_rec_is_user_rec(rec)) { + + lock = lock_rec_get_first(block, heap_no); + + while (lock) { + switch(lock->trx->conc_state) { + case TRX_ACTIVE: + case TRX_PREPARED: + case TRX_COMMITTED_IN_MEMORY: + break; + default: + ut_error; + } + + ut_a(trx_in_trx_list(lock->trx)); + + if (lock_get_wait(lock)) { + ut_a(lock_rec_has_to_wait_in_queue(lock)); + } + + if (index) { + ut_a(lock->index == index); + } + + lock = lock_rec_get_next(heap_no, lock); + } + + lock_mutex_exit_kernel(); + + return(TRUE); + } + + if (!index); + else if (dict_index_is_clust(index)) { + + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); + + if (impl_trx + && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, + block, heap_no, impl_trx)) { + + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, impl_trx)); + } + } else { + + /* The kernel mutex may get released temporarily in the + next function call: we have to release lock table mutex + to obey the latching order */ + + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); + + if (impl_trx + && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, + block, heap_no, impl_trx)) { + + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, impl_trx)); + } + } + + lock = lock_rec_get_first(block, heap_no); + + while (lock) { + ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED + || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); + ut_a(trx_in_trx_list(lock->trx)); + + if (index) { + ut_a(lock->index == index); + } + + if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) { + + enum lock_mode mode; + + if (lock_get_mode(lock) == LOCK_S) { + mode = LOCK_X; + } else { + mode = LOCK_S; + } + ut_a(!lock_rec_other_has_expl_req( + mode, 0, 0, block, heap_no, lock->trx)); + + } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { + + ut_a(lock_rec_has_to_wait_in_queue(lock)); + } + + lock = lock_rec_get_next(heap_no, lock); + } + + lock_mutex_exit_kernel(); + + return(TRUE); +} + +/************************************************************************* +Validates the record lock queues on a page. 
*/ +static +ibool +lock_rec_validate_page( +/*===================*/ + /* out: TRUE if ok */ + ulint space, /* in: space id */ + ulint page_no)/* in: page number */ +{ + dict_index_t* index; + buf_block_t* block; + const page_t* page; + lock_t* lock; + const rec_t* rec; + ulint nth_lock = 0; + ulint nth_bit = 0; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(!mutex_own(&kernel_mutex)); + + mtr_start(&mtr); + + block = buf_page_get(space, fil_space_get_zip_size(space), + page_no, RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + page = block->frame; + + lock_mutex_enter_kernel(); +loop: + lock = lock_rec_get_first_on_page_addr(space, page_no); + + if (!lock) { + goto function_exit; + } + + for (i = 0; i < nth_lock; i++) { + + lock = lock_rec_get_next_on_page(lock); + + if (!lock) { + goto function_exit; + } + } + + ut_a(trx_in_trx_list(lock->trx)); + ut_a(lock->trx->conc_state == TRX_ACTIVE + || lock->trx->conc_state == TRX_PREPARED + || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY); + + for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) { + + if (i == 1 || lock_rec_get_nth_bit(lock, i)) { + + index = lock->index; + rec = page_find_rec_with_heap_no(page, i); + ut_a(rec); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + fprintf(stderr, + "Validating %lu %lu\n", + (ulong) space, (ulong) page_no); + + lock_mutex_exit_kernel(); + + lock_rec_queue_validate(block, rec, index, offsets); + + lock_mutex_enter_kernel(); + + nth_bit = i + 1; + + goto loop; + } + } + + nth_bit = 0; + nth_lock++; + + goto loop; + +function_exit: + lock_mutex_exit_kernel(); + + mtr_commit(&mtr); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************************* +Validates the lock system. */ +static +ibool +lock_validate(void) +/*===============*/ + /* out: TRUE if ok */ +{ + lock_t* lock; + trx_t* trx; + dulint limit; + ulint space; + ulint page_no; + ulint i; + + lock_mutex_enter_kernel(); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + lock = UT_LIST_GET_FIRST(trx->trx_locks); + + while (lock) { + if (lock_get_type_low(lock) & LOCK_TABLE) { + + lock_table_queue_validate( + lock->un_member.tab_lock.table); + } + + lock = UT_LIST_GET_NEXT(trx_locks, lock); + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) { + + limit = ut_dulint_zero; + + for (;;) { + lock = HASH_GET_FIRST(lock_sys->rec_hash, i); + + while (lock) { + ut_a(trx_in_trx_list(lock->trx)); + + space = lock->un_member.rec_lock.space; + page_no = lock->un_member.rec_lock.page_no; + + if (ut_dulint_cmp( + ut_dulint_create(space, page_no), + limit) >= 0) { + break; + } + + lock = HASH_GET_NEXT(hash, lock); + } + + if (!lock) { + + break; + } + + lock_mutex_exit_kernel(); + + lock_rec_validate_page(space, page_no); + + lock_mutex_enter_kernel(); + + limit = ut_dulint_create(space, page_no + 1); + } + } + + lock_mutex_exit_kernel(); + + return(TRUE); +} +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ +/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/ + +/************************************************************************* +Checks if locks of other transactions prevent an immediate insert of +a record. 
If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue. */
+UNIV_INTERN
+ulint
+lock_rec_insert_check_and_lock(
+/*===========================*/
+				/* out: DB_SUCCESS, DB_LOCK_WAIT,
+				DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+	ulint		flags,	/* in: if BTR_NO_LOCKING_FLAG bit is
+				set, does nothing */
+	rec_t*		rec,	/* in: record after which to insert */
+	buf_block_t*	block,	/* in/out: buffer block of rec */
+	dict_index_t*	index,	/* in: index */
+	que_thr_t*	thr,	/* in: query thread */
+	ibool*		inherit)/* out: set to TRUE if the newly
+				inserted record may need to inherit
+				LOCK_GAP type locks from the successor
+				record */
+{
+	const rec_t*	next_rec;
+	trx_t*		trx;
+	lock_t*		lock;
+	ulint		err;
+	ulint		next_rec_heap_no;
+
+	ut_ad(block->frame == page_align(rec));
+
+	if (flags & BTR_NO_LOCKING_FLAG) {
+
+		return(DB_SUCCESS);
+	}
+
+	trx = thr_get_trx(thr);
+	next_rec = page_rec_get_next(rec);
+	next_rec_heap_no = page_rec_get_heap_no(next_rec);
+
+	lock_mutex_enter_kernel();
+
+	/* When inserting a record into an index, the table must be at
+	least IX-locked or we must be building an index, in which case
+	the table must be at least S-locked. */
+	ut_ad(lock_table_has(trx, index->table, LOCK_IX)
+	      || (*index->name == TEMP_INDEX_PREFIX
+		  && lock_table_has(trx, index->table, LOCK_S)));
+
+	lock = lock_rec_get_first(block, next_rec_heap_no);
+
+	if (UNIV_LIKELY(lock == NULL)) {
+		/* We optimize CPU time usage in the simplest case */
+
+		lock_mutex_exit_kernel();
+
+		if (!dict_index_is_clust(index)) {
+			/* Update the page max trx id field */
+			page_update_max_trx_id(block,
+					       buf_block_get_page_zip(block),
+					       trx->id);
+		}
+
+		*inherit = FALSE;
+
+		return(DB_SUCCESS);
+	}
+
+	*inherit = TRUE;
+
+	/* If another transaction has an explicit lock request which locks
+	the gap, waiting or granted, on the successor, the insert has to wait.
+
+	An exception is the case where the lock held by the other transaction
+	is a gap type lock which it placed to wait for its turn to insert. We
+	do not consider that kind of lock conflicting with our insert. This
+	eliminates an unnecessary deadlock which would otherwise arise when
+	two transactions both have waiting gap type lock requests on the
+	successor. */
+
+	if (lock_rec_other_has_conflicting(
+		    LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+		    block, next_rec_heap_no, trx)) {
+
+		/* Note that we may get DB_SUCCESS also here!
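+		That happens if the deadlock check run by
+		lock_rec_enqueue_waiting() chooses another transaction
+		as the victim and our lock request is granted right
+		away, so there is nothing to wait for.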
*/ + err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP + | LOCK_INSERT_INTENTION, + block, next_rec_heap_no, + index, thr); + } else { + err = DB_SUCCESS; + } + + lock_mutex_exit_kernel(); + + if ((err == DB_SUCCESS) && !dict_index_is_clust(index)) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + trx->id); + } + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(next_rec, index, offsets_, + ULINT_UNDEFINED, &heap); + ut_ad(lock_rec_queue_validate(block, + next_rec, index, offsets)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ + + return(err); +} + +/************************************************************************* +If a transaction has an implicit x-lock on a record, but no explicit x-lock +set on the record, sets one for it. NOTE that in the case of a secondary +index, the kernel mutex may get temporarily released. */ +static +void +lock_rec_convert_impl_to_expl( +/*==========================*/ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: user record on page */ + dict_index_t* index, /* in: index of record */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + trx_t* impl_trx; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + + if (dict_index_is_clust(index)) { + impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); + } else { + impl_trx = lock_sec_rec_some_has_impl_off_kernel( + rec, index, offsets); + } + + if (impl_trx) { + ulint heap_no = page_rec_get_heap_no(rec); + + /* If the transaction has no explicit x-lock set on the + record, set one for it */ + + if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, + heap_no, impl_trx)) { + + lock_rec_add_to_queue( + LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, index, impl_trx); + } + } +} + +/************************************************************************* +Checks if locks of other transactions prevent an immediate modify (update, +delete mark, or delete unmark) of a clustered index record. If they do, +first tests if the query thread should anyway be suspended for some +reason; if not, then puts the transaction and the query thread to the +lock wait state and inserts a waiting request for a record x-lock to the +lock queue. */ +UNIV_INTERN +ulint +lock_clust_rec_modify_check_and_lock( +/*=================================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: record which should be + modified */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + ulint heap_no; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(dict_index_is_clust(index)); + ut_ad(block->frame == page_align(rec)); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + heap_no = rec_offs_comp(offsets) + ? 
rec_get_heap_no_new(rec) + : rec_get_heap_no_old(rec); + + lock_mutex_enter_kernel(); + + ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + + /* If a transaction has no explicit x-lock set on the record, set one + for it */ + + lock_rec_convert_impl_to_expl(block, rec, index, offsets); + + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, index, thr); + + lock_mutex_exit_kernel(); + + ut_ad(lock_rec_queue_validate(block, rec, index, offsets)); + + return(err); +} + +/************************************************************************* +Checks if locks of other transactions prevent an immediate modify (delete +mark or delete unmark) of a secondary index record. */ +UNIV_INTERN +ulint +lock_sec_rec_modify_check_and_lock( +/*===============================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + buf_block_t* block, /* in/out: buffer block of rec */ + rec_t* rec, /* in: record which should be + modified; NOTE: as this is a secondary + index, we always have to modify the + clustered index record first: see the + comment below */ + dict_index_t* index, /* in: secondary index */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + ulint heap_no; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(block->frame == page_align(rec)); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + heap_no = page_rec_get_heap_no(rec); + + /* Another transaction cannot have an implicit lock on the record, + because when we come here, we already have modified the clustered + index record, and this would not have been possible if another active + transaction had modified this secondary index record. */ + + lock_mutex_enter_kernel(); + + ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, + block, heap_no, index, thr); + + lock_mutex_exit_kernel(); + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + ut_ad(lock_rec_queue_validate(block, rec, index, offsets)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ + + if (err == DB_SUCCESS) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + thr_get_trx(thr)->id); + } + + return(err); +} + +/************************************************************************* +Like the counterpart for a clustered index below, but now we read a +secondary index record. 
*/ +UNIV_INTERN +ulint +lock_sec_rec_read_check_and_lock( +/*=============================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /* in: secondary index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /* in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + ulint heap_no; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(block->frame == page_align(rec)); + ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mode == LOCK_X || mode == LOCK_S); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + heap_no = page_rec_get_heap_no(rec); + + lock_mutex_enter_kernel(); + + ut_ad(mode != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad(mode != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + + /* Some transaction may have an implicit x-lock on the record only + if the max trx id for the page >= min trx id for the trx list or a + database recovery is running. */ + + if (((ut_dulint_cmp(page_get_max_trx_id(block->frame), + trx_list_get_min_trx_id()) >= 0) + || recv_recovery_is_on()) + && !page_rec_is_supremum(rec)) { + + lock_rec_convert_impl_to_expl(block, rec, index, offsets); + } + + err = lock_rec_lock(FALSE, mode | gap_mode, + block, heap_no, index, thr); + + lock_mutex_exit_kernel(); + + ut_ad(lock_rec_queue_validate(block, rec, index, offsets)); + + return(err); +} + +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. 
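The implicit-lock test in lock_sec_rec_read_check_and_lock() above reduces to a single predicate: an implicit x-lock can exist on a secondary index record only if a transaction that wrote to the page may still be active, or if recovery is running (and never for the supremum record). A minimal sketch of that predicate, with stand-in C99 types rather than the engine's:

#include <stdbool.h>
#include <stdint.h>

/* Sketch: conversion of an implicit lock to an explicit one is only
   needed when the page may carry a change by a still-active trx. */
static bool
implicit_lock_possible(uint64_t page_max_trx_id,
                       uint64_t min_active_trx_id,
                       bool     recovery_is_on)
{
	return(recovery_is_on || page_max_trx_id >= min_active_trx_id);
}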
*/ +UNIV_INTERN +ulint +lock_clust_rec_read_check_and_lock( +/*===============================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + enum lock_mode mode, /* in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + ulint heap_no; + + ut_ad(dict_index_is_clust(index)); + ut_ad(block->frame == page_align(rec)); + ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); + ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP + || gap_mode == LOCK_REC_NOT_GAP); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (flags & BTR_NO_LOCKING_FLAG) { + + return(DB_SUCCESS); + } + + heap_no = page_rec_get_heap_no(rec); + + lock_mutex_enter_kernel(); + + ut_ad(mode != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad(mode != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + + if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) { + + lock_rec_convert_impl_to_expl(block, rec, index, offsets); + } + + err = lock_rec_lock(FALSE, mode | gap_mode, + block, heap_no, index, thr); + + lock_mutex_exit_kernel(); + + ut_ad(lock_rec_queue_validate(block, rec, index, offsets)); + + return(err); +} +/************************************************************************* +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". 
*/ +UNIV_INTERN +ulint +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + /* out: DB_SUCCESS, + DB_LOCK_WAIT, DB_DEADLOCK, or + DB_QUE_THR_SUSPENDED */ + ulint flags, /* in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /* in: buffer block of rec */ + const rec_t* rec, /* in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /* in: clustered index */ + enum lock_mode mode, /* in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ulint ret; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + ret = lock_clust_rec_read_check_and_lock(flags, block, rec, index, + offsets, mode, gap_mode, thr); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + return(ret); +} + +/*********************************************************************** +Release the last lock from the transaction's autoinc locks. */ +UNIV_INLINE +void +lock_release_autoinc_last_lock( +/*===========================*/ + ib_vector_t* autoinc_locks) /* in/out: vector of AUTOINC locks */ +{ + ulint last; + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + ut_a(!ib_vector_is_empty(autoinc_locks)); + + /* The lock to be released must be the last lock acquired. */ + last = ib_vector_size(autoinc_locks) - 1; + lock = ib_vector_get(autoinc_locks, last); + + /* Should have only AUTOINC locks in the vector. */ + ut_a(lock_get_mode(lock) == LOCK_AUTO_INC); + ut_a(lock_get_type(lock) == LOCK_TABLE); + + ut_a(lock->un_member.tab_lock.table != NULL); + + /* This will remove the lock from the trx autoinc_locks too. */ + lock_table_dequeue(lock); +} + +/*********************************************************************** +Release all the transaction's autoinc locks. */ +UNIV_INTERN +void +lock_release_autoinc_locks( +/*=======================*/ + trx_t* trx) /* in/out: transaction */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + ut_a(trx->autoinc_locks != NULL); + + /* We release the locks in reverse order. This is to + avoid searching the vector for the element to delete at + the lower level. See lock_table_remove_low() for details. */ + while (!ib_vector_is_empty(trx->autoinc_locks)) { + + /* lock_table_remove_low() will also remove the lock from + the transaction's autoinc_locks vector. */ + lock_release_autoinc_last_lock(trx->autoinc_locks); + } + + /* Should release all locks. */ + ut_a(ib_vector_is_empty(trx->autoinc_locks)); +} + +/*********************************************************************** +Gets the type of a lock. Non-inline version for using outside of the +lock module. */ +UNIV_INTERN +ulint +lock_get_type( +/*==========*/ + /* out: LOCK_TABLE or LOCK_REC */ + const lock_t* lock) /* in: lock */ +{ + return(lock_get_type_low(lock)); +} + +/*********************************************************************** +Gets the id of the transaction owning a lock.
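The strict LIFO order in lock_release_autoinc_locks() above exists because lock_table_remove_low() deletes from the tail of the trx->autoinc_locks vector; releasing any other element would force a linear scan. A toy sketch of the idea (tiny_vec_t is illustrative, not the ib_vector_t API):

#include <assert.h>
#include <stddef.h>

typedef struct { void* elems[16]; size_t used; } tiny_vec_t;

/* Popping the most recently pushed element is O(1); this is why the
   last AUTOINC lock acquired must be the first one released. */
static void*
tiny_vec_pop_back(tiny_vec_t* v)
{
	assert(v->used > 0);
	return v->elems[--v->used];
}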
*/ +UNIV_INTERN +ullint +lock_get_trx_id( +/*============*/ + /* out: transaction id */ + const lock_t* lock) /* in: lock */ +{ + return(trx_get_id(lock->trx)); +} + +/*********************************************************************** +Gets the mode of a lock in a human readable string. +The string should not be free()'d or modified. */ +UNIV_INTERN +const char* +lock_get_mode_str( +/*==============*/ + /* out: lock mode */ + const lock_t* lock) /* in: lock */ +{ + ibool is_gap_lock; + + is_gap_lock = lock_get_type_low(lock) == LOCK_REC + && lock_rec_get_gap(lock); + + switch (lock_get_mode(lock)) { + case LOCK_S: + if (is_gap_lock) { + return("S,GAP"); + } else { + return("S"); + } + case LOCK_X: + if (is_gap_lock) { + return("X,GAP"); + } else { + return("X"); + } + case LOCK_IS: + if (is_gap_lock) { + return("IS,GAP"); + } else { + return("IS"); + } + case LOCK_IX: + if (is_gap_lock) { + return("IX,GAP"); + } else { + return("IX"); + } + case LOCK_AUTO_INC: + return("AUTO_INC"); + default: + return("UNKNOWN"); + } +} + +/*********************************************************************** +Gets the type of a lock in a human readable string. +The string should not be free()'d or modified. */ +UNIV_INTERN +const char* +lock_get_type_str( +/*==============*/ + /* out: lock type */ + const lock_t* lock) /* in: lock */ +{ + switch (lock_get_type_low(lock)) { + case LOCK_REC: + return("RECORD"); + case LOCK_TABLE: + return("TABLE"); + default: + return("UNKNOWN"); + } +} + +/*********************************************************************** +Gets the table on which the lock is. */ +UNIV_INLINE +dict_table_t* +lock_get_table( +/*===========*/ + /* out: table */ + const lock_t* lock) /* in: lock */ +{ + switch (lock_get_type_low(lock)) { + case LOCK_REC: + return(lock->index->table); + case LOCK_TABLE: + return(lock->un_member.tab_lock.table); + default: + ut_error; + return(NULL); + } +} + +/*********************************************************************** +Gets the id of the table on which the lock is. */ +UNIV_INTERN +ullint +lock_get_table_id( +/*==============*/ + /* out: id of the table */ + const lock_t* lock) /* in: lock */ +{ + dict_table_t* table; + + table = lock_get_table(lock); + + return((ullint)ut_conv_dulint_to_longlong(table->id)); +} + +/*********************************************************************** +Gets the name of the table on which the lock is. +The string should not be free()'d or modified. */ +UNIV_INTERN +const char* +lock_get_table_name( +/*================*/ + /* out: name of the table */ + const lock_t* lock) /* in: lock */ +{ + dict_table_t* table; + + table = lock_get_table(lock); + + return(table->name); +} + +/*********************************************************************** +For a record lock, gets the index on which the lock is. */ +UNIV_INTERN +const dict_index_t* +lock_rec_get_index( +/*===============*/ + /* out: index */ + const lock_t* lock) /* in: lock */ +{ + ut_a(lock_get_type_low(lock) == LOCK_REC); + + return(lock->index); +} + +/*********************************************************************** +For a record lock, gets the name of the index on which the lock is. +The string should not be free()'d or modified. 
*/ +UNIV_INTERN +const char* +lock_rec_get_index_name( +/*====================*/ + /* out: name of the index */ + const lock_t* lock) /* in: lock */ +{ + ut_a(lock_get_type_low(lock) == LOCK_REC); + + return(lock->index->name); +} + +/*********************************************************************** +For a record lock, gets the tablespace number on which the lock is. */ +UNIV_INTERN +ulint +lock_rec_get_space_id( +/*==================*/ + /* out: tablespace number */ + const lock_t* lock) /* in: lock */ +{ + ut_a(lock_get_type_low(lock) == LOCK_REC); + + return(lock->un_member.rec_lock.space); +} + +/*********************************************************************** +For a record lock, gets the page number on which the lock is. */ +UNIV_INTERN +ulint +lock_rec_get_page_no( +/*=================*/ + /* out: page number */ + const lock_t* lock) /* in: lock */ +{ + ut_a(lock_get_type_low(lock) == LOCK_REC); + + return(lock->un_member.rec_lock.page_no); +} diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c new file mode 100644 index 00000000000..d2ea0507705 --- /dev/null +++ b/storage/xtradb/log/log0log.c @@ -0,0 +1,3316 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Database log + +Created 12/9/1995 Heikki Tuuri +*******************************************************/ + +#include "log0log.h" + +#ifdef UNIV_NONINL +#include "log0log.ic" +#endif + +#include "mem0mem.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "srv0srv.h" +#include "log0recv.h" +#include "fil0fil.h" +#include "dict0boot.h" +#include "srv0start.h" +#include "trx0sys.h" +#include "trx0trx.h" + +/* +General philosophy of InnoDB redo-logs: + +1) Every change to the contents of a data page must be done +through an mtr, which in mtr_commit() writes log records +to the InnoDB redo log. + +2) Normally these changes are performed using mlog_write_ulint() +or a similar function. + +3) In some page-level operations, only a code number of a +c-function and its parameters are written to the log to +reduce the size of the log. + + 3a) You should not add parameters to these kinds of functions + (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()) + + 3b) You should not add functionality which either changes the + behavior compared with the old one or depends on data outside + of the page. These kinds of functions should implement a + self-contained page transformation, and they should remain + unchanged unless you have very essential reasons to change the + log semantics or format.
+ +*/ + +/* Current free limit of space 0; protected by the log sys mutex; 0 means +uninitialized */ +UNIV_INTERN ulint log_fsp_current_free_limit = 0; + +/* Global log system variable */ +UNIV_INTERN log_t* log_sys = NULL; + +#ifdef UNIV_DEBUG +UNIV_INTERN ibool log_do_write = TRUE; + +UNIV_INTERN ibool log_debug_writes = FALSE; +#endif /* UNIV_DEBUG */ + +/* These control how often we print warnings if the last checkpoint is too +old */ +UNIV_INTERN ibool log_has_printed_chkp_warning = FALSE; +UNIV_INTERN time_t log_last_warning_time; + +#ifdef UNIV_LOG_ARCHIVE +/* Pointer to this variable is used as the i/o-message when we do i/o to an +archive */ +UNIV_INTERN byte log_archive_io; +#endif /* UNIV_LOG_ARCHIVE */ + +/* A margin for free space in the log buffer before a log entry is catenated */ +#define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE) + +/* Margins for free space in the log buffer after a log entry is catenated */ +#define LOG_BUF_FLUSH_RATIO 2 +#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE) + +/* Margin for the free space in the smallest log group, before a new query +step which modifies the database, is started */ + +#define LOG_CHECKPOINT_FREE_PER_THREAD (4 * UNIV_PAGE_SIZE) +#define LOG_CHECKPOINT_EXTRA_FREE (8 * UNIV_PAGE_SIZE) + +/* This parameter controls asynchronous making of a new checkpoint; the value +should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */ + +#define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32 + +/* This parameter controls synchronous preflushing of modified buffer pages */ +#define LOG_POOL_PREFLUSH_RATIO_SYNC 16 + +/* The same ratio for asynchronous preflushing; this value should be less than +the previous */ +#define LOG_POOL_PREFLUSH_RATIO_ASYNC 8 + +/* Extra margin, in addition to one log file, used in archiving */ +#define LOG_ARCHIVE_EXTRA_MARGIN (4 * UNIV_PAGE_SIZE) + +/* This parameter controls asynchronous writing to the archive */ +#define LOG_ARCHIVE_RATIO_ASYNC 16 + +/* Codes used in unlocking flush latches */ +#define LOG_UNLOCK_NONE_FLUSHED_LOCK 1 +#define LOG_UNLOCK_FLUSH_LOCK 2 + +/* States of an archiving operation */ +#define LOG_ARCHIVE_READ 1 +#define LOG_ARCHIVE_WRITE 2 + +/********************************************************** +Completes a checkpoint write i/o to a log file. */ +static +void +log_io_complete_checkpoint(void); +/*============================*/ +#ifdef UNIV_LOG_ARCHIVE +/********************************************************** +Completes an archiving i/o. */ +static +void +log_io_complete_archive(void); +/*=========================*/ +#endif /* UNIV_LOG_ARCHIVE */ + +/******************************************************************** +Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint, +so that we know that the limit has been written to a log checkpoint field +on disk. */ +UNIV_INTERN +void +log_fsp_current_free_limit_set_and_checkpoint( +/*==========================================*/ + ulint limit) /* in: limit to set */ +{ + ibool success; + + mutex_enter(&(log_sys->mutex)); + + log_fsp_current_free_limit = limit; + + mutex_exit(&(log_sys->mutex)); + + /* Try to make a synchronous checkpoint */ + + success = FALSE; + + while (!success) { + success = log_checkpoint(TRUE, TRUE); + } +} + +/******************************************************************** +Returns the oldest modified block lsn in the pool, or log_sys->lsn if none +exists. 
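This fallback matters for checkpointing: the checkpoint lsn may be advanced at most to the oldest unflushed modification, and when no page in the buffer pool is dirty, the whole log is flushable, so the current lsn is used instead. A one-line illustrative sketch (the names are stand-ins, not engine identifiers):

#include <stdint.h>

/* Sketch: the lsn up to which a checkpoint could be taken. */
static uint64_t
checkpoint_limit_lsn(uint64_t oldest_dirty_lsn, uint64_t current_lsn)
{
	return oldest_dirty_lsn ? oldest_dirty_lsn : current_lsn;
}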
*/ +static +ib_uint64_t +log_buf_pool_get_oldest_modification(void) +/*======================================*/ +{ + ib_uint64_t lsn; + + ut_ad(mutex_own(&(log_sys->mutex))); + + lsn = buf_pool_get_oldest_modification(); + + if (!lsn) { + + lsn = log_sys->lsn; + } + + return(lsn); +} + +/**************************************************************** +Opens the log for log_write_low. The log must be closed with log_close and +released with log_release. */ +UNIV_INTERN +ib_uint64_t +log_reserve_and_open( +/*=================*/ + /* out: start lsn of the log record */ + ulint len) /* in: length of data to be catenated */ +{ + log_t* log = log_sys; + ulint len_upper_limit; +#ifdef UNIV_LOG_ARCHIVE + ulint archived_lsn_age; + ulint dummy; +#endif /* UNIV_LOG_ARCHIVE */ +#ifdef UNIV_DEBUG + ulint count = 0; +#endif /* UNIV_DEBUG */ + + ut_a(len < log->buf_size / 2); +loop: + mutex_enter(&(log->mutex)); + + /* Calculate an upper limit for the space the string may take in the + log buffer */ + + len_upper_limit = LOG_BUF_WRITE_MARGIN + (5 * len) / 4; + + if (log->buf_free + len_upper_limit > log->buf_size) { + + mutex_exit(&(log->mutex)); + + /* Not enough free space, do a synchronous flush of the log + buffer */ + + log_buffer_flush_to_disk(); + + srv_log_waits++; + + ut_ad(++count < 50); + + goto loop; + } + +#ifdef UNIV_LOG_ARCHIVE + if (log->archiving_state != LOG_ARCH_OFF) { + + archived_lsn_age = log->lsn - log->archived_lsn; + if (archived_lsn_age + len_upper_limit + > log->max_archived_lsn_age) { + /* Not enough free archived space in log groups: do a + synchronous archive write batch: */ + + mutex_exit(&(log->mutex)); + + ut_ad(len_upper_limit <= log->max_archived_lsn_age); + + log_archive_do(TRUE, &dummy); + + ut_ad(++count < 50); + + goto loop; + } + } +#endif /* UNIV_LOG_ARCHIVE */ + +#ifdef UNIV_LOG_DEBUG + log->old_buf_free = log->buf_free; + log->old_lsn = log->lsn; +#endif + return(log->lsn); +} + +/**************************************************************** +Writes to the log the string given. It is assumed that the caller holds the +log mutex.
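log_write_low() below fills the buffer one log block at a time: each block ends with a trailer (the header bytes are already counted inside buf_free), so a string may be split across several blocks. A hedged sketch of the length computation, assuming the usual 512-byte block and 4-byte trailer:

#include <stdint.h>

enum { BLOCK = 512, TRL = 4 };	/* assumed block and trailer sizes */

/* How many payload bytes of an str_len-byte string fit into the block
   in which buf_free currently points, mirroring the data_len/len
   arithmetic at the top of log_write_low(). */
static uint32_t
part_len(uint32_t buf_free, uint32_t str_len)
{
	uint32_t data_len = buf_free % BLOCK + str_len;

	if (data_len <= BLOCK - TRL) {
		return str_len;		/* the whole string fits */
	}
	return BLOCK - buf_free % BLOCK - TRL;	/* fill block; loop again */
}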
*/ +UNIV_INTERN +void +log_write_low( +/*==========*/ + byte* str, /* in: string */ + ulint str_len) /* in: string length */ +{ + log_t* log = log_sys; + ulint len; + ulint data_len; + byte* log_block; + + ut_ad(mutex_own(&(log->mutex))); +part_loop: + /* Calculate a part length */ + + data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len; + + if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + + /* The string fits within the current log block */ + + len = str_len; + } else { + data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; + + len = OS_FILE_LOG_BLOCK_SIZE + - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + - LOG_BLOCK_TRL_SIZE; + } + + ut_memcpy(log->buf + log->buf_free, str, len); + + str_len -= len; + str = str + len; + + log_block = ut_align_down(log->buf + log->buf_free, + OS_FILE_LOG_BLOCK_SIZE); + log_block_set_data_len(log_block, data_len); + + if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + /* This block became full */ + log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE); + log_block_set_checkpoint_no(log_block, + log_sys->next_checkpoint_no); + len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE; + + log->lsn += len; + + /* Initialize the next block header */ + log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn); + } else { + log->lsn += len; + } + + log->buf_free += len; + + ut_ad(log->buf_free <= log->buf_size); + + if (str_len > 0) { + goto part_loop; + } + + srv_log_write_requests++; +} + +/**************************************************************** +Closes the log. */ +UNIV_INTERN +ib_uint64_t +log_close(void) +/*===========*/ + /* out: lsn */ +{ + byte* log_block; + ulint first_rec_group; + ib_uint64_t oldest_lsn; + ib_uint64_t lsn; + log_t* log = log_sys; + ib_uint64_t checkpoint_age; + + ut_ad(mutex_own(&(log->mutex))); + + lsn = log->lsn; + + log_block = ut_align_down(log->buf + log->buf_free, + OS_FILE_LOG_BLOCK_SIZE); + first_rec_group = log_block_get_first_rec_group(log_block); + + if (first_rec_group == 0) { + /* We initialized a new log block which was not written + full by the current mtr: the next mtr log record group + will start within this block at the offset data_len */ + + log_block_set_first_rec_group( + log_block, log_block_get_data_len(log_block)); + } + + if (log->buf_free > log->max_buf_free) { + + log->check_flush_or_checkpoint = TRUE; + } + + checkpoint_age = lsn - log->last_checkpoint_lsn; + + if (checkpoint_age >= log->log_group_capacity) { + /* TODO: split btr_store_big_rec_extern_fields() into small + steps so that we can release all latches in the middle, and + call log_free_check() to ensure we never write over log written + after the latest checkpoint. In principle, we should split all + big_rec operations, but other operations are smaller. 
*/ + + if (!log_has_printed_chkp_warning + || difftime(time(NULL), log_last_warning_time) > 15) { + + log_has_printed_chkp_warning = TRUE; + log_last_warning_time = time(NULL); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: the age of the last" + " checkpoint is %lu,\n" + "InnoDB: which exceeds the log group" + " capacity %lu.\n" + "InnoDB: If you are using big" + " BLOB or TEXT rows, you must set the\n" + "InnoDB: combined size of log files" + " at least 10 times bigger than the\n" + "InnoDB: largest such row.\n", + (ulong) checkpoint_age, + (ulong) log->log_group_capacity); + } + } + + if (checkpoint_age <= log->max_modified_age_async) { + + goto function_exit; + } + + oldest_lsn = buf_pool_get_oldest_modification(); + + if (!oldest_lsn + || lsn - oldest_lsn > log->max_modified_age_async + || checkpoint_age > log->max_checkpoint_age_async) { + + log->check_flush_or_checkpoint = TRUE; + } +function_exit: + +#ifdef UNIV_LOG_DEBUG + log_check_log_recs(log->buf + log->old_buf_free, + log->buf_free - log->old_buf_free, log->old_lsn); +#endif + + return(lsn); +} + +#ifdef UNIV_LOG_ARCHIVE +/********************************************************** +Pads the current log block full with dummy log records. Used in producing +consistent archived log files. */ +static +void +log_pad_current_log_block(void) +/*===========================*/ +{ + byte b = MLOG_DUMMY_RECORD; + ulint pad_length; + ulint i; + ib_uint64_t lsn; + + /* We retrieve lsn only because otherwise gcc crashed on HP-UX */ + lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE); + + pad_length = OS_FILE_LOG_BLOCK_SIZE + - (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE) + - LOG_BLOCK_TRL_SIZE; + + for (i = 0; i < pad_length; i++) { + log_write_low(&b, 1); + } + + lsn = log_sys->lsn; + + log_close(); + log_release(); + + ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE); +} +#endif /* UNIV_LOG_ARCHIVE */ + +/********************************************************** +Calculates the data capacity of a log group, when the log file headers are not +included. */ +UNIV_INTERN +ulint +log_group_get_capacity( +/*===================*/ + /* out: capacity in bytes */ + log_group_t* group) /* in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files); +} + +/********************************************************** +Calculates the offset within a log group, when the log file headers are not +included. */ +UNIV_INLINE +ulint +log_group_calc_size_offset( +/*=======================*/ + /* out: size offset (<= offset) */ + ulint offset, /* in: real offset within the log group */ + log_group_t* group) /* in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size)); +} + +/********************************************************** +Calculates the offset within a log group, when the log file headers are +included. */ +UNIV_INLINE +ulint +log_group_calc_real_offset( +/*=======================*/ + /* out: real offset (>= offset) */ + ulint offset, /* in: size offset within the log group */ + log_group_t* group) /* in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + return(offset + LOG_FILE_HDR_SIZE + * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE))); +} + +/********************************************************** +Calculates the offset of an lsn within a log group. 
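log_group_calc_lsn_offset() below leans on the size/real offset pair defined above: a "size" offset counts only payload bytes, while a "real" offset also counts the per-file headers skipped along the way. A self-contained sketch of the two mappings (the 1 MB file size and 2048-byte header are assumed values for illustration; offsets that fall inside a header region do not round-trip):

#include <assert.h>
#include <stdint.h>

enum { FILE_SIZE = 1 << 20, HDR = 2048 };	/* assumed sizes */

/* Strip the per-file headers from a real offset within the group. */
static uint64_t size_offset(uint64_t real)
{ return real - HDR * (1 + real / FILE_SIZE); }

/* Inverse mapping: add the headers back in. */
static uint64_t real_offset(uint64_t size)
{ return size + HDR * (1 + size / (FILE_SIZE - HDR)); }

int main(void)
{
	/* An offset in the payload of the second file round-trips. */
	uint64_t real = (uint64_t)FILE_SIZE + 4096;
	assert(real_offset(size_offset(real)) == real);
	return 0;
}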
*/ +static +ulint +log_group_calc_lsn_offset( +/*======================*/ + /* out: offset within the log group */ + ib_uint64_t lsn, /* in: lsn, must be within 4 GB of + group->lsn */ + log_group_t* group) /* in: log group */ +{ + ib_uint64_t gr_lsn; + ib_int64_t gr_lsn_size_offset; + ib_int64_t difference; + ib_int64_t group_size; + ib_int64_t offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + + /* If total log file size is > 2 GB we can easily get overflows + with 32-bit integers. Use 64-bit integers instead. */ + + gr_lsn = group->lsn; + + gr_lsn_size_offset = (ib_int64_t) + log_group_calc_size_offset(group->lsn_offset, group); + + group_size = (ib_int64_t) log_group_get_capacity(group); + + if (lsn >= gr_lsn) { + + difference = (ib_int64_t) (lsn - gr_lsn); + } else { + difference = (ib_int64_t) (gr_lsn - lsn); + + difference = difference % group_size; + + difference = group_size - difference; + } + + offset = (gr_lsn_size_offset + difference) % group_size; + + ut_a(offset < (((ib_int64_t) 1) << 32)); /* offset must be < 4 GB */ + + /* fprintf(stderr, + "Offset is %lu gr_lsn_offset is %lu difference is %lu\n", + (ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference); + */ + + return(log_group_calc_real_offset((ulint)offset, group)); +} + +/*********************************************************************** +Calculates where in log files we find a specified lsn. */ +UNIV_INTERN +ulint +log_calc_where_lsn_is( +/*==================*/ + /* out: log file number */ + ib_int64_t* log_file_offset, /* out: offset in that file + (including the header) */ + ib_uint64_t first_header_lsn, /* in: first log file start + lsn */ + ib_uint64_t lsn, /* in: lsn whose position to + determine */ + ulint n_log_files, /* in: total number of log + files */ + ib_int64_t log_file_size) /* in: log file size + (including the header) */ +{ + ib_int64_t capacity = log_file_size - LOG_FILE_HDR_SIZE; + ulint file_no; + ib_int64_t add_this_many; + + if (lsn < first_header_lsn) { + add_this_many = 1 + (first_header_lsn - lsn) + / (capacity * (ib_int64_t)n_log_files); + lsn += add_this_many + * capacity * (ib_int64_t)n_log_files; + } + + ut_a(lsn >= first_header_lsn); + + file_no = ((ulint)((lsn - first_header_lsn) / capacity)) + % n_log_files; + *log_file_offset = (lsn - first_header_lsn) % capacity; + + *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE; + + return(file_no); +} + +/************************************************************ +Sets the field values in group to correspond to a given lsn. For this function +to work, the values must already be correctly initialized to correspond to +some lsn, for instance, a checkpoint lsn. */ +UNIV_INTERN +void +log_group_set_fields( +/*=================*/ + log_group_t* group, /* in: group */ + ib_uint64_t lsn) /* in: lsn for which the values should be + set */ +{ + group->lsn_offset = log_group_calc_lsn_offset(lsn, group); + group->lsn = lsn; +} + +/********************************************************************* +Calculates the recommended highest values for lsn - last_checkpoint_lsn, +lsn - buf_get_oldest_modification(), and lsn - max_archive_lsn_age. 
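log_calc_max_ages() below sizes its safety margins from the smallest group's capacity: shave 10%, reserve per-thread free space, shave 10% again, and derive the trigger ages from fixed ratios. A simplified worked example (two 5 MB log files, 16 KB pages, 8 threads; the clamp against adm_checkpoint_interval is omitted):

#include <stdint.h>
#include <stdio.h>

enum { PAGE = 16384 };
enum { FREE_PER_THREAD = 4 * PAGE, EXTRA_FREE = 8 * PAGE };

int main(void)
{
	uint64_t capacity = 2u * 5 * 1024 * 1024;
	uint64_t threads = 8;

	capacity -= capacity / 10;		/* extra safety */
	uint64_t free_sp = FREE_PER_THREAD * (10 + threads) + EXTRA_FREE;
	if (free_sp >= capacity / 2) {
		puts("ib_logfiles too small for this concurrency");
		return 1;
	}
	uint64_t margin = capacity - free_sp;
	margin -= margin / 10;			/* still more safety */

	printf("max_modified_age_async = %llu\n",
	       (unsigned long long)(margin - margin / 8));
	printf("max_checkpoint_age     = %llu\n",
	       (unsigned long long)margin);
	return 0;
}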
*/ +static +ibool +log_calc_max_ages(void) +/*===================*/ + /* out: error value FALSE if the smallest log group is + too small to accommodate the number of OS threads in + the database server */ +{ + log_group_t* group; + ulint margin; + ulint free; + ibool success = TRUE; + ulint smallest_capacity; + ulint archive_margin; + ulint smallest_archive_margin; + + ut_ad(!mutex_own(&(log_sys->mutex))); + + mutex_enter(&(log_sys->mutex)); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + ut_ad(group); + + smallest_capacity = ULINT_MAX; + smallest_archive_margin = ULINT_MAX; + + while (group) { + if (log_group_get_capacity(group) < smallest_capacity) { + + smallest_capacity = log_group_get_capacity(group); + } + + archive_margin = log_group_get_capacity(group) + - (group->file_size - LOG_FILE_HDR_SIZE) + - LOG_ARCHIVE_EXTRA_MARGIN; + + if (archive_margin < smallest_archive_margin) { + + smallest_archive_margin = archive_margin; + } + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + /* Add extra safety */ + smallest_capacity = smallest_capacity - smallest_capacity / 10; + + /* For each OS thread we must reserve so much free space in the + smallest log group that it can accommodate the log entries produced + by single query steps: running out of free log space is a serious + system error which requires rebooting the database. */ + + free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency) + + LOG_CHECKPOINT_EXTRA_FREE; + if (free >= smallest_capacity / 2) { + success = FALSE; + + goto failure; + } else { + margin = smallest_capacity - free; + } + + margin = ut_min(margin, log_sys->adm_checkpoint_interval); + + margin = margin - margin / 10; /* Add still some extra safety */ + + log_sys->log_group_capacity = smallest_capacity; + + log_sys->max_modified_age_async = margin + - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC; + log_sys->max_modified_age_sync = margin + - margin / LOG_POOL_PREFLUSH_RATIO_SYNC; + + log_sys->max_checkpoint_age_async = margin - margin + / LOG_POOL_CHECKPOINT_RATIO_ASYNC; + log_sys->max_checkpoint_age = margin; + +#ifdef UNIV_LOG_ARCHIVE + log_sys->max_archived_lsn_age = smallest_archive_margin; + + log_sys->max_archived_lsn_age_async = smallest_archive_margin + - smallest_archive_margin / LOG_ARCHIVE_RATIO_ASYNC; +#endif /* UNIV_LOG_ARCHIVE */ +failure: + mutex_exit(&(log_sys->mutex)); + + if (!success) { + fprintf(stderr, + "InnoDB: Error: ib_logfiles are too small" + " for innodb_thread_concurrency %lu.\n" + "InnoDB: The combined size of ib_logfiles" + " should be bigger than\n" + "InnoDB: 200 kB * innodb_thread_concurrency.\n" + "InnoDB: To get mysqld to start up, set" + " innodb_thread_concurrency in my.cnf\n" + "InnoDB: to a lower value, for example, to 8." + " After an ERROR-FREE shutdown\n" + "InnoDB: of mysqld you can adjust the size of" + " ib_logfiles, as explained in\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "adding-and-removing.html\n" + "InnoDB: Cannot continue operation." + " Calling exit(1).\n", + (ulong)srv_thread_concurrency); + + exit(1); + } + + return(success); +} + +/********************************************************** +Initializes the log. 
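log_init() below must hand the i/o layer a buffer aligned to the log block size, so it over-allocates by one block and rounds the pointer up; that is what ut_align() does. A stand-alone sketch of the same pattern:

#include <stdint.h>
#include <stdlib.h>

/* Round ptr up to the next multiple of alignment (a power of two). */
static void*
align_up(void* ptr, uintptr_t alignment)
{
	return (void*)(((uintptr_t)ptr + alignment - 1) & ~(alignment - 1));
}

int main(void)
{
	enum { BLOCK = 512, BUF = 16 * BLOCK };
	void* raw = malloc(BUF + BLOCK);	/* slack for the rounding */
	if (raw == NULL) {
		return 1;
	}
	unsigned char* buf = align_up(raw, BLOCK);
	/* buf[0 .. BUF) is now block-aligned; keep raw for free() */
	free(raw);
	return 0;
}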
*/ +UNIV_INTERN +void +log_init(void) +/*==========*/ +{ + byte* buf; + + log_sys = mem_alloc(sizeof(log_t)); + + mutex_create(&log_sys->mutex, SYNC_LOG); + + mutex_enter(&(log_sys->mutex)); + + /* Start the lsn from one log block from zero: this way every + log record has a start lsn != zero, a fact which we will use */ + + log_sys->lsn = LOG_START_LSN; + + ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE); + ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE); + + buf = mem_alloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE); + log_sys->buf = ut_align(buf, OS_FILE_LOG_BLOCK_SIZE); + + log_sys->buf_size = LOG_BUFFER_SIZE; + + memset(log_sys->buf, '\0', LOG_BUFFER_SIZE); + + log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO + - LOG_BUF_FLUSH_MARGIN; + log_sys->check_flush_or_checkpoint = TRUE; + UT_LIST_INIT(log_sys->log_groups); + + log_sys->n_log_ios = 0; + + log_sys->n_log_ios_old = log_sys->n_log_ios; + log_sys->last_printout_time = time(NULL); + /*----------------------------*/ + + log_sys->buf_next_to_write = 0; + + log_sys->write_lsn = 0; + log_sys->current_flush_lsn = 0; + log_sys->flushed_to_disk_lsn = 0; + + log_sys->written_to_some_lsn = log_sys->lsn; + log_sys->written_to_all_lsn = log_sys->lsn; + + log_sys->n_pending_writes = 0; + + log_sys->no_flush_event = os_event_create(NULL); + + os_event_set(log_sys->no_flush_event); + + log_sys->one_flushed_event = os_event_create(NULL); + + os_event_set(log_sys->one_flushed_event); + + /*----------------------------*/ + log_sys->adm_checkpoint_interval = ULINT_MAX; + + log_sys->next_checkpoint_no = 0; + log_sys->last_checkpoint_lsn = log_sys->lsn; + log_sys->n_pending_checkpoint_writes = 0; + + rw_lock_create(&log_sys->checkpoint_lock, SYNC_NO_ORDER_CHECK); + + log_sys->checkpoint_buf + = ut_align(mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE), + OS_FILE_LOG_BLOCK_SIZE); + memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE); + /*----------------------------*/ + +#ifdef UNIV_LOG_ARCHIVE + /* Under MySQL, log archiving is always off */ + log_sys->archiving_state = LOG_ARCH_OFF; + log_sys->archived_lsn = log_sys->lsn; + log_sys->next_archived_lsn = 0; + + log_sys->n_pending_archive_ios = 0; + + rw_lock_create(&log_sys->archive_lock, SYNC_NO_ORDER_CHECK); + + log_sys->archive_buf = NULL; + + /* ut_align( + ut_malloc(LOG_ARCHIVE_BUF_SIZE + + OS_FILE_LOG_BLOCK_SIZE), + OS_FILE_LOG_BLOCK_SIZE); */ + log_sys->archive_buf_size = 0; + + /* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */ + + log_sys->archiving_on = os_event_create(NULL); +#endif /* UNIV_LOG_ARCHIVE */ + + /*----------------------------*/ + + log_block_init(log_sys->buf, log_sys->lsn); + log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); + + log_sys->buf_free = LOG_BLOCK_HDR_SIZE; + log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE; + + mutex_exit(&(log_sys->mutex)); + +#ifdef UNIV_LOG_DEBUG + recv_sys_create(); + recv_sys_init(FALSE, buf_pool_get_curr_size()); + + recv_sys->parse_start_lsn = log_sys->lsn; + recv_sys->scanned_lsn = log_sys->lsn; + recv_sys->scanned_checkpoint_no = 0; + recv_sys->recovered_lsn = log_sys->lsn; + recv_sys->limit_lsn = IB_ULONGLONG_MAX; +#endif +} + +/********************************************************************** +Inits a log group to the log system. 
*/ +UNIV_INTERN +void +log_group_init( +/*===========*/ + ulint id, /* in: group id */ + ulint n_files, /* in: number of log files */ + ulint file_size, /* in: log file size in bytes */ + ulint space_id, /* in: space id of the file space + which contains the log files of this + group */ + ulint archive_space_id __attribute__((unused))) + /* in: space id of the file space + which contains some archived log + files for this group; currently, only + for the first log group this is + used */ +{ + ulint i; + + log_group_t* group; + + group = mem_alloc(sizeof(log_group_t)); + + group->id = id; + group->n_files = n_files; + group->file_size = file_size; + group->space_id = space_id; + group->state = LOG_GROUP_OK; + group->lsn = LOG_START_LSN; + group->lsn_offset = LOG_FILE_HDR_SIZE; + group->n_pending_writes = 0; + + group->file_header_bufs = mem_alloc(sizeof(byte*) * n_files); +#ifdef UNIV_LOG_ARCHIVE + group->archive_file_header_bufs = mem_alloc(sizeof(byte*) * n_files); +#endif /* UNIV_LOG_ARCHIVE */ + + for (i = 0; i < n_files; i++) { + *(group->file_header_bufs + i) = ut_align( + mem_alloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE), + OS_FILE_LOG_BLOCK_SIZE); + + memset(*(group->file_header_bufs + i), '\0', + LOG_FILE_HDR_SIZE); + +#ifdef UNIV_LOG_ARCHIVE + *(group->archive_file_header_bufs + i) = ut_align( + mem_alloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE), + OS_FILE_LOG_BLOCK_SIZE); + memset(*(group->archive_file_header_bufs + i), '\0', + LOG_FILE_HDR_SIZE); +#endif /* UNIV_LOG_ARCHIVE */ + } + +#ifdef UNIV_LOG_ARCHIVE + group->archive_space_id = archive_space_id; + + group->archived_file_no = 0; + group->archived_offset = 0; +#endif /* UNIV_LOG_ARCHIVE */ + + group->checkpoint_buf = ut_align( + mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE), OS_FILE_LOG_BLOCK_SIZE); + + memset(group->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE); + + UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group); + + ut_a(log_calc_max_ages()); +} + +/********************************************************************** +Does the unlockings needed in flush i/o completion. */ +UNIV_INLINE +void +log_flush_do_unlocks( +/*=================*/ + ulint code) /* in: any ORed combination of LOG_UNLOCK_FLUSH_LOCK + and LOG_UNLOCK_NONE_FLUSHED_LOCK */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + /* NOTE that we must own the log mutex when doing the setting of the + events: this is because transactions will wait for these events to + be set, and at that moment the log flush they were waiting for must + have ended. If the log mutex were not reserved here, the i/o-thread + calling this function might be preempted for a while, and when it + resumed execution, it might be that a new flush had been started, and + this function would erroneously signal the NEW flush as completed. + Thus, the changes in the state of these events are performed + atomically in conjunction with the changes in the state of + log_sys->n_pending_writes etc. */ + + if (code & LOG_UNLOCK_NONE_FLUSHED_LOCK) { + os_event_set(log_sys->one_flushed_event); + } + + if (code & LOG_UNLOCK_FLUSH_LOCK) { + os_event_set(log_sys->no_flush_event); + } +} + +/********************************************************************** +Checks if a flush is completed for a log group and does the completion +routine if yes. 
*/ +UNIV_INLINE +ulint +log_group_check_flush_completion( +/*=============================*/ + /* out: LOG_UNLOCK_NONE_FLUSHED_LOCK or 0 */ + log_group_t* group) /* in: log group */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + if (!log_sys->one_flushed && group->n_pending_writes == 0) { +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Log flushed first to group %lu\n", + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + log_sys->written_to_some_lsn = log_sys->write_lsn; + log_sys->one_flushed = TRUE; + + return(LOG_UNLOCK_NONE_FLUSHED_LOCK); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes && (group->n_pending_writes == 0)) { + + fprintf(stderr, "Log flushed to group %lu\n", + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + return(0); +} + +/********************************************************** +Checks if a flush is completed and does the completion routine if yes. */ +static +ulint +log_sys_check_flush_completion(void) +/*================================*/ + /* out: LOG_UNLOCK_FLUSH_LOCK or 0 */ +{ + ulint move_start; + ulint move_end; + + ut_ad(mutex_own(&(log_sys->mutex))); + + if (log_sys->n_pending_writes == 0) { + + log_sys->written_to_all_lsn = log_sys->write_lsn; + log_sys->buf_next_to_write = log_sys->write_end_offset; + + if (log_sys->write_end_offset > log_sys->max_buf_free / 2) { + /* Move the log buffer content to the start of the + buffer */ + + move_start = ut_calc_align_down( + log_sys->write_end_offset, + OS_FILE_LOG_BLOCK_SIZE); + move_end = ut_calc_align(log_sys->buf_free, + OS_FILE_LOG_BLOCK_SIZE); + + ut_memmove(log_sys->buf, log_sys->buf + move_start, + move_end - move_start); + log_sys->buf_free -= move_start; + + log_sys->buf_next_to_write -= move_start; + } + + return(LOG_UNLOCK_FLUSH_LOCK); + } + + return(0); +} + +/********************************************************** +Completes an i/o to a log file. */ +UNIV_INTERN +void +log_io_complete( +/*============*/ + log_group_t* group) /* in: log group or a dummy pointer */ +{ + ulint unlock; + +#ifdef UNIV_LOG_ARCHIVE + if ((byte*)group == &log_archive_io) { + /* It was an archive write */ + + log_io_complete_archive(); + + return; + } +#endif /* UNIV_LOG_ARCHIVE */ + + if ((ulint)group & 0x1UL) { + /* It was a checkpoint write */ + group = (log_group_t*)((ulint)group - 1); + + if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + + fil_flush(group->space_id); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Checkpoint info written to group %lu\n", + group->id); + } +#endif /* UNIV_DEBUG */ + log_io_complete_checkpoint(); + + return; + } + + ut_error; /* We currently use synchronous writing of the + logs and cannot end up here! */ + + if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && srv_flush_log_at_trx_commit != 2) { + + fil_flush(group->space_id); + } + + mutex_enter(&(log_sys->mutex)); + + ut_a(group->n_pending_writes > 0); + ut_a(log_sys->n_pending_writes > 0); + + group->n_pending_writes--; + log_sys->n_pending_writes--; + + unlock = log_group_check_flush_completion(group); + unlock = unlock | log_sys_check_flush_completion(); + + log_flush_do_unlocks(unlock); + + mutex_exit(&(log_sys->mutex)); +} + +/********************************************************** +Writes a log file header to a log file space. 
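Note how log_io_complete() above recognizes checkpoint writes: the i/o message is the group pointer with its lowest bit set, which is unambiguous because a heap-allocated log_group_t is always at least 2-byte aligned. A minimal sketch of that tagged-pointer convention:

#include <stdint.h>

/* Tag a group pointer to mark a checkpoint i/o. */
static void* tag_checkpoint(void* group)
{ return (void*)((uintptr_t)group | 1u); }

/* Was this i/o message a checkpoint write? */
static int is_checkpoint_io(void* msg)
{ return (int)((uintptr_t)msg & 1u); }

/* Recover the real group pointer. */
static void* untag(void* msg)
{ return (void*)((uintptr_t)msg & ~(uintptr_t)1u); }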
*/ +static +void +log_group_file_header_flush( +/*========================*/ + log_group_t* group, /* in: log group */ + ulint nth_file, /* in: header to the nth file in the + log file space */ + ib_uint64_t start_lsn) /* in: log file data starts at this + lsn */ +{ + byte* buf; + ulint dest_offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_a(nth_file < group->n_files); + + buf = *(group->file_header_bufs + nth_file); + + mach_write_to_4(buf + LOG_GROUP_ID, group->id); + mach_write_ull(buf + LOG_FILE_START_LSN, start_lsn); + + /* Wipe over possible label of ibbackup --restore */ + memcpy(buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, " ", 4); + + dest_offset = nth_file * group->file_size; + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Writing log file header to group %lu file %lu\n", + (ulong) group->id, (ulong) nth_file); + } +#endif /* UNIV_DEBUG */ + if (log_do_write) { + log_sys->n_log_ios++; + + srv_os_log_pending_writes++; + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0, + dest_offset / UNIV_PAGE_SIZE, + dest_offset % UNIV_PAGE_SIZE, + OS_FILE_LOG_BLOCK_SIZE, + buf, group); + + srv_os_log_pending_writes--; + } +} + +/********************************************************** +Stores a 4-byte checksum to the trailer checksum field of a log block +before writing it to a log file. This checksum is used in recovery to +check the consistency of a log block. */ +static +void +log_block_store_checksum( +/*=====================*/ + byte* block) /* in/out: pointer to a log block */ +{ + log_block_set_checksum(block, log_block_calc_checksum(block)); +} + +/********************************************************** +Writes a buffer to a log file group. */ +UNIV_INTERN +void +log_group_write_buf( +/*================*/ + log_group_t* group, /* in: log group */ + byte* buf, /* in: buffer */ + ulint len, /* in: buffer len; must be divisible + by OS_FILE_LOG_BLOCK_SIZE */ + ib_uint64_t start_lsn, /* in: start lsn of the buffer; must + be divisible by + OS_FILE_LOG_BLOCK_SIZE */ + ulint new_data_offset)/* in: start offset of new data in + buf: this parameter is used to decide + if we have to write a new log file + header */ +{ + ulint write_len; + ibool write_header; + ulint next_offset; + ulint i; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_a(((ulint) start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); + + if (new_data_offset == 0) { + write_header = TRUE; + } else { + write_header = FALSE; + } +loop: + if (len == 0) { + + return; + } + + next_offset = log_group_calc_lsn_offset(start_lsn, group); + + if ((next_offset % group->file_size == LOG_FILE_HDR_SIZE) + && write_header) { + /* We start to write a new log file instance in the group */ + + log_group_file_header_flush(group, + next_offset / group->file_size, + start_lsn); + srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE; + srv_log_writes++; + } + + if ((next_offset % group->file_size) + len > group->file_size) { + + write_len = group->file_size + - (next_offset % group->file_size); + } else { + write_len = len; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + + fprintf(stderr, + "Writing log file segment to group %lu" + " offset %lu len %lu\n" + "start lsn %llu\n" + "First block n:o %lu last block n:o %lu\n", + (ulong) group->id, (ulong) next_offset, + (ulong) write_len, + start_lsn, + (ulong) log_block_get_hdr_no(buf), + (ulong) log_block_get_hdr_no( + buf + write_len - OS_FILE_LOG_BLOCK_SIZE)); + ut_a(log_block_get_hdr_no(buf) + == log_block_convert_lsn_to_no(start_lsn)); 
+ + for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) { + + ut_a(log_block_get_hdr_no(buf) + i + == log_block_get_hdr_no( + buf + i * OS_FILE_LOG_BLOCK_SIZE)); + } + } +#endif /* UNIV_DEBUG */ + /* Calculate the checksums for each log block and write them to + the trailer fields of the log blocks */ + + for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) { + log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE); + } + + if (log_do_write) { + log_sys->n_log_ios++; + + srv_os_log_pending_writes++; + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0, + next_offset / UNIV_PAGE_SIZE, + next_offset % UNIV_PAGE_SIZE, write_len, buf, group); + + srv_os_log_pending_writes--; + + srv_os_log_written+= write_len; + srv_log_writes++; + } + + if (write_len < len) { + start_lsn += write_len; + len -= write_len; + buf += write_len; + + write_header = TRUE; + + goto loop; + } +} + +/********************************************************** +This function is called, e.g., when a transaction wants to commit. It checks +that the log has been written to the log file up to the last log entry written +by the transaction. If there is a flush running, it waits and checks if the +flush flushed enough. If not, starts a new flush. */ +UNIV_INTERN +void +log_write_up_to( +/*============*/ + ib_uint64_t lsn, /* in: log sequence number up to which + the log should be written, + IB_ULONGLONG_MAX if not specified */ + ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, + or LOG_WAIT_ALL_GROUPS */ + ibool flush_to_disk) + /* in: TRUE if we want the written log + also to be flushed to disk */ +{ + log_group_t* group; + ulint start_offset; + ulint end_offset; + ulint area_start; + ulint area_end; +#ifdef UNIV_DEBUG + ulint loop_count = 0; +#endif /* UNIV_DEBUG */ + ulint unlock; + + if (recv_no_ibuf_operations) { + /* Recovery is running and no operations on the log files are + allowed yet (the variable name .._no_ibuf_.. 
is misleading) */ + + return; + } + +loop: +#ifdef UNIV_DEBUG + loop_count++; + + ut_ad(loop_count < 5); + +# if 0 + if (loop_count > 2) { + fprintf(stderr, "Log loop count %lu\n", loop_count); + } +# endif +#endif + + mutex_enter(&(log_sys->mutex)); + + if (flush_to_disk + && log_sys->flushed_to_disk_lsn >= lsn) { + + mutex_exit(&(log_sys->mutex)); + + return; + } + + if (!flush_to_disk + && (log_sys->written_to_all_lsn >= lsn + || (log_sys->written_to_some_lsn >= lsn + && wait != LOG_WAIT_ALL_GROUPS))) { + + mutex_exit(&(log_sys->mutex)); + + return; + } + + if (log_sys->n_pending_writes > 0) { + /* A write (+ possibly flush to disk) is running */ + + if (flush_to_disk + && log_sys->current_flush_lsn >= lsn) { + /* The write + flush will write enough: wait for it to + complete */ + + goto do_waits; + } + + if (!flush_to_disk + && log_sys->write_lsn >= lsn) { + /* The write will write enough: wait for it to + complete */ + + goto do_waits; + } + + mutex_exit(&(log_sys->mutex)); + + /* Wait for the write to complete and try to start a new + write */ + + os_event_wait(log_sys->no_flush_event); + + goto loop; + } + + if (!flush_to_disk + && log_sys->buf_free == log_sys->buf_next_to_write) { + /* Nothing to write and no flush to disk requested */ + + mutex_exit(&(log_sys->mutex)); + + return; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Writing log from %llu up to lsn %llu\n", + log_sys->written_to_all_lsn, + log_sys->lsn); + } +#endif /* UNIV_DEBUG */ + log_sys->n_pending_writes++; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + group->n_pending_writes++; /* We assume here that we have only + one log group! */ + + os_event_reset(log_sys->no_flush_event); + os_event_reset(log_sys->one_flushed_event); + + start_offset = log_sys->buf_next_to_write; + end_offset = log_sys->buf_free; + + area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE); + area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE); + + ut_ad(area_end - area_start > 0); + + log_sys->write_lsn = log_sys->lsn; + + if (flush_to_disk) { + log_sys->current_flush_lsn = log_sys->lsn; + } + + log_sys->one_flushed = FALSE; + + log_block_set_flush_bit(log_sys->buf + area_start, TRUE); + log_block_set_checkpoint_no( + log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE, + log_sys->next_checkpoint_no); + + /* Copy the last, incompletely written, log block a log block length + up, so that when the flush operation writes from the log buffer, the + segment to write will not be changed by writers to the log */ + + ut_memcpy(log_sys->buf + area_end, + log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE, + OS_FILE_LOG_BLOCK_SIZE); + + log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE; + log_sys->write_end_offset = log_sys->buf_free; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + /* Do the write to the log files */ + + while (group) { + log_group_write_buf( + group, log_sys->buf + area_start, + area_end - area_start, + ut_uint64_align_down(log_sys->written_to_all_lsn, + OS_FILE_LOG_BLOCK_SIZE), + start_offset - area_start); + + log_group_set_fields(group, log_sys->write_lsn); + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + mutex_exit(&(log_sys->mutex)); + + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + /* O_DSYNC means the OS did not buffer the log file at all: + so we have also flushed to disk what we have written */ + + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; + + } else if (flush_to_disk) { + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + 
fil_flush(group->space_id); + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; + } + + mutex_enter(&(log_sys->mutex)); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + ut_a(group->n_pending_writes == 1); + ut_a(log_sys->n_pending_writes == 1); + + group->n_pending_writes--; + log_sys->n_pending_writes--; + + unlock = log_group_check_flush_completion(group); + unlock = unlock | log_sys_check_flush_completion(); + + log_flush_do_unlocks(unlock); + + mutex_exit(&(log_sys->mutex)); + + return; + +do_waits: + mutex_exit(&(log_sys->mutex)); + + switch (wait) { + case LOG_WAIT_ONE_GROUP: + os_event_wait(log_sys->one_flushed_event); + break; + case LOG_WAIT_ALL_GROUPS: + os_event_wait(log_sys->no_flush_event); + break; +#ifdef UNIV_DEBUG + case LOG_NO_WAIT: + break; + default: + ut_error; +#endif /* UNIV_DEBUG */ + } +} + +/******************************************************************** +Does a syncronous flush of the log buffer to disk. */ +UNIV_INTERN +void +log_buffer_flush_to_disk(void) +/*==========================*/ +{ + ib_uint64_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE); +} + +/******************************************************************** +Tries to establish a big enough margin of free space in the log buffer, such +that a new log entry can be catenated without an immediate need for a flush. */ +static +void +log_flush_margin(void) +/*==================*/ +{ + log_t* log = log_sys; + ib_uint64_t lsn = 0; + + mutex_enter(&(log->mutex)); + + if (log->buf_free > log->max_buf_free) { + + if (log->n_pending_writes > 0) { + /* A flush is running: hope that it will provide enough + free space */ + } else { + lsn = log->lsn; + } + } + + mutex_exit(&(log->mutex)); + + if (lsn) { + log_write_up_to(lsn, LOG_NO_WAIT, FALSE); + } +} + +/******************************************************************** +Advances the smallest lsn for which there are unflushed dirty blocks in the +buffer pool. NOTE: this function may only be called if the calling thread owns +no synchronization objects! */ +UNIV_INTERN +ibool +log_preflush_pool_modified_pages( +/*=============================*/ + /* out: FALSE if there was a + flush batch of the same type + running, which means that we + could not start this flush + batch */ + ib_uint64_t new_oldest, /* in: try to advance + oldest_modified_lsn at least + to this lsn */ + ibool sync) /* in: TRUE if synchronous + operation is desired */ +{ + ulint n_pages; + + if (recv_recovery_on) { + /* If the recovery is running, we must first apply all + log records to their respective file pages to get the + right modify lsn values to these pages: otherwise, there + might be pages on disk which are not yet recovered to the + current lsn, and even after calling this function, we could + not know how up-to-date the disk version of the database is, + and we could not make a new checkpoint on the basis of the + info on the buffer pool only. */ + + recv_apply_hashed_log_recs(TRUE); + } + + n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, new_oldest); + + if (sync) { + buf_flush_wait_batch_end(BUF_FLUSH_LIST); + } + + if (n_pages == ULINT_UNDEFINED) { + + return(FALSE); + } + + return(TRUE); +} + +/********************************************************** +Completes a checkpoint. 
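+The checkpoint number is incremented and last_checkpoint_lsn is advanced
+only after the last pending checkpoint header write has completed (see
+log_io_complete_checkpoint() below), so a reader never observes a
+checkpoint lsn whose header write is still in flight. 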
*/ +static +void +log_complete_checkpoint(void) +/*=========================*/ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + ut_ad(log_sys->n_pending_checkpoint_writes == 0); + + log_sys->next_checkpoint_no++; + + log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn; + + rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT); +} + +/********************************************************** +Completes an asynchronous checkpoint info write i/o to a log file. */ +static +void +log_io_complete_checkpoint(void) +/*============================*/ +{ + mutex_enter(&(log_sys->mutex)); + + ut_ad(log_sys->n_pending_checkpoint_writes > 0); + + log_sys->n_pending_checkpoint_writes--; + + if (log_sys->n_pending_checkpoint_writes == 0) { + log_complete_checkpoint(); + } + + mutex_exit(&(log_sys->mutex)); +} + +/*********************************************************************** +Writes info to a checkpoint about a log group. */ +static +void +log_checkpoint_set_nth_group_info( +/*==============================*/ + byte* buf, /* in: buffer for checkpoint info */ + ulint n, /* in: nth slot */ + ulint file_no,/* in: archived file number */ + ulint offset) /* in: archived file offset */ +{ + ut_ad(n < LOG_MAX_N_GROUPS); + + mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY + + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO, file_no); + mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY + + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET, offset); +} + +/*********************************************************************** +Gets info from a checkpoint about a log group. */ +UNIV_INTERN +void +log_checkpoint_get_nth_group_info( +/*==============================*/ + byte* buf, /* in: buffer containing checkpoint info */ + ulint n, /* in: nth slot */ + ulint* file_no,/* out: archived file number */ + ulint* offset) /* out: archived file offset */ +{ + ut_ad(n < LOG_MAX_N_GROUPS); + + *file_no = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY + + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO); + *offset = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY + + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET); +} + +/********************************************************** +Writes the checkpoint info to a log group header. 
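+The info is protected by two checksums: one over the header up to
+LOG_CHECKPOINT_CHECKSUM_1, and one over the range from LOG_CHECKPOINT_LSN
+up to LOG_CHECKPOINT_CHECKSUM_2. The two checkpoint slots LOG_CHECKPOINT_1
+and LOG_CHECKPOINT_2 in the first log file are used alternately, chosen by
+the parity of next_checkpoint_no, so that a crash during a checkpoint write
+always leaves the previous checkpoint intact; recovery then picks the
+consistent slot with the higher checkpoint number (see
+recv_find_max_checkpoint() in log0recv.c). 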
*/ +static +void +log_group_checkpoint( +/*=================*/ + log_group_t* group) /* in: log group */ +{ + log_group_t* group2; +#ifdef UNIV_LOG_ARCHIVE + ib_uint64_t archived_lsn; + ib_uint64_t next_archived_lsn; +#endif /* UNIV_LOG_ARCHIVE */ + ulint write_offset; + ulint fold; + byte* buf; + ulint i; + + ut_ad(mutex_own(&(log_sys->mutex))); +#if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE +# error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE" +#endif + + buf = group->checkpoint_buf; + + mach_write_ull(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no); + mach_write_ull(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn); + + mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET, + log_group_calc_lsn_offset( + log_sys->next_checkpoint_lsn, group)); + + mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size); + +#ifdef UNIV_LOG_ARCHIVE + if (log_sys->archiving_state == LOG_ARCH_OFF) { + archived_lsn = IB_ULONGLONG_MAX; + } else { + archived_lsn = log_sys->archived_lsn; + + if (archived_lsn != log_sys->next_archived_lsn) { + next_archived_lsn = log_sys->next_archived_lsn; + /* For debugging only */ + } + } + + mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn); +#else /* UNIV_LOG_ARCHIVE */ + mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN, IB_ULONGLONG_MAX); +#endif /* UNIV_LOG_ARCHIVE */ + + for (i = 0; i < LOG_MAX_N_GROUPS; i++) { + log_checkpoint_set_nth_group_info(buf, i, 0, 0); + } + + group2 = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group2) { + log_checkpoint_set_nth_group_info(buf, group2->id, +#ifdef UNIV_LOG_ARCHIVE + group2->archived_file_no, + group2->archived_offset +#else /* UNIV_LOG_ARCHIVE */ + 0, 0 +#endif /* UNIV_LOG_ARCHIVE */ + ); + + group2 = UT_LIST_GET_NEXT(log_groups, group2); + } + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold); + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold); + + /* Starting from InnoDB-3.23.50, we also write info on allocated + size in the tablespace */ + + mach_write_to_4(buf + LOG_CHECKPOINT_FSP_FREE_LIMIT, + log_fsp_current_free_limit); + + mach_write_to_4(buf + LOG_CHECKPOINT_FSP_MAGIC_N, + LOG_CHECKPOINT_FSP_MAGIC_N_VAL); + + /* We alternate the physical place of the checkpoint info in the first + log file */ + + if ((log_sys->next_checkpoint_no & 1) == 0) { + write_offset = LOG_CHECKPOINT_1; + } else { + write_offset = LOG_CHECKPOINT_2; + } + + if (log_do_write) { + if (log_sys->n_pending_checkpoint_writes == 0) { + + rw_lock_x_lock_gen(&(log_sys->checkpoint_lock), + LOG_CHECKPOINT); + } + + log_sys->n_pending_checkpoint_writes++; + + log_sys->n_log_ios++; + + /* We send as the last parameter the group machine address + added with 1, as we want to distinguish between a normal log + file write and a checkpoint field write */ + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->space_id, 0, + write_offset / UNIV_PAGE_SIZE, + write_offset % UNIV_PAGE_SIZE, + OS_FILE_LOG_BLOCK_SIZE, + buf, ((byte*)group + 1)); + + ut_ad(((ulint)group & 0x1UL) == 0); + } +} + +#ifdef UNIV_HOTBACKUP +/********************************************************** +Writes info to a buffer of a log group when log files are created in +backup restoration. 
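+The buffer is formatted as if a checkpoint had been made at
+start + LOG_BLOCK_HDR_SIZE, and is labeled with an "ibbackup" timestamp so
+that a server started on the restored files can tell that they were
+created by a hot backup restore. 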
*/ +UNIV_INTERN +void +log_reset_first_header_and_checkpoint( +/*==================================*/ + byte* hdr_buf,/* in: buffer which will be written to the + start of the first log file */ + ib_uint64_t start) /* in: lsn of the start of the first log file; + we pretend that there is a checkpoint at + start + LOG_BLOCK_HDR_SIZE */ +{ + ulint fold; + byte* buf; + ib_uint64_t lsn; + + mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0); + mach_write_ull(hdr_buf + LOG_FILE_START_LSN, start); + + lsn = start + LOG_BLOCK_HDR_SIZE; + + /* Write the label of ibbackup --restore */ + strcpy((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + "ibbackup "); + ut_sprintf_timestamp((char*) hdr_buf + + (LOG_FILE_WAS_CREATED_BY_HOT_BACKUP + + (sizeof "ibbackup ") - 1)); + buf = hdr_buf + LOG_CHECKPOINT_1; + + mach_write_ull(buf + LOG_CHECKPOINT_NO, 0); + mach_write_ull(buf + LOG_CHECKPOINT_LSN, lsn); + + mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET, + LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE); + + mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024); + + mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN, IB_ULONGLONG_MAX); + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold); + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold); + + /* Starting from InnoDB-3.23.50, we should also write info on + allocated size in the tablespace, but unfortunately we do not + know it here */ +} +#endif /* UNIV_HOTBACKUP */ + +/********************************************************** +Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */ +UNIV_INTERN +void +log_group_read_checkpoint_info( +/*===========================*/ + log_group_t* group, /* in: log group */ + ulint field) /* in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + log_sys->n_log_ios++; + + fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->space_id, 0, + field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); +} + +/********************************************************** +Writes checkpoint info to groups. */ +UNIV_INTERN +void +log_groups_write_checkpoint_info(void) +/*==================================*/ +{ + log_group_t* group; + + ut_ad(mutex_own(&(log_sys->mutex))); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + log_group_checkpoint(group); + + group = UT_LIST_GET_NEXT(log_groups, group); + } +} + +/********************************************************** +Makes a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log files. Use log_make_checkpoint_at to flush also the pool. 
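+A minimal caller sketch (hypothetical; server code normally reaches this
+function through log_checkpoint_margin() or log_make_checkpoint_at()):
+
+        if (!log_checkpoint(TRUE, FALSE)) {
+                ... a checkpoint write started by another thread was
+                already running; the caller may simply retry ...
+        }
+
+log_make_checkpoint_at() below retries in exactly this way until the call
+succeeds. 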
*/ +UNIV_INTERN +ibool +log_checkpoint( +/*===========*/ + /* out: TRUE if success, FALSE if a checkpoint + write was already running */ + ibool sync, /* in: TRUE if synchronous operation is + desired */ + ibool write_always) /* in: the function normally checks if the + the new checkpoint would have a greater + lsn than the previous one: if not, then no + physical write is done; by setting this + parameter TRUE, a physical write will always be + made to log files */ +{ + ib_uint64_t oldest_lsn; + + if (recv_recovery_is_on()) { + recv_apply_hashed_log_recs(TRUE); + } + + if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + fil_flush_file_spaces(FIL_TABLESPACE); + } + + mutex_enter(&(log_sys->mutex)); + + oldest_lsn = log_buf_pool_get_oldest_modification(); + + mutex_exit(&(log_sys->mutex)); + + /* Because log also contains headers and dummy log records, + if the buffer pool contains no dirty buffers, oldest_lsn + gets the value log_sys->lsn from the previous function, + and we must make sure that the log is flushed up to that + lsn. If there are dirty buffers in the buffer pool, then our + write-ahead-logging algorithm ensures that the log has been flushed + up to oldest_lsn. */ + + log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE); + + mutex_enter(&(log_sys->mutex)); + + if (!write_always + && log_sys->last_checkpoint_lsn >= oldest_lsn) { + + mutex_exit(&(log_sys->mutex)); + + return(TRUE); + } + + ut_ad(log_sys->written_to_all_lsn >= oldest_lsn); + + if (log_sys->n_pending_checkpoint_writes > 0) { + /* A checkpoint write is running */ + + mutex_exit(&(log_sys->mutex)); + + if (sync) { + /* Wait for the checkpoint write to complete */ + rw_lock_s_lock(&(log_sys->checkpoint_lock)); + rw_lock_s_unlock(&(log_sys->checkpoint_lock)); + } + + return(FALSE); + } + + log_sys->next_checkpoint_lsn = oldest_lsn; + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, "Making checkpoint no %lu at lsn %llu\n", + (ulong) log_sys->next_checkpoint_no, + oldest_lsn); + } +#endif /* UNIV_DEBUG */ + + log_groups_write_checkpoint_info(); + + mutex_exit(&(log_sys->mutex)); + + if (sync) { + /* Wait for the checkpoint write to complete */ + rw_lock_s_lock(&(log_sys->checkpoint_lock)); + rw_lock_s_unlock(&(log_sys->checkpoint_lock)); + } + + return(TRUE); +} + +/******************************************************************** +Makes a checkpoint at a given lsn or later. */ +UNIV_INTERN +void +log_make_checkpoint_at( +/*===================*/ + ib_uint64_t lsn, /* in: make a checkpoint at this or a + later lsn, if IB_ULONGLONG_MAX, makes + a checkpoint at the latest lsn */ + ibool write_always) /* in: the function normally checks if + the the new checkpoint would have a + greater lsn than the previous one: if + not, then no physical write is done; + by setting this parameter TRUE, a + physical write will always be made to + log files */ +{ + /* Preflush pages synchronously */ + + while (!log_preflush_pool_modified_pages(lsn, TRUE)); + + while (!log_checkpoint(TRUE, write_always)); +} + +/******************************************************************** +Tries to establish a big enough margin of free space in the log groups, such +that a new log entry can be catenated without an immediate need for a +checkpoint. NOTE: this function may only be called if the calling thread +owns no synchronization objects! 
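+Two margins are maintained: if the age of the oldest modified page in the
+buffer pool exceeds max_modified_age_sync, a synchronous preflush is
+requested which tries to advance the oldest modification lsn by
+2 * (age - max_modified_age_sync); if the age only exceeds
+max_modified_age_async, an asynchronous preflush of
+(age - max_modified_age_async) is enough. Analogously, the checkpoint age
+is compared against max_checkpoint_age and max_checkpoint_age_async to
+choose between a synchronous and an asynchronous checkpoint. 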
*/ +static +void +log_checkpoint_margin(void) +/*=======================*/ +{ + log_t* log = log_sys; + ib_uint64_t age; + ib_uint64_t checkpoint_age; + ib_uint64_t advance; + ib_uint64_t oldest_lsn; + ibool sync; + ibool checkpoint_sync; + ibool do_checkpoint; + ibool success; +loop: + sync = FALSE; + checkpoint_sync = FALSE; + do_checkpoint = FALSE; + + mutex_enter(&(log->mutex)); + + if (log->check_flush_or_checkpoint == FALSE) { + mutex_exit(&(log->mutex)); + + return; + } + + oldest_lsn = log_buf_pool_get_oldest_modification(); + + age = log->lsn - oldest_lsn; + + if (age > log->max_modified_age_sync) { + + /* A flush is urgent: we have to do a synchronous preflush */ + + sync = TRUE; + advance = 2 * (age - log->max_modified_age_sync); + } else if (age > log->max_modified_age_async) { + + /* A flush is not urgent: we do an asynchronous preflush */ + advance = age - log->max_modified_age_async; + } else { + advance = 0; + } + + checkpoint_age = log->lsn - log->last_checkpoint_lsn; + + if (checkpoint_age > log->max_checkpoint_age) { + /* A checkpoint is urgent: we do it synchronously */ + + checkpoint_sync = TRUE; + + do_checkpoint = TRUE; + + } else if (checkpoint_age > log->max_checkpoint_age_async) { + /* A checkpoint is not urgent: do it asynchronously */ + + do_checkpoint = TRUE; + + log->check_flush_or_checkpoint = FALSE; + } else { + log->check_flush_or_checkpoint = FALSE; + } + + mutex_exit(&(log->mutex)); + + if (advance) { + ib_uint64_t new_oldest = oldest_lsn + advance; + + success = log_preflush_pool_modified_pages(new_oldest, sync); + + /* If the flush succeeded, this thread has done its part + and can proceed. If it did not succeed, there was another + thread doing a flush at the same time. If sync was FALSE, + the flush was not urgent, and we let this thread proceed. + Otherwise, we let it start from the beginning again. */ + + if (sync && !success) { + mutex_enter(&(log->mutex)); + + log->check_flush_or_checkpoint = TRUE; + + mutex_exit(&(log->mutex)); + goto loop; + } + } + + if (do_checkpoint) { + log_checkpoint(checkpoint_sync, FALSE); + + if (checkpoint_sync) { + + goto loop; + } + } +} + +/********************************************************** +Reads a specified log segment to a buffer. */ +UNIV_INTERN +void +log_group_read_log_seg( +/*===================*/ + ulint type, /* in: LOG_ARCHIVE or LOG_RECOVER */ + byte* buf, /* in: buffer where to read */ + log_group_t* group, /* in: log group */ + ib_uint64_t start_lsn, /* in: read area start */ + ib_uint64_t end_lsn) /* in: read area end */ +{ + ulint len; + ulint source_offset; + ibool sync; + + ut_ad(mutex_own(&(log_sys->mutex))); + + sync = (type == LOG_RECOVER); +loop: + source_offset = log_group_calc_lsn_offset(start_lsn, group); + + len = (ulint) (end_lsn - start_lsn); + + ut_ad(len != 0); + + if ((source_offset % group->file_size) + len > group->file_size) { + + len = group->file_size - (source_offset % group->file_size); + } + +#ifdef UNIV_LOG_ARCHIVE + if (type == LOG_ARCHIVE) { + + log_sys->n_pending_archive_ios++; + } +#endif /* UNIV_LOG_ARCHIVE */ + + log_sys->n_log_ios++; + + fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, + source_offset / UNIV_PAGE_SIZE, source_offset % UNIV_PAGE_SIZE, + len, buf, NULL); + + start_lsn += len; + buf += len; + + if (start_lsn != end_lsn) { + + goto loop; + } +} + +#ifdef UNIV_LOG_ARCHIVE +/********************************************************** +Generates an archived log file name. 
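+For example, with srv_arch_dir set to the hypothetical value
+"/var/lib/mysql/" and file_no 7, the generated name is
+"/var/lib/mysql/ib_arch_log_0000000007". 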
*/ +UNIV_INTERN +void +log_archived_file_name_gen( +/*=======================*/ + char* buf, /* in: buffer where to write */ + ulint id __attribute__((unused)), + /* in: group id; + currently we only archive the first group */ + ulint file_no)/* in: file number */ +{ + sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, (ulong) file_no); +} + +/********************************************************** +Writes a log file header to a log file space. */ +static +void +log_group_archive_file_header_write( +/*================================*/ + log_group_t* group, /* in: log group */ + ulint nth_file, /* in: header to the nth file in the + archive log file space */ + ulint file_no, /* in: archived file number */ + ib_uint64_t start_lsn) /* in: log file data starts at this + lsn */ +{ + byte* buf; + ulint dest_offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + + ut_a(nth_file < group->n_files); + + buf = *(group->archive_file_header_bufs + nth_file); + + mach_write_to_4(buf + LOG_GROUP_ID, group->id); + mach_write_ull(buf + LOG_FILE_START_LSN, start_lsn); + mach_write_to_4(buf + LOG_FILE_NO, file_no); + + mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, FALSE); + + dest_offset = nth_file * group->file_size; + + log_sys->n_log_ios++; + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id, + dest_offset / UNIV_PAGE_SIZE, + dest_offset % UNIV_PAGE_SIZE, + 2 * OS_FILE_LOG_BLOCK_SIZE, + buf, &log_archive_io); +} + +/********************************************************** +Writes a log file header to a completed archived log file. */ +static +void +log_group_archive_completed_header_write( +/*=====================================*/ + log_group_t* group, /* in: log group */ + ulint nth_file, /* in: header to the nth file in the + archive log file space */ + ib_uint64_t end_lsn) /* in: end lsn of the file */ +{ + byte* buf; + ulint dest_offset; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_a(nth_file < group->n_files); + + buf = *(group->archive_file_header_bufs + nth_file); + + mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, TRUE); + mach_write_ull(buf + LOG_FILE_END_LSN, end_lsn); + + dest_offset = nth_file * group->file_size + LOG_FILE_ARCH_COMPLETED; + + log_sys->n_log_ios++; + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id, + dest_offset / UNIV_PAGE_SIZE, + dest_offset % UNIV_PAGE_SIZE, + OS_FILE_LOG_BLOCK_SIZE, + buf + LOG_FILE_ARCH_COMPLETED, + &log_archive_io); +} + +/********************************************************** +Does the archive writes for a single log group. 
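+The copy proceeds file by file: whenever next_offset reaches a file
+boundary, a new archive file is created (or an existing one is opened if
+the create fails), a file header carrying the start lsn is written, and
+the writes continue until next_archived_lsn is reached. 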
*/ +static +void +log_group_archive( +/*==============*/ + log_group_t* group) /* in: log group */ +{ + os_file_t file_handle; + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + char name[1024]; + byte* buf; + ulint len; + ibool ret; + ulint next_offset; + ulint n_files; + ulint open_mode; + + ut_ad(mutex_own(&(log_sys->mutex))); + + start_lsn = log_sys->archived_lsn; + + ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + + end_lsn = log_sys->next_archived_lsn; + + ut_a(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + + buf = log_sys->archive_buf; + + n_files = 0; + + next_offset = group->archived_offset; +loop: + if ((next_offset % group->file_size == 0) + || (fil_space_get_size(group->archive_space_id) == 0)) { + + /* Add the file to the archive file space; create or open the + file */ + + if (next_offset % group->file_size == 0) { + open_mode = OS_FILE_CREATE; + } else { + open_mode = OS_FILE_OPEN; + } + + log_archived_file_name_gen(name, group->id, + group->archived_file_no + n_files); + + file_handle = os_file_create(name, open_mode, OS_FILE_AIO, + OS_DATA_FILE, &ret); + + if (!ret && (open_mode == OS_FILE_CREATE)) { + file_handle = os_file_create( + name, OS_FILE_OPEN, OS_FILE_AIO, + OS_DATA_FILE, &ret); + } + + if (!ret) { + fprintf(stderr, + "InnoDB: Cannot create or open" + " archive log file %s.\n" + "InnoDB: Cannot continue operation.\n" + "InnoDB: Check that the log archive" + " directory exists,\n" + "InnoDB: you have access rights to it, and\n" + "InnoDB: there is space available.\n", name); + exit(1); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, "Created archive file %s\n", name); + } +#endif /* UNIV_DEBUG */ + + ret = os_file_close(file_handle); + + ut_a(ret); + + /* Add the archive file as a node to the space */ + + fil_node_create(name, group->file_size / UNIV_PAGE_SIZE, + group->archive_space_id, FALSE); + + if (next_offset % group->file_size == 0) { + log_group_archive_file_header_write( + group, n_files, + group->archived_file_no + n_files, + start_lsn); + + next_offset += LOG_FILE_HDR_SIZE; + } + } + + len = end_lsn - start_lsn; + + if (group->file_size < (next_offset % group->file_size) + len) { + + len = group->file_size - (next_offset % group->file_size); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Archiving starting at lsn %llu, len %lu" + " to group %lu\n", + start_lsn, + (ulong) len, (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + + log_sys->n_pending_archive_ios++; + + log_sys->n_log_ios++; + + fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->archive_space_id, + next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE, + ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, + &log_archive_io); + + start_lsn += len; + next_offset += len; + buf += len; + + if (next_offset % group->file_size == 0) { + n_files++; + } + + if (end_lsn != start_lsn) { + + goto loop; + } + + group->next_archived_file_no = group->archived_file_no + n_files; + group->next_archived_offset = next_offset % group->file_size; + + ut_a(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0); +} + +/********************************************************* +(Writes to the archive of each log group.) Currently, only the first +group is archived. 
*/ +static +void +log_archive_groups(void) +/*====================*/ +{ + log_group_t* group; + + ut_ad(mutex_own(&(log_sys->mutex))); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + log_group_archive(group); +} + +/********************************************************* +Completes the archiving write phase for (each log group), currently, +the first log group. */ +static +void +log_archive_write_complete_groups(void) +/*===================================*/ +{ + log_group_t* group; + ulint end_offset; + ulint trunc_files; + ulint n_files; + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + ulint i; + + ut_ad(mutex_own(&(log_sys->mutex))); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + group->archived_file_no = group->next_archived_file_no; + group->archived_offset = group->next_archived_offset; + + /* Truncate from the archive file space all but the last + file, or if it has been written full, all files */ + + n_files = (UNIV_PAGE_SIZE + * fil_space_get_size(group->archive_space_id)) + / group->file_size; + ut_ad(n_files > 0); + + end_offset = group->archived_offset; + + if (end_offset % group->file_size == 0) { + + trunc_files = n_files; + } else { + trunc_files = n_files - 1; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes && trunc_files) { + fprintf(stderr, + "Complete file(s) archived to group %lu\n", + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + + /* Calculate the archive file space start lsn */ + start_lsn = log_sys->next_archived_lsn + - (end_offset - LOG_FILE_HDR_SIZE + trunc_files + * (group->file_size - LOG_FILE_HDR_SIZE)); + end_lsn = start_lsn; + + for (i = 0; i < trunc_files; i++) { + + end_lsn += group->file_size - LOG_FILE_HDR_SIZE; + + /* Write a notice to the headers of archived log + files that the file write has been completed */ + + log_group_archive_completed_header_write(group, i, end_lsn); + } + + fil_space_truncate_start(group->archive_space_id, + trunc_files * group->file_size); + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fputs("Archiving writes completed\n", stderr); + } +#endif /* UNIV_DEBUG */ +} + +/********************************************************** +Completes an archiving i/o. */ +static +void +log_archive_check_completion_low(void) +/*==================================*/ +{ + ut_ad(mutex_own(&(log_sys->mutex))); + + if (log_sys->n_pending_archive_ios == 0 + && log_sys->archiving_phase == LOG_ARCHIVE_READ) { + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fputs("Archiving read completed\n", stderr); + } +#endif /* UNIV_DEBUG */ + + /* Archive buffer has now been read in: start archive writes */ + + log_sys->archiving_phase = LOG_ARCHIVE_WRITE; + + log_archive_groups(); + } + + if (log_sys->n_pending_archive_ios == 0 + && log_sys->archiving_phase == LOG_ARCHIVE_WRITE) { + + log_archive_write_complete_groups(); + + log_sys->archived_lsn = log_sys->next_archived_lsn; + + rw_lock_x_unlock_gen(&(log_sys->archive_lock), LOG_ARCHIVE); + } +} + +/********************************************************** +Completes an archiving i/o. 
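+The archive file space is flushed to disk before the pending i/o count is
+decremented, so that log_archive_check_completion_low() only advances the
+archiving phase once the completed i/o is durable. 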
*/ +static +void +log_io_complete_archive(void) +/*=========================*/ +{ + log_group_t* group; + + mutex_enter(&(log_sys->mutex)); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + mutex_exit(&(log_sys->mutex)); + + fil_flush(group->archive_space_id); + + mutex_enter(&(log_sys->mutex)); + + ut_ad(log_sys->n_pending_archive_ios > 0); + + log_sys->n_pending_archive_ios--; + + log_archive_check_completion_low(); + + mutex_exit(&(log_sys->mutex)); +} + +/************************************************************************ +Starts an archiving operation. */ +UNIV_INTERN +ibool +log_archive_do( +/*===========*/ + /* out: TRUE if succeed, FALSE if an archiving + operation was already running */ + ibool sync, /* in: TRUE if synchronous operation is desired */ + ulint* n_bytes)/* out: archive log buffer size, 0 if nothing to + archive */ +{ + ibool calc_new_limit; + ib_uint64_t start_lsn; + ib_uint64_t limit_lsn; + + calc_new_limit = TRUE; +loop: + mutex_enter(&(log_sys->mutex)); + + switch (log_sys->archiving_state) { + case LOG_ARCH_OFF: +arch_none: + mutex_exit(&(log_sys->mutex)); + + *n_bytes = 0; + + return(TRUE); + case LOG_ARCH_STOPPED: + case LOG_ARCH_STOPPING2: + mutex_exit(&(log_sys->mutex)); + + os_event_wait(log_sys->archiving_on); + + goto loop; + } + + start_lsn = log_sys->archived_lsn; + + if (calc_new_limit) { + ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0); + limit_lsn = start_lsn + log_sys->archive_buf_size; + + *n_bytes = log_sys->archive_buf_size; + + if (limit_lsn >= log_sys->lsn) { + + limit_lsn = ut_uint64_align_down( + log_sys->lsn, OS_FILE_LOG_BLOCK_SIZE); + } + } + + if (log_sys->archived_lsn >= limit_lsn) { + + goto arch_none; + } + + if (log_sys->written_to_all_lsn < limit_lsn) { + + mutex_exit(&(log_sys->mutex)); + + log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE); + + calc_new_limit = FALSE; + + goto loop; + } + + if (log_sys->n_pending_archive_ios > 0) { + /* An archiving operation is running */ + + mutex_exit(&(log_sys->mutex)); + + if (sync) { + rw_lock_s_lock(&(log_sys->archive_lock)); + rw_lock_s_unlock(&(log_sys->archive_lock)); + } + + *n_bytes = log_sys->archive_buf_size; + + return(FALSE); + } + + rw_lock_x_lock_gen(&(log_sys->archive_lock), LOG_ARCHIVE); + + log_sys->archiving_phase = LOG_ARCHIVE_READ; + + log_sys->next_archived_lsn = limit_lsn; + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Archiving from lsn %llu to lsn %llu\n", + log_sys->archived_lsn, limit_lsn); + } +#endif /* UNIV_DEBUG */ + + /* Read the log segment to the archive buffer */ + + log_group_read_log_seg(LOG_ARCHIVE, log_sys->archive_buf, + UT_LIST_GET_FIRST(log_sys->log_groups), + start_lsn, limit_lsn); + + mutex_exit(&(log_sys->mutex)); + + if (sync) { + rw_lock_s_lock(&(log_sys->archive_lock)); + rw_lock_s_unlock(&(log_sys->archive_lock)); + } + + *n_bytes = log_sys->archive_buf_size; + + return(TRUE); +} + +/******************************************************************** +Writes the log contents to the archive at least up to the lsn when this +function was called. 
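+The current log block is first padded to a block boundary with dummy
+records, and log_archive_do() is then called repeatedly until archived_lsn
+has passed the lsn sampled at entry. 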
*/ +static +void +log_archive_all(void) +/*=================*/ +{ + ib_uint64_t present_lsn; + ulint dummy; + + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state == LOG_ARCH_OFF) { + mutex_exit(&(log_sys->mutex)); + + return; + } + + present_lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + log_pad_current_log_block(); + + for (;;) { + mutex_enter(&(log_sys->mutex)); + + if (present_lsn <= log_sys->archived_lsn) { + + mutex_exit(&(log_sys->mutex)); + + return; + } + + mutex_exit(&(log_sys->mutex)); + + log_archive_do(TRUE, &dummy); + } +} + +/********************************************************* +Closes the possible open archive log file (for each group) the first group, +and if it was open, increments the group file count by 2, if desired. */ +static +void +log_archive_close_groups( +/*=====================*/ + ibool increment_file_count) /* in: TRUE if we want to increment + the file count */ +{ + log_group_t* group; + ulint trunc_len; + + ut_ad(mutex_own(&(log_sys->mutex))); + + if (log_sys->archiving_state == LOG_ARCH_OFF) { + + return; + } + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + trunc_len = UNIV_PAGE_SIZE + * fil_space_get_size(group->archive_space_id); + if (trunc_len > 0) { + ut_a(trunc_len == group->file_size); + + /* Write a notice to the headers of archived log + files that the file write has been completed */ + + log_group_archive_completed_header_write( + group, 0, log_sys->archived_lsn); + + fil_space_truncate_start(group->archive_space_id, + trunc_len); + if (increment_file_count) { + group->archived_offset = 0; + group->archived_file_no += 2; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "Incrementing arch file no to %lu" + " in log group %lu\n", + (ulong) group->archived_file_no + 2, + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + } +} + +/******************************************************************** +Writes the log contents to the archive up to the lsn when this function was +called, and stops the archiving. When archiving is started again, the archived +log file numbers start from 2 higher, so that the archiving will not write +again to the archived log files which exist when this function returns. 
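+A hypothetical sketch of how a backup tool might use the stop/start pair:
+
+        if (log_archive_stop() != DB_SUCCESS) {
+                ... archiving was not in the LOG_ARCH_ON state ...
+        }
+        ... copy away the completed archived log files ...
+        if (log_archive_start() != DB_SUCCESS) {
+                ... archiving had not been stopped ...
+        }
+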
*/ +UNIV_INTERN +ulint +log_archive_stop(void) +/*==================*/ + /* out: DB_SUCCESS or DB_ERROR */ +{ + ibool success; + + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state != LOG_ARCH_ON) { + + mutex_exit(&(log_sys->mutex)); + + return(DB_ERROR); + } + + log_sys->archiving_state = LOG_ARCH_STOPPING; + + mutex_exit(&(log_sys->mutex)); + + log_archive_all(); + + mutex_enter(&(log_sys->mutex)); + + log_sys->archiving_state = LOG_ARCH_STOPPING2; + os_event_reset(log_sys->archiving_on); + + mutex_exit(&(log_sys->mutex)); + + /* Wait for a possible archiving operation to end */ + + rw_lock_s_lock(&(log_sys->archive_lock)); + rw_lock_s_unlock(&(log_sys->archive_lock)); + + mutex_enter(&(log_sys->mutex)); + + /* Close all archived log files, incrementing the file count by 2, + if appropriate */ + + log_archive_close_groups(TRUE); + + mutex_exit(&(log_sys->mutex)); + + /* Make a checkpoint, so that if recovery is needed, the file numbers + of new archived log files will start from the right value */ + + success = FALSE; + + while (!success) { + success = log_checkpoint(TRUE, TRUE); + } + + mutex_enter(&(log_sys->mutex)); + + log_sys->archiving_state = LOG_ARCH_STOPPED; + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); +} + +/******************************************************************** +Starts again archiving which has been stopped. */ +UNIV_INTERN +ulint +log_archive_start(void) +/*===================*/ + /* out: DB_SUCCESS or DB_ERROR */ +{ + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state != LOG_ARCH_STOPPED) { + + mutex_exit(&(log_sys->mutex)); + + return(DB_ERROR); + } + + log_sys->archiving_state = LOG_ARCH_ON; + + os_event_set(log_sys->archiving_on); + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); +} + +/******************************************************************** +Stop archiving the log so that a gap may occur in the archived log files. */ +UNIV_INTERN +ulint +log_archive_noarchivelog(void) +/*==========================*/ + /* out: DB_SUCCESS or DB_ERROR */ +{ +loop: + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state == LOG_ARCH_STOPPED + || log_sys->archiving_state == LOG_ARCH_OFF) { + + log_sys->archiving_state = LOG_ARCH_OFF; + + os_event_set(log_sys->archiving_on); + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); + } + + mutex_exit(&(log_sys->mutex)); + + log_archive_stop(); + + os_thread_sleep(500000); + + goto loop; +} + +/******************************************************************** +Start archiving the log so that a gap may occur in the archived log files. */ +UNIV_INTERN +ulint +log_archive_archivelog(void) +/*========================*/ + /* out: DB_SUCCESS or DB_ERROR */ +{ + mutex_enter(&(log_sys->mutex)); + + if (log_sys->archiving_state == LOG_ARCH_OFF) { + + log_sys->archiving_state = LOG_ARCH_ON; + + log_sys->archived_lsn + = ut_uint64_align_down(log_sys->lsn, + OS_FILE_LOG_BLOCK_SIZE); + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); + } + + mutex_exit(&(log_sys->mutex)); + + return(DB_ERROR); +} + +/******************************************************************** +Tries to establish a big enough margin of free space in the log groups, such +that a new log entry can be catenated without an immediate need for +archiving. 
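+The age log->lsn - log->archived_lsn is compared against
+max_archived_lsn_age, above which archiving is urgent and synchronous i/o
+is used, and against max_archived_lsn_age_async, above which asynchronous
+i/o is enough. 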
*/ +static +void +log_archive_margin(void) +/*====================*/ +{ + log_t* log = log_sys; + ulint age; + ibool sync; + ulint dummy; +loop: + mutex_enter(&(log->mutex)); + + if (log->archiving_state == LOG_ARCH_OFF) { + mutex_exit(&(log->mutex)); + + return; + } + + age = log->lsn - log->archived_lsn; + + if (age > log->max_archived_lsn_age) { + + /* An archiving is urgent: we have to do synchronous i/o */ + + sync = TRUE; + + } else if (age > log->max_archived_lsn_age_async) { + + /* An archiving is not urgent: we do asynchronous i/o */ + + sync = FALSE; + } else { + /* No archiving required yet */ + + mutex_exit(&(log->mutex)); + + return; + } + + mutex_exit(&(log->mutex)); + + log_archive_do(sync, &dummy); + + if (sync == TRUE) { + /* Check again that enough was written to the archive */ + + goto loop; + } +} +#endif /* UNIV_LOG_ARCHIVE */ + +/************************************************************************ +Checks that there is enough free space in the log to start a new query step. +Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this +function may only be called if the calling thread owns no synchronization +objects! */ +UNIV_INTERN +void +log_check_margins(void) +/*===================*/ +{ +loop: + log_flush_margin(); + + log_checkpoint_margin(); + +#ifdef UNIV_LOG_ARCHIVE + log_archive_margin(); +#endif /* UNIV_LOG_ARCHIVE */ + + mutex_enter(&(log_sys->mutex)); + + if (log_sys->check_flush_or_checkpoint) { + + mutex_exit(&(log_sys->mutex)); + + goto loop; + } + + mutex_exit(&(log_sys->mutex)); +} + +/******************************************************************** +Makes a checkpoint at the latest lsn and writes it to first page of each +data file in the database, so that we know that the file spaces contain +all modifications up to that lsn. This can only be called at database +shutdown. This function also writes all log in log files to the log archive. */ +UNIV_INTERN +void +logs_empty_and_mark_files_at_shutdown(void) +/*=======================================*/ +{ + ib_uint64_t lsn; + ulint arch_log_no; + + if (srv_print_verbose_log) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Starting shutdown...\n"); + } + /* Wait until the master thread and all other operations are idle: our + algorithm only works if the server is idle at shutdown */ + + srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; +loop: + os_thread_sleep(100000); + + mutex_enter(&kernel_mutex); + + /* We need the monitor threads to stop before we proceed with a + normal shutdown. In case of very fast shutdown, however, we can + proceed without waiting for monitor threads. */ + + if (srv_fast_shutdown < 2 + && (srv_error_monitor_active + || srv_lock_timeout_and_monitor_active)) { + + mutex_exit(&kernel_mutex); + + goto loop; + } + + /* Check that there are no longer transactions. We need this wait even + for the 'very fast' shutdown, because the InnoDB layer may have + committed or prepared transactions and we don't want to lose them. */ + + if (trx_n_mysql_transactions > 0 + || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { + + mutex_exit(&kernel_mutex); + + goto loop; + } + + if (srv_fast_shutdown == 2) { + /* In this fastest shutdown we do not flush the buffer pool: + it is essentially a 'crash' of the InnoDB server. Make sure + that the log is all flushed to disk, so that we can recover + all committed transactions in a crash recovery. 
We must not + write the lsn stamps to the data files, since at a startup + InnoDB deduces from the stamps if the previous shutdown was + clean. */ + + log_buffer_flush_to_disk(); + + return; /* We SKIP ALL THE REST !! */ + } + + /* Check that the master thread is suspended */ + + if (srv_n_threads_active[SRV_MASTER] != 0) { + + mutex_exit(&kernel_mutex); + + goto loop; + } + + mutex_exit(&kernel_mutex); + + mutex_enter(&(log_sys->mutex)); + + if (log_sys->n_pending_checkpoint_writes +#ifdef UNIV_LOG_ARCHIVE + || log_sys->n_pending_archive_ios +#endif /* UNIV_LOG_ARCHIVE */ + || log_sys->n_pending_writes) { + + mutex_exit(&(log_sys->mutex)); + + goto loop; + } + + mutex_exit(&(log_sys->mutex)); + + if (!buf_pool_check_no_pending_io()) { + + goto loop; + } + +#ifdef UNIV_LOG_ARCHIVE + log_archive_all(); +#endif /* UNIV_LOG_ARCHIVE */ + + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + if (lsn != log_sys->last_checkpoint_lsn +#ifdef UNIV_LOG_ARCHIVE + || (srv_log_archive_on + && lsn != log_sys->archived_lsn + LOG_BLOCK_HDR_SIZE) +#endif /* UNIV_LOG_ARCHIVE */ + ) { + + mutex_exit(&(log_sys->mutex)); + + goto loop; + } + + arch_log_no = 0; + +#ifdef UNIV_LOG_ARCHIVE + UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no; + + if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) { + + arch_log_no--; + } + + log_archive_close_groups(TRUE); +#endif /* UNIV_LOG_ARCHIVE */ + + mutex_exit(&(log_sys->mutex)); + + mutex_enter(&kernel_mutex); + /* Check that the master thread has stayed suspended */ + if (srv_n_threads_active[SRV_MASTER] != 0) { + fprintf(stderr, + "InnoDB: Warning: the master thread woke up" + " during shutdown\n"); + + mutex_exit(&kernel_mutex); + + goto loop; + } + mutex_exit(&kernel_mutex); + + fil_flush_file_spaces(FIL_TABLESPACE); + fil_flush_file_spaces(FIL_LOG); + + /* The call fil_write_flushed_lsn_to_data_files() will pass the buffer + pool: therefore it is essential that the buffer pool has been + completely flushed to disk! (We do not call fil_write... if the + 'very fast' shutdown is enabled.) */ + + if (!buf_all_freed()) { + + goto loop; + } + + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + + /* Make some checks that the server really is quiet */ + ut_a(srv_n_threads_active[SRV_MASTER] == 0); + ut_a(buf_all_freed()); + ut_a(lsn == log_sys->lsn); + + if (lsn < srv_start_lsn) { + fprintf(stderr, + "InnoDB: Error: log sequence number" + " at shutdown %llu\n" + "InnoDB: is lower than at startup %llu!\n", + lsn, srv_start_lsn); + } + + srv_shutdown_lsn = lsn; + + fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); + + fil_flush_file_spaces(FIL_TABLESPACE); + + fil_close_all_files(); + + /* Make some checks that the server really is quiet */ + ut_a(srv_n_threads_active[SRV_MASTER] == 0); + ut_a(buf_all_freed()); + ut_a(lsn == log_sys->lsn); +} + +/********************************************************** +Checks by parsing that the catenated log segment for a single mtr is +consistent. 
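+The segment is copied to an aligned scratch buffer and parsed with
+recv_scan_log_recs(); the scan must end exactly at buf_start_lsn + len and
+agree with recv_sys->recovered_lsn, otherwise an assertion fails. 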
*/ +UNIV_INTERN +ibool +log_check_log_recs( +/*===============*/ + byte* buf, /* in: pointer to the start of + the log segment in the + log_sys->buf log buffer */ + ulint len, /* in: segment length in bytes */ + ib_uint64_t buf_start_lsn) /* in: buffer start lsn */ +{ + ib_uint64_t contiguous_lsn; + ib_uint64_t scanned_lsn; + byte* start; + byte* end; + byte* buf1; + byte* scan_buf; + + ut_ad(mutex_own(&(log_sys->mutex))); + + if (len == 0) { + + return(TRUE); + } + + start = ut_align_down(buf, OS_FILE_LOG_BLOCK_SIZE); + end = ut_align(buf + len, OS_FILE_LOG_BLOCK_SIZE); + + buf1 = mem_alloc((end - start) + OS_FILE_LOG_BLOCK_SIZE); + scan_buf = ut_align(buf1, OS_FILE_LOG_BLOCK_SIZE); + + ut_memcpy(scan_buf, start, end - start); + + recv_scan_log_recs(TRUE, + (buf_pool->curr_size + - recv_n_pool_free_frames) * UNIV_PAGE_SIZE, + FALSE, scan_buf, end - start, + ut_uint64_align_down(buf_start_lsn, + OS_FILE_LOG_BLOCK_SIZE), + &contiguous_lsn, &scanned_lsn); + + ut_a(scanned_lsn == buf_start_lsn + len); + ut_a(recv_sys->recovered_lsn == scanned_lsn); + + mem_free(buf1); + + return(TRUE); +} + +/********************************************************** +Peeks the current lsn. */ +UNIV_INTERN +ibool +log_peek_lsn( +/*=========*/ + /* out: TRUE if success, FALSE if + could not get the log system mutex */ + ib_uint64_t* lsn) /* out: if returns TRUE, current lsn is here */ +{ + if (0 == mutex_enter_nowait(&(log_sys->mutex))) { + *lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + return(TRUE); + } + + return(FALSE); +} + +/********************************************************** +Prints info of the log. */ +UNIV_INTERN +void +log_print( +/*======*/ + FILE* file) /* in: file where to print */ +{ + double time_elapsed; + time_t current_time; + + mutex_enter(&(log_sys->mutex)); + + fprintf(file, + "Log sequence number %llu\n" + "Log flushed up to %llu\n" + "Last checkpoint at %llu\n", + log_sys->lsn, + log_sys->flushed_to_disk_lsn, + log_sys->last_checkpoint_lsn); + + fprintf(file, + "Max checkpoint age %lu\n" + "Modified age %lu\n" + "Checkpoint age %lu\n", + (ulong) log_sys->max_checkpoint_age, + (ulong) (log_sys->lsn - + log_buf_pool_get_oldest_modification()), + (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn)); + + current_time = time(NULL); + + time_elapsed = 0.001 + difftime(current_time, + log_sys->last_printout_time); + fprintf(file, + "%lu pending log writes, %lu pending chkp writes\n" + "%lu log i/o's done, %.2f log i/o's/second\n", + (ulong) log_sys->n_pending_writes, + (ulong) log_sys->n_pending_checkpoint_writes, + (ulong) log_sys->n_log_ios, + ((log_sys->n_log_ios - log_sys->n_log_ios_old) + / time_elapsed)); + + log_sys->n_log_ios_old = log_sys->n_log_ios; + log_sys->last_printout_time = current_time; + + mutex_exit(&(log_sys->mutex)); +} + +/************************************************************************** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +log_refresh_stats(void) +/*===================*/ +{ + log_sys->n_log_ios_old = log_sys->n_log_ios; + log_sys->last_printout_time = time(NULL); +} diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c new file mode 100644 index 00000000000..b72dde4efcf --- /dev/null +++ b/storage/xtradb/log/log0recv.c @@ -0,0 +1,3476 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#include "log0recv.h" + +#ifdef UNIV_NONINL +#include "log0recv.ic" +#endif + +#include "mem0mem.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0log.h" +#include "page0cur.h" +#include "page0zip.h" +#include "btr0cur.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "row0merge.h" + +#ifdef UNIV_HOTBACKUP +/* This is set to FALSE if the backup was originally taken with the +ibbackup --include regexp option: then we do not want to create tables in +directories which were not included */ +UNIV_INTERN ibool recv_replay_file_ops = TRUE; +#endif /* UNIV_HOTBACKUP */ + +/* Log records are stored in the hash table in chunks at most of this size; +this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */ +#define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t)) + +/* Read-ahead area in applying log records to file pages */ +#define RECV_READ_AHEAD_AREA 32 + +UNIV_INTERN recv_sys_t* recv_sys = NULL; +UNIV_INTERN ibool recv_recovery_on = FALSE; +UNIV_INTERN ibool recv_recovery_from_backup_on = FALSE; + +UNIV_INTERN ibool recv_needed_recovery = FALSE; + +UNIV_INTERN ibool recv_lsn_checks_on = FALSE; + +/* There are two conditions under which we scan the logs, the first +is normal startup and the second is when we do a recovery from an +archive. +This flag is set if we are doing a scan from the last checkpoint during +startup. If we find log entries that were written after the last checkpoint +we know that the server was not cleanly shutdown. We must then initialize +the crash recovery environment before attempting to store these entries in +the log hash table. 
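+That initialization is done by recv_init_crash_recovery(), which may only
+be called while recv_needed_recovery is still FALSE. 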
*/ +UNIV_INTERN ibool recv_log_scan_is_startup_type = FALSE; + +/* If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this becomes TRUE if +the log record hash table becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state */ + +/* Recovery is running and no operations on the log files are allowed +yet: the variable name is misleading */ + +UNIV_INTERN ibool recv_no_ibuf_operations = FALSE; + +/* The following counter is used to decide when to print info on +log scan */ +UNIV_INTERN ulint recv_scan_print_counter = 0; + +UNIV_INTERN ibool recv_is_from_backup = FALSE; +#ifdef UNIV_HOTBACKUP +UNIV_INTERN ibool recv_is_making_a_backup = FALSE; +#else +# define recv_is_making_a_backup FALSE +#endif /* UNIV_HOTBACKUP */ + +UNIV_INTERN ulint recv_previous_parsed_rec_type = 999999; +UNIV_INTERN ulint recv_previous_parsed_rec_offset = 0; +UNIV_INTERN ulint recv_previous_parsed_rec_is_multi = 0; + +UNIV_INTERN ulint recv_max_parsed_page_no = 0; + +/* This many frames must be left free in the buffer pool when we scan +the log and store the scanned log records in the buffer pool: we will +use these free frames to read in pages when we start applying the +log records to the database. */ + +UNIV_INTERN ulint recv_n_pool_free_frames = 256; + +/* The maximum lsn we see for a page during the recovery process. If this +is bigger than the lsn we are able to scan up to, that is an indication that +the recovery failed and the database may be corrupt. */ + +UNIV_INTERN ib_uint64_t recv_max_page_lsn; + +/* prototypes */ + +/*********************************************************** +Initialize crash recovery environment. Can be called iff +recv_needed_recovery == FALSE. */ +static +void +recv_init_crash_recovery(void); +/*===========================*/ + +/************************************************************ +Creates the recovery system. */ +UNIV_INTERN +void +recv_sys_create(void) +/*=================*/ +{ + if (recv_sys != NULL) { + + return; + } + + recv_sys = mem_alloc(sizeof(recv_sys_t)); + + mutex_create(&recv_sys->mutex, SYNC_RECV); + + recv_sys->heap = NULL; + recv_sys->addr_hash = NULL; +} + +/************************************************************ +Inits the recovery system for a recovery operation. 
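+For a normal recovery the heap is created inside the buffer pool, so that
+the stored log records live in buffer frames, while a recovery from a hot
+backup uses an ordinary heap. A sketch of the expected call order
+(hypothetical caller; normal startup passes FALSE and the buffer pool
+size):
+
+        recv_sys_create();
+        recv_sys_init(FALSE, buf_pool_get_curr_size());
+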
*/ +UNIV_INTERN +void +recv_sys_init( +/*==========*/ + ibool recover_from_backup, /* in: TRUE if this is called + to recover from a hot backup */ + ulint available_memory) /* in: available memory in bytes */ +{ + if (recv_sys->heap != NULL) { + + return; + } + + mutex_enter(&(recv_sys->mutex)); + + if (!recover_from_backup) { + recv_sys->heap = mem_heap_create_in_buffer(256); + } else { + recv_sys->heap = mem_heap_create(256); + recv_is_from_backup = TRUE; + } + + recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE); + recv_sys->len = 0; + recv_sys->recovered_offset = 0; + + recv_sys->addr_hash = hash_create(available_memory / 64); + recv_sys->n_addrs = 0; + + recv_sys->apply_log_recs = FALSE; + recv_sys->apply_batch_on = FALSE; + + recv_sys->last_block_buf_start = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE); + + recv_sys->last_block = ut_align(recv_sys->last_block_buf_start, + OS_FILE_LOG_BLOCK_SIZE); + recv_sys->found_corrupt_log = FALSE; + + recv_max_page_lsn = 0; + + mutex_exit(&(recv_sys->mutex)); +} + +/************************************************************ +Empties the hash table when it has been fully processed. */ +static +void +recv_sys_empty_hash(void) +/*=====================*/ +{ + ut_ad(mutex_own(&(recv_sys->mutex))); + + if (recv_sys->n_addrs != 0) { + fprintf(stderr, + "InnoDB: Error: %lu pages with log records" + " were left unprocessed!\n" + "InnoDB: Maximum page number with" + " log records on it %lu\n", + (ulong) recv_sys->n_addrs, + (ulong) recv_max_parsed_page_no); + ut_error; + } + + hash_table_free(recv_sys->addr_hash); + mem_heap_empty(recv_sys->heap); + + recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 256); +} + +#ifndef UNIV_LOG_DEBUG +/************************************************************ +Frees the recovery system. */ +static +void +recv_sys_free(void) +/*===============*/ +{ + mutex_enter(&(recv_sys->mutex)); + + hash_table_free(recv_sys->addr_hash); + mem_heap_free(recv_sys->heap); + ut_free(recv_sys->buf); + mem_free(recv_sys->last_block_buf_start); + + recv_sys->addr_hash = NULL; + recv_sys->heap = NULL; + + mutex_exit(&(recv_sys->mutex)); +} +#endif /* UNIV_LOG_DEBUG */ + +/************************************************************ +Truncates possible corrupted or extra records from a log group. */ +static +void +recv_truncate_group( +/*================*/ + log_group_t* group, /* in: log group */ + ib_uint64_t recovered_lsn, /* in: recovery succeeded up to this + lsn */ + ib_uint64_t limit_lsn, /* in: this was the limit for + recovery */ + ib_uint64_t checkpoint_lsn, /* in: recovery was started from this + checkpoint */ + ib_uint64_t archived_lsn) /* in: the log has been archived up to + this lsn */ +{ + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + ib_uint64_t finish_lsn1; + ib_uint64_t finish_lsn2; + ib_uint64_t finish_lsn; + ulint len; + ulint i; + + if (archived_lsn == IB_ULONGLONG_MAX) { + /* Checkpoint was taken in the NOARCHIVELOG mode */ + archived_lsn = checkpoint_lsn; + } + + finish_lsn1 = ut_uint64_align_down(archived_lsn, + OS_FILE_LOG_BLOCK_SIZE) + + log_group_get_capacity(group); + + finish_lsn2 = ut_uint64_align_up(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE) + + recv_sys->last_log_buf_size; + + if (limit_lsn != IB_ULONGLONG_MAX) { + /* We do not know how far we should erase log records: erase + as much as possible */ + + finish_lsn = finish_lsn1; + } else { + /* It is enough to erase the length of the log buffer */ + finish_lsn = finish_lsn1 < finish_lsn2 + ? 
finish_lsn1 : finish_lsn2; + } + + ut_a(RECV_SCAN_SIZE <= log_sys->buf_size); + + /* Write the log buffer full of zeros */ + for (i = 0; i < RECV_SCAN_SIZE; i++) { + + *(log_sys->buf + i) = '\0'; + } + + start_lsn = ut_uint64_align_down(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); + + if (start_lsn != recovered_lsn) { + /* Copy the last incomplete log block to the log buffer and + edit its data length: */ + + ut_memcpy(log_sys->buf, recv_sys->last_block, + OS_FILE_LOG_BLOCK_SIZE); + log_block_set_data_len(log_sys->buf, + (ulint) (recovered_lsn - start_lsn)); + } + + if (start_lsn >= finish_lsn) { + + return; + } + + for (;;) { + end_lsn = start_lsn + RECV_SCAN_SIZE; + + if (end_lsn > finish_lsn) { + + end_lsn = finish_lsn; + } + + len = (ulint) (end_lsn - start_lsn); + + log_group_write_buf(group, log_sys->buf, len, start_lsn, 0); + if (end_lsn >= finish_lsn) { + + return; + } + + /* Write the log buffer full of zeros */ + for (i = 0; i < RECV_SCAN_SIZE; i++) { + + *(log_sys->buf + i) = '\0'; + } + + start_lsn = end_lsn; + } +} + +/************************************************************ +Copies the log segment between group->recovered_lsn and recovered_lsn from the +most up-to-date log group to group, so that it contains the latest log data. */ +static +void +recv_copy_group( +/*============*/ + log_group_t* up_to_date_group, /* in: the most up-to-date log + group */ + log_group_t* group, /* in: copy to this log + group */ + ib_uint64_t recovered_lsn) /* in: recovery succeeded up + to this lsn */ +{ + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + ulint len; + + if (group->scanned_lsn >= recovered_lsn) { + + return; + } + + ut_a(RECV_SCAN_SIZE <= log_sys->buf_size); + + start_lsn = ut_uint64_align_down(group->scanned_lsn, + OS_FILE_LOG_BLOCK_SIZE); + for (;;) { + end_lsn = start_lsn + RECV_SCAN_SIZE; + + if (end_lsn > recovered_lsn) { + end_lsn = ut_uint64_align_up(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); + } + + log_group_read_log_seg(LOG_RECOVER, log_sys->buf, + up_to_date_group, start_lsn, end_lsn); + + len = (ulint) (end_lsn - start_lsn); + + log_group_write_buf(group, log_sys->buf, len, start_lsn, 0); + + if (end_lsn >= recovered_lsn) { + + return; + } + + start_lsn = end_lsn; + } +} + +/************************************************************ +Copies a log segment from the most up-to-date log group to the other log +groups, so that they all contain the latest log data. Also writes the info +about the latest checkpoint to the groups, and inits the fields in the group +memory structs to up-to-date values. 
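+The last, incomplete log block is also re-read into recv_sys->last_block,
+and the checkpoint info is rewritten to all groups; because checkpoint_no
+was incremented, the write goes to the other checkpoint slot, and the
+maximum checkpoint info already on disk is certain to be preserved. 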
*/ +static +void +recv_synchronize_groups( +/*====================*/ + log_group_t* up_to_date_group) /* in: the most up-to-date + log group */ +{ + log_group_t* group; + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + ib_uint64_t recovered_lsn; + ib_uint64_t limit_lsn; + + recovered_lsn = recv_sys->recovered_lsn; + limit_lsn = recv_sys->limit_lsn; + + /* Read the last recovered log block to the recovery system buffer: + the block is always incomplete */ + + start_lsn = ut_uint64_align_down(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); + end_lsn = ut_uint64_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); + + ut_a(start_lsn != end_lsn); + + log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block, + up_to_date_group, start_lsn, end_lsn); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + if (group != up_to_date_group) { + + /* Copy log data if needed */ + + recv_copy_group(up_to_date_group, group, + recovered_lsn); + } + + /* Update the fields in the group struct to correspond to + recovered_lsn */ + + log_group_set_fields(group, recovered_lsn); + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + /* Copy the checkpoint info to the groups; remember that we have + incremented checkpoint_no by one, and the info will not be written + over the max checkpoint info, thus making the preservation of max + checkpoint info on disk certain */ + + log_groups_write_checkpoint_info(); + + mutex_exit(&(log_sys->mutex)); + + /* Wait for the checkpoint write to complete */ + rw_lock_s_lock(&(log_sys->checkpoint_lock)); + rw_lock_s_unlock(&(log_sys->checkpoint_lock)); + + mutex_enter(&(log_sys->mutex)); +} + +/*************************************************************************** +Checks the consistency of the checkpoint info. */ +static +ibool +recv_check_cp_is_consistent( +/*========================*/ + /* out: TRUE if ok */ + byte* buf) /* in: buffer containing checkpoint info */ +{ + ulint fold; + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + + if ((fold & 0xFFFFFFFFUL) != mach_read_from_4( + buf + LOG_CHECKPOINT_CHECKSUM_1)) { + return(FALSE); + } + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + + if ((fold & 0xFFFFFFFFUL) != mach_read_from_4( + buf + LOG_CHECKPOINT_CHECKSUM_2)) { + return(FALSE); + } + + return(TRUE); +} + +/************************************************************ +Looks for the maximum consistent checkpoint from the log groups. 
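+Each log group header carries two checkpoint slots, LOG_CHECKPOINT_1 and +LOG_CHECKPOINT_2, which are written to alternately; the slot with a valid +pair of checksums and the highest checkpoint number wins, so a checkpoint +write torn by a crash is ignored in favor of the previous complete one. 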
*/ +static +ulint +recv_find_max_checkpoint( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + log_group_t** max_group, /* out: max group */ + ulint* max_field) /* out: LOG_CHECKPOINT_1 or + LOG_CHECKPOINT_2 */ +{ + log_group_t* group; + ib_uint64_t max_no; + ib_uint64_t checkpoint_no; + ulint field; + byte* buf; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + max_no = 0; + *max_group = NULL; + *max_field = 0; + + buf = log_sys->checkpoint_buf; + + while (group) { + group->state = LOG_GROUP_CORRUPTED; + + for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2; + field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) { + + log_group_read_checkpoint_info(group, field); + + if (!recv_check_cp_is_consistent(buf)) { +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Checkpoint in group" + " %lu at %lu invalid, %lu\n", + (ulong) group->id, + (ulong) field, + (ulong) mach_read_from_4( + buf + + LOG_CHECKPOINT_CHECKSUM_1)); + + } +#endif /* UNIV_DEBUG */ + goto not_consistent; + } + + group->state = LOG_GROUP_OK; + + group->lsn = mach_read_ull( + buf + LOG_CHECKPOINT_LSN); + group->lsn_offset = mach_read_from_4( + buf + LOG_CHECKPOINT_OFFSET); + checkpoint_no = mach_read_ull( + buf + LOG_CHECKPOINT_NO); + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Checkpoint number %lu" + " found in group %lu\n", + (ulong) checkpoint_no, + (ulong) group->id); + } +#endif /* UNIV_DEBUG */ + + if (checkpoint_no >= max_no) { + *max_group = group; + *max_field = field; + max_no = checkpoint_no; + } + +not_consistent: + ; + } + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + if (*max_group == NULL) { + + fprintf(stderr, + "InnoDB: No valid checkpoint found.\n" + "InnoDB: If this error appears when you are" + " creating an InnoDB database,\n" + "InnoDB: the problem may be that during" + " an earlier attempt you managed\n" + "InnoDB: to create the InnoDB data files," + " but log file creation failed.\n" + "InnoDB: If that is the case, please refer to\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "error-creating-innodb.html\n"); + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +#ifdef UNIV_HOTBACKUP +/*********************************************************************** +Reads the checkpoint info needed in hot backup. 
*/ +UNIV_INTERN +ibool +recv_read_cp_info_for_backup( +/*=========================*/ + /* out: TRUE if success */ + byte* hdr, /* in: buffer containing the log group + header */ + ib_uint64_t* lsn, /* out: checkpoint lsn */ + ulint* offset, /* out: checkpoint offset in the log group */ + ulint* fsp_limit,/* out: fsp limit of space 0, + 1000000000 if the database is running + with < version 3.23.50 of InnoDB */ + ib_uint64_t* cp_no, /* out: checkpoint number */ + ib_uint64_t* first_header_lsn) + /* out: lsn of the start of the + first log file */ +{ + ulint max_cp = 0; + ib_uint64_t max_cp_no = 0; + byte* cp_buf; + + cp_buf = hdr + LOG_CHECKPOINT_1; + + if (recv_check_cp_is_consistent(cp_buf)) { + max_cp_no = mach_read_ull(cp_buf + LOG_CHECKPOINT_NO); + max_cp = LOG_CHECKPOINT_1; + } + + cp_buf = hdr + LOG_CHECKPOINT_2; + + if (recv_check_cp_is_consistent(cp_buf)) { + if (mach_read_ull(cp_buf + LOG_CHECKPOINT_NO) > max_cp_no) { + max_cp = LOG_CHECKPOINT_2; + } + } + + if (max_cp == 0) { + return(FALSE); + } + + cp_buf = hdr + max_cp; + + *lsn = mach_read_ull(cp_buf + LOG_CHECKPOINT_LSN); + *offset = mach_read_from_4(cp_buf + LOG_CHECKPOINT_OFFSET); + + /* If the user is running a pre-3.23.50 version of InnoDB, its + checkpoint data does not contain the fsp limit info */ + if (mach_read_from_4(cp_buf + LOG_CHECKPOINT_FSP_MAGIC_N) + == LOG_CHECKPOINT_FSP_MAGIC_N_VAL) { + + *fsp_limit = mach_read_from_4( + cp_buf + LOG_CHECKPOINT_FSP_FREE_LIMIT); + + if (*fsp_limit == 0) { + *fsp_limit = 1000000000; + } + } else { + *fsp_limit = 1000000000; + } + + /* fprintf(stderr, "fsp limit %lu MB\n", *fsp_limit); */ + + *cp_no = mach_read_ull(cp_buf + LOG_CHECKPOINT_NO); + + *first_header_lsn = mach_read_ull(hdr + LOG_FILE_START_LSN); + + return(TRUE); +} +#endif /* UNIV_HOTBACKUP */ + +/********************************************************** +Checks the 4-byte checksum against the trailer checksum field of a log block. +We also accept a log block in the old format < InnoDB-3.23.52 where the +checksum field contains the log block number. */ +static +ibool +log_block_checksum_is_ok_or_old_format( +/*===================================*/ + /* out: TRUE if ok, or if the log block may be in the + format of InnoDB version < 3.23.52 */ + byte* block) /* in: pointer to a log block */ +{ +#ifdef UNIV_LOG_DEBUG + return(TRUE); +#endif /* UNIV_LOG_DEBUG */ + if (log_block_calc_checksum(block) == log_block_get_checksum(block)) { + + return(TRUE); + } + + if (log_block_get_hdr_no(block) == log_block_get_checksum(block)) { + + /* We assume the log block is in the format of + InnoDB version < 3.23.52 and the block is ok */ +#if 0 + fprintf(stderr, + "InnoDB: Scanned old format < InnoDB-3.23.52" + " log block number %lu\n", + log_block_get_hdr_no(block)); +#endif + return(TRUE); + } + + return(FALSE); +} + +#ifdef UNIV_HOTBACKUP +/*********************************************************************** +Scans the log segment and sets n_bytes_scanned to the length of the valid +log scanned. 
*/ +UNIV_INTERN +void +recv_scan_log_seg_for_backup( +/*=========================*/ + byte* buf, /* in: buffer containing log data */ + ulint buf_len, /* in: data length in that buffer */ + ib_uint64_t* scanned_lsn, /* in/out: lsn of buffer start, + we return scanned lsn */ + ulint* scanned_checkpoint_no, + /* in/out: 4 lowest bytes of the + highest scanned checkpoint number so + far */ + ulint* n_bytes_scanned)/* out: how much we were able to + scan, smaller than buf_len if log + data ended here */ +{ + ulint data_len; + byte* log_block; + ulint no; + + *n_bytes_scanned = 0; + + for (log_block = buf; log_block < buf + buf_len; + log_block += OS_FILE_LOG_BLOCK_SIZE) { + + no = log_block_get_hdr_no(log_block); + +#if 0 + fprintf(stderr, "Log block header no %lu\n", no); +#endif + + if (no != log_block_convert_lsn_to_no(*scanned_lsn) + || !log_block_checksum_is_ok_or_old_format(log_block)) { +#if 0 + fprintf(stderr, + "Log block n:o %lu, scanned lsn n:o %lu\n", + no, log_block_convert_lsn_to_no(*scanned_lsn)); +#endif + /* Garbage or an incompletely written log block */ + + log_block += OS_FILE_LOG_BLOCK_SIZE; +#if 0 + fprintf(stderr, + "Next log block n:o %lu\n", + log_block_get_hdr_no(log_block)); +#endif + break; + } + + if (*scanned_checkpoint_no > 0 + && log_block_get_checkpoint_no(log_block) + < *scanned_checkpoint_no + && *scanned_checkpoint_no + - log_block_get_checkpoint_no(log_block) + > 0x80000000UL) { + + /* Garbage from a log buffer flush which was made + before the most recent database recovery */ +#if 0 + fprintf(stderr, + "Scanned cp n:o %lu, block cp n:o %lu\n", + *scanned_checkpoint_no, + log_block_get_checkpoint_no(log_block)); +#endif + break; + } + + data_len = log_block_get_data_len(log_block); + + *scanned_checkpoint_no + = log_block_get_checkpoint_no(log_block); + *scanned_lsn += data_len; + + *n_bytes_scanned += data_len; + + if (data_len < OS_FILE_LOG_BLOCK_SIZE) { + /* Log data ends here */ + +#if 0 + fprintf(stderr, "Log block data len %lu\n", + data_len); +#endif + break; + } + } +} +#endif /* UNIV_HOTBACKUP */ + +/*********************************************************************** +Tries to parse a single log record body and also applies it to a page if +specified. File ops are parsed, but not applied in this function. 
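+If block is NULL, the body is only parsed so that its length can be +measured; if block is non-NULL (in which case mtr must be non-NULL too), +the record is applied to the page as well. In both cases the return value +is the end of the parsed record, or NULL if the record was incomplete. 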
*/ +static +byte* +recv_parse_or_apply_log_rec_body( +/*=============================*/ + /* out: log record end, NULL if not a + complete record */ + byte type, /* in: type */ + byte* ptr, /* in: pointer to a buffer */ + byte* end_ptr,/* in: pointer to the buffer end */ + buf_block_t* block, /* in/out: buffer block or NULL; if + not NULL, then the log record is + applied to the page, and the log + record should be complete then */ + mtr_t* mtr) /* in: mtr or NULL; should be non-NULL + if and only if block is non-NULL */ +{ + dict_index_t* index = NULL; + page_t* page; + page_zip_des_t* page_zip; + + ut_ad(!block == !mtr); + + if (block) { + page = block->frame; + page_zip = buf_block_get_page_zip(block); + } else { + page = NULL; + page_zip = NULL; + } + + switch (type) { + case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES: + ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip); + break; + case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_INSERT, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, + block, index, mtr); + } + break; + case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_CLUST_DELETE_MARK, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = btr_cur_parse_del_mark_set_clust_rec( + ptr, end_ptr, page, page_zip, index); + } + break; + case MLOG_COMP_REC_SEC_DELETE_MARK: + /* This log record type is obsolete, but we process it for + backward compatibility with MySQL 5.0.3 and 5.0.4. 
*/ + ut_a(!page || page_is_comp(page)); + ut_a(!page_zip); + ptr = mlog_parse_index(ptr, end_ptr, TRUE, &index); + if (!ptr) { + break; + } + /* Fall through */ + case MLOG_REC_SEC_DELETE_MARK: + ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, + page, page_zip); + break; + case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_UPDATE_IN_PLACE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page, + page_zip, index); + } + break; + case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE: + case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_parse_delete_rec_list(type, ptr, end_ptr, + block, index, mtr); + } + break; + case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_LIST_END_COPY_CREATED, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_parse_copy_rec_list_to_created_page( + ptr, end_ptr, block, index, mtr); + } + break; + case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_PAGE_REORGANIZE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = btr_parse_page_reorganize(ptr, end_ptr, index, + block, mtr); + } + break; + case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE: + ut_a(!page_zip); + ptr = page_parse_create(ptr, end_ptr, + type == MLOG_COMP_PAGE_CREATE, + block, mtr); + break; + case MLOG_UNDO_INSERT: + ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page); + break; + case MLOG_UNDO_ERASE_END: + ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_INIT: + ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_DISCARD: + ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr); + break; + case MLOG_UNDO_HDR_CREATE: + case MLOG_UNDO_HDR_REUSE: + ptr = trx_undo_parse_page_header(type, ptr, end_ptr, + page, mtr); + break; + case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK: + /* On a compressed page, MLOG_COMP_REC_MIN_MARK + will be followed by MLOG_COMP_REC_DELETE + or MLOG_ZIP_WRITE_HEADER(FIL_PAGE_PREV, FIL_NULL) + in the same mini-transaction. 
*/ + ut_a(type == MLOG_COMP_REC_MIN_MARK || !page_zip); + ptr = btr_parse_set_min_rec_mark( + ptr, end_ptr, type == MLOG_COMP_REC_MIN_MARK, + page, mtr); + break; + case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE: + if (NULL != (ptr = mlog_parse_index( + ptr, end_ptr, + type == MLOG_COMP_REC_DELETE, + &index))) { + ut_a(!page + || (ibool)!!page_is_comp(page) + == dict_table_is_comp(index->table)); + ptr = page_cur_parse_delete_rec(ptr, end_ptr, + block, index, mtr); + } + break; + case MLOG_IBUF_BITMAP_INIT: + ptr = ibuf_parse_bitmap_init(ptr, end_ptr, block, mtr); + break; + case MLOG_INIT_FILE_PAGE: + ptr = fsp_parse_init_file_page(ptr, end_ptr, block); + break; + case MLOG_WRITE_STRING: + ptr = mlog_parse_string(ptr, end_ptr, page, page_zip); + break; + case MLOG_FILE_CREATE: + case MLOG_FILE_RENAME: + case MLOG_FILE_DELETE: + case MLOG_FILE_CREATE2: + ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, 0); + break; + case MLOG_ZIP_WRITE_NODE_PTR: + ptr = page_zip_parse_write_node_ptr(ptr, end_ptr, + page, page_zip); + break; + case MLOG_ZIP_WRITE_BLOB_PTR: + ptr = page_zip_parse_write_blob_ptr(ptr, end_ptr, + page, page_zip); + break; + case MLOG_ZIP_WRITE_HEADER: + ptr = page_zip_parse_write_header(ptr, end_ptr, + page, page_zip); + break; + case MLOG_ZIP_PAGE_COMPRESS: + ptr = page_zip_parse_compress(ptr, end_ptr, + page, page_zip); + break; + default: + ptr = NULL; + recv_sys->found_corrupt_log = TRUE; + } + + if (index) { + dict_table_t* table = index->table; + + dict_mem_index_free(index); + dict_mem_table_free(table); + } + + return(ptr); +} + +/************************************************************************* +Calculates the fold value of a page file address: used in inserting or +searching for a log record in the hash table. */ +UNIV_INLINE +ulint +recv_fold( +/*======*/ + /* out: folded value */ + ulint space, /* in: space */ + ulint page_no)/* in: page number */ +{ + return(ut_fold_ulint_pair(space, page_no)); +} + +/************************************************************************* +Calculates the hash value of a page file address: used in inserting or +searching for a log record in the hash table. */ +UNIV_INLINE +ulint +recv_hash( +/*======*/ + /* out: folded value */ + ulint space, /* in: space */ + ulint page_no)/* in: page number */ +{ + return(hash_calc_hash(recv_fold(space, page_no), recv_sys->addr_hash)); +} + +/************************************************************************* +Gets the hashed file address struct for a page. */ +static +recv_addr_t* +recv_get_fil_addr_struct( +/*=====================*/ + /* out: file address struct, NULL if not found from + the hash table */ + ulint space, /* in: space id */ + ulint page_no)/* in: page number */ +{ + recv_addr_t* recv_addr; + + recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, + recv_hash(space, page_no)); + while (recv_addr) { + if ((recv_addr->space == space) + && (recv_addr->page_no == page_no)) { + + break; + } + + recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); + } + + return(recv_addr); +} + +/*********************************************************************** +Adds a new log record to the hash table of log records. 
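+The records are chained per (space id, page number) on recv_addr_t structs +reachable through recv_sys->addr_hash, so that all records of one page can +later be applied in a single pass; the record body is stored in chunks of +at most RECV_DATA_BLOCK_SIZE bytes because recv_sys->heap grows into the +buffer pool and cannot serve larger allocations. 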
*/ +static +void +recv_add_to_hash_table( +/*===================*/ + byte type, /* in: log record type */ + ulint space, /* in: space id */ + ulint page_no, /* in: page number */ + byte* body, /* in: log record body */ + byte* rec_end, /* in: log record end */ + ib_uint64_t start_lsn, /* in: start lsn of the mtr */ + ib_uint64_t end_lsn) /* in: end lsn of the mtr */ +{ + recv_t* recv; + ulint len; + recv_data_t* recv_data; + recv_data_t** prev_field; + recv_addr_t* recv_addr; + + if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) { + /* The tablespace does not exist any more: do not store the + log record */ + + return; + } + + len = rec_end - body; + + recv = mem_heap_alloc(recv_sys->heap, sizeof(recv_t)); + recv->type = type; + recv->len = rec_end - body; + recv->start_lsn = start_lsn; + recv->end_lsn = end_lsn; + + recv_addr = recv_get_fil_addr_struct(space, page_no); + + if (recv_addr == NULL) { + recv_addr = mem_heap_alloc(recv_sys->heap, + sizeof(recv_addr_t)); + recv_addr->space = space; + recv_addr->page_no = page_no; + recv_addr->state = RECV_NOT_PROCESSED; + + UT_LIST_INIT(recv_addr->rec_list); + + HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash, + recv_fold(space, page_no), recv_addr); + recv_sys->n_addrs++; +#if 0 + fprintf(stderr, "Inserting log rec for space %lu, page %lu\n", + space, page_no); +#endif + } + + UT_LIST_ADD_LAST(rec_list, recv_addr->rec_list, recv); + + prev_field = &(recv->data); + + /* Store the log record body in chunks of less than UNIV_PAGE_SIZE: + recv_sys->heap grows into the buffer pool, and bigger chunks could not + be allocated */ + + while (rec_end > body) { + + len = rec_end - body; + + if (len > RECV_DATA_BLOCK_SIZE) { + len = RECV_DATA_BLOCK_SIZE; + } + + recv_data = mem_heap_alloc(recv_sys->heap, + sizeof(recv_data_t) + len); + *prev_field = recv_data; + + ut_memcpy(((byte*)recv_data) + sizeof(recv_data_t), body, len); + + prev_field = &(recv_data->next); + + body += len; + } + + *prev_field = NULL; +} + +/************************************************************************* +Copies the log record body from recv to buf. */ +static +void +recv_data_copy_to_buf( +/*==================*/ + byte* buf, /* in: buffer of length at least recv->len */ + recv_t* recv) /* in: log record */ +{ + recv_data_t* recv_data; + ulint part_len; + ulint len; + + len = recv->len; + recv_data = recv->data; + + while (len > 0) { + if (len > RECV_DATA_BLOCK_SIZE) { + part_len = RECV_DATA_BLOCK_SIZE; + } else { + part_len = len; + } + + ut_memcpy(buf, ((byte*)recv_data) + sizeof(recv_data_t), + part_len); + buf += part_len; + len -= part_len; + + recv_data = recv_data->next; + } +} + +/**************************************************************************** +Applies the hashed log records to the page, if the page lsn is less than the +lsn of a log record. This can be called when a buffer page has just been +read in, or also for a page already in the buffer pool. 
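+A stored record is applied only if its start lsn is not below the lsn +already stamped on the page, i.e. only if its change is not yet reflected +there; the page lsn is advanced after each applied record, which makes the +replay idempotent. 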
*/ +UNIV_INTERN +void +recv_recover_page( +/*==============*/ + ibool recover_backup, + /* in: TRUE if we are recovering a backup + page: then we do not acquire any latches + since the page was read in outside the + buffer pool */ + ibool just_read_in, + /* in: TRUE if the i/o-handler calls this for + a freshly read page */ + buf_block_t* block) /* in: buffer block */ +{ + page_t* page; + recv_addr_t* recv_addr; + recv_t* recv; + byte* buf; + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + ib_uint64_t page_lsn; + ib_uint64_t page_newest_lsn; + ibool modification_to_page; + ibool success; + mtr_t mtr; + + mutex_enter(&(recv_sys->mutex)); + + if (recv_sys->apply_log_recs == FALSE) { + + /* Log records should not be applied now */ + + mutex_exit(&(recv_sys->mutex)); + + return; + } + + recv_addr = recv_get_fil_addr_struct(buf_block_get_space(block), + buf_block_get_page_no(block)); + + if ((recv_addr == NULL) + || (recv_addr->state == RECV_BEING_PROCESSED) + || (recv_addr->state == RECV_PROCESSED)) { + + mutex_exit(&(recv_sys->mutex)); + + return; + } + +#if 0 + fprintf(stderr, "Recovering space %lu, page %lu\n", + buf_block_get_space(block), buf_block_get_page_no(block)); +#endif + + recv_addr->state = RECV_BEING_PROCESSED; + + mutex_exit(&(recv_sys->mutex)); + + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NONE); + + page = block->frame; + + if (!recover_backup) { + if (just_read_in) { + /* Move the ownership of the x-latch on the + page to this OS thread, so that we can acquire + a second x-latch on it. This is needed for the + operations to the page to pass the debug + checks. */ + + rw_lock_x_lock_move_ownership(&(block->lock)); + } + + success = buf_page_get_known_nowait(RW_X_LATCH, block, + BUF_KEEP_OLD, + __FILE__, __LINE__, + &mtr); + ut_a(success); + + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + } + + /* Read the newest modification lsn from the page */ + page_lsn = mach_read_ull(page + FIL_PAGE_LSN); + + if (!recover_backup) { + /* It may be that the page has been modified in the buffer + pool: read the newest modification lsn there */ + + page_newest_lsn + = buf_page_get_newest_modification(&block->page); + + if (page_newest_lsn) { + + page_lsn = page_newest_lsn; + } + } else { + /* In recovery from a backup we do not really use the buffer + pool */ + + page_newest_lsn = 0; + } + + modification_to_page = FALSE; + start_lsn = end_lsn = 0; + + recv = UT_LIST_GET_FIRST(recv_addr->rec_list); + + while (recv) { + end_lsn = recv->end_lsn; + + if (recv->len > RECV_DATA_BLOCK_SIZE) { + /* We have to copy the record body to a separate + buffer */ + + buf = mem_alloc(recv->len); + + recv_data_copy_to_buf(buf, recv); + } else { + buf = ((byte*)(recv->data)) + sizeof(recv_data_t); + } + + if (recv->type == MLOG_INIT_FILE_PAGE) { + page_lsn = page_newest_lsn; + + mach_write_ull(page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM, 0); + mach_write_ull(page + FIL_PAGE_LSN, 0); + } + + if (recv->start_lsn >= page_lsn) { + + if (!modification_to_page) { + + modification_to_page = TRUE; + start_lsn = recv->start_lsn; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Applying log rec" + " type %lu len %lu" + " to space %lu page no %lu\n", + (ulong) recv->type, (ulong) recv->len, + (ulong) recv_addr->space, + (ulong) recv_addr->page_no); + } +#endif /* UNIV_DEBUG */ + + recv_parse_or_apply_log_rec_body(recv->type, buf, + buf + recv->len, + block, &mtr); + mach_write_ull(page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM, + recv->start_lsn + 
recv->len); + mach_write_ull(page + FIL_PAGE_LSN, + recv->start_lsn + recv->len); + } + + if (recv->len > RECV_DATA_BLOCK_SIZE) { + mem_free(buf); + } + + recv = UT_LIST_GET_NEXT(rec_list, recv); + } + +#ifdef UNIV_ZIP_DEBUG + if (fil_page_get_type(page) == FIL_PAGE_INDEX) { + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + if (page_zip) { + ut_a(page_zip_validate_low(page_zip, page, FALSE)); + } + } +#endif /* UNIV_ZIP_DEBUG */ + + mutex_enter(&(recv_sys->mutex)); + + if (recv_max_page_lsn < page_lsn) { + recv_max_page_lsn = page_lsn; + } + + recv_addr->state = RECV_PROCESSED; + + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + + mutex_exit(&(recv_sys->mutex)); + + if (!recover_backup && modification_to_page) { + ut_a(block); + + buf_flush_recv_note_modification(block, start_lsn, end_lsn); + } + + /* Make sure that committing mtr does not change the modification + lsn values of page */ + + mtr.modifications = FALSE; + + mtr_commit(&mtr); +} + +/*********************************************************************** +Reads in pages which have hashed log records, from an area around a given +page number. */ +static +ulint +recv_read_in_area( +/*==============*/ + /* out: number of pages found */ + ulint space, /* in: space */ + ulint zip_size,/* in: compressed page size in bytes, or 0 */ + ulint page_no)/* in: page number */ +{ + recv_addr_t* recv_addr; + ulint page_nos[RECV_READ_AHEAD_AREA]; + ulint low_limit; + ulint n; + + low_limit = page_no - (page_no % RECV_READ_AHEAD_AREA); + + n = 0; + + for (page_no = low_limit; page_no < low_limit + RECV_READ_AHEAD_AREA; + page_no++) { + recv_addr = recv_get_fil_addr_struct(space, page_no); + + if (recv_addr && !buf_page_peek(space, page_no)) { + + mutex_enter(&(recv_sys->mutex)); + + if (recv_addr->state == RECV_NOT_PROCESSED) { + recv_addr->state = RECV_BEING_READ; + + page_nos[n] = page_no; + + n++; + } + + mutex_exit(&(recv_sys->mutex)); + } + } + + buf_read_recv_pages(FALSE, space, zip_size, page_nos, n); + /* + fprintf(stderr, "Recv pages at %lu n %lu\n", page_nos[0], n); + */ + return(n); +} + +/*********************************************************************** +Empties the hash table of stored log records, applying them to appropriate +pages. 
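+Pages already resident in the buffer pool are x-fixed and patched in +place; for the rest, read requests are issued for whole areas of pages +around each hashed page number, and the i/o handler applies the records +when those reads complete. 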
*/ +UNIV_INTERN +void +recv_apply_hashed_log_recs( +/*=======================*/ + ibool allow_ibuf) /* in: if TRUE, also ibuf operations are + allowed during the application; if FALSE, + no ibuf operations are allowed, and after + the application all file pages are flushed to + disk and invalidated in buffer pool: this + alternative means that no new log records + can be generated during the application; + the caller must in this case own the log + mutex */ +{ + recv_addr_t* recv_addr; + ulint i; + ulint n_pages; + ibool has_printed = FALSE; + mtr_t mtr; +loop: + mutex_enter(&(recv_sys->mutex)); + + if (recv_sys->apply_batch_on) { + + mutex_exit(&(recv_sys->mutex)); + + os_thread_sleep(500000); + + goto loop; + } + + ut_ad(!allow_ibuf == mutex_own(&log_sys->mutex)); + + if (!allow_ibuf) { + recv_no_ibuf_operations = TRUE; + } + + recv_sys->apply_log_recs = TRUE; + recv_sys->apply_batch_on = TRUE; + + for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) { + + recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, i); + + while (recv_addr) { + ulint space = recv_addr->space; + ulint zip_size = fil_space_get_zip_size(space); + ulint page_no = recv_addr->page_no; + + if (recv_addr->state == RECV_NOT_PROCESSED) { + if (!has_printed) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Starting an" + " apply batch of log records" + " to the database...\n" + "InnoDB: Progress in percents: ", + stderr); + has_printed = TRUE; + } + + mutex_exit(&(recv_sys->mutex)); + + if (buf_page_peek(space, page_no)) { + buf_block_t* block; + + mtr_start(&mtr); + + block = buf_page_get( + space, zip_size, page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level( + block, SYNC_NO_ORDER_CHECK); + + recv_recover_page(FALSE, FALSE, block); + mtr_commit(&mtr); + } else { + recv_read_in_area(space, zip_size, + page_no); + } + + mutex_enter(&(recv_sys->mutex)); + } + + recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); + } + + if (has_printed + && (i * 100) / hash_get_n_cells(recv_sys->addr_hash) + != ((i + 1) * 100) + / hash_get_n_cells(recv_sys->addr_hash)) { + + fprintf(stderr, "%lu ", (ulong) + ((i * 100) + / hash_get_n_cells(recv_sys->addr_hash))); + } + } + + /* Wait until all the pages have been processed */ + + while (recv_sys->n_addrs != 0) { + + mutex_exit(&(recv_sys->mutex)); + + os_thread_sleep(500000); + + mutex_enter(&(recv_sys->mutex)); + } + + if (has_printed) { + + fprintf(stderr, "\n"); + } + + if (!allow_ibuf) { + /* Flush all the file pages to disk and invalidate them in + the buffer pool */ + + mutex_exit(&(recv_sys->mutex)); + mutex_exit(&(log_sys->mutex)); + + n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, + IB_ULONGLONG_MAX); + ut_a(n_pages != ULINT_UNDEFINED); + + buf_flush_wait_batch_end(BUF_FLUSH_LIST); + + buf_pool_invalidate(); + + mutex_enter(&(log_sys->mutex)); + mutex_enter(&(recv_sys->mutex)); + + recv_no_ibuf_operations = FALSE; + } + + recv_sys->apply_log_recs = FALSE; + recv_sys->apply_batch_on = FALSE; + + recv_sys_empty_hash(); + + if (has_printed) { + fprintf(stderr, "InnoDB: Apply batch completed\n"); + } + + mutex_exit(&(recv_sys->mutex)); +} + +#ifdef UNIV_HOTBACKUP +/*********************************************************************** +Applies log records in the hash table to a backup. 
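+Unlike normal crash recovery, this path does not really use the buffer +pool: each page is read straight from its tablespace file, patched with +the hashed records, and written back, with a single buf_block_t serving as +scratch space. 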
*/ +UNIV_INTERN +void +recv_apply_log_recs_for_backup(void) +/*================================*/ +{ + recv_addr_t* recv_addr; + ulint n_hash_cells; + buf_block_t* block; + ulint actual_size; + ibool success; + ulint error; + ulint i; + + recv_sys->apply_log_recs = TRUE; + recv_sys->apply_batch_on = TRUE; + + block = buf_LRU_get_free_block(UNIV_PAGE_SIZE); + + fputs("InnoDB: Starting an apply batch of log records" + " to the database...\n" + "InnoDB: Progress in percents: ", stderr); + + n_hash_cells = hash_get_n_cells(recv_sys->addr_hash); + + for (i = 0; i < n_hash_cells; i++) { + /* The address hash table is externally chained */ + recv_addr = hash_get_nth_cell(recv_sys->addr_hash, i)->node; + + while (recv_addr != NULL) { + + ulint zip_size + = fil_space_get_zip_size(recv_addr->space); + + if (zip_size == ULINT_UNDEFINED) { +#if 0 + fprintf(stderr, + "InnoDB: Warning: cannot apply" + " log record to" + " tablespace %lu page %lu,\n" + "InnoDB: because tablespace with" + " that id does not exist.\n", + recv_addr->space, recv_addr->page_no); +#endif + recv_addr->state = RECV_PROCESSED; + + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + + goto skip_this_recv_addr; + } + + /* We simulate a page read made by the buffer pool, to + make sure the recovery apparatus works ok. We must init + the block. */ + + buf_page_init_for_backup_restore( + recv_addr->space, recv_addr->page_no, + zip_size, block); + + /* Extend the tablespace's last file if the page_no + does not fall inside its bounds; we assume the last + file is auto-extending, and ibbackup copied the file + when it still was smaller */ + + success = fil_extend_space_to_desired_size( + &actual_size, + recv_addr->space, recv_addr->page_no + 1); + if (!success) { + fprintf(stderr, + "InnoDB: Fatal error: cannot extend" + " tablespace %lu to hold %lu pages\n", + recv_addr->space, recv_addr->page_no); + + exit(1); + } + + /* Read the page from the tablespace file using the + fil0fil.c routines */ + + if (zip_size) { + error = fil_io(OS_FILE_READ, TRUE, + recv_addr->space, zip_size, + recv_addr->page_no, 0, zip_size, + block->page.zip.data, NULL); + } else { + error = fil_io(OS_FILE_READ, TRUE, + recv_addr->space, 0, + recv_addr->page_no, 0, + UNIV_PAGE_SIZE, + block->frame, NULL); + } + + if (error != DB_SUCCESS) { + fprintf(stderr, + "InnoDB: Fatal error: cannot read" + " from tablespace" + " %lu page number %lu\n", + (ulong) recv_addr->space, + (ulong) recv_addr->page_no); + + exit(1); + } + + /* Apply the log records to this page */ + recv_recover_page(TRUE, FALSE, block); + + /* Write the page back to the tablespace file using the + fil0fil.c routines */ + + buf_flush_init_for_writing( + block->frame, buf_block_get_page_zip(block), + mach_read_ull(block->frame + FIL_PAGE_LSN)); + + if (zip_size) { + error = fil_io(OS_FILE_WRITE, TRUE, + recv_addr->space, zip_size, + recv_addr->page_no, 0, + zip_size, + block->page.zip.data, NULL); + } else { + error = fil_io(OS_FILE_WRITE, TRUE, + recv_addr->space, 0, + recv_addr->page_no, 0, + UNIV_PAGE_SIZE, + block->frame, NULL); + } +skip_this_recv_addr: + recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); + } + + if ((100 * i) / n_hash_cells + != (100 * (i + 1)) / n_hash_cells) { + fprintf(stderr, "%lu ", + (ulong) ((100 * i) / n_hash_cells)); + fflush(stderr); + } + } + + buf_block_free(block); + recv_sys_empty_hash(); +} +#endif /* UNIV_HOTBACKUP */ + +/*********************************************************************** +Tries to parse a single log record and returns its length. 
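+A return value of 0 means that the record was not complete within the +buffer; the caller must then wait until more log has been scanned in +before parsing can continue. 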
*/ +static +ulint +recv_parse_log_rec( +/*===============*/ + /* out: length of the record, or 0 if the record was + not complete */ + byte* ptr, /* in: pointer to a buffer */ + byte* end_ptr,/* in: pointer to the buffer end */ + byte* type, /* out: type */ + ulint* space, /* out: space id */ + ulint* page_no,/* out: page number */ + byte** body) /* out: log record body start */ +{ + byte* new_ptr; + + *body = NULL; + + if (ptr == end_ptr) { + + return(0); + } + + if (*ptr == MLOG_MULTI_REC_END) { + + *type = *ptr; + + return(1); + } + + if (*ptr == MLOG_DUMMY_RECORD) { + *type = *ptr; + + *space = ULINT_UNDEFINED - 1; /* For debugging */ + + return(1); + } + + new_ptr = mlog_parse_initial_log_record(ptr, end_ptr, type, space, + page_no); + *body = new_ptr; + + if (UNIV_UNLIKELY(!new_ptr)) { + + return(0); + } + + /* Check that page_no is sensible */ + + if (UNIV_UNLIKELY(*page_no > 0x8FFFFFFFUL)) { + + recv_sys->found_corrupt_log = TRUE; + + return(0); + } + + new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr, + NULL, NULL); + if (UNIV_UNLIKELY(new_ptr == NULL)) { + + return(0); + } + + if (*page_no > recv_max_parsed_page_no) { + recv_max_parsed_page_no = *page_no; + } + + return(new_ptr - ptr); +} + +/*********************************************************** +Calculates the new value for lsn when more data is added to the log. */ +static +ib_uint64_t +recv_calc_lsn_on_data_add( +/*======================*/ + ib_uint64_t lsn, /* in: old lsn */ + ib_uint64_t len) /* in: this many bytes of data is + added, log block headers not included */ +{ + ulint frag_len; + ulint lsn_len; + + frag_len = (((ulint) lsn) % OS_FILE_LOG_BLOCK_SIZE) + - LOG_BLOCK_HDR_SIZE; + ut_ad(frag_len < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE + - LOG_BLOCK_TRL_SIZE); + lsn_len = (ulint) len; + lsn_len += (lsn_len + frag_len) + / (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE + - LOG_BLOCK_TRL_SIZE) + * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE); + + return(lsn + lsn_len); +} + +#ifdef UNIV_LOG_DEBUG +/*********************************************************** +Checks that the parser recognizes incomplete initial segments of a log +record as incomplete. */ +static +void +recv_check_incomplete_log_recs( +/*===========================*/ + byte* ptr, /* in: pointer to a complete log record */ + ulint len) /* in: length of the log record */ +{ + ulint i; + byte type; + ulint space; + ulint page_no; + byte* body; + + for (i = 0; i < len; i++) { + ut_a(0 == recv_parse_log_rec(ptr, ptr + i, &type, &space, + &page_no, &body)); + } +} +#endif /* UNIV_LOG_DEBUG */ + +/*********************************************************** +Prints diagnostic info of corrupt log. 
*/ +static +void +recv_report_corrupt_log( +/*====================*/ + byte* ptr, /* in: pointer to corrupt log record */ + byte type, /* in: type of the record */ + ulint space, /* in: space id, this may also be garbage */ + ulint page_no)/* in: page number, this may also be garbage */ +{ + fprintf(stderr, + "InnoDB: ############### CORRUPT LOG RECORD FOUND\n" + "InnoDB: Log record type %lu, space id %lu, page number %lu\n" + "InnoDB: Log parsing proceeded successfully up to %llu\n" + "InnoDB: Previous log record type %lu, is multi %lu\n" + "InnoDB: Recv offset %lu, prev %lu\n", + (ulong) type, (ulong) space, (ulong) page_no, + recv_sys->recovered_lsn, + (ulong) recv_previous_parsed_rec_type, + (ulong) recv_previous_parsed_rec_is_multi, + (ulong) (ptr - recv_sys->buf), + (ulong) recv_previous_parsed_rec_offset); + + if ((ulint)(ptr - recv_sys->buf + 100) + > recv_previous_parsed_rec_offset + && (ulint)(ptr - recv_sys->buf + 100 + - recv_previous_parsed_rec_offset) + < 200000) { + fputs("InnoDB: Hex dump of corrupt log starting" + " 100 bytes before the start\n" + "InnoDB: of the previous log rec,\n" + "InnoDB: and ending 100 bytes after the start" + " of the corrupt rec:\n", + stderr); + + ut_print_buf(stderr, + recv_sys->buf + + recv_previous_parsed_rec_offset - 100, + ptr - recv_sys->buf + 200 + - recv_previous_parsed_rec_offset); + putc('\n', stderr); + } + + fputs("InnoDB: WARNING: the log file may have been corrupt and it\n" + "InnoDB: is possible that the log scan did not proceed\n" + "InnoDB: far enough in recovery! Please run CHECK TABLE\n" + "InnoDB: on your InnoDB tables to check that they are ok!\n" + "InnoDB: If mysqld crashes after this recovery, look at\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "forcing-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); + + fflush(stderr); +} + +/*********************************************************** +Parses log records from a buffer and stores them to a hash table to wait +merging to file pages. 
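+An mtr that modified a single page is recognized from MLOG_SINGLE_REC_FLAG +in its first type byte and can be stored directly; a multi-record mtr is +first parsed through to its MLOG_MULTI_REC_END marker to verify that the +whole mtr is present in the buffer, and only then are its records added to +the hash table. This keeps a partially written mtr from ever being applied. 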
*/ +static +ibool +recv_parse_log_recs( +/*================*/ + /* out: currently always returns FALSE */ + ibool store_to_hash) /* in: TRUE if the records should be stored + to the hash table; this is set to FALSE if just + debug checking is needed */ +{ + byte* ptr; + byte* end_ptr; + ulint single_rec; + ulint len; + ulint total_len; + ib_uint64_t new_recovered_lsn; + ib_uint64_t old_lsn; + byte type; + ulint space; + ulint page_no; + byte* body; + ulint n_recs; + + ut_ad(mutex_own(&(log_sys->mutex))); + ut_ad(recv_sys->parse_start_lsn != 0); +loop: + ptr = recv_sys->buf + recv_sys->recovered_offset; + + end_ptr = recv_sys->buf + recv_sys->len; + + if (ptr == end_ptr) { + + return(FALSE); + } + + single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG; + + if (single_rec || *ptr == MLOG_DUMMY_RECORD) { + /* The mtr only modified a single page, or this is a file op */ + + old_lsn = recv_sys->recovered_lsn; + + /* Try to parse a log record, fetching its type, space id, + page no, and a pointer to the body of the log record */ + + len = recv_parse_log_rec(ptr, end_ptr, &type, &space, + &page_no, &body); + + if (len == 0 || recv_sys->found_corrupt_log) { + if (recv_sys->found_corrupt_log) { + + recv_report_corrupt_log(ptr, + type, space, page_no); + } + + return(FALSE); + } + + new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len); + + if (new_recovered_lsn > recv_sys->scanned_lsn) { + /* The log record filled a log block, and we require + that also the next log block should have been scanned + in */ + + return(FALSE); + } + + recv_previous_parsed_rec_type = (ulint)type; + recv_previous_parsed_rec_offset = recv_sys->recovered_offset; + recv_previous_parsed_rec_is_multi = 0; + + recv_sys->recovered_offset += len; + recv_sys->recovered_lsn = new_recovered_lsn; + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Parsed a single log rec" + " type %lu len %lu space %lu page no %lu\n", + (ulong) type, (ulong) len, (ulong) space, + (ulong) page_no); + } +#endif /* UNIV_DEBUG */ + + if (type == MLOG_DUMMY_RECORD) { + /* Do nothing */ + + } else if (!store_to_hash) { + /* In debug checking, update a replicate page + according to the log record, and check that it + becomes identical with the original page */ +#ifdef UNIV_LOG_DEBUG + recv_check_incomplete_log_recs(ptr, len); +#endif/* UNIV_LOG_DEBUG */ + + } else if (type == MLOG_FILE_CREATE + || type == MLOG_FILE_CREATE2 + || type == MLOG_FILE_RENAME + || type == MLOG_FILE_DELETE) { + ut_a(space); +#ifdef UNIV_HOTBACKUP + if (recv_replay_file_ops) { + + /* In ibbackup --apply-log, replay an .ibd file + operation, if possible; note that + fil_path_to_mysql_datadir is set in ibbackup to + point to the datadir we should use there */ + + if (NULL == fil_op_log_parse_or_replay( + body, end_ptr, type, space)) { + fprintf(stderr, + "InnoDB: Error: file op" + " log record of type %lu" + " space %lu not complete in\n" + "InnoDB: the replay phase." 
+ " Path %s\n", + (ulint)type, space, + (char*)(body + 2)); + + ut_error; + } + } +#endif + /* In normal mysqld crash recovery we do not try to + replay file operations */ + } else { + recv_add_to_hash_table(type, space, page_no, body, + ptr + len, old_lsn, + recv_sys->recovered_lsn); + } + } else { + /* Check that all the records associated with the single mtr + are included within the buffer */ + + total_len = 0; + n_recs = 0; + + for (;;) { + len = recv_parse_log_rec(ptr, end_ptr, &type, &space, + &page_no, &body); + if (len == 0 || recv_sys->found_corrupt_log) { + + if (recv_sys->found_corrupt_log) { + + recv_report_corrupt_log( + ptr, type, space, page_no); + } + + return(FALSE); + } + + recv_previous_parsed_rec_type = (ulint)type; + recv_previous_parsed_rec_offset + = recv_sys->recovered_offset + total_len; + recv_previous_parsed_rec_is_multi = 1; + + if ((!store_to_hash) && (type != MLOG_MULTI_REC_END)) { +#ifdef UNIV_LOG_DEBUG + recv_check_incomplete_log_recs(ptr, len); +#endif /* UNIV_LOG_DEBUG */ + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Parsed a multi log rec" + " type %lu len %lu" + " space %lu page no %lu\n", + (ulong) type, (ulong) len, + (ulong) space, (ulong) page_no); + } +#endif /* UNIV_DEBUG */ + + total_len += len; + n_recs++; + + ptr += len; + + if (type == MLOG_MULTI_REC_END) { + + /* Found the end mark for the records */ + + break; + } + } + + new_recovered_lsn = recv_calc_lsn_on_data_add( + recv_sys->recovered_lsn, total_len); + + if (new_recovered_lsn > recv_sys->scanned_lsn) { + /* The log record filled a log block, and we require + that also the next log block should have been scanned + in */ + + return(FALSE); + } + + /* Add all the records to the hash table */ + + ptr = recv_sys->buf + recv_sys->recovered_offset; + + for (;;) { + old_lsn = recv_sys->recovered_lsn; + len = recv_parse_log_rec(ptr, end_ptr, &type, &space, + &page_no, &body); + if (recv_sys->found_corrupt_log) { + + recv_report_corrupt_log(ptr, + type, space, page_no); + } + + ut_a(len != 0); + ut_a(0 == ((ulint)*ptr & MLOG_SINGLE_REC_FLAG)); + + recv_sys->recovered_offset += len; + recv_sys->recovered_lsn + = recv_calc_lsn_on_data_add(old_lsn, len); + if (type == MLOG_MULTI_REC_END) { + + /* Found the end mark for the records */ + + break; + } + + if (store_to_hash) { + recv_add_to_hash_table(type, space, page_no, + body, ptr + len, + old_lsn, + new_recovered_lsn); + } + + ptr += len; + } + } + + goto loop; +} + +/*********************************************************** +Adds data from a new log block to the parsing buffer of recv_sys if +recv_sys->parse_start_lsn is non-zero. 
*/ +static +ibool +recv_sys_add_to_parsing_buf( +/*========================*/ + /* out: TRUE if more data added */ + byte* log_block, /* in: log block */ + ib_uint64_t scanned_lsn) /* in: lsn of how far we were able + to find data in this log block */ +{ + ulint more_len; + ulint data_len; + ulint start_offset; + ulint end_offset; + + ut_ad(scanned_lsn >= recv_sys->scanned_lsn); + + if (!recv_sys->parse_start_lsn) { + /* Cannot start parsing yet because no start point for + it found */ + + return(FALSE); + } + + data_len = log_block_get_data_len(log_block); + + if (recv_sys->parse_start_lsn >= scanned_lsn) { + + return(FALSE); + + } else if (recv_sys->scanned_lsn >= scanned_lsn) { + + return(FALSE); + + } else if (recv_sys->parse_start_lsn > recv_sys->scanned_lsn) { + more_len = (ulint) (scanned_lsn - recv_sys->parse_start_lsn); + } else { + more_len = (ulint) (scanned_lsn - recv_sys->scanned_lsn); + } + + if (more_len == 0) { + + return(FALSE); + } + + ut_ad(data_len >= more_len); + + start_offset = data_len - more_len; + + if (start_offset < LOG_BLOCK_HDR_SIZE) { + start_offset = LOG_BLOCK_HDR_SIZE; + } + + end_offset = data_len; + + if (end_offset > OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) { + end_offset = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; + } + + ut_ad(start_offset <= end_offset); + + if (start_offset < end_offset) { + ut_memcpy(recv_sys->buf + recv_sys->len, + log_block + start_offset, end_offset - start_offset); + + recv_sys->len += end_offset - start_offset; + + ut_a(recv_sys->len <= RECV_PARSING_BUF_SIZE); + } + + return(TRUE); +} + +/*********************************************************** +Moves the parsing buffer data left to the buffer start. */ +static +void +recv_sys_justify_left_parsing_buf(void) +/*===================================*/ +{ + ut_memmove(recv_sys->buf, recv_sys->buf + recv_sys->recovered_offset, + recv_sys->len - recv_sys->recovered_offset); + + recv_sys->len -= recv_sys->recovered_offset; + + recv_sys->recovered_offset = 0; +} + +/*********************************************************** +Scans log from a buffer and stores new log data to the parsing buffer. Parses +and hashes the log records if new data found. 
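+Scanning stops at the first block whose header number does not match the +number computed from the current lsn or whose checksum is wrong, and also +at the first block that is not filled with data to the end: these +conditions mark the end of the contiguous log. 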
*/ +UNIV_INTERN +ibool +recv_scan_log_recs( +/*===============*/ + /* out: TRUE if limit_lsn has been + reached, or not able to scan any more + in this log group */ + ibool apply_automatically,/* in: TRUE if we want this + function to apply log records + automatically when the hash table + becomes full; in the hot backup tool + the tool does the applying, not this + function */ + ulint available_memory,/* in: we let the hash table of recs + to grow to this size, at the maximum */ + ibool store_to_hash, /* in: TRUE if the records should be + stored to the hash table; this is set + to FALSE if just debug checking is + needed */ + byte* buf, /* in: buffer containing a log segment + or garbage */ + ulint len, /* in: buffer length */ + ib_uint64_t start_lsn, /* in: buffer start lsn */ + ib_uint64_t* contiguous_lsn, /* in/out: it is known that all log + groups contain contiguous log data up + to this lsn */ + ib_uint64_t* group_scanned_lsn)/* out: scanning succeeded up to + this lsn */ +{ + byte* log_block; + ulint no; + ib_uint64_t scanned_lsn; + ibool finished; + ulint data_len; + ibool more_data; + + ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(len > 0); + ut_a(apply_automatically <= TRUE); + ut_a(store_to_hash <= TRUE); + + finished = FALSE; + + log_block = buf; + scanned_lsn = start_lsn; + more_data = FALSE; + + while (log_block < buf + len && !finished) { + + no = log_block_get_hdr_no(log_block); + /* + fprintf(stderr, "Log block header no %lu\n", no); + + fprintf(stderr, "Scanned lsn no %lu\n", + log_block_convert_lsn_to_no(scanned_lsn)); + */ + if (no != log_block_convert_lsn_to_no(scanned_lsn) + || !log_block_checksum_is_ok_or_old_format(log_block)) { + + if (no == log_block_convert_lsn_to_no(scanned_lsn) + && !log_block_checksum_is_ok_or_old_format( + log_block)) { + fprintf(stderr, + "InnoDB: Log block no %lu at" + " lsn %llu has\n" + "InnoDB: ok header, but checksum field" + " contains %lu, should be %lu\n", + (ulong) no, + scanned_lsn, + (ulong) log_block_get_checksum( + log_block), + (ulong) log_block_calc_checksum( + log_block)); + } + + /* Garbage or an incompletely written log block */ + + finished = TRUE; + + break; + } + + if (log_block_get_flush_bit(log_block)) { + /* This block was a start of a log flush operation: + we know that the previous flush operation must have + been completed for all log groups before this block + can have been flushed to any of the groups. Therefore, + we know that log data is contiguous up to scanned_lsn + in all non-corrupt log groups. 
*/ + + if (scanned_lsn > *contiguous_lsn) { + *contiguous_lsn = scanned_lsn; + } + } + + data_len = log_block_get_data_len(log_block); + + if ((store_to_hash || (data_len == OS_FILE_LOG_BLOCK_SIZE)) + && scanned_lsn + data_len > recv_sys->scanned_lsn + && (recv_sys->scanned_checkpoint_no > 0) + && (log_block_get_checkpoint_no(log_block) + < recv_sys->scanned_checkpoint_no) + && (recv_sys->scanned_checkpoint_no + - log_block_get_checkpoint_no(log_block) + > 0x80000000UL)) { + + /* Garbage from a log buffer flush which was made + before the most recent database recovery */ + + finished = TRUE; +#ifdef UNIV_LOG_DEBUG + /* This is not really an error, but currently + we stop here in the debug version: */ + + ut_error; +#endif + break; + } + + if (!recv_sys->parse_start_lsn + && (log_block_get_first_rec_group(log_block) > 0)) { + + /* We found a point from which to start the parsing + of log records */ + + recv_sys->parse_start_lsn = scanned_lsn + + log_block_get_first_rec_group(log_block); + recv_sys->scanned_lsn = recv_sys->parse_start_lsn; + recv_sys->recovered_lsn = recv_sys->parse_start_lsn; + } + + scanned_lsn += data_len; + + if (scanned_lsn > recv_sys->scanned_lsn) { + + /* We have found more entries. If this scan is + of startup type, we must initiate crash recovery + environment before parsing these log records. */ + + if (recv_log_scan_is_startup_type + && !recv_needed_recovery) { + + fprintf(stderr, + "InnoDB: Log scan progressed" + " past the checkpoint lsn %llu\n", + recv_sys->scanned_lsn); + recv_init_crash_recovery(); + } + + /* We were able to find more log data: add it to the + parsing buffer if parse_start_lsn is already + non-zero */ + + if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE + >= RECV_PARSING_BUF_SIZE) { + fprintf(stderr, + "InnoDB: Error: log parsing" + " buffer overflow." + " Recovery may have failed!\n"); + + recv_sys->found_corrupt_log = TRUE; + + } else if (!recv_sys->found_corrupt_log) { + more_data = recv_sys_add_to_parsing_buf( + log_block, scanned_lsn); + } + + recv_sys->scanned_lsn = scanned_lsn; + recv_sys->scanned_checkpoint_no + = log_block_get_checkpoint_no(log_block); + } + + if (data_len < OS_FILE_LOG_BLOCK_SIZE) { + /* Log data for this group ends here */ + + finished = TRUE; + } else { + log_block += OS_FILE_LOG_BLOCK_SIZE; + } + } + + *group_scanned_lsn = scanned_lsn; + + if (recv_needed_recovery + || (recv_is_from_backup && !recv_is_making_a_backup)) { + recv_scan_print_counter++; + + if (finished || (recv_scan_print_counter % 80 == 0)) { + + fprintf(stderr, + "InnoDB: Doing recovery: scanned up to" + " log sequence number %llu\n", + *group_scanned_lsn); + } + } + + if (more_data && !recv_sys->found_corrupt_log) { + /* Try to parse more log records */ + + recv_parse_log_recs(store_to_hash); + + if (store_to_hash && mem_heap_get_size(recv_sys->heap) + > available_memory + && apply_automatically) { + + /* Hash table of log records has grown too big: + empty it; FALSE means no ibuf operations + allowed, as we cannot add new records to the + log yet: they would be produced by ibuf + operations */ + + recv_apply_hashed_log_recs(FALSE); + } + + if (recv_sys->recovered_offset > RECV_PARSING_BUF_SIZE / 4) { + /* Move parsing buffer data to the buffer start */ + + recv_sys_justify_left_parsing_buf(); + } + } + + return(finished); +} + +/*********************************************************** +Scans log from a buffer and stores new log data to the parsing buffer. Parses +and hashes the log records if new data found. 
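+This is the startup variant: it repeatedly reads RECV_SCAN_SIZE bytes of +the group's log into log_sys->buf and feeds them to recv_scan_log_recs +until that function reports that the end of the group's log was reached. 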
*/ +static +void +recv_group_scan_log_recs( +/*=====================*/ + log_group_t* group, /* in: log group */ + ib_uint64_t* contiguous_lsn, /* in/out: it is known that all log + groups contain contiguous log data up + to this lsn */ + ib_uint64_t* group_scanned_lsn)/* out: scanning succeeded up to + this lsn */ +{ + ibool finished; + ib_uint64_t start_lsn; + ib_uint64_t end_lsn; + + finished = FALSE; + + start_lsn = *contiguous_lsn; + + while (!finished) { + end_lsn = start_lsn + RECV_SCAN_SIZE; + + log_group_read_log_seg(LOG_RECOVER, log_sys->buf, + group, start_lsn, end_lsn); + + finished = recv_scan_log_recs( + TRUE, (buf_pool->curr_size - recv_n_pool_free_frames) + * UNIV_PAGE_SIZE, TRUE, log_sys->buf, RECV_SCAN_SIZE, + start_lsn, contiguous_lsn, group_scanned_lsn); + start_lsn = end_lsn; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Scanned group %lu up to" + " log sequence number %llu\n", + (ulong) group->id, + *group_scanned_lsn); + } +#endif /* UNIV_DEBUG */ +} + +/*********************************************************** +Initialize crash recovery environment. Can be called iff +recv_needed_recovery == FALSE. */ +static +void +recv_init_crash_recovery(void) +/*==========================*/ +{ + ut_a(!recv_needed_recovery); + + recv_needed_recovery = TRUE; + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Database was not" + " shut down normally!\n" + "InnoDB: Starting crash recovery.\n"); + + fprintf(stderr, + "InnoDB: Reading tablespace information" + " from the .ibd files...\n"); + + fil_load_single_table_tablespaces(); + + /* If we are using the doublewrite method, we will + check if there are half-written pages in data files, + and restore them from the doublewrite buffer if + possible */ + + if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { + + fprintf(stderr, + "InnoDB: Restoring possible" + " half-written data pages from" + " the doublewrite\n" + "InnoDB: buffer...\n"); + trx_sys_doublewrite_init_or_restore_pages(TRUE); + } +} + +/************************************************************ +Recovers from a checkpoint. When this function returns, the database is able +to start processing of new user transactions, but the function +recv_recovery_from_checkpoint_finish should be called later to complete +the recovery and free the resources used in it. 
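+In outline: the latest valid checkpoint is located, the log is scanned and +parsed from the checkpoint lsn onwards into the hash table of page +records, the log groups are synchronized, and log_sys is reset so that new +log can be appended at the recovered lsn. 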
*/ +UNIV_INTERN +ulint +recv_recovery_from_checkpoint_start_func( +/*=====================================*/ + /* out: error code or DB_SUCCESS */ +#ifdef UNIV_LOG_ARCHIVE + ulint type, /* in: LOG_CHECKPOINT or LOG_ARCHIVE */ + ib_uint64_t limit_lsn, /* in: recover up to this lsn + if possible */ +#endif /* UNIV_LOG_ARCHIVE */ + ib_uint64_t min_flushed_lsn,/* in: min flushed lsn from + data files */ + ib_uint64_t max_flushed_lsn)/* in: max flushed lsn from + data files */ +{ + log_group_t* group; + log_group_t* max_cp_group; + log_group_t* up_to_date_group; + ulint max_cp_field; + ib_uint64_t checkpoint_lsn; + ib_uint64_t checkpoint_no; + ib_uint64_t old_scanned_lsn; + ib_uint64_t group_scanned_lsn; + ib_uint64_t contiguous_lsn; + ib_uint64_t archived_lsn; + byte* buf; + byte log_hdr_buf[LOG_FILE_HDR_SIZE]; + ulint err; + +#ifdef UNIV_LOG_ARCHIVE + ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX); +# define TYPE_CHECKPOINT (type == LOG_CHECKPOINT) +# define LIMIT_LSN limit_lsn +#else /* UNIV_LOG_ARCHIVE */ +# define TYPE_CHECKPOINT 1 +# define LIMIT_LSN IB_ULONGLONG_MAX +#endif /* UNIV_LOG_ARCHIVE */ + + if (TYPE_CHECKPOINT) { + recv_sys_create(); + recv_sys_init(FALSE, buf_pool_get_curr_size()); + } + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + fprintf(stderr, + "InnoDB: The user has set SRV_FORCE_NO_LOG_REDO on\n"); + fprintf(stderr, + "InnoDB: Skipping log redo\n"); + + return(DB_SUCCESS); + } + + recv_recovery_on = TRUE; + + recv_sys->limit_lsn = LIMIT_LSN; + + mutex_enter(&(log_sys->mutex)); + + /* Look for the latest checkpoint from any of the log groups */ + + err = recv_find_max_checkpoint(&max_cp_group, &max_cp_field); + + if (err != DB_SUCCESS) { + + mutex_exit(&(log_sys->mutex)); + + return(err); + } + + log_group_read_checkpoint_info(max_cp_group, max_cp_field); + + buf = log_sys->checkpoint_buf; + + checkpoint_lsn = mach_read_ull(buf + LOG_CHECKPOINT_LSN); + checkpoint_no = mach_read_ull(buf + LOG_CHECKPOINT_NO); + archived_lsn = mach_read_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN); + + /* Read the first log file header to print a note if this is + a recovery from a restored InnoDB Hot Backup */ + + fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, max_cp_group->space_id, 0, + 0, 0, LOG_FILE_HDR_SIZE, + log_hdr_buf, max_cp_group); + + if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { + /* This log file was created by ibbackup --restore: print + a note to the user about it */ + + fprintf(stderr, + "InnoDB: The log file was created by" + " ibbackup --apply-log at\n" + "InnoDB: %s\n", + log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP); + fprintf(stderr, + "InnoDB: NOTE: the following crash recovery" + " is part of a normal restore.\n"); + + /* Wipe over the label now */ + + memset(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, + ' ', 4); + /* Write to the log file to wipe over the label */ + fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, + max_cp_group->space_id, 0, + 0, 0, OS_FILE_LOG_BLOCK_SIZE, + log_hdr_buf, max_cp_group); + } + +#ifdef UNIV_LOG_ARCHIVE + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + log_checkpoint_get_nth_group_info(buf, group->id, + &(group->archived_file_no), + &(group->archived_offset)); + + group = UT_LIST_GET_NEXT(log_groups, group); + } +#endif /* UNIV_LOG_ARCHIVE */ + + if (TYPE_CHECKPOINT) { + /* Start reading the log groups from the checkpoint lsn up. 
The + variable contiguous_lsn contains an lsn up to which the log is + known to be contiguously written to all log groups. */ + + recv_sys->parse_start_lsn = checkpoint_lsn; + recv_sys->scanned_lsn = checkpoint_lsn; + recv_sys->scanned_checkpoint_no = 0; + recv_sys->recovered_lsn = checkpoint_lsn; + + srv_start_lsn = checkpoint_lsn; + } + + contiguous_lsn = ut_uint64_align_down(recv_sys->scanned_lsn, + OS_FILE_LOG_BLOCK_SIZE); + if (TYPE_CHECKPOINT) { + up_to_date_group = max_cp_group; +#ifdef UNIV_LOG_ARCHIVE + } else { + ulint capacity; + + /* Try to recover the remaining part from logs: first from + the logs of the archived group */ + + group = recv_sys->archive_group; + capacity = log_group_get_capacity(group); + + if (recv_sys->scanned_lsn > checkpoint_lsn + capacity + || checkpoint_lsn > recv_sys->scanned_lsn + capacity) { + + mutex_exit(&(log_sys->mutex)); + + /* The group does not contain enough log: probably + an archived log file was missing or corrupt */ + + return(DB_ERROR); + } + + recv_group_scan_log_recs(group, &contiguous_lsn, + &group_scanned_lsn); + if (recv_sys->scanned_lsn < checkpoint_lsn) { + + mutex_exit(&(log_sys->mutex)); + + /* The group did not contain enough log: an archived + log file was missing or invalid, or the log group + was corrupt */ + + return(DB_ERROR); + } + + group->scanned_lsn = group_scanned_lsn; + up_to_date_group = group; +#endif /* UNIV_LOG_ARCHIVE */ + } + + ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + +#ifdef UNIV_LOG_ARCHIVE + if ((type == LOG_ARCHIVE) && (group == recv_sys->archive_group)) { + group = UT_LIST_GET_NEXT(log_groups, group); + } +#endif /* UNIV_LOG_ARCHIVE */ + + /* Set the flag to publish that we are doing startup scan. */ + recv_log_scan_is_startup_type = TYPE_CHECKPOINT; + while (group) { + old_scanned_lsn = recv_sys->scanned_lsn; + + recv_group_scan_log_recs(group, &contiguous_lsn, + &group_scanned_lsn); + group->scanned_lsn = group_scanned_lsn; + + if (old_scanned_lsn < group_scanned_lsn) { + /* We found a more up-to-date group */ + + up_to_date_group = group; + } + +#ifdef UNIV_LOG_ARCHIVE + if ((type == LOG_ARCHIVE) + && (group == recv_sys->archive_group)) { + group = UT_LIST_GET_NEXT(log_groups, group); + } +#endif /* UNIV_LOG_ARCHIVE */ + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + /* Done with startup scan. Clear the flag. */ + recv_log_scan_is_startup_type = FALSE; + if (TYPE_CHECKPOINT) { + /* NOTE: we always do a 'recovery' at startup, but only if + there is something wrong we will print a message to the + user about recovery: */ + + if (checkpoint_lsn != max_flushed_lsn + || checkpoint_lsn != min_flushed_lsn) { + + if (checkpoint_lsn < max_flushed_lsn) { + fprintf(stderr, + "InnoDB: #########################" + "#################################\n" + "InnoDB: " + "WARNING!\n" + "InnoDB: The log sequence number" + " in ibdata files is higher\n" + "InnoDB: than the log sequence number" + " in the ib_logfiles! 
Are you sure\n" + "InnoDB: you are using the right" + " ib_logfiles to start up" + " the database?\n" + "InnoDB: Log sequence number in" + " ib_logfiles is %llu, log\n" + "InnoDB: sequence numbers stamped" + " to ibdata file headers are between\n" + "InnoDB: %llu and %llu.\n" + "InnoDB: #########################" + "#################################\n", + checkpoint_lsn, + min_flushed_lsn, + max_flushed_lsn); + } + + if (!recv_needed_recovery) { + fprintf(stderr, + "InnoDB: The log sequence number" + " in ibdata files does not match\n" + "InnoDB: the log sequence number" + " in the ib_logfiles!\n"); + recv_init_crash_recovery(); + } + } + + if (!recv_needed_recovery) { + /* Init the doublewrite buffer memory structure */ + trx_sys_doublewrite_init_or_restore_pages(FALSE); + } + } + + /* We currently have only one log group */ + if (group_scanned_lsn < checkpoint_lsn) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: We were only able to scan the log" + " up to\n" + "InnoDB: %llu, but a checkpoint was at %llu.\n" + "InnoDB: It is possible that" + " the database is now corrupt!\n", + group_scanned_lsn, + checkpoint_lsn); + } + + if (group_scanned_lsn < recv_max_page_lsn) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ERROR: We were only able to scan the log" + " up to %llu\n" + "InnoDB: but a database page a had an lsn %llu." + " It is possible that the\n" + "InnoDB: database is now corrupt!\n", + group_scanned_lsn, + recv_max_page_lsn); + } + + if (recv_sys->recovered_lsn < checkpoint_lsn) { + + mutex_exit(&(log_sys->mutex)); + + if (recv_sys->recovered_lsn >= LIMIT_LSN) { + + return(DB_SUCCESS); + } + + ut_error; + + return(DB_ERROR); + } + + /* Synchronize the uncorrupted log groups to the most up-to-date log + group; we also copy checkpoint info to groups */ + + log_sys->next_checkpoint_lsn = checkpoint_lsn; + log_sys->next_checkpoint_no = checkpoint_no + 1; + +#ifdef UNIV_LOG_ARCHIVE + log_sys->archived_lsn = archived_lsn; +#endif /* UNIV_LOG_ARCHIVE */ + + recv_synchronize_groups(up_to_date_group); + + if (!recv_needed_recovery) { + ut_a(checkpoint_lsn == recv_sys->recovered_lsn); + } else { + srv_start_lsn = recv_sys->recovered_lsn; + } + + log_sys->lsn = recv_sys->recovered_lsn; + + ut_memcpy(log_sys->buf, recv_sys->last_block, OS_FILE_LOG_BLOCK_SIZE); + + log_sys->buf_free = (ulint) log_sys->lsn % OS_FILE_LOG_BLOCK_SIZE; + log_sys->buf_next_to_write = log_sys->buf_free; + log_sys->written_to_some_lsn = log_sys->lsn; + log_sys->written_to_all_lsn = log_sys->lsn; + + log_sys->last_checkpoint_lsn = checkpoint_lsn; + + log_sys->next_checkpoint_no = checkpoint_no + 1; + +#ifdef UNIV_LOG_ARCHIVE + if (archived_lsn == IB_ULONGLONG_MAX) { + + log_sys->archiving_state = LOG_ARCH_OFF; + } +#endif /* UNIV_LOG_ARCHIVE */ + + mutex_enter(&(recv_sys->mutex)); + + recv_sys->apply_log_recs = TRUE; + + mutex_exit(&(recv_sys->mutex)); + + mutex_exit(&(log_sys->mutex)); + + recv_lsn_checks_on = TRUE; + + /* The database is now ready to start almost normal processing of user + transactions: transaction rollbacks and the application of the log + records in the hash table can be run in background. */ + + return(DB_SUCCESS); + +#undef TYPE_CHECKPOINT +#undef LIMIT_LSN +} + +/************************************************************ +Completes recovery from a checkpoint. 
*/ +UNIV_INTERN +void +recv_recovery_from_checkpoint_finish(void) +/*======================================*/ +{ + int i; + + /* Apply the hashed log records to the respective file pages */ + + if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { + + recv_apply_hashed_log_recs(TRUE); + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Log records applied to the database\n"); + } +#endif /* UNIV_DEBUG */ + + if (recv_needed_recovery) { + trx_sys_print_mysql_master_log_pos(); + trx_sys_print_mysql_binlog_offset(); + } + + if (recv_sys->found_corrupt_log) { + + fprintf(stderr, + "InnoDB: WARNING: the log file may have been" + " corrupt and it\n" + "InnoDB: is possible that the log scan or parsing" + " did not proceed\n" + "InnoDB: far enough in recovery. Please run" + " CHECK TABLE\n" + "InnoDB: on your InnoDB tables to check that" + " they are ok!\n" + "InnoDB: It may be safest to recover your" + " InnoDB database from\n" + "InnoDB: a backup!\n"); + } + + /* Free the resources of the recovery system */ + + recv_recovery_on = FALSE; + +#ifndef UNIV_LOG_DEBUG + recv_sys_free(); +#endif + + /* Drop partially created indexes. */ + row_merge_drop_temp_indexes(); + +#ifdef UNIV_SYNC_DEBUG + /* Wait for a while so that created threads have time to suspend + themselves before we switch the latching order checks on */ + os_thread_sleep(1000000); + + /* Switch latching order checks on in sync0sync.c */ + sync_order_checks_on = TRUE; +#endif + if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) { + /* Rollback the uncommitted transactions which have no user + session */ + + os_thread_create(trx_rollback_or_clean_all_recovered, + (void *)&i, NULL); + } +} + +/********************************************************** +Resets the logs. The contents of log files will be lost! 
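+ +A worked example of the rounding described for the lsn parameter below, assuming the conventional values OS_FILE_LOG_BLOCK_SIZE = 512 and LOG_BLOCK_HDR_SIZE = 12: a requested lsn of 1000 is first aligned up to 1024, and after the block header is accounted for the first usable lsn becomes 1024 + 12 = 1036.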
*/ +UNIV_INTERN +void +recv_reset_logs( +/*============*/ + ib_uint64_t lsn, /* in: reset to this lsn + rounded up to be divisible by + OS_FILE_LOG_BLOCK_SIZE, after + which we add + LOG_BLOCK_HDR_SIZE */ +#ifdef UNIV_LOG_ARCHIVE + ulint arch_log_no, /* in: next archived log file number */ +#endif /* UNIV_LOG_ARCHIVE */ + ibool new_logs_created)/* in: TRUE if resetting logs + is done at the log creation; + FALSE if it is done after + archive recovery */ +{ + log_group_t* group; + + ut_ad(mutex_own(&(log_sys->mutex))); + + log_sys->lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE); + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + group->lsn = log_sys->lsn; + group->lsn_offset = LOG_FILE_HDR_SIZE; +#ifdef UNIV_LOG_ARCHIVE + group->archived_file_no = arch_log_no; + group->archived_offset = 0; +#endif /* UNIV_LOG_ARCHIVE */ + + if (!new_logs_created) { + recv_truncate_group(group, group->lsn, group->lsn, + group->lsn, group->lsn); + } + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + log_sys->buf_next_to_write = 0; + log_sys->written_to_some_lsn = log_sys->lsn; + log_sys->written_to_all_lsn = log_sys->lsn; + + log_sys->next_checkpoint_no = 0; + log_sys->last_checkpoint_lsn = 0; + +#ifdef UNIV_LOG_ARCHIVE + log_sys->archived_lsn = log_sys->lsn; +#endif /* UNIV_LOG_ARCHIVE */ + + log_block_init(log_sys->buf, log_sys->lsn); + log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE); + + log_sys->buf_free = LOG_BLOCK_HDR_SIZE; + log_sys->lsn += LOG_BLOCK_HDR_SIZE; + + mutex_exit(&(log_sys->mutex)); + + /* Reset the checkpoint fields in logs */ + + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + mutex_enter(&(log_sys->mutex)); +} + +#ifdef UNIV_HOTBACKUP +/********************************************************** +Creates new log files after a backup has been restored. */ +UNIV_INTERN +void +recv_reset_log_files_for_backup( +/*============================*/ + const char* log_dir, /* in: log file directory path */ + ulint n_log_files, /* in: number of log files */ + ulint log_file_size, /* in: log file size */ + ib_uint64_t lsn) /* in: new start lsn, must be + divisible by OS_FILE_LOG_BLOCK_SIZE */ +{ + os_file_t log_file; + ibool success; + byte* buf; + ulint i; + ulint log_dir_len; + char name[5000]; + static const char ib_logfile_basename[] = "ib_logfile"; + + log_dir_len = strlen(log_dir); + /* full path name of ib_logfile consists of log dir path + basename + + number. This must fit in the name buffer. + */ + ut_a(log_dir_len + strlen(ib_logfile_basename) + 11 < sizeof(name)); + + buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + memset(buf, '\0', LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + + for (i = 0; i < n_log_files; i++) { + + sprintf(name, "%s%s%lu", log_dir, + ib_logfile_basename, (ulong)i); + + log_file = os_file_create_simple(name, OS_FILE_CREATE, + OS_FILE_READ_WRITE, &success); + if (!success) { + fprintf(stderr, + "InnoDB: Cannot create %s. 
Check that" + " the file does not exist yet.\n", name); + + exit(1); + } + + fprintf(stderr, + "Setting log file size to %lu %lu\n", + (ulong) ut_get_high32(log_file_size), + (ulong) log_file_size & 0xFFFFFFFFUL); + + success = os_file_set_size(name, log_file, + log_file_size & 0xFFFFFFFFUL, + ut_get_high32(log_file_size)); + + if (!success) { + fprintf(stderr, + "InnoDB: Cannot set %s size to %lu %lu\n", + name, (ulong) ut_get_high32(log_file_size), + (ulong) (log_file_size & 0xFFFFFFFFUL)); + exit(1); + } + + os_file_flush(log_file); + os_file_close(log_file); + } + + /* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */ + + log_reset_first_header_and_checkpoint(buf, lsn); + + log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn); + log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE, + LOG_BLOCK_HDR_SIZE); + sprintf(name, "%s%s%lu", log_dir, ib_logfile_basename, (ulong)0); + + log_file = os_file_create_simple(name, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &success); + if (!success) { + fprintf(stderr, "InnoDB: Cannot open %s.\n", name); + + exit(1); + } + + os_file_write(name, log_file, buf, 0, 0, + LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + os_file_flush(log_file); + os_file_close(log_file); + + ut_free(buf); +} +#endif /* UNIV_HOTBACKUP */ + +#ifdef UNIV_LOG_ARCHIVE +/********************************************************** +Reads from the archive of a log group and performs recovery. */ +static +ibool +log_group_recover_from_archive_file( +/*================================*/ + /* out: TRUE if no more complete + consistent archive files */ + log_group_t* group) /* in: log group */ +{ + os_file_t file_handle; + ib_uint64_t start_lsn; + ib_uint64_t file_end_lsn; + ib_uint64_t dummy_lsn; + ib_uint64_t scanned_lsn; + ulint len; + ibool ret; + byte* buf; + ulint read_offset; + ulint file_size; + ulint file_size_high; + int input_char; + char name[10000]; + + ut_a(0); + +try_open_again: + buf = log_sys->buf; + + /* Add the file to the archive file space; open the file */ + + log_archived_file_name_gen(name, group->id, group->archived_file_no); + + file_handle = os_file_create(name, OS_FILE_OPEN, + OS_FILE_LOG, OS_FILE_AIO, &ret); + + if (ret == FALSE) { +ask_again: + fprintf(stderr, + "InnoDB: Do you want to copy additional" + " archived log files\n" + "InnoDB: to the directory\n"); + fprintf(stderr, + "InnoDB: or were these all the files needed" + " in recovery?\n"); + fprintf(stderr, + "InnoDB: (Y == copy more files; N == this is all)?"); + + input_char = getchar(); + + if (input_char == (int) 'N') { + + return(TRUE); + } else if (input_char == (int) 'Y') { + + goto try_open_again; + } else { + goto ask_again; + } + } + + ret = os_file_get_size(file_handle, &file_size, &file_size_high); + ut_a(ret); + + ut_a(file_size_high == 0); + + fprintf(stderr, "InnoDB: Opened archived log file %s\n", name); + + ret = os_file_close(file_handle); + + if (file_size < LOG_FILE_HDR_SIZE) { + fprintf(stderr, + "InnoDB: Archive file header incomplete %s\n", name); + + return(TRUE); + } + + ut_a(ret); + + /* Add the archive file as a node to the space */ + + fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE, + group->archive_space_id, FALSE); +#if RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE +# error "RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE" +#endif + + /* Read the archive file header */ + fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->archive_space_id, 0, 0, + LOG_FILE_HDR_SIZE, buf, NULL); + + /* Check if the archive file header is consistent */ + + if (mach_read_from_4(buf + LOG_GROUP_ID) != 
group->id + || mach_read_from_4(buf + LOG_FILE_NO) + != group->archived_file_no) { + fprintf(stderr, + "InnoDB: Archive file header inconsistent %s\n", name); + + return(TRUE); + } + + if (!mach_read_from_4(buf + LOG_FILE_ARCH_COMPLETED)) { + fprintf(stderr, + "InnoDB: Archive file not completely written %s\n", + name); + + return(TRUE); + } + + start_lsn = mach_read_ull(buf + LOG_FILE_START_LSN); + file_end_lsn = mach_read_ull(buf + LOG_FILE_END_LSN); + + if (!recv_sys->scanned_lsn) { + + if (recv_sys->parse_start_lsn < start_lsn) { + fprintf(stderr, + "InnoDB: Archive log file %s" + " starts from too big a lsn\n", + name); + return(TRUE); + } + + recv_sys->scanned_lsn = start_lsn; + } + + if (recv_sys->scanned_lsn != start_lsn) { + + fprintf(stderr, + "InnoDB: Archive log file %s starts from" + " a wrong lsn\n", + name); + return(TRUE); + } + + read_offset = LOG_FILE_HDR_SIZE; + + for (;;) { + len = RECV_SCAN_SIZE; + + if (read_offset + len > file_size) { + len = ut_calc_align_down(file_size - read_offset, + OS_FILE_LOG_BLOCK_SIZE); + } + + if (len == 0) { + + break; + } + +#ifdef UNIV_DEBUG + if (log_debug_writes) { + fprintf(stderr, + "InnoDB: Archive read starting at" + " lsn %llu, len %lu from file %s\n", + start_lsn, + (ulong) len, name); + } +#endif /* UNIV_DEBUG */ + + fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, + group->archive_space_id, read_offset / UNIV_PAGE_SIZE, + read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + + ret = recv_scan_log_recs( + TRUE, (buf_pool->n_frames - recv_n_pool_free_frames) + * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn, + &dummy_lsn, &scanned_lsn); + + if (scanned_lsn == file_end_lsn) { + + return(FALSE); + } + + if (ret) { + fprintf(stderr, + "InnoDB: Archive log file %s" + " does not scan right\n", + name); + return(TRUE); + } + + read_offset += len; + start_lsn += len; + + ut_ad(start_lsn == scanned_lsn); + } + + return(FALSE); +} + +/************************************************************ +Recovers from archived log files, and also from log files, if they exist. 
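+ +(Editorial note on log_group_recover_from_archive_file() above: the archive file header checks it performs can be condensed, for illustration only, into a single predicate over the constants referenced there: + + ok = mach_read_from_4(buf + LOG_GROUP_ID) == group->id + && mach_read_from_4(buf + LOG_FILE_NO) == group->archived_file_no + && mach_read_from_4(buf + LOG_FILE_ARCH_COMPLETED) != 0; + +only when all three hold does the function go on to the start/end lsn checks and the scan of the file body.)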
*/ +UNIV_INTERN +ulint +recv_recovery_from_archive_start( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + ib_uint64_t min_flushed_lsn,/* in: min flushed lsn field from the + data files */ + ib_uint64_t limit_lsn, /* in: recover up to this lsn if + possible */ + ulint first_log_no) /* in: number of the first archived + log file to use in the recovery; the + file will be searched from + INNOBASE_LOG_ARCH_DIR specified in + server config file */ +{ + log_group_t* group; + ulint group_id; + ulint trunc_len; + ibool ret; + ulint err; + + ut_a(0); + + recv_sys_create(); + recv_sys_init(FALSE, buf_pool_get_curr_size()); + + recv_recovery_on = TRUE; + recv_recovery_from_backup_on = TRUE; + + recv_sys->limit_lsn = limit_lsn; + + group_id = 0; + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + + while (group) { + if (group->id == group_id) { + + break; + } + + group = UT_LIST_GET_NEXT(log_groups, group); + } + + if (!group) { + fprintf(stderr, + "InnoDB: There is no log group defined with id %lu!\n", + (ulong) group_id); + return(DB_ERROR); + } + + group->archived_file_no = first_log_no; + + recv_sys->parse_start_lsn = min_flushed_lsn; + + recv_sys->scanned_lsn = 0; + recv_sys->scanned_checkpoint_no = 0; + recv_sys->recovered_lsn = recv_sys->parse_start_lsn; + + recv_sys->archive_group = group; + + ret = FALSE; + + mutex_enter(&(log_sys->mutex)); + + while (!ret) { + ret = log_group_recover_from_archive_file(group); + + /* Close and truncate a possible processed archive file + from the file space */ + + trunc_len = UNIV_PAGE_SIZE + * fil_space_get_size(group->archive_space_id); + if (trunc_len > 0) { + fil_space_truncate_start(group->archive_space_id, + trunc_len); + } + + group->archived_file_no++; + } + + if (recv_sys->recovered_lsn < limit_lsn) { + + if (!recv_sys->scanned_lsn) { + + recv_sys->scanned_lsn = recv_sys->parse_start_lsn; + } + + mutex_exit(&(log_sys->mutex)); + + err = recv_recovery_from_checkpoint_start(LOG_ARCHIVE, + limit_lsn, + IB_ULONGLONG_MAX, + IB_ULONGLONG_MAX); + if (err != DB_SUCCESS) { + + return(err); + } + + mutex_enter(&(log_sys->mutex)); + } + + if (limit_lsn != IB_ULONGLONG_MAX) { + + recv_apply_hashed_log_recs(FALSE); + + recv_reset_logs(recv_sys->recovered_lsn, 0, FALSE); + } + + mutex_exit(&(log_sys->mutex)); + + return(DB_SUCCESS); +} + +/************************************************************ +Completes recovery from archive. */ +UNIV_INTERN +void +recv_recovery_from_archive_finish(void) +/*===================================*/ +{ + recv_recovery_from_checkpoint_finish(); + + recv_recovery_from_backup_on = FALSE; +} +#endif /* UNIV_LOG_ARCHIVE */ diff --git a/storage/xtradb/mach/mach0data.c b/storage/xtradb/mach/mach0data.c new file mode 100644 index 00000000000..5deb475318d --- /dev/null +++ b/storage/xtradb/mach/mach0data.c @@ -0,0 +1,135 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************** +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "mach0data.h" + +#ifdef UNIV_NONINL +#include "mach0data.ic" +#endif + +/************************************************************* +Reads a ulint in a compressed form if the log record fully contains it. */ +UNIV_INTERN +byte* +mach_parse_compressed( +/*==================*/ + /* out: pointer to end of the stored field, NULL if + not complete */ + byte* ptr, /* in: pointer to buffer from where to read */ + byte* end_ptr,/* in: pointer to end of the buffer */ + ulint* val) /* out: read value (< 2^32) */ +{ + ulint flag; + + ut_ad(ptr && end_ptr && val); + + if (ptr >= end_ptr) { + + return(NULL); + } + + flag = mach_read_from_1(ptr); + + if (flag < 0x80UL) { + *val = flag; + return(ptr + 1); + + } else if (flag < 0xC0UL) { + if (end_ptr < ptr + 2) { + return(NULL); + } + + *val = mach_read_from_2(ptr) & 0x7FFFUL; + + return(ptr + 2); + + } else if (flag < 0xE0UL) { + if (end_ptr < ptr + 3) { + return(NULL); + } + + *val = mach_read_from_3(ptr) & 0x3FFFFFUL; + + return(ptr + 3); + } else if (flag < 0xF0UL) { + if (end_ptr < ptr + 4) { + return(NULL); + } + + *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL; + + return(ptr + 4); + } else { + ut_ad(flag == 0xF0UL); + + if (end_ptr < ptr + 5) { + return(NULL); + } + + *val = mach_read_from_4(ptr + 1); + return(ptr + 5); + } +} + +/************************************************************* +Reads a dulint in a compressed form if the log record fully contains it. */ +UNIV_INTERN +byte* +mach_dulint_parse_compressed( +/*=========================*/ + /* out: pointer to end of the stored field, NULL if + not complete */ + byte* ptr, /* in: pointer to buffer from where to read */ + byte* end_ptr,/* in: pointer to end of the buffer */ + dulint* val) /* out: read value */ +{ + ulint high; + ulint low; + ulint size; + + ut_ad(ptr && end_ptr && val); + + if (end_ptr < ptr + 5) { + + return(NULL); + } + + high = mach_read_compressed(ptr); + + size = mach_get_compressed_size(high); + + ptr += size; + + if (end_ptr < ptr + 4) { + + return(NULL); + } + + low = mach_read_from_4(ptr); + + *val = ut_dulint_create(high, low); + + return(ptr + 4); +} diff --git a/storage/xtradb/mem/mem0dbg.c b/storage/xtradb/mem/mem0dbg.c new file mode 100644 index 00000000000..a1647462922 --- /dev/null +++ b/storage/xtradb/mem/mem0dbg.c @@ -0,0 +1,1014 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
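+ +(Editorial aside on the two parsing functions above: mach_parse_compressed() reads a 32-bit value stored in one to five bytes, selected by the top bits of the first byte. A hypothetical encoder, given here only to document the byte format and written to mirror what mach_write_compressed() in mach0data.ic does, is: + + ulint + mach_write_compressed_sketch(byte* b, ulint val) + { + if (val < 0x80UL) { + mach_write_to_1(b, val); + return(1); + } else if (val < 0x4000UL) { + mach_write_to_2(b, val | 0x8000UL); + return(2); + } else if (val < 0x200000UL) { + mach_write_to_3(b, val | 0xC00000UL); + return(3); + } else if (val < 0x10000000UL) { + mach_write_to_4(b, val | 0xE0000000UL); + return(4); + } else { + mach_write_to_1(b, 0xF0UL); + mach_write_to_4(b + 1, val); + return(5); + } + } + +This is only documentation of the format; the patch itself provides the real encoder in mach0data.ic.)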
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The memory management: the debug code. This is not a compilation module, +but is included in mem0mem.* ! + +Created 6/9/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef UNIV_MEM_DEBUG +/* The mutex which protects in the debug version the hash table +containing the list of live memory heaps, and also the global +variables below. */ +UNIV_INTERN mutex_t mem_hash_mutex; + +/* The following variables contain information about the +extent of memory allocations. Only used in the debug version. +Protected by mem_hash_mutex above. */ + +static ulint mem_n_created_heaps = 0; +static ulint mem_n_allocations = 0; +static ulint mem_total_allocated_memory = 0; +UNIV_INTERN ulint mem_current_allocated_memory = 0; +static ulint mem_max_allocated_memory = 0; +static ulint mem_last_print_info = 0; + +/* Size of the hash table for memory management tracking */ +#define MEM_HASH_SIZE 997 + +/* The node of the list containing currently allocated memory heaps */ + +typedef struct mem_hash_node_struct mem_hash_node_t; +struct mem_hash_node_struct { + UT_LIST_NODE_T(mem_hash_node_t) + list; /* hash list node */ + mem_heap_t* heap; /* memory heap */ + const char* file_name;/* file where heap was created*/ + ulint line; /* file line of creation */ + ulint nth_heap;/* this is the nth heap created */ + UT_LIST_NODE_T(mem_hash_node_t) + all_list;/* list of all created heaps */ +}; + +typedef UT_LIST_BASE_NODE_T(mem_hash_node_t) mem_hash_cell_t; + +/* The hash table of allocated heaps */ +static mem_hash_cell_t mem_hash_table[MEM_HASH_SIZE]; + +/* The base node of the list of all allocated heaps */ +static mem_hash_cell_t mem_all_list_base; + +static ibool mem_hash_initialized = FALSE; + + +UNIV_INLINE +mem_hash_cell_t* +mem_hash_get_nth_cell(ulint i); + +/* Accessor function for the hash table. Returns a pointer to the +table cell. */ +UNIV_INLINE +mem_hash_cell_t* +mem_hash_get_nth_cell(ulint i) +{ + ut_a(i < MEM_HASH_SIZE); + + return(&(mem_hash_table[i])); +} + +/* Accessor functions for a memory field in the debug version */ +UNIV_INTERN +void +mem_field_header_set_len(byte* field, ulint len) +{ + mach_write_to_4(field - 2 * sizeof(ulint), len); +} + +UNIV_INTERN +ulint +mem_field_header_get_len(byte* field) +{ + return(mach_read_from_4(field - 2 * sizeof(ulint))); +} + +UNIV_INTERN +void +mem_field_header_set_check(byte* field, ulint check) +{ + mach_write_to_4(field - sizeof(ulint), check); +} + +UNIV_INTERN +ulint +mem_field_header_get_check(byte* field) +{ + return(mach_read_from_4(field - sizeof(ulint))); +} + +UNIV_INTERN +void +mem_field_trailer_set_check(byte* field, ulint check) +{ + mach_write_to_4(field + mem_field_header_get_len(field), check); +} + +UNIV_INTERN +ulint +mem_field_trailer_get_check(byte* field) +{ + return(mach_read_from_4(field + + mem_field_header_get_len(field))); +} +#endif /* UNIV_MEM_DEBUG */ + +/********************************************************************** +Initializes the memory system. 
*/ +UNIV_INTERN +void +mem_init( +/*=====*/ + ulint size) /* in: common pool size in bytes */ +{ +#ifdef UNIV_MEM_DEBUG + + ulint i; + + /* Initialize the hash table */ + ut_a(FALSE == mem_hash_initialized); + + mutex_create(&mem_hash_mutex, SYNC_MEM_HASH); + + for (i = 0; i < MEM_HASH_SIZE; i++) { + UT_LIST_INIT(*mem_hash_get_nth_cell(i)); + } + + UT_LIST_INIT(mem_all_list_base); + + mem_hash_initialized = TRUE; +#endif + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + /* When innodb_use_sys_malloc is set, the + mem_comm_pool won't be used for any allocations. We + create a dummy mem_comm_pool, because some statistics + and debugging code relies on it being initialized. */ + size = 1; + } + + mem_comm_pool = mem_pool_create(size); +} + +#ifdef UNIV_MEM_DEBUG +/********************************************************************** +Initializes an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_init( +/*===========*/ + byte* buf, /* in: memory field */ + ulint n) /* in: how many bytes the user requested */ +{ + ulint rnd; + byte* usr_buf; + + usr_buf = buf + MEM_FIELD_HEADER_SIZE; + + /* In the debug version write the length field and the + check fields to the start and the end of the allocated storage. + The field header consists of a length field and + a random number field, in this order. The field trailer contains + the same random number as a check field. */ + + mem_field_header_set_len(usr_buf, n); + + rnd = ut_rnd_gen_ulint(); + + mem_field_header_set_check(usr_buf, rnd); + mem_field_trailer_set_check(usr_buf, rnd); + + /* Update the memory allocation information */ + + mutex_enter(&mem_hash_mutex); + + mem_total_allocated_memory += n; + mem_current_allocated_memory += n; + mem_n_allocations++; + + if (mem_current_allocated_memory > mem_max_allocated_memory) { + mem_max_allocated_memory = mem_current_allocated_memory; + } + + mutex_exit(&mem_hash_mutex); + + /* In the debug version set the buffer to a random + combination of 0xBA and 0xBE */ + + mem_init_buf(usr_buf, n); +} + +/********************************************************************** +Erases an allocated memory field in the debug version. */ +UNIV_INTERN +void +mem_field_erase( +/*============*/ + byte* buf, /* in: memory field */ + ulint n __attribute__((unused))) + /* in: how many bytes the user requested */ +{ + byte* usr_buf; + + usr_buf = buf + MEM_FIELD_HEADER_SIZE; + + mutex_enter(&mem_hash_mutex); + mem_current_allocated_memory -= n; + mutex_exit(&mem_hash_mutex); + + /* Check that the field lengths agree */ + ut_ad(n == (ulint)mem_field_header_get_len(usr_buf)); + + /* In the debug version, set the freed space to a random + combination of 0xDE and 0xAD */ + + mem_erase_buf(buf, MEM_SPACE_NEEDED(n)); +} + +/******************************************************************* +Initializes a buffer to a random combination of hex BA and BE. +Used to initialize allocated memory. */ +UNIV_INTERN +void +mem_init_buf( +/*=========*/ + byte* buf, /* in: pointer to buffer */ + ulint n) /* in: length of buffer */ +{ + byte* ptr; + + UNIV_MEM_ASSERT_W(buf, n); + + for (ptr = buf; ptr < buf + n; ptr++) { + + if (ut_rnd_gen_ibool()) { + *ptr = 0xBA; + } else { + *ptr = 0xBE; + } + } + + UNIV_MEM_INVALID(buf, n); +} + +/******************************************************************* +Initializes a buffer to a random combination of hex DE and AD. 
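+(The byte values are presumably chosen to be easy to recognize in a hex dump: freshly allocated debug memory reads as a mix of BA/BE bytes and freed memory as a mix of DE/AD bytes, so use of uninitialized or stale storage stands out, for example in mem_analyze_corruption() output.)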
+Used to erase freed memory.*/ +UNIV_INTERN +void +mem_erase_buf( +/*==========*/ + byte* buf, /* in: pointer to buffer */ + ulint n) /* in: length of buffer */ +{ + byte* ptr; + + UNIV_MEM_ASSERT_W(buf, n); + + for (ptr = buf; ptr < buf + n; ptr++) { + if (ut_rnd_gen_ibool()) { + *ptr = 0xDE; + } else { + *ptr = 0xAD; + } + } + + UNIV_MEM_FREE(buf, n); +} + +/******************************************************************* +Inserts a created memory heap to the hash table of current allocated +memory heaps. */ +UNIV_INTERN +void +mem_hash_insert( +/*============*/ + mem_heap_t* heap, /* in: the created heap */ + const char* file_name, /* in: file name of creation */ + ulint line) /* in: line where created */ +{ + mem_hash_node_t* new_node; + ulint cell_no ; + + ut_ad(mem_heap_check(heap)); + + mutex_enter(&mem_hash_mutex); + + cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE); + + /* Allocate a new node to the list */ + new_node = ut_malloc(sizeof(mem_hash_node_t)); + + new_node->heap = heap; + new_node->file_name = file_name; + new_node->line = line; + new_node->nth_heap = mem_n_created_heaps; + + /* Insert into lists */ + UT_LIST_ADD_FIRST(list, *mem_hash_get_nth_cell(cell_no), new_node); + + UT_LIST_ADD_LAST(all_list, mem_all_list_base, new_node); + + mem_n_created_heaps++; + + mutex_exit(&mem_hash_mutex); +} + +/******************************************************************* +Removes a memory heap (which is going to be freed by the caller) +from the list of live memory heaps. Returns the size of the heap +in terms of how much memory in bytes was allocated for the user of +the heap (not the total space occupied by the heap). +Also validates the heap. +NOTE: This function does not free the storage occupied by the +heap itself, only the node in the list of heaps. 
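+ +(Editorial note: this function and mem_hash_insert() above must agree on the cell; both derive it from the heap pointer itself, as in + + cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE); + +so a heap is always sought in the same one of the MEM_HASH_SIZE lists into which it was inserted.)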
*/ +UNIV_INTERN +void +mem_hash_remove( +/*============*/ + mem_heap_t* heap, /* in: the heap to be freed */ + const char* file_name, /* in: file name of freeing */ + ulint line) /* in: line where freed */ +{ + mem_hash_node_t* node; + ulint cell_no; + ibool error; + ulint size; + + ut_ad(mem_heap_check(heap)); + + mutex_enter(&mem_hash_mutex); + + cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE); + + /* Look for the heap in the hash table list */ + node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell_no)); + + while (node != NULL) { + if (node->heap == heap) { + + break; + } + + node = UT_LIST_GET_NEXT(list, node); + } + + if (node == NULL) { + fprintf(stderr, + "Memory heap or buffer freed in %s line %lu" + " did not exist.\n", + file_name, (ulong) line); + ut_error; + } + + /* Remove from lists */ + UT_LIST_REMOVE(list, *mem_hash_get_nth_cell(cell_no), node); + + UT_LIST_REMOVE(all_list, mem_all_list_base, node); + + /* Validate the heap which will be freed */ + mem_heap_validate_or_print(node->heap, NULL, FALSE, &error, &size, + NULL, NULL); + if (error) { + fprintf(stderr, + "Inconsistency in memory heap or" + " buffer n:o %lu created\n" + "in %s line %lu and tried to free in %s line %lu.\n" + "Hex dump of 400 bytes around memory heap" + " first block start:\n", + node->nth_heap, node->file_name, (ulong) node->line, + file_name, (ulong) line); + ut_print_buf(stderr, (byte*)node->heap - 200, 400); + fputs("\nDump of the mem heap:\n", stderr); + mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, + &size, NULL, NULL); + ut_error; + } + + /* Free the memory occupied by the node struct */ + ut_free(node); + + mem_current_allocated_memory -= size; + + mutex_exit(&mem_hash_mutex); +} +#endif /* UNIV_MEM_DEBUG */ + +#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG +/******************************************************************* +Checks a memory heap for consistency and prints the contents if requested. +Outputs the sum of sizes of buffers given to the user (only in +the debug version), the physical size of the heap and the number of +blocks in the heap. In case of error returns 0 as sizes and number +of blocks. 
*/ +UNIV_INTERN +void +mem_heap_validate_or_print( +/*=======================*/ + mem_heap_t* heap, /* in: memory heap */ + byte* top __attribute__((unused)), + /* in: calculate and validate only until + this top pointer in the heap is reached, + if this pointer is NULL, ignored */ + ibool print, /* in: if TRUE, prints the contents + of the heap; works only in + the debug version */ + ibool* error, /* out: TRUE if error */ + ulint* us_size,/* out: allocated memory + (for the user) in the heap, + if a NULL pointer is passed as this + argument, it is ignored; in the + non-debug version this is always -1 */ + ulint* ph_size,/* out: physical size of the heap, + if a NULL pointer is passed as this + argument, it is ignored */ + ulint* n_blocks) /* out: number of blocks in the heap, + if a NULL pointer is passed as this + argument, it is ignored */ +{ + mem_block_t* block; + ulint total_len = 0; + ulint block_count = 0; + ulint phys_len = 0; +#ifdef UNIV_MEM_DEBUG + ulint len; + byte* field; + byte* user_field; + ulint check_field; +#endif + + /* Pessimistically, we set the parameters to error values */ + if (us_size != NULL) { + *us_size = 0; + } + if (ph_size != NULL) { + *ph_size = 0; + } + if (n_blocks != NULL) { + *n_blocks = 0; + } + *error = TRUE; + + block = heap; + + if (block->magic_n != MEM_BLOCK_MAGIC_N) { + return; + } + + if (print) { + fputs("Memory heap:", stderr); + } + + while (block != NULL) { + phys_len += mem_block_get_len(block); + + if ((block->type == MEM_HEAP_BUFFER) + && (mem_block_get_len(block) > UNIV_PAGE_SIZE)) { + + fprintf(stderr, + "InnoDB: Error: mem block %p" + " length %lu > UNIV_PAGE_SIZE\n", + (void*) block, + (ulong) mem_block_get_len(block)); + /* error */ + + return; + } + +#ifdef UNIV_MEM_DEBUG + /* We can trace the fields of the block only in the debug + version */ + if (print) { + fprintf(stderr, " Block %ld:", block_count); + } + + field = (byte*)block + mem_block_get_start(block); + + if (top && (field == top)) { + + goto completed; + } + + while (field < (byte*)block + mem_block_get_free(block)) { + + /* Calculate the pointer to the storage + which was given to the user */ + + user_field = field + MEM_FIELD_HEADER_SIZE; + + len = mem_field_header_get_len(user_field); + + if (print) { + ut_print_buf(stderr, user_field, len); + putc('\n', stderr); + } + + total_len += len; + check_field = mem_field_header_get_check(user_field); + + if (check_field + != mem_field_trailer_get_check(user_field)) { + /* error */ + + fprintf(stderr, + "InnoDB: Error: block %lx mem" + " field %lx len %lu\n" + "InnoDB: header check field is" + " %lx but trailer %lx\n", + (ulint)block, + (ulint)field, len, check_field, + mem_field_trailer_get_check( + user_field)); + + return; + } + + /* Move to next field */ + field = field + MEM_SPACE_NEEDED(len); + + if (top && (field == top)) { + + goto completed; + } + + } + + /* At the end check that we have arrived to the first free + position */ + + if (field != (byte*)block + mem_block_get_free(block)) { + /* error */ + + fprintf(stderr, + "InnoDB: Error: block %lx end of" + " mem fields %lx\n" + "InnoDB: but block free at %lx\n", + (ulint)block, (ulint)field, + (ulint)((byte*)block + + mem_block_get_free(block))); + + return; + } + +#endif + + block = UT_LIST_GET_NEXT(list, block); + block_count++; + } +#ifdef UNIV_MEM_DEBUG +completed: +#endif + if (us_size != NULL) { + *us_size = total_len; + } + if (ph_size != NULL) { + *ph_size = phys_len; + } + if (n_blocks != NULL) { + *n_blocks = block_count; + } + *error = FALSE; +} + 
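+/* Editorial illustration (not in the original file) of the layout that the +validation loop above walks: each allocation in a debug-version heap is +wrapped as + + | len (ulint) | check (ulint) | user data ... | check (ulint) | + +with the same random number in both check fields, so an overrun of the user +data is caught when header and trailer disagree. A minimal sketch of that +comparison, using the accessors defined earlier in this file: + + ibool + mem_field_checks_agree(byte* user_field) + { + return(mem_field_header_get_check(user_field) + == mem_field_trailer_get_check(user_field)); + } +*/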
+/****************************************************************** +Prints the contents of a memory heap. */ +static +void +mem_heap_print( +/*===========*/ + mem_heap_t* heap) /* in: memory heap */ +{ + ibool error; + ulint us_size; + ulint phys_size; + ulint n_blocks; + + ut_ad(mem_heap_check(heap)); + + mem_heap_validate_or_print(heap, NULL, TRUE, &error, + &us_size, &phys_size, &n_blocks); + fprintf(stderr, + "\nheap type: %lu; size: user size %lu;" + " physical size %lu; blocks %lu.\n", + (ulong) heap->type, (ulong) us_size, + (ulong) phys_size, (ulong) n_blocks); + ut_a(!error); +} + +/****************************************************************** +Validates the contents of a memory heap. */ +UNIV_INTERN +ibool +mem_heap_validate( +/*==============*/ + /* out: TRUE if ok */ + mem_heap_t* heap) /* in: memory heap */ +{ + ibool error; + ulint us_size; + ulint phys_size; + ulint n_blocks; + + ut_ad(mem_heap_check(heap)); + + mem_heap_validate_or_print(heap, NULL, FALSE, &error, &us_size, + &phys_size, &n_blocks); + if (error) { + mem_heap_print(heap); + } + + ut_a(!error); + + return(TRUE); +} +#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +/****************************************************************** +Checks that an object is a memory heap (or a block of it). */ +UNIV_INTERN +ibool +mem_heap_check( +/*===========*/ + /* out: TRUE if ok */ + mem_heap_t* heap) /* in: memory heap */ +{ + ut_a(heap->magic_n == MEM_BLOCK_MAGIC_N); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_MEM_DEBUG +/********************************************************************* +TRUE if no memory is currently allocated. */ +UNIV_INTERN +ibool +mem_all_freed(void) +/*===============*/ + /* out: TRUE if no heaps exist */ +{ + mem_hash_node_t* node; + ulint heap_count = 0; + ulint i; + + mem_validate(); + + mutex_enter(&mem_hash_mutex); + + for (i = 0; i < MEM_HASH_SIZE; i++) { + + node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i)); + while (node != NULL) { + heap_count++; + node = UT_LIST_GET_NEXT(list, node); + } + } + + mutex_exit(&mem_hash_mutex); + + if (heap_count == 0) { + + ut_a(mem_pool_get_reserved(mem_comm_pool) == 0); + + return(TRUE); + } else { + return(FALSE); + } +} + +/********************************************************************* +Validates the dynamic memory allocation system. */ +UNIV_INTERN +ibool +mem_validate_no_assert(void) +/*========================*/ + /* out: TRUE if error */ +{ + mem_hash_node_t* node; + ulint n_heaps = 0; + ulint allocated_mem; + ulint ph_size; + ulint total_allocated_mem = 0; + ibool error = FALSE; + ulint n_blocks; + ulint i; + + mem_pool_validate(mem_comm_pool); + + mutex_enter(&mem_hash_mutex); + + for (i = 0; i < MEM_HASH_SIZE; i++) { + + node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i)); + + while (node != NULL) { + n_heaps++; + + mem_heap_validate_or_print(node->heap, NULL, + FALSE, &error, + &allocated_mem, + &ph_size, &n_blocks); + + if (error) { + fprintf(stderr, + "\nERROR!!!!!!!!!!!!!!!!!!!" 
+ "!!!!!!!!!!!!!!!!!!!!!!!\n\n" + "Inconsistency in memory heap" + " or buffer created\n" + "in %s line %lu.\n", + node->file_name, node->line); + + mutex_exit(&mem_hash_mutex); + + return(TRUE); + } + + total_allocated_mem += allocated_mem; + node = UT_LIST_GET_NEXT(list, node); + } + } + + if ((n_heaps == 0) && (mem_current_allocated_memory != 0)) { + error = TRUE; + } + + if (mem_total_allocated_memory < mem_current_allocated_memory) { + error = TRUE; + } + + if (mem_max_allocated_memory > mem_total_allocated_memory) { + error = TRUE; + } + + if (mem_n_created_heaps < n_heaps) { + error = TRUE; + } + + mutex_exit(&mem_hash_mutex); + + return(error); +} + +/**************************************************************** +Validates the dynamic memory */ +UNIV_INTERN +ibool +mem_validate(void) +/*==============*/ + /* out: TRUE if ok */ +{ + ut_a(!mem_validate_no_assert()); + + return(TRUE); +} +#endif /* UNIV_MEM_DEBUG */ + +/**************************************************************** +Tries to find neigboring memory allocation blocks and dumps to stderr +the neighborhood of a given pointer. */ +UNIV_INTERN +void +mem_analyze_corruption( +/*===================*/ + void* ptr) /* in: pointer to place of possible corruption */ +{ + byte* p; + ulint i; + ulint dist; + + fputs("InnoDB: Apparent memory corruption: mem dump ", stderr); + ut_print_buf(stderr, (byte*)ptr - 250, 500); + + fputs("\nInnoDB: Scanning backward trying to find" + " previous allocated mem blocks\n", stderr); + + p = (byte*)ptr; + dist = 0; + + for (i = 0; i < 10; i++) { + for (;;) { + if (((ulint)p) % 4 == 0) { + + if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) { + fprintf(stderr, + "Mem block at - %lu," + " file %s, line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + + if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) { + fprintf(stderr, + "Freed mem block at - %lu," + " file %s, line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + } + + p--; + dist++; + } + + p--; + dist++; + } + + fprintf(stderr, + "InnoDB: Scanning forward trying to find next" + " allocated mem blocks\n"); + + p = (byte*)ptr; + dist = 0; + + for (i = 0; i < 10; i++) { + for (;;) { + if (((ulint)p) % 4 == 0) { + + if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) { + fprintf(stderr, + "Mem block at + %lu, file %s," + " line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + + if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) { + fprintf(stderr, + "Freed mem block at + %lu," + " file %s, line %lu\n", + (ulong) dist, + (p + sizeof(ulint)), + (ulong) + (*(ulint*)(p + 8 + + sizeof(ulint)))); + + break; + } + } + + p++; + dist++; + } + + p++; + dist++; + } +} + +/********************************************************************* +Prints information of dynamic memory usage and currently allocated +memory heaps or buffers. Can only be used in the debug version. 
*/ +static +void +mem_print_info_low( +/*===============*/ + ibool print_all) /* in: if TRUE, all heaps are printed, + else only the heaps allocated after the + previous call of this function */ +{ +#ifdef UNIV_MEM_DEBUG + mem_hash_node_t* node; + ulint n_heaps = 0; + ulint allocated_mem; + ulint ph_size; + ulint total_allocated_mem = 0; + ibool error; + ulint n_blocks; +#endif + FILE* outfile; + + /* outfile = fopen("ibdebug", "a"); */ + + outfile = stdout; + + fprintf(outfile, "\n"); + fprintf(outfile, + "________________________________________________________\n"); + fprintf(outfile, "MEMORY ALLOCATION INFORMATION\n\n"); + +#ifndef UNIV_MEM_DEBUG + + UT_NOT_USED(print_all); + + mem_pool_print_info(outfile, mem_comm_pool); + + fprintf(outfile, + "Sorry, non-debug version cannot give more memory info\n"); + + /* fclose(outfile); */ + + return; +#else + mutex_enter(&mem_hash_mutex); + + fprintf(outfile, "LIST OF CREATED HEAPS AND ALLOCATED BUFFERS: \n\n"); + + if (!print_all) { + fprintf(outfile, "AFTER THE LAST PRINT INFO\n"); + } + + node = UT_LIST_GET_FIRST(mem_all_list_base); + + while (node != NULL) { + n_heaps++; + + if (!print_all && node->nth_heap < mem_last_print_info) { + + goto next_heap; + } + + mem_heap_validate_or_print(node->heap, NULL, + FALSE, &error, &allocated_mem, + &ph_size, &n_blocks); + total_allocated_mem += allocated_mem; + + fprintf(outfile, + "%lu: file %s line %lu of size %lu phys.size %lu" + " with %lu blocks, type %lu\n", + node->nth_heap, node->file_name, node->line, + allocated_mem, ph_size, n_blocks, + (node->heap)->type); +next_heap: + node = UT_LIST_GET_NEXT(all_list, node); + } + + fprintf(outfile, "\n"); + + fprintf(outfile, "Current allocated memory : %lu\n", + mem_current_allocated_memory); + fprintf(outfile, "Current allocated heaps and buffers : %lu\n", + n_heaps); + fprintf(outfile, "Cumulative allocated memory : %lu\n", + mem_total_allocated_memory); + fprintf(outfile, "Maximum allocated memory : %lu\n", + mem_max_allocated_memory); + fprintf(outfile, "Cumulative created heaps and buffers : %lu\n", + mem_n_created_heaps); + fprintf(outfile, "Cumulative number of allocations : %lu\n", + mem_n_allocations); + + mem_last_print_info = mem_n_created_heaps; + + mutex_exit(&mem_hash_mutex); + + mem_pool_print_info(outfile, mem_comm_pool); + + /* mem_validate(); */ + + /* fclose(outfile); */ +#endif +} + +/********************************************************************* +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers. Can only be used in the debug version. */ +UNIV_INTERN +void +mem_print_info(void) +/*================*/ +{ + mem_print_info_low(TRUE); +} + +/********************************************************************* +Prints information of dynamic memory usage and currently allocated memory +heaps or buffers since the last ..._print_info or..._print_new_info. */ +UNIV_INTERN +void +mem_print_new_info(void) +/*====================*/ +{ + mem_print_info_low(FALSE); +} diff --git a/storage/xtradb/mem/mem0mem.c b/storage/xtradb/mem/mem0mem.c new file mode 100644 index 00000000000..b7345f5846b --- /dev/null +++ b/storage/xtradb/mem/mem0mem.c @@ -0,0 +1,553 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The memory management + +Created 6/9/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0mem.h" +#ifdef UNIV_NONINL +#include "mem0mem.ic" +#endif + +#include "buf0buf.h" +#include "srv0srv.h" +#include "mem0dbg.c" +#include <stdarg.h> + +/* + THE MEMORY MANAGEMENT + ===================== + +The basic element of the memory management is called a memory +heap. A memory heap is conceptually a +stack from which memory can be allocated. The stack may grow infinitely. +The top element of the stack may be freed, or +the whole stack can be freed at one time. The advantage of the +memory heap concept is that we can avoid using the malloc and free +functions of C, which are quite expensive: for example, on the Solaris + GCC +system (50 MHz Sparc, 1993) the pair takes 3 microseconds, +on Win NT + 100MHz Pentium, 2.5 microseconds. +When we use a memory heap, +we can allocate larger blocks of memory at a time and thus +reduce overhead. The method is slightly more efficient when we +allocate the memory from the index page buffer pool, as we can +claim a new page fast. This is called buffer allocation. +When we allocate the memory from the dynamic memory of the +C environment, that is called dynamic allocation. + +The default way of operation of the memory heap is the following. +First, when the heap is created, an initial block of memory is +allocated. In dynamic allocation this may be about 50 bytes. +If more space is needed, additional blocks are allocated +and they are put into a linked list. +After the initial block, each allocated block is twice the size of the +previous, until a threshold is attained, after which the sizes +of the blocks stay the same. An exception is, of course, the case +where the caller requests a memory buffer whose size is +bigger than the threshold. In that case a block big enough must +be allocated. + +The heap is physically arranged so that if the current block +becomes full, a new block is allocated and always inserted in the +chain of blocks as the last block. + +In the debug version of the memory management, all the allocated +heaps are kept in a list (which is implemented as a hash table). +Thus we can notice if the caller tries to free an already freed +heap. In addition, each buffer given to the caller contains +a start field at the start and a trailer field at the end of the buffer. + +The start field has the following content: +A. sizeof(ulint) bytes of field length (in the standard byte order) +B. sizeof(ulint) bytes of check field (a random number) + +The trailer field contains: +A. sizeof(ulint) bytes of check field (the same random number as at the start) + +Thus we can notice if something has been copied over the +borders of the buffer, which is illegal. +The memory in the buffers is initialized to a random byte sequence. 
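+ +(As a usage illustration, an editorial sketch rather than part of the original comment; mem_heap_create(), mem_heap_alloc() and mem_heap_free() are declared in mem0mem.h: + + mem_heap_t* heap = mem_heap_create(1024); + byte* buf = mem_heap_alloc(heap, 100); + char* str = mem_heap_strdup(heap, "copied string"); + + (... use buf and str; no individual frees are needed ...) + + mem_heap_free(heap); + +all allocations die together with the heap.)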
+After freeing, all the blocks in the heap are set to random bytes +to help us discover errors which result from the use of +buffers in an already freed heap. */ + +#ifdef MEM_PERIODIC_CHECK + +ibool mem_block_list_inited; +/* List of all mem blocks allocated; protected by the mem_comm_pool mutex */ +UT_LIST_BASE_NODE_T(mem_block_t) mem_block_list; + +#endif + +/************************************************************************** +Duplicates a NUL-terminated string, allocated from a memory heap. */ +UNIV_INTERN +char* +mem_heap_strdup( +/*============*/ + /* out, own: a copy of the string */ + mem_heap_t* heap, /* in: memory heap where string is allocated */ + const char* str) /* in: string to be copied */ +{ + return(mem_heap_dup(heap, str, strlen(str) + 1)); +} + +/************************************************************************** +Duplicate a block of data, allocated from a memory heap. */ +UNIV_INTERN +void* +mem_heap_dup( +/*=========*/ + /* out, own: a copy of the data */ + mem_heap_t* heap, /* in: memory heap where copy is allocated */ + const void* data, /* in: data to be copied */ + ulint len) /* in: length of data, in bytes */ +{ + return(memcpy(mem_heap_alloc(heap, len), data, len)); +} + +/************************************************************************** +Concatenate two memory blocks and return the result, using a memory heap. */ +UNIV_INTERN +void* +mem_heap_cat( +/*=========*/ + /* out, own: the result */ + mem_heap_t* heap, /* in: memory heap where result is allocated */ + const void* b1, /* in: block 1 */ + ulint len1, /* in: length of b1, in bytes */ + const void* b2, /* in: block 2 */ + ulint len2) /* in: length of b2, in bytes */ +{ + void* res = mem_heap_alloc(heap, len1 + len2); + + memcpy(res, b1, len1); + memcpy((char*)res + len1, b2, len2); + + return(res); +} + +/************************************************************************** +Concatenate two strings and return the result, using a memory heap. */ +UNIV_INTERN +char* +mem_heap_strcat( +/*============*/ + /* out, own: the result */ + mem_heap_t* heap, /* in: memory heap where string is allocated */ + const char* s1, /* in: string 1 */ + const char* s2) /* in: string 2 */ +{ + char* s; + ulint s1_len = strlen(s1); + ulint s2_len = strlen(s2); + + s = mem_heap_alloc(heap, s1_len + s2_len + 1); + + memcpy(s, s1, s1_len); + memcpy(s + s1_len, s2, s2_len); + + s[s1_len + s2_len] = '\0'; + + return(s); +} + + +/******************************************************************** +Helper function for mem_heap_printf. */ +static +ulint +mem_heap_printf_low( +/*================*/ + /* out: length of formatted string, + including terminating NUL */ + char* buf, /* in/out: buffer to store formatted string + in, or NULL to just calculate length */ + const char* format, /* in: format string */ + va_list ap) /* in: arguments */ +{ + ulint len = 0; + + while (*format) { + + /* Does this format specifier have the 'l' length modifier. */ + ibool is_long = FALSE; + + /* Length of one parameter. */ + size_t plen; + + if (*format++ != '%') { + /* Non-format character. */ + + len++; + + if (buf) { + *buf++ = *(format - 1); + } + + continue; + } + + if (*format == 'l') { + is_long = TRUE; + format++; + } + + switch (*format++) { + case 's': + /* string */ + { + char* s = va_arg(ap, char*); + + /* "%ls" is a non-sensical format specifier. 
*/ + ut_a(!is_long); + + plen = strlen(s); + len += plen; + + if (buf) { + memcpy(buf, s, plen); + buf += plen; + } + } + + break; + + case 'u': + /* unsigned int */ + { + char tmp[32]; + unsigned long val; + + /* We only support 'long' values for now. */ + ut_a(is_long); + + val = va_arg(ap, unsigned long); + + plen = sprintf(tmp, "%lu", val); + len += plen; + + if (buf) { + memcpy(buf, tmp, plen); + buf += plen; + } + } + + break; + + case '%': + + /* "%l%" is a non-sensical format specifier. */ + ut_a(!is_long); + + len++; + + if (buf) { + *buf++ = '%'; + } + + break; + + default: + ut_error; + } + } + + /* For the NUL character. */ + len++; + + if (buf) { + *buf = '\0'; + } + + return(len); +} + +/******************************************************************** +A simple (s)printf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). */ +UNIV_INTERN +char* +mem_heap_printf( +/*============*/ + /* out: heap-allocated formatted string */ + mem_heap_t* heap, /* in: memory heap */ + const char* format, /* in: format string */ + ...) +{ + va_list ap; + char* str; + ulint len; + + /* Calculate length of string */ + len = 0; + va_start(ap, format); + len = mem_heap_printf_low(NULL, format, ap); + va_end(ap); + + /* Now create it for real. */ + str = mem_heap_alloc(heap, len); + va_start(ap, format); + mem_heap_printf_low(str, format, ap); + va_end(ap); + + return(str); +} + +/******************************************************************* +Creates a memory heap block where data can be allocated. */ +UNIV_INTERN +mem_block_t* +mem_heap_create_block( +/*==================*/ + /* out, own: memory heap block, NULL if + did not succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps) */ + mem_heap_t* heap, /* in: memory heap or NULL if first block + should be created */ + ulint n, /* in: number of bytes needed for user data */ + ulint type, /* in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ + const char* file_name,/* in: file name where created */ + ulint line) /* in: line where created */ +{ + buf_block_t* buf_block = NULL; + mem_block_t* block; + ulint len; + + ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER) + || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH)); + + if (heap && heap->magic_n != MEM_BLOCK_MAGIC_N) { + mem_analyze_corruption(heap); + } + + /* In dynamic allocation, calculate the size: block header + data. 
*/ + len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n); + + if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) { + + ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF); + + block = mem_area_alloc(&len, mem_comm_pool); + } else { + len = UNIV_PAGE_SIZE; + + if ((type & MEM_HEAP_BTR_SEARCH) && heap) { + /* We cannot allocate the block from the + buffer pool, but must get the free block from + the heap header free block field */ + + buf_block = heap->free_block; + heap->free_block = NULL; + + if (UNIV_UNLIKELY(!buf_block)) { + + return(NULL); + } + } else { + buf_block = buf_block_alloc(0); + } + + block = (mem_block_t*) buf_block->frame; + } + + ut_ad(block); + block->buf_block = buf_block; + block->magic_n = MEM_BLOCK_MAGIC_N; + ut_strlcpy_rev(block->file_name, file_name, sizeof(block->file_name)); + block->line = line; + +#ifdef MEM_PERIODIC_CHECK + mem_pool_mutex_enter(); + + if (!mem_block_list_inited) { + mem_block_list_inited = TRUE; + UT_LIST_INIT(mem_block_list); + } + + UT_LIST_ADD_LAST(mem_block_list, mem_block_list, block); + + mem_pool_mutex_exit(); +#endif + mem_block_set_len(block, len); + mem_block_set_type(block, type); + mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE); + mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE); + + block->free_block = NULL; + + ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len); + + return(block); +} + +/******************************************************************* +Adds a new block to a memory heap. */ +UNIV_INTERN +mem_block_t* +mem_heap_add_block( +/*===============*/ + /* out: created block, NULL if did not + succeed (only possible for + MEM_HEAP_BTR_SEARCH type heaps)*/ + mem_heap_t* heap, /* in: memory heap */ + ulint n) /* in: number of bytes user needs */ +{ + mem_block_t* block; + mem_block_t* new_block; + ulint new_size; + + ut_ad(mem_heap_check(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + /* We have to allocate a new block. The size is always at least + doubled until the standard size is reached. After that the size + stays the same, except in cases where the caller needs more space. */ + + new_size = 2 * mem_block_get_len(block); + + if (heap->type != MEM_HEAP_DYNAMIC) { + /* From the buffer pool we allocate buffer frames */ + ut_a(n <= MEM_MAX_ALLOC_IN_BUF); + + if (new_size > MEM_MAX_ALLOC_IN_BUF) { + new_size = MEM_MAX_ALLOC_IN_BUF; + } + } else if (new_size > MEM_BLOCK_STANDARD_SIZE) { + + new_size = MEM_BLOCK_STANDARD_SIZE; + } + + if (new_size < n) { + new_size = n; + } + + new_block = mem_heap_create_block(heap, new_size, heap->type, + heap->file_name, heap->line); + if (new_block == NULL) { + + return(NULL); + } + + /* Add the new block as the last block */ + + UT_LIST_INSERT_AFTER(list, heap->base, block, new_block); + + return(new_block); +} + +/********************************************************************** +Frees a block from a memory heap. 
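Depending on where the block
+came from, the memory is returned either to the common pool via
+mem_area_free() (MEM_HEAP_DYNAMIC heaps and blocks smaller than half a
+page) or to the buffer pool via buf_block_free().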
*/ +UNIV_INTERN +void +mem_heap_block_free( +/*================*/ + mem_heap_t* heap, /* in: heap */ + mem_block_t* block) /* in: block to free */ +{ + ulint type; + ulint len; + buf_block_t* buf_block; + + if (block->magic_n != MEM_BLOCK_MAGIC_N) { + mem_analyze_corruption(block); + } + + UT_LIST_REMOVE(list, heap->base, block); + +#ifdef MEM_PERIODIC_CHECK + mem_pool_mutex_enter(); + + UT_LIST_REMOVE(mem_block_list, mem_block_list, block); + + mem_pool_mutex_exit(); +#endif + type = heap->type; + len = block->len; + buf_block = block->buf_block; + block->magic_n = MEM_FREED_BLOCK_MAGIC_N; + +#ifdef UNIV_MEM_DEBUG + /* In the debug version we set the memory to a random combination + of hex 0xDE and 0xAD. */ + + mem_erase_buf((byte*)block, len); +#else /* UNIV_MEM_DEBUG */ + UNIV_MEM_ASSERT_AND_FREE(block, len); +#endif /* UNIV_MEM_DEBUG */ + + if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) { + + ut_ad(!buf_block); + mem_area_free(block, mem_comm_pool); + } else { + ut_ad(type & MEM_HEAP_BUFFER); + + buf_block_free(buf_block); + } +} + +/********************************************************************** +Frees the free_block field from a memory heap. */ +UNIV_INTERN +void +mem_heap_free_block_free( +/*=====================*/ + mem_heap_t* heap) /* in: heap */ +{ + if (UNIV_LIKELY_NULL(heap->free_block)) { + + buf_block_free(heap->free_block); + + heap->free_block = NULL; + } +} + +#ifdef MEM_PERIODIC_CHECK +/********************************************************************** +Goes through the list of all allocated mem blocks, checks their magic +numbers, and reports possible corruption. */ +UNIV_INTERN +void +mem_validate_all_blocks(void) +/*=========================*/ +{ + mem_block_t* block; + + mem_pool_mutex_enter(); + + block = UT_LIST_GET_FIRST(mem_block_list); + + while (block) { + if (block->magic_n != MEM_BLOCK_MAGIC_N) { + mem_analyze_corruption(block); + } + + block = UT_LIST_GET_NEXT(mem_block_list, block); + } + + mem_pool_mutex_exit(); +} +#endif diff --git a/storage/xtradb/mem/mem0pool.c b/storage/xtradb/mem/mem0pool.c new file mode 100644 index 00000000000..34de6b2a706 --- /dev/null +++ b/storage/xtradb/mem/mem0pool.c @@ -0,0 +1,704 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The lowest-level memory management + +Created 5/12/1997 Heikki Tuuri +*************************************************************************/ + +#include "mem0pool.h" +#ifdef UNIV_NONINL +#include "mem0pool.ic" +#endif + +#include "srv0srv.h" +#include "sync0sync.h" +#include "ut0mem.h" +#include "ut0lst.h" +#include "ut0byte.h" +#include "mem0mem.h" + +/* We would like to use also the buffer frames to allocate memory. This +would be desirable, because then the memory consumption of the database +would be fixed, and we might even lock the buffer pool to the main memory. +The problem here is that the buffer management routines can themselves call +memory allocation, while the buffer pool mutex is reserved. + +The main components of the memory consumption are: + +1. buffer pool, +2. parsed and optimized SQL statements, +3. data dictionary cache, +4. log buffer, +5. locks for each transaction, +6. hash table for the adaptive index, +7. state and buffers for each SQL query currently being executed, +8. session for each user, and +9. stack for each OS thread. + +Items 1 and 2 are managed by an LRU algorithm. Items 5 and 6 can potentially +consume very much memory. Items 7 and 8 should consume quite little memory, +and the OS should take care of item 9, which too should consume little memory. + +A solution to the memory management: + +1. the buffer pool size is set separately; +2. log buffer size is set separately; +3. the common pool size for all the other entries, except 8, is set separately. + +Problems: we may waste memory if the common pool is set too big. Another +problem is the locks, which may take very much space in big transactions. +Then the shared pool size should be set very big. We can allow locks to take +space from the buffer pool, but the SQL optimizer is then unaware of the +usable size of the buffer pool. We could also combine the objects in the +common pool and the buffers in the buffer pool into a single LRU list and +manage it uniformly, but this approach does not take into account the parsing +and other costs unique to SQL statements. + +The locks for a transaction can be seen as a part of the state of the +transaction. Hence, they should be stored in the common pool. We still +have the problem of a very big update transaction, for example, which +will set very many x-locks on rows, and the locks will consume a lot +of memory, say, half of the buffer pool size. + +Another problem is what to do if we are not able to malloc a requested +block of memory from the common pool. Then we can request memory from +the operating system. If it does not help, a system error results. + +Because 5 and 6 may potentially consume very much memory, we let them grow +into the buffer pool. We may let the locks of a transaction take frames +from the buffer pool, when the corresponding memory heap block has grown to +the size of a buffer frame. Similarly for the hash node cells of the locks, +and for the adaptive index. Thus, for each individual transaction, its locks +can occupy at most about the size of the buffer frame of memory in the common +pool, and after that its locks will grow into the buffer pool. 
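To illustrate the buddy
+algorithm implemented below (a sketch only, not part of the code): a
+request for 1000 bytes is served from the free list whose index is
+
+	n = ut_2_log(ut_max(1000 + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE));
+
+i.e. the smallest power of 2 that holds the payload plus the area header.
+If free_list[n] is empty, an area from free_list[n + 1] is split into two
+buddies of half the size: one satisfies the request, the other is left in
+free_list[n].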
*/ + +/* Mask used to extract the free bit from area->size */ +#define MEM_AREA_FREE 1 + +/* The smallest memory area total size */ +#define MEM_AREA_MIN_SIZE (2 * MEM_AREA_EXTRA_SIZE) + + +/* Data structure for a memory pool. The space is allocated using the buddy +algorithm, where free list i contains areas of size 2 to power i. */ +struct mem_pool_struct{ + byte* buf; /* memory pool */ + ulint size; /* memory common pool size */ + ulint reserved; /* amount of currently allocated + memory */ + mutex_t mutex; /* mutex protecting this struct */ + UT_LIST_BASE_NODE_T(mem_area_t) + free_list[64]; /* lists of free memory areas: an + area is put to the list whose number + is the 2-logarithm of the area size */ +}; + +/* The common memory pool */ +UNIV_INTERN mem_pool_t* mem_comm_pool = NULL; + +/* We use this counter to check that the mem pool mutex does not leak; +this is to track a strange assertion failure reported at +mysql@lists.mysql.com */ + +UNIV_INTERN ulint mem_n_threads_inside = 0; + +/************************************************************************ +Reserves the mem pool mutex. */ +UNIV_INTERN +void +mem_pool_mutex_enter(void) +/*======================*/ +{ + mutex_enter(&(mem_comm_pool->mutex)); +} + +/************************************************************************ +Releases the mem pool mutex. */ +UNIV_INTERN +void +mem_pool_mutex_exit(void) +/*=====================*/ +{ + mutex_exit(&(mem_comm_pool->mutex)); +} + +/************************************************************************ +Returns memory area size. */ +UNIV_INLINE +ulint +mem_area_get_size( +/*==============*/ + /* out: size */ + mem_area_t* area) /* in: area */ +{ + return(area->size_and_free & ~MEM_AREA_FREE); +} + +/************************************************************************ +Sets memory area size. */ +UNIV_INLINE +void +mem_area_set_size( +/*==============*/ + mem_area_t* area, /* in: area */ + ulint size) /* in: size */ +{ + area->size_and_free = (area->size_and_free & MEM_AREA_FREE) + | size; +} + +/************************************************************************ +Returns memory area free bit. */ +UNIV_INLINE +ibool +mem_area_get_free( +/*==============*/ + /* out: TRUE if free */ + mem_area_t* area) /* in: area */ +{ +#if TRUE != MEM_AREA_FREE +# error "TRUE != MEM_AREA_FREE" +#endif + return(area->size_and_free & MEM_AREA_FREE); +} + +/************************************************************************ +Sets memory area free bit. */ +UNIV_INLINE +void +mem_area_set_free( +/*==============*/ + mem_area_t* area, /* in: area */ + ibool free) /* in: free bit value */ +{ +#if TRUE != MEM_AREA_FREE +# error "TRUE != MEM_AREA_FREE" +#endif + area->size_and_free = (area->size_and_free & ~MEM_AREA_FREE) + | free; +} + +/************************************************************************ +Creates a memory pool. */ +UNIV_INTERN +mem_pool_t* +mem_pool_create( +/*============*/ + /* out: memory pool */ + ulint size) /* in: pool size in bytes */ +{ + mem_pool_t* pool; + mem_area_t* area; + ulint i; + ulint used; + + pool = ut_malloc(sizeof(mem_pool_t)); + + /* We do not set the memory to zero (FALSE) in the pool, + but only when allocated at a higher level in mem0mem.c. + This is to avoid masking useful Purify warnings. 
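Zeroing the pool here would
+make every byte look initialized to such memory checkers, hiding later
+reads of bytes that no caller ever wrote.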
*/ + + pool->buf = ut_malloc_low(size, FALSE, TRUE); + pool->size = size; + + mutex_create(&pool->mutex, SYNC_MEM_POOL); + + /* Initialize the free lists */ + + for (i = 0; i < 64; i++) { + + UT_LIST_INIT(pool->free_list[i]); + } + + used = 0; + + while (size - used >= MEM_AREA_MIN_SIZE) { + + i = ut_2_log(size - used); + + if (ut_2_exp(i) > size - used) { + + /* ut_2_log rounds upward */ + + i--; + } + + area = (mem_area_t*)(pool->buf + used); + + mem_area_set_size(area, ut_2_exp(i)); + mem_area_set_free(area, TRUE); + UNIV_MEM_FREE(MEM_AREA_EXTRA_SIZE + (byte*) area, + ut_2_exp(i) - MEM_AREA_EXTRA_SIZE); + + UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area); + + used = used + ut_2_exp(i); + } + + ut_ad(size >= used); + + pool->reserved = 0; + + return(pool); +} + +/************************************************************************ +Fills the specified free list. */ +static +ibool +mem_pool_fill_free_list( +/*====================*/ + /* out: TRUE if we were able to insert a + block to the free list */ + ulint i, /* in: free list index */ + mem_pool_t* pool) /* in: memory pool */ +{ + mem_area_t* area; + mem_area_t* area2; + ibool ret; + + ut_ad(mutex_own(&(pool->mutex))); + + if (UNIV_UNLIKELY(i >= 63)) { + /* We come here when we have run out of space in the + memory pool: */ + + return(FALSE); + } + + area = UT_LIST_GET_FIRST(pool->free_list[i + 1]); + + if (area == NULL) { + if (UT_LIST_GET_LEN(pool->free_list[i + 1]) > 0) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: mem pool free list %lu" + " length is %lu\n" + "InnoDB: though the list is empty!\n", + (ulong) i + 1, + (ulong) + UT_LIST_GET_LEN(pool->free_list[i + 1])); + } + + ret = mem_pool_fill_free_list(i + 1, pool); + + if (ret == FALSE) { + + return(FALSE); + } + + area = UT_LIST_GET_FIRST(pool->free_list[i + 1]); + } + + if (UNIV_UNLIKELY(UT_LIST_GET_LEN(pool->free_list[i + 1]) == 0)) { + mem_analyze_corruption(area); + + ut_error; + } + + UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area); + + area2 = (mem_area_t*)(((byte*)area) + ut_2_exp(i)); + UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE); + + mem_area_set_size(area2, ut_2_exp(i)); + mem_area_set_free(area2, TRUE); + + UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area2); + + mem_area_set_size(area, ut_2_exp(i)); + + UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area); + + return(TRUE); +} + +/************************************************************************ +Allocates memory from a pool. NOTE: This low-level function should only be +used in mem0mem.*! 
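A typical call sequence (a
+sketch of the intended use from mem0mem.c, not a complete example):
+
+	ulint	size = 100;
+	void*	p = mem_area_alloc(&size, mem_comm_pool);
+	... use up to size bytes at p; size now holds the rounded-up value ...
+	mem_area_free(p, mem_comm_pool);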
*/ +UNIV_INTERN +void* +mem_area_alloc( +/*===========*/ + /* out, own: allocated memory buffer */ + ulint* psize, /* in: requested size in bytes; for optimum + space usage, the size should be a power of 2 + minus MEM_AREA_EXTRA_SIZE; + out: allocated size in bytes (greater than + or equal to the requested size) */ + mem_pool_t* pool) /* in: memory pool */ +{ + mem_area_t* area; + ulint size; + ulint n; + ibool ret; + + /* If we are using os allocator just make a simple call + to malloc */ + if (UNIV_LIKELY(srv_use_sys_malloc)) { + return(malloc(*psize)); + } + + size = *psize; + n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE)); + + mutex_enter(&(pool->mutex)); + mem_n_threads_inside++; + + ut_a(mem_n_threads_inside == 1); + + area = UT_LIST_GET_FIRST(pool->free_list[n]); + + if (area == NULL) { + ret = mem_pool_fill_free_list(n, pool); + + if (ret == FALSE) { + /* Out of memory in memory pool: we try to allocate + from the operating system with the regular malloc: */ + + mem_n_threads_inside--; + mutex_exit(&(pool->mutex)); + + return(ut_malloc(size)); + } + + area = UT_LIST_GET_FIRST(pool->free_list[n]); + } + + if (!mem_area_get_free(area)) { + fprintf(stderr, + "InnoDB: Error: Removing element from mem pool" + " free list %lu though the\n" + "InnoDB: element is not marked free!\n", + (ulong) n); + + mem_analyze_corruption(area); + + /* Try to analyze a strange assertion failure reported at + mysql@lists.mysql.com where the free bit IS 1 in the + hex dump above */ + + if (mem_area_get_free(area)) { + fprintf(stderr, + "InnoDB: Probably a race condition" + " because now the area is marked free!\n"); + } + + ut_error; + } + + if (UT_LIST_GET_LEN(pool->free_list[n]) == 0) { + fprintf(stderr, + "InnoDB: Error: Removing element from mem pool" + " free list %lu\n" + "InnoDB: though the list length is 0!\n", + (ulong) n); + mem_analyze_corruption(area); + + ut_error; + } + + ut_ad(mem_area_get_size(area) == ut_2_exp(n)); + + mem_area_set_free(area, FALSE); + + UT_LIST_REMOVE(free_list, pool->free_list[n], area); + + pool->reserved += mem_area_get_size(area); + + mem_n_threads_inside--; + mutex_exit(&(pool->mutex)); + + ut_ad(mem_pool_validate(pool)); + + *psize = ut_2_exp(n) - MEM_AREA_EXTRA_SIZE; + UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*)area, *psize); + + return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*)area))); +} + +/************************************************************************ +Gets the buddy of an area, if it exists in pool. */ +UNIV_INLINE +mem_area_t* +mem_area_get_buddy( +/*===============*/ + /* out: the buddy, NULL if no buddy in pool */ + mem_area_t* area, /* in: memory area */ + ulint size, /* in: memory area size */ + mem_pool_t* pool) /* in: memory pool */ +{ + mem_area_t* buddy; + + ut_ad(size != 0); + + if (((((byte*)area) - pool->buf) % (2 * size)) == 0) { + + /* The buddy is in a higher address */ + + buddy = (mem_area_t*)(((byte*)area) + size); + + if ((((byte*)buddy) - pool->buf) + size > pool->size) { + + /* The buddy is not wholly contained in the pool: + there is no buddy */ + + buddy = NULL; + } + } else { + /* The buddy is in a lower address; NOTE that area cannot + be at the pool lower end, because then we would end up to + the upper branch in this if-clause: the remainder would be + 0 */ + + buddy = (mem_area_t*)(((byte*)area) - size); + } + + return(buddy); +} + +/************************************************************************ +Frees memory to a pool. 
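If the buddy of the freed area
+is itself free and of the same size, the two are merged into an area of
+double the size and the free recurses on the merged area; this is what
+keeps large contiguous areas available in the pool.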
*/ +UNIV_INTERN +void +mem_area_free( +/*==========*/ + void* ptr, /* in, own: pointer to allocated memory + buffer */ + mem_pool_t* pool) /* in: memory pool */ +{ + mem_area_t* area; + mem_area_t* buddy; + void* new_ptr; + ulint size; + ulint n; + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + free(ptr); + + return; + } + + /* It may be that the area was really allocated from the OS with + regular malloc: check if ptr points within our memory pool */ + + if ((byte*)ptr < pool->buf || (byte*)ptr >= pool->buf + pool->size) { + ut_free(ptr); + + return; + } + + area = (mem_area_t*) (((byte*)ptr) - MEM_AREA_EXTRA_SIZE); + + if (mem_area_get_free(area)) { + fprintf(stderr, + "InnoDB: Error: Freeing element to mem pool" + " free list though the\n" + "InnoDB: element is marked free!\n"); + + mem_analyze_corruption(area); + ut_error; + } + + size = mem_area_get_size(area); + UNIV_MEM_FREE(ptr, size - MEM_AREA_EXTRA_SIZE); + + if (size == 0) { + fprintf(stderr, + "InnoDB: Error: Mem area size is 0. Possibly a" + " memory overrun of the\n" + "InnoDB: previous allocated area!\n"); + + mem_analyze_corruption(area); + ut_error; + } + +#ifdef UNIV_LIGHT_MEM_DEBUG + if (((byte*)area) + size < pool->buf + pool->size) { + + ulint next_size; + + next_size = mem_area_get_size( + (mem_area_t*)(((byte*)area) + size)); + if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) { + fprintf(stderr, + "InnoDB: Error: Memory area size %lu," + " next area size %lu not a power of 2!\n" + "InnoDB: Possibly a memory overrun of" + " the buffer being freed here.\n", + (ulong) size, (ulong) next_size); + mem_analyze_corruption(area); + + ut_error; + } + } +#endif + buddy = mem_area_get_buddy(area, size, pool); + + n = ut_2_log(size); + + mutex_enter(&(pool->mutex)); + mem_n_threads_inside++; + + ut_a(mem_n_threads_inside == 1); + + if (buddy && mem_area_get_free(buddy) + && (size == mem_area_get_size(buddy))) { + + /* The buddy is in a free list */ + + if ((byte*)buddy < (byte*)area) { + new_ptr = ((byte*)buddy) + MEM_AREA_EXTRA_SIZE; + + mem_area_set_size(buddy, 2 * size); + mem_area_set_free(buddy, FALSE); + } else { + new_ptr = ptr; + + mem_area_set_size(area, 2 * size); + } + + /* Remove the buddy from its free list and merge it to area */ + + UT_LIST_REMOVE(free_list, pool->free_list[n], buddy); + + pool->reserved += ut_2_exp(n); + + mem_n_threads_inside--; + mutex_exit(&(pool->mutex)); + + mem_area_free(new_ptr, pool); + + return; + } else { + UT_LIST_ADD_FIRST(free_list, pool->free_list[n], area); + + mem_area_set_free(area, TRUE); + + ut_ad(pool->reserved >= size); + + pool->reserved -= size; + } + + mem_n_threads_inside--; + mutex_exit(&(pool->mutex)); + + ut_ad(mem_pool_validate(pool)); +} + +/************************************************************************ +Validates a memory pool. 
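Walks every free list under the
+pool mutex, checks the free flag, the size and the buddy of each area,
+and asserts that the free and reserved byte counts add up to the pool
+size.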
*/ +UNIV_INTERN +ibool +mem_pool_validate( +/*==============*/ + /* out: TRUE if ok */ + mem_pool_t* pool) /* in: memory pool */ +{ + mem_area_t* area; + mem_area_t* buddy; + ulint free; + ulint i; + + mutex_enter(&(pool->mutex)); + + free = 0; + + for (i = 0; i < 64; i++) { + + UT_LIST_VALIDATE(free_list, mem_area_t, pool->free_list[i]); + + area = UT_LIST_GET_FIRST(pool->free_list[i]); + + while (area != NULL) { + ut_a(mem_area_get_free(area)); + ut_a(mem_area_get_size(area) == ut_2_exp(i)); + + buddy = mem_area_get_buddy(area, ut_2_exp(i), pool); + + ut_a(!buddy || !mem_area_get_free(buddy) + || (ut_2_exp(i) != mem_area_get_size(buddy))); + + area = UT_LIST_GET_NEXT(free_list, area); + + free += ut_2_exp(i); + } + } + + ut_a(free + pool->reserved == pool->size); + + mutex_exit(&(pool->mutex)); + + return(TRUE); +} + +/************************************************************************ +Prints info of a memory pool. */ +UNIV_INTERN +void +mem_pool_print_info( +/*================*/ + FILE* outfile,/* in: output file to write to */ + mem_pool_t* pool) /* in: memory pool */ +{ + ulint i; + + mem_pool_validate(pool); + + fprintf(outfile, "INFO OF A MEMORY POOL\n"); + + mutex_enter(&(pool->mutex)); + + for (i = 0; i < 64; i++) { + if (UT_LIST_GET_LEN(pool->free_list[i]) > 0) { + + fprintf(outfile, + "Free list length %lu for" + " blocks of size %lu\n", + (ulong) UT_LIST_GET_LEN(pool->free_list[i]), + (ulong) ut_2_exp(i)); + } + } + + fprintf(outfile, "Pool size %lu, reserved %lu.\n", (ulong) pool->size, + (ulong) pool->reserved); + mutex_exit(&(pool->mutex)); +} + +/************************************************************************ +Returns the amount of reserved memory. */ +UNIV_INTERN +ulint +mem_pool_get_reserved( +/*==================*/ + /* out: reserved memory in bytes */ + mem_pool_t* pool) /* in: memory pool */ +{ + ulint reserved; + + mutex_enter(&(pool->mutex)); + + reserved = pool->reserved; + + mutex_exit(&(pool->mutex)); + + return(reserved); +} diff --git a/storage/xtradb/mtr/mtr0log.c b/storage/xtradb/mtr/mtr0log.c new file mode 100644 index 00000000000..0fe66d08c05 --- /dev/null +++ b/storage/xtradb/mtr/mtr0log.c @@ -0,0 +1,609 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mini-transaction log routines + +Created 12/7/1995 Heikki Tuuri +*******************************************************/ + +#include "mtr0log.h" + +#ifdef UNIV_NONINL +#include "mtr0log.ic" +#endif + +#include "buf0buf.h" +#include "dict0boot.h" +#include "log0recv.h" +#include "page0page.h" + +/************************************************************ +Catenates n bytes to the mtr log. 
*/ +UNIV_INTERN +void +mlog_catenate_string( +/*=================*/ + mtr_t* mtr, /* in: mtr */ + const byte* str, /* in: string to write */ + ulint len) /* in: string length */ +{ + dyn_array_t* mlog; + + if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) { + + return; + } + + mlog = &(mtr->log); + + dyn_push_string(mlog, str, len); +} + +/************************************************************ +Writes the initial part of a log record consisting of one-byte item +type and four-byte space and page numbers. Also pushes info +to the mtr memo that a buffer page has been modified. */ +UNIV_INTERN +void +mlog_write_initial_log_record( +/*==========================*/ + const byte* ptr, /* in: pointer to (inside) a buffer + frame holding the file page where + modification is made */ + byte type, /* in: log item type: MLOG_1BYTE, ... */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + byte* log_ptr; + + ut_ad(type <= MLOG_BIGGEST_TYPE); + ut_ad(type > MLOG_8BYTES); + + log_ptr = mlog_open(mtr, 11); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr); + + mlog_close(mtr, log_ptr); +} + +/************************************************************ +Parses an initial log record written by mlog_write_initial_log_record. */ +UNIV_INTERN +byte* +mlog_parse_initial_log_record( +/*==========================*/ + /* out: parsed record end, NULL if not a complete + record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + byte* type, /* out: log record type: MLOG_1BYTE, ... */ + ulint* space, /* out: space id */ + ulint* page_no)/* out: page number */ +{ + if (end_ptr < ptr + 1) { + + return(NULL); + } + + *type = (byte)((ulint)*ptr & ~MLOG_SINGLE_REC_FLAG); + ut_ad(*type <= MLOG_BIGGEST_TYPE); + + ptr++; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + ptr = mach_parse_compressed(ptr, end_ptr, space); + + if (ptr == NULL) { + + return(NULL); + } + + ptr = mach_parse_compressed(ptr, end_ptr, page_no); + + return(ptr); +} + +/************************************************************ +Parses a log record written by mlog_write_ulint or mlog_write_dulint. */ +UNIV_INTERN +byte* +mlog_parse_nbytes( +/*==============*/ + /* out: parsed record end, NULL if not a complete + record or a corrupt record */ + ulint type, /* in: log record type: MLOG_1BYTE, ... 
*/ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + byte* page, /* in: page where to apply the log record, or NULL */ + void* page_zip)/* in/out: compressed page, or NULL */ +{ + ulint offset; + ulint val; + dulint dval; + + ut_a(type <= MLOG_8BYTES); + ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX); + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + if (offset >= UNIV_PAGE_SIZE) { + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (type == MLOG_8BYTES) { + ptr = mach_dulint_parse_compressed(ptr, end_ptr, &dval); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_8 + (((page_zip_des_t*) page_zip)->data + + offset, dval); + } + mach_write_to_8(page + offset, dval); + } + + return(ptr); + } + + ptr = mach_parse_compressed(ptr, end_ptr, &val); + + if (ptr == NULL) { + + return(NULL); + } + + switch (type) { + case MLOG_1BYTE: + if (UNIV_UNLIKELY(val > 0xFFUL)) { + goto corrupt; + } + if (page) { + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_1 + (((page_zip_des_t*) page_zip)->data + + offset, val); + } + mach_write_to_1(page + offset, val); + } + break; + case MLOG_2BYTES: + if (UNIV_UNLIKELY(val > 0xFFFFUL)) { + goto corrupt; + } + if (page) { + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_2 + (((page_zip_des_t*) page_zip)->data + + offset, val); + } + mach_write_to_2(page + offset, val); + } + break; + case MLOG_4BYTES: + if (page) { + if (UNIV_LIKELY_NULL(page_zip)) { + mach_write_to_4 + (((page_zip_des_t*) page_zip)->data + + offset, val); + } + mach_write_to_4(page + offset, val); + } + break; + default: + corrupt: + recv_sys->found_corrupt_log = TRUE; + ptr = NULL; + } + + return(ptr); +} + +/************************************************************ +Writes 1 - 4 bytes to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. */ +UNIV_INTERN +void +mlog_write_ulint( +/*=============*/ + byte* ptr, /* in: pointer where to write */ + ulint val, /* in: value to write */ + byte type, /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + byte* log_ptr; + + switch (type) { + case MLOG_1BYTE: + mach_write_to_1(ptr, val); + break; + case MLOG_2BYTES: + mach_write_to_2(ptr, val); + break; + case MLOG_4BYTES: + mach_write_to_4(ptr, val); + break; + default: + ut_error; + } + + log_ptr = mlog_open(mtr, 11 + 2 + 5); + + /* If no logging is requested, we may return now */ + if (log_ptr == NULL) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr); + + mach_write_to_2(log_ptr, page_offset(ptr)); + log_ptr += 2; + + log_ptr += mach_write_compressed(log_ptr, val); + + mlog_close(mtr, log_ptr); +} + +/************************************************************ +Writes 8 bytes to a file page buffered in the buffer pool. +Writes the corresponding log record to the mini-transaction log. 
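The record occupies at most
+11 + 2 + 9 bytes: the initial log record (type byte plus compressed
+space id and page number), the 2-byte page offset, and the compressed
+8-byte value; this is exactly the size passed to mlog_open() below.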
*/
+UNIV_INTERN
+void
+mlog_write_dulint(
+/*==============*/
+	byte*	ptr,	/* in: pointer where to write */
+	dulint	val,	/* in: value to write */
+	mtr_t*	mtr)	/* in: mini-transaction handle */
+{
+	byte*	log_ptr;
+
+	ut_ad(ptr && mtr);
+
+	mach_write_to_8(ptr, val);
+
+	log_ptr = mlog_open(mtr, 11 + 2 + 9);
+
+	/* If no logging is requested, we may return now */
+	if (log_ptr == NULL) {
+
+		return;
+	}
+
+	log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_8BYTES,
+						     log_ptr, mtr);
+
+	mach_write_to_2(log_ptr, page_offset(ptr));
+	log_ptr += 2;
+
+	log_ptr += mach_dulint_write_compressed(log_ptr, val);
+
+	mlog_close(mtr, log_ptr);
+}
+
+/************************************************************
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+	byte*		ptr,	/* in: pointer where to write */
+	const byte*	str,	/* in: string to write */
+	ulint		len,	/* in: string length */
+	mtr_t*		mtr)	/* in: mini-transaction handle */
+{
+	ut_ad(ptr && mtr);
+	ut_a(len < UNIV_PAGE_SIZE);
+
+	memcpy(ptr, str, len);
+
+	mlog_log_string(ptr, len, mtr);
+}
+
+/************************************************************
+Logs a write of a string to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+	byte*	ptr,	/* in: pointer written to */
+	ulint	len,	/* in: string length */
+	mtr_t*	mtr)	/* in: mini-transaction handle */
+{
+	byte*	log_ptr;
+
+	ut_ad(ptr && mtr);
+	ut_ad(len <= UNIV_PAGE_SIZE);
+
+	log_ptr = mlog_open(mtr, 30);
+
+	/* If no logging is requested, we may return now */
+	if (log_ptr == NULL) {
+
+		return;
+	}
+
+	log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_WRITE_STRING,
+						     log_ptr, mtr);
+	mach_write_to_2(log_ptr, page_offset(ptr));
+	log_ptr += 2;
+
+	mach_write_to_2(log_ptr, len);
+	log_ptr += 2;
+
+	mlog_close(mtr, log_ptr);
+
+	mlog_catenate_string(mtr, ptr, len);
+}
+
+/************************************************************
+Parses a log record written by mlog_write_string. */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+			/* out: parsed record end, NULL if not a complete
+			record */
+	byte*	ptr,	/* in: buffer */
+	byte*	end_ptr,/* in: buffer end */
+	byte*	page,	/* in: page where to apply the log record, or NULL */
+	void*	page_zip)/* in/out: compressed page, or NULL */
+{
+	ulint	offset;
+	ulint	len;
+
+	ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
+
+	if (end_ptr < ptr + 4) {
+
+		return(NULL);
+	}
+
+	offset = mach_read_from_2(ptr);
+	ptr += 2;
+	len = mach_read_from_2(ptr);
+	ptr += 2;
+
+	if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+	    || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) {
+		recv_sys->found_corrupt_log = TRUE;
+
+		return(NULL);
+	}
+
+	if (end_ptr < ptr + len) {
+
+		return(NULL);
+	}
+
+	if (page) {
+		if (UNIV_LIKELY_NULL(page_zip)) {
+			memcpy(((page_zip_des_t*) page_zip)->data
+			       + offset, ptr, len);
+		}
+		memcpy(page + offset, ptr, len);
+	}
+
+	return(ptr + len);
+}
+
+/************************************************************
+Opens a buffer for mlog, writes the initial log record and,
+if needed, the field lengths of an index.
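In the compact record format
+the field lengths may not all fit in one dynamic-array block, so they
+are written in chunks of at most DYN_ARRAY_DATA_SIZE bytes, closing and
+reopening the mlog buffer between chunks.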
*/ +UNIV_INTERN +byte* +mlog_open_and_write_index( +/*======================*/ + /* out: buffer, NULL if log mode + MTR_LOG_NONE */ + mtr_t* mtr, /* in: mtr */ + byte* rec, /* in: index record or page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: log item type */ + ulint size) /* in: requested buffer size in bytes + (if 0, calls mlog_close() and returns NULL) */ +{ + byte* log_ptr; + const byte* log_start; + const byte* log_end; + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + if (!page_rec_is_comp(rec)) { + log_start = log_ptr = mlog_open(mtr, 11 + size); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + log_end = log_ptr + 11 + size; + } else { + ulint i; + ulint n = dict_index_get_n_fields(index); + /* total size needed */ + ulint total = 11 + size + (n + 2) * 2; + ulint alloc = total; + /* allocate at most DYN_ARRAY_DATA_SIZE at a time */ + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + log_ptr = mlog_write_initial_log_record_fast(rec, type, + log_ptr, mtr); + mach_write_to_2(log_ptr, n); + log_ptr += 2; + mach_write_to_2(log_ptr, + dict_index_get_n_unique_in_tree(index)); + log_ptr += 2; + for (i = 0; i < n; i++) { + dict_field_t* field; + const dict_col_t* col; + ulint len; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + len = field->fixed_len; + ut_ad(len < 0x7fff); + if (len == 0 + && (col->len > 255 || col->mtype == DATA_BLOB)) { + /* variable-length field + with maximum length > 255 */ + len = 0x7fff; + } + if (col->prtype & DATA_NOT_NULL) { + len |= 0x8000; + } + if (log_ptr + 2 > log_end) { + mlog_close(mtr, log_ptr); + ut_a(total > (ulint) (log_ptr - log_start)); + total -= log_ptr - log_start; + alloc = total; + if (alloc > DYN_ARRAY_DATA_SIZE) { + alloc = DYN_ARRAY_DATA_SIZE; + } + log_start = log_ptr = mlog_open(mtr, alloc); + if (!log_ptr) { + return(NULL); /* logging is disabled */ + } + log_end = log_ptr + alloc; + } + mach_write_to_2(log_ptr, len); + log_ptr += 2; + } + } + if (size == 0) { + mlog_close(mtr, log_ptr); + log_ptr = NULL; + } else if (log_ptr + size > log_end) { + mlog_close(mtr, log_ptr); + log_ptr = mlog_open(mtr, size); + } + return(log_ptr); +} + +/************************************************************ +Parses a log record written by mlog_open_and_write_index. */ +UNIV_INTERN +byte* +mlog_parse_index( +/*=============*/ + /* out: parsed record end, + NULL if not a complete record */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + /* out: new value of log_ptr */ + ibool comp, /* in: TRUE=compact record format */ + dict_index_t** index) /* out, own: dummy index */ +{ + ulint i, n, n_uniq; + dict_table_t* table; + dict_index_t* ind; + + ut_ad(comp == FALSE || comp == TRUE); + + if (comp) { + if (end_ptr < ptr + 4) { + return(NULL); + } + n = mach_read_from_2(ptr); + ptr += 2; + n_uniq = mach_read_from_2(ptr); + ptr += 2; + ut_ad(n_uniq <= n); + if (end_ptr < ptr + n * 2) { + return(NULL); + } + } else { + n = n_uniq = 1; + } + table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n, + comp ? 
DICT_TF_COMPACT : 0); + ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY", + DICT_HDR_SPACE, 0, n); + ind->table = table; + ind->n_uniq = (unsigned int) n_uniq; + if (n_uniq != n) { + ut_a(n_uniq + DATA_ROLL_PTR <= n); + ind->type = DICT_CLUSTERED; + } + if (comp) { + for (i = 0; i < n; i++) { + ulint len = mach_read_from_2(ptr); + ptr += 2; + /* The high-order bit of len is the NOT NULL flag; + the rest is 0 or 0x7fff for variable-length fields, + and 1..0x7ffe for fixed-length fields. */ + dict_mem_table_add_col( + table, NULL, NULL, + ((len + 1) & 0x7fff) <= 1 + ? DATA_BINARY : DATA_FIXBINARY, + len & 0x8000 ? DATA_NOT_NULL : 0, + len & 0x7fff); + + dict_index_add_col(ind, table, + dict_table_get_nth_col(table, i), + 0); + } + dict_table_add_system_columns(table, table->heap); + if (n_uniq != n) { + /* Identify DB_TRX_ID and DB_ROLL_PTR in the index. */ + ut_a(DATA_TRX_ID_LEN + == dict_index_get_nth_col(ind, DATA_TRX_ID - 1 + + n_uniq)->len); + ut_a(DATA_ROLL_PTR_LEN + == dict_index_get_nth_col(ind, DATA_ROLL_PTR - 1 + + n_uniq)->len); + ind->fields[DATA_TRX_ID - 1 + n_uniq].col + = &table->cols[n + DATA_TRX_ID]; + ind->fields[DATA_ROLL_PTR - 1 + n_uniq].col + = &table->cols[n + DATA_ROLL_PTR]; + } + } + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + ind->cached = TRUE; + *index = ind; + return(ptr); +} diff --git a/storage/xtradb/mtr/mtr0mtr.c b/storage/xtradb/mtr/mtr0mtr.c new file mode 100644 index 00000000000..881751b560e --- /dev/null +++ b/storage/xtradb/mtr/mtr0mtr.c @@ -0,0 +1,345 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mini-transaction buffer + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#include "mtr0mtr.h" + +#ifdef UNIV_NONINL +#include "mtr0mtr.ic" +#endif + +#include "buf0buf.h" +#include "page0types.h" +#include "mtr0log.h" +#include "log0log.h" + +/********************************************************************* +Releases the item in the slot given. 
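Buffer-fix and page-latch slots
+(types up to MTR_MEMO_BUF_FIX) release the page, MTR_MEMO_S_LOCK and
+MTR_MEMO_X_LOCK slots unlock the corresponding rw-lock, and
+MTR_MEMO_MODIFY slots carry no latch and need only debug checking.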
*/ +UNIV_INLINE +void +mtr_memo_slot_release( +/*==================*/ + mtr_t* mtr, /* in: mtr */ + mtr_memo_slot_t* slot) /* in: memo slot */ +{ + void* object; + ulint type; + + ut_ad(mtr && slot); + + object = slot->object; + type = slot->type; + + if (UNIV_LIKELY(object != NULL)) { + if (type <= MTR_MEMO_BUF_FIX) { + buf_page_release((buf_block_t*)object, type, mtr); + } else if (type == MTR_MEMO_S_LOCK) { + rw_lock_s_unlock((rw_lock_t*)object); +#ifdef UNIV_DEBUG + } else if (type != MTR_MEMO_X_LOCK) { + ut_ad(type == MTR_MEMO_MODIFY); + ut_ad(mtr_memo_contains(mtr, object, + MTR_MEMO_PAGE_X_FIX)); +#endif /* UNIV_DEBUG */ + } else { + rw_lock_x_unlock((rw_lock_t*)object); + } + } + + slot->object = NULL; +} + +/************************************************************** +Releases the mlocks and other objects stored in an mtr memo. They are released +in the order opposite to which they were pushed to the memo. NOTE! It is +essential that the x-rw-lock on a modified buffer page is not released before +buf_page_note_modification is called for that page! Otherwise, some thread +might race to modify it, and the flush list sort order on lsn would be +destroyed. */ +UNIV_INLINE +void +mtr_memo_pop_all( +/*=============*/ + mtr_t* mtr) /* in: mtr */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in + commit */ + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + + while (offset > 0) { + offset -= sizeof(mtr_memo_slot_t); + slot = dyn_array_get_element(memo, offset); + + mtr_memo_slot_release(mtr, slot); + } +} + +/**************************************************************** +Writes the contents of a mini-transaction log, if any, to the database log. */ +static +void +mtr_log_reserve_and_write( +/*======================*/ + mtr_t* mtr) /* in: mtr */ +{ + dyn_array_t* mlog; + dyn_block_t* block; + ulint data_size; + ibool success; + byte* first_data; + + ut_ad(mtr); + + mlog = &(mtr->log); + + first_data = dyn_block_get_data(mlog); + + if (mtr->n_log_recs > 1) { + mlog_catenate_ulint(mtr, MLOG_MULTI_REC_END, MLOG_1BYTE); + } else { + *first_data = (byte)((ulint)*first_data + | MLOG_SINGLE_REC_FLAG); + } + + if (mlog->heap == NULL) { + mtr->end_lsn = log_reserve_and_write_fast( + first_data, dyn_block_get_used(mlog), + &(mtr->start_lsn), &success); + if (success) { + + return; + } + } + + data_size = dyn_array_get_data_size(mlog); + + /* Open the database log for log_write_low */ + mtr->start_lsn = log_reserve_and_open(data_size); + + if (mtr->log_mode == MTR_LOG_ALL) { + + block = mlog; + + while (block != NULL) { + log_write_low(dyn_block_get_data(block), + dyn_block_get_used(block)); + block = dyn_array_get_next_block(mlog, block); + } + } else { + ut_ad(mtr->log_mode == MTR_LOG_NONE); + /* Do nothing */ + } + + mtr->end_lsn = log_close(); +} + +/******************************************************************* +Commits a mini-transaction. 
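The intended usage pattern (a
+sketch; the declarations are in mtr0mtr.h) is:
+
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+	... latch pages and modify them through mlog_write_ulint()
+	and friends ...
+	mtr_commit(&mtr);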
*/ +UNIV_INTERN +void +mtr_commit( +/*=======*/ + mtr_t* mtr) /* in: mini-transaction */ +{ + ibool write_log; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + ut_d(mtr->state = MTR_COMMITTING); + + write_log = mtr->modifications && mtr->n_log_recs; + + if (write_log) { + mtr_log_reserve_and_write(mtr); + } + + /* We first update the modification info to buffer pages, and only + after that release the log mutex: this guarantees that when the log + mutex is free, all buffer pages contain an up-to-date info of their + modifications. This fact is used in making a checkpoint when we look + at the oldest modification of any page in the buffer pool. It is also + required when we insert modified buffer pages in to the flush list + which must be sorted on oldest_modification. */ + + mtr_memo_pop_all(mtr); + + if (write_log) { + log_release(); + } + + ut_d(mtr->state = MTR_COMMITTED); + dyn_array_free(&(mtr->memo)); + dyn_array_free(&(mtr->log)); +} + +/************************************************************** +Releases the latches stored in an mtr memo down to a savepoint. +NOTE! The mtr must not have made changes to buffer pages after the +savepoint, as these can be handled only by mtr_commit. */ +UNIV_INTERN +void +mtr_rollback_to_savepoint( +/*======================*/ + mtr_t* mtr, /* in: mtr */ + ulint savepoint) /* in: savepoint */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + ut_ad(offset >= savepoint); + + while (offset > savepoint) { + offset -= sizeof(mtr_memo_slot_t); + + slot = dyn_array_get_element(memo, offset); + + ut_ad(slot->type != MTR_MEMO_MODIFY); + mtr_memo_slot_release(mtr, slot); + } +} + +/******************************************************* +Releases an object in the memo stack. */ +UNIV_INTERN +void +mtr_memo_release( +/*=============*/ + mtr_t* mtr, /* in: mtr */ + void* object, /* in: object */ + ulint type) /* in: object type: MTR_MEMO_S_LOCK, ... */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_ACTIVE); + + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + + while (offset > 0) { + offset -= sizeof(mtr_memo_slot_t); + + slot = dyn_array_get_element(memo, offset); + + if ((object == slot->object) && (type == slot->type)) { + + mtr_memo_slot_release(mtr, slot); + + break; + } + } +} + +/************************************************************ +Reads 1 - 4 bytes from a file page buffered in the buffer pool. */ +UNIV_INTERN +ulint +mtr_read_ulint( +/*===========*/ + /* out: value read */ + const byte* ptr, /* in: pointer from where to read */ + ulint type, /* in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */ + mtr_t* mtr __attribute__((unused))) + /* in: mini-transaction handle */ +{ + ut_ad(mtr->state == MTR_ACTIVE); + ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX)); + if (type == MLOG_1BYTE) { + return(mach_read_from_1(ptr)); + } else if (type == MLOG_2BYTES) { + return(mach_read_from_2(ptr)); + } else { + ut_ad(type == MLOG_4BYTES); + return(mach_read_from_4(ptr)); + } +} + +/************************************************************ +Reads 8 bytes from a file page buffered in the buffer pool. 
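The caller must hold an s- or
+x-latch on the page; the debug assertions below verify this through the
+mtr memo.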
*/
+UNIV_INTERN
+dulint
+mtr_read_dulint(
+/*============*/
+				/* out: value read */
+	const byte*	ptr,	/* in: pointer from where to read */
+	mtr_t*		mtr __attribute__((unused)))
+				/* in: mini-transaction handle */
+{
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+	return(mach_read_from_8(ptr));
+}
+
+#ifdef UNIV_DEBUG
+/**************************************************************
+Checks if memo contains the given page. */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+				/* out: TRUE if contains */
+	mtr_t*		mtr,	/* in: mtr */
+	const byte*	ptr,	/* in: pointer to buffer frame */
+	ulint		type)	/* in: type of object */
+{
+	return(mtr_memo_contains(mtr, buf_block_align(ptr), type));
+}
+
+/*************************************************************
+Prints info of an mtr handle. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+	mtr_t*	mtr)	/* in: mtr */
+{
+	fprintf(stderr,
+		"Mini-transaction handle: memo size %lu bytes"
+		" log size %lu bytes\n",
+		(ulong) dyn_array_get_data_size(&(mtr->memo)),
+		(ulong) dyn_array_get_data_size(&(mtr->log)));
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c
new file mode 100644
index 00000000000..2d130899622
--- /dev/null
+++ b/storage/xtradb/os/os0file.c
@@ -0,0 +1,4248 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "os0sync.h"
+#include "os0thread.h"
+#include "ut0mem.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+
+#if defined(UNIV_HOTBACKUP) && defined(__WIN__)
+/* Add includes for the _stat() call to compile on Windows */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#endif /* UNIV_HOTBACKUP */
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef __WIN__
+UNIV_INTERN ulint	os_innodb_umask
+			= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+UNIV_INTERN ulint	os_innodb_umask	= 0;
+#endif
+
+#ifdef UNIV_DO_FLUSH
+/* If the following is set to TRUE, we do not call os_file_flush in every
+os_file_write. We can set this TRUE when the doublewrite buffer is used. */
+UNIV_INTERN ibool	os_do_not_call_flush_at_each_write	= FALSE;
+#else
+/* We do not call os_file_flush in every os_file_write.
*/ +#endif /* UNIV_DO_FLUSH */ + +/* We use these mutexes to protect lseek + file i/o operation, if the +OS does not provide an atomic pread or pwrite, or similar */ +#define OS_FILE_N_SEEK_MUTEXES 16 +UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; + +/* In simulated aio, merge at most this many consecutive i/os */ +#define OS_AIO_MERGE_N_CONSECUTIVE 64 + +/* If this flag is TRUE, then we will use the native aio of the +OS (provided we compiled Innobase with it in), otherwise we will +use simulated aio we build below with threads */ + +UNIV_INTERN ibool os_aio_use_native_aio = FALSE; + +UNIV_INTERN ibool os_aio_print_debug = FALSE; + +/* The aio array slot structure */ +typedef struct os_aio_slot_struct os_aio_slot_t; + +struct os_aio_slot_struct{ + ibool is_read; /* TRUE if a read operation */ + ulint pos; /* index of the slot in the aio + array */ + ibool reserved; /* TRUE if this slot is reserved */ + time_t reservation_time;/* time when reserved */ + ulint len; /* length of the block to read or + write */ + byte* buf; /* buffer used in i/o */ + ulint type; /* OS_FILE_READ or OS_FILE_WRITE */ + ulint offset; /* 32 low bits of file offset in + bytes */ + ulint offset_high; /* 32 high bits of file offset */ + os_file_t file; /* file where to read or write */ + const char* name; /* file name or path */ + ibool io_already_done;/* used only in simulated aio: + TRUE if the physical i/o already + made and only the slot message + needs to be passed to the caller + of os_aio_simulated_handle */ + fil_node_t* message1; /* message which is given by the */ + void* message2; /* the requester of an aio operation + and which can be used to identify + which pending aio operation was + completed */ +#ifdef WIN_ASYNC_IO + os_event_t event; /* event object we need in the + OVERLAPPED struct */ + OVERLAPPED control; /* Windows control block for the + aio request */ +#endif +}; + +/* The aio array structure */ +typedef struct os_aio_array_struct os_aio_array_t; + +struct os_aio_array_struct{ + os_mutex_t mutex; /* the mutex protecting the aio array */ + os_event_t not_full; /* The event which is set to the signaled + state when there is space in the aio + outside the ibuf segment */ + os_event_t is_empty; /* The event which is set to the signaled + state when there are no pending i/os + in this array */ + ulint n_slots; /* Total number of slots in the aio array. + This must be divisible by n_threads. */ + ulint n_segments;/* Number of segments in the aio array of + pending aio requests. A thread can wait + separately for any one of the segments. */ + ulint n_reserved;/* Number of reserved slots in the + aio array outside the ibuf segment */ + os_aio_slot_t* slots; /* Pointer to the slots in the array */ +#ifdef __WIN__ + os_native_event_t* native_events; + /* Pointer to an array of OS native event + handles where we copied the handles from + slots, in the same order. This can be used + in WaitForMultipleObjects; used only in + Windows */ +#endif +}; + +/* Array of events used in simulated aio */ +static os_event_t* os_aio_segment_wait_events = NULL; + +/* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These +are NULL when the module has not yet been initialized. 
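There is one array per request
+class: ordinary reads, ordinary writes, insert buffer i/o, log i/o, and
+synchronous i/o that the caller itself waits on.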
*/ +static os_aio_array_t* os_aio_read_array = NULL; +static os_aio_array_t* os_aio_write_array = NULL; +static os_aio_array_t* os_aio_ibuf_array = NULL; +static os_aio_array_t* os_aio_log_array = NULL; +static os_aio_array_t* os_aio_sync_array = NULL; + +static ulint os_aio_n_segments = ULINT_UNDEFINED; + +/* If the following is TRUE, read i/o handler threads try to +wait until a batch of new read requests have been posted */ +static ibool os_aio_recommend_sleep_for_read_threads = FALSE; + +UNIV_INTERN ulint os_n_file_reads = 0; +UNIV_INTERN ulint os_bytes_read_since_printout = 0; +UNIV_INTERN ulint os_n_file_writes = 0; +UNIV_INTERN ulint os_n_fsyncs = 0; +UNIV_INTERN ulint os_n_file_reads_old = 0; +UNIV_INTERN ulint os_n_file_writes_old = 0; +UNIV_INTERN ulint os_n_fsyncs_old = 0; +UNIV_INTERN time_t os_last_printout; + +UNIV_INTERN ibool os_has_said_disk_full = FALSE; + +/* The mutex protecting the following counts of pending I/O operations */ +static os_mutex_t os_file_count_mutex; +UNIV_INTERN ulint os_file_n_pending_preads = 0; +UNIV_INTERN ulint os_file_n_pending_pwrites = 0; +UNIV_INTERN ulint os_n_pending_writes = 0; +UNIV_INTERN ulint os_n_pending_reads = 0; + +/*************************************************************************** +Gets the operating system version. Currently works only on Windows. */ +UNIV_INTERN +ulint +os_get_os_version(void) +/*===================*/ + /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */ +{ +#ifdef __WIN__ + OSVERSIONINFO os_info; + + os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + + ut_a(GetVersionEx(&os_info)); + + if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) { + return(OS_WIN31); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { + return(OS_WIN95); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { + if (os_info.dwMajorVersion <= 4) { + return(OS_WINNT); + } else { + return(OS_WIN2000); + } + } else { + ut_error; + return(0); + } +#else + ut_error; + + return(0); +#endif +} + +/*************************************************************************** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + /* out: error number, or OS error + number + 100 */ + ibool report_all_errors) /* in: TRUE if we want an error message + printed of all errors */ +{ + ulint err; + +#ifdef __WIN__ + + err = (ulint) GetLastError(); + + if (report_all_errors + || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %lu" + " in a file operation.\n", (ulong) err); + + if (err == ERROR_PATH_NOT_FOUND) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == ERROR_ACCESS_DENIED) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory. 
It may also be" + " you have created a subdirectory\n" + "InnoDB: of the same name as a data file.\n"); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + fprintf(stderr, + "InnoDB: The error means that another program" + " is using InnoDB's files.\n" + "InnoDB: This might be a backup or antivirus" + " software or another instance\n" + "InnoDB: of MySQL." + " Please close it to get rid of this error.\n"); + } else { + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + "http://dev.mysql.com/doc/refman/5.1/en/" + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + if (err == ERROR_FILE_NOT_FOUND) { + return(OS_FILE_NOT_FOUND); + } else if (err == ERROR_DISK_FULL) { + return(OS_FILE_DISK_FULL); + } else if (err == ERROR_FILE_EXISTS) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + return(OS_FILE_SHARING_VIOLATION); + } else { + return(100 + err); + } +#else + err = (ulint) errno; + + if (report_all_errors + || (err != ENOSPC && err != EEXIST)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %lu" + " in a file operation.\n", (ulong) err); + + if (err == ENOENT) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == EACCES) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory.\n"); + } else { + if (strerror((int)err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu" + " means '%s'.\n", + err, strerror((int)err)); + } + + fprintf(stderr, + "InnoDB: Some operating system" + " error numbers are described at\n" + "InnoDB: " + "http://dev.mysql.com/doc/refman/5.1/en/" + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + if (err == ENOSPC) { + return(OS_FILE_DISK_FULL); + } else if (err == ENOENT) { + return(OS_FILE_NOT_FOUND); + } else if (err == EEXIST) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == EXDEV || err == ENOTDIR || err == EISDIR) { + return(OS_FILE_PATH_ERROR); + } else { + return(100 + err); + } +#endif +} + +/******************************************************************** +Does error handling when a file operation fails. +Conditionally exits (calling exit(3)) based on should_exit value and the +error type */ +static +ibool +os_file_handle_error_cond_exit( +/*===========================*/ + /* out: TRUE if we should retry the + operation */ + const char* name, /* in: name of a file or NULL */ + const char* operation, /* in: operation */ + ibool should_exit) /* in: call exit(3) if unknown error + and this parameter is TRUE */ +{ + ulint err; + + err = os_file_get_last_error(FALSE); + + if (err == OS_FILE_DISK_FULL) { + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(FALSE); + } + + if (name) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Encountered a problem with" + " file %s\n", name); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Disk is full. 
Try to clean the disk" + " to free space.\n"); + + os_has_said_disk_full = TRUE; + + fflush(stderr); + + return(FALSE); + } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) { + + return(TRUE); + } else if (err == OS_FILE_ALREADY_EXISTS + || err == OS_FILE_PATH_ERROR) { + + return(FALSE); + } else if (err == OS_FILE_SHARING_VIOLATION) { + + os_thread_sleep(10000000); /* 10 sec */ + return(TRUE); + } else { + if (name) { + fprintf(stderr, "InnoDB: File name %s\n", name); + } + + fprintf(stderr, "InnoDB: File operation call: '%s'.\n", + operation); + + if (should_exit) { + fprintf(stderr, "InnoDB: Cannot continue operation.\n"); + + fflush(stderr); + + exit(1); + } + } + + return(FALSE); +} + +/******************************************************************** +Does error handling when a file operation fails. */ +static +ibool +os_file_handle_error( +/*=================*/ + /* out: TRUE if we should retry the + operation */ + const char* name, /* in: name of a file or NULL */ + const char* operation)/* in: operation */ +{ + /* exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, TRUE)); +} + +/******************************************************************** +Does error handling when a file operation fails. */ +static +ibool +os_file_handle_error_no_exit( +/*=========================*/ + /* out: TRUE if we should retry the + operation */ + const char* name, /* in: name of a file or NULL */ + const char* operation)/* in: operation */ +{ + /* don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, FALSE)); +} + +#undef USE_FILE_LOCK +#define USE_FILE_LOCK +#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__) +/* InnoDB Hot Backup does not lock the data files. + * On Windows, mandatory locking is used. + */ +# undef USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/******************************************************************** +Obtain an exclusive lock on a file. */ +static +int +os_file_lock( +/*=========*/ + /* out: 0 on success */ + int fd, /* in: file descriptor */ + const char* name) /* in: file name */ +{ + struct flock lk; + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + if (fcntl(fd, F_SETLK, &lk) == -1) { + fprintf(stderr, + "InnoDB: Unable to lock %s, error: %d\n", name, errno); + + if (errno == EAGAIN || errno == EACCES) { + fprintf(stderr, + "InnoDB: Check that you do not already have" + " another mysqld process\n" + "InnoDB: using the same InnoDB data" + " or log files.\n"); + } + + return(-1); + } + + return(0); +} +#endif /* USE_FILE_LOCK */ + +/******************************************************************** +Creates the seek mutexes used in positioned reads and writes. */ +UNIV_INTERN +void +os_io_init_simple(void) +/*===================*/ +{ + ulint i; + + os_file_count_mutex = os_mutex_create(NULL); + + for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_file_seek_mutexes[i] = os_mutex_create(NULL); + } +} + +/*************************************************************************** +Creates a temporary file. This function is like tmpfile(3), but +the temporary file is created in the MySQL temporary directory. +On Netware, this function is like tmpfile(3), because the C run-time +library of Netware does not expose the delete-on-close flag. 
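+
+A minimal usage sketch (hypothetical caller; the payload string is made
+up for illustration): */
+#if 0	/* illustrative sketch only, not part of this change */
+	FILE*	f = os_file_create_tmpfile();
+
+	if (f != NULL) {
+		fputs("scratch data", f);	/* ordinary stdio i/o works */
+		rewind(f);
+		fclose(f);	/* the file is removed automatically */
+	}
+#endif
+/*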
*/ +UNIV_INTERN +FILE* +os_file_create_tmpfile(void) +/*========================*/ + /* out: temporary file handle, or NULL on error */ +{ +#ifdef UNIV_HOTBACKUP + ut_error; + + return(NULL); +#else +# ifdef __NETWARE__ + FILE* file = tmpfile(); +# else /* __NETWARE__ */ + FILE* file = NULL; + int fd = innobase_mysql_tmpfile(); + + if (fd >= 0) { + file = fdopen(fd, "w+b"); + } +# endif /* __NETWARE__ */ + + if (!file) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: unable to create temporary file;" + " errno: %d\n", errno); +# ifndef __NETWARE__ + if (fd >= 0) { + close(fd); + } +# endif /* !__NETWARE__ */ + } + + return(file); +#endif /* UNIV_HOTBACKUP */ +} + +/*************************************************************************** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + /* out: directory stream, NULL if + error */ + const char* dirname, /* in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal) /* in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. */ + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); + + dir = FindFirstFile((LPCTSTR) path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(dir); +#endif +} + +/*************************************************************************** +Closes a directory stream. */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + /* out: 0 if success, -1 if failure */ + os_file_dir_t dir) /* in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error_no_exit(NULL, "closedir"); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error_no_exit(NULL, "closedir"); + } + + return(ret); +#endif +} + +/*************************************************************************** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
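+
+A typical scan loop (hypothetical caller; the path is made up for
+illustration): */
+#if 0	/* illustrative sketch only, not part of this change */
+	os_file_dir_t	dir = os_file_opendir("/data", TRUE);
+	os_file_stat_t	info;
+
+	if (dir != NULL) {
+		/* 0 == entry filled in, 1 == end of directory, -1 == error */
+		while (os_file_readdir_next_file("/data", dir, &info) == 0) {
+			/* info.name, info.size and info.type are valid */
+		}
+
+		os_file_closedir(dir);
+	}
+#endif
+/*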
*/ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + /* out: 0 if ok, -1 if error, 1 if at the end + of the directory */ + const char* dirname,/* in: directory name or path */ + os_file_dir_t dir, /* in: directory stream */ + os_file_stat_t* info) /* in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen((char *) lpFindFileData->cFileName) + < OS_FILE_MAX_PATH); + + if (strcmp((char *) lpFindFileData->cFileName, ".") == 0 + || strcmp((char *) lpFindFileData->cFileName, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, (char *) lpFindFileData->cFileName); + + info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow) + + (((ib_int64_t)(lpFindFileData->nFileSizeHigh)) + << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { + /* TODO: test Windows symlinks */ + /* TODO: MySQL has apparently its own symlink + implementation in Windows, dbname.sym can + redirect a database directory: + http://dev.mysql.com/doc/refman/5.1/en/ + windows-symbolic-links.html */ + info->type = OS_FILE_TYPE_LINK; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else { + /* It is probably safest to assume that all other + file types are normal. Better to check them rather + than blindly skip them. */ + + info->type = OS_FILE_TYPE_FILE; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error_no_exit(dirname, + "readdir_next_file"); + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; +#ifdef HAVE_READDIR_R + char dirent_buf[sizeof(struct dirent) + + _POSIX_PATH_MAX + 100]; + /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as + the max file name len; but in most standards, the + length is NAME_MAX; we add 100 to be even safer */ +#endif + +next_file: + +#ifdef HAVE_READDIR_R + ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent); + + if (ret != 0) { + fprintf(stderr, + "InnoDB: cannot read directory %s, error %lu\n", + dirname, (ulong)ret); + + return(-1); + } + + if (ent == NULL) { + /* End of directory */ + + return(1); + } + + ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1); +#else + ent = readdir(dir); + + if (ent == NULL) { + + return(1); + } +#endif + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + os_file_handle_error_no_exit(full_path, "stat"); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_int64_t)statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + +/********************************************************************* +This function attempts to create a directory named pathname. 
The new directory
+gets default permissions. On Unix the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is TRUE. */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+					/* out: TRUE if call succeeds,
+					FALSE on error */
+	const char*	pathname,	/* in: directory name as
+					null-terminated string */
+	ibool		fail_if_exists)	/* in: if TRUE, pre-existing directory
+					is treated as an error. */
+{
+#ifdef __WIN__
+	BOOL	rcode;
+
+	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
+	if (!(rcode != 0
+	      || (GetLastError() == ERROR_ALREADY_EXISTS
+		  && !fail_if_exists))) {
+		/* failure */
+		os_file_handle_error(pathname, "CreateDirectory");
+
+		return(FALSE);
+	}
+
+	return (TRUE);
+#else
+	int	rcode;
+
+	rcode = mkdir(pathname, 0770);
+
+	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
+		/* failure */
+		os_file_handle_error(pathname, "mkdir");
+
+		return(FALSE);
+	}
+
+	return (TRUE);
+#endif
+}
+
+/********************************************************************
+A simple function to open or create a file. */
+UNIV_INTERN
+os_file_t
+os_file_create_simple(
+/*==================*/
+				/* out, own: handle to the file, not defined
+				if error, error number can be retrieved with
+				os_file_get_last_error */
+	const char*	name,	/* in: name of the file or path as a
+				null-terminated string */
+	ulint	create_mode,/* in: OS_FILE_OPEN if an existing file is
+				opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error), or
+				OS_FILE_CREATE_PATH if new file
+				(if exists, error) and subdirectories along
+				its path are created (if needed)*/
+	ulint	access_type,/* in: OS_FILE_READ_ONLY or
+				OS_FILE_READ_WRITE */
+	ibool*	success)/* out: TRUE if succeed, FALSE if error */
+{
+#ifdef __WIN__
+	os_file_t	file;
+	DWORD		create_flag;
+	DWORD		access;
+	DWORD		attributes	= 0;
+	ibool		retry;
+
+try_again:
+	ut_a(name);
+
+	if (create_mode == OS_FILE_OPEN) {
+		create_flag = OPEN_EXISTING;
+	} else if (create_mode == OS_FILE_CREATE) {
+		create_flag = CREATE_NEW;
+	} else if (create_mode == OS_FILE_CREATE_PATH) {
+		/* create subdirs along the path if needed */
+		*success = os_file_create_subdirs_if_needed(name);
+		if (!*success) {
+			ut_error;
+		}
+		create_flag = CREATE_NEW;
+		create_mode = OS_FILE_CREATE;
+	} else {
+		create_flag = 0;
+		ut_error;
+	}
+
+	if (access_type == OS_FILE_READ_ONLY) {
+		access = GENERIC_READ;
+	} else if (access_type == OS_FILE_READ_WRITE) {
+		access = GENERIC_READ | GENERIC_WRITE;
+	} else {
+		access = 0;
+		ut_error;
+	}
+
+	file = CreateFile((LPCTSTR) name,
+			  access,
+			  FILE_SHARE_READ | FILE_SHARE_WRITE,
+			  /* file can be read and written also
+			  by other processes */
+			  NULL,	/* default security attributes */
+			  create_flag,
+			  attributes,
+			  NULL);	/* no template file */
+
+	if (file == INVALID_HANDLE_VALUE) {
+		*success = FALSE;
+
+		retry = os_file_handle_error(name,
+					     create_mode == OS_FILE_OPEN ?
+ "open" : "create"); + if (retry) { + goto try_again; + } + } else { + *success = TRUE; + } + + return(file); +#else /* __WIN__ */ + os_file_t file; + int create_flag; + ibool retry; + +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else if (create_mode == OS_FILE_CREATE_PATH) { + /* create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + if (!*success) { + return (-1); + } + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; + } else { + create_flag = 0; + ut_error; + } + + if (create_mode == OS_FILE_CREATE) { + file = open(name, create_flag, S_IRUSR | S_IWUSR + | S_IRGRP | S_IWGRP); + } else { + file = open(name, create_flag); + } + + if (file == -1) { + *success = FALSE; + + retry = os_file_handle_error(name, + create_mode == OS_FILE_OPEN ? + "open" : "create"); + if (retry) { + goto try_again; + } +#ifdef USE_FILE_LOCK + } else if (access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; + close(file); + file = -1; +#endif + } else { + *success = TRUE; + } + + return(file); +#endif /* __WIN__ */ +} + +/******************************************************************** +A simple function to open or create a file. */ +UNIV_INTERN +os_file_t +os_file_create_simple_no_error_handling( +/*====================================*/ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD create_flag; + DWORD access; + DWORD attributes = 0; + DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + create_flag = 0; + ut_error; + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + access = GENERIC_READ; + share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ + | FILE_SHARE_WRITE; /* A backup program has to give + mysqld the maximum freedom to + do what it likes with the + file */ + } else { + access = 0; + ut_error; + } + + file = CreateFile((LPCTSTR) name, + access, + share_mode, + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /* no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + } else { + *success = TRUE; + } + + return(file); +#else /* __WIN__ */ + os_file_t file; + int create_flag; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | 
O_CREAT | O_EXCL; + } else { + create_flag = 0; + ut_error; + } + + if (create_mode == OS_FILE_CREATE) { + file = open(name, create_flag, S_IRUSR | S_IWUSR + | S_IRGRP | S_IWGRP); + } else { + file = open(name, create_flag); + } + + if (file == -1) { + *success = FALSE; +#ifdef USE_FILE_LOCK + } else if (access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; + close(file); + file = -1; +#endif + } else { + *success = TRUE; + } + + return(file); +#endif /* __WIN__ */ +} + +/******************************************************************** +Tries to disable OS caching on an opened file descriptor. */ +UNIV_INTERN +void +os_file_set_nocache( +/*================*/ + int fd, /* in: file descriptor to alter */ + const char* file_name, /* in: file name, used in the + diagnostic message */ + const char* operation_name) /* in: "open" or "create"; used in the + diagnostic message */ +{ + /* some versions of Solaris may not have DIRECTIO_ON */ +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) + if (directio(fd, DIRECTIO_ON) == -1) { + int errno_save; + errno_save = (int)errno; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Failed to set DIRECTIO_ON " + "on file %s: %s: %s, continuing anyway\n", + file_name, operation_name, strerror(errno_save)); + } +#elif defined(O_DIRECT) + if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { + int errno_save; + errno_save = (int)errno; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Failed to set O_DIRECT " + "on file %s: %s: %s, continuing anyway\n", + file_name, operation_name, strerror(errno_save)); + if (errno_save == EINVAL) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: O_DIRECT is known to result in " + "'Invalid argument' on Linux on tmpfs, " + "see MySQL Bug#26662\n"); + } + } +#endif +} + +/******************************************************************** +Opens an existing file or creates a new. */ +UNIV_INTERN +os_file_t +os_file_create( +/*===========*/ + /* out, own: handle to the file, not defined + if error, error number can be retrieved with + os_file_get_last_error */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), + OS_FILE_OVERWRITE if a new file is created + or an old overwritten; + OS_FILE_OPEN_RAW, if a raw device or disk + partition should be opened */ + ulint purpose,/* in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. 
variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD share_mode = FILE_SHARE_READ; + DWORD create_flag; + DWORD attributes; + ibool retry; +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN_RAW) { + create_flag = OPEN_EXISTING; + share_mode = FILE_SHARE_WRITE; + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RETRY) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else if (create_mode == OS_FILE_OVERWRITE) { + create_flag = CREATE_ALWAYS; + } else { + create_flag = 0; + ut_error; + } + + if (purpose == OS_FILE_AIO) { + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ + attributes = 0; +#ifdef WIN_ASYNC_IO + if (os_aio_use_native_aio) { + attributes = attributes | FILE_FLAG_OVERLAPPED; + } +#endif +#ifdef UNIV_NON_BUFFERED_IO + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + /* Do not use unbuffered i/o to log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + } else if (srv_win_file_flush_method + == SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; + } +#endif + } else if (purpose == OS_FILE_NORMAL) { + attributes = 0; +#ifdef UNIV_NON_BUFFERED_IO + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + /* Do not use unbuffered i/o to log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + } else if (srv_win_file_flush_method + == SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; + } +#endif + } else { + attributes = 0; + ut_error; + } + + file = CreateFile((LPCTSTR) name, + GENERIC_READ | GENERIC_WRITE, /* read and write + access */ + share_mode, /* File can be read also by other + processes; we must give the read + permission because of ibbackup. We do + not give the write permission to + others because if one would succeed to + start 2 instances of mysqld on the + SAME files, that could cause severe + database corruption! When opening + raw disk partitions, Microsoft manuals + say that we must give also the write + permission. */ + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /* no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + + /* When srv_file_per_table is on, file creation failure may not + be critical to the whole instance. Do not crash the server in + case of unknown errors. */ + if (srv_file_per_table) { + retry = os_file_handle_error_no_exit(name, + create_mode == OS_FILE_CREATE ? + "create" : "open"); + } else { + retry = os_file_handle_error(name, + create_mode == OS_FILE_CREATE ? 
+ "create" : "open"); + } + + if (retry) { + goto try_again; + } + } else { + *success = TRUE; + } + + return(file); +#else /* __WIN__ */ + os_file_t file; + int create_flag; + ibool retry; + const char* mode_str = NULL; + const char* type_str = NULL; + const char* purpose_str = NULL; + +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + mode_str = "OPEN"; + create_flag = O_RDWR; + } else if (create_mode == OS_FILE_CREATE) { + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else if (create_mode == OS_FILE_OVERWRITE) { + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + } else { + create_flag = 0; + ut_error; + } + + if (type == OS_LOG_FILE) { + type_str = "LOG"; + } else if (type == OS_DATA_FILE) { + type_str = "DATA"; + } else { + ut_error; + } + + if (purpose == OS_FILE_AIO) { + purpose_str = "AIO"; + } else if (purpose == OS_FILE_NORMAL) { + purpose_str = "NORMAL"; + } else { + ut_error; + } + +#if 0 + fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n", + name, mode_str, type_str, purpose_str); +#endif +#ifdef O_SYNC + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + if (type == OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + +# if 0 + fprintf(stderr, "Using O_SYNC for file %s\n", name); +# endif + + create_flag = create_flag | O_SYNC; + } +#endif /* O_SYNC */ + + file = open(name, create_flag, os_innodb_umask); + + if (file == -1) { + *success = FALSE; + + /* When srv_file_per_table is on, file creation failure may not + be critical to the whole instance. Do not crash the server in + case of unknown errors. */ + if (srv_file_per_table) { + retry = os_file_handle_error_no_exit(name, + create_mode == OS_FILE_CREATE ? + "create" : "open"); + } else { + retry = os_file_handle_error(name, + create_mode == OS_FILE_CREATE ? + "create" : "open"); + } + + if (retry) { + goto try_again; + } else { + return(file /* -1 */); + } + } + /* else */ + + *success = TRUE; + + /* We disable OS caching (O_DIRECT) only on data files */ + if (type != OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) { + + os_file_set_nocache(file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + int i; + ut_print_timestamp(stderr); + fputs(" InnoDB: Retrying to lock" + " the first data file\n", + stderr); + for (i = 0; i < 100; i++) { + os_thread_sleep(1000000); + if (!os_file_lock(file, name)) { + *success = TRUE; + return(file); + } + } + ut_print_timestamp(stderr); + fputs(" InnoDB: Unable to open the first data file\n", + stderr); + } + + *success = FALSE; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + + return(file); +#endif /* __WIN__ */ +} + +/*************************************************************************** +Deletes a file if it exists. The file has to be closed before calling this. 
*/ +UNIV_INTERN +ibool +os_file_delete_if_exists( +/*=====================*/ + /* out: TRUE if success */ + const char* name) /* in: file path as a null-terminated string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR)name); + + if (ret) { + return(TRUE); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* the file does not exist, this not an error */ + + return(TRUE); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running ibbackup" + " to back up the file?\n", name); + + os_file_get_last_error(TRUE); /* print error information */ + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(FALSE); + } + + goto loop; +#else + int ret; + + ret = unlink(name); + + if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Deletes a file. The file has to be closed before calling this. */ +UNIV_INTERN +ibool +os_file_delete( +/*===========*/ + /* out: TRUE if success */ + const char* name) /* in: file path as a null-terminated string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR)name); + + if (ret) { + return(TRUE); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* If the file does not exist, we classify this as a 'mild' + error and return */ + + return(FALSE); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running ibbackup" + " to back up the file?\n", name); + + os_file_get_last_error(TRUE); /* print error information */ + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(FALSE); + } + + goto loop; +#else + int ret; + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. */ +UNIV_INTERN +ibool +os_file_rename( +/*===========*/ + /* out: TRUE if success */ + const char* oldpath,/* in: old file path as a null-terminated + string */ + const char* newpath)/* in: new file path */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath); + + if (ret) { + return(TRUE); + } + + os_file_handle_error_no_exit(oldpath, "rename"); + + return(FALSE); +#else + int ret; + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_error_no_exit(oldpath, "rename"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. 
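+
+A hypothetical caller that logs the portable error code on failure: */
+#if 0	/* illustrative sketch only; fh is assumed to be an open handle */
+	if (!os_file_close(fh)) {
+		ulint	err = os_file_get_last_error(TRUE);
+		/* err is one of the OS_FILE_ codes, or 100 + the raw
+		OS error number when the error is not known here */
+	}
+#endif
+/*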
*/ +UNIV_INTERN +ibool +os_file_close( +/*==========*/ + /* out: TRUE if success */ + os_file_t file) /* in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(NULL, "close"); + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + os_file_handle_error(NULL, "close"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Closes a file handle. */ +UNIV_INTERN +ibool +os_file_close_no_error_handling( +/*============================*/ + /* out: TRUE if success */ + os_file_t file) /* in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Gets a file size. */ +UNIV_INTERN +ibool +os_file_get_size( +/*=============*/ + /* out: TRUE if success */ + os_file_t file, /* in: handle to a file */ + ulint* size, /* out: least significant 32 bits of file + size */ + ulint* size_high)/* out: most significant 32 bits of size */ +{ +#ifdef __WIN__ + DWORD high; + DWORD low; + + low = GetFileSize(file, &high); + + if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) { + return(FALSE); + } + + *size = low; + *size_high = high; + + return(TRUE); +#else + off_t offs; + + offs = lseek(file, 0, SEEK_END); + + if (offs == ((off_t)-1)) { + + return(FALSE); + } + + if (sizeof(off_t) > 4) { + *size = (ulint)(offs & 0xFFFFFFFFUL); + *size_high = (ulint)(offs >> 32); + } else { + *size = (ulint) offs; + *size_high = 0; + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Gets file size as a 64-bit integer ib_int64_t. */ +UNIV_INTERN +ib_int64_t +os_file_get_size_as_iblonglong( +/*===========================*/ + /* out: size in bytes, -1 if error */ + os_file_t file) /* in: handle to a file */ +{ + ulint size; + ulint size_high; + ibool success; + + success = os_file_get_size(file, &size, &size_high); + + if (!success) { + + return(-1); + } + + return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size); +} + +/*************************************************************************** +Write the specified number of zeros to a newly created file. */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + /* out: TRUE if success */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + os_file_t file, /* in: handle to a file */ + ulint size, /* in: least significant 32 bits of file + size */ + ulint size_high)/* in: most significant 32 bits of size */ +{ + ib_int64_t current_size; + ib_int64_t desired_size; + ibool ret; + byte* buf; + byte* buf2; + ulint buf_size; + + ut_a(size == (size & 0xFFFFFFFF)); + + current_size = 0; + desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32); + + /* Write up to 1 megabyte at a time. 
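+(with the default UNIV_PAGE_SIZE of 16384 bytes: 64 * 16384 = 1048576 bytes,
+i.e. exactly 1 MiB; for a file smaller than that, the buffer shrinks to the
+file size rounded down to whole pages)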
*/ + buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE)) + * UNIV_PAGE_SIZE; + buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE); + + /* Align the buffer for possible raw i/o */ + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + /* Write buffer full of zeros */ + memset(buf, 0, buf_size); + + if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) { + + fprintf(stderr, "InnoDB: Progress in MB:"); + } + + while (current_size < desired_size) { + ulint n_bytes; + + if (desired_size - current_size < (ib_int64_t) buf_size) { + n_bytes = (ulint) (desired_size - current_size); + } else { + n_bytes = buf_size; + } + + ret = os_file_write(name, file, buf, + (ulint)(current_size & 0xFFFFFFFF), + (ulint)(current_size >> 32), + n_bytes); + if (!ret) { + ut_free(buf2); + goto error_handling; + } + + /* Print about progress for each 100 MB written */ + if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024) + != current_size / (ib_int64_t)(100 * 1024 * 1024)) { + + fprintf(stderr, " %lu00", + (ulong) ((current_size + n_bytes) + / (ib_int64_t)(100 * 1024 * 1024))); + } + + current_size += n_bytes; + } + + if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) { + + fprintf(stderr, "\n"); + } + + ut_free(buf2); + + ret = os_file_flush(file); + + if (ret) { + return(TRUE); + } + +error_handling: + return(FALSE); +} + +/*************************************************************************** +Truncates a file at its current position. */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + /* out: TRUE if success */ + FILE* file) /* in: file to be truncated */ +{ +#ifdef __WIN__ + HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); + return(SetEndOfFile(h)); +#else /* __WIN__ */ + return(!ftruncate(fileno(file), ftell(file))); +#endif /* __WIN__ */ +} + +#ifndef __WIN__ +/*************************************************************************** +Wrapper to fsync(2) that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. */ + +static +int +os_file_fsync( +/*==========*/ + /* out: 0 if success, -1 otherwise */ + os_file_t file) /* in: handle to a file */ +{ + int ret; + int failures; + ibool retry; + + failures = 0; + + do { + ret = fsync(file); + + os_n_fsyncs++; + + if (ret == -1 && errno == ENOLCK) { + + if (failures % 100 == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: fsync(): " + "No locks available; retrying\n"); + } + + os_thread_sleep(200000 /* 0.2 sec */); + + failures++; + + retry = TRUE; + } else { + + retry = FALSE; + } + } while (retry); + + return(ret); +} +#endif /* !__WIN__ */ + +/*************************************************************************** +Flushes the write buffers of a given file to the disk. 
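+
+A write only becomes durable once it has been followed by a successful
+flush; a hypothetical caller (fh and page_buf are made up) pairs the two: */
+#if 0	/* illustrative sketch only, not part of this change */
+	if (os_file_write("ibdata1", fh, page_buf, 0, 0, UNIV_PAGE_SIZE)) {
+		ut_a(os_file_flush(fh));	/* fsync() or equivalent */
+	}
+#endif
+/*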
*/ +UNIV_INTERN +ibool +os_file_flush( +/*==========*/ + /* out: TRUE if success */ + os_file_t file) /* in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + os_n_fsyncs++; + + ret = FlushFileBuffers(file); + + if (ret) { + return(TRUE); + } + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#else + int ret; + +#if defined(HAVE_DARWIN_THREADS) +# ifndef F_FULLFSYNC + /* The following definition is from the Mac OS X 10.3 */ +# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */ +# elif F_FULLFSYNC != 51 +# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3" +# endif + /* Apple has disabled fsync() for internal disk drives in OS X. That + caused corruption for a user when he tested a power outage. Let us in + OS X use a nonstandard flush method recommended by an Apple + engineer. */ + + if (!srv_have_fullfsync) { + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + + ret = os_file_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. */ + ret = os_file_fsync(file); + } + } +#else + ret = os_file_fsync(file); +#endif + + if (ret == 0) { + return(TRUE); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(TRUE); + } + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: the OS said file flush did not succeed\n"); + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#endif +} + +#ifndef __WIN__ +/*********************************************************************** +Does a synchronous read operation in Posix. 
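+
+When pread() is not available, the seek and the read must be made atomic
+with respect to other threads using the same handle; the handle is hashed
+onto one of the seek mutexes created in os_io_init_simple() (a sketch of
+the scheme used below): */
+#if 0	/* illustrative sketch only, not part of this change */
+	ulint	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+	/* lseek() and read() run here without racing other users
+	of this handle */
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif
+/*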
*/ +static +ssize_t +os_file_pread( +/*==========*/ + /* out: number of bytes read, -1 if error */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read */ + ulint n, /* in: number of bytes to read */ + ulint offset, /* in: least significant 32 bits of file + offset from where to read */ + ulint offset_high) /* in: most significant 32 bits of + offset */ +{ + off_t offs; + ssize_t n_bytes; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + + if (sizeof(off_t) > 4) { + offs = (off_t)offset + (((off_t)offset_high) << 32); + + } else { + offs = (off_t)offset; + + if (offset_high > 0) { + fprintf(stderr, + "InnoDB: Error: file read at offset > 4 GB\n"); + } + } + + os_n_file_reads++; + +#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD) + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads++; + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + n_bytes = pread(file, buf, (ssize_t)n, offs); + + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads--; + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + return(n_bytes); +#else + { + off_t ret_offset; + ssize_t ret; + ulint i; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + } else { + ret = read(file, buf, (ssize_t)n); + } + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + return(ret); + } +#endif +} + +/*********************************************************************** +Does a synchronous write operation in Posix. */ +static +ssize_t +os_file_pwrite( +/*===========*/ + /* out: number of bytes written, -1 if error */ + os_file_t file, /* in: handle to a file */ + const void* buf, /* in: buffer from where to write */ + ulint n, /* in: number of bytes to write */ + ulint offset, /* in: least significant 32 bits of file + offset where to write */ + ulint offset_high) /* in: most significant 32 bits of + offset */ +{ + ssize_t ret; + off_t offs; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + + if (sizeof(off_t) > 4) { + offs = (off_t)offset + (((off_t)offset_high) << 32); + } else { + offs = (off_t)offset; + + if (offset_high > 0) { + fprintf(stderr, + "InnoDB: Error: file write" + " at offset > 4 GB\n"); + } + } + + os_n_file_writes++; + +#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD) + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites++; + os_n_pending_writes++; + os_mutex_exit(os_file_count_mutex); + + ret = pwrite(file, buf, (ssize_t)n, offs); + + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites--; + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + +# ifdef UNIV_DO_FLUSH + if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && !os_do_not_call_flush_at_each_write) { + + /* Always do fsync to reduce the probability that when + the OS crashes, a database page is only partially + physically written to disk. 
*/ + + ut_a(TRUE == os_file_flush(file)); + } +# endif /* UNIV_DO_FLUSH */ + + return(ret); +#else + { + off_t ret_offset; + ulint i; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + + goto func_exit; + } + + ret = write(file, buf, (ssize_t)n); + +# ifdef UNIV_DO_FLUSH + if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && !os_do_not_call_flush_at_each_write) { + + /* Always do fsync to reduce the probability that when + the OS crashes, a database page is only partially + physically written to disk. */ + + ut_a(TRUE == os_file_flush(file)); + } +# endif /* UNIV_DO_FLUSH */ + +func_exit: + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + + return(ret); + } +#endif +} +#endif + +/*********************************************************************** +Requests a synchronous positioned read operation. */ +UNIV_INTERN +ibool +os_file_read( +/*=========*/ + /* out: TRUE if request was + successful, FALSE if fail */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read */ + ulint offset, /* in: least significant 32 bits of file + offset where to read */ + ulint offset_high, /* in: most significant 32 bits of + offset */ + ulint n) /* in: number of bytes to read */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ibool retry; + ulint i; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + low = (DWORD) offset; + high = (DWORD) offset_high; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + goto error_handling; + } + + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + return(TRUE); + } +#else + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, offset_high); + + if ((ulint)ret == n) { + + return(TRUE); + } + + fprintf(stderr, + "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n" + "InnoDB: Was only able to read %ld.\n", + (ulong)n, (ulong)offset_high, + (ulong)offset, (long)ret); +#endif +#ifdef __WIN__ +error_handling: +#endif + retry = os_file_handle_error(NULL, "read"); + + if (retry) { + goto try_again; + } + + fprintf(stderr, + "InnoDB: Fatal error: cannot read from file." 
+ " OS error number %lu.\n", +#ifdef __WIN__ + (ulong) GetLastError() +#else + (ulong) errno +#endif + ); + fflush(stderr); + + ut_error; + + return(FALSE); +} + +/*********************************************************************** +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. */ +UNIV_INTERN +ibool +os_file_read_no_error_handling( +/*===========================*/ + /* out: TRUE if request was + successful, FALSE if fail */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read */ + ulint offset, /* in: least significant 32 bits of file + offset where to read */ + ulint offset_high, /* in: most significant 32 bits of + offset */ + ulint n) /* in: number of bytes to read */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ibool retry; + ulint i; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + low = (DWORD) offset; + high = (DWORD) offset_high; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + goto error_handling; + } + + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + return(TRUE); + } +#else + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, offset_high); + + if ((ulint)ret == n) { + + return(TRUE); + } +#endif +#ifdef __WIN__ +error_handling: +#endif + retry = os_file_handle_error_no_exit(NULL, "read"); + + if (retry) { + goto try_again; + } + + return(FALSE); +} + +/*********************************************************************** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. */ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /* in: file to read from */ + char* str, /* in: buffer where to read */ + ulint size) /* in: size of buffer */ +{ + size_t flen; + + if (size == 0) { + return; + } + + rewind(file); + flen = fread(str, 1, size - 1, file); + str[flen] = '\0'; +} + +/*********************************************************************** +Requests a synchronous write operation. 
*/ +UNIV_INTERN +ibool +os_file_write( +/*==========*/ + /* out: TRUE if request was + successful, FALSE if fail */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + os_file_t file, /* in: handle to a file */ + const void* buf, /* in: buffer from which to write */ + ulint offset, /* in: least significant 32 bits of file + offset where to write */ + ulint offset_high, /* in: most significant 32 bits of + offset */ + ulint n) /* in: number of bytes to write */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ulint i; + ulint n_retries = 0; + ulint err; + + ut_a((offset & 0xFFFFFFFF) == offset); + + os_n_file_writes++; + + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); +retry: + low = (DWORD) offset; + high = (DWORD) offset_high; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: File pointer positioning to" + " file %s failed at\n" + "InnoDB: offset %lu %lu. Operating system" + " error number %lu.\n" + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + "http://dev.mysql.com/doc/refman/5.1/en/" + "operating-system-error-codes.html\n", + name, (ulong) offset_high, (ulong) offset, + (ulong) GetLastError()); + + return(FALSE); + } + + ret = WriteFile(file, buf, (DWORD) n, &len, NULL); + + /* Always do fsync to reduce the probability that when the OS crashes, + a database page is only partially physically written to disk. */ + +# ifdef UNIV_DO_FLUSH + if (!os_do_not_call_flush_at_each_write) { + ut_a(TRUE == os_file_flush(file)); + } +# endif /* UNIV_DO_FLUSH */ + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + + return(TRUE); + } + + /* If some background file system backup tool is running, then, at + least in Windows 2000, we may get here a specific error. Let us + retry the operation 100 times, with 1 second waits. 
*/ + + if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) { + + os_thread_sleep(1000000); + + n_retries++; + + goto retry; + } + + if (!os_has_said_disk_full) { + + err = (ulint)GetLastError(); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset %lu %lu.\n" + "InnoDB: %lu bytes should have been written," + " only %lu were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, (ulong) offset_high, (ulong) offset, + (ulong) n, (ulong) len, (ulong) err); + + if (strerror((int)err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu means '%s'.\n", + (ulong) err, strerror((int)err)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + "http://dev.mysql.com/doc/refman/5.1/en/" + "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#else + ssize_t ret; + + ret = os_file_pwrite(file, buf, n, offset, offset_high); + + if ((ulint)ret == n) { + + return(TRUE); + } + + if (!os_has_said_disk_full) { + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset %lu %lu.\n" + "InnoDB: %lu bytes should have been written," + " only %ld were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset_high, offset, n, (long int)ret, + (ulint)errno); + if (strerror(errno) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu means '%s'.\n", + (ulint)errno, strerror(errno)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + "http://dev.mysql.com/doc/refman/5.1/en/" + "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#endif +} + +/*********************************************************************** +Check the existence and type of the given file. 
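+
+A hypothetical existence-and-type check: */
+#if 0	/* illustrative sketch only, not part of this change */
+	ibool		exists;
+	os_file_type_t	type;
+
+	if (os_file_status("/data/ibdata1", &exists, &type)
+	    && exists
+	    && type == OS_FILE_TYPE_FILE) {
+		/* a regular data file is present at the path */
+	}
+#endif
+/*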
*/ +UNIV_INTERN +ibool +os_file_status( +/*===========*/ + /* out: TRUE if call succeeded */ + const char* path, /* in: pathname of the file */ + ibool* exists, /* out: TRUE if file exists */ + os_file_type_t* type) /* out: type of the file (if it exists) */ +{ +#ifdef __WIN__ + int ret; + struct _stat statinfo; + + ret = _stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + *type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#endif +} + +/*********************************************************************** +This function returns information about the specified file */ +UNIV_INTERN +ibool +os_file_get_status( +/*===============*/ + /* out: TRUE if stat + information found */ + const char* path, /* in: pathname of the file */ + os_file_stat_t* stat_info) /* information of a file in a + directory */ +{ +#ifdef __WIN__ + int ret; + struct _stat statinfo; + + ret = _stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(FALSE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + if (_S_IFDIR & statinfo.st_mode) { + stat_info->type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + stat_info->type = OS_FILE_TYPE_FILE; + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + stat_info->ctime = statinfo.st_ctime; + stat_info->atime = statinfo.st_atime; + stat_info->mtime = statinfo.st_mtime; + stat_info->size = statinfo.st_size; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(FALSE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + stat_info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + stat_info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + stat_info->type = OS_FILE_TYPE_FILE; + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + stat_info->ctime = statinfo.st_ctime; + stat_info->atime = statinfo.st_atime; + stat_info->mtime = statinfo.st_mtime; + stat_info->size = statinfo.st_size; + + return(TRUE); +#endif +} + +/* path name separator character */ +#ifdef __WIN__ +# define OS_FILE_PATH_SEPARATOR '\\' +#else +# define OS_FILE_PATH_SEPARATOR '/' +#endif + +/******************************************************************** 
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+	path		dirname		basename
+	"/usr/lib"	"/usr"		"lib"
+	"/usr/"		"/"		"usr"
+	"usr"		"."		"usr"
+	"/"		"/"		"/"
+	"."		"."		"."
+	".."		"."		".."
+*/
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+				/* out, own: directory component of the
+				pathname */
+	const char*	path)	/* in: pathname */
+{
+	/* Find the offset of the last slash */
+	const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
+	if (!last_slash) {
+		/* No slash in the path, return "." */
+
+		return(mem_strdup("."));
+	}
+
+	/* Ok, there is a slash */
+
+	if (last_slash == path) {
+		/* last slash is the first char of the path */
+
+		return(mem_strdup("/"));
+	}
+
+	/* Non-trivial directory component */
+
+	return(mem_strdupl(path, last_slash - path));
+}
+
+/********************************************************************
+Creates all missing subdirectories along the given path. */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+				/* out: TRUE if call succeeded,
+				FALSE otherwise */
+	const char*	path)	/* in: path name */
+{
+	char*		subdir;
+	ibool		success, subdir_exists;
+	os_file_type_t	type;
+
+	subdir = os_file_dirname(path);
+	if (strlen(subdir) == 1
+	    && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
+		/* subdir is root or cwd, nothing to do */
+		mem_free(subdir);
+
+		return(TRUE);
+	}
+
+	/* Test if subdir exists */
+	success = os_file_status(subdir, &subdir_exists, &type);
+	if (success && !subdir_exists) {
+		/* subdir does not exist, create it */
+		success = os_file_create_subdirs_if_needed(subdir);
+		if (!success) {
+			mem_free(subdir);
+
+			return(FALSE);
+		}
+		success = os_file_create_directory(subdir, FALSE);
+	}
+
+	mem_free(subdir);
+
+	return(success);
+}
+
+/********************************************************************
+Returns a pointer to the nth slot in the aio array. */
+static
+os_aio_slot_t*
+os_aio_array_get_nth_slot(
+/*======================*/
+				/* out: pointer to slot */
+	os_aio_array_t*	array,	/* in: aio array */
+	ulint		index)	/* in: index of the slot */
+{
+	ut_a(index < array->n_slots);
+
+	return((array->slots) + index);
+}
+
+/****************************************************************************
+Creates an aio wait array.
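+The n slots are divided into n_segments equal segments, each served by
+one i/o handler thread; a slot's segment follows from its position (a
+sketch of the arithmetic, not part of this change): */
+#if 0	/* illustrative sketch only */
+	ulint	n_per_seg = n / n_segments;
+	ulint	segment   = slot->pos / n_per_seg;	/* local segment */
+#endif
+/*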
*/ +static +os_aio_array_t* +os_aio_array_create( +/*================*/ + /* out, own: aio array */ + ulint n, /* in: maximum number of pending aio operations + allowed; n must be divisible by n_segments */ + ulint n_segments) /* in: number of segments in the aio array */ +{ + os_aio_array_t* array; + ulint i; + os_aio_slot_t* slot; +#ifdef WIN_ASYNC_IO + OVERLAPPED* over; +#endif + ut_a(n > 0); + ut_a(n_segments > 0); + + array = ut_malloc(sizeof(os_aio_array_t)); + + array->mutex = os_mutex_create(NULL); + array->not_full = os_event_create(NULL); + array->is_empty = os_event_create(NULL); + + os_event_set(array->is_empty); + + array->n_slots = n; + array->n_segments = n_segments; + array->n_reserved = 0; + array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); +#ifdef __WIN__ + array->native_events = ut_malloc(n * sizeof(os_native_event_t)); +#endif + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + slot->pos = i; + slot->reserved = FALSE; +#ifdef WIN_ASYNC_IO + slot->event = os_event_create(NULL); + + over = &(slot->control); + + over->hEvent = slot->event->handle; + + *((array->native_events) + i) = over->hEvent; +#endif + } + + return(array); +} + +/**************************************************************************** +Initializes the asynchronous io system. Calls also os_io_init_simple. +Creates a separate aio array for +non-ibuf read and write, a third aio array for the ibuf i/o, with just one +segment, two aio arrays for log reads and writes with one segment, and a +synchronous aio array of the specified size. The combined number of segments +in the three first aio arrays is the parameter n_segments given to the +function. The caller must create an i/o handler thread for each segment in +the four first arrays, but not for the sync aio array. 
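+
+For example (illustrative values, not enforced defaults): with
+n_read_threads = 8 and n_write_threads = 8 the function creates
+n_segments = 2 + 8 + 8 = 18 segments, numbered
+
+	segment 0		insert buffer thread
+	segment 1		log thread
+	segments 2 .. 9		read threads
+	segments 10 .. 17	write threads
+
+and the parameter n must then be divisible by 18.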
*/ +UNIV_INTERN +void +os_aio_init( +/*========*/ + ulint n, /* in: maximum number of pending aio operations + allowed; n must be divisible by n_segments */ +// ulint n_segments, /* in: combined number of segments in the four +// first aio arrays; must be >= 4 */ + ulint n_read_threads, /* in: number of read i/o handler threads; + n_segments == 2 + n_read_threads + + n_write_threads */ + ulint n_write_threads, /* in: number of write i/o handler threads */ + ulint n_slots_sync) /* in: number of slots in the sync aio array */ +{ + ulint n_read_segs; + ulint n_write_segs; + ulint n_per_seg; + ulint i; + + ulint n_segments = 2 + n_read_threads + n_write_threads; + + ut_ad(n % n_segments == 0); + ut_ad(n_segments >= 4); + + os_io_init_simple(); + + for (i = 0; i < n_segments; i++) { + srv_set_io_thread_op_info(i, "not started yet"); + } + + n_per_seg = n / n_segments; + n_write_segs = n_write_threads; + n_read_segs = n_read_threads; + + /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */ + + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[0] = "insert buffer thread"; + + os_aio_log_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[1] = "log thread"; + + os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, + n_read_segs); + for (i = 2; i < 2 + n_read_segs; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "read thread"; + } + + os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, + n_write_segs); + for (i = 2 + n_read_segs; i < n_segments; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "write thread"; + } + + os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); + + os_aio_n_segments = n_segments; + + os_aio_validate(); + + os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*)); + + for (i = 0; i < n_segments; i++) { + os_aio_segment_wait_events[i] = os_event_create(NULL); + } + + os_last_printout = time(NULL); +} + +#ifdef WIN_ASYNC_IO +/**************************************************************************** +Wakes up all async i/o threads in the array in Windows async i/o at +shutdown. */ +static +void +os_aio_array_wake_win_aio_at_shutdown( +/*==================================*/ + os_aio_array_t* array) /* in: aio array */ +{ + ulint i; + + for (i = 0; i < array->n_slots; i++) { + + os_event_set((array->slots + i)->event); + } +} +#endif + +/**************************************************************************** +Wakes up all async i/o threads so that they know to exit themselves in +shutdown. */ +UNIV_INTERN +void +os_aio_wake_all_threads_at_shutdown(void) +/*=====================================*/ +{ + ulint i; + +#ifdef WIN_ASYNC_IO + /* This code wakes up all i/o threads in Windows native aio */ + os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array); + os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); + os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); + os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); +#endif + /* This loop wakes up all simulated i/o threads */ + + for (i = 0; i < os_aio_n_segments; i++) { + + os_event_set(os_aio_segment_wait_events[i]); + } +} + +/**************************************************************************** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes.
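+
+The wait uses the is_empty event of os_aio_write_array, which
+os_aio_array_free_slot() sets when the last reserved slot is freed.
+A caller that needs all queued writes pushed to disk could combine
+this with a flush, for instance (sketch only; fil_flush() as declared
+in fil0fil.h):
+
+	os_aio_wait_until_no_pending_writes();
+	fil_flush(space_id);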
*/ +UNIV_INTERN +void +os_aio_wait_until_no_pending_writes(void) +/*=====================================*/ +{ + os_event_wait(os_aio_write_array->is_empty); +} + +/************************************************************************** +Calculates segment number for a slot. */ +static +ulint +os_aio_get_segment_no_from_slot( +/*============================*/ + /* out: segment number (which is the number + used by, for example, i/o-handler threads) */ + os_aio_array_t* array, /* in: aio wait array */ + os_aio_slot_t* slot) /* in: slot in this array */ +{ + ulint segment; + ulint seg_len; + + if (array == os_aio_ibuf_array) { + segment = 0; + + } else if (array == os_aio_log_array) { + segment = 1; + + } else if (array == os_aio_read_array) { + seg_len = os_aio_read_array->n_slots + / os_aio_read_array->n_segments; + + segment = 2 + slot->pos / seg_len; + } else { + ut_a(array == os_aio_write_array); + seg_len = os_aio_write_array->n_slots + / os_aio_write_array->n_segments; + + segment = os_aio_read_array->n_segments + 2 + + slot->pos / seg_len; + } + + return(segment); +} + +/************************************************************************** +Calculates local segment number and aio array from global segment number. */ +static +ulint +os_aio_get_array_and_local_segment( +/*===============================*/ + /* out: local segment number within + the aio array */ + os_aio_array_t** array, /* out: aio wait array */ + ulint global_segment)/* in: global segment number */ +{ + ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (global_segment == 0) { + *array = os_aio_ibuf_array; + segment = 0; + + } else if (global_segment == 1) { + *array = os_aio_log_array; + segment = 0; + + } else if (global_segment < os_aio_read_array->n_segments + 2) { + *array = os_aio_read_array; + + segment = global_segment - 2; + } else { + *array = os_aio_write_array; + + segment = global_segment - (os_aio_read_array->n_segments + 2); + } + + return(segment); +} + +/*********************************************************************** +Requests for a slot in the aio array. If no slot is available, waits until +not_full-event becomes signaled. 
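+
+The slot search below is striped by file offset: with the default
+16 kB page size (UNIV_PAGE_SIZE_SHIFT == 14) the expression
+
+	prim_segment = (offset >> (14 + 6)) % n_segments
+
+maps each aligned 1 MB chunk of a file (64 pages, i.e. the maximum
+read-ahead area) to one preferred segment, so that a read-ahead batch
+against one chunk tends to be handled by a single i/o-handler thread.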
*/ +static +os_aio_slot_t* +os_aio_array_reserve_slot( +/*======================*/ + /* out: pointer to slot */ + ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */ + os_aio_array_t* array, /* in: aio array */ + fil_node_t* message1,/* in: message to be passed along with + the aio operation */ + void* message2,/* in: message to be passed along with + the aio operation */ + os_file_t file, /* in: file handle */ + const char* name, /* in: name of the file or path as a + null-terminated string */ + void* buf, /* in: buffer where to read or from which + to write */ + ulint offset, /* in: least significant 32 bits of file + offset */ + ulint offset_high, /* in: most significant 32 bits of + offset */ + ulint len) /* in: length of the block to read or write */ +{ + os_aio_slot_t* slot; +#ifdef WIN_ASYNC_IO + OVERLAPPED* control; +#endif + ulint i; + ulint prim_segment; + ulint n; + + n = array->n_slots / array->n_segments; + /* 64 blocks' striping ( aligning max(BUF_READ_AHEAD_AREA) ) */ + prim_segment = ( offset >> (UNIV_PAGE_SIZE_SHIFT + 6) ) % (array->n_segments); + +loop: + os_mutex_enter(array->mutex); + + if (array->n_reserved == array->n_slots) { + os_mutex_exit(array->mutex); + + if (!os_aio_use_native_aio) { + /* If the handler threads are suspended, wake them + so that we get more slots */ + + os_aio_simulated_wake_handler_threads(); + } + + os_event_wait(array->not_full); + + goto loop; + } + + for (i = prim_segment * n; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved == FALSE) { + break; + } + } + + if (slot->reserved == TRUE){ + /* Not found after the intended segment. So we should search before. */ + for (i = 0;; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved == FALSE) { + break; + } + } + } + + array->n_reserved++; + + if (array->n_reserved == 1) { + os_event_reset(array->is_empty); + } + + if (array->n_reserved == array->n_slots) { + os_event_reset(array->not_full); + } + + slot->reserved = TRUE; + slot->reservation_time = time(NULL); + slot->message1 = message1; + slot->message2 = message2; + slot->file = file; + slot->name = name; + slot->len = len; + slot->type = type; + slot->buf = buf; + slot->offset = offset; + slot->offset_high = offset_high; + slot->io_already_done = FALSE; + +#ifdef WIN_ASYNC_IO + control = &(slot->control); + control->Offset = (DWORD)offset; + control->OffsetHigh = (DWORD)offset_high; + os_event_reset(slot->event); +#endif + + os_mutex_exit(array->mutex); + + return(slot); +} + +/*********************************************************************** +Frees a slot in the aio array. */ +static +void +os_aio_array_free_slot( +/*===================*/ + os_aio_array_t* array, /* in: aio array */ + os_aio_slot_t* slot) /* in: pointer to slot */ +{ + ut_ad(array); + ut_ad(slot); + + os_mutex_enter(array->mutex); + + ut_ad(slot->reserved); + + slot->reserved = FALSE; + + array->n_reserved--; + + if (array->n_reserved == array->n_slots - 1) { + os_event_set(array->not_full); + } + + if (array->n_reserved == 0) { + os_event_set(array->is_empty); + } + +#ifdef WIN_ASYNC_IO + os_event_reset(slot->event); +#endif + os_mutex_exit(array->mutex); +} + +/************************************************************************** +Wakes up a simulated aio i/o-handler thread if it has something to do. 
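+The global segment number is first mapped back to an (array, local
+segment) pair by os_aio_get_array_and_local_segment(): in the
+18-segment example given for os_aio_init() above, global segment 5 is
+local segment 3 of the read array. The segment's wait event is then
+signaled only if some slot of that segment is currently reserved.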
*/ +static +void +os_aio_simulated_wake_handler_thread( +/*=================================*/ + ulint global_segment) /* in: the number of the segment in the aio + arrays */ +{ + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint segment; + ulint n; + ulint i; + + ut_ad(!os_aio_use_native_aio); + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + + n = array->n_slots / array->n_segments; + + /* Look through n slots after the segment * n'th slot */ + + os_mutex_enter(array->mutex); + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved) { + /* Found an i/o request */ + + break; + } + } + + os_mutex_exit(array->mutex); + + if (i < n) { + os_event_set(os_aio_segment_wait_events[global_segment]); + } +} + +/************************************************************************** +Wakes up simulated aio i/o-handler threads if they have something to do. */ +UNIV_INTERN +void +os_aio_simulated_wake_handler_threads(void) +/*=======================================*/ +{ + ulint i; + + if (os_aio_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = FALSE; + + for (i = 0; i < os_aio_n_segments; i++) { + os_aio_simulated_wake_handler_thread(i); + } +} + +/************************************************************************** +This function can be called if one wants to post a batch of reads and +prefers an i/o-handler thread to handle them all at once later. You must +call os_aio_simulated_wake_handler_threads later to ensure the threads +are not left sleeping! */ +UNIV_INTERN +void +os_aio_simulated_put_read_threads_to_sleep(void) +/*============================================*/ +{ + os_aio_array_t* array; + ulint g; + + os_aio_recommend_sleep_for_read_threads = TRUE; + + for (g = 0; g < os_aio_n_segments; g++) { + os_aio_get_array_and_local_segment(&array, g); + + if (array == os_aio_read_array) { + + os_event_reset(os_aio_segment_wait_events[g]); + } + } +} + +/*********************************************************************** +Requests an asynchronous i/o operation. */ +UNIV_INTERN +ibool +os_aio( +/*===*/ + /* out: TRUE if request was queued + successfully, FALSE if fail */ + ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /* in: OS_AIO_NORMAL, ..., possibly ORed + to OS_AIO_SIMULATED_WAKE_LATER: the + last flag advises this function not to wake + i/o-handler threads, but the caller will + do the waking explicitly later, in this + way the caller can post several requests in + a batch; NOTE that the batch must not be + so big that it exhausts the slots in aio + arrays! NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! 
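+			A batch is typically posted like this (sketch
+			only; the elided arguments are the ordinary
+			os_aio() arguments described below):
+
+			os_aio_simulated_put_read_threads_to_sleep();
+
+			os_aio(OS_FILE_READ, OS_AIO_NORMAL
+			       | OS_AIO_SIMULATED_WAKE_LATER, ...);
+			os_aio(OS_FILE_READ, OS_AIO_NORMAL
+			       | OS_AIO_SIMULATED_WAKE_LATER, ...);
+
+			os_aio_simulated_wake_handler_threads();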
*/ + const char* name, /* in: name of the file or path as a + null-terminated string */ + os_file_t file, /* in: handle to a file */ + void* buf, /* in: buffer where to read or from which + to write */ + ulint offset, /* in: least significant 32 bits of file + offset where to read or write */ + ulint offset_high, /* in: most significant 32 bits of + offset */ + ulint n, /* in: number of bytes to read or write */ + fil_node_t* message1,/* in: messages for the aio handler (these + can be used to identify a completed aio + operation); if mode is OS_AIO_SYNC, these + are ignored */ + void* message2) +{ + os_aio_array_t* array; + os_aio_slot_t* slot; +#ifdef WIN_ASYNC_IO + ibool retval; + BOOL ret = TRUE; + DWORD len = (DWORD) n; + struct fil_node_struct * dummy_mess1; + void* dummy_mess2; + ulint dummy_type; +#endif + ulint err = 0; + ibool retry; + ulint wake_later; + + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(os_aio_validate()); + + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; + mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); + + if (mode == OS_AIO_SYNC +#ifdef WIN_ASYNC_IO + && !os_aio_use_native_aio +#endif + ) { + /* This is actually an ordinary synchronous read or write: + no need to use an i/o-handler thread. NOTE that if we use + Windows async i/o, Windows does not allow us to use + ordinary synchronous os_file_read etc. on the same file, + therefore we have built a special mechanism for synchronous + wait in the Windows case. */ + + if (type == OS_FILE_READ) { + return(os_file_read(file, buf, offset, + offset_high, n)); + } + + ut_a(type == OS_FILE_WRITE); + + return(os_file_write(name, file, buf, offset, offset_high, n)); + } + +try_again: + if (mode == OS_AIO_NORMAL) { + if (type == OS_FILE_READ) { + array = os_aio_read_array; + } else { + array = os_aio_write_array; + } + } else if (mode == OS_AIO_IBUF) { + ut_ad(type == OS_FILE_READ); + /* Reduce probability of deadlock bugs in connection with ibuf: + do not let the ibuf i/o handler sleep */ + + wake_later = FALSE; + + array = os_aio_ibuf_array; + } else if (mode == OS_AIO_LOG) { + + array = os_aio_log_array; + } else if (mode == OS_AIO_SYNC) { + array = os_aio_sync_array; + } else { + array = NULL; /* Eliminate compiler warning */ + ut_error; + } + + slot = os_aio_array_reserve_slot(type, array, message1, message2, file, + name, buf, offset, offset_high, n); + if (type == OS_FILE_READ) { + if (os_aio_use_native_aio) { +#ifdef WIN_ASYNC_IO + os_n_file_reads++; + os_bytes_read_since_printout += len; + + ret = ReadFile(file, buf, (DWORD)n, &len, + &(slot->control)); +#endif + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else if (type == OS_FILE_WRITE) { + if (os_aio_use_native_aio) { +#ifdef WIN_ASYNC_IO + os_n_file_writes++; + ret = WriteFile(file, buf, (DWORD)n, &len, + &(slot->control)); +#endif + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else { + ut_error; + } + +#ifdef WIN_ASYNC_IO + if (os_aio_use_native_aio) { + if ((ret && len == n) + || (!ret && GetLastError() == ERROR_IO_PENDING)) { + /* aio was queued successfully! 
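+		A FALSE return from ReadFile()/WriteFile() together
+		with GetLastError() == ERROR_IO_PENDING is not a
+		failure here: it means the request was accepted by the
+		kernel and is still in flight.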
*/ + + if (mode == OS_AIO_SYNC) { + /* We want a synchronous i/o operation on a + file where we also use async i/o: in Windows + we must use the same wait mechanism as for + async i/o */ + + retval = os_aio_windows_handle(ULINT_UNDEFINED, + slot->pos, + &dummy_mess1, + &dummy_mess2, + &dummy_type); + + return(retval); + } + + return(TRUE); + } + + err = 1; /* Fall through the next if */ + } +#endif + if (err == 0) { + /* aio was queued successfully! */ + + return(TRUE); + } + + os_aio_array_free_slot(array, slot); + + retry = os_file_handle_error(name, + type == OS_FILE_READ + ? "aio read" : "aio write"); + if (retry) { + + goto try_again; + } + + return(FALSE); +} + +#ifdef WIN_ASYNC_IO +/************************************************************************** +This function is only used in Windows asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait the +for completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! */ +UNIV_INTERN +ibool +os_aio_windows_handle( +/*==================*/ + /* out: TRUE if the aio operation succeeded */ + ulint segment, /* in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads; if + this is ULINT_UNDEFINED, then it means that + sync aio is used, and this parameter is + ignored */ + ulint pos, /* this parameter is used only in sync aio: + wait for the aio slot at this position */ + fil_node_t**message1, /* out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ +{ + ulint orig_seg = segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; + ulint i; + ibool ret_val; + BOOL ret; + DWORD len; + + if (segment == ULINT_UNDEFINED) { + array = os_aio_sync_array; + segment = 0; + } else { + segment = os_aio_get_array_and_local_segment(&array, segment); + } + + /* NOTE! We only access constant fields in os_aio_array. 
Therefore + we do not have to acquire the protecting mutex yet */ + + ut_ad(os_aio_validate()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + if (array == os_aio_sync_array) { + os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); + i = pos; + } else { + srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); + i = os_event_wait_multiple(n, + (array->native_events) + + segment * n); + } + + os_mutex_enter(array->mutex); + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + ut_a(slot->reserved); + + if (orig_seg != ULINT_UNDEFINED) { + srv_set_io_thread_op_info(orig_seg, + "get windows aio return value"); + } + + ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE); + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + + if (ret && len == slot->len) { + ret_val = TRUE; + +#ifdef UNIV_DO_FLUSH + if (slot->type == OS_FILE_WRITE + && !os_do_not_call_flush_at_each_write) { + ut_a(TRUE == os_file_flush(slot->file)); + } +#endif /* UNIV_DO_FLUSH */ + } else { + os_file_handle_error(slot->name, "Windows aio"); + + ret_val = FALSE; + } + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); + + return(ret_val); +} +#endif + +/************************************************************************** +Does simulated aio. This function should be called by an i/o-handler +thread. */ +UNIV_INTERN +ibool +os_aio_simulated_handle( +/*====================*/ + /* out: TRUE if the aio operation succeeded */ + ulint global_segment, /* in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads */ + fil_node_t**message1, /* out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ +{ + os_aio_array_t* array; + ulint segment; + os_aio_slot_t* slot; + os_aio_slot_t* slot2; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; + ulint n_consecutive; + ulint total_len; + ulint offs; + ulint lowest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; + byte* combined_buf2; + ibool ret; + ulint n; + ulint i; + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + +restart: + /* NOTE! We only access constant fields in os_aio_array. Therefore + we do not have to acquire the protecting mutex yet */ + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (a)"); + ut_ad(os_aio_validate()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + /* Look through n slots after the segment * n'th slot */ + + if (array == os_aio_read_array + && os_aio_recommend_sleep_for_read_threads) { + + /* Give other threads chance to add several i/os to the array + at once. 
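+		The requests that accumulate during the sleep can then
+		be merged below: e.g. four reserved 16 kB reads at
+		offsets 0, 16384, 32768 and 49152 of the same file are
+		served by a single 64 kB os_file_read() call.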
*/ + + goto recommended_sleep; + } + + os_mutex_enter(array->mutex); + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (b)"); + + /* Check if there is a slot for which the i/o has already been + done */ + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved && slot->io_already_done) { + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: i/o for slot %lu" + " already done, returning\n", + (ulong) i); + } + + ret = TRUE; + + goto slot_io_done; + } + } + + n_consecutive = 0; + + /* If there are at least 2 seconds old requests, then pick the oldest + one to prevent starvation. If several requests have the same age, + then pick the one at the lowest offset. */ + + biggest_age = 0; + lowest_offset = ULINT_MAX; + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved) { + age = (ulint)difftime(time(NULL), + slot->reservation_time); + + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age + && slot->offset < lowest_offset)) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + biggest_age = age; + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + /* There were no old requests. Look for an i/o request at the + lowest offset in the array (we ignore the high 32 bits of the + offset in these heuristics) */ + + lowest_offset = ULINT_MAX; + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, + i + segment * n); + + if (slot->reserved && slot->offset < lowest_offset) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + + slot = consecutive_ios[0]; + + /* Check if there are several consecutive blocks to read or write */ + +consecutive_loop: + for (i = 0; i < n; i++) { + slot2 = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot2->reserved && slot2 != slot + && slot2->offset == slot->offset + slot->len + /* check that sum does not wrap over */ + && slot->offset + slot->len > slot->offset + && slot2->offset_high == slot->offset_high + && slot2->type == slot->type + && slot2->file == slot->file) { + + /* Found a consecutive i/o request */ + + consecutive_ios[n_consecutive] = slot2; + n_consecutive++; + + slot = slot2; + + if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { + + goto consecutive_loop; + } else { + break; + } + } + } + + srv_set_io_thread_op_info(global_segment, "consecutive i/o requests"); + + /* We have now collected n_consecutive i/o requests in the array; + allocate a single buffer which can hold all data, and perform the + i/o */ + + total_len = 0; + slot = consecutive_ios[0]; + + for (i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; + } + + if (n_consecutive == 1) { + /* We can use the buffer of the i/o request */ + combined_buf = slot->buf; + combined_buf2 = NULL; + } else { + combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE); + + ut_a(combined_buf2); + + combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE); + } + + /* We release the array mutex for the time of the i/o: NOTE that + this assumes that there is just one i/o-handler thread serving + a single segment of slots! 
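+	While the mutex is released the slots involved stay reserved
+	and their io_already_done flags are still FALSE, so no other
+	thread can free them or mistake them for completed requests.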
*/ + + os_mutex_exit(array->mutex); + + if (slot->type == OS_FILE_WRITE && n_consecutive > 1) { + /* Copy the buffers to the combined buffer */ + offs = 0; + + for (i = 0; i < n_consecutive; i++) { + + ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf, + consecutive_ios[i]->len); + offs += consecutive_ios[i]->len; + } + } + + srv_set_io_thread_op_info(global_segment, "doing file i/o"); + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: doing i/o of type %lu at offset %lu %lu," + " length %lu\n", + (ulong) slot->type, (ulong) slot->offset_high, + (ulong) slot->offset, (ulong) total_len); + } + + /* Do the i/o with ordinary, synchronous i/o functions: */ + if (slot->type == OS_FILE_WRITE) { + ret = os_file_write(slot->name, slot->file, combined_buf, + slot->offset, slot->offset_high, + total_len); + } else { + ret = os_file_read(slot->file, combined_buf, + slot->offset, slot->offset_high, total_len); + } + + ut_a(ret); + srv_set_io_thread_op_info(global_segment, "file i/o done"); + +#if 0 + fprintf(stderr, + "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n", + n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE); +#endif + + if (slot->type == OS_FILE_READ && n_consecutive > 1) { + /* Copy the combined buffer to individual buffers */ + offs = 0; + + for (i = 0; i < n_consecutive; i++) { + + ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs, + consecutive_ios[i]->len); + offs += consecutive_ios[i]->len; + } + } + + if (combined_buf2) { + ut_free(combined_buf2); + } + + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (i = 0; i < n_consecutive; i++) { + consecutive_ios[i]->io_already_done = TRUE; + } + + /* We return the messages for the first slot now, and if there were + several slots, the messages will be returned with subsequent calls + of this function */ + +slot_io_done: + + ut_a(slot->reserved); + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); + + return(ret); + +wait_for_io: + srv_set_io_thread_op_info(global_segment, "resetting wait event"); + + /* We wait here until there again can be i/os in the segment + of this thread */ + + os_event_reset(os_aio_segment_wait_events[global_segment]); + + os_mutex_exit(array->mutex); + +recommended_sleep: + srv_set_io_thread_op_info(global_segment, "waiting for i/o request"); + + os_event_wait(os_aio_segment_wait_events[global_segment]); + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: i/o handler thread for i/o" + " segment %lu wakes up\n", + (ulong) global_segment); + } + + goto restart; +} + +/************************************************************************** +Validates the consistency of an aio array. */ +static +ibool +os_aio_array_validate( +/*==================*/ + /* out: TRUE if ok */ + os_aio_array_t* array) /* in: aio wait array */ +{ + os_aio_slot_t* slot; + ulint n_reserved = 0; + ulint i; + + ut_a(array); + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + for (i = 0; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { + n_reserved++; + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + os_mutex_exit(array->mutex); + + return(TRUE); +} + +/************************************************************************** +Validates the consistency the aio system. 
*/ +UNIV_INTERN +ibool +os_aio_validate(void) +/*=================*/ + /* out: TRUE if ok */ +{ + os_aio_array_validate(os_aio_read_array); + os_aio_array_validate(os_aio_write_array); + os_aio_array_validate(os_aio_ibuf_array); + os_aio_array_validate(os_aio_log_array); + os_aio_array_validate(os_aio_sync_array); + + return(TRUE); +} + +/************************************************************************** +Prints info of the aio arrays. */ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file) /* in: file where to print */ +{ + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n_reserved; + time_t current_time; + double time_elapsed; + double avg_bytes_read; + ulint i; + + for (i = 0; i < srv_n_file_io_threads; i++) { + fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); + +#ifndef __WIN__ + if (os_aio_segment_wait_events[i]->is_set) { + fprintf(file, " ev set"); + } +#endif + + fprintf(file, "\n"); + } + + fputs("Pending normal aio reads:", file); + + array = os_aio_read_array; +loop: + ut_a(array); + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + n_reserved = 0; + + for (i = 0; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { + n_reserved++; +#if 0 + fprintf(stderr, "Reserved slot, messages %p %p\n", + (void*) slot->message1, + (void*) slot->message2); +#endif + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + fprintf(file, " %lu", (ulong) n_reserved); + + os_mutex_exit(array->mutex); + + if (array == os_aio_read_array) { + fputs(", aio writes:", file); + + array = os_aio_write_array; + + goto loop; + } + + if (array == os_aio_write_array) { + fputs(",\n ibuf aio reads:", file); + array = os_aio_ibuf_array; + + goto loop; + } + + if (array == os_aio_ibuf_array) { + fputs(", log i/o's:", file); + array = os_aio_log_array; + + goto loop; + } + + if (array == os_aio_log_array) { + fputs(", sync i/o's:", file); + array = os_aio_sync_array; + + goto loop; + } + + putc('\n', file); + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" + "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); + + if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { + fprintf(file, + "%lu pending preads, %lu pending pwrites\n", + (ulong) os_file_n_pending_preads, + (ulong) os_file_n_pending_pwrites); + } + + if (os_n_file_reads == os_n_file_reads_old) { + avg_bytes_read = 0.0; + } else { + avg_bytes_read = (double) os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + } + + fprintf(file, + "%.2f reads/s, %lu avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + (os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + (ulong)avg_bytes_read, + (os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/************************************************************************** +Refreshes the statistics used to print per-second averages. 
*/ +UNIV_INTERN +void +os_aio_refresh_stats(void) +/*======================*/ +{ + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + +#ifdef UNIV_DEBUG +/************************************************************************** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. */ +UNIV_INTERN +ibool +os_aio_all_slots_free(void) +/*=======================*/ + /* out: TRUE if all free */ +{ + os_aio_array_t* array; + ulint n_res = 0; + + array = os_aio_read_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_write_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_ibuf_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_log_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_sync_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (n_res == 0) { + + return(TRUE); + } + + return(FALSE); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/xtradb/os/os0proc.c b/storage/xtradb/os/os0proc.c new file mode 100644 index 00000000000..8d4a71f8c4e --- /dev/null +++ b/storage/xtradb/os/os0proc.c @@ -0,0 +1,262 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The interface to the operating system +process control primitives + +Created 9/30/1995 Heikki Tuuri +*******************************************************/ + +#include "os0proc.h" +#ifdef UNIV_NONINL +#include "os0proc.ic" +#endif + +#include "ut0mem.h" +#include "ut0byte.h" + +/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and +MAP_ANON but MAP_ANON is marked as deprecated */ +#if defined(MAP_ANONYMOUS) +#define OS_MAP_ANON MAP_ANONYMOUS +#elif defined(MAP_ANON) +#define OS_MAP_ANON MAP_ANON +#endif + +UNIV_INTERN ibool os_use_large_pages; +/* Large page size. This may be a boot-time option on some platforms */ +UNIV_INTERN ulint os_large_page_size; + +/******************************************************************** +Converts the current process id to a number. It is not guaranteed that the +number is unique. In Linux returns the 'process number' of the current +thread. That number is the same as one sees in 'top', for example. In Linux +the thread id is not the same as one sees in 'top'. 
*/ +UNIV_INTERN +ulint +os_proc_get_number(void) +/*====================*/ +{ +#ifdef __WIN__ + return((ulint)GetCurrentProcessId()); +#else + return((ulint)getpid()); +#endif +} + +/******************************************************************** +Allocates large pages memory. */ +UNIV_INTERN +void* +os_mem_alloc_large( +/*===============*/ + /* out: allocated memory */ + ulint* n) /* in/out: number of bytes */ +{ + void* ptr; + ulint size; +#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX + int shmid; + struct shmid_ds buf; + + if (!os_use_large_pages || !os_large_page_size) { + goto skip; + } + + /* Align block size to os_large_page_size */ + ut_ad(ut_is_2pow(os_large_page_size)); + size = ut_2pow_round(*n + (os_large_page_size - 1), + os_large_page_size); + + shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W); + if (shmid < 0) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate" + " %lu bytes. errno %d\n", size, errno); + ptr = NULL; + } else { + ptr = shmat(shmid, NULL, 0); + if (ptr == (void *)-1) { + fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to" + " attach shared memory segment, errno %d\n", + errno); + } + + /* Remove the shared memory segment so that it will be + automatically freed after memory is detached or + process exits */ + shmctl(shmid, IPC_RMID, &buf); + } + + if (ptr) { + *n = size; + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); +# ifdef UNIV_SET_MEM_TO_ZERO + memset(ptr, '\0', size); +# endif + UNIV_MEM_ALLOC(ptr, size); + return(ptr); + } + + fprintf(stderr, "InnoDB HugeTLB: Warning: Using conventional" + " memory pool\n"); +skip: +#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */ + +#ifdef __WIN__ + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + + /* Align block size to system page size */ + ut_ad(ut_is_2pow(system_info.dwPageSize)); + /* system_info.dwPageSize is only 32-bit. Casting to ulint is required + on 64-bit Windows. */ + size = *n = ut_2pow_round(*n + (system_info.dwPageSize - 1), + (ulint) system_info.dwPageSize); + ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, + PAGE_READWRITE); + if (!ptr) { + fprintf(stderr, "InnoDB: VirtualAlloc(%lu bytes) failed;" + " Windows error %lu\n", + (ulong) size, (ulong) GetLastError()); + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_ALLOC(ptr, size); + } +#elif defined __NETWARE__ || !defined OS_MAP_ANON + size = *n; + ptr = ut_malloc_low(size, TRUE, FALSE); +#else +# ifdef HAVE_GETPAGESIZE + size = getpagesize(); +# else + size = UNIV_PAGE_SIZE; +# endif + /* Align block size to system page size */ + ut_ad(ut_is_2pow(size)); + size = *n = ut_2pow_round(*n + (size - 1), size); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | OS_MAP_ANON, -1, 0); + if (UNIV_UNLIKELY(ptr == (void*) -1)) { + fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;" + " errno %lu\n", + (ulong) size, (ulong) errno); + ptr = NULL; + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_total_allocated_memory += size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_ALLOC(ptr, size); + } +#endif + return(ptr); +} + +/******************************************************************** +Frees large pages memory. 
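+The path taken here mirrors the one chosen in os_mem_alloc_large():
+shmdt() for a HugeTLB shared memory segment, VirtualFree() on Windows,
+ut_free() or munmap() otherwise. The size argument must be the
+rounded-up value that os_mem_alloc_large() returned in *n; passing
+anything else would make the munmap() length wrong.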
*/ +UNIV_INTERN +void +os_mem_free_large( +/*==============*/ + void *ptr, /* in: pointer returned by + os_mem_alloc_large() */ + ulint size) /* in: size returned by + os_mem_alloc_large() */ +{ + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + os_fast_mutex_unlock(&ut_list_mutex); + +#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX + if (os_use_large_pages && os_large_page_size && !shmdt(ptr)) { + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + ut_total_allocated_memory -= size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_FREE(ptr, size); + return; + } +#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */ +#ifdef __WIN__ + /* When RELEASE memory, the size parameter must be 0. + Do not use MEM_RELEASE with MEM_DECOMMIT. */ + if (!VirtualFree(ptr, 0, MEM_RELEASE)) { + fprintf(stderr, "InnoDB: VirtualFree(%p, %lu) failed;" + " Windows error %lu\n", + ptr, (ulong) size, (ulong) GetLastError()); + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + ut_total_allocated_memory -= size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_FREE(ptr, size); + } +#elif defined __NETWARE__ || !defined OS_MAP_ANON + ut_free(ptr); +#else + if (munmap(ptr, size)) { + fprintf(stderr, "InnoDB: munmap(%p, %lu) failed;" + " errno %lu\n", + ptr, (ulong) size, (ulong) errno); + } else { + os_fast_mutex_lock(&ut_list_mutex); + ut_a(ut_total_allocated_memory >= size); + ut_total_allocated_memory -= size; + os_fast_mutex_unlock(&ut_list_mutex); + UNIV_MEM_FREE(ptr, size); + } +#endif +} + +/******************************************************************** +Sets the priority boost for threads released from waiting within the current +process. */ +UNIV_INTERN +void +os_process_set_priority_boost( +/*==========================*/ + ibool do_boost) /* in: TRUE if priority boost should be done, + FALSE if not */ +{ +#ifdef __WIN__ + ibool no_boost; + + if (do_boost) { + no_boost = FALSE; + } else { + no_boost = TRUE; + } + +#if TRUE != 1 +# error "TRUE != 1" +#endif + + /* Does not do anything currently! + SetProcessPriorityBoost(GetCurrentProcess(), no_boost); + */ + fputs("Warning: process priority boost setting" + " currently not functional!\n", + stderr); +#else + UT_NOT_USED(do_boost); +#endif +} diff --git a/storage/xtradb/os/os0sync.c b/storage/xtradb/os/os0sync.c new file mode 100644 index 00000000000..78ff74059f8 --- /dev/null +++ b/storage/xtradb/os/os0sync.c @@ -0,0 +1,769 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The interface to the operating system +synchronization primitives. 
+ +Created 9/6/1995 Heikki Tuuri +*******************************************************/ + +#include "os0sync.h" +#ifdef UNIV_NONINL +#include "os0sync.ic" +#endif + +#ifdef __WIN__ +#include <windows.h> +#endif + +#include "ut0mem.h" +#include "srv0start.h" + +/* Type definition for an operating system mutex struct */ +struct os_mutex_struct{ + os_event_t event; /* Used by sync0arr.c for queueing threads */ + void* handle; /* OS handle to mutex */ + ulint count; /* we use this counter to check + that the same thread does not + recursively lock the mutex: we + do not assume that the OS mutex + supports recursive locking, though + NT seems to do that */ + UT_LIST_NODE_T(os_mutex_str_t) os_mutex_list; + /* list of all 'slow' OS mutexes created */ +}; + +/* Mutex protecting counts and the lists of OS mutexes and events */ +UNIV_INTERN os_mutex_t os_sync_mutex; +static ibool os_sync_mutex_inited = FALSE; +static ibool os_sync_free_called = FALSE; + +/* This is incremented by 1 in os_thread_create and decremented by 1 in +os_thread_exit */ +UNIV_INTERN ulint os_thread_count = 0; + +/* The list of all events created */ +static UT_LIST_BASE_NODE_T(os_event_struct_t) os_event_list; + +/* The list of all OS 'slow' mutexes */ +static UT_LIST_BASE_NODE_T(os_mutex_str_t) os_mutex_list; + +UNIV_INTERN ulint os_event_count = 0; +UNIV_INTERN ulint os_mutex_count = 0; +UNIV_INTERN ulint os_fast_mutex_count = 0; + +/* Because a mutex is embedded inside an event and there is an +event embedded inside a mutex, on free, this generates a recursive call. +This version of the free event function doesn't acquire the global lock */ +static void os_event_free_internal(os_event_t event); + +/************************************************************* +Initializes global event and OS 'slow' mutex lists. */ +UNIV_INTERN +void +os_sync_init(void) +/*==============*/ +{ + UT_LIST_INIT(os_event_list); + UT_LIST_INIT(os_mutex_list); + + os_sync_mutex = os_mutex_create(NULL); + + os_sync_mutex_inited = TRUE; +} + +/************************************************************* +Frees created events and OS 'slow' mutexes. */ +UNIV_INTERN +void +os_sync_free(void) +/*==============*/ +{ + os_event_t event; + os_mutex_t mutex; + + os_sync_free_called = TRUE; + event = UT_LIST_GET_FIRST(os_event_list); + + while (event) { + + os_event_free(event); + + event = UT_LIST_GET_FIRST(os_event_list); + } + + mutex = UT_LIST_GET_FIRST(os_mutex_list); + + while (mutex) { + if (mutex == os_sync_mutex) { + /* Set the flag to FALSE so that we do not try to + reserve os_sync_mutex any more in remaining freeing + operations in shutdown */ + os_sync_mutex_inited = FALSE; + } + + os_mutex_free(mutex); + + mutex = UT_LIST_GET_FIRST(os_mutex_list); + } + os_sync_free_called = FALSE; +} + +/************************************************************* +Creates an event semaphore, i.e., a semaphore which may just have two +states: signaled and nonsignaled. The created event is manual reset: it +must be reset explicitly by calling os_event_reset().
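+
+A minimal lifecycle sketch (all functions as declared in os0sync.h;
+os_event_wait() is the plain wrapper around os_event_wait_low()):
+
+	os_event_t	ev = os_event_create(NULL);
+
+	os_event_set(ev);	-- wake current and future waiters
+	os_event_wait(ev);	-- returns at once while ev is set
+	os_event_reset(ev);	-- back to the nonsignaled state
+
+	os_event_free(ev);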
*/ +UNIV_INTERN +os_event_t +os_event_create( +/*============*/ + /* out: the event handle */ + const char* name) /* in: the name of the event, if NULL + the event is created without a name */ +{ +#ifdef __WIN__ + os_event_t event; + + event = ut_malloc(sizeof(struct os_event_struct)); + + event->handle = CreateEvent(NULL, /* No security attributes */ + TRUE, /* Manual reset */ + FALSE, /* Initial state nonsignaled */ + (LPCTSTR) name); + if (!event->handle) { + fprintf(stderr, + "InnoDB: Could not create a Windows event semaphore;" + " Windows error %lu\n", + (ulong) GetLastError()); + } +#else /* Unix */ + os_event_t event; + + UT_NOT_USED(name); + + event = ut_malloc(sizeof(struct os_event_struct)); + + os_fast_mutex_init(&(event->os_mutex)); + +#if defined(UNIV_HOTBACKUP) && defined(UNIV_HPUX10) + ut_a(0 == pthread_cond_init(&(event->cond_var), + pthread_condattr_default)); +#else + ut_a(0 == pthread_cond_init(&(event->cond_var), NULL)); +#endif + event->is_set = FALSE; + + /* We return this value in os_event_reset(), which can then be + be used to pass to the os_event_wait_low(). The value of zero + is reserved in os_event_wait_low() for the case when the + caller does not want to pass any signal_count value. To + distinguish between the two cases we initialize signal_count + to 1 here. */ + event->signal_count = 1; +#endif /* __WIN__ */ + + /* The os_sync_mutex can be NULL because during startup an event + can be created [ because it's embedded in the mutex/rwlock ] before + this module has been initialized */ + if (os_sync_mutex != NULL) { + os_mutex_enter(os_sync_mutex); + } + + /* Put to the list of events */ + UT_LIST_ADD_FIRST(os_event_list, os_event_list, event); + + os_event_count++; + + if (os_sync_mutex != NULL) { + os_mutex_exit(os_sync_mutex); + } + + return(event); +} + +#ifdef __WIN__ +/************************************************************* +Creates an auto-reset event semaphore, i.e., an event which is automatically +reset when a single thread is released. Works only in Windows. */ +UNIV_INTERN +os_event_t +os_event_create_auto( +/*=================*/ + /* out: the event handle */ + const char* name) /* in: the name of the event, if NULL + the event is created without a name */ +{ + os_event_t event; + + event = ut_malloc(sizeof(struct os_event_struct)); + + event->handle = CreateEvent(NULL, /* No security attributes */ + FALSE, /* Auto-reset */ + FALSE, /* Initial state nonsignaled */ + (LPCTSTR) name); + + if (!event->handle) { + fprintf(stderr, + "InnoDB: Could not create a Windows auto" + " event semaphore; Windows error %lu\n", + (ulong) GetLastError()); + } + + /* Put to the list of events */ + os_mutex_enter(os_sync_mutex); + + UT_LIST_ADD_FIRST(os_event_list, os_event_list, event); + + os_event_count++; + + os_mutex_exit(os_sync_mutex); + + return(event); +} +#endif + +/************************************************************** +Sets an event semaphore to the signaled state: lets waiting threads +proceed. 
*/ +UNIV_INTERN +void +os_event_set( +/*=========*/ + os_event_t event) /* in: event to set */ +{ +#ifdef __WIN__ + ut_a(event); + ut_a(SetEvent(event->handle)); +#else + ut_a(event); + + os_fast_mutex_lock(&(event->os_mutex)); + + if (event->is_set) { + /* Do nothing */ + } else { + event->is_set = TRUE; + event->signal_count += 1; + ut_a(0 == pthread_cond_broadcast(&(event->cond_var))); + } + + os_fast_mutex_unlock(&(event->os_mutex)); +#endif +} + +/************************************************************** +Resets an event semaphore to the nonsignaled state. Waiting threads will +stop to wait for the event. +The return value should be passed to os_even_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. See comments for os_event_wait_low(). */ +UNIV_INTERN +ib_int64_t +os_event_reset( +/*===========*/ + /* out: current signal_count. */ + os_event_t event) /* in: event to reset */ +{ + ib_int64_t ret = 0; + +#ifdef __WIN__ + ut_a(event); + + ut_a(ResetEvent(event->handle)); +#else + ut_a(event); + + os_fast_mutex_lock(&(event->os_mutex)); + + if (!event->is_set) { + /* Do nothing */ + } else { + event->is_set = FALSE; + } + ret = event->signal_count; + + os_fast_mutex_unlock(&(event->os_mutex)); +#endif + return(ret); +} + +/************************************************************** +Frees an event object, without acquiring the global lock. */ +static +void +os_event_free_internal( +/*===================*/ + os_event_t event) /* in: event to free */ +{ +#ifdef __WIN__ + ut_a(event); + + ut_a(CloseHandle(event->handle)); +#else + ut_a(event); + + /* This is to avoid freeing the mutex twice */ + os_fast_mutex_free(&(event->os_mutex)); + + ut_a(0 == pthread_cond_destroy(&(event->cond_var))); +#endif + /* Remove from the list of events */ + + UT_LIST_REMOVE(os_event_list, os_event_list, event); + + os_event_count--; + + ut_free(event); +} + +/************************************************************** +Frees an event object. */ +UNIV_INTERN +void +os_event_free( +/*==========*/ + os_event_t event) /* in: event to free */ + +{ +#ifdef __WIN__ + ut_a(event); + + ut_a(CloseHandle(event->handle)); +#else + ut_a(event); + + os_fast_mutex_free(&(event->os_mutex)); + ut_a(0 == pthread_cond_destroy(&(event->cond_var))); +#endif + /* Remove from the list of events */ + + os_mutex_enter(os_sync_mutex); + + UT_LIST_REMOVE(os_event_list, os_event_list, event); + + os_event_count--; + + os_mutex_exit(os_sync_mutex); + + ut_free(event); +} + +/************************************************************** +Waits for an event object until it is in the signaled state. If +srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the +waiting thread when the event becomes signaled (or immediately if the +event is already in the signaled state). + +Typically, if the event has been signalled after the os_event_reset() +we'll return immediately because event->is_set == TRUE. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->is_set == TRUE] +thread C calls os_event_reset() [event->is_set == FALSE] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. 
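+
+In code the safe pattern is therefore
+
+	sig_count = os_event_reset(event);
+	... recheck the condition being waited on ...
+	os_event_wait_low(event, sig_count);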
*/ +UNIV_INTERN +void +os_event_wait_low( +/*==============*/ + os_event_t event, /* in: event to wait */ + ib_int64_t reset_sig_count)/* in: zero or the value + returned by previous call of + os_event_reset(). */ +{ +#ifdef __WIN__ + DWORD err; + + ut_a(event); + + UT_NOT_USED(reset_sig_count); + + /* Specify an infinite time limit for waiting */ + err = WaitForSingleObject(event->handle, INFINITE); + + ut_a(err == WAIT_OBJECT_0); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_thread_exit(NULL); + } +#else + ib_int64_t old_signal_count; + + os_fast_mutex_lock(&(event->os_mutex)); + + if (reset_sig_count) { + old_signal_count = reset_sig_count; + } else { + old_signal_count = event->signal_count; + } + + for (;;) { + if (event->is_set == TRUE + || event->signal_count != old_signal_count) { + + os_fast_mutex_unlock(&(event->os_mutex)); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + + os_thread_exit(NULL); + } + /* Ok, we may return */ + + return; + } + + pthread_cond_wait(&(event->cond_var), &(event->os_mutex)); + + /* Solaris manual said that spurious wakeups may occur: we + have to check if the event really has been signaled after + we came here to wait */ + } +#endif +} + +/************************************************************** +Waits for an event object until it is in the signaled state or +a timeout is exceeded. In Unix the timeout is always infinite. */ +UNIV_INTERN +ulint +os_event_wait_time( +/*===============*/ + /* out: 0 if success, OS_SYNC_TIME_EXCEEDED if + timeout was exceeded */ + os_event_t event, /* in: event to wait */ + ulint time) /* in: timeout in microseconds, or + OS_SYNC_INFINITE_TIME */ +{ +#ifdef __WIN__ + DWORD err; + + ut_a(event); + + if (time != OS_SYNC_INFINITE_TIME) { + err = WaitForSingleObject(event->handle, (DWORD) time / 1000); + } else { + err = WaitForSingleObject(event->handle, INFINITE); + } + + if (err == WAIT_OBJECT_0) { + + return(0); + } else if (err == WAIT_TIMEOUT) { + + return(OS_SYNC_TIME_EXCEEDED); + } else { + ut_error; + return(1000000); /* dummy value to eliminate compiler warn. */ + } +#else + UT_NOT_USED(time); + + /* In Posix this is just an ordinary, infinite wait */ + + os_event_wait(event); + + return(0); +#endif +} + +#ifdef __WIN__ +/************************************************************** +Waits for any event in an OS native event array. Returns if even a single +one is signaled or becomes signaled. */ +UNIV_INTERN +ulint +os_event_wait_multiple( +/*===================*/ + /* out: index of the event + which was signaled */ + ulint n, /* in: number of events in the + array */ + os_native_event_t* native_event_array) + /* in: pointer to an array of event + handles */ +{ + DWORD index; + + ut_a(native_event_array); + ut_a(n > 0); + + index = WaitForMultipleObjects((DWORD) n, native_event_array, + FALSE, /* Wait for any 1 event */ + INFINITE); /* Infinite wait time + limit */ + ut_a(index >= WAIT_OBJECT_0); /* NOTE: Pointless comparision */ + ut_a(index < WAIT_OBJECT_0 + n); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_thread_exit(NULL); + } + + return(index - WAIT_OBJECT_0); +} +#endif + +/************************************************************* +Creates an operating system mutex semaphore. Because these are slow, the +mutex semaphore of InnoDB itself (mutex_t) should be used where possible. 
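+
+Note also that these mutexes are not recursive: os_mutex_enter()
+asserts mutex->count == 1 after acquiring, which on Windows (where the
+native mutex would allow recursive entry) turns a recursive lock into
+an assertion failure; on Unix a recursive attempt typically deadlocks
+on the underlying pthread mutex instead.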
*/ +UNIV_INTERN +os_mutex_t +os_mutex_create( +/*============*/ + /* out: the mutex handle */ + const char* name) /* in: the name of the mutex, if NULL + the mutex is created without a name */ +{ +#ifdef __WIN__ + HANDLE mutex; + os_mutex_t mutex_str; + + mutex = CreateMutex(NULL, /* No security attributes */ + FALSE, /* Initial state: no owner */ + (LPCTSTR) name); + ut_a(mutex); +#else + os_fast_mutex_t* mutex; + os_mutex_t mutex_str; + + UT_NOT_USED(name); + + mutex = ut_malloc(sizeof(os_fast_mutex_t)); + + os_fast_mutex_init(mutex); +#endif + mutex_str = ut_malloc(sizeof(os_mutex_str_t)); + + mutex_str->handle = mutex; + mutex_str->count = 0; + mutex_str->event = os_event_create(NULL); + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + /* When creating os_sync_mutex itself we cannot reserve it */ + os_mutex_enter(os_sync_mutex); + } + + UT_LIST_ADD_FIRST(os_mutex_list, os_mutex_list, mutex_str); + + os_mutex_count++; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } + + return(mutex_str); +} + +/************************************************************** +Acquires ownership of a mutex semaphore. */ +UNIV_INTERN +void +os_mutex_enter( +/*===========*/ + os_mutex_t mutex) /* in: mutex to acquire */ +{ +#ifdef __WIN__ + DWORD err; + + ut_a(mutex); + + /* Specify infinite time limit for waiting */ + err = WaitForSingleObject(mutex->handle, INFINITE); + + ut_a(err == WAIT_OBJECT_0); + + (mutex->count)++; + ut_a(mutex->count == 1); +#else + os_fast_mutex_lock(mutex->handle); + + (mutex->count)++; + + ut_a(mutex->count == 1); +#endif +} + +/************************************************************** +Releases ownership of a mutex. */ +UNIV_INTERN +void +os_mutex_exit( +/*==========*/ + os_mutex_t mutex) /* in: mutex to release */ +{ + ut_a(mutex); + + ut_a(mutex->count == 1); + + (mutex->count)--; +#ifdef __WIN__ + ut_a(ReleaseMutex(mutex->handle)); +#else + os_fast_mutex_unlock(mutex->handle); +#endif +} + +/************************************************************** +Frees a mutex object. */ +UNIV_INTERN +void +os_mutex_free( +/*==========*/ + os_mutex_t mutex) /* in: mutex to free */ +{ + ut_a(mutex); + + if (UNIV_LIKELY(!os_sync_free_called)) { + os_event_free_internal(mutex->event); + } + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_enter(os_sync_mutex); + } + + UT_LIST_REMOVE(os_mutex_list, os_mutex_list, mutex); + + os_mutex_count--; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } + +#ifdef __WIN__ + ut_a(CloseHandle(mutex->handle)); + + ut_free(mutex); +#else + os_fast_mutex_free(mutex->handle); + ut_free(mutex->handle); + ut_free(mutex); +#endif +} + +/************************************************************* +Initializes an operating system fast mutex semaphore. 
*/ +UNIV_INTERN +void +os_fast_mutex_init( +/*===============*/ + os_fast_mutex_t* fast_mutex) /* in: fast mutex */ +{ +#ifdef __WIN__ + ut_a(fast_mutex); + + InitializeCriticalSection((LPCRITICAL_SECTION) fast_mutex); +#else +#if defined(UNIV_HOTBACKUP) && defined(UNIV_HPUX10) + ut_a(0 == pthread_mutex_init(fast_mutex, pthread_mutexattr_default)); +#else + ut_a(0 == pthread_mutex_init(fast_mutex, MY_MUTEX_INIT_FAST)); +#endif +#endif + if (UNIV_LIKELY(os_sync_mutex_inited)) { + /* When creating os_sync_mutex itself (in Unix) we cannot + reserve it */ + + os_mutex_enter(os_sync_mutex); + } + + os_fast_mutex_count++; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } +} + +/************************************************************** +Acquires ownership of a fast mutex. */ +UNIV_INTERN +void +os_fast_mutex_lock( +/*===============*/ + os_fast_mutex_t* fast_mutex) /* in: mutex to acquire */ +{ +#ifdef __WIN__ + EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex); +#else + pthread_mutex_lock(fast_mutex); +#endif +} + +/************************************************************** +Releases ownership of a fast mutex. */ +UNIV_INTERN +void +os_fast_mutex_unlock( +/*=================*/ + os_fast_mutex_t* fast_mutex) /* in: mutex to release */ +{ +#ifdef __WIN__ + LeaveCriticalSection(fast_mutex); +#else + pthread_mutex_unlock(fast_mutex); +#endif +} + +/************************************************************** +Frees a mutex object. */ +UNIV_INTERN +void +os_fast_mutex_free( +/*===============*/ + os_fast_mutex_t* fast_mutex) /* in: mutex to free */ +{ +#ifdef __WIN__ + ut_a(fast_mutex); + + DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex); +#else + int ret; + + ret = pthread_mutex_destroy(fast_mutex); + + if (UNIV_UNLIKELY(ret != 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: error: return value %lu when calling\n" + "InnoDB: pthread_mutex_destroy().\n", (ulint)ret); + fprintf(stderr, + "InnoDB: Byte contents of the pthread mutex at %p:\n", + (void*) fast_mutex); + ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t)); + putc('\n', stderr); + } +#endif + if (UNIV_LIKELY(os_sync_mutex_inited)) { + /* When freeing the last mutexes, we have + already freed os_sync_mutex */ + + os_mutex_enter(os_sync_mutex); + } + + os_fast_mutex_count--; + + if (UNIV_LIKELY(os_sync_mutex_inited)) { + os_mutex_exit(os_sync_mutex); + } +} diff --git a/storage/xtradb/os/os0thread.c b/storage/xtradb/os/os0thread.c new file mode 100644 index 00000000000..7d0a57ae17c --- /dev/null +++ b/storage/xtradb/os/os0thread.c @@ -0,0 +1,365 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The interface to the operating system thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0thread.h"
+#ifdef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#ifdef __WIN__
+#include <windows.h>
+#endif
+
+#include "srv0srv.h"
+#include "os0sync.h"
+
+/*******************************************************************
+Compares two thread ids for equality. */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+				/* out: TRUE if equal */
+	os_thread_id_t	a,	/* in: OS thread or thread id */
+	os_thread_id_t	b)	/* in: OS thread or thread id */
+{
+#ifdef __WIN__
+	if (a == b) {
+		return(TRUE);
+	}
+
+	return(FALSE);
+#else
+	if (pthread_equal(a, b)) {
+		return(TRUE);
+	}
+
+	return(FALSE);
+#endif
+}
+
+/********************************************************************
+Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
+unique for the thread though! */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+	os_thread_id_t	a)
+{
+#ifdef UNIV_HPUX10
+	/* In HP-UX-10.20 a pthread_t is a struct of 3 fields: field1, field2,
+	field3. We do not know if field1 determines the thread uniquely. */
+
+	return((ulint)(a.field1));
+#else
+	return((ulint)a);
+#endif
+}
+
+/*********************************************************************
+Returns the thread identifier of the current thread. Currently the thread
+identifier in Unix is the thread handle itself. Note that in HP-UX
+pthread_t is a struct of 3 fields. */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void)
+/*=======================*/
+{
+#ifdef __WIN__
+	return(GetCurrentThreadId());
+#else
+	return(pthread_self());
+#endif
+}
+
+/********************************************************************
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter
+and returns an ulint.
*/ +UNIV_INTERN +os_thread_t +os_thread_create( +/*=============*/ + /* out: handle to the thread */ +#ifndef __WIN__ + os_posix_f_t start_f, +#else + ulint (*start_f)(void*), /* in: pointer to function + from which to start */ +#endif + void* arg, /* in: argument to start + function */ + os_thread_id_t* thread_id) /* out: id of the created + thread, or NULL */ +{ +#ifdef __WIN__ + os_thread_t thread; + DWORD win_thread_id; + + os_mutex_enter(os_sync_mutex); + os_thread_count++; + os_mutex_exit(os_sync_mutex); + + thread = CreateThread(NULL, /* no security attributes */ + 0, /* default size stack */ + (LPTHREAD_START_ROUTINE)start_f, + arg, + 0, /* thread runs immediately */ + &win_thread_id); + + if (srv_set_thread_priorities) { + + /* Set created thread priority the same as a normal query + in MYSQL: we try to prevent starvation of threads by + assigning same priority QUERY_PRIOR to all */ + + ut_a(SetThreadPriority(thread, srv_query_thread_priority)); + } + + if (thread_id) { + *thread_id = win_thread_id; + } + + return(thread); +#else + int ret; + os_thread_t pthread; + pthread_attr_t attr; + +#if !(defined(UNIV_HOTBACKUP) && defined(UNIV_HPUX10)) + pthread_attr_init(&attr); +#endif + +#ifdef UNIV_AIX + /* We must make sure a thread stack is at least 32 kB, otherwise + InnoDB might crash; we do not know if the default stack size on + AIX is always big enough. An empirical test on AIX-4.3 suggested + the size was 96 kB, though. */ + + ret = pthread_attr_setstacksize(&attr, + (size_t)(PTHREAD_STACK_MIN + + 32 * 1024)); + if (ret) { + fprintf(stderr, + "InnoDB: Error: pthread_attr_setstacksize" + " returned %d\n", ret); + exit(1); + } +#endif +#ifdef __NETWARE__ + ret = pthread_attr_setstacksize(&attr, + (size_t) NW_THD_STACKSIZE); + if (ret) { + fprintf(stderr, + "InnoDB: Error: pthread_attr_setstacksize" + " returned %d\n", ret); + exit(1); + } +#endif + os_mutex_enter(os_sync_mutex); + os_thread_count++; + os_mutex_exit(os_sync_mutex); + +#if defined(UNIV_HOTBACKUP) && defined(UNIV_HPUX10) + ret = pthread_create(&pthread, pthread_attr_default, start_f, arg); +#else + ret = pthread_create(&pthread, &attr, start_f, arg); +#endif + if (ret) { + fprintf(stderr, + "InnoDB: Error: pthread_create returned %d\n", ret); + exit(1); + } + +#if !(defined(UNIV_HOTBACKUP) && defined(UNIV_HPUX10)) + pthread_attr_destroy(&attr); +#endif + if (srv_set_thread_priorities) { + + my_pthread_setprio(pthread, srv_query_thread_priority); + } + + if (thread_id) { + *thread_id = pthread; + } + + return(pthread); +#endif +} + +/********************************************************************* +Exits the current thread. */ +UNIV_INTERN +void +os_thread_exit( +/*===========*/ + void* exit_value) /* in: exit value; in Windows this void* + is cast as a DWORD */ +{ +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Thread exits, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); +#endif + os_mutex_enter(os_sync_mutex); + os_thread_count--; + os_mutex_exit(os_sync_mutex); + +#ifdef __WIN__ + ExitThread((DWORD)exit_value); +#else + pthread_exit(exit_value); +#endif +} + +/********************************************************************* +Returns handle to the current thread. */ +UNIV_INTERN +os_thread_t +os_thread_get_curr(void) +/*====================*/ +{ +#ifdef __WIN__ + return(GetCurrentThread()); +#else + return(pthread_self()); +#endif +} + +/********************************************************************* +Advises the os to give up remainder of the thread's time slice. 
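The AIX and NetWare branches of os_thread_create() exist because pthread_create() inherits a platform default stack size that may be too small. A self-contained version of that attribute handling; the PTHREAD_STACK_MIN + 32 kB figure is the one quoted in the AIX comment above, not a general recommendation:

	#include <limits.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static void* worker(void* arg)
	{
		(void) arg;
		return(NULL);
	}

	int main(void)
	{
		pthread_t	tid;
		pthread_attr_t	attr;
		int		ret;

		pthread_attr_init(&attr);

		/* Guarantee a minimum stack, as the AIX branch does. */
		ret = pthread_attr_setstacksize(&attr,
						(size_t) (PTHREAD_STACK_MIN
							  + 32 * 1024));
		if (ret) {
			fprintf(stderr, "setstacksize returned %d\n", ret);
			exit(1);
		}

		ret = pthread_create(&tid, &attr, worker, NULL);
		if (ret) {
			fprintf(stderr, "pthread_create returned %d\n", ret);
			exit(1);
		}

		pthread_attr_destroy(&attr);
		pthread_join(tid, NULL);
		return(0);
	}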
*/ +UNIV_INTERN +void +os_thread_yield(void) +/*=================*/ +{ +#if defined(__WIN__) + Sleep(0); +#elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H)) + sched_yield(); +#elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG) + pthread_yield(); +#elif defined(HAVE_PTHREAD_YIELD_ONE_ARG) + pthread_yield(0); +#else + os_thread_sleep(0); +#endif +} + +/********************************************************************* +The thread sleeps at least the time given in microseconds. */ +UNIV_INTERN +void +os_thread_sleep( +/*============*/ + ulint tm) /* in: time in microseconds */ +{ +#ifdef __WIN__ + Sleep((DWORD) tm / 1000); +#elif defined(__NETWARE__) + delay(tm / 1000); +#else + struct timeval t; + + t.tv_sec = tm / 1000000; + t.tv_usec = tm % 1000000; + + select(0, NULL, NULL, NULL, &t); +#endif +} + +/********************************************************************** +Sets a thread priority. */ +UNIV_INTERN +void +os_thread_set_priority( +/*===================*/ + os_thread_t handle, /* in: OS handle to the thread */ + ulint pri) /* in: priority */ +{ +#ifdef __WIN__ + int os_pri; + + if (pri == OS_THREAD_PRIORITY_BACKGROUND) { + os_pri = THREAD_PRIORITY_BELOW_NORMAL; + } else if (pri == OS_THREAD_PRIORITY_NORMAL) { + os_pri = THREAD_PRIORITY_NORMAL; + } else if (pri == OS_THREAD_PRIORITY_ABOVE_NORMAL) { + os_pri = THREAD_PRIORITY_HIGHEST; + } else { + ut_error; + } + + ut_a(SetThreadPriority(handle, os_pri)); +#else + UT_NOT_USED(handle); + UT_NOT_USED(pri); +#endif +} + +/********************************************************************** +Gets a thread priority. */ +UNIV_INTERN +ulint +os_thread_get_priority( +/*===================*/ + /* out: priority */ + os_thread_t handle __attribute__((unused))) + /* in: OS handle to the thread */ +{ +#ifdef __WIN__ + int os_pri; + ulint pri; + + os_pri = GetThreadPriority(handle); + + if (os_pri == THREAD_PRIORITY_BELOW_NORMAL) { + pri = OS_THREAD_PRIORITY_BACKGROUND; + } else if (os_pri == THREAD_PRIORITY_NORMAL) { + pri = OS_THREAD_PRIORITY_NORMAL; + } else if (os_pri == THREAD_PRIORITY_HIGHEST) { + pri = OS_THREAD_PRIORITY_ABOVE_NORMAL; + } else { + ut_error; + } + + return(pri); +#else + return(0); +#endif +} + +/********************************************************************** +Gets the last operating system error code for the calling thread. */ +UNIV_INTERN +ulint +os_thread_get_last_error(void) +/*==========================*/ +{ +#ifdef __WIN__ + return(GetLastError()); +#else + return(0); +#endif +} diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.c new file mode 100644 index 00000000000..e810756c1e4 --- /dev/null +++ b/storage/xtradb/page/page0cur.c @@ -0,0 +1,1922 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
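os_thread_sleep() above gets microsecond granularity on Unix by calling select() with three empty descriptor sets, which simply blocks until the timeout expires. The same trick in isolation:

	#include <stddef.h>
	#include <sys/select.h>

	/* Sleep at least tm microseconds; with no file descriptors to
	watch, select() just waits out the timeout. */
	static void usleep_via_select(unsigned long tm)
	{
		struct timeval	t;

		t.tv_sec = tm / 1000000;
		t.tv_usec = tm % 1000000;

		select(0, NULL, NULL, NULL, &t);
	}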
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#include "page0cur.h" +#ifdef UNIV_NONINL +#include "page0cur.ic" +#endif + +#include "page0zip.h" +#include "mtr0log.h" +#include "log0recv.h" +#include "rem0cmp.h" + +static ulint page_rnd = 976722341; + +#ifdef PAGE_CUR_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT +static ulint page_cur_short_succ = 0; +# endif /* UNIV_SEARCH_PERF_STAT */ + +/******************************************************************** +Tries a search shortcut based on the last insert. */ +UNIV_INLINE +ibool +page_cur_try_search_shortcut( +/*=========================*/ + /* out: TRUE on success */ + const buf_block_t* block, /* in: index page */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* tuple, /* in: data tuple */ + ulint* iup_matched_fields, + /* in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /* in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /* out: page cursor */ +{ + const rec_t* rec; + const rec_t* next_rec; + ulint low_match; + ulint low_bytes; + ulint up_match; + ulint up_bytes; +#ifdef UNIV_SEARCH_DEBUG + page_cur_t cursor2; +#endif + ibool success = FALSE; + const page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(dtuple_check_typed(tuple)); + + rec = page_header_get_ptr(page, PAGE_LAST_INSERT); + offsets = rec_get_offsets(rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + ut_ad(rec); + ut_ad(page_rec_is_user_rec(rec)); + + ut_pair_min(&low_match, &low_bytes, + *ilow_matched_fields, *ilow_matched_bytes, + *iup_matched_fields, *iup_matched_bytes); + + up_match = low_match; + up_bytes = low_bytes; + + if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets, + &low_match, &low_bytes) < 0) { + goto exit_func; + } + + next_rec = page_rec_get_next_const(rec); + offsets = rec_get_offsets(next_rec, index, offsets, + dtuple_get_n_fields(tuple), &heap); + + if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets, + &up_match, &up_bytes) >= 0) { + goto exit_func; + } + + page_cur_position(rec, block, cursor); + +#ifdef UNIV_SEARCH_DEBUG + page_cur_search_with_match(block, index, tuple, PAGE_CUR_DBG, + iup_matched_fields, + iup_matched_bytes, + ilow_matched_fields, + ilow_matched_bytes, + &cursor2); + ut_a(cursor2.rec == cursor->rec); + + if (!page_rec_is_supremum(next_rec)) { + + ut_a(*iup_matched_fields == up_match); + ut_a(*iup_matched_bytes == up_bytes); + } + + ut_a(*ilow_matched_fields == low_match); + ut_a(*ilow_matched_bytes == low_bytes); +#endif + if (!page_rec_is_supremum(next_rec)) { + + *iup_matched_fields = up_match; + *iup_matched_bytes = up_bytes; + } + + *ilow_matched_fields = low_match; + *ilow_matched_bytes = low_bytes; + +#ifdef UNIV_SEARCH_PERF_STAT + 
page_cur_short_succ++; +#endif + success = TRUE; +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(success); +} + +#endif + +#ifdef PAGE_CUR_LE_OR_EXTENDS +/******************************************************************** +Checks if the nth field in a record is a character type field which extends +the nth field in tuple, i.e., the field is longer or equal in length and has +common first characters. */ +static +ibool +page_cur_rec_field_extends( +/*=======================*/ + /* out: TRUE if rec field + extends tuple field */ + const dtuple_t* tuple, /* in: data tuple */ + const rec_t* rec, /* in: record */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint n) /* in: compare nth field */ +{ + const dtype_t* type; + const dfield_t* dfield; + const byte* rec_f; + ulint rec_f_len; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + dfield = dtuple_get_nth_field(tuple, n); + + type = dfield_get_type(dfield); + + rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len); + + if (type->mtype == DATA_VARCHAR + || type->mtype == DATA_CHAR + || type->mtype == DATA_FIXBINARY + || type->mtype == DATA_BINARY + || type->mtype == DATA_BLOB + || type->mtype == DATA_VARMYSQL + || type->mtype == DATA_MYSQL) { + + if (dfield_get_len(dfield) != UNIV_SQL_NULL + && rec_f_len != UNIV_SQL_NULL + && rec_f_len >= dfield_get_len(dfield) + && !cmp_data_data_slow(type->mtype, type->prtype, + dfield_get_data(dfield), + dfield_get_len(dfield), + rec_f, dfield_get_len(dfield))) { + + return(TRUE); + } + } + + return(FALSE); +} +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + +/******************************************************************** +Searches the right position for a page cursor. */ +UNIV_INTERN +void +page_cur_search_with_match( +/*=======================*/ + const buf_block_t* block, /* in: buffer block */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* tuple, /* in: data tuple */ + ulint mode, /* in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + ulint* iup_matched_fields, + /* in/out: already matched + fields in upper limit record */ + ulint* iup_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + ulint* ilow_matched_fields, + /* in/out: already matched + fields in lower limit record */ + ulint* ilow_matched_bytes, + /* in/out: already matched + bytes in a field not yet + completely matched */ + page_cur_t* cursor) /* out: page cursor */ +{ + ulint up; + ulint low; + ulint mid; + const page_t* page; + const page_dir_slot_t* slot; + const rec_t* up_rec; + const rec_t* low_rec; + const rec_t* mid_rec; + ulint up_matched_fields; + ulint up_matched_bytes; + ulint low_matched_fields; + ulint low_matched_bytes; + ulint cur_matched_fields; + ulint cur_matched_bytes; + int cmp; +#ifdef UNIV_SEARCH_DEBUG + int dbg_cmp; + ulint dbg_matched_fields; + ulint dbg_matched_bytes; +#endif +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t* page_zip = buf_block_get_page_zip(block); +#endif /* UNIV_ZIP_DEBUG */ + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(block && tuple && iup_matched_fields && iup_matched_bytes + && ilow_matched_fields && ilow_matched_bytes && cursor); + ut_ad(dtuple_validate(tuple)); +#ifdef UNIV_DEBUG +# ifdef PAGE_CUR_DBG + if (mode != PAGE_CUR_DBG) +# endif /* PAGE_CUR_DBG */ +# ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode != PAGE_CUR_LE_OR_EXTENDS) +# endif /* PAGE_CUR_LE_OR_EXTENDS */ + 
ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || mode == PAGE_CUR_G || mode == PAGE_CUR_GE); +#endif /* UNIV_DEBUG */ + page = buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + page_check_dir(page); + +#ifdef PAGE_CUR_ADAPT + if (page_is_leaf(page) + && (mode == PAGE_CUR_LE) + && (page_header_get_field(page, PAGE_N_DIRECTION) > 3) + && (page_header_get_ptr(page, PAGE_LAST_INSERT)) + && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) { + + if (page_cur_try_search_shortcut( + block, index, tuple, + iup_matched_fields, iup_matched_bytes, + ilow_matched_fields, ilow_matched_bytes, + cursor)) { + return; + } + } +# ifdef PAGE_CUR_DBG + if (mode == PAGE_CUR_DBG) { + mode = PAGE_CUR_LE; + } +# endif +#endif + + /* The following flag does not work for non-latin1 char sets because + cmp_full_field does not tell how many bytes matched */ +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_a(mode != PAGE_CUR_LE_OR_EXTENDS); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + + /* If mode PAGE_CUR_G is specified, we are trying to position the + cursor to answer a query of the form "tuple < X", where tuple is + the input parameter, and X denotes an arbitrary physical record on + the page. We want to position the cursor on the first X which + satisfies the condition. */ + + up_matched_fields = *iup_matched_fields; + up_matched_bytes = *iup_matched_bytes; + low_matched_fields = *ilow_matched_fields; + low_matched_bytes = *ilow_matched_bytes; + + /* Perform binary search. First the search is done through the page + directory, after that as a linear search in the list of records + owned by the upper limit directory slot. */ + + low = 0; + up = page_dir_get_n_slots(page) - 1; + + /* Perform binary search until the lower and upper limit directory + slots come to the distance 1 of each other */ + + while (up - low > 1) { + mid = (low + up) / 2; + slot = page_dir_get_nth_slot(page, mid); + mid_rec = page_dir_slot_get_rec(slot); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_slot_match: + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_slot_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_slot_match: + up = mid; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_slot_match; + } else { + + goto up_slot_match; + } + } + + slot = page_dir_get_nth_slot(page, low); + low_rec = page_dir_slot_get_rec(slot); + slot = page_dir_get_nth_slot(page, up); + up_rec = page_dir_slot_get_rec(slot); + + /* Perform linear search until the upper and lower records come to + distance 1 of each other. 
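page_cur_search_with_match() is a two-phase search: binary search over the sparse page directory, then a linear walk within the final slot group. Along the way it reuses the fields and bytes already matched against the lower and upper limit records, because every record between two sorted bounds must share at least their shorter matched prefix with the search tuple. A byte-string reduction of that matched-prefix trick; cmp_with_match() and search_le() are invented helpers, and the sketch assumes recs[] is sorted with recs[0] <= key (the infimum plays that role above):

	#include <stddef.h>

	/* Compare key with rec, resuming at *matched bytes that are
	already known to be equal; on return *matched holds the new
	common-prefix length. */
	static int cmp_with_match(const char* key, const char* rec,
				  size_t* matched)
	{
		size_t	i = *matched;

		while (key[i] != '\0' && key[i] == rec[i]) {
			i++;
		}
		*matched = i;
		return((unsigned char) key[i] - (unsigned char) rec[i]);
	}

	/* PAGE_CUR_LE-style search: greatest index with recs[i] <= key. */
	static int search_le(const char** recs, int n, const char* key)
	{
		int	low = 0;	/* invariant: recs[low] <= key */
		int	up = n;		/* conceptual supremum: key < recs[up] */
		size_t	low_match = 0;
		size_t	up_match = 0;

		while (up - low > 1) {
			int	mid = (low + up) / 2;
			/* Every record between the limits shares at least
			min(low_match, up_match) bytes with key, so the
			comparison need not restart at byte 0. */
			size_t	match = low_match < up_match
				? low_match : up_match;

			if (cmp_with_match(key, recs[mid], &match) >= 0) {
				low = mid;
				low_match = match;
			} else {
				up = mid;
				up_match = match;
			}
		}
		return(low);
	}

On long shared key prefixes this saves most of the byte comparisons, which is exactly why the real code threads the matched_fields/matched_bytes pairs through both search phases.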
*/ + + while (page_rec_get_next_const(low_rec) != up_rec) { + + mid_rec = page_rec_get_next_const(low_rec); + + ut_pair_min(&cur_matched_fields, &cur_matched_bytes, + low_matched_fields, low_matched_bytes, + up_matched_fields, up_matched_bytes); + + offsets = rec_get_offsets(mid_rec, index, offsets, + dtuple_get_n_fields_cmp(tuple), + &heap); + + cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets, + &cur_matched_fields, + &cur_matched_bytes); + if (UNIV_LIKELY(cmp > 0)) { +low_rec_match: + low_rec = mid_rec; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + + } else if (UNIV_EXPECT(cmp, -1)) { +#ifdef PAGE_CUR_LE_OR_EXTENDS + if (mode == PAGE_CUR_LE_OR_EXTENDS + && page_cur_rec_field_extends( + tuple, mid_rec, offsets, + cur_matched_fields)) { + + goto low_rec_match; + } +#endif /* PAGE_CUR_LE_OR_EXTENDS */ +up_rec_match: + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE +#ifdef PAGE_CUR_LE_OR_EXTENDS + || mode == PAGE_CUR_LE_OR_EXTENDS +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + ) { + + goto low_rec_match; + } else { + + goto up_rec_match; + } + } + +#ifdef UNIV_SEARCH_DEBUG + + /* Check that the lower and upper limit records have the + right alphabetical order compared to tuple. */ + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(low_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp >= 0); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp == 1); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp >= 0); + } + + if (!page_rec_is_infimum(low_rec)) { + + ut_a(low_matched_fields == dbg_matched_fields); + ut_a(low_matched_bytes == dbg_matched_bytes); + } + + dbg_matched_fields = 0; + dbg_matched_bytes = 0; + + offsets = rec_get_offsets(up_rec, index, offsets, + ULINT_UNDEFINED, &heap); + dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets, + &dbg_matched_fields, + &dbg_matched_bytes); + if (mode == PAGE_CUR_G) { + ut_a(dbg_cmp == -1); + } else if (mode == PAGE_CUR_GE) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_L) { + ut_a(dbg_cmp <= 0); + } else if (mode == PAGE_CUR_LE) { + ut_a(dbg_cmp == -1); + } + + if (!page_rec_is_supremum(up_rec)) { + + ut_a(up_matched_fields == dbg_matched_fields); + ut_a(up_matched_bytes == dbg_matched_bytes); + } +#endif + if (mode <= PAGE_CUR_GE) { + page_cur_position(up_rec, block, cursor); + } else { + page_cur_position(low_rec, block, cursor); + } + + *iup_matched_fields = up_matched_fields; + *iup_matched_bytes = up_matched_bytes; + *ilow_matched_fields = low_matched_fields; + *ilow_matched_bytes = low_matched_bytes; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*************************************************************** +Positions a page cursor on a randomly chosen user record on a page. If there +are no user records, sets the cursor on the infimum record. 
*/ +UNIV_INTERN +void +page_cur_open_on_rnd_user_rec( +/*==========================*/ + buf_block_t* block, /* in: page */ + page_cur_t* cursor) /* out: page cursor */ +{ + ulint rnd; + ulint n_recs = page_get_n_recs(buf_block_get_frame(block)); + + page_cur_set_before_first(block, cursor); + + if (UNIV_UNLIKELY(n_recs == 0)) { + + return; + } + + page_rnd += 87584577; + + rnd = page_rnd % n_recs; + + do { + page_cur_move_to_next(cursor); + } while (rnd--); +} + +/*************************************************************** +Writes the log record of a record insert on a page. */ +static +void +page_cur_insert_rec_write_log( +/*==========================*/ + rec_t* insert_rec, /* in: inserted physical record */ + ulint rec_size, /* in: insert_rec size */ + rec_t* cursor_rec, /* in: record the + cursor is pointing to */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + ulint cur_rec_size; + ulint extra_size; + ulint cur_extra_size; + const byte* ins_ptr; + byte* log_ptr; + const byte* log_end; + ulint i; + + ut_a(rec_size < UNIV_PAGE_SIZE); + ut_ad(page_align(insert_rec) == page_align(cursor_rec)); + ut_ad(!page_rec_is_comp(insert_rec) + == !dict_table_is_comp(index->table)); + + { + mem_heap_t* heap = NULL; + ulint cur_offs_[REC_OFFS_NORMAL_SIZE]; + ulint ins_offs_[REC_OFFS_NORMAL_SIZE]; + + ulint* cur_offs; + ulint* ins_offs; + + rec_offs_init(cur_offs_); + rec_offs_init(ins_offs_); + + cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_, + ULINT_UNDEFINED, &heap); + ins_offs = rec_get_offsets(insert_rec, index, ins_offs_, + ULINT_UNDEFINED, &heap); + + extra_size = rec_offs_extra_size(ins_offs); + cur_extra_size = rec_offs_extra_size(cur_offs); + ut_ad(rec_size == rec_offs_size(ins_offs)); + cur_rec_size = rec_offs_size(cur_offs); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ins_ptr = insert_rec - extra_size; + + i = 0; + + if (cur_extra_size == extra_size) { + ulint min_rec_size = ut_min(cur_rec_size, rec_size); + + const byte* cur_ptr = cursor_rec - cur_extra_size; + + /* Find out the first byte in insert_rec which differs from + cursor_rec; skip the bytes in the record info */ + + do { + if (*ins_ptr == *cur_ptr) { + i++; + ins_ptr++; + cur_ptr++; + } else if ((i < extra_size) + && (i >= extra_size + - page_rec_get_base_extra_size + (insert_rec))) { + i = extra_size; + ins_ptr = insert_rec; + cur_ptr = cursor_rec; + } else { + break; + } + } while (i < min_rec_size); + } + + if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { + + if (page_rec_is_comp(insert_rec)) { + log_ptr = mlog_open_and_write_index( + mtr, insert_rec, index, MLOG_COMP_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + } else { + log_ptr = mlog_open(mtr, 11 + + 2 + 5 + 1 + 5 + 5 + + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + insert_rec, MLOG_REC_INSERT, log_ptr, mtr); + } + + log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(cursor_rec)); + log_ptr += 2; + } else { + log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (!log_ptr) { + /* Logging in mtr is switched off during crash + recovery: in that 
case mlog_open returns NULL */ + return; + } + log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; + } + + if (page_rec_is_comp(insert_rec)) { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, TRUE) + != rec_get_info_and_status_bits(cursor_rec, TRUE))) { + + goto need_extra_info; + } + } else { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, FALSE) + != rec_get_info_and_status_bits(cursor_rec, FALSE))) { + + goto need_extra_info; + } + } + + if (extra_size != cur_extra_size || rec_size != cur_rec_size) { +need_extra_info: + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, + 2 * (rec_size - i) + 1); + + /* Write the info bits */ + mach_write_to_1(log_ptr, + rec_get_info_and_status_bits( + insert_rec, + page_rec_is_comp(insert_rec))); + log_ptr++; + + /* Write the record origin offset */ + log_ptr += mach_write_compressed(log_ptr, extra_size); + + /* Write the mismatch index */ + log_ptr += mach_write_compressed(log_ptr, i); + + ut_a(i < UNIV_PAGE_SIZE); + ut_a(extra_size < UNIV_PAGE_SIZE); + } else { + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i)); + } + + /* Write to the log the inserted index record end segment which + differs from the cursor record */ + + rec_size -= i; + + if (log_ptr + rec_size <= log_end) { + memcpy(log_ptr, ins_ptr, rec_size); + mlog_close(mtr, log_ptr + rec_size); + } else { + mlog_close(mtr, log_ptr); + ut_a(rec_size < UNIV_PAGE_SIZE); + mlog_catenate_string(mtr, ins_ptr, rec_size); + } +} + +/*************************************************************** +Parses a log record of a record insert on a page. */ +UNIV_INTERN +byte* +page_cur_parse_insert_rec( +/*======================*/ + /* out: end of log record or NULL */ + ibool is_short,/* in: TRUE if short inserts */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ulint origin_offset; + ulint end_seg_len; + ulint mismatch_index; + page_t* page; + rec_t* cursor_rec; + byte buf1[1024]; + byte* buf; + byte* ptr2 = ptr; + ulint info_and_status_bits = 0; /* remove warning */ + page_cur_t cursor; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page = block ? 
buf_block_get_frame(block) : NULL; + + if (is_short) { + cursor_rec = page_rec_get_prev(page_get_supremum_rec(page)); + } else { + ulint offset; + + /* Read the cursor rec offset as a 2-byte ulint */ + + if (UNIV_UNLIKELY(end_ptr < ptr + 2)) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + cursor_rec = page + offset; + + if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)) { + + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + } + + ptr = mach_parse_compressed(ptr, end_ptr, &end_seg_len); + + if (ptr == NULL) { + + return(NULL); + } + + if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) { + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (end_seg_len & 0x1UL) { + /* Read the info bits */ + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_and_status_bits = mach_read_from_1(ptr); + ptr++; + + ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(origin_offset < UNIV_PAGE_SIZE); + + ptr = mach_parse_compressed(ptr, end_ptr, &mismatch_index); + + if (ptr == NULL) { + + return(NULL); + } + + ut_a(mismatch_index < UNIV_PAGE_SIZE); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + (end_seg_len >> 1))) { + + return(NULL); + } + + if (!block) { + + return(ptr + (end_seg_len >> 1)); + } + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + /* Read from the log the inserted index record end segment which + differs from the cursor record */ + + offsets = rec_get_offsets(cursor_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!(end_seg_len & 0x1UL)) { + info_and_status_bits = rec_get_info_and_status_bits( + cursor_rec, page_is_comp(page)); + origin_offset = rec_offs_extra_size(offsets); + mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1); + } + + end_seg_len >>= 1; + + if (mismatch_index + end_seg_len < sizeof buf1) { + buf = buf1; + } else { + buf = mem_alloc(mismatch_index + end_seg_len); + } + + /* Build the inserted record to buf */ + + if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "Is short %lu, info_and_status_bits %lu, offset %lu, " + "o_offset %lu\n" + "mismatch index %lu, end_seg_len %lu\n" + "parsed len %lu\n", + (ulong) is_short, (ulong) info_and_status_bits, + (ulong) page_offset(cursor_rec), + (ulong) origin_offset, + (ulong) mismatch_index, (ulong) end_seg_len, + (ulong) (ptr - ptr2)); + + fputs("Dump of 300 bytes of log:\n", stderr); + ut_print_buf(stderr, ptr2, 300); + putc('\n', stderr); + + buf_page_print(page, 0); + + ut_error; + } + + ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); + ut_memcpy(buf + mismatch_index, ptr, end_seg_len); + + if (page_is_comp(page)) { + rec_set_info_and_status_bits(buf + origin_offset, + info_and_status_bits); + } else { + rec_set_info_bits_old(buf + origin_offset, + info_and_status_bits); + } + + page_cur_position(cursor_rec, block, &cursor); + + offsets = rec_get_offsets(buf + origin_offset, index, offsets, + ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor, + buf + origin_offset, + index, offsets, mtr))) { + /* The redo log record should only have been written + after the write was successful. 
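The insert log record stores only the bytes by which the new record differs from its predecessor; page_cur_parse_insert_rec() rebuilds the full record by splicing the cursor record's prefix (up to mismatch_index) with the logged tail. A toy version of that splice; real records also carry extra-size, origin offsets, and info bits:

	#include <stdlib.h>
	#include <string.h>

	/* Rebuild a record from the predecessor's common prefix plus the
	bytes taken verbatim from the redo log. Illustrative only. */
	static unsigned char* splice_record(const unsigned char* prev_rec,
					    size_t mismatch_index,
					    const unsigned char* logged_tail,
					    size_t tail_len)
	{
		unsigned char*	buf = malloc(mismatch_index + tail_len);

		if (buf == NULL) {
			return(NULL);
		}
		/* Bytes [0, mismatch_index) equal the predecessor. */
		memcpy(buf, prev_rec, mismatch_index);
		/* The remainder was written to the log verbatim. */
		memcpy(buf + mismatch_index, logged_tail, tail_len);
		return(buf);
	}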
*/ + ut_error; + } + + if (buf != buf1) { + + mem_free(buf); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(ptr + end_seg_len); +} + +/*************************************************************** +Inserts a record next to page cursor on an uncompressed page. +Returns pointer to inserted record if succeed, i.e., enough +space available, NULL otherwise. The cursor stays at the same position. */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_low( +/*====================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + rec_t* current_rec,/* in: pointer to current record after + which the new record is inserted */ + dict_index_t* index, /* in: record descriptor */ + const rec_t* rec, /* in: pointer to a physical record */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /* the relevant page */ + rec_t* last_insert; /* cursor position at previous + insert */ + rec_t* free_rec; /* a free record that was reused, + or NULL */ + rec_t* insert_rec; /* inserted record */ + ulint heap_no; /* heap number of the inserted + record */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_align(current_rec); + ut_ad(dict_table_is_comp(index->table) + == (ibool) !!page_is_comp(page)); + + ut_ad(!page_rec_is_supremum(current_rec)); + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + /* 2. Try to find suitable space from page memory management */ + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets(free_rec, index, foffsets, + ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + } else { + heap_no = rec_get_heap_no_old(free_rec); + page_mem_alloc_free(page, NULL, + rec_get_next_ptr(free_rec, FALSE), + rec_size); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, NULL, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + } + + /* 3. Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. 
Insert the record in the linked list of records */ + ut_ad(current_rec != insert_rec); + + { + /* next record after current before the insertion */ + rec_t* next_rec = page_rec_get_next(current_rec); +#ifdef UNIV_DEBUG + if (page_is_comp(page)) { + ut_ad(rec_get_status(current_rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + } +#endif + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(current_rec, insert_rec); + } + + page_header_set_field(page, NULL, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + if (page_is_comp(page)) { + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + } else { + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, heap_no); + } + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + /* 6. Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert || !page_is_comp(page) + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == current_rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, NULL, n_owned + 1); + } else { + n_owned = rec_get_n_owned_old(owner_rec); + rec_set_n_owned_old(owner_rec, n_owned + 1); + } + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, NULL, + page_dir_find_owner_slot(owner_rec)); + } + } + + /* 9. Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + current_rec, index, mtr); + } + + return(insert_rec); +} + +/*************************************************************** +Compresses or reorganizes a page after an optimistic insert. 
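Step 6 above maintains the PAGE_DIRECTION / PAGE_N_DIRECTION heuristic: a run of consecutive ascending or descending inserts is counted, and a long enough PAGE_RIGHT run (more than 3, with a valid PAGE_LAST_INSERT) is what arms the search shortcut earlier in this file. The state machine on its own, with invented plain-C names:

	enum page_direction {
		DIR_NONE,	/* stands in for PAGE_NO_DIRECTION */
		DIR_RIGHT,	/* consecutive ascending inserts */
		DIR_LEFT	/* consecutive descending inserts */
	};

	struct dir_state {
		const void*		last_insert;	/* previous insert */
		enum page_direction	dir;
		unsigned		n_direction;	/* run length */
	};

	/* Mirror of step 6: cur is the record the cursor pointed at,
	ins the new record, next the record now following ins. */
	static void update_direction(struct dir_state* s, const void* cur,
				     const void* ins, const void* next)
	{
		if (s->last_insert == NULL) {
			s->dir = DIR_NONE;
			s->n_direction = 0;
		} else if (s->last_insert == cur && s->dir != DIR_LEFT) {
			s->dir = DIR_RIGHT;	/* appended after last insert */
			s->n_direction++;
		} else if (next == s->last_insert && s->dir != DIR_RIGHT) {
			s->dir = DIR_LEFT;	/* inserted just before it */
			s->n_direction++;
		} else {
			s->dir = DIR_NONE;
			s->n_direction = 0;
		}
		s->last_insert = ins;
	}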
*/ +static +rec_t* +page_cur_insert_rec_zip_reorg( +/*==========================*/ + /* out: rec if succeed, NULL otherwise */ + rec_t** current_rec,/* in/out: pointer to current record after + which the new record is inserted */ + buf_block_t* block, /* in: buffer block */ + dict_index_t* index, /* in: record descriptor */ + rec_t* rec, /* in: inserted record */ + page_t* page, /* in: uncompressed page */ + page_zip_des_t* page_zip,/* in: compressed page */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + ulint pos; + + /* Recompress or reorganize and recompress the page. */ + if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) { + return(rec); + } + + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + pos = page_rec_get_n_recs_before(rec); + + if (page_zip_reorganize(block, index, mtr)) { + /* The page was reorganized: Find rec by seeking to pos, + and update *current_rec. */ + rec = page + PAGE_NEW_INFIMUM; + + while (--pos) { + rec = page + rec_get_next_offs(rec, TRUE); + } + + *current_rec = rec; + rec = page + rec_get_next_offs(rec, TRUE); + + return(rec); + } + + /* Out of space: restore the page */ + if (!page_zip_decompress(page_zip, page)) { + ut_error; /* Memory corrupted? */ + } + ut_ad(page_validate(page, index)); + return(NULL); +} + +/*************************************************************** +Inserts a record next to page cursor on a compressed and uncompressed +page. Returns pointer to inserted record if succeed, i.e., +enough space available, NULL otherwise. +The cursor stays at the same position. */ +UNIV_INTERN +rec_t* +page_cur_insert_rec_zip( +/*====================*/ + /* out: pointer to record if succeed, NULL + otherwise */ + rec_t** current_rec,/* in/out: pointer to current record after + which the new record is inserted */ + buf_block_t* block, /* in: buffer block of *current_rec */ + dict_index_t* index, /* in: record descriptor */ + const rec_t* rec, /* in: pointer to a physical record */ + ulint* offsets,/* in/out: rec_get_offsets(rec, index) */ + mtr_t* mtr) /* in: mini-transaction handle, or NULL */ +{ + byte* insert_buf; + ulint rec_size; + page_t* page; /* the relevant page */ + rec_t* last_insert; /* cursor position at previous + insert */ + rec_t* free_rec; /* a free record that was reused, + or NULL */ + rec_t* insert_rec; /* inserted record */ + ulint heap_no; /* heap number of the inserted + record */ + page_zip_des_t* page_zip; + + page_zip = buf_block_get_page_zip(block); + ut_ad(page_zip); + + ut_ad(rec_offs_validate(rec, index, offsets)); + + page = page_align(*current_rec); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(page_is_comp(page)); + + ut_ad(!page_rec_is_supremum(*current_rec)); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + /* 1. Get the size of the physical record in the page */ + rec_size = rec_offs_size(offsets); + +#ifdef UNIV_DEBUG_VALGRIND + { + const void* rec_start + = rec - rec_offs_extra_size(offsets); + ulint extra_size + = rec_offs_extra_size(offsets) + - (rec_offs_comp(offsets) + ? REC_N_NEW_EXTRA_BYTES + : REC_N_OLD_EXTRA_BYTES); + + /* All data bytes of the record must be valid. */ + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + /* The variable-length header must be valid. */ + UNIV_MEM_ASSERT_RW(rec_start, extra_size); + } +#endif /* UNIV_DEBUG_VALGRIND */ + + /* 2. 
Try to find suitable space from page memory management */ + if (!page_zip_available(page_zip, dict_index_is_clust(index), + rec_size, 1)) { + + /* Try compressing the whole page afterwards. */ + insert_rec = page_cur_insert_rec_low(*current_rec, + index, rec, offsets, + NULL); + + if (UNIV_LIKELY(insert_rec != NULL)) { + insert_rec = page_cur_insert_rec_zip_reorg( + current_rec, block, index, insert_rec, + page, page_zip, mtr); + } + + return(insert_rec); + } + + free_rec = page_header_get_ptr(page, PAGE_FREE); + if (UNIV_LIKELY_NULL(free_rec)) { + /* Try to allocate from the head of the free list. */ + lint extra_size_diff; + ulint foffsets_[REC_OFFS_NORMAL_SIZE]; + ulint* foffsets = foffsets_; + mem_heap_t* heap = NULL; + + rec_offs_init(foffsets_); + + foffsets = rec_get_offsets(free_rec, index, foffsets, + ULINT_UNDEFINED, &heap); + if (rec_offs_size(foffsets) < rec_size) { +too_small: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + goto use_heap; + } + + insert_buf = free_rec - rec_offs_extra_size(foffsets); + + /* On compressed pages, do not relocate records from + the free list. If extra_size would grow, use the heap. */ + extra_size_diff + = rec_offs_extra_size(offsets) + - rec_offs_extra_size(foffsets); + + if (UNIV_UNLIKELY(extra_size_diff < 0)) { + /* Add an offset to the extra_size. */ + if (rec_offs_size(foffsets) + < rec_size - extra_size_diff) { + + goto too_small; + } + + insert_buf -= extra_size_diff; + } else if (UNIV_UNLIKELY(extra_size_diff)) { + /* Do not allow extra_size to grow */ + + goto too_small; + } + + heap_no = rec_get_heap_no_new(free_rec); + page_mem_alloc_free(page, page_zip, + rec_get_next_ptr(free_rec, TRUE), + rec_size); + + if (!page_is_leaf(page)) { + /* Zero out the node pointer of free_rec, + in case it will not be overwritten by + insert_rec. */ + + ut_ad(rec_size > REC_NODE_PTR_SIZE); + + if (rec_offs_extra_size(foffsets) + + rec_offs_data_size(foffsets) > rec_size) { + + memset(rec_get_end(free_rec, foffsets) + - REC_NODE_PTR_SIZE, 0, + REC_NODE_PTR_SIZE); + } + } else if (dict_index_is_clust(index)) { + /* Zero out the DB_TRX_ID and DB_ROLL_PTR + columns of free_rec, in case it will not be + overwritten by insert_rec. */ + + ulint trx_id_col; + ulint trx_id_offs; + ulint len; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + trx_id_offs = rec_get_nth_field_offs(foffsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + + if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs + + rec_offs_extra_size(foffsets) > rec_size) { + /* We will have to zero out the + DB_TRX_ID and DB_ROLL_PTR, because + they will not be fully overwritten by + insert_rec. */ + + memset(free_rec + trx_id_offs, 0, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + + ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN + == rec_get_nth_field(free_rec, foffsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { +use_heap: + free_rec = NULL; + insert_buf = page_mem_alloc_heap(page, page_zip, + rec_size, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { + return(NULL); + } + + page_zip_dir_add_slot(page_zip, dict_index_is_clust(index)); + } + + /* 3. Create the record */ + insert_rec = rec_copy(insert_buf, rec, offsets); + rec_offs_make_valid(insert_rec, index, offsets); + + /* 4. 
Insert the record in the linked list of records */ + ut_ad(*current_rec != insert_rec); + + { + /* next record after current before the insertion */ + rec_t* next_rec = page_rec_get_next(*current_rec); + ut_ad(rec_get_status(*current_rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + + page_rec_set_next(insert_rec, next_rec); + page_rec_set_next(*current_rec, insert_rec); + } + + page_header_set_field(page, page_zip, PAGE_N_RECS, + 1 + page_get_n_recs(page)); + + /* 5. Set the n_owned field in the inserted record to zero, + and set the heap_no field */ + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, heap_no); + + UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets), + rec_offs_size(offsets)); + + page_zip_dir_insert(page_zip, *current_rec, free_rec, insert_rec); + + /* 6. Update the last insertion info in page header */ + + last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); + ut_ad(!last_insert + || rec_get_node_ptr_flag(last_insert) + == rec_get_node_ptr_flag(insert_rec)); + + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + + } else if ((last_insert == *current_rec) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_LEFT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + + } else if ((page_rec_get_next(insert_rec) == last_insert) + && (page_header_get_field(page, PAGE_DIRECTION) + != PAGE_RIGHT)) { + + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, + page_header_get_field( + page, PAGE_N_DIRECTION) + 1); + } else { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + } + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec); + + /* 7. It remains to update the owner record. */ + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1); + + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ + + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot( + page, page_zip, + page_dir_find_owner_slot(owner_rec)); + } + } + + page_zip_write_rec(page_zip, insert_rec, index, offsets, 1); + + /* 9. Write log record of the insert */ + if (UNIV_LIKELY(mtr != NULL)) { + page_cur_insert_rec_write_log(insert_rec, rec_size, + *current_rec, index, mtr); + } + + return(insert_rec); +} + +/************************************************************** +Writes a log record of copying a record list end to a new created page. 
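When page_zip_available() reports insufficient space, page_cur_insert_rec_zip() still performs the insert on the uncompressed frame and only then calls page_cur_insert_rec_zip_reorg(), which escalates: recompress as-is, else reorganize and recompress (re-finding the record by its position count), else decompress the old compressed image to undo the insert. The control flow reduced to a compilable sketch; the try_* stubs stand in for the page_zip_* calls and are not real API:

	#include <stdbool.h>

	typedef struct page page;	/* opaque for this sketch */

	static bool try_compress(page* p)     { (void) p; return(false); }
	static bool try_reorganize(page* p)   { (void) p; return(false); }
	static void restore_from_zip(page* p) { (void) p; }

	/* Returns true if the optimistic insert could be kept. */
	static bool insert_then_recompress(page* p)
	{
		if (try_compress(p)) {
			return(true);	/* page still fits compressed */
		}
		if (try_reorganize(p)) {
			return(true);	/* defragmentation made it fit */
		}
		restore_from_zip(p);	/* out of space: undo the insert */
		return(false);
	}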
*/ +UNIV_INLINE +byte* +page_copy_rec_list_to_created_page_write_log( +/*=========================================*/ + /* out: 4-byte field where to + write the log data length, + or NULL if logging is disabled */ + page_t* page, /* in: index page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, page, index, + page_is_comp(page) + ? MLOG_COMP_LIST_END_COPY_CREATED + : MLOG_LIST_END_COPY_CREATED, 4); + if (UNIV_LIKELY(log_ptr != NULL)) { + mlog_close(mtr, log_ptr + 4); + } + + return(log_ptr); +} + +/************************************************************** +Parses a log record of copying a record list end to a new created page. */ +UNIV_INTERN +byte* +page_parse_copy_rec_list_to_created_page( +/*=====================================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + byte* rec_end; + ulint log_data_len; + page_t* page; + page_zip_des_t* page_zip; + + if (ptr + 4 > end_ptr) { + + return(NULL); + } + + log_data_len = mach_read_from_4(ptr); + ptr += 4; + + rec_end = ptr + log_data_len; + + if (rec_end > end_ptr) { + + return(NULL); + } + + if (!block) { + + return(rec_end); + } + + while (ptr < rec_end) { + ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, + block, index, mtr); + } + + ut_a(ptr == rec_end); + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); + + return(rec_end); +} + +/***************************************************************** +Copies records from page to a newly created page, from a given record onward, +including that record. Infimum and supremum records are not copied. 
*/ +UNIV_INTERN +void +page_copy_rec_list_end_to_created_page( +/*===================================*/ + page_t* new_page, /* in/out: index page to copy to */ + rec_t* rec, /* in: first record to copy */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_dir_slot_t* slot = 0; /* remove warning */ + byte* heap_top; + rec_t* insert_rec = 0; /* remove warning */ + rec_t* prev_rec; + ulint count; + ulint n_recs; + ulint slot_index; + ulint rec_size; + ulint log_mode; + byte* log_ptr; + ulint log_data_len; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + ut_ad(page_align(rec) != new_page); + ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page)); + + if (page_rec_is_infimum(rec)) { + + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return; + } + +#ifdef UNIV_DEBUG + /* To pass the debug tests we have to set these dummy values + in the debug version */ + page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, + new_page + UNIV_PAGE_SIZE - 1); +#endif + + log_ptr = page_copy_rec_list_to_created_page_write_log(new_page, + index, mtr); + + log_data_len = dyn_array_get_data_size(&(mtr->log)); + + /* Individual inserts are logged in a shorter form */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); + + prev_rec = page_get_infimum_rec(new_page); + if (page_is_comp(new_page)) { + heap_top = new_page + PAGE_NEW_SUPREMUM_END; + } else { + heap_top = new_page + PAGE_OLD_SUPREMUM_END; + } + count = 0; + slot_index = 0; + n_recs = 0; + + do { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + insert_rec = rec_copy(heap_top, rec, offsets); + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } else { + rec_set_next_offs_old(prev_rec, + page_offset(insert_rec)); + + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, + PAGE_HEAP_NO_USER_LOW + n_recs); + } + + count++; + n_recs++; + + if (UNIV_UNLIKELY + (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) { + + slot_index++; + + slot = page_dir_get_nth_slot(new_page, slot_index); + + page_dir_slot_set_rec(slot, insert_rec); + page_dir_slot_set_n_owned(slot, NULL, count); + + count = 0; + } + + rec_size = rec_offs_size(offsets); + + ut_ad(heap_top < new_page + UNIV_PAGE_SIZE); + + heap_top += rec_size; + + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, + index, mtr); + prev_rec = insert_rec; + rec = page_rec_get_next(rec); + } while (!page_rec_is_supremum(rec)); + + if ((slot_index > 0) && (count + 1 + + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 + <= PAGE_DIR_SLOT_MAX_N_OWNED)) { + /* We can merge the two last dir slots. This operation is + here to make this function imitate exactly the equivalent + task made using page_cur_insert_rec, which we use in database + recovery to reproduce the task performed by this function. + To be able to check the correctness of recovery, it is good + that it imitates exactly. 
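The bulk copy above builds the directory as it streams records in: a new slot every (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 records, and a small trailing group is folded into the previous slot so that the result matches what repeated page_cur_insert_rec calls would produce during recovery. The slot arithmetic replayed on toy numbers; MAX_OWNED stands in for PAGE_DIR_SLOT_MAX_N_OWNED:

	#include <stdio.h>

	enum { MAX_OWNED = 8 };

	int main(void)
	{
		int	n_recs = 23;
		int	count = 0;
		int	slot_index = 0;
		int	i;

		for (i = 0; i < n_recs; i++) {
			if (++count == (MAX_OWNED + 1) / 2) {
				slot_index++;
				printf("slot %d ends at record %d\n",
				       slot_index, i);
				count = 0;
			}
		}

		/* Same merge test as in the function above. */
		if (slot_index > 0
		    && count + 1 + (MAX_OWNED + 1) / 2 <= MAX_OWNED) {
			count += (MAX_OWNED + 1) / 2;
			slot_index--;
		}

		/* The supremum slot owns the remaining count + 1 records. */
		printf("%d full slots, last slot owns %d\n",
		       slot_index, count + 1);
		return(0);
	}

With these numbers a slot is emitted every 4 records, and the 3 leftover records merge with the last full slot (3 + 1 + 4 <= 8), leaving the supremum slot owning 8 records.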
*/ + + count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; + + page_dir_slot_set_n_owned(slot, NULL, 0); + + slot_index--; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len; + + ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); + + if (UNIV_LIKELY(log_ptr != NULL)) { + mach_write_to_4(log_ptr, log_data_len); + } + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM); + } + + slot = page_dir_get_nth_slot(new_page, 1 + slot_index); + + page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); + page_dir_slot_set_n_owned(slot, NULL, count + 1); + + page_dir_set_n_slots(new_page, NULL, 2 + slot_index); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top); + page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs); + page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs); + + page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(new_page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0); + + /* Restore the log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +/*************************************************************** +Writes log record of a record delete on a page. */ +UNIV_INLINE +void +page_cur_delete_rec_write_log( +/*==========================*/ + rec_t* rec, /* in: record to be deleted */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + byte* log_ptr; + + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, + page_rec_is_comp(rec) + ? MLOG_COMP_REC_DELETE + : MLOG_REC_DELETE, 2); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + /* Write the cursor rec offset as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + + mlog_close(mtr, log_ptr + 2); +} + +/*************************************************************** +Parses log record of a record delete on a page. */ +UNIV_INTERN +byte* +page_cur_parse_delete_rec( +/*======================*/ + /* out: pointer to record end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in: page or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ulint offset; + page_cur_t cursor; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + /* Read the cursor rec offset as a 2-byte ulint */ + offset = mach_read_from_2(ptr); + ptr += 2; + + ut_a(offset <= UNIV_PAGE_SIZE); + + if (block) { + page_t* page = buf_block_get_frame(block); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec = page + offset; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cursor); + ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page)); + + page_cur_delete_rec(&cursor, index, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + return(ptr); +} + +/*************************************************************** +Deletes a record at the page cursor. The cursor is moved to the next +record after the deleted one. 
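+
+A minimal calling sketch (variable names hypothetical; compare
+page_cur_parse_delete_rec() above, which performs the same steps when
+applying the log during recovery):
+
+	page_cur_position(rec, block, &cursor);
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+	page_cur_delete_rec(&cursor, index, offsets, mtr);
+	/* the cursor now rests on the successor of the deleted record */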
*/ +UNIV_INTERN +void +page_cur_delete_rec( +/*================*/ + page_cur_t* cursor, /* in/out: a page cursor */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + page_dir_slot_t* cur_dir_slot; + page_dir_slot_t* prev_slot; + page_t* page; + page_zip_des_t* page_zip; + rec_t* current_rec; + rec_t* prev_rec = NULL; + rec_t* next_rec; + ulint cur_slot_no; + ulint cur_n_owned; + rec_t* rec; + + ut_ad(cursor && mtr); + + page = page_cur_get_page(cursor); + page_zip = page_cur_get_page_zip(cursor); + + /* page_zip_validate() will fail here when + btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark(). + Then, both "page_zip" and "page" would have the min-rec-mark + set on the smallest user record, but "page" would additionally + have it set on the smallest-but-one record. Because sloppy + page_zip_validate_low() only ignores min-rec-flag differences + in the smallest user record, it cannot be used here either. */ + + current_rec = cursor->rec; + ut_ad(rec_offs_validate(current_rec, index, offsets)); + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + /* The record must not be the supremum or infimum record. */ + ut_ad(page_rec_is_user_rec(current_rec)); + + /* Save to local variables some data associated with current_rec */ + cur_slot_no = page_dir_find_owner_slot(current_rec); + cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no); + cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot); + + /* 0. Write the log record */ + page_cur_delete_rec_write_log(current_rec, index, mtr); + + /* 1. Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock */ + + buf_block_modify_clock_inc(page_cur_get_block(cursor)); + + /* 2. Find the next and the previous record. Note that the cursor is + left at the next record. */ + + ut_ad(cur_slot_no > 0); + prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1); + + rec = (rec_t*) page_dir_slot_get_rec(prev_slot); + + /* rec now points to the record of the previous directory slot. Look + for the immediate predecessor of current_rec in a loop. */ + + while(current_rec != rec) { + prev_rec = rec; + rec = page_rec_get_next(rec); + } + + page_cur_move_to_next(cursor); + next_rec = cursor->rec; + + /* 3. Remove the record from the linked list of records */ + + page_rec_set_next(prev_rec, next_rec); + + /* 4. If the deleted record is pointed to by a dir slot, update the + record pointer in slot. In the following if-clause we assume that + prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED + >= 2. */ + +#if PAGE_DIR_SLOT_MIN_N_OWNED < 2 +# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2" +#endif + ut_ad(cur_n_owned > 1); + + if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) { + page_dir_slot_set_rec(cur_dir_slot, prev_rec); + } + + /* 5. Update the number of owned records of the slot */ + + page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); + + /* 6. Free the memory occupied by the record */ + page_mem_free(page, page_zip, current_rec, index, offsets); + + /* 7. Now we have decremented the number of owned records of the slot. + If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the + slots. 
*/
+
+	if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) {
+		page_dir_balance_slot(page, page_zip, cur_slot_no);
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+}
diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.c
new file mode 100644
index 00000000000..0ce532068ce
--- /dev/null
+++ b/storage/xtradb/page/page0page.c
@@ -0,0 +1,2568 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#define THIS_MODULE
+#include "page0page.h"
+#ifdef UNIV_NONINL
+#include "page0page.ic"
+#endif
+#undef THIS_MODULE
+
+#include "page0cur.h"
+#include "page0zip.h"
+#include "lock0lock.h"
+#include "fut0lst.h"
+#include "btr0sea.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "btr0btr.h"
+
+/*			THE INDEX PAGE
+			==============
+
+The index page consists of a page header which contains the page's
+id and other information. On top of it are the index records
+in a heap linked into a one-way linear list according to alphabetic order.
+
+Just below page end is an array of pointers which we call page directory,
+to about every sixth record in the list. The pointers are placed in
+the directory in the alphabetical order of the records pointed to,
+enabling us to make binary search using the array. Each slot number I
+in the directory points to a record, where a 4-bit field contains a count
+of those records which are in the linear list between pointer I and
+the pointer I - 1 in the directory, including the record
+pointed to by pointer I and not including the record pointed to by I - 1.
+We say that the record pointed to by slot I, or that slot I, owns
+these records. The count is always kept in the range 4 to 8, with
+the exception that it is 1 for the first slot, and 1--8 for the last slot.
+
+An essentially binary search can be performed in the list of index
+records, like we could do if we had a pointer to every record in the
+page directory. The data structure is, however, more efficient when
+we are doing inserts, because most inserts are just pushed on a heap.
+Only every 8th insert requires block move in the directory pointer
+table, which itself is quite small. A record is deleted from the page
+by just taking it off the linear list, updating the number-of-owned-records
+field of the record which owns it, and updating the page directory,
+if necessary. A special case is the one when the record owns itself.
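+
+As an illustrative example (record names hypothetical): in the list
+infimum -> r1 -> r2 -> ... -> r11 -> supremum, slot 0 points to the
+infimum and owns only it, slot 1 might point to r6 and own r1..r6,
+and the last slot points to the supremum and owns r7..r11 together
+with the supremum itself.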
+Because the overhead of inserts is so small, we may also increase the
+page size from the projected default of 8 kB to 64 kB without too
+much loss of efficiency in inserts. A bigger page becomes practical
+when the disk transfer rate rises relative to the seek and latency times.
+On the present system, the page size is set so that the page transfer
+time (3 ms) is 20 % of the disk random access time (15 ms).
+
+When the page is split, merged, or becomes full but contains deleted
+records, we have to reorganize the page.
+
+Assuming a page size of 8 kB, a typical index page of a secondary
+index contains 300 index entries, and the size of the page directory
+is 50 x 2 bytes = 100 bytes. */
+
+/*******************************************************************
+Looks for the directory slot which owns the given record. */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+				/* out: the directory slot number */
+	const rec_t*	rec)	/* in: the physical record */
+{
+	const page_t*			page;
+	register uint16			rec_offs_bytes;
+	register const page_dir_slot_t*	slot;
+	register const page_dir_slot_t*	first_slot;
+	register const rec_t*		r = rec;
+
+	ut_ad(page_rec_check(rec));
+
+	page = page_align(rec);
+	first_slot = page_dir_get_nth_slot(page, 0);
+	slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1);
+
+	if (page_is_comp(page)) {
+		while (rec_get_n_owned_new(r) == 0) {
+			r = rec_get_next_ptr_const(r, TRUE);
+			ut_ad(r >= page + PAGE_NEW_SUPREMUM);
+			ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+		}
+	} else {
+		while (rec_get_n_owned_old(r) == 0) {
+			r = rec_get_next_ptr_const(r, FALSE);
+			ut_ad(r >= page + PAGE_OLD_SUPREMUM);
+			ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+		}
+	}
+
+	rec_offs_bytes = mach_encode_2(r - page);
+
+	while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {
+
+		if (UNIV_UNLIKELY(slot == first_slot)) {
+			fprintf(stderr,
+				"InnoDB: Probable data corruption on"
+				" page %lu\n"
+				"InnoDB: Original record ",
+				(ulong) page_get_page_no(page));
+
+			if (page_is_comp(page)) {
+				fputs("(compact record)", stderr);
+			} else {
+				rec_print_old(stderr, rec);
+			}
+
+			fputs("\n"
+			      "InnoDB: on that page.\n"
+			      "InnoDB: Cannot find the dir slot for record ",
+			      stderr);
+			if (page_is_comp(page)) {
+				fputs("(compact record)", stderr);
+			} else {
+				rec_print_old(stderr, page
+					      + mach_decode_2(rec_offs_bytes));
+			}
+			fputs("\n"
+			      "InnoDB: on that page!\n", stderr);
+
+			buf_page_print(page, 0);
+
+			ut_error;
+		}
+
+		slot += PAGE_DIR_SLOT_SIZE;
+	}
+
+	return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE);
+}
+
+/******************************************************************
+Used to check the consistency of a directory slot.
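+The invariants verified here follow the ownership rules described in
+the THE INDEX PAGE comment above: the first slot must own exactly one
+record (the infimum), the last slot between 1 and
+PAGE_DIR_SLOT_MAX_N_OWNED records (ending with the supremum), and any
+middle slot between PAGE_DIR_SLOT_MIN_N_OWNED and
+PAGE_DIR_SLOT_MAX_N_OWNED records.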
*/ +static +ibool +page_dir_slot_check( +/*================*/ + /* out: TRUE if succeed */ + page_dir_slot_t* slot) /* in: slot */ +{ + page_t* page; + ulint n_slots; + ulint n_owned; + + ut_a(slot); + + page = page_align(slot); + + n_slots = page_dir_get_n_slots(page); + + ut_a(slot <= page_dir_get_nth_slot(page, 0)); + ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1)); + + ut_a(page_rec_check(page_dir_slot_get_rec(slot))); + + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot)); + } else { + n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot)); + } + + if (slot == page_dir_get_nth_slot(page, 0)) { + ut_a(n_owned == 1); + } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) { + ut_a(n_owned >= 1); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } else { + ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED); + ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED); + } + + return(TRUE); +} + +/***************************************************************** +Sets the max trx id field value. */ +UNIV_INTERN +void +page_set_max_trx_id( +/*================*/ + buf_block_t* block, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dulint trx_id) /* in: transaction id */ +{ + const ibool is_hashed = block->is_hashed; + page_t* page = buf_block_get_frame(block); + + if (is_hashed) { + rw_lock_x_lock(&btr_search_latch); + } + + /* It is not necessary to write this change to the redo log, as + during a database recovery we assume that the max trx id of every + page is the maximum trx id assigned before the crash. */ + + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_header(page_zip, + page + (PAGE_HEADER + PAGE_MAX_TRX_ID), + 8, NULL); + } + + if (is_hashed) { + rw_lock_x_unlock(&btr_search_latch); + } +} + +/**************************************************************** +Allocates a block of memory from the heap of an index page. */ +UNIV_INTERN +byte* +page_mem_alloc_heap( +/*================*/ + /* out: pointer to start of allocated + buffer, or NULL if allocation fails */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with enough + space available for inserting the record, + or NULL */ + ulint need, /* in: total number of bytes needed */ + ulint* heap_no)/* out: this contains the heap number + of the allocated record + if allocation succeeds */ +{ + byte* block; + ulint avl_space; + + ut_ad(page && heap_no); + + avl_space = page_get_max_insert_size(page, 1); + + if (avl_space >= need) { + block = page_header_get_ptr(page, PAGE_HEAP_TOP); + + page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP, + block + need); + *heap_no = page_dir_get_n_heap(page); + + page_dir_set_n_heap(page, page_zip, 1 + *heap_no); + + return(block); + } + + return(NULL); +} + +/************************************************************** +Writes a log record of page creation. */ +UNIV_INLINE +void +page_create_write_log( +/*==================*/ + buf_frame_t* frame, /* in: a buffer frame where the page is + created */ + mtr_t* mtr, /* in: mini-transaction handle */ + ibool comp) /* in: TRUE=compact page format */ +{ + mlog_write_initial_log_record(frame, comp + ? MLOG_COMP_PAGE_CREATE + : MLOG_PAGE_CREATE, mtr); +} + +/*************************************************************** +Parses a redo log record of creating a page. 
*/ +UNIV_INTERN +byte* +page_parse_create( +/*==============*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + ulint comp, /* in: nonzero=compact page format */ + buf_block_t* block, /* in: block or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + /* The record is empty, except for the record initial part */ + + if (block) { + page_create(block, mtr, comp); + } + + return(ptr); +} + +/************************************************************** +The index page creation function. */ +static +page_t* +page_create_low( +/*============*/ + /* out: pointer to the page */ + buf_block_t* block, /* in: a buffer block where the + page is created */ + ulint comp) /* in: nonzero=compact page format */ +{ + page_dir_slot_t* slot; + mem_heap_t* heap; + dtuple_t* tuple; + dfield_t* field; + byte* heap_top; + rec_t* infimum_rec; + rec_t* supremum_rec; + page_t* page; + dict_index_t* index; + ulint* offsets; + + ut_ad(block); +#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA" +#endif +#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA +# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA" +#endif + + /* The infimum and supremum records use a dummy index. */ + if (UNIV_LIKELY(comp)) { + index = srv_sys->dummy_ind2; + } else { + index = srv_sys->dummy_ind1; + } + + /* 1. INCREMENT MODIFY CLOCK */ + buf_block_modify_clock_inc(block); + + page = buf_block_get_frame(block); + + fil_page_set_type(page, FIL_PAGE_INDEX); + + heap = mem_heap_create(200); + + /* 3. CREATE THE INFIMUM AND SUPREMUM RECORDS */ + + /* Create first a data tuple for infimum record */ + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "infimum", 8); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8); + /* Set the corresponding physical record to its place in the page + record heap */ + + heap_top = page + PAGE_DATA; + + infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(infimum_rec == page + PAGE_NEW_INFIMUM); + + rec_set_n_owned_new(infimum_rec, NULL, 1); + rec_set_heap_no_new(infimum_rec, 0); + } else { + ut_a(infimum_rec == page + PAGE_OLD_INFIMUM); + + rec_set_n_owned_old(infimum_rec, 1); + rec_set_heap_no_old(infimum_rec, 0); + } + + offsets = rec_get_offsets(infimum_rec, index, NULL, + ULINT_UNDEFINED, &heap); + + heap_top = rec_get_end(infimum_rec, offsets); + + /* Create then a tuple for supremum */ + + tuple = dtuple_create(heap, 1); + dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM); + field = dtuple_get_nth_field(tuple, 0); + + dfield_set_data(field, "supremum", comp ? 8 : 9); + dtype_set(dfield_get_type(field), + DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9); + + supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0); + + if (UNIV_LIKELY(comp)) { + ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM); + + rec_set_n_owned_new(supremum_rec, NULL, 1); + rec_set_heap_no_new(supremum_rec, 1); + } else { + ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM); + + rec_set_n_owned_old(supremum_rec, 1); + rec_set_heap_no_old(supremum_rec, 1); + } + + offsets = rec_get_offsets(supremum_rec, index, offsets, + ULINT_UNDEFINED, &heap); + heap_top = rec_get_end(supremum_rec, offsets); + + ut_ad(heap_top == page + + (comp ? 
PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END)); + + mem_heap_free(heap); + + /* 4. INITIALIZE THE PAGE */ + + page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2); + page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top); + page_header_set_field(page, NULL, PAGE_N_HEAP, comp + ? 0x8000 | PAGE_HEAP_NO_USER_LOW + : PAGE_HEAP_NO_USER_LOW); + page_header_set_ptr(page, NULL, PAGE_FREE, NULL); + page_header_set_field(page, NULL, PAGE_GARBAGE, 0); + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + page_header_set_field(page, NULL, PAGE_N_RECS, 0); + page_set_max_trx_id(block, NULL, ut_dulint_zero); + memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START + - page_offset(heap_top)); + + /* 5. SET POINTERS IN RECORDS AND DIR SLOTS */ + + /* Set the slots to point to infimum and supremum. */ + + slot = page_dir_get_nth_slot(page, 0); + page_dir_slot_set_rec(slot, infimum_rec); + + slot = page_dir_get_nth_slot(page, 1); + page_dir_slot_set_rec(slot, supremum_rec); + + /* Set the next pointers in infimum and supremum */ + + if (UNIV_LIKELY(comp)) { + rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM); + rec_set_next_offs_new(supremum_rec, 0); + } else { + rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM); + rec_set_next_offs_old(supremum_rec, 0); + } + + return(page); +} + +/************************************************************** +Create an uncompressed B-tree index page. */ +UNIV_INTERN +page_t* +page_create( +/*========*/ + /* out: pointer to the page */ + buf_block_t* block, /* in: a buffer block where the + page is created */ + mtr_t* mtr, /* in: mini-transaction handle */ + ulint comp) /* in: nonzero=compact page format */ +{ + page_create_write_log(buf_block_get_frame(block), mtr, comp); + return(page_create_low(block, comp)); +} + +/************************************************************** +Create a compressed B-tree index page. */ +UNIV_INTERN +page_t* +page_create_zip( +/*============*/ + /* out: pointer to the page */ + buf_block_t* block, /* in/out: a buffer frame where the + page is created */ + dict_index_t* index, /* in: the index of the page */ + ulint level, /* in: the B-tree level of the page */ + mtr_t* mtr) /* in: mini-transaction handle */ +{ + page_t* page; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + ut_ad(block); + ut_ad(page_zip); + ut_ad(index); + ut_ad(dict_table_is_comp(index->table)); + + page = page_create_low(block, TRUE); + mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level); + + if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) { + /* The compression of a newly created page + should always succeed. */ + ut_error; + } + + return(page); +} + +/***************************************************************** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. 
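+It is therefore only safe when the caller maintains those structures
+itself, as page_copy_rec_list_end() further below does after calling
+this function.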
*/ +UNIV_INTERN +void +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /* in: index page to copy to */ + buf_block_t* block, /* in: index page of rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_cur_t cur1; + rec_t* cur2; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1)) { + + page_cur_move_to_next(&cur1); + } + + ut_a((ibool)!!page_is_comp(new_page) + == dict_table_is_comp(index->table)); + ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); + ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) + (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + cur2 = page_get_infimum_rec(buf_block_get_frame(new_block)); + + /* Copy records from the original page to the new page */ + + while (!page_cur_is_after_last(&cur1)) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ins_rec = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + if (UNIV_UNLIKELY(!ins_rec)) { + /* Track an assertion failure reported on the mailing + list on June 18th, 2003 */ + + buf_page_print(new_page, 0); + buf_page_print(page_align(rec), 0); + ut_print_timestamp(stderr); + + fprintf(stderr, + "InnoDB: rec offset %lu, cur1 offset %lu," + " cur2 offset %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(page_cur_get_rec(&cur1)), + (ulong) page_offset(cur2)); + ut_error; + } + + page_cur_move_to_next(&cur1); + cur2 = ins_rec; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************** +Copies records from page to new_page, from a given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. */ +UNIV_INTERN +rec_t* +page_copy_rec_list_end( +/*===================*/ + /* out: pointer to the original + successor of the infimum record + on new_page, or NULL on zip overflow + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page to copy to */ + buf_block_t* block, /* in: index page containing rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_t* page = page_align(rec); + rec_t* ret = page_rec_get_next( + page_get_infimum_rec(new_page)); + ulint log_mode = 0; /* remove warning */ + +#ifdef UNIV_ZIP_DEBUG + if (new_page_zip) { + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + ut_a(page_zip); + + /* Strict page_zip_validate() may fail here. + Furthermore, btr_compress() may set FIL_PAGE_PREV to + FIL_NULL on new_page while leaving it intact on + new_page_zip. So, we cannot validate new_page_zip. */ + ut_a(page_zip_validate_low(page_zip, page, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(buf_block_get_frame(block) == page); + ut_ad(page_is_leaf(page) == page_is_leaf(new_page)); + ut_ad(page_is_comp(page) == page_is_comp(new_page)); + /* Here, "ret" may be pointing to a user record or the + predefined supremum record. 
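+	(It points to a user record when new_page already contains user
+	records, and to the supremum when new_page is still empty.)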
*/ + + if (UNIV_LIKELY_NULL(new_page_zip)) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) { + page_copy_rec_list_end_to_created_page(new_page, rec, + index, mtr); + } else { + page_copy_rec_list_end_no_locks(new_block, block, rec, + index, mtr); + } + + if (UNIV_LIKELY_NULL(new_page_zip)) { + mtr_set_log_mode(mtr, log_mode); + + if (UNIV_UNLIKELY + (!page_zip_compress(new_page_zip, new_page, index, mtr))) { + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ulint ret_pos + = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the successor of + the predefined infimum record. It must still + have at least one predecessor (the predefined + infimum record, or a freshly copied record + that is smaller than "ret"). */ + ut_a(ret_pos > 0); + + if (UNIV_UNLIKELY + (!page_zip_reorganize(new_block, index, mtr))) { + + if (UNIV_UNLIKELY + (!page_zip_decompress(new_page_zip, + new_page))) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + return(NULL); + } else { + /* The page was reorganized: + Seek to ret_pos. */ + ret = new_page + PAGE_NEW_INFIMUM; + + do { + ret = rec_get_next_ptr(ret, TRUE); + } while (--ret_pos); + } + } + } + + /* Update the lock table, MAX_TRX_ID, and possible hash index */ + + lock_move_rec_list_end(new_block, block, rec); + + page_update_max_trx_id(new_block, new_page_zip, + page_get_max_trx_id(page)); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/***************************************************************** +Copies records from page to new_page, up to the given record, +NOT including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. */ +UNIV_INTERN +rec_t* +page_copy_rec_list_start( +/*=====================*/ + /* out: pointer to the original + predecessor of the supremum record + on new_page, or NULL on zip overflow + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page to copy to */ + buf_block_t* block, /* in: index page containing rec */ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block); + page_cur_t cur1; + rec_t* cur2; + ulint log_mode = 0 /* remove warning */; + mem_heap_t* heap = NULL; + rec_t* ret + = page_rec_get_prev(page_get_supremum_rec(new_page)); + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + /* Here, "ret" may be pointing to a user record or the + predefined infimum record. 
*/ + + if (page_rec_is_infimum(rec)) { + + return(ret); + } + + if (UNIV_LIKELY_NULL(new_page_zip)) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + cur2 = ret; + + /* Copy records from the original page to the new page */ + + while (page_cur_get_rec(&cur1) != rec) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + offsets = rec_get_offsets(cur1_rec, index, offsets, + ULINT_UNDEFINED, &heap); + cur2 = page_cur_insert_rec_low(cur2, index, + cur1_rec, offsets, mtr); + ut_a(cur2); + + page_cur_move_to_next(&cur1); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (UNIV_LIKELY_NULL(new_page_zip)) { + mtr_set_log_mode(mtr, log_mode); + + if (UNIV_UNLIKELY + (!page_zip_compress(new_page_zip, new_page, index, mtr))) { + /* Before trying to reorganize the page, + store the number of preceding records on the page. */ + ulint ret_pos + = page_rec_get_n_recs_before(ret); + /* Before copying, "ret" was the predecessor + of the predefined supremum record. If it was + the predefined infimum record, then it would + still be the infimum. Thus, the assertion + ut_a(ret_pos > 0) would fail here. */ + + if (UNIV_UNLIKELY + (!page_zip_reorganize(new_block, index, mtr))) { + + if (UNIV_UNLIKELY + (!page_zip_decompress(new_page_zip, + new_page))) { + ut_error; + } + ut_ad(page_validate(new_page, index)); + return(NULL); + } else { + /* The page was reorganized: + Seek to ret_pos. */ + ret = new_page + PAGE_NEW_INFIMUM; + + do { + ret = rec_get_next_ptr(ret, TRUE); + } while (--ret_pos); + } + } + } + + /* Update MAX_TRX_ID, the lock table, and possible hash index */ + + page_update_max_trx_id(new_block, new_page_zip, + page_get_max_trx_id(page_align(rec))); + + lock_move_rec_list_start(new_block, block, rec, ret); + + btr_search_move_or_delete_hash_entries(new_block, block, index); + + return(ret); +} + +/************************************************************** +Writes a log record of a record list end or start deletion. */ +UNIV_INLINE +void +page_delete_rec_list_write_log( +/*===========================*/ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + byte type, /* in: operation type: + MLOG_LIST_END_DELETE, ... */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2); + if (log_ptr) { + /* Write the parameter as a 2-byte ulint */ + mach_write_to_2(log_ptr, page_offset(rec)); + mlog_close(mtr, log_ptr + 2); + } +} + +/************************************************************** +Parses a log record of a record list end or start deletion. 
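+
+The record body is just the 2-byte page offset of the boundary record,
+as written by page_delete_rec_list_write_log() above; sketched:
+
+	[MLOG_(COMP_)LIST_{END,START}_DELETE + index information]
+	[2 bytes: offset of the first record to delete (END variants)
+	or of the first record to keep (START variants)]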
*/ +UNIV_INTERN +byte* +page_parse_delete_rec_list( +/*=======================*/ + /* out: end of log record or NULL */ + byte type, /* in: MLOG_LIST_END_DELETE, + MLOG_LIST_START_DELETE, + MLOG_COMP_LIST_END_DELETE or + MLOG_COMP_LIST_START_DELETE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + buf_block_t* block, /* in/out: buffer block or NULL */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + page_t* page; + ulint offset; + + ut_ad(type == MLOG_LIST_END_DELETE + || type == MLOG_LIST_START_DELETE + || type == MLOG_COMP_LIST_END_DELETE + || type == MLOG_COMP_LIST_START_DELETE); + + /* Read the record offset as a 2-byte ulint */ + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + ptr += 2; + + if (!block) { + + return(ptr); + } + + page = buf_block_get_frame(block); + + ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table)); + + if (type == MLOG_LIST_END_DELETE + || type == MLOG_COMP_LIST_END_DELETE) { + page_delete_rec_list_end(page + offset, block, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, + mtr); + } else { + page_delete_rec_list_start(page + offset, block, index, mtr); + } + + return(ptr); +} + +/***************************************************************** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. */ +UNIV_INTERN +void +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /* in: pointer to record on page */ + buf_block_t* block, /* in: buffer block of the page */ + dict_index_t* index, /* in: record descriptor */ + ulint n_recs, /* in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /* in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /* in: mtr */ +{ + page_dir_slot_t*slot; + ulint slot_index; + rec_t* last_rec; + rec_t* prev_rec; + ulint n_owned; + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = page_align(rec); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); + ut_ad(!page_zip || page_rec_is_comp(rec)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_infimum(rec)) { + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return; + } + + /* Reset the last insert info in the page header and increment + the modify clock for the frame */ + + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + + /* The page gets invalid for optimistic searches: increment the + frame modify clock */ + + buf_block_modify_clock_inc(block); + + page_delete_rec_list_write_log(rec, index, page_is_comp(page) + ? 
MLOG_COMP_LIST_END_DELETE + : MLOG_LIST_END_DELETE, mtr); + + if (UNIV_LIKELY_NULL(page_zip)) { + ulint log_mode; + + ut_a(page_is_comp(page)); + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + do { + page_cur_t cur; + page_cur_position(rec, block, &cur); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + rec = rec_get_next_ptr(rec, TRUE); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&cur, index, offsets, mtr); + } while (page_offset(rec) != PAGE_NEW_SUPREMUM); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); + return; + } + + prev_rec = page_rec_get_prev(rec); + + last_rec = page_rec_get_prev(page_get_supremum_rec(page)); + + if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { + rec_t* rec2 = rec; + /* Calculate the sum of sizes and the number of records */ + size = 0; + n_recs = 0; + + do { + ulint s; + offsets = rec_get_offsets(rec2, index, offsets, + ULINT_UNDEFINED, &heap); + s = rec_offs_size(offsets); + ut_ad(rec2 - page + s - rec_offs_extra_size(offsets) + < UNIV_PAGE_SIZE); + ut_ad(size + s < UNIV_PAGE_SIZE); + size += s; + n_recs++; + + rec2 = page_rec_get_next(rec2); + } while (!page_rec_is_supremum(rec2)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + ut_ad(size < UNIV_PAGE_SIZE); + + /* Update the page directory; there is no need to balance the number + of the records owned by the supremum record, as it is allowed to be + less than PAGE_DIR_SLOT_MIN_N_OWNED */ + + if (page_is_comp(page)) { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_new(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, TRUE); + } + + ut_ad(rec_get_n_owned_new(rec2) > count); + + n_owned = rec_get_n_owned_new(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + slot = page_dir_get_nth_slot(page, slot_index); + } else { + rec_t* rec2 = rec; + ulint count = 0; + + while (rec_get_n_owned_old(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, FALSE); + } + + ut_ad(rec_get_n_owned_old(rec2) > count); + + n_owned = rec_get_n_owned_old(rec2) - count; + slot_index = page_dir_find_owner_slot(rec2); + slot = page_dir_get_nth_slot(page, slot_index); + } + + page_dir_slot_set_rec(slot, page_get_supremum_rec(page)); + page_dir_slot_set_n_owned(slot, NULL, n_owned); + + page_dir_set_n_slots(page, NULL, slot_index + 1); + + /* Remove the record chain segment from the record chain */ + page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + + /* Catenate the deleted chain segment to the page free list */ + + page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE)); + page_header_set_ptr(page, NULL, PAGE_FREE, rec); + + page_header_set_field(page, NULL, PAGE_GARBAGE, size + + page_header_get_field(page, PAGE_GARBAGE)); + + page_header_set_field(page, NULL, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - n_recs)); +} + +/***************************************************************** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. 
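+
+Only the single list-delete record written by
+page_delete_rec_list_write_log() is logged; the individual deletes
+below are performed under MTR_LOG_NONE, so recovery replays the whole
+range from the one 2-byte offset.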
*/ +UNIV_INTERN +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /* in: record on page */ + buf_block_t* block, /* in: buffer block of the page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_cur_t cur1; + ulint log_mode; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + mem_heap_t* heap = NULL; + byte type; + + rec_offs_init(offsets_); + + ut_ad((ibool) !!page_rec_is_comp(rec) + == dict_table_is_comp(index->table)); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + + /* page_zip_validate() would detect a min_rec_mark mismatch + in btr_page_split_and_insert() + between btr_attach_half_pages() and insert_page = ... + when btr_page_get_split_rec_to_left() holds + (direction == FSP_DOWN). */ + ut_a(!page_zip || page_zip_validate_low(page_zip, page, TRUE)); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (page_rec_is_infimum(rec)) { + + return; + } + + if (page_rec_is_comp(rec)) { + type = MLOG_COMP_LIST_START_DELETE; + } else { + type = MLOG_LIST_START_DELETE; + } + + page_delete_rec_list_write_log(rec, index, type, mtr); + + page_cur_set_before_first(block, &cur1); + page_cur_move_to_next(&cur1); + + /* Individual deletes are not logged */ + + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + while (page_cur_get_rec(&cur1) != rec) { + offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, + offsets, ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&cur1, index, offsets, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Restore log mode */ + + mtr_set_log_mode(mtr, log_mode); +} + +/***************************************************************** +Moves record list end to another page. Moved records include +split_rec. */ +UNIV_INTERN +ibool +page_move_rec_list_end( +/*===================*/ + /* out: TRUE on success; FALSE on + compression failure + (new_block will be decompressed) */ + buf_block_t* new_block, /* in/out: index page where to move */ + buf_block_t* block, /* in: index page from where to move */ + rec_t* split_rec, /* in: first record to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + ulint old_data_size; + ulint new_data_size; + ulint old_n_recs; + ulint new_n_recs; + + old_data_size = page_get_data_size(new_page); + old_n_recs = page_get_n_recs(new_page); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!new_page_zip == !page_zip); + ut_a(!new_page_zip + || page_zip_validate(new_page_zip, new_page)); + ut_a(!page_zip + || page_zip_validate(page_zip, page_align(split_rec))); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + new_data_size = page_get_data_size(new_page); + new_n_recs = page_get_n_recs(new_page); + + ut_ad(new_data_size >= old_data_size); + + page_delete_rec_list_end(split_rec, block, index, + new_n_recs - old_n_recs, + new_data_size - old_data_size, mtr); + + return(TRUE); +} + +/***************************************************************** +Moves record list start to another page. Moved records do not include +split_rec. 
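+
+The move is a straight composition of page_copy_rec_list_start() and
+page_delete_rec_list_start(): if the copy step fails because the
+compressed page overflows, nothing is deleted from the source page
+and FALSE is returned.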
*/ +UNIV_INTERN +ibool +page_move_rec_list_start( +/*=====================*/ + /* out: TRUE on success; FALSE on + compression failure */ + buf_block_t* new_block, /* in/out: index page where to move */ + buf_block_t* block, /* in/out: page containing split_rec */ + rec_t* split_rec, /* in: first record not to move */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr) /* in: mtr */ +{ + if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block, + split_rec, index, mtr))) { + return(FALSE); + } + + page_delete_rec_list_start(split_rec, block, index, mtr); + + return(TRUE); +} + +/*************************************************************************** +This is a low-level operation which is used in a database index creation +to update the page number of a created B-tree to a data dictionary record. */ +UNIV_INTERN +void +page_rec_write_index_page_no( +/*=========================*/ + rec_t* rec, /* in: record to update */ + ulint i, /* in: index of the field to update */ + ulint page_no,/* in: value to write */ + mtr_t* mtr) /* in: mtr */ +{ + byte* data; + ulint len; + + data = rec_get_nth_field_old(rec, i, &len); + + ut_ad(len == 4); + + mlog_write_ulint(data, page_no, MLOG_4BYTES, mtr); +} + +/****************************************************************** +Used to delete n slots from the directory. This function updates +also n_owned fields in the records, so that the first slot after +the deleted ones inherits the records of the deleted slots. */ +UNIV_INLINE +void +page_dir_delete_slot( +/*=================*/ + page_t* page, /* in/out: the index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint slot_no)/* in: slot to be deleted */ +{ + page_dir_slot_t* slot; + ulint n_owned; + ulint i; + ulint n_slots; + + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + ut_ad(slot_no + 1 < page_dir_get_n_slots(page)); + + n_slots = page_dir_get_n_slots(page); + + /* 1. Reset the n_owned fields of the slots to be + deleted */ + slot = page_dir_get_nth_slot(page, slot_no); + n_owned = page_dir_slot_get_n_owned(slot); + page_dir_slot_set_n_owned(slot, page_zip, 0); + + /* 2. Update the n_owned value of the first non-deleted slot */ + + slot = page_dir_get_nth_slot(page, slot_no + 1); + page_dir_slot_set_n_owned(slot, page_zip, + n_owned + page_dir_slot_get_n_owned(slot)); + + /* 3. Destroy the slot by copying slots */ + for (i = slot_no + 1; i < n_slots; i++) { + rec_t* rec = (rec_t*) + page_dir_slot_get_rec(page_dir_get_nth_slot(page, i)); + page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec); + } + + /* 4. Zero out the last slot, which will be removed */ + mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0); + + /* 5. Update the page header */ + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1); +} + +/****************************************************************** +Used to add n slots to the directory. Does not set the record pointers +in the added slots or update n_owned values: this is the responsibility +of the caller. 
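+In its present form the function adds exactly one slot above the given
+position; the "n slots" wording is historical.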
*/
+UNIV_INLINE
+void
+page_dir_add_slot(
+/*==============*/
+	page_t*		page,	/* in/out: the index page */
+	page_zip_des_t*	page_zip,/* in/out: compressed page, or NULL */
+	ulint		start)	/* in: the slot above which the new slots
+				are added */
+{
+	page_dir_slot_t*	slot;
+	ulint			n_slots;
+
+	n_slots = page_dir_get_n_slots(page);
+
+	ut_ad(start < n_slots - 1);
+
+	/* Update the page header */
+	page_dir_set_n_slots(page, page_zip, n_slots + 1);
+
+	/* Move slots up */
+	slot = page_dir_get_nth_slot(page, n_slots);
+	memmove(slot, slot + PAGE_DIR_SLOT_SIZE,
+		(n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE);
+}
+
+/********************************************************************
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+	page_t*		page,	/* in/out: index page */
+	page_zip_des_t*	page_zip,/* in/out: compressed page whose
+				uncompressed part will be written, or NULL */
+	ulint		slot_no)/* in: the directory slot */
+{
+	rec_t*			rec;
+	page_dir_slot_t*	new_slot;
+	page_dir_slot_t*	prev_slot;
+	page_dir_slot_t*	slot;
+	ulint			i;
+	ulint			n_owned;
+
+	ut_ad(page);
+	ut_ad(!page_zip || page_is_comp(page));
+	ut_ad(slot_no > 0);
+
+	slot = page_dir_get_nth_slot(page, slot_no);
+
+	n_owned = page_dir_slot_get_n_owned(slot);
+	ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1);
+
+	/* 1. We loop to find a record approximately in the middle of the
+	records owned by the slot. */
+
+	prev_slot = page_dir_get_nth_slot(page, slot_no - 1);
+	rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+
+	for (i = 0; i < n_owned / 2; i++) {
+		rec = page_rec_get_next(rec);
+	}
+
+	ut_ad(n_owned / 2 >= PAGE_DIR_SLOT_MIN_N_OWNED);
+
+	/* 2. We add one directory slot immediately below the slot to be
+	split. */
+
+	page_dir_add_slot(page, page_zip, slot_no - 1);
+
+	/* The added slot is now number slot_no, and the old slot is
+	now number slot_no + 1 */
+
+	new_slot = page_dir_get_nth_slot(page, slot_no);
+	slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+	/* 3. We store the appropriate values to the new slot. */
+
+	page_dir_slot_set_rec(new_slot, rec);
+	page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2);
+
+	/* 4. Finally, we update the number of records field of the
+	original slot */
+
+	page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2));
+}
+
+/*****************************************************************
+Tries to balance the given directory slot with too few records with the upper
+neighbor, so that there are at least the minimum number of records owned by
+the slot; this may result in the merging of two slots. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+	page_t*		page,	/* in/out: index page */
+	page_zip_des_t*	page_zip,/* in/out: compressed page, or NULL */
+	ulint		slot_no)/* in: the directory slot */
+{
+	page_dir_slot_t*	slot;
+	page_dir_slot_t*	up_slot;
+	ulint			n_owned;
+	ulint			up_n_owned;
+	rec_t*			old_rec;
+	rec_t*			new_rec;
+
+	ut_ad(page);
+	ut_ad(!page_zip || page_is_comp(page));
+	ut_ad(slot_no > 0);
+
+	slot = page_dir_get_nth_slot(page, slot_no);
+
+	/* The last directory slot cannot be balanced with the upper
+	neighbor, as there is none.
*/ + + if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) { + + return; + } + + up_slot = page_dir_get_nth_slot(page, slot_no + 1); + + n_owned = page_dir_slot_get_n_owned(slot); + up_n_owned = page_dir_slot_get_n_owned(up_slot); + + ut_ad(n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1); + + /* If the upper slot has the minimum value of n_owned, we will merge + the two slots, therefore we assert: */ + ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED); + + if (up_n_owned > PAGE_DIR_SLOT_MIN_N_OWNED) { + + /* In this case we can just transfer one record owned + by the upper slot to the property of the lower slot */ + old_rec = (rec_t*) page_dir_slot_get_rec(slot); + + if (page_is_comp(page)) { + new_rec = rec_get_next_ptr(old_rec, TRUE); + + rec_set_n_owned_new(old_rec, page_zip, 0); + rec_set_n_owned_new(new_rec, page_zip, n_owned + 1); + } else { + new_rec = rec_get_next_ptr(old_rec, FALSE); + + rec_set_n_owned_old(old_rec, 0); + rec_set_n_owned_old(new_rec, n_owned + 1); + } + + page_dir_slot_set_rec(slot, new_rec); + + page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned -1); + } else { + /* In this case we may merge the two slots */ + page_dir_delete_slot(page, page_zip, slot_no); + } +} + +/**************************************************************** +Returns the middle record of the record list. If there are an even number +of records in the list, returns the first record of the upper half-list. */ +UNIV_INTERN +rec_t* +page_get_middle_rec( +/*================*/ + /* out: middle record */ + page_t* page) /* in: page */ +{ + page_dir_slot_t* slot; + ulint middle; + ulint i; + ulint n_owned; + ulint count; + rec_t* rec; + + /* This many records we must leave behind */ + middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2; + + count = 0; + + for (i = 0;; i++) { + + slot = page_dir_get_nth_slot(page, i); + n_owned = page_dir_slot_get_n_owned(slot); + + if (count + n_owned > middle) { + break; + } else { + count += n_owned; + } + } + + ut_ad(i > 0); + slot = page_dir_get_nth_slot(page, i - 1); + rec = (rec_t*) page_dir_slot_get_rec(slot); + rec = page_rec_get_next(rec); + + /* There are now count records behind rec */ + + for (i = 0; i < middle - count; i++) { + rec = page_rec_get_next(rec); + } + + return(rec); +} + +/******************************************************************* +Returns the number of records before the given record in chain. +The number includes infimum and supremum records. 
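+For example, the function returns 0 for the infimum, 1 for the first
+user record, and the number of user records plus 1 for the supremum.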
*/ +UNIV_INTERN +ulint +page_rec_get_n_recs_before( +/*=======================*/ + /* out: number of records */ + const rec_t* rec) /* in: the physical record */ +{ + const page_dir_slot_t* slot; + const rec_t* slot_rec; + const page_t* page; + ulint i; + lint n = 0; + + ut_ad(page_rec_check(rec)); + + page = page_align(rec); + if (page_is_comp(page)) { + while (rec_get_n_owned_new(rec) == 0) { + + rec = rec_get_next_ptr_const(rec, TRUE); + n--; + } + + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); + + n += rec_get_n_owned_new(slot_rec); + + if (rec == slot_rec) { + + break; + } + } + } else { + while (rec_get_n_owned_old(rec) == 0) { + + rec = rec_get_next_ptr_const(rec, FALSE); + n--; + } + + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); + + n += rec_get_n_owned_old(slot_rec); + + if (rec == slot_rec) { + + break; + } + } + } + + n--; + + ut_ad(n >= 0); + + return((ulint) n); +} + +/**************************************************************** +Prints record contents including the data relevant only in +the index page context. */ +UNIV_INTERN +void +page_rec_print( +/*===========*/ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: record descriptor */ +{ + ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + rec_print_new(stderr, rec, offsets); + if (page_rec_is_comp(rec)) { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) rec_get_heap_no_new(rec), + (ulong) rec_get_next_offs(rec, TRUE)); + } else { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) rec_get_heap_no_old(rec), + (ulong) rec_get_next_offs(rec, TRUE)); + } + + page_rec_check(rec); + rec_validate(rec, offsets); +} + +/******************************************************************* +This is used to print the contents of the directory for +debugging purposes. */ +UNIV_INTERN +void +page_dir_print( +/*===========*/ + page_t* page, /* in: index page */ + ulint pr_n) /* in: print n first and n last entries */ +{ + ulint n; + ulint i; + page_dir_slot_t* slot; + + n = page_dir_get_n_slots(page); + + fprintf(stderr, "--------------------------------\n" + "PAGE DIRECTORY\n" + "Page address %p\n" + "Directory stack top at offs: %lu; number of slots: %lu\n", + page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)), + (ulong) n); + for (i = 0; i < n; i++) { + slot = page_dir_get_nth_slot(page, i); + if ((i == pr_n) && (i < n - pr_n)) { + fputs(" ... \n", stderr); + } + if ((i < pr_n) || (i >= n - pr_n)) { + fprintf(stderr, + "Contents of slot: %lu: n_owned: %lu," + " rec offs: %lu\n", + (ulong) i, + (ulong) page_dir_slot_get_n_owned(slot), + (ulong) + page_offset(page_dir_slot_get_rec(slot))); + } + } + fprintf(stderr, "Total of %lu records\n" + "--------------------------------\n", + (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page))); +} + +/******************************************************************* +This is used to print the contents of the page record list for +debugging purposes. 
*/ +UNIV_INTERN +void +page_print_list( +/*============*/ + buf_block_t* block, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint pr_n) /* in: print n first and n last entries */ +{ + page_t* page = block->frame; + page_cur_t cur; + ulint count; + ulint n_recs; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table)); + + fprintf(stderr, + "--------------------------------\n" + "PAGE RECORD LIST\n" + "Page address %p\n", page); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cur); + count = 0; + for (;;) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + + if (count == pr_n) { + break; + } + if (page_cur_is_after_last(&cur)) { + break; + } + page_cur_move_to_next(&cur); + count++; + } + + if (n_recs > 2 * pr_n) { + fputs(" ... \n", stderr); + } + + while (!page_cur_is_after_last(&cur)) { + page_cur_move_to_next(&cur); + + if (count + pr_n >= n_recs) { + offsets = rec_get_offsets(cur.rec, index, offsets, + ULINT_UNDEFINED, &heap); + page_rec_print(cur.rec, offsets); + } + count++; + } + + fprintf(stderr, + "Total of %lu records \n" + "--------------------------------\n", + (ulong) (count + 1)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/******************************************************************* +Prints the info in a page header. */ +UNIV_INTERN +void +page_header_print( +/*==============*/ + const page_t* page) +{ + fprintf(stderr, + "--------------------------------\n" + "PAGE HEADER INFO\n" + "Page address %p, n records %lu (%s)\n" + "n dir slots %lu, heap top %lu\n" + "Page n heap %lu, free %lu, garbage %lu\n" + "Page last insert %lu, direction %lu, n direction %lu\n", + page, (ulong) page_header_get_field(page, PAGE_N_RECS), + page_is_comp(page) ? "compact format" : "original format", + (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS), + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) page_dir_get_n_heap(page), + (ulong) page_header_get_field(page, PAGE_FREE), + (ulong) page_header_get_field(page, PAGE_GARBAGE), + (ulong) page_header_get_field(page, PAGE_LAST_INSERT), + (ulong) page_header_get_field(page, PAGE_DIRECTION), + (ulong) page_header_get_field(page, PAGE_N_DIRECTION)); +} + +/******************************************************************* +This is used to print the contents of the page for +debugging purposes. */ +UNIV_INTERN +void +page_print( +/*=======*/ + buf_block_t* block, /* in: index page */ + dict_index_t* index, /* in: dictionary index of the page */ + ulint dn, /* in: print dn first and last entries + in directory */ + ulint rn) /* in: print rn first and last records + in directory */ +{ + page_t* page = block->frame; + + page_header_print(page); + page_dir_print(page, dn); + page_print_list(block, index, rn); +} + +/******************************************************************* +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. 
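+Both extra checks need the page context: n_owned must not exceed
+PAGE_DIR_SLOT_MAX_N_OWNED, and heap_no must be smaller than the
+page's record heap count (PAGE_N_HEAP).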
*/ +UNIV_INTERN +ibool +page_rec_validate( +/*==============*/ + /* out: TRUE if ok */ + rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n_owned; + ulint heap_no; + page_t* page; + + page = page_align(rec); + ut_a(!page_is_comp(page) == !rec_offs_comp(offsets)); + + page_rec_check(rec); + rec_validate(rec, offsets); + + if (page_rec_is_comp(rec)) { + n_owned = rec_get_n_owned_new(rec); + heap_no = rec_get_heap_no_new(rec); + } else { + n_owned = rec_get_n_owned_old(rec); + heap_no = rec_get_heap_no_old(rec); + } + + if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) { + fprintf(stderr, + "InnoDB: Dir slot of rec %lu, n owned too big %lu\n", + (ulong) page_offset(rec), (ulong) n_owned); + return(FALSE); + } + + if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) { + fprintf(stderr, + "InnoDB: Heap no of rec %lu too big %lu %lu\n", + (ulong) page_offset(rec), (ulong) heap_no, + (ulong) page_dir_get_n_heap(page)); + return(FALSE); + } + + return(TRUE); +} + +/******************************************************************* +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +UNIV_INTERN +void +page_check_dir( +/*===========*/ + const page_t* page) /* in: index page */ +{ + ulint n_slots; + ulint infimum_offs; + ulint supremum_offs; + + n_slots = page_dir_get_n_slots(page); + infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0)); + supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page, + n_slots - 1)); + + if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " infimum not pointed to\n"); + buf_page_print(page, 0); + } + + if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) { + + fprintf(stderr, + "InnoDB: Page directory corruption:" + " supremum not pointed to\n"); + buf_page_print(page, 0); + } +} + +/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ +UNIV_INTERN +ibool +page_simple_validate_old( +/*=====================*/ + /* out: TRUE if ok */ + page_t* page) /* in: old-style index page */ +{ + page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + rec_t* rec; + byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(!page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu of page dir slots\n", + (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. 
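The n_owned bookkeeping that the loop applies can be pictured in isolation. A toy model (illustrative names, not part of this patch): every record joins the group of the next directory owner; a record with nonzero n_owned must own exactly the group accumulated so far, and then a new group starts.

	static int
	owned_counts_ok(const int* n_owned, int n_recs)
	{
		int	i;
		int	own = 0;

		for (i = 0; i < n_recs; i++) {
			own++;			/* joins the current group */
			if (n_owned[i] == 0) {
				continue;	/* owned, not an owner */
			}
			if (n_owned[i] != own) {
				return(0);	/* directory is corrupt */
			}
			own = 0;		/* a new group starts */
		}

		return(own == 0);	/* the last record must be an owner */
	}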
*/ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above" + " rec heap top %lu\n", + (ulong)(rec - page), + (ulong)(rec_heap_top - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) own_count, + (ulong)(rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong)(rec - page)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset" + " nonsensical %lu for rec %lu\n", + (ulong) rec_get_next_offs(rec, FALSE), + (ulong) (rec - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) (rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) (rec - page), + (ulong) (rec_heap_top - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. 
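The resilience rests on two guards that the record-list loop applies to every next-record link; distilled into a sketch (next_offs() and the starting offset are illustrative stand-ins for the accessors used below):

	ulint	offs = first_user_rec_offs;
	ulint	steps = 0;

	while (offs != PAGE_NEW_SUPREMUM) {
		if (offs < FIL_PAGE_DATA || offs >= UNIV_PAGE_SIZE) {
			return(FALSE);	/* link points outside the page */
		}
		if (++steps > UNIV_PAGE_SIZE) {
			return(FALSE);	/* more steps than bytes on the
					page: the list must be circular */
		}
		offs = next_offs(offs);
	}

Every failure path returns FALSE instead of dereferencing a wild pointer, so arbitrary garbage cannot crash the check.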
*/ +UNIV_INTERN +ibool +page_simple_validate_new( +/*=====================*/ + /* out: TRUE if ok */ + page_t* page) /* in: new-style index page */ +{ + page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + rec_t* rec; + byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu" + " of page dir slots\n", (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page," + " heap top %lu, dir %lu\n", + (ulong) page_header_get_field(page, PAGE_HEAP_TOP), + (ulong) + page_offset(page_dir_get_nth_slot(page, n_slots - 1))); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. */ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above rec" + " heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu," + " rec %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) own_count, + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY + (page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point" + " to right rec %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + own_count = 0; + + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + + break; + } + + if (UNIV_UNLIKELY + (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset nonsensical %lu" + " for rec %lu\n", + (ulong) rec_get_next_offs(rec, TRUE), + (ulong) page_offset(rec)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero" + " in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + 
fprintf(stderr, + "InnoDB: Free list record has" + " a nonsensical offset %lu\n", + (ulong) page_offset(rec)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu" + " is above rec heap top %lu\n", + (ulong) page_offset(rec), + (ulong) page_offset(rec_heap_top)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears" + " to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/******************************************************************* +This function checks the consistency of an index page. */ +UNIV_INTERN +ibool +page_validate( +/*==========*/ + /* out: TRUE if ok */ + page_t* page, /* in: index page */ + dict_index_t* index) /* in: data dictionary index containing + the page record type definition */ +{ + page_dir_slot_t*slot; + mem_heap_t* heap; + byte* buf; + ulint count; + ulint own_count; + ulint rec_own_count; + ulint slot_no; + ulint data_size; + rec_t* rec; + rec_t* old_rec = NULL; + ulint offs; + ulint n_slots; + ibool ret = FALSE; + ulint i; + ulint* offsets = NULL; + ulint* old_offsets = NULL; + + if (UNIV_UNLIKELY((ibool) !!page_is_comp(page) + != dict_table_is_comp(index->table))) { + fputs("InnoDB: 'compact format' flag mismatch\n", stderr); + goto func_exit2; + } + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(!page_simple_validate_new(page))) { + goto func_exit2; + } + } else { + if (UNIV_UNLIKELY(!page_simple_validate_old(page))) { + goto func_exit2; + } + } + + heap = mem_heap_create(UNIV_PAGE_SIZE + 200); + + /* The following buffer is used to check that the + records in the page record heap do not overlap */ + + buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP) + <= page_dir_get_nth_slot(page, n_slots - 1)))) { + + fputs("InnoDB: Record heap and dir overlap on a page ", + stderr); + dict_index_name_print(stderr, NULL, index); + fprintf(stderr, ", %p, %p\n", + page_header_get_ptr(page, PAGE_HEAP_TOP), + page_dir_get_nth_slot(page, n_slots - 1)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that + it is consistent with the directory. 
*/ + count = 0; + data_size = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + rec = page_get_infimum_rec(page); + + for (;;) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (page_is_comp(page) && page_rec_is_user_rec(rec) + && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec) + == page_is_leaf(page))) { + fputs("InnoDB: node_ptr flag mismatch\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + goto func_exit; + } + + /* Check that the records are in the ascending order */ + if (UNIV_LIKELY(count >= PAGE_HEAP_NO_USER_LOW) + && !page_rec_is_supremum(rec)) { + if (UNIV_UNLIKELY + (1 != cmp_rec_rec(rec, old_rec, + offsets, old_offsets, index))) { + fprintf(stderr, + "InnoDB: Records in wrong order" + " on page %lu ", + (ulong) page_get_page_no(page)); + dict_index_name_print(stderr, NULL, index); + fputs("\nInnoDB: previous record ", stderr); + rec_print_new(stderr, old_rec, old_offsets); + fputs("\nInnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + + goto func_exit; + } + } + + if (page_rec_is_user_rec(rec)) { + + data_size += rec_offs_size(offsets); + } + + offs = page_offset(rec_get_start(rec, offsets)); + + for (i = rec_offs_size(offsets); i--; ) { + if (UNIV_UNLIKELY(buf[offs + i])) { + /* No other record may overlap this */ + + fputs("InnoDB: Record overlaps another\n", + stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + if (page_is_comp(page)) { + rec_own_count = rec_get_n_owned_new(rec); + } else { + rec_own_count = rec_get_n_owned_old(rec); + } + + if (UNIV_UNLIKELY(rec_own_count)) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_own_count != own_count)) { + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu\n", + (ulong) rec_own_count, + (ulong) own_count); + goto func_exit; + } + + if (page_dir_slot_get_rec(slot) != rec) { + fputs("InnoDB: Dir slot does not" + " point to right rec\n", + stderr); + goto func_exit; + } + + page_dir_slot_check(slot); + + own_count = 0; + if (!page_rec_is_supremum(rec)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_rec_is_supremum(rec)) { + break; + } + + count++; + own_count++; + old_rec = rec; + rec = page_rec_get_next(rec); + + /* set old_offsets to offsets; recycle offsets */ + { + ulint* offs = old_offsets; + old_offsets = offsets; + offsets = offs; + } + } + + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + + goto n_owned_zero; + } + } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { +n_owned_zero: + fputs("InnoDB: n owned is zero\n", stderr); + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + + PAGE_HEAP_NO_USER_LOW, + (ulong) (count + 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) { + fprintf(stderr, + "InnoDB: Summed data size %lu, returned by func %lu\n", + (ulong) data_size, (ulong) page_get_data_size(page)); + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + offsets = rec_get_offsets(rec, index, offsets, + 
ULINT_UNDEFINED, &heap); + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { + + goto func_exit; + } + + count++; + offs = page_offset(rec_get_start(rec, offsets)); + + for (i = rec_offs_size(offsets); i--; ) { + + if (UNIV_UNLIKELY(buf[offs + i])) { + fputs("InnoDB: Record overlaps another" + " in free list\n", stderr); + goto func_exit; + } + + buf[offs + i] = 1; + } + + rec = page_rec_get_next(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) count + 1); + goto func_exit; + } + + ret = TRUE; + +func_exit: + mem_heap_free(heap); + + if (UNIV_UNLIKELY(ret == FALSE)) { +func_exit2: + fprintf(stderr, "InnoDB: Apparent corruption in page %lu in ", + (ulong) page_get_page_no(page)); + dict_index_name_print(stderr, NULL, index); + putc('\n', stderr); + buf_page_print(page, 0); + } + + return(ret); +} + +/******************************************************************* +Looks in the page record list for a record with the given heap number. */ +UNIV_INTERN +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + /* out: record, NULL if not found */ + const page_t* page, /* in: index page */ + ulint heap_no)/* in: heap number */ +{ + const rec_t* rec; + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + for(;;) { + ulint rec_heap_no = rec_get_heap_no_new(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, TRUE); + } + } else { + rec = page + PAGE_OLD_INFIMUM; + + for (;;) { + ulint rec_heap_no = rec_get_heap_no_old(rec); + + if (rec_heap_no == heap_no) { + + return(rec); + } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) { + + return(NULL); + } + + rec = page + rec_get_next_offs(rec, FALSE); + } + } +} diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.c new file mode 100644 index 00000000000..56189ce3bad --- /dev/null +++ b/storage/xtradb/page/page0zip.c @@ -0,0 +1,4575 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Compressed page interface + +Created June 2005 by Marko Makela +*******************************************************/ + +#define THIS_MODULE +#include "page0zip.h" +#ifdef UNIV_NONINL +# include "page0zip.ic" +#endif +#undef THIS_MODULE +#include "page0page.h" +#include "mtr0log.h" +#include "ut0sort.h" +#include "dict0boot.h" +#include "dict0dict.h" +#include "btr0sea.h" +#include "btr0cur.h" +#include "page0types.h" +#include "lock0lock.h" +#include "log0recv.h" +#include "zlib.h" +#include "buf0lru.h" + +/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */ +UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE - 1]; + +/* Please refer to ../include/page0zip.ic for a description of the +compressed page format. */ + +/* The infimum and supremum records are omitted from the compressed page. +On compress, we compare that the records are there, and on uncompress we +restore the records. */ +static const byte infimum_extra[] = { + 0x01, /* info_bits=0, n_owned=1 */ + 0x00, 0x02 /* heap_no=0, status=2 */ + /* ?, ? */ /* next=(first user rec, or supremum) */ +}; +static const byte infimum_data[] = { + 0x69, 0x6e, 0x66, 0x69, + 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */ +}; +static const byte supremum_extra_data[] = { + /* 0x0?, */ /* info_bits=0, n_owned=1..8 */ + 0x00, 0x0b, /* heap_no=1, status=3 */ + 0x00, 0x00, /* next=0 */ + 0x73, 0x75, 0x70, 0x72, + 0x65, 0x6d, 0x75, 0x6d /* "supremum" */ +}; + +/** Assert that a block of memory is filled with zero bytes. +Compare at most sizeof(field_ref_zero) bytes. */ +#define ASSERT_ZERO(b, s) \ + ut_ad(!memcmp(b, field_ref_zero, ut_min(s, sizeof field_ref_zero))) +/** Assert that a BLOB pointer is filled with zero bytes. */ +#define ASSERT_ZERO_BLOB(b) \ + ut_ad(!memcmp(b, field_ref_zero, sizeof field_ref_zero)) + +/* Enable some extra debugging output. This code can be enabled +independently of any UNIV_ debugging conditions. */ +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# include <stdarg.h> +__attribute__((format (printf, 1, 2))) +/************************************************************************** +Report a failure to decompress or compress. */ +static +int +page_zip_fail_func( +/*===============*/ + /* out: number of characters printed */ + const char* fmt, /* in: printf(3) format string */ + ...) /* in: arguments corresponding to fmt */ +{ + int res; + va_list ap; + + ut_print_timestamp(stderr); + fputs(" InnoDB: ", stderr); + va_start(ap, fmt); + res = vfprintf(stderr, fmt, ap); + va_end(ap); + + return(res); +} +# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args +#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +# define page_zip_fail(fmt_args) /* empty */ +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ + +/************************************************************************** +Determine the guaranteed free space on an empty page.
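As a worked example, take zip_size = 8192 and n_fields = 10, and assume the usual constants (PAGE_DATA = 94, PAGE_ZIP_DIR_SLOT_SIZE = 2, DATA_TRX_ID_LEN = 6, DATA_ROLL_PTR_LEN = 7, REC_N_NEW_EXTRA_BYTES = 5; the exact values depend on the build):

	header + one record:	94 + 2 + 6 + 7 + 1 + 1 - 5	= 106
	encoded index fields:	compressBound(2 * (10 + 1))	= 35 (zlib 1.2)
	guaranteed payload:	8192 - 106 - 35			= 8051 bytes

so a freshly emptied 8 KiB compressed page is guaranteed to hold at least 8051 bytes of record payload.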
*/ +UNIV_INTERN +ulint +page_zip_empty_size( +/*================*/ + /* out: minimum payload size on the page */ + ulint n_fields, /* in: number of columns in the index */ + ulint zip_size) /* in: compressed page size in bytes */ +{ + lint size = zip_size + /* subtract the page header and the longest + uncompressed data needed for one record */ + - (PAGE_DATA + + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + + 1/* encoded heap_no==2 in page_zip_write_rec() */ + + 1/* end of modification log */ + - REC_N_NEW_EXTRA_BYTES/* omitted bytes */) + /* subtract the space for page_zip_fields_encode() */ + - compressBound(2 * (n_fields + 1)); + return(size > 0 ? (ulint) size : 0); +} + +/***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +including deleted records (the free list). */ +UNIV_INLINE +ulint +page_zip_dir_size( +/*==============*/ + /* out: length of dense page + directory, in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW); + return(size); +} + +/***************************************************************** +Gets the size of the compressed page trailer (the dense page directory), +only including user records (excluding the free list). */ +UNIV_INLINE +ulint +page_zip_dir_user_size( +/*===================*/ + /* out: length of dense page + directory comprising existing + records, in bytes */ + const page_zip_des_t* page_zip) /* in: compressed page */ +{ + ulint size = PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page_zip->data); + ut_ad(size <= page_zip_dir_size(page_zip)); + return(size); +} + +/***************************************************************** +Find the slot of the given record in the dense page directory. */ +UNIV_INLINE +byte* +page_zip_dir_find_low( +/*==================*/ + /* out: dense directory slot, + or NULL if record not found */ + byte* slot, /* in: start of records */ + byte* end, /* in: end of records */ + ulint offset) /* in: offset of user record */ +{ + ut_ad(slot <= end); + + for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) { + if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK) + == offset) { + return(slot); + } + } + + return(NULL); +} + +/***************************************************************** +Find the slot of the given non-free record in the dense page directory. */ +UNIV_INLINE +byte* +page_zip_dir_find( +/*==============*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip), + end, + offset)); +} + +/***************************************************************** +Find the slot of the given free record in the dense page directory. 
*/ +UNIV_INLINE +byte* +page_zip_dir_find_free( +/*===================*/ + /* out: dense directory slot, + or NULL if record not found */ + page_zip_des_t* page_zip, /* in: compressed page */ + ulint offset) /* in: offset of user record */ +{ + byte* end = page_zip->data + page_zip_get_size(page_zip); + + ut_ad(page_zip_simple_validate(page_zip)); + + return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip), + end - page_zip_dir_user_size(page_zip), + offset)); +} + +/***************************************************************** +Read a given slot in the dense page directory. */ +UNIV_INLINE +ulint +page_zip_dir_get( +/*=============*/ + /* out: record offset + on the uncompressed page, + possibly ORed with + PAGE_ZIP_DIR_SLOT_DEL or + PAGE_ZIP_DIR_SLOT_OWNED */ + const page_zip_des_t* page_zip, /* in: compressed page */ + ulint slot) /* in: slot + (0=first user record) */ +{ + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE); + return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1))); +} + +/************************************************************************** +Write a log record of compressing an index page. */ +static +void +page_zip_compress_write_log( +/*========================*/ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page, /* in: uncompressed page */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction */ +{ + byte* log_ptr; + ulint trailer_size; + + log_ptr = mlog_open(mtr, 11 + 2 + 2); + + if (!log_ptr) { + + return; + } + + /* Read the number of user records. */ + trailer_size = page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW; + /* Multiply by uncompressed of size stored per record */ + if (!page_is_leaf(page)) { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + } else if (dict_index_is_clust(index)) { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE; + } + /* Add the space occupied by BLOB pointers. */ + trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ut_a(page_zip->m_end > PAGE_DATA); +#if FIL_PAGE_DATA > PAGE_DATA +# error "FIL_PAGE_DATA > PAGE_DATA" +#endif + ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip)); + + log_ptr = mlog_write_initial_log_record_fast((page_t*) page, + MLOG_ZIP_PAGE_COMPRESS, + log_ptr, mtr); + mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE); + log_ptr += 2; + mach_write_to_2(log_ptr, trailer_size); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + /* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */ + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4); + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4); + /* Write most of the page header, the compressed stream and + the modification log. */ + mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE, + page_zip->m_end - FIL_PAGE_TYPE); + /* Write the uncompressed trailer of the compressed page. */ + mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip) + - trailer_size, trailer_size); +} + +/********************************************************** +Determine how many externally stored columns are contained +in existing records with smaller heap_no than rec. 
*/ +static +ulint +page_zip_get_n_prev_extern( +/*=======================*/ + const page_zip_des_t* page_zip,/* in: dense page directory on + compressed page */ + const rec_t* rec, /* in: compact physical record + on a B-tree leaf page */ + dict_index_t* index) /* in: record descriptor */ +{ + const page_t* page = page_align(rec); + ulint n_ext = 0; + ulint i; + ulint left; + ulint heap_no; + ulint n_recs = page_get_n_recs(page_zip->data); + + ut_ad(page_is_leaf(page)); + ut_ad(page_is_comp(page)); + ut_ad(dict_table_is_comp(index->table)); + ut_ad(dict_index_is_clust(index)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + left = heap_no - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(!left)) { + return(0); + } + + for (i = 0; i < n_recs; i++) { + const rec_t* r = page + (page_zip_dir_get(page_zip, i) + & PAGE_ZIP_DIR_SLOT_MASK); + + if (rec_get_heap_no_new(r) < heap_no) { + n_ext += rec_get_n_extern_new(r, index, + ULINT_UNDEFINED); + if (!--left) { + break; + } + } + } + + return(n_ext); +} + +/************************************************************************** +Encode the length of a fixed-length column. */ +static +byte* +page_zip_fixed_field_encode( +/*========================*/ + /* out: buf + length of encoded val */ + byte* buf, /* in: pointer to buffer where to write */ + ulint val) /* in: value to write */ +{ + ut_ad(val >= 2); + + if (UNIV_LIKELY(val < 126)) { + /* + 0 = nullable variable field of at most 255 bytes length; + 1 = not null variable field of at most 255 bytes length; + 126 = nullable variable field with maximum length >255; + 127 = not null variable field with maximum length >255 + */ + *buf++ = (byte) val; + } else { + *buf++ = (byte) (0x80 | val >> 8); + *buf++ = (byte) val; + } + + return(buf); +} + +/************************************************************************** +Write the index information for the compressed page. 
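As an illustration, consider a hypothetical page with four fields (INT NOT NULL, INT NOT NULL, CHAR(10) NULL, VARCHAR(300) NOT NULL) and trx_id_pos == ULINT_UNDEFINED. The encoder below emits:

	0x11	the two leading NOT NULL fixed fields as one run: 8 << 1 | 1
	0x14	the nullable CHAR(10): 10 << 1
	0x7f	NOT NULL variable-length field with maximum length > 255
	0x01	trailing byte: the number of nullable fields (one)

Adjacent NOT NULL fixed-length columns are thus folded into a single length code, which the decoder can treat as one opaque fixed-length field.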
*/ +static +ulint +page_zip_fields_encode( +/*===================*/ + /* out: used size of buf */ + ulint n, /* in: number of fields to compress */ + dict_index_t* index, /* in: index comprising at least n fields */ + ulint trx_id_pos,/* in: position of the trx_id column + in the index, or ULINT_UNDEFINED if + this is a non-leaf page */ + byte* buf) /* out: buffer of (n + 1) * 2 bytes */ +{ + const byte* buf_start = buf; + ulint i; + ulint col; + ulint trx_id_col = 0; + /* sum of lengths of preceding non-nullable fixed fields, or 0 */ + ulint fixed_sum = 0; + + ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n); + + for (i = col = 0; i < n; i++) { + dict_field_t* field = dict_index_get_nth_field(index, i); + ulint val; + + if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) { + val = 1; /* set the "not nullable" flag */ + } else { + val = 0; /* nullable field */ + } + + if (!field->fixed_len) { + /* variable-length field */ + const dict_col_t* column + = dict_field_get_col(field); + + if (UNIV_UNLIKELY(column->len > 255) + || UNIV_UNLIKELY(column->mtype == DATA_BLOB)) { + val |= 0x7e; /* max > 255 bytes */ + } + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + *buf++ = (byte) val; + col++; + } else if (val) { + /* fixed-length non-nullable field */ + + if (fixed_sum && UNIV_UNLIKELY + (fixed_sum + field->fixed_len + > DICT_MAX_INDEX_COL_LEN)) { + /* Write out the length of the + preceding non-nullable fields, + to avoid exceeding the maximum + length of a fixed-length column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + if (i && UNIV_UNLIKELY(i == trx_id_pos)) { + if (fixed_sum) { + /* Write out the length of any + preceding non-nullable fields, + and start a new trx_id column. */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + col++; + } + + trx_id_col = col; + fixed_sum = field->fixed_len; + } else { + /* add to the sum */ + fixed_sum += field->fixed_len; + } + } else { + /* fixed-length nullable field */ + + if (fixed_sum) { + /* write out the length of any + preceding non-nullable fields */ + buf = page_zip_fixed_field_encode( + buf, fixed_sum << 1 | 1); + fixed_sum = 0; + col++; + } + + buf = page_zip_fixed_field_encode( + buf, field->fixed_len << 1); + col++; + } + } + + if (fixed_sum) { + /* Write out the lengths of last fixed-length columns. */ + buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1); + } + + if (trx_id_pos != ULINT_UNDEFINED) { + /* Write out the position of the trx_id column */ + i = trx_id_col; + } else { + /* Write out the number of nullable fields */ + i = index->n_nullable; + } + + if (i < 128) { + *buf++ = (byte) i; + } else { + *buf++ = (byte) (0x80 | i >> 8); + *buf++ = (byte) i; + } + + ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2); + return((ulint) (buf - buf_start)); +} + +/************************************************************************** +Populate the dense page directory from the sparse directory. 
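Each dense directory entry is 2 bytes, written downwards from buf. Assuming the usual constants from page0zip.h (PAGE_ZIP_DIR_SLOT_MASK = 0x3fff, PAGE_ZIP_DIR_SLOT_OWNED = 0x4000, PAGE_ZIP_DIR_SLOT_DEL = 0x8000), three example entries:

	0x4123	record at offset 0x0123, in use, owns its directory group
	0x0150	record at offset 0x0150, in use, owned by a later record
	0x8188	record at offset 0x0188, deleted, on the free list

The low 14 bits always hold the record's byte offset on the uncompressed page.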
*/ +static +void +page_zip_dir_encode( +/*================*/ + const page_t* page, /* in: compact page */ + byte* buf, /* in: pointer to dense page directory[-1]; + out: dense directory on compressed page */ + const rec_t** recs) /* in: pointer to an array of 0, or NULL; + out: dense page directory sorted by ascending + address (and heap_no) */ +{ + const byte* rec; + ulint status; + ulint min_mark; + ulint heap_no; + ulint i; + ulint n_heap; + ulint offs; + + min_mark = 0; + + if (page_is_leaf(page)) { + status = REC_STATUS_ORDINARY; + } else { + status = REC_STATUS_NODE_PTR; + if (UNIV_UNLIKELY + (mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)) { + min_mark = REC_INFO_MIN_REC_FLAG; + } + } + + n_heap = page_dir_get_n_heap(page); + + /* Traverse the list of stored records in the collation order, + starting from the first user record. */ + + rec = page + PAGE_NEW_INFIMUM, TRUE; + + i = 0; + + for (;;) { + ulint info_bits; + offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) { + break; + } + rec = page + offs; + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR); + ut_a(offs >= PAGE_ZIP_START); +#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) +# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2" +#endif +#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1 +# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1" +#endif + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + offs |= PAGE_ZIP_DIR_SLOT_OWNED; + } + + info_bits = rec_get_info_bits(rec, TRUE); + if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) { + info_bits &= ~REC_INFO_DELETED_FLAG; + offs |= PAGE_ZIP_DIR_SLOT_DEL; + } + ut_a(info_bits == min_mark); + /* Only the smallest user record can have + REC_INFO_MIN_REC_FLAG set. */ + min_mark = 0; + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + ut_a(rec_get_status(rec) == status); + } + + offs = page_header_get_field(page, PAGE_FREE); + + /* Traverse the free list (of deleted records). */ + while (offs) { + ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK)); + rec = page + offs; + + heap_no = rec_get_heap_no_new(rec); + ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW); + ut_a(heap_no < n_heap); + + ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */ + ut_a(rec_get_status(rec) == status); + + mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs); + + if (UNIV_LIKELY_NULL(recs)) { + /* Ensure that each heap_no occurs at most once. */ + ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]); + /* exclude infimum and supremum */ + recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec; + } + + offs = rec_get_next_offs(rec, TRUE); + } + + /* Ensure that each heap no occurs at least once. */ + ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap); +} + +/************************************************************************** +Allocate memory for zlib. */ +static +void* +page_zip_malloc( +/*============*/ + void* opaque, + uInt items, + uInt size) +{ + return(mem_heap_alloc(opaque, items * size)); +} + +/************************************************************************** +Deallocate memory for zlib. 
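The function below is deliberately a no-op: page_zip_malloc() above draws from a mem_heap_t, so every allocation zlib makes is released in one step when the heap itself is freed. A minimal usage sketch (error handling elided; the deflateInit2() arguments mirror page_zip_compress() further down):

	z_stream	strm;
	mem_heap_t*	heap = mem_heap_create(512 << MAX_MEM_LEVEL);

	page_zip_set_alloc(&strm, heap);
	deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
		     UNIV_PAGE_SIZE_SHIFT, MAX_MEM_LEVEL,
		     Z_DEFAULT_STRATEGY);
	/* ... feed the page through deflate() ... */
	deflateEnd(&strm);
	mem_heap_free(heap);	/* frees all zlib state at once */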
*/ +static +void +page_zip_free( +/*==========*/ + void* opaque __attribute__((unused)), + void* address __attribute__((unused))) +{ +} + +/************************************************************************** +Configure the zlib allocator to use the given memory heap. */ +UNIV_INTERN +void +page_zip_set_alloc( +/*===============*/ + void* stream, /* in/out: zlib stream */ + mem_heap_t* heap) /* in: memory heap to use */ +{ + z_stream* strm = stream; + + strm->zalloc = page_zip_malloc; + strm->zfree = page_zip_free; + strm->opaque = heap; +} + +#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG +# define PAGE_ZIP_COMPRESS_DBG +#endif + +#ifdef PAGE_ZIP_COMPRESS_DBG +/* Set this variable in a debugger to enable +excessive logging in page_zip_compress(). */ +UNIV_INTERN ibool page_zip_compress_dbg; +/* Set this variable in a debugger to enable +binary logging of the data passed to deflate(). +When this variable is nonzero, it will act +as a log file name generator. */ +UNIV_INTERN unsigned page_zip_compress_log; + +/************************************************************************** +Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set. */ +static +ibool +page_zip_compress_deflate( +/*======================*/ + FILE* logfile,/* in: log file, or NULL */ + z_streamp strm, /* in/out: compressed stream for deflate() */ + int flush) /* in: deflate() flushing method */ +{ + int status; + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + ut_print_buf(stderr, strm->next_in, strm->avail_in); + } + if (UNIV_LIKELY_NULL(logfile)) { + fwrite(strm->next_in, 1, strm->avail_in, logfile); + } + status = deflate(strm, flush); + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, " -> %d\n", status); + } + return(status); +} + +/* Redefine deflate(). */ +# undef deflate +# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush) +# define FILE_LOGFILE FILE* logfile, +# define LOGFILE logfile, +#else /* PAGE_ZIP_COMPRESS_DBG */ +# define FILE_LOGFILE +# define LOGFILE +#endif /* PAGE_ZIP_COMPRESS_DBG */ + +/************************************************************************** +Compress the records of a node pointer page. */ +static +int +page_zip_compress_node_ptrs( +/*========================*/ + /* out: Z_OK, or a zlib error code */ + FILE_LOGFILE + z_stream* c_stream, /* in/out: compressed page stream */ + const rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + byte* storage, /* in: end of dense page directory */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + int err = Z_OK; + ulint* offsets = NULL; + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + /* Only leaf nodes may contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + ut_ad(!c_stream->avail_in); + + /* Compress the data bytes, except node_ptr. 
*/ + c_stream->next_in = (byte*) rec; + c_stream->avail_in = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + ut_ad(c_stream->avail_in); + + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + + ut_ad(!c_stream->avail_in); + + memcpy(storage - REC_NODE_PTR_SIZE + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, REC_NODE_PTR_SIZE); + c_stream->next_in += REC_NODE_PTR_SIZE; + } while (--n_dense); + + return(err); +} + +/************************************************************************** +Compress the records of a leaf node of a secondary index. */ +static +int +page_zip_compress_sec( +/*==================*/ + /* out: Z_OK, or a zlib error code */ + FILE_LOGFILE + z_stream* c_stream, /* in/out: compressed page stream */ + const rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense) /* in: size of recs[] */ +{ + int err = Z_OK; + + ut_ad(n_dense > 0); + + do { + const rec_t* rec = *recs++; + + /* Compress everything up to this record. */ + c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in; + + if (UNIV_LIKELY(c_stream->avail_in)) { + UNIV_MEM_ASSERT_RW(c_stream->next_in, + c_stream->avail_in); + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + break; + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + + c_stream->next_in = (byte*) rec; + } while (--n_dense); + + return(err); +} + +/************************************************************************** +Compress a record of a leaf node of a clustered index that contains +externally stored columns. */ +static +int +page_zip_compress_clust_ext( +/*========================*/ + /* out: Z_OK, or a zlib error code */ + FILE_LOGFILE + z_stream* c_stream, /* in/out: compressed page stream */ + const rec_t* rec, /* in: record */ + const ulint* offsets, /* in: rec_get_offsets(rec) */ + ulint trx_id_col, /* in: position of DB_TRX_ID */ + byte* deleted, /* in: dense directory entry pointing + to the head of the free list */ + byte* storage, /* in: end of dense page directory */ + byte** externs, /* in/out: pointer to the next + available BLOB pointer */ + ulint* n_blobs) /* in/out: number of + externally stored columns */ +{ + int err; + ulint i; + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, i)); + /* Store trx_id and roll_ptr + in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Compress any preceding bytes.
*/ + c_stream->avail_in + = src - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + i++; + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + c_stream->avail_in = src + - c_stream->next_in; + if (UNIV_LIKELY(c_stream->avail_in)) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + /* Reserve space for the data at + the end of the space reserved for + the compressed data and the page + modification log. */ + + if (UNIV_UNLIKELY + (c_stream->avail_out + <= BTR_EXTERN_FIELD_REF_SIZE)) { + /* out of space */ + return(Z_BUF_ERROR); + } + + ut_ad(*externs == c_stream->next_out + + c_stream->avail_out + + 1/* end of modif. log */); + + c_stream->next_in + += BTR_EXTERN_FIELD_REF_SIZE; + + /* Skip deleted records. */ + if (UNIV_LIKELY_NULL + (page_zip_dir_find_low( + storage, deleted, + page_offset(rec)))) { + continue; + } + + (*n_blobs)++; + c_stream->avail_out + -= BTR_EXTERN_FIELD_REF_SIZE; + *externs -= BTR_EXTERN_FIELD_REF_SIZE; + + /* Copy the BLOB pointer */ + memcpy(*externs, c_stream->next_in + - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + + return(Z_OK); +} + +/************************************************************************** +Compress the records of a leaf node of a clustered index. */ +static +int +page_zip_compress_clust( +/*====================*/ + /* out: Z_OK, or a zlib error code */ + FILE_LOGFILE + z_stream* c_stream, /* in/out: compressed page stream */ + const rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + ulint* n_blobs, /* in: 0; out: number of + externally stored columns */ + ulint trx_id_col, /* index of the trx_id column */ + byte* deleted, /* in: dense directory entry pointing + to the head of the free list */ + byte* storage, /* in: end of dense page directory */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + int err = Z_OK; + ulint* offsets = NULL; + /* BTR_EXTERN_FIELD_REF storage */ + byte* externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ut_ad(*n_blobs == 0); + + do { + const rec_t* rec = *recs++; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + ut_ad(rec_offs_n_fields(offsets) + == dict_index_get_n_fields(index)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress the extra bytes. */ + c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES + - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES); + + /* Compress the data bytes. 
*/ + + c_stream->next_in = (byte*) rec; + + /* Check if there are any externally stored columns. + For each externally stored column, store the + BTR_EXTERN_FIELD_REF separately. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + ut_ad(dict_index_is_clust(index)); + + err = page_zip_compress_clust_ext( + LOGFILE + c_stream, rec, offsets, trx_id_col, + deleted, storage, &externs, n_blobs); + + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } else { + ulint len; + const byte* src; + + /* Store trx_id and roll_ptr in uncompressed form. */ + src = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + /* Compress any preceding bytes. */ + c_stream->avail_in = src - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + return(err); + } + } + + ut_ad(!c_stream->avail_in); + ut_ad(c_stream->next_in == src); + + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (rec_get_heap_no_new(rec) - 1), + c_stream->next_in, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + c_stream->next_in + += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Skip also roll_ptr */ + ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets)); + } + + /* Compress the last bytes of the record. */ + c_stream->avail_in = rec + rec_offs_data_size(offsets) + - c_stream->next_in; + + if (c_stream->avail_in) { + err = deflate(c_stream, Z_NO_FLUSH); + if (UNIV_UNLIKELY(err != Z_OK)) { + + goto func_exit; + } + } + ut_ad(!c_stream->avail_in); + } while (--n_dense); + +func_exit: + return(err); +} + +/************************************************************************** +Compress a page. */ +UNIV_INTERN +ibool +page_zip_compress( +/*==============*/ + /* out: TRUE on success, FALSE on failure; + page_zip will be left intact on failure. */ + page_zip_des_t* page_zip,/* in: size; out: data, n_blobs, + m_start, m_end, m_nonempty */ + const page_t* page, /* in: uncompressed page */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + z_stream c_stream; + int err; + ulint n_fields;/* number of index fields needed */ + byte* fields; /* index field information */ + byte* buf; /* compressed payload of the page */ + byte* buf_end;/* end of buf */ + ulint n_dense; + ulint slot_size;/* amount of uncompressed bytes per record */ + const rec_t** recs; /* dense page directory, sorted by address */ + mem_heap_t* heap; + ulint trx_id_col; + ulint* offsets = NULL; + ulint n_blobs = 0; + byte* storage;/* storage of uncompressed columns */ + ullint usec = ut_time_us(NULL); +#ifdef PAGE_ZIP_COMPRESS_DBG + FILE* logfile = NULL; +#endif + + ut_a(page_is_comp(page)); + ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + /* Check the data that will be omitted. 
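The assertions that follow check that the page carries the canonical infimum and supremum byte patterns (the infimum_extra, infimum_data and supremum_extra_data arrays near the top of this file); that is what allows the compressed image to omit both records. The decompression side can restore them along these lines (a sketch, not the verbatim decompressor):

	memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
	       infimum_extra, sizeof infimum_extra);
	memcpy(page + PAGE_NEW_INFIMUM,
	       infimum_data, sizeof infimum_data);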
*/ + ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra)); + ut_a(!memcmp(page + PAGE_NEW_INFIMUM, + infimum_data, sizeof infimum_data)); + ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] + /* info_bits == 0, n_owned <= max */ + <= PAGE_DIR_SLOT_MAX_N_OWNED); + ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data)); + + if (UNIV_UNLIKELY(!page_get_n_recs(page))) { + ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE) + == PAGE_NEW_SUPREMUM); + } + + if (page_is_leaf(page)) { + n_fields = dict_index_get_n_fields(index); + } else { + n_fields = dict_index_get_n_unique_in_tree(index); + } + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; +#ifdef PAGE_ZIP_COMPRESS_DBG + if (UNIV_UNLIKELY(page_zip_compress_dbg)) { + fprintf(stderr, "compress %p %p %lu %lu %lu\n", + (void*) page_zip, (void*) page, + page_is_leaf(page), + n_fields, n_dense); + } + if (UNIV_UNLIKELY(page_zip_compress_log)) { + /* Create a log file for every compression attempt. */ + char logfilename[9]; + ut_snprintf(logfilename, sizeof logfilename, + "%08x", page_zip_compress_log++); + logfile = fopen(logfilename, "wb"); + + if (logfile) { + /* Write the uncompressed page to the log. */ + fwrite(page, 1, UNIV_PAGE_SIZE, logfile); + /* Record the compressed size as zero. + This will be overwritten at successful exit. */ + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + putc(0, logfile); + } + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + page_zip_stat[page_zip->ssize - 1].compressed++; + + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + + goto err_exit; + } + + heap = mem_heap_create(page_zip_get_size(page_zip) + + n_fields * (2 + sizeof *offsets) + + n_dense * ((sizeof *recs) + - PAGE_ZIP_DIR_SLOT_SIZE) + + UNIV_PAGE_SIZE * 4 + + (512 << MAX_MEM_LEVEL)); + + recs = mem_heap_zalloc(heap, n_dense * sizeof *recs); + + fields = mem_heap_alloc(heap, (n_fields + 1) * 2); + + buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA); + buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA; + + /* Compress the data payload. */ + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT, + MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + + c_stream.next_out = buf; + /* Subtract the space reserved for uncompressed data. 
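Worked through for an 8192-byte compressed page, assuming PAGE_DATA = 94: buf holds 8192 - 94 = 8098 bytes; one byte is reserved below for the end marker of the modification log, leaving avail_out = 8097; a clustered leaf page with n_dense = 50 records then reserves a further 50 * (2 + 6 + 7) = 750 bytes for the dense directory and the uncompressed DB_TRX_ID and DB_ROLL_PTR columns, so zlib may emit at most 7347 bytes of compressed data.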
*/ + /* Page header and the end marker of the modification log */ + c_stream.avail_out = buf_end - buf - 1; + /* Dense page directory and uncompressed columns, if any */ + if (page_is_leaf(page)) { + if (dict_index_is_clust(index)) { + trx_id_col = dict_index_get_sys_col_pos( + index, DATA_TRX_ID); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + } else { + /* Signal the absence of trx_id + in page_zip_fields_encode() */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + trx_id_col = 0; + slot_size = PAGE_ZIP_DIR_SLOT_SIZE; + } + } else { + slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE; + trx_id_col = ULINT_UNDEFINED; + } + + if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size + + 6/* sizeof(zlib header and footer) */)) { + goto zlib_error; + } + + c_stream.avail_out -= n_dense * slot_size; + c_stream.avail_in = page_zip_fields_encode(n_fields, index, + trx_id_col, fields); + c_stream.next_in = fields; + if (UNIV_LIKELY(!trx_id_col)) { + trx_id_col = ULINT_UNDEFINED; + } + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FULL_FLUSH); + if (err != Z_OK) { + goto zlib_error; + } + + ut_ad(!c_stream.avail_in); + + page_zip_dir_encode(page, buf_end, recs); + + c_stream.next_in = (byte*) page + PAGE_ZIP_START; + + storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + /* Compress the records in heap_no order. */ + if (UNIV_UNLIKELY(!n_dense)) { + } else if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + err = page_zip_compress_node_ptrs(LOGFILE + &c_stream, recs, n_dense, + index, storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + err = page_zip_compress_sec(LOGFILE + &c_stream, recs, n_dense); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } else { + /* This is a leaf page in a clustered index. */ + err = page_zip_compress_clust(LOGFILE + &c_stream, recs, n_dense, + index, &n_blobs, trx_id_col, + buf_end - PAGE_ZIP_DIR_SLOT_SIZE + * page_get_n_recs(page), + storage, heap); + if (UNIV_UNLIKELY(err != Z_OK)) { + goto zlib_error; + } + } + + /* Finish the compression. */ + ut_ad(!c_stream.avail_in); + /* Compress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list, + or the data of the last record from page_zip_compress_sec(). */ + c_stream.avail_in + = page_header_get_field(page, PAGE_HEAP_TOP) + - (c_stream.next_in - page); + ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR); + + UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in); + err = deflate(&c_stream, Z_FINISH); + + if (UNIV_UNLIKELY(err != Z_STREAM_END)) { +zlib_error: + deflateEnd(&c_stream); + mem_heap_free(heap); +err_exit: +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + page_zip_stat[page_zip->ssize - 1].compressed_usec + += ut_time_us(NULL) - usec; + return(FALSE); + } + + err = deflateEnd(&c_stream); + ut_a(err == Z_OK); + + ut_ad(buf + c_stream.total_out == c_stream.next_out); + ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out); + + /* Valgrind believes that zlib does not initialize some bits + in the last 7 or 8 bytes of the stream. Make Valgrind happy. 
*/ + UNIV_MEM_VALID(buf, c_stream.total_out); + + /* Zero out the area reserved for the modification log. + Space for the end marker of the modification log is not + included in avail_out. */ + memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */); + +#ifdef UNIV_DEBUG + page_zip->m_start = +#endif /* UNIV_DEBUG */ + page_zip->m_end = PAGE_DATA + c_stream.total_out; + page_zip->m_nonempty = FALSE; + page_zip->n_blobs = n_blobs; + /* Copy those header fields that will not be written + in buf_flush_init_for_writing() */ + memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV); + memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2); + memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA); + /* Copy the rest of the compressed page */ + memcpy(page_zip->data + PAGE_DATA, buf, + page_zip_get_size(page_zip) - PAGE_DATA); + mem_heap_free(heap); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (mtr) { + page_zip_compress_write_log(page_zip, page, index, mtr); + } + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + +#ifdef PAGE_ZIP_COMPRESS_DBG + if (logfile) { + /* Record the compressed size of the block. */ + byte sz[4]; + mach_write_to_4(sz, c_stream.total_out); + fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET); + fwrite(sz, 1, sizeof sz, logfile); + fclose(logfile); + } +#endif /* PAGE_ZIP_COMPRESS_DBG */ + { + page_zip_stat_t* zip_stat + = &page_zip_stat[page_zip->ssize - 1]; + zip_stat->compressed_ok++; + zip_stat->compressed_usec += ut_time_us(NULL) - usec; + } + + return(TRUE); +} + +/************************************************************************** +Compare two page directory entries. */ +UNIV_INLINE +ibool +page_zip_dir_cmp( +/*=============*/ + /* out: positive if rec1 > rec2 */ + const rec_t* rec1, /* in: rec1 */ + const rec_t* rec2) /* in: rec2 */ +{ + return(rec1 > rec2); +} + +/************************************************************************** +Sort the dense page directory by address (heap_no). */ +static +void +page_zip_dir_sort( +/*==============*/ + rec_t** arr, /* in/out: dense page directory */ + rec_t** aux_arr,/* in/out: work area */ + ulint low, /* in: lower bound of the sorting area, inclusive */ + ulint high) /* in: upper bound of the sorting area, exclusive */ +{ + UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high, + page_zip_dir_cmp); +} + +/************************************************************************** +Deallocate the index information initialized by page_zip_fields_decode(). */ +static +void +page_zip_fields_free( +/*=================*/ + dict_index_t* index) /* in: dummy index to be freed */ +{ + if (index) { + dict_table_t* table = index->table; + mem_heap_free(index->heap); + mutex_free(&(table->autoinc_mutex)); + mem_heap_free(table->heap); + } +} + +/************************************************************************** +Read the index information for the compressed page. 
*/ +static +dict_index_t* +page_zip_fields_decode( +/*===================*/ + /* out,own: dummy index describing the page, + or NULL on error */ + const byte* buf, /* in: index information */ + const byte* end, /* in: end of buf */ + ulint* trx_id_col)/* in: NULL for non-leaf pages; + for leaf pages, pointer to where to store + the position of the trx_id column */ +{ + const byte* b; + ulint n; + ulint i; + ulint val; + dict_table_t* table; + dict_index_t* index; + + /* Determine the number of fields. */ + for (b = buf, n = 0; b < end; n++) { + if (*b++ & 0x80) { + b++; /* skip the second byte */ + } + } + + n--; /* n_nullable or trx_id */ + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) { + + page_zip_fail(("page_zip_fields_decode: n = %lu\n", + (ulong) n)); + return(NULL); + } + + if (UNIV_UNLIKELY(b > end)) { + + page_zip_fail(("page_zip_fields_decode: %p > %p\n", + (const void*) b, (const void*) end)); + return(NULL); + } + + table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, + DICT_TF_COMPACT); + index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", + DICT_HDR_SPACE, 0, n); + index->table = table; + index->n_uniq = n; + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + + /* Initialize the fields. */ + for (b = buf, i = 0; i < n; i++) { + ulint mtype; + ulint len; + + val = *b++; + + if (UNIV_UNLIKELY(val & 0x80)) { + /* fixed length > 62 bytes */ + val = (val & 0x7f) << 8 | *b++; + len = val >> 1; + mtype = DATA_FIXBINARY; + } else if (UNIV_UNLIKELY(val >= 126)) { + /* variable length with max > 255 bytes */ + len = 0x7fff; + mtype = DATA_BINARY; + } else if (val <= 1) { + /* variable length with max <= 255 bytes */ + len = 0; + mtype = DATA_BINARY; + } else { + /* fixed length < 62 bytes */ + len = val >> 1; + mtype = DATA_FIXBINARY; + } + + dict_mem_table_add_col(table, NULL, NULL, mtype, + val & 1 ? DATA_NOT_NULL : 0, len); + dict_index_add_col(index, table, + dict_table_get_nth_col(table, i), 0); + } + + val = *b++; + if (UNIV_UNLIKELY(val & 0x80)) { + val = (val & 0x7f) << 8 | *b++; + } + + /* Decode the position of the trx_id column. */ + if (trx_id_col) { + if (!val) { + val = ULINT_UNDEFINED; + } else if (UNIV_UNLIKELY(val >= n)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->type = DICT_CLUSTERED; + } + + *trx_id_col = val; + } else { + /* Decode the number of nullable fields. */ + if (UNIV_UNLIKELY(index->n_nullable > val)) { + page_zip_fields_free(index); + index = NULL; + } else { + index->n_nullable = val; + } + } + + ut_ad(b == end); + + return(index); +} + +/************************************************************************** +Populate the sparse page directory from the dense directory. 
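The field-descriptor loop in page_zip_fields_decode() above inverts a compact one- or two-byte-per-column encoding: bit 7 selects the two-byte form, bit 0 carries the NOT NULL flag, and the remaining bits hold either a fixed length or a variable-length class. A standalone sketch of just that decoder, with the same branch structure and a made-up sample buffer:

```c
/* Standalone sketch: decode one field descriptor as produced by
   page_zip_fields_encode().  Returns the number of bytes consumed. */
#include <stdio.h>

struct field_desc {
	unsigned	fixed_len;	/* 0 = variable-length */
	unsigned	max_255;	/* variable: max length <= 255? */
	unsigned	not_null;	/* NOT NULL flag */
};

static int
field_decode(const unsigned char* b, struct field_desc* f)
{
	unsigned	val = *b;
	int		n = 1;

	if (val & 0x80) {		/* fixed length > 62 bytes */
		val = (val & 0x7f) << 8 | b[1];
		n = 2;
		f->fixed_len = val >> 1;
		f->max_255 = 0;
	} else if (val >= 126) {	/* variable, max > 255 bytes */
		f->fixed_len = 0;
		f->max_255 = 0;
	} else if (val <= 1) {		/* variable, max <= 255 bytes */
		f->fixed_len = 0;
		f->max_255 = 1;
	} else {			/* fixed length < 62 bytes */
		f->fixed_len = val >> 1;
		f->max_255 = 0;
	}

	f->not_null = val & 1;
	return n;
}

int main(void)
{
	/* fixed 4-byte NOT NULL (4 << 1 | 1), then a 100-byte fixed
	   NOT NULL field encoded in the two-byte 0x80 form */
	const unsigned char	buf[] = { 0x09, 0x80, 0xc9 };
	struct field_desc	f;
	int			used = field_decode(buf, &f);

	printf("used=%d fixed_len=%u max_255=%u not_null=%u\n",
	       used, f.fixed_len, f.max_255, f.not_null);
	used = field_decode(buf + 1, &f);
	printf("used=%d fixed_len=%u max_255=%u not_null=%u\n",
	       used, f.fixed_len, f.max_255, f.not_null);
	return 0;
}
```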
*/ +static +ibool +page_zip_dir_decode( +/*================*/ + /* out: TRUE on success, + FALSE on failure */ + const page_zip_des_t* page_zip,/* in: dense page directory on + compressed page */ + page_t* page, /* in: compact page with valid header; + out: trailer and sparse page directory + filled in */ + rec_t** recs, /* out: dense page directory sorted by + ascending address (and heap_no) */ + rec_t** recs_aux,/* in/out: scratch area */ + ulint n_dense)/* in: number of user records, and + size of recs[] and recs_aux[] */ +{ + ulint i; + ulint n_recs; + byte* slot; + + n_recs = page_get_n_recs(page); + + if (UNIV_UNLIKELY(n_recs > n_dense)) { + page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n", + (ulong) n_recs, (ulong) n_dense)); + return(FALSE); + } + + /* Traverse the list of stored records in the sorting order, + starting from the first user record. */ + + slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE); + UNIV_PREFETCH_RW(slot); + + /* Zero out the page trailer. */ + memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR); + + mach_write_to_2(slot, PAGE_NEW_INFIMUM); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + + /* Initialize the sparse directory and copy the dense directory. */ + for (i = 0; i < n_recs; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (offs & PAGE_ZIP_DIR_SLOT_OWNED) { + mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK); + slot -= PAGE_DIR_SLOT_SIZE; + UNIV_PREFETCH_RW(slot); + } + + if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK) + < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n", + (unsigned) i, (unsigned) n_recs, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK); + } + + mach_write_to_2(slot, PAGE_NEW_SUPREMUM); + { + const page_dir_slot_t* last_slot = page_dir_get_nth_slot( + page, page_dir_get_n_slots(page) - 1); + + if (UNIV_UNLIKELY(slot != last_slot)) { + page_zip_fail(("page_zip_dir_decode 3: %p != %p\n", + (const void*) slot, + (const void*) last_slot)); + return(FALSE); + } + } + + /* Copy the rest of the dense directory. */ + for (; i < n_dense; i++) { + ulint offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n", + (unsigned) i, (unsigned) n_dense, + (ulong) offs)); + return(FALSE); + } + + recs[i] = page + offs; + } + + if (UNIV_LIKELY(n_dense > 1)) { + page_zip_dir_sort(recs, recs_aux, 0, n_dense); + } + return(TRUE); +} + +/************************************************************************** +Initialize the REC_N_NEW_EXTRA_BYTES of each record. 
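In page_zip_dir_decode() above, each dense directory entry is two bytes: a 14-bit page offset plus "owned" and "deleted" flag bits. A minimal sketch of taking one apart; the flag values match the InnoDB plugin definitions, but treat the snippet as illustrative:

```c
/* Standalone sketch: pick apart one 2-byte dense directory entry
   (owned = 0x4000, deleted = 0x8000, offset mask = 0x3fff). */
#include <stdio.h>

#define PAGE_ZIP_DIR_SLOT_MASK	0x3fffU
#define PAGE_ZIP_DIR_SLOT_OWNED	0x4000U
#define PAGE_ZIP_DIR_SLOT_DEL	0x8000U

int main(void)
{
	/* an owned, non-deleted record at page offset 0x0123 */
	unsigned	slot = PAGE_ZIP_DIR_SLOT_OWNED | 0x0123;

	printf("offset=0x%04x owned=%d deleted=%d\n",
	       slot & PAGE_ZIP_DIR_SLOT_MASK,
	       !!(slot & PAGE_ZIP_DIR_SLOT_OWNED),
	       !!(slot & PAGE_ZIP_DIR_SLOT_DEL));
	return 0;
}
```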
*/ +static +ibool +page_zip_set_extra_bytes( +/*=====================*/ + /* out: TRUE on success, + FALSE on failure */ + const page_zip_des_t* page_zip,/* in: compressed page */ + page_t* page, /* in/out: uncompressed page */ + ulint info_bits)/* in: REC_INFO_MIN_REC_FLAG or 0 */ +{ + ulint n; + ulint i; + ulint n_owned = 1; + ulint offs; + rec_t* rec; + + n = page_get_n_recs(page); + rec = page + PAGE_NEW_INFIMUM; + + for (i = 0; i < n; i++) { + offs = page_zip_dir_get(page_zip, i); + + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) { + info_bits |= REC_INFO_DELETED_FLAG; + } + if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) { + info_bits |= n_owned; + n_owned = 1; + } else { + n_owned++; + } + offs &= PAGE_ZIP_DIR_SLOT_MASK; + if (UNIV_UNLIKELY(offs < PAGE_ZIP_START + + REC_N_NEW_EXTRA_BYTES)) { + page_zip_fail(("page_zip_set_extra_bytes 1:" + " %u %u %lx\n", + (unsigned) i, (unsigned) n, + (ulong) offs)); + return(FALSE); + } + + rec_set_next_offs_new(rec, offs); + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits; + info_bits = 0; + } + + /* Set the next pointer of the last user record. */ + rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM); + + /* Set n_owned of the supremum record. */ + page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned; + + /* The dense directory excludes the infimum and supremum records. */ + n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW; + + if (i >= n) { + if (UNIV_LIKELY(i == n)) { + return(TRUE); + } + + page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n", + (unsigned) i, (unsigned) n)); + return(FALSE); + } + + offs = page_zip_dir_get(page_zip, i); + + /* Set the extra bytes of deleted records on the free list. */ + for (;;) { + if (UNIV_UNLIKELY(!offs) + || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) { + + page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n", + (ulong) offs)); + return(FALSE); + } + + rec = page + offs; + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + if (++i == n) { + break; + } + + offs = page_zip_dir_get(page_zip, i); + rec_set_next_offs_new(rec, offs); + } + + /* Terminate the free list. */ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + rec_set_next_offs_new(rec, 0); + + return(TRUE); +} + +/************************************************************************** +Apply the modification log to a record containing externally stored +columns. Do not copy the fields that are stored separately. */ +static +const byte* +page_zip_apply_log_ext( +/*===================*/ + /* out: pointer to modification log, + or NULL on failure */ + rec_t* rec, /* in/out: record */ + const ulint* offsets, /* in: rec_get_offsets(rec) */ + ulint trx_id_col, /* in: position of DB_TRX_ID */ + const byte* data, /* in: modification log */ + const byte* end) /* in: end of modification log */ +{ + ulint i; + ulint len; + byte* next_out = rec; + + /* Check if there are any externally stored columns. + For each externally stored column, skip the + BTR_EXTERN_FIELD_REF.
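In page_zip_set_extra_bytes() above, the byte written at rec[-REC_N_NEW_EXTRA_BYTES] packs n_owned into the low nibble and the info bits into the high nibble. A toy sketch of composing and inspecting that byte, with the flag values of the compact record format:

```c
/* Standalone sketch of the info_bits/n_owned byte stored just before
   a compact-format record (deleted = 0x20, min_rec = 0x10). */
#include <stdio.h>

#define REC_INFO_MIN_REC_FLAG	0x10U
#define REC_INFO_DELETED_FLAG	0x20U

int main(void)
{
	unsigned	n_owned = 4;		/* 1..15, low nibble */
	unsigned	info_bits = REC_INFO_DELETED_FLAG;
	unsigned char	extra = (unsigned char) (info_bits | n_owned);

	printf("extra byte = 0x%02x (deleted=%d, min_rec=%d, n_owned=%u)\n",
	       extra,
	       !!(extra & REC_INFO_DELETED_FLAG),
	       !!(extra & REC_INFO_MIN_REC_FLAG),
	       extra & 0x0fU);
	return 0;
}
```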
*/ + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, + i, &len); + if (UNIV_UNLIKELY(dst - next_out >= end - data) + || UNIV_UNLIKELY + (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) + || rec_offs_nth_extern(offsets, i)) { + page_zip_fail(("page_zip_apply_log_ext:" + " trx_id len %lu," + " %p - %p >= %p - %p\n", + (ulong) len, + (const void*) dst, + (const void*) next_out, + (const void*) end, + (const void*) data)); + return(NULL); + } + + memcpy(next_out, data, dst - next_out); + data += dst - next_out; + next_out = dst + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + + len += dst - next_out + - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext: " + "ext %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(next_out, data, len); + data += len; + next_out += len + + BTR_EXTERN_FIELD_REF_SIZE; + } + } + + /* Copy the last bytes of the record. */ + len = rec_get_end(rec, offsets) - next_out; + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log_ext: " + "last %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(next_out, data, len); + data += len; + + return(data); +} + +/************************************************************************** +Apply the modification log to an uncompressed page. +Do not copy the fields that are stored separately. */ +static +const byte* +page_zip_apply_log( +/*===============*/ + /* out: pointer to end of modification log, + or NULL on failure */ + const byte* data, /* in: modification log */ + ulint size, /* in: maximum length of the log, in bytes */ + rec_t** recs, /* in: dense page directory, + sorted by address (indexed by + heap_no - PAGE_HEAP_NO_USER_LOW) */ + ulint n_dense,/* in: size of recs[] */ + ulint trx_id_col,/* in: column number of trx_id in the index, + or ULINT_UNDEFINED if none */ + ulint heap_status, + /* in: heap_no and status bits for + the next record to uncompress */ + dict_index_t* index, /* in: index of the page */ + ulint* offsets)/* in/out: work area for + rec_get_offsets_reverse() */ +{ + const byte* const end = data + size; + + for (;;) { + ulint val; + rec_t* rec; + ulint len; + ulint hs; + + val = *data++; + if (UNIV_UNLIKELY(!val)) { + return(data - 1); + } + if (val & 0x80) { + val = (val & 0x7f) << 8 | *data++; + if (UNIV_UNLIKELY(!val)) { + page_zip_fail(("page_zip_apply_log:" + " invalid val %x%x\n", + data[-2], data[-1])); + return(NULL); + } + } + if (UNIV_UNLIKELY(data >= end)) { + page_zip_fail(("page_zip_apply_log: %p >= %p\n", + (const void*) data, + (const void*) end)); + return(NULL); + } + if (UNIV_UNLIKELY((val >> 1) > n_dense)) { + page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n", + (ulong) val, (ulong) n_dense)); + return(NULL); + } + + /* Determine the heap number and status bits of the record. */ + rec = recs[(val >> 1) - 1]; + + hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT; + hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1); + + /* This may either be an old record that is being + overwritten (updated in place, or allocated from + the free list), or a new record, with the next + available_heap_no. 
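Each modification-log entry in page_zip_apply_log() starts with a one- or two-byte header: bit 0 flags a cleared (deleted) record, the remaining bits carry heap_no - 1, and a 0x00 byte terminates the log. A standalone sketch of just this header decoding; mod_log_entry() is a hypothetical helper, not an InnoDB function:

```c
/* Standalone sketch: decode the 1- or 2-byte header of one
   modification log entry, as in the loop above. */
#include <stdio.h>

static const unsigned char*
mod_log_entry(const unsigned char* data, unsigned* heap_idx, unsigned* del)
{
	unsigned	val = *data++;

	if (!val) {
		return NULL;		/* end of modification log */
	}
	if (val & 0x80) {		/* two-byte form, heap_no - 1 >= 64 */
		val = (val & 0x7f) << 8 | *data++;
	}
	*heap_idx = val >> 1;		/* this is heap_no - 1 */
	*del = val & 1;
	return data;
}

int main(void)
{
	/* entry identifying heap_no 4 (stored as heap_no - 1 = 3),
	   update in place, followed by the end marker */
	const unsigned char	log[] = { 3 << 1, 0x00 };
	unsigned		idx, del;
	const unsigned char*	p = mod_log_entry(log, &idx, &del);

	if (p) {
		printf("recs[%u], delete=%u, %td header byte(s)\n",
		       idx - 1, del, p - log);
	}
	return 0;
}
```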
*/ + if (UNIV_UNLIKELY(hs > heap_status)) { + page_zip_fail(("page_zip_apply_log: %lu > %lu\n", + (ulong) hs, (ulong) heap_status)); + return(NULL); + } else if (hs == heap_status) { + /* A new record was allocated from the heap. */ + if (UNIV_UNLIKELY(val & 1)) { + /* Only existing records may be cleared. */ + page_zip_fail(("page_zip_apply_log:" + " attempting to create" + " deleted rec %lu\n", + (ulong) hs)); + return(NULL); + } + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + mach_write_to_2(rec - REC_NEW_HEAP_NO, hs); + + if (val & 1) { + /* Clear the data bytes of the record. */ + mem_heap_t* heap = NULL; + ulint* offs; + offs = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + memset(rec, 0, rec_offs_data_size(offs)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + continue; + } + +#if REC_STATUS_NODE_PTR != TRUE +# error "REC_STATUS_NODE_PTR != TRUE" +#endif + rec_get_offsets_reverse(data, index, + hs & REC_STATUS_NODE_PTR, + offsets); + rec_offs_make_valid(rec, index, offsets); + + /* Copy the extra bytes (backwards). */ + { + byte* start = rec_get_start(rec, offsets); + byte* b = rec - REC_N_NEW_EXTRA_BYTES; + while (b != start) { + *--b = *data++; + } + } + + /* Copy the data bytes. */ + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + /* Non-leaf nodes should not contain any + externally stored columns. */ + if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + page_zip_fail(("page_zip_apply_log: " + "%lu&REC_STATUS_NODE_PTR\n", + (ulong) hs)); + return(NULL); + } + + data = page_zip_apply_log_ext( + rec, offsets, trx_id_col, data, end); + + if (UNIV_UNLIKELY(!data)) { + return(NULL); + } + } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) { + len = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + /* Copy the data bytes, except node_ptr. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "node_ptr %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(rec, data, len); + data += len; + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + len = rec_offs_data_size(offsets); + + /* Copy all data bytes of + a record in a secondary index. */ + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "sec %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + + memcpy(rec, data, len); + data += len; + } else { + /* Skip DB_TRX_ID and DB_ROLL_PTR. */ + ulint l = rec_get_nth_field_offs(offsets, + trx_id_col, &len); + byte* b; + + if (UNIV_UNLIKELY(data + l >= end) + || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN))) { + page_zip_fail(("page_zip_apply_log: " + "trx_id %p+%lu >= %p\n", + (const void*) data, + (ulong) l, + (const void*) end)); + return(NULL); + } + + /* Copy any preceding data bytes. */ + memcpy(rec, data, l); + data += l; + + /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */ + b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + len = rec_get_end(rec, offsets) - b; + if (UNIV_UNLIKELY(data + len >= end)) { + page_zip_fail(("page_zip_apply_log: " + "clust %p+%lu >= %p\n", + (const void*) data, + (ulong) len, + (const void*) end)); + return(NULL); + } + memcpy(b, data, len); + data += len; + } + } +} + +/************************************************************************** +Decompress the records of a node pointer page. 
*/ +static +ibool +page_zip_decompress_node_ptrs( +/*==========================*/ + /* out: TRUE on success, + FALSE on failure */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + ulint* offsets, /* in/out: temporary offsets */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + ulint heap_status = REC_STATUS_NODE_PTR + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + const byte* storage; + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= n_dense + * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES + - d_stream->next_out; + + ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR); + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + /* Apparently, n_dense has grown + since the time the page was last compressed. */ + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); + /* Prepare to decompress the data bytes. */ + d_stream->next_out = rec; + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Decompress the data bytes, except node_ptr. */ + d_stream->avail_out = rec_offs_data_size(offsets) + - REC_NODE_PTR_SIZE; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_node_ptrs:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + /* Clear the node pointer in case the record + will be deleted and the space will be reallocated + to a smaller record. */ + memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE); + d_stream->next_out += REC_NODE_PTR_SIZE; + + ut_ad(d_stream->next_out == rec_get_end(rec, offsets)); + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = page_header_get_field(page_zip->data, + PAGE_HEAP_TOP) + - page_offset(d_stream->next_out); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_node_ptrs:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. 
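The per-record inflate calls above rely on a bounded-window pattern: avail_out is set to exactly the bytes wanted next, and Z_OK or Z_BUF_ERROR with avail_out == 0 counts as success for that window. A self-contained zlib round trip demonstrating the pattern with made-up sizes:

```c
/* Standalone sketch: decompress a stream in fixed-size output windows
   with inflate(Z_SYNC_FLUSH), the way the record loops above do. */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	unsigned char	plain[100], comp[200], out[100];
	uLongf		comp_len = sizeof comp;
	z_stream	d;
	int		i, err;

	memset(plain, 'a', sizeof plain);
	if (compress(comp, &comp_len, plain, sizeof plain) != Z_OK) {
		return 1;
	}

	memset(&d, 0, sizeof d);
	if (inflateInit(&d) != Z_OK) {
		return 1;
	}
	d.next_in = comp;
	d.avail_in = (uInt) comp_len;
	d.next_out = out;

	for (i = 0; i < 2; i++) {	/* two 50-byte "records" */
		d.avail_out = 50;
		err = inflate(&d, Z_SYNC_FLUSH);
		if ((err != Z_OK && err != Z_BUF_ERROR && err != Z_STREAM_END)
		    || d.avail_out) {
			inflateEnd(&d);
			return 1;	/* window not filled: corrupt */
		}
		printf("window %d ends at byte %lu\n",
		       i, (unsigned long) d.total_out);
	}

	inflateEnd(&d);
	return 0;
}
```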
*/ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY + (page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index), NULL) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress_node_ptrs:" + " %lu + %lu >= %lu, %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, dict_index_is_clust(index), + NULL), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip), + (ulong) dict_index_is_clust(index))); + return(FALSE); + } + + /* Restore the uncompressed columns in heap_no order. */ + storage = page_zip->data + page_zip_get_size(page_zip) + - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + storage -= REC_NODE_PTR_SIZE; + + memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE, + storage, REC_NODE_PTR_SIZE); + } + + return(TRUE); +} + +/************************************************************************** +Decompress the records of a leaf node of a secondary index. */ +static +ibool +page_zip_decompress_sec( +/*====================*/ + /* out: TRUE on success, + FALSE on failure */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + ulint* offsets) /* in/out: temporary offsets */ +{ + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + ulint slot; + + ut_a(!dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + /* Decompress everything up to this record. */ + d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES + - d_stream->next_out; + + if (UNIV_LIKELY(d_stream->avail_out)) { + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + /* Apparently, n_dense has grown + since the time the page was last compressed. */ + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); + + /* Skip the REC_N_NEW_EXTRA_BYTES. */ + + d_stream->next_out = rec; + + /* Set heap_no and the status bits. 
*/ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + } + + /* Decompress the data of the last record and any trailing garbage, + in case the last record was allocated from an originally longer space + on the free list. */ + d_stream->avail_out = page_header_get_field(page_zip->data, + PAGE_HEAP_TOP) + - page_offset(d_stream->next_out); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_sec:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_sec:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + ULINT_UNDEFINED, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE, NULL) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, FALSE, NULL), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + /* There are no uncompressed columns on leaf pages of + secondary indexes. */ + + return(TRUE); +} + +/************************************************************************** +Decompress a record of a leaf node of a clustered index that contains +externally stored columns. 
*/ +static +ibool +page_zip_decompress_clust_ext( +/*==========================*/ + /* out: TRUE on success */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t* rec, /* in/out: record */ + const ulint* offsets, /* in: rec_get_offsets(rec) */ + ulint trx_id_col) /* in: position of DB_TRX_ID */ +{ + ulint i; + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + ulint len; + byte* dst; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + /* Skip trx_id and roll_ptr */ + dst = rec_get_nth_field(rec, offsets, i, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " len[%lu] = %lu\n", + (ulong) i, (ulong) len)); + return(FALSE); + } + + if (rec_offs_nth_extern(offsets, i)) { + + page_zip_fail(("page_zip_decompress_clust_ext:" + " DB_TRX_ID at %lu is ext\n", + (ulong) i)); + return(FALSE); + } + + d_stream->avail_out = dst - d_stream->next_out; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } else if (rec_offs_nth_extern(offsets, i)) { + dst = rec_get_nth_field(rec, offsets, i, &len); + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + d_stream->avail_out = dst - d_stream->next_out; + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust_ext:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + return(FALSE); + } + + ut_ad(d_stream->next_out == dst); + + /* Clear the BLOB pointer in case + the record will be deleted and the + space will not be reused. Note that + the final initialization of the BLOB + pointers (copying from "externs" + or clearing) will have to take place + only after the page modification log + has been applied. Otherwise, we + could end up with an uninitialized + BLOB pointer when a record is deleted, + reallocated and deleted. */ + memset(d_stream->next_out, 0, + BTR_EXTERN_FIELD_REF_SIZE); + d_stream->next_out + += BTR_EXTERN_FIELD_REF_SIZE; + } + } + + return(TRUE); +} + +/************************************************************************** +Decompress the records of a leaf node of a clustered index.
*/ +static +ibool +page_zip_decompress_clust( +/*======================*/ + /* out: TRUE on success, + FALSE on failure */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + z_stream* d_stream, /* in/out: compressed page stream */ + rec_t** recs, /* in: dense page directory + sorted by address */ + ulint n_dense, /* in: size of recs[] */ + dict_index_t* index, /* in: the index of the page */ + ulint trx_id_col, /* index of the trx_id column */ + ulint* offsets, /* in/out: temporary offsets */ + mem_heap_t* heap) /* in: temporary memory heap */ +{ + int err; + ulint slot; + ulint heap_status = REC_STATUS_ORDINARY + | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT; + const byte* storage; + const byte* externs; + + ut_a(dict_index_is_clust(index)); + + /* Subtract the space reserved for uncompressed data. */ + d_stream->avail_in -= n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Decompress the records in heap_no order. */ + for (slot = 0; slot < n_dense; slot++) { + rec_t* rec = recs[slot]; + + d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES + - d_stream->next_out; + + ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR); + err = inflate(d_stream, Z_SYNC_FLUSH); + switch (err) { + case Z_STREAM_END: + /* Apparently, n_dense has grown + since the time the page was last compressed. */ + goto zlib_done; + case Z_OK: + case Z_BUF_ERROR: + if (UNIV_LIKELY(!d_stream->avail_out)) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 1 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES); + /* Prepare to decompress the data bytes. */ + d_stream->next_out = rec; + /* Set heap_no and the status bits. */ + mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status); + heap_status += 1 << REC_HEAP_NO_SHIFT; + + /* Read the offsets. The status bits are needed here. */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* This is a leaf page in a clustered index. */ + + /* Check if there are any externally stored columns. + For each externally stored column, restore the + BTR_EXTERN_FIELD_REF separately. */ + + if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) { + if (UNIV_UNLIKELY + (!page_zip_decompress_clust_ext( + d_stream, rec, offsets, trx_id_col))) { + + goto zlib_error; + } + } else { + /* Skip trx_id and roll_ptr */ + ulint len; + byte* dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN)) { + + page_zip_fail(("page_zip_decompress_clust:" + " len = %lu\n", (ulong) len)); + goto zlib_error; + } + + d_stream->avail_out = dst - d_stream->next_out; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 2 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + + ut_ad(d_stream->next_out == dst); + + /* Clear DB_TRX_ID and DB_ROLL_PTR in order to + avoid uninitialized bytes in case the record + is affected by page_zip_apply_log(). */ + memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + d_stream->next_out += DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN; + } + + /* Decompress the last bytes of the record. 
*/ + d_stream->avail_out = rec_get_end(rec, offsets) + - d_stream->next_out; + + switch (inflate(d_stream, Z_SYNC_FLUSH)) { + case Z_STREAM_END: + case Z_OK: + case Z_BUF_ERROR: + if (!d_stream->avail_out) { + break; + } + /* fall through */ + default: + page_zip_fail(("page_zip_decompress_clust:" + " 3 inflate(Z_SYNC_FLUSH)=%s\n", + d_stream->msg)); + goto zlib_error; + } + } + + /* Decompress any trailing garbage, in case the last record was + allocated from an originally longer space on the free list. */ + d_stream->avail_out = page_header_get_field(page_zip->data, + PAGE_HEAP_TOP) + - page_offset(d_stream->next_out); + if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE + - PAGE_ZIP_START - PAGE_DIR)) { + + page_zip_fail(("page_zip_decompress_clust:" + " avail_out = %u\n", + d_stream->avail_out)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) { + page_zip_fail(("page_zip_decompress_clust:" + " inflate(Z_FINISH)=%s\n", + d_stream->msg)); +zlib_error: + inflateEnd(d_stream); + return(FALSE); + } + + /* Note that d_stream->avail_out > 0 may hold here + if the modification log is nonempty. */ + +zlib_done: + if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) { + ut_error; + } + + { + page_t* page = page_align(d_stream->next_out); + + /* Clear the unused heap space on the uncompressed page. */ + memset(d_stream->next_out, 0, + page_dir_get_nth_slot(page, + page_dir_get_n_slots(page) - 1) + - d_stream->next_out); + } + +#ifdef UNIV_DEBUG + page_zip->m_start = PAGE_DATA + d_stream->total_in; +#endif /* UNIV_DEBUG */ + + /* Apply the modification log. */ + { + const byte* mod_log_ptr; + mod_log_ptr = page_zip_apply_log(d_stream->next_in, + d_stream->avail_in + 1, + recs, n_dense, + trx_id_col, heap_status, + index, offsets); + + if (UNIV_UNLIKELY(!mod_log_ptr)) { + return(FALSE); + } + page_zip->m_end = mod_log_ptr - page_zip->data; + page_zip->m_nonempty = mod_log_ptr != d_stream->next_in; + } + + if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE, NULL) + + page_zip->m_end >= page_zip_get_size(page_zip))) { + + page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n", + (ulong) page_zip_get_trailer_len( + page_zip, TRUE, NULL), + (ulong) page_zip->m_end, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + storage = page_zip->data + page_zip_get_size(page_zip) + - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + + externs = storage - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Restore the uncompressed columns in heap_no order. */ + + for (slot = 0; slot < n_dense; slot++) { + ulint i; + ulint len; + byte* dst; + rec_t* rec = recs[slot]; + ibool exists = !page_zip_dir_find_free( + page_zip, page_offset(rec)); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + dst = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + memcpy(dst, storage, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + /* Check if there are any externally stored + columns in this record. For each externally + stored column, restore or clear the + BTR_EXTERN_FIELD_REF. 
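The restore pass above advances a single downward cursor over the "externs" area, consuming one 20-byte reference per externally stored column of each record that still exists; deleted records get zeroed references instead. A toy model of that cursor; the real code derives "exists" from the free list in the dense directory, whereas here it is a made-up flag array:

```c
/* Standalone sketch: restore BLOB references from a downward cursor,
   counting n_blobs the way the decompressor does. */
#include <stdio.h>
#include <string.h>

#define REF_SIZE 20

int main(void)
{
	unsigned char	externs_area[3 * REF_SIZE];	/* stored refs */
	unsigned char	page_refs[4][REF_SIZE];		/* refs in records */
	int		exists[4] = { 1, 0, 1, 1 };	/* rec 1 is deleted */
	unsigned char*	externs = externs_area + sizeof externs_area;
	int		i, n_blobs = 0;

	memset(externs_area, 0xab, sizeof externs_area);

	for (i = 0; i < 4; i++) {	/* heap_no order, one ext col each */
		if (exists[i]) {
			externs -= REF_SIZE;	/* next stored pointer */
			memcpy(page_refs[i], externs, REF_SIZE);
			n_blobs++;
		} else {
			memset(page_refs[i], 0, REF_SIZE);
		}
	}

	printf("restored %d of 4 refs, consumed %td trailer bytes\n",
	       n_blobs, externs_area + sizeof externs_area - externs);
	return 0;
}
```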
*/ + if (!rec_offs_any_extern(offsets)) { + continue; + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + dst = rec_get_nth_field(rec, offsets, i, &len); + + if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) { + page_zip_fail(("page_zip_decompress_clust:" + " %lu < 20\n", + (ulong) len)); + return(FALSE); + } + + dst += len - BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_LIKELY(exists)) { + /* Existing record: + restore the BLOB pointer */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY + (externs < page_zip->data + + page_zip->m_end)) { + page_zip_fail(("page_zip_" + "decompress_clust: " + "%p < %p + %lu\n", + (const void*) externs, + (const void*) + page_zip->data, + (ulong) + page_zip->m_end)); + return(FALSE); + } + + memcpy(dst, externs, + BTR_EXTERN_FIELD_REF_SIZE); + + page_zip->n_blobs++; + } else { + /* Deleted record: + clear the BLOB pointer */ + memset(dst, 0, + BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + + return(TRUE); +} + +/************************************************************************** +Decompress a page. This function should tolerate errors on the compressed +page. Instead of letting assertions fail, it will return FALSE if an +inconsistency is detected. */ +UNIV_INTERN +ibool +page_zip_decompress( +/*================*/ + /* out: TRUE on success, FALSE on failure */ + page_zip_des_t* page_zip,/* in: data, ssize; + out: m_start, m_end, m_nonempty, n_blobs */ + page_t* page) /* out: uncompressed page, may be trashed */ +{ + z_stream d_stream; + dict_index_t* index = NULL; + rec_t** recs; /* dense page directory, sorted by address */ + ulint n_dense;/* number of user records on the page */ + ulint trx_id_col = ULINT_UNDEFINED; + mem_heap_t* heap; + ulint* offsets; + ullint usec = ut_time_us(NULL); + + ut_ad(page_zip_simple_validate(page_zip)); + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* The dense directory excludes the infimum and supremum records. */ + n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW; + if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE + >= page_zip_get_size(page_zip))) { + page_zip_fail(("page_zip_decompress 1: %lu %lu\n", + (ulong) n_dense, + (ulong) page_zip_get_size(page_zip))); + return(FALSE); + } + + heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE); + recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)); + +#ifdef UNIV_ZIP_DEBUG + /* Clear the page. */ + memset(page, 0x55, UNIV_PAGE_SIZE); +#endif /* UNIV_ZIP_DEBUG */ + UNIV_MEM_INVALID(page, UNIV_PAGE_SIZE); + /* Copy the page header. */ + memcpy(page, page_zip->data, PAGE_DATA); + + /* Copy the page directory. */ + if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs, + recs + n_dense, n_dense))) { +zlib_error: + mem_heap_free(heap); + return(FALSE); + } + + /* Copy the infimum and supremum records. 
*/ + memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES), + infimum_extra, sizeof infimum_extra); + if (UNIV_UNLIKELY(!page_get_n_recs(page))) { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_new(page + PAGE_NEW_INFIMUM, + page_zip_dir_get(page_zip, 0) + & PAGE_ZIP_DIR_SLOT_MASK); + } + memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data); + memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1), + supremum_extra_data, sizeof supremum_extra_data); + + page_zip_set_alloc(&d_stream, heap); + + if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + != Z_OK)) { + ut_error; + } + + d_stream.next_in = page_zip->data + PAGE_DATA; + /* Subtract the space reserved for + the page header and the end marker of the modification log. */ + d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1); + + d_stream.next_out = page + PAGE_ZIP_START; + d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; + + /* Decode the zlib header and the index information. */ + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { + + page_zip_fail(("page_zip_decompress:" + " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg)); + goto zlib_error; + } + + index = page_zip_fields_decode( + page + PAGE_ZIP_START, d_stream.next_out, + page_is_leaf(page) ? &trx_id_col : NULL); + + if (UNIV_UNLIKELY(!index)) { + + goto zlib_error; + } + + /* Decompress the user records. */ + page_zip->n_blobs = 0; + d_stream.next_out = page + PAGE_ZIP_START; + + { + /* Pre-allocate the offsets for rec_get_offsets_reverse(). */ + ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + offsets = mem_heap_alloc(heap, n * sizeof(ulint)); + *offsets = n; + } + + /* Decompress the records in heap_no order. */ + if (!page_is_leaf(page)) { + /* This is a node pointer page. */ + ulint info_bits; + + if (UNIV_UNLIKELY + (!page_zip_decompress_node_ptrs(page_zip, &d_stream, + recs, n_dense, index, + offsets, heap))) { + goto err_exit; + } + + info_bits = mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL + ? REC_INFO_MIN_REC_FLAG : 0; + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page, + info_bits))) { + goto err_exit; + } + } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) { + /* This is a leaf page in a secondary index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream, + recs, n_dense, + index, offsets))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { +err_exit: + page_zip_fields_free(index); + mem_heap_free(heap); + return(FALSE); + } + } else { + /* This is a leaf page in a clustered index. */ + if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip, + &d_stream, recs, + n_dense, index, + trx_id_col, + offsets, heap))) { + goto err_exit; + } + + if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, + page, 0))) { + goto err_exit; + } + } + + ut_a(page_is_comp(page)); + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + + page_zip_fields_free(index); + mem_heap_free(heap); + { + page_zip_stat_t* zip_stat + = &page_zip_stat[page_zip->ssize - 1]; + zip_stat->decompressed++; + zip_stat->decompressed_usec += ut_time_us(NULL) - usec; + } + + /* Update the stat counter for LRU policy. 
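The two inflate(Z_BLOCK) calls above exploit zlib's block-boundary stop: the first consumes the zlib header, the second stops at the end of the first deflate block, which the compressor closed with Z_FULL_FLUSH right after the field encoding, so the index information can be decoded before any record data. A standalone round trip showing the trick (error checking elided; contents made up):

```c
/* Standalone sketch: read a "header block" ahead of the rest of the
   stream using inflate(Z_BLOCK). */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	const char	part1[] = "field encodings";
	const char	part2[] = "records";
	unsigned char	comp[256], out[64];
	uLong		clen;
	z_stream	c, d;

	memset(&c, 0, sizeof c);
	deflateInit(&c, Z_DEFAULT_COMPRESSION);
	c.next_out = comp;
	c.avail_out = sizeof comp;
	c.next_in = (unsigned char*) part1;
	c.avail_in = sizeof part1;
	deflate(&c, Z_FULL_FLUSH);	/* close the block holding part1 */
	c.next_in = (unsigned char*) part2;
	c.avail_in = sizeof part2;
	deflate(&c, Z_FINISH);
	clen = c.total_out;
	deflateEnd(&c);

	memset(&d, 0, sizeof d);
	inflateInit(&d);
	d.next_in = comp;
	d.avail_in = (uInt) clen;
	d.next_out = out;
	d.avail_out = sizeof out;

	inflate(&d, Z_BLOCK);	/* 1: decode the zlib header only */
	inflate(&d, Z_BLOCK);	/* 2: stop at the first block boundary */
	printf("after two Z_BLOCK calls: %lu bytes out (part1 is %lu)\n",
	       (unsigned long) d.total_out, (unsigned long) sizeof part1);

	inflateEnd(&d);
	return 0;
}
```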
*/ + buf_LRU_stat_inc_unzip(); + + return(TRUE); +} + +#ifdef UNIV_ZIP_DEBUG +/************************************************************************** +Dump a block of memory on the standard error stream. */ +static +void +page_zip_hexdump_func( +/*==================*/ + const char* name, /* in: name of the data structure */ + const void* buf, /* in: data */ + ulint size) /* in: length of the data, in bytes */ +{ + const byte* s = buf; + ulint addr; + const ulint width = 32; /* bytes per line */ + + fprintf(stderr, "%s:\n", name); + + for (addr = 0; addr < size; addr += width) { + ulint i; + + fprintf(stderr, "%04lx ", (ulong) addr); + + i = ut_min(width, size - addr); + + while (i--) { + fprintf(stderr, "%02x", *s++); + } + + putc('\n', stderr); + } +} + +#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size) + +/* Flag: make page_zip_validate() compare page headers only */ +UNIV_INTERN ibool page_zip_validate_header_only = FALSE; + +/************************************************************************** +Check that the compressed and decompressed pages match. */ +UNIV_INTERN +ibool +page_zip_validate_low( +/*==================*/ + /* out: TRUE if valid, FALSE if not */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page, /* in: uncompressed page */ + ibool sloppy) /* in: FALSE=strict, + TRUE=ignore the MIN_REC_FLAG */ +{ + page_zip_des_t temp_page_zip; + byte* temp_page_buf; + page_t* temp_page; + ibool valid; + + if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV) + || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2) + || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)) { + page_zip_fail(("page_zip_validate: page header\n")); + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + return(FALSE); + } + + ut_a(page_is_comp(page)); + + if (page_zip_validate_header_only) { + return(TRUE); + } + + /* page_zip_decompress() expects the uncompressed page to be + UNIV_PAGE_SIZE aligned. */ + temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE); + temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE); + +#ifdef UNIV_DEBUG_VALGRIND + /* Get detailed information on the valid bits in case the + UNIV_MEM_ASSERT_RW() checks fail. The v-bits of page[], + page_zip->data[] or page_zip could be viewed at temp_page[] or + temp_page_zip in a debugger when running valgrind --db-attach. 
*/ + VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip); + UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip); + VALGRIND_GET_VBITS(page_zip->data, temp_page, + page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); +#endif /* UNIV_DEBUG_VALGRIND */ + + temp_page_zip = *page_zip; + valid = page_zip_decompress(&temp_page_zip, temp_page); + if (!valid) { + fputs("page_zip_validate(): failed to decompress\n", stderr); + goto func_exit; + } + if (page_zip->n_blobs != temp_page_zip.n_blobs) { + page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n", + page_zip->n_blobs, temp_page_zip.n_blobs)); + valid = FALSE; + } +#ifdef UNIV_DEBUG + if (page_zip->m_start != temp_page_zip.m_start) { + page_zip_fail(("page_zip_validate: m_start: %u!=%u\n", + page_zip->m_start, temp_page_zip.m_start)); + valid = FALSE; + } +#endif /* UNIV_DEBUG */ + if (page_zip->m_end != temp_page_zip.m_end) { + page_zip_fail(("page_zip_validate: m_end: %u!=%u\n", + page_zip->m_end, temp_page_zip.m_end)); + valid = FALSE; + } + if (page_zip->m_nonempty != temp_page_zip.m_nonempty) { + page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n", + page_zip->m_nonempty, + temp_page_zip.m_nonempty)); + valid = FALSE; + } + if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) { + + /* In crash recovery, the "minimum record" flag may be + set incorrectly until the mini-transaction is + committed. Let us tolerate that difference when we + are performing a sloppy validation. */ + + if (sloppy) { + byte info_bits_diff; + ulint offset + = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + ut_a(offset >= PAGE_NEW_SUPREMUM); + offset -= 5 /* REC_NEW_INFO_BITS */; + + info_bits_diff = page[offset] ^ temp_page[offset]; + + if (info_bits_diff == REC_INFO_MIN_REC_FLAG) { + temp_page[offset] = page[offset]; + + if (!memcmp(page + PAGE_HEADER, + temp_page + PAGE_HEADER, + UNIV_PAGE_SIZE - PAGE_HEADER + - FIL_PAGE_DATA_END)) { + + /* Only the minimum record flag + differed. Let us ignore it. */ + page_zip_fail(("page_zip_validate: " + "min_rec_flag " + "(ignored, " + "%lu,%lu,0x%02lx)\n", + page_get_space_id(page), + page_get_page_no(page), + (ulong) page[offset])); + goto func_exit; + } + } + } + page_zip_fail(("page_zip_validate: content\n")); + valid = FALSE; + } + +func_exit: + if (!valid) { + page_zip_hexdump(page_zip, sizeof *page_zip); + page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip)); + page_zip_hexdump(page, UNIV_PAGE_SIZE); + page_zip_hexdump(temp_page, UNIV_PAGE_SIZE); + } + ut_free(temp_page_buf); + return(valid); +} + +/************************************************************************** +Check that the compressed and decompressed pages match. 
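ut_align() rounds a pointer up to a power-of-two boundary; over-allocating by one page, as page_zip_validate_low() does, guarantees an aligned frame somewhere inside the buffer. A minimal sketch of the same trick; ut_align_sketch() is a local stand-in, not the InnoDB function:

```c
/* Standalone sketch: obtain a UNIV_PAGE_SIZE-aligned frame from a
   plain malloc() buffer by over-allocating and rounding up. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define UNIV_PAGE_SIZE 16384UL

static void*
ut_align_sketch(void* ptr, unsigned long align)	/* align: power of two */
{
	return (void*) (((uintptr_t) ptr + align - 1)
			& ~((uintptr_t) align - 1));
}

int main(void)
{
	void*	buf = malloc(2 * UNIV_PAGE_SIZE);
	void*	page = ut_align_sketch(buf, UNIV_PAGE_SIZE);

	printf("buf=%p page=%p aligned=%d\n", buf, page,
	       ((uintptr_t) page % UNIV_PAGE_SIZE) == 0);
	free(buf);
	return 0;
}
```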
*/ +UNIV_INTERN +ibool +page_zip_validate( +/*==============*/ + /* out: TRUE if valid, FALSE if not */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page) /* in: uncompressed page */ +{ + return(page_zip_validate_low(page_zip, page, + recv_recovery_is_on())); +} +#endif /* UNIV_ZIP_DEBUG */ + +#ifdef UNIV_DEBUG +static +ibool +page_zip_header_cmp( +/*================*/ + /* out: TRUE */ + const page_zip_des_t* page_zip,/* in: compressed page */ + const byte* page) /* in: uncompressed page */ +{ + ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV, + FIL_PAGE_LSN - FIL_PAGE_PREV)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, + 2)); + ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA, + PAGE_DATA - FIL_PAGE_DATA)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************** +Write a record on the compressed page that contains externally stored +columns. The data must already have been written to the uncompressed page. */ +static +byte* +page_zip_write_rec_ext( +/*===================*/ + /* out: end of modification log */ + page_zip_des_t* page_zip, /* in/out: compressed page */ + const page_t* page, /* in: page containing rec */ + const byte* rec, /* in: record being written */ + dict_index_t* index, /* in: record descriptor */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + ulint create, /* in: nonzero=insert, zero=update */ + ulint trx_id_col, /* in: position of DB_TRX_ID */ + ulint heap_no, /* in: heap number of rec */ + byte* storage, /* in: end of dense page directory */ + byte* data) /* in: end of modification log */ +{ + const byte* start = rec; + ulint i; + ulint len; + byte* externs = storage; + ulint n_ext = rec_offs_n_extern(offsets); + + ut_ad(rec_offs_validate(rec, index, offsets)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW); + + /* Note that this will not take into account + the BLOB columns of rec if create==TRUE. */ + ut_ad(data + rec_offs_data_size(offsets) + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + - n_ext * BTR_EXTERN_FIELD_REF_SIZE + < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs); + + { + ulint blob_no = page_zip_get_n_prev_extern( + page_zip, rec, index); + byte* ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(blob_no <= page_zip->n_blobs); + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + if (create) { + page_zip->n_blobs += n_ext; + ASSERT_ZERO_BLOB(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE); + memmove(ext_end - n_ext + * BTR_EXTERN_FIELD_REF_SIZE, + ext_end, + externs - ext_end); + } + + ut_a(blob_no + n_ext <= page_zip->n_blobs); + } + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* src; + + if (UNIV_UNLIKELY(i == trx_id_col)) { + ut_ad(!rec_offs_nth_extern(offsets, + i)); + ut_ad(!rec_offs_nth_extern(offsets, + i + 1)); + /* Locate trx_id and roll_ptr. */ + src = rec_get_nth_field(rec, offsets, + i, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + i + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. 
*/ + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + (DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN); + + /* Store trx_id and roll_ptr. */ + memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + i++; /* skip also roll_ptr */ + } else if (rec_offs_nth_extern(offsets, i)) { + src = rec_get_nth_field(rec, offsets, + i, &len); + + ut_ad(dict_index_is_clust(index)); + ut_ad(len + >= BTR_EXTERN_FIELD_REF_SIZE); + src += len - BTR_EXTERN_FIELD_REF_SIZE; + + ASSERT_ZERO(data, src - start); + memcpy(data, start, src - start); + data += src - start; + start = src + BTR_EXTERN_FIELD_REF_SIZE; + + /* Store the BLOB pointer. */ + externs -= BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(data < externs); + memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE); + } + } + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) - (start - rec); + + ASSERT_ZERO(data, len); + memcpy(data, start, len); + data += len; + + return(data); +} + +/************************************************************************** +Write an entire record on the compressed page. The data must already +have been written to the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record being written */ + dict_index_t* index, /* in: the index the record belongs to */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint create) /* in: nonzero=insert, zero=update */ +{ + const page_t* page; + byte* data; + byte* storage; + ulint heap_no; + byte* slot; + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + + page = page_align(rec); + + ut_ad(page_zip_header_cmp(page_zip, page)); + ut_ad(page_simple_validate_new((page_t*) page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + /* Copy the delete mark. */ + if (rec_get_deleted_flag(rec, TRUE)) { + *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8; + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } + + ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START); + ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE + - PAGE_DIR - PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots(page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */ + ut_ad(heap_no < page_dir_get_n_heap(page)); + + /* Append to the modification log. */ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + + /* Identify the record by writing its heap number - 1. + 0 is reserved to indicate the end of the modification log. 
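The uncompressed DB_TRX_ID/DB_ROLL_PTR slots written above are addressed purely by heap number: the 13 bytes for heap_no h start 13 * (h - 1) bytes below "storage". A small sketch of that arithmetic with a made-up trailer address:

```c
/* Standalone sketch: where the trx_id/roll_ptr slot of each record
   lands in the trailer, as in the memcpy() calls above. */
#include <stdio.h>

#define DATA_TRX_ID_LEN		6
#define DATA_ROLL_PTR_LEN	7

int main(void)
{
	unsigned long	storage = 16200;	/* made-up trailer address */
	unsigned long	heap_no;

	for (heap_no = 2; heap_no <= 4; heap_no++) {
		unsigned long	slot = storage
			- (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
			* (heap_no - 1);

		printf("heap_no %lu: trx_id+roll_ptr at [%lu, %lu)\n",
		       heap_no, slot,
		       slot + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
	}
	return 0;
}
```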
*/ + + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1); + ut_ad(!*data); + + { + const byte* start = rec - rec_offs_extra_size(offsets); + const byte* b = rec - REC_N_NEW_EXTRA_BYTES; + + /* Write the extra bytes backwards, so that + rec_offs_extra_size() can be easily computed in + page_zip_apply_log() by invoking + rec_get_offsets_reverse(). */ + + while (b != start) { + *data++ = *--b; + ut_ad(!*data); + } + } + + /* Write the data bytes. Store the uncompressed bytes separately. */ + storage = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + if (page_is_leaf(page)) { + ulint len; + + if (dict_index_is_clust(index)) { + ulint trx_id_col; + + trx_id_col = dict_index_get_sys_col_pos(index, + DATA_TRX_ID); + ut_ad(trx_id_col != ULINT_UNDEFINED); + + /* Store separately trx_id, roll_ptr and + the BTR_EXTERN_FIELD_REF of each BLOB column. */ + if (rec_offs_any_extern(offsets)) { + data = page_zip_write_rec_ext( + page_zip, page, + rec, index, offsets, create, + trx_id_col, heap_no, storage, data); + } else { + /* Locate trx_id and roll_ptr. */ + const byte* src + = rec_get_nth_field(rec, offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(src + DATA_TRX_ID_LEN + == rec_get_nth_field( + rec, offsets, + trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); + + /* Log the preceding fields. */ + ASSERT_ZERO(data, src - rec); + memcpy(data, rec, src - rec); + data += src - rec; + + /* Store trx_id and roll_ptr. */ + memcpy(storage + - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN) + * (heap_no - 1), + src, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; + + /* Log the last bytes of the record. */ + len = rec_offs_data_size(offsets) + - (src - rec); + + ASSERT_ZERO(data, len); + memcpy(data, src, len); + data += len; + } + } else { + /* Leaf page of a secondary index: + no externally stored columns */ + ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID) + == ULINT_UNDEFINED); + ut_ad(!rec_offs_any_extern(offsets)); + + /* Log the entire record. */ + len = rec_offs_data_size(offsets); + + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + } + } else { + /* This is a node pointer page. */ + ulint len; + + /* Non-leaf nodes should not have any externally + stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + /* Copy the data bytes, except node_ptr. */ + len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE; + ut_ad(data + len < storage - REC_NODE_PTR_SIZE + * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)); + ASSERT_ZERO(data, len); + memcpy(data, rec, len); + data += len; + + /* Copy the node pointer to the uncompressed area. */ + memcpy(storage - REC_NODE_PTR_SIZE + * (heap_no - 1), + rec + len, + REC_NODE_PTR_SIZE); + } + + ut_a(!*data); + ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip)); + page_zip->m_end = data - page_zip->data; + page_zip->m_nonempty = TRUE; + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec))); +#endif /* UNIV_ZIP_DEBUG */ +} + +/*************************************************************** +Parses a log record of writing a BLOB pointer of a record. 
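Writing the extra bytes in reverse, as the loop above does, lets page_zip_apply_log() recover rec_offs_extra_size() first and then restore the bytes with a mirrored loop. A self-contained demonstration of the round trip with made-up sizes:

```c
/* Standalone sketch: log a record's extra bytes backwards and restore
   them with the mirrored loop used on the apply side. */
#include <stdio.h>
#include <string.h>

#define REC_N_NEW_EXTRA_BYTES 5

int main(void)
{
	/* 3 variable-length header bytes + 5 fixed extra bytes */
	unsigned char	extra[8] = { 0xa1, 0xa2, 0xa3, 1, 2, 3, 4, 5 };
	unsigned char*	rec = extra + sizeof extra;	/* record origin */
	unsigned char	log[8], back[8];
	unsigned char*	data = log;
	unsigned char*	b;

	/* writer: emit bytes backwards, skipping the fixed tail */
	for (b = rec - REC_N_NEW_EXTRA_BYTES; b != extra; ) {
		*data++ = *--b;
	}

	/* reader: mirror the loop to restore them */
	b = back + sizeof back - REC_N_NEW_EXTRA_BYTES;
	for (data = log; b != back; ) {
		*--b = *data++;
	}

	printf("restored %s\n",
	       memcmp(extra, back, 3) ? "mismatch" : "ok");
	return 0;
}
```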
*/ +UNIV_INTERN +byte* +page_zip_parse_write_blob_ptr( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page */ + page_zip_des_t* page_zip)/* in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY + (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(!page_is_leaf(page))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + memcpy(page_zip->data + z_offset, + ptr + 4, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE)); +} + +/************************************************************************** +Write a BLOB pointer of a record on the leaf page of a clustered index. +The information must already have been updated on the uncompressed page. */ +UNIV_INTERN +void +page_zip_write_blob_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in/out: record whose data is being + written */ + dict_index_t* index, /* in: index of the page */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint n, /* in: column index */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ +{ + const byte* field; + byte* externs; + const page_t* page = page_align(rec); + ulint blob_no; + ulint len; + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_simple_validate_new((page_t*) page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_comp(offsets)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(rec_offs_nth_extern(offsets, n)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index) + + rec_get_n_extern_new(rec, index, n); + ut_a(blob_no < page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + field = rec_get_nth_field(rec, offsets, n, &len); + + externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE; + field += len - BTR_EXTERN_FIELD_REF_SIZE; + + memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + if (mtr) { + byte* log_ptr = mlog_open( + mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE); + if 
(UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, externs - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE); + log_ptr += BTR_EXTERN_FIELD_REF_SIZE; + mlog_close(mtr, log_ptr); + } +} + +/*************************************************************** +Parses a log record of writing the node pointer of a record. */ +UNIV_INTERN +byte* +page_zip_parse_write_node_ptr( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page */ + page_zip_des_t* page_zip)/* in/out: compressed page */ +{ + ulint offset; + ulint z_offset; + + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) { + + return(NULL); + } + + offset = mach_read_from_2(ptr); + z_offset = mach_read_from_2(ptr + 2); + + if (UNIV_UNLIKELY(offset < PAGE_ZIP_START) + || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE) + || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (page) { + byte* storage_end; + byte* field; + byte* storage; + ulint heap_no; + + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_is_leaf(page))) { + + goto corrupt; + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + field = page + offset; + storage = page_zip->data + z_offset; + + storage_end = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE; + + if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE) + || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW) + || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) { + + goto corrupt; + } + + memcpy(field, ptr + 4, REC_NODE_PTR_SIZE); + memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + (2 + 2 + REC_NODE_PTR_SIZE)); +} + +/************************************************************************** +Write the node pointer of a record on a non-leaf compressed page. 
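+It is preceded here by an illustrative sketch of the dense
+node-pointer storage that it and the parser above share. */
+
+/* A minimal sketch, not part of the original code: on non-leaf pages
+the node pointers are stored uncompressed, growing downwards from just
+below the dense page directory, one REC_NODE_PTR_SIZE slot per user
+record.  The helper name is hypothetical. */
+__attribute__((unused)) static
+byte*
+page_zip_sketch_node_ptr_storage(
+/*=============================*/
+				/* out: storage slot of the node pointer */
+	page_zip_des_t*	page_zip,/* in: compressed page */
+	const page_t*	page,	/* in: uncompressed page */
+	ulint		heap_no)/* in: heap number of the record */
+{
+	byte*	storage = page_zip->data + page_zip_get_size(page_zip)
+		- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+		* PAGE_ZIP_DIR_SLOT_SIZE;
+
+	return(storage - (heap_no - 1) * REC_NODE_PTR_SIZE);
+}
+
+/**************************************************************************
+Write the node pointer of a record on a non-leaf compressed page.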
*/ +UNIV_INTERN +void +page_zip_write_node_ptr( +/*====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + ulint size, /* in: data size of rec */ + ulint ptr, /* in: node pointer */ + mtr_t* mtr) /* in: mini-transaction, or NULL */ +{ + byte* field; + byte* storage; + page_t* page = page_align(rec); + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(page_rec_is_comp(rec)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(!page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, size); + + storage = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; + field = rec + size - REC_NODE_PTR_SIZE; + +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if REC_NODE_PTR_SIZE != 4 +# error "REC_NODE_PTR_SIZE != 4" +#endif + mach_write_to_4(field, ptr); + memcpy(storage, field, REC_NODE_PTR_SIZE); + + if (mtr) { + byte* log_ptr = mlog_open(mtr, + 11 + 2 + 2 + REC_NODE_PTR_SIZE); + if (UNIV_UNLIKELY(!log_ptr)) { + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr); + mach_write_to_2(log_ptr, page_offset(field)); + log_ptr += 2; + mach_write_to_2(log_ptr, storage - page_zip->data); + log_ptr += 2; + memcpy(log_ptr, field, REC_NODE_PTR_SIZE); + log_ptr += REC_NODE_PTR_SIZE; + mlog_close(mtr, log_ptr); + } +} + +/************************************************************************** +Write the trx_id and roll_ptr of a record on a B-tree leaf node page. 
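+It is preceded here by an illustrative sketch of the layout of the
+uncompressed "trailer" that these routines address. */
+
+/* A minimal sketch, not part of the original code: the tail of
+page_zip->data holds, growing downwards, the dense directory
+(PAGE_ZIP_DIR_SLOT_SIZE bytes per user record) followed by the
+uncompressed columns: one node pointer per record on non-leaf pages,
+or DB_TRX_ID and DB_ROLL_PTR per record plus one BTR_EXTERN_FIELD_REF
+per externally stored column on clustered leaf pages.  The helper
+name is hypothetical. */
+__attribute__((unused)) static
+ulint
+page_zip_sketch_trailer_size(
+/*=========================*/
+				/* out: bytes used by the dense directory
+				and the uncompressed columns */
+	const page_zip_des_t*	page_zip,/* in: compressed page */
+	const page_t*		page,	/* in: uncompressed page */
+	ulint			is_clust)/* in: nonzero=clustered index */
+{
+	ulint	n_dense	= page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW;
+	ulint	size	= n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+	if (!page_is_leaf(page)) {
+		size += n_dense * REC_NODE_PTR_SIZE;
+	} else if (is_clust) {
+		size += n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+			+ page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+	}
+
+	return(size);
+}
+
+/**************************************************************************
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page.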
*/ +UNIV_INTERN +void +page_zip_write_trx_id_and_roll_ptr( +/*===============================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in/out: record */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint trx_id_col,/* in: column number of TRX_ID in rec */ + dulint trx_id, /* in: transaction identifier */ + dulint roll_ptr)/* in: roll_ptr */ +{ + byte* field; + byte* storage; + page_t* page = page_align(rec); + ulint len; + + ut_ad(buf_frame_get_page_zip(rec) == page_zip); + ut_ad(page_simple_validate_new(page)); + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip_get_size(page_zip) + > PAGE_DATA + page_zip_dir_size(page_zip)); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_offs_comp(offsets)); + + ut_ad(page_zip->m_start >= PAGE_DATA); + ut_ad(page_zip_header_cmp(page_zip, page)); + + ut_ad(page_is_leaf(page)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + storage = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE + - (rec_get_heap_no_new(rec) - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + field = rec_get_nth_field(rec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + ut_ad(field + DATA_TRX_ID_LEN + == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len)); + ut_ad(len == DATA_ROLL_PTR_LEN); +#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG + ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)); +#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */ +#if DATA_TRX_ID_LEN != 6 +# error "DATA_TRX_ID_LEN != 6" +#endif + mach_write_to_6(field, trx_id); +#if DATA_ROLL_PTR_LEN != 7 +# error "DATA_ROLL_PTR_LEN != 7" +#endif + mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr); + memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); +} + +#ifdef UNIV_ZIP_DEBUG +/* Set this variable in a debugger to disable page_zip_clear_rec(). +The only observable effect should be the compression ratio due to +deleted records not being zeroed out. In rare cases, there can be +page_zip_validate() failures on the node_ptr, trx_id and roll_ptr +columns if the space is reallocated for a smaller record. */ +UNIV_INTERN ibool page_zip_clear_rec_disable; +#endif /* UNIV_ZIP_DEBUG */ + +/************************************************************************** +Clear an area on the uncompressed and compressed page, if possible. */ +static +void +page_zip_clear_rec( +/*===============*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: record to clear */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint heap_no; + page_t* page = page_align(rec); + /* page_zip_validate() would fail here if a record + containing externally stored columns is being deleted. 
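+(page_zip_dir_delete() has already moved the record's dense slot to
+the free list and shifted its BLOB pointers out of the trailer, so
+the compressed and uncompressed copies are briefly inconsistent.)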
*/ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_zip_dir_find(page_zip, page_offset(rec))); + ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec))); + ut_ad(page_zip_header_cmp(page_zip, page)); + + heap_no = rec_get_heap_no_new(rec); + ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + if ( +#ifdef UNIV_ZIP_DEBUG + !page_zip_clear_rec_disable && +#endif /* UNIV_ZIP_DEBUG */ + page_zip->m_end + + 1 + ((heap_no - 1) >= 64)/* size of the log entry */ + + page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index), NULL) + < page_zip_get_size(page_zip)) { + byte* data; + + /* Clear only the data bytes, because the allocator and + the decompressor depend on the extra bytes. */ + memset(rec, 0, rec_offs_data_size(offsets)); + + if (!page_is_leaf(page)) { + /* Clear node_ptr on the compressed page. */ + byte* storage = page_zip->data + + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) + - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, + 0, REC_NODE_PTR_SIZE); + } else if (dict_index_is_clust(index)) { + /* Clear trx_id and roll_ptr on the compressed page. */ + byte* storage = page_zip->data + + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) + - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + + memset(storage - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + } + + /* Log that the data was zeroed out. */ + data = page_zip->data + page_zip->m_end; + ut_ad(!*data); + if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { + *data++ = (byte) (0x80 | (heap_no - 1) >> 7); + ut_ad(!*data); + } + *data++ = (byte) ((heap_no - 1) << 1 | 1); + ut_ad(!*data); + ut_ad((ulint) (data - page_zip->data) + < page_zip_get_size(page_zip)); + page_zip->m_end = data - page_zip->data; + page_zip->m_nonempty = TRUE; + } else if (page_is_leaf(page) && dict_index_is_clust(index)) { + /* Do not clear the record, because there is not enough space + to log the operation. */ + + if (rec_offs_any_extern(offsets)) { + ulint i; + + for (i = rec_offs_n_fields(offsets); i--; ) { + /* Clear all BLOB pointers in order to make + page_zip_validate() pass. */ + if (rec_offs_nth_extern(offsets, i)) { + ulint len; + byte* field = rec_get_nth_field( + rec, offsets, i, &len); + memset(field + len + - BTR_EXTERN_FIELD_REF_SIZE, + 0, BTR_EXTERN_FIELD_REF_SIZE); + } + } + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ +} + +/************************************************************************** +Write the "deleted" flag of a record on a compressed page. The flag must +already have been written on the uncompressed page. 
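+It is preceded here by an illustrative sketch of the flag update on a
+dense directory slot. */
+
+/* A minimal sketch, not part of the original code: a dense directory
+slot is a 2-byte big-endian value; the record offset occupies the low
+bits and the "owned" and "deleted" flags the top bits.  Because both
+flag masks are multiples of 256, rewriting only the first (most
+significant) byte of the slot, as the two functions below do, can
+never disturb the stored offset.  The helper name is hypothetical. */
+__attribute__((unused)) static
+void
+page_zip_sketch_set_slot_flag(
+/*==========================*/
+	byte*	slot,	/* in/out: dense directory slot */
+	ulint	mask,	/* in: PAGE_ZIP_DIR_SLOT_OWNED or _DEL */
+	ulint	flag)	/* in: nonzero=set, zero=clear */
+{
+	if (flag) {
+		*slot |= (byte) (mask >> 8);
+	} else {
+		*slot &= (byte) ~(mask >> 8);
+	}
+}
+
+/**************************************************************************
+Write the "deleted" flag of a record on a compressed page.  The flag must
+already have been written on the uncompressed page.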
*/ +UNIV_INTERN +void +page_zip_rec_set_deleted( +/*=====================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the deleted flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page_align(rec))); +#endif /* UNIV_ZIP_DEBUG */ +} + +/************************************************************************** +Write the "owned" flag of a record on a compressed page. The n_owned field +must already have been written on the uncompressed page. */ +UNIV_INTERN +void +page_zip_rec_set_owned( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* rec, /* in: record on the uncompressed page */ + ulint flag) /* in: the owned flag (nonzero=TRUE) */ +{ + byte* slot = page_zip_dir_find(page_zip, page_offset(rec)); + ut_a(slot); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + if (flag) { + *slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } else { + *slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8); + } +} + +/************************************************************************** +Insert a record to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_insert( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* prev_rec,/* in: record after which to insert */ + const byte* free_rec,/* in: record from which rec was + allocated, or NULL */ + byte* rec) /* in: record to insert */ +{ + ulint n_dense; + byte* slot_rec; + byte* slot_free; + + ut_ad(prev_rec != rec); + ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec); + ut_ad(page_zip_simple_validate(page_zip)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + if (page_rec_is_infimum(prev_rec)) { + /* Use the first slot. */ + slot_rec = page_zip->data + page_zip_get_size(page_zip); + } else { + byte* end = page_zip->data + page_zip_get_size(page_zip); + byte* start = end - page_zip_dir_user_size(page_zip); + + if (UNIV_LIKELY(!free_rec)) { + /* PAGE_N_RECS was already incremented + in page_cur_insert_rec_zip(), but the + dense directory slot at that position + contains garbage. Skip it. */ + start += PAGE_ZIP_DIR_SLOT_SIZE; + } + + slot_rec = page_zip_dir_find_low(start, end, + page_offset(prev_rec)); + ut_a(slot_rec); + } + + /* Read the old n_dense (n_heap may have been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + if (UNIV_LIKELY_NULL(free_rec)) { + /* The record was allocated from the free list. + Shift the dense directory only up to that slot. + Note that in this case, n_dense is actually + off by one, because page_cur_insert_rec_zip() + did not increment n_heap. */ + ut_ad(rec_get_heap_no_new(rec) < n_dense + 1 + + PAGE_HEAP_NO_USER_LOW); + ut_ad(rec >= free_rec); + slot_free = page_zip_dir_find(page_zip, page_offset(free_rec)); + ut_ad(slot_free); + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } else { + /* The record was allocated from the heap. + Shift the entire dense directory. */ + ut_ad(rec_get_heap_no_new(rec) == n_dense + + PAGE_HEAP_NO_USER_LOW); + + /* Shift to the end of the dense page directory. 
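+(That is, slot_free is set n_dense slots below the end of
+page_zip->data.  The memmove below then slides the slots in
+[slot_free, slot_rec) one slot downwards, which opens a 2-byte hole
+just below slot_rec for the new entry.)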
*/ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + } + + /* Shift the dense directory to allocate place for rec. */ + memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free, + slot_rec - slot_free); + + /* Write the entry for the inserted record. + The "owned" and "deleted" flags must be zero. */ + mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec)); +} + +/************************************************************************** +Shift the dense page directory and the array of BLOB pointers +when a record is deleted. */ +UNIV_INTERN +void +page_zip_dir_delete( +/*================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + byte* rec, /* in: record to delete */ + dict_index_t* index, /* in: index of rec */ + const ulint* offsets,/* in: rec_get_offsets(rec) */ + const byte* free) /* in: previous start of the free list */ +{ + byte* slot_rec; + byte* slot_free; + ulint n_ext; + page_t* page = page_align(rec); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_comp(offsets)); + + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets)); + UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), + rec_offs_extra_size(offsets)); + + slot_rec = page_zip_dir_find(page_zip, page_offset(rec)); + + ut_a(slot_rec); + + /* This could not be done before page_zip_dir_find(). */ + page_header_set_field(page, page_zip, PAGE_N_RECS, + (ulint)(page_get_n_recs(page) - 1)); + + if (UNIV_UNLIKELY(!free)) { + /* Make the last slot the start of the free list. */ + slot_free = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE + * (page_dir_get_n_heap(page_zip->data) + - PAGE_HEAP_NO_USER_LOW); + } else { + slot_free = page_zip_dir_find_free(page_zip, + page_offset(free)); + ut_a(slot_free < slot_rec); + /* Grow the free list by one slot by moving the start. */ + slot_free += PAGE_ZIP_DIR_SLOT_SIZE; + } + + if (UNIV_LIKELY(slot_rec > slot_free)) { + memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, + slot_free, + slot_rec - slot_free); + } + + /* Write the entry for the deleted record. + The "owned" and "deleted" flags will be cleared. */ + mach_write_to_2(slot_free, page_offset(rec)); + + if (!page_is_leaf(page) || !dict_index_is_clust(index)) { + ut_ad(!rec_offs_any_extern(offsets)); + goto skip_blobs; + } + + n_ext = rec_offs_n_extern(offsets); + if (UNIV_UNLIKELY(n_ext)) { + /* Shift and zero fill the array of BLOB pointers. */ + ulint blob_no; + byte* externs; + byte* ext_end; + + blob_no = page_zip_get_n_prev_extern(page_zip, rec, index); + ut_a(blob_no + n_ext <= page_zip->n_blobs); + + externs = page_zip->data + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) + * (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + ext_end = externs - page_zip->n_blobs + * BTR_EXTERN_FIELD_REF_SIZE; + externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE; + + page_zip->n_blobs -= n_ext; + /* Shift and zero fill the array. */ + memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end, + (page_zip->n_blobs - blob_no) + * BTR_EXTERN_FIELD_REF_SIZE); + memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE); + } + +skip_blobs: + /* The compression algorithm expects info_bits and n_owned + to be 0 for deleted records. 
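+(In the new-style COMPACT format both fields share the single byte at
+rec[-REC_N_NEW_EXTRA_BYTES], so the one assignment below resets them
+together.)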
*/ + rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */ + + page_zip_clear_rec(page_zip, rec, index, offsets); +} + +/************************************************************************** +Add a slot to the dense page directory. */ +UNIV_INTERN +void +page_zip_dir_add_slot( +/*==================*/ + page_zip_des_t* page_zip, /* in/out: compressed page */ + ulint is_clustered) /* in: nonzero for clustered index, + zero for others */ +{ + ulint n_dense; + byte* dir; + byte* stored; + + ut_ad(page_is_comp(page_zip->data)); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Read the old n_dense (n_heap has already been incremented). */ + n_dense = page_dir_get_n_heap(page_zip->data) + - (PAGE_HEAP_NO_USER_LOW + 1); + + dir = page_zip->data + page_zip_get_size(page_zip) + - PAGE_ZIP_DIR_SLOT_SIZE * n_dense; + + if (!page_is_leaf(page_zip->data)) { + ut_ad(!page_zip->n_blobs); + stored = dir - n_dense * REC_NODE_PTR_SIZE; + } else if (UNIV_UNLIKELY(is_clustered)) { + /* Move the BLOB pointer array backwards to make space for the + roll_ptr and trx_id columns and the dense directory slot. */ + byte* externs; + + stored = dir - n_dense + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + externs = stored + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(externs + - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memmove(externs - (PAGE_ZIP_DIR_SLOT_SIZE + + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + externs, stored - externs); + } else { + stored = dir + - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE; + ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE, + PAGE_ZIP_DIR_SLOT_SIZE); + } + + /* Move the uncompressed area backwards to make space + for one directory slot. */ + memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored); +} + +/*************************************************************** +Parses a log record of writing to the header of a page. */ +UNIV_INTERN +byte* +page_zip_parse_write_header( +/*========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: redo log buffer */ + byte* end_ptr,/* in: redo log buffer end */ + page_t* page, /* in/out: uncompressed page */ + page_zip_des_t* page_zip)/* in/out: compressed page */ +{ + ulint offset; + ulint len; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) { + + return(NULL); + } + + offset = (ulint) *ptr++; + len = (ulint) *ptr++; + + if (UNIV_UNLIKELY(!len) || UNIV_UNLIKELY(offset + len >= PAGE_DATA)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + if (UNIV_UNLIKELY(end_ptr < ptr + len)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip)) { + + goto corrupt; + } +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + memcpy(page + offset, ptr, len); + memcpy(page_zip->data + offset, ptr, len); + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + } + + return(ptr + len); +} + +/************************************************************************** +Write a log record of writing to the uncompressed header portion of a page. 
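+It is preceded here by an illustrative sketch of the
+MLOG_ZIP_WRITE_HEADER payload. */
+
+/* A minimal sketch, not part of the original code: the payload that
+page_zip_parse_write_header() above consumes, and that the logging
+function below produces, is one byte of page-header offset
+(< PAGE_DATA), one byte of length (nonzero, with offset + length
+< PAGE_DATA), followed by that many data bytes.  The helper name is
+hypothetical. */
+__attribute__((unused)) static
+byte*
+page_zip_sketch_make_header_log(
+/*============================*/
+				/* out: first byte after the payload */
+	byte*		buf,	/* in/out: payload buffer */
+	ulint		offset,	/* in: offset within the page header */
+	const byte*	data,	/* in: new header bytes */
+	ulint		len)	/* in: length of data; 0 < len < 256 */
+{
+	*buf++ = (byte) offset;
+	*buf++ = (byte) len;
+	memcpy(buf, data, len);
+
+	return(buf + len);
+}
+
+/**************************************************************************
+Write a log record of writing to the uncompressed header portion of a page.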
*/ +UNIV_INTERN +void +page_zip_write_header_log( +/*======================*/ + const byte* data, /* in: data on the uncompressed page */ + ulint length, /* in: length of the data */ + mtr_t* mtr) /* in: mini-transaction */ +{ + byte* log_ptr = mlog_open(mtr, 11 + 1 + 1); + ulint offset = page_offset(data); + + ut_ad(offset < PAGE_DATA); + ut_ad(offset + length < PAGE_DATA); +#if PAGE_DATA > 255 +# error "PAGE_DATA > 255" +#endif + ut_ad(length < 256); + + /* If no logging is requested, we may return now */ + if (UNIV_UNLIKELY(!log_ptr)) { + + return; + } + + log_ptr = mlog_write_initial_log_record_fast( + (byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr); + *log_ptr++ = (byte) offset; + *log_ptr++ = (byte) length; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, data, length); +} + +/************************************************************************** +Reorganize and compress a page. This is a low-level operation for +compressed pages, to be used when page_zip_compress() fails. +On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. */ +UNIV_INTERN +ibool +page_zip_reorganize( +/*================*/ + /* out: TRUE on success, FALSE on failure; + page and page_zip will be left intact + on failure. */ + buf_block_t* block, /* in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /* in: index of the B-tree node */ + mtr_t* mtr) /* in: mini-transaction */ +{ + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + buf_block_t* temp_block; + page_t* temp_page; + ulint log_mode; + + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_is_comp(page)); + /* Note that page_zip_validate(page_zip, page) may fail here. */ + UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); + + /* Disable logging */ + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + + temp_block = buf_block_alloc(0); + temp_page = temp_block->frame; + + btr_search_drop_page_hash_index(block); + + /* Copy the old page to temporary space */ + buf_frame_copy(temp_page, page); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + page_create(block, mtr, TRUE); + block->check_index_page_at_flush = TRUE; + + /* Copy the records from the temporary space to the recreated page; + do not copy the lock bits yet */ + + page_copy_rec_list_end_no_locks(block, temp_block, + page_get_infimum_rec(temp_page), + index, mtr); + /* Copy max trx id to recreated page */ + page_set_max_trx_id(block, NULL, page_get_max_trx_id(temp_page)); + + /* Restore logging. */ + mtr_set_log_mode(mtr, log_mode); + + if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) { + + /* Restore the old page and exit. */ + buf_frame_copy(page, temp_page); + + buf_block_free(temp_block); + return(FALSE); + } + + lock_move_reorganize_page(block, temp_block); + + buf_block_free(temp_block); + return(TRUE); +} + +/************************************************************************** +Copy the records of a page byte for byte. 
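+It is preceded here by an illustrative sketch of the recompression
+fallback that callers build around page_zip_reorganize(). */
+
+/* A minimal sketch, not part of the original code, of a hypothetical
+caller: after an in-place modification fails to compress, rebuild the
+page in a tidier layout and recompress; if even that fails, the caller
+must split the B-tree page instead.  For secondary index leaf pages
+the caller must also update the insert buffer free bits, as noted
+above. */
+__attribute__((unused)) static
+ibool
+page_zip_sketch_recompress(
+/*=======================*/
+				/* out: TRUE on success */
+	buf_block_t*	block,	/* in/out: B-tree page */
+	dict_index_t*	index,	/* in: index of the page */
+	mtr_t*		mtr)	/* in: mini-transaction */
+{
+	if (page_zip_compress(buf_block_get_page_zip(block),
+			      buf_block_get_frame(block), index, mtr)) {
+
+		return(TRUE);
+	}
+
+	return(page_zip_reorganize(block, index, mtr));
+}
+
+/**************************************************************************
+Copy the records of a page byte for byte.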
Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. */ +UNIV_INTERN +void +page_zip_copy_recs( +/*===============*/ + page_zip_des_t* page_zip, /* out: copy of src_zip + (n_blobs, m_start, m_end, + m_nonempty, data[0..size-1]) */ + page_t* page, /* out: copy of src */ + const page_zip_des_t* src_zip, /* in: compressed page */ + const page_t* src, /* in: page */ + dict_index_t* index, /* in: index of the B-tree */ + mtr_t* mtr) /* in: mini-transaction */ +{ + ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + /* The B-tree operations that call this function may set + FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag + mismatch. A strict page_zip_validate() will be executed later + during the B-tree operations. */ + ut_a(page_zip_validate_low(src_zip, src, TRUE)); +#endif /* UNIV_ZIP_DEBUG */ + ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip)); + if (UNIV_UNLIKELY(src_zip->n_blobs)) { + ut_a(page_is_leaf(src)); + ut_a(dict_index_is_clust(index)); + } + + UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip)); + UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE); + UNIV_MEM_ASSERT_RW(src_zip->data, page_zip_get_size(page_zip)); + + /* Copy those B-tree page header fields that are related to + the records stored in the page. Also copy the field + PAGE_MAX_TRX_ID. Skip the rest of the page header and + trailer. On the compressed page, there is no trailer. */ +#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END +# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END" +#endif + memcpy(PAGE_HEADER + page, PAGE_HEADER + src, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page, PAGE_DATA + src, + UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END); + memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data, + PAGE_HEADER_PRIV_END); + memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data, + page_zip_get_size(page_zip) - PAGE_DATA); + + /* Copy all fields of src_zip to page_zip, except the pointer + to the compressed data page. */ + { + page_zip_t* data = page_zip->data; + memcpy(page_zip, src_zip, sizeof *page_zip); + page_zip->data = data; + } + ut_ad(page_zip_get_trailer_len(page_zip, + dict_index_is_clust(index), NULL) + + page_zip->m_end < page_zip_get_size(page_zip)); + + if (!page_is_leaf(src) + && UNIV_UNLIKELY(mach_read_from_4(src + FIL_PAGE_PREV) == FIL_NULL) + && UNIV_LIKELY(mach_read_from_4(page + + FIL_PAGE_PREV) != FIL_NULL)) { + /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */ + ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM, + TRUE); + if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) { + rec_t* rec = page + offs; + ut_a(rec[-REC_N_NEW_EXTRA_BYTES] + & REC_INFO_MIN_REC_FLAG); + rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG; + } + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page)); +#endif /* UNIV_ZIP_DEBUG */ + + page_zip_compress_write_log(page_zip, page, index, mtr); +} + +/************************************************************************** +Parses a log record of compressing an index page. 
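+It is preceded here by an illustrative sketch of the record layout. */
+
+/* A minimal sketch, not part of the original code: the body of an
+MLOG_ZIP_PAGE_COMPRESS record, as parsed below, consists of 2 bytes of
+compressed data size, 2 bytes of trailer size, 4 + 4 bytes of
+FIL_PAGE_PREV and FIL_PAGE_NEXT, 'size' bytes starting at
+FIL_PAGE_TYPE, and 'trailer_size' bytes that land at the very end of
+the compressed page; the gap in between is zero-filled on recovery.
+The helper name is hypothetical. */
+__attribute__((unused)) static
+ulint
+page_zip_sketch_compress_log_len(
+/*=============================*/
+				/* out: total length of the record body */
+	const byte*	ptr)	/* in: start of the record body */
+{
+	return(2 + 2 + 8
+	       + mach_read_from_2(ptr) + mach_read_from_2(ptr + 2));
+}
+
+/**************************************************************************
+Parses a log record of compressing an index page.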
*/ +UNIV_INTERN +byte* +page_zip_parse_compress( +/*====================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* out: uncompressed page */ + page_zip_des_t* page_zip)/* out: compressed page */ +{ + ulint size; + ulint trailer_size; + + ut_ad(ptr && end_ptr); + ut_ad(!page == !page_zip); + + if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) { + + return(NULL); + } + + size = mach_read_from_2(ptr); + ptr += 2; + trailer_size = mach_read_from_2(ptr); + ptr += 2; + + if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) { + + return(NULL); + } + + if (page) { + if (UNIV_UNLIKELY(!page_zip) + || UNIV_UNLIKELY(page_zip_get_size(page_zip) < size)) { +corrupt: + recv_sys->found_corrupt_log = TRUE; + + return(NULL); + } + + memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4); + memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4); + memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size); + memset(page_zip->data + FIL_PAGE_TYPE + size, 0, + page_zip_get_size(page_zip) - trailer_size + - (FIL_PAGE_TYPE + size)); + memcpy(page_zip->data + page_zip_get_size(page_zip) + - trailer_size, ptr + 8 + size, trailer_size); + + if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, page))) { + + goto corrupt; + } + } + + return(ptr + 8 + size + trailer_size); +} + +/************************************************************************** +Calculate the compressed page checksum. */ +UNIV_INTERN +ulint +page_zip_calc_checksum( +/*===================*/ + /* out: page checksum */ + const void* data, /* in: compressed page */ + ulint size) /* in: size of compressed page */ +{ + /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN, + and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */ + + const Bytef* s = data; + uLong adler; + + ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + adler = adler32(0L, s + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - FIL_PAGE_OFFSET); + adler = adler32(adler, s + FIL_PAGE_TYPE, 2); + adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + return((ulint) adler); +} diff --git a/storage/xtradb/pars/lexyy.c b/storage/xtradb/pars/lexyy.c new file mode 100644 index 00000000000..489752a1900 --- /dev/null +++ b/storage/xtradb/pars/lexyy.c @@ -0,0 +1,2780 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+#include "univ.i"
+#line 2 "lexyy.c"
+
+#line 4 "lexyy.c"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 31
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+#endif /* ! C99 */
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else /* ! __cplusplus */
+
+#if __STDC__
+
+#define YY_USE_CONST
+
+#endif /* __STDC__ */
+#endif /* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index. If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN (yy_start) = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START (((yy_start) - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE yyrestart(yyin )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer.
*/ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +static int yyleng; + +static FILE *yyin, *yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +/* The following is because we cannot portably get our hands on size_t + * (without autoconf's help, which isn't available because we want + * flex-generated scanners to compile on their own). + */ + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef unsigned int yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? 
(yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when yytext is formed. */ +static char yy_hold_char; +static int yy_n_chars; /* number of characters read into yy_ch_buf */ +static int yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 1; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow yywrap()'s to do buffer switches + * instead of setting up a fresh yyin. A bit of a hack ... + */ +static int yy_did_buffer_switch_on_eof; + +static void yyrestart (FILE *input_file ); +__attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +static YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ); +static void yy_delete_buffer (YY_BUFFER_STATE b ); +static void yy_flush_buffer (YY_BUFFER_STATE b ); +__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +__attribute__((unused)) static void yypop_buffer_state (void ); + +static void yyensure_buffer_stack (void ); +static void yy_load_buffer_state (void ); +static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len ); + +static void *yyalloc (yy_size_t ); +static void *yyrealloc (void *,yy_size_t ); +static void yyfree (void * ); + +#define yy_new_buffer yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define yywrap(n) 1 +#define YY_SKIP_YYWRAP + +typedef unsigned char YY_CHAR; + +static FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; + +typedef int yy_state_type; + +static int yylineno; + +static int yylineno = 1; + +static char *yytext; +#define yytext_ptr yytext + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + yyleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 119 +#define YY_END_OF_BUFFER 120 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[399] = + { 0, + 0, 0, 114, 114, 0, 0, 0, 0, 120, 118, + 117, 117, 8, 118, 109, 5, 98, 104, 107, 105, + 102, 106, 118, 108, 1, 118, 103, 101, 99, 100, + 112, 92, 92, 92, 92, 92, 92, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, + 110, 111, 114, 115, 6, 7, 9, 10, 117, 4, + 93, 113, 2, 1, 3, 94, 95, 97, 96, 92, + 92, 92, 92, 92, 92, 44, 92, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, + 92, 92, 28, 17, 25, 92, 92, 92, 92, 92, + + 54, 61, 92, 14, 92, 92, 92, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, + 92, 92, 114, 115, 115, 116, 6, 7, 9, 10, + 2, 13, 45, 92, 92, 92, 92, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, + 92, 27, 92, 92, 92, 41, 92, 92, 92, 92, + 21, 92, 92, 92, 92, 15, 92, 92, 92, 18, + 92, 92, 92, 92, 92, 80, 92, 92, 92, 51, + 92, 12, 92, 36, 92, 92, 92, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 92, 92, 20, 24, + + 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, + 46, 92, 92, 30, 92, 87, 92, 92, 39, 92, + 92, 92, 92, 92, 48, 92, 89, 32, 91, 92, + 11, 64, 92, 92, 92, 42, 92, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 29, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 85, 92, 26, 92, + 66, 92, 92, 92, 37, 92, 92, 92, 92, 92, + 92, 92, 31, 65, 23, 92, 57, 92, 75, 92, + 92, 92, 43, 92, 92, 92, 92, 92, 92, 92, + 92, 90, 92, 92, 56, 92, 92, 92, 92, 92, + + 92, 92, 40, 33, 79, 19, 92, 83, 74, 55, + 92, 63, 92, 52, 92, 92, 92, 47, 92, 76, + 92, 78, 92, 92, 34, 92, 92, 92, 35, 72, + 92, 92, 92, 92, 58, 92, 50, 49, 92, 92, + 53, 62, 92, 92, 92, 22, 92, 92, 73, 81, + 92, 92, 77, 92, 68, 92, 92, 92, 92, 38, + 92, 88, 67, 92, 84, 92, 92, 92, 86, 92, + 59, 92, 16, 92, 70, 69, 92, 92, 82, 92, + 92, 92, 92, 92, 92, 92, 92, 92, 92, 71, + 92, 92, 92, 92, 92, 92, 60, 0 + + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 4, 1, 5, 6, 1, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 17, 18, 19, + 20, 21, 22, 1, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 32, + 1, 1, 1, 1, 48, 1, 32, 32, 32, 32, + + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 49, 1, 50, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[51] = + { 0, + 1, 1, 1, 2, 1, 1, 3, 1, 1, 4, + 1, 1, 1, 1, 1, 5, 1, 1, 1, 6, + 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 1, 1 + } ; + +static yyconst flex_int16_t yy_base[409] = + { 0, + 0, 0, 437, 436, 438, 437, 439, 438, 441, 448, + 49, 51, 448, 0, 448, 448, 448, 448, 448, 448, + 448, 448, 426, 429, 41, 418, 448, 38, 448, 417, + 448, 20, 33, 32, 46, 40, 44, 0, 54, 52, + 399, 48, 60, 395, 65, 67, 81, 27, 411, 75, + 448, 448, 0, 98, 0, 426, 0, 428, 113, 0, + 448, 448, 415, 54, 410, 448, 448, 448, 448, 0, + 403, 68, 399, 391, 389, 0, 402, 80, 84, 397, + 383, 96, 381, 394, 379, 393, 387, 375, 379, 375, + 377, 377, 0, 98, 0, 376, 97, 385, 368, 375, + + 0, 0, 381, 381, 364, 94, 
103, 379, 98, 65, + 381, 369, 109, 361, 377, 373, 351, 97, 372, 363, + 115, 356, 0, 137, 138, 448, 0, 388, 0, 390, + 377, 0, 0, 365, 360, 367, 365, 348, 346, 345, + 350, 359, 347, 359, 95, 347, 353, 354, 336, 336, + 123, 0, 334, 350, 351, 0, 338, 347, 344, 122, + 124, 341, 336, 330, 340, 338, 331, 328, 336, 0, + 326, 336, 334, 325, 315, 309, 322, 307, 327, 0, + 313, 0, 311, 0, 325, 316, 313, 131, 309, 316, + 323, 302, 304, 309, 309, 301, 304, 299, 0, 0, + + 311, 295, 305, 312, 292, 291, 305, 294, 307, 287, + 0, 297, 279, 0, 298, 0, 295, 282, 0, 281, + 276, 281, 280, 290, 0, 276, 0, 0, 0, 280, + 0, 0, 276, 273, 287, 0, 272, 272, 270, 286, + 271, 283, 280, 264, 282, 277, 0, 272, 272, 258, + 257, 270, 256, 270, 269, 268, 0, 252, 0, 246, + 0, 265, 249, 248, 0, 262, 252, 247, 246, 258, + 248, 247, 0, 0, 0, 251, 0, 239, 0, 253, + 249, 235, 0, 249, 250, 233, 238, 231, 249, 231, + 228, 0, 229, 226, 0, 231, 243, 230, 237, 227, + + 235, 220, 0, 0, 0, 212, 219, 0, 0, 0, + 216, 0, 230, 0, 231, 218, 217, 0, 213, 0, + 216, 0, 208, 210, 0, 209, 223, 216, 0, 0, + 219, 222, 204, 219, 0, 215, 0, 0, 199, 213, + 0, 0, 197, 196, 201, 0, 210, 195, 0, 0, + 201, 197, 0, 192, 0, 204, 204, 192, 202, 0, + 179, 0, 0, 199, 0, 183, 177, 183, 0, 174, + 0, 193, 0, 192, 0, 0, 183, 187, 0, 174, + 174, 180, 166, 189, 181, 180, 166, 151, 118, 0, + 130, 136, 127, 123, 119, 111, 0, 448, 167, 173, + + 179, 152, 181, 124, 187, 193, 199, 205 + } ; + +static yyconst flex_int16_t yy_def[409] = + { 0, + 398, 1, 399, 399, 400, 400, 401, 401, 398, 398, + 398, 398, 398, 402, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 403, 398, 398, 398, 398, + 398, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 398, 398, 405, 406, 407, 398, 408, 398, 398, 402, + 398, 398, 398, 398, 403, 398, 398, 398, 398, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 405, 406, 406, 398, 407, 398, 408, 398, + 398, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 404, 404, 404, + 404, 404, 404, 404, 404, 404, 404, 0, 
398, 398, + + 398, 398, 398, 398, 398, 398, 398, 398 + } ; + +static yyconst flex_int16_t yy_nxt[499] = + { 0, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 38, + 39, 38, 38, 40, 41, 42, 43, 44, 38, 45, + 46, 47, 48, 49, 50, 38, 38, 38, 51, 52, + 59, 59, 59, 59, 63, 71, 64, 67, 68, 73, + 72, 77, 118, 74, 119, 78, 75, 63, 79, 64, + 88, 80, 82, 85, 81, 86, 83, 89, 96, 76, + 90, 93, 84, 91, 99, 87, 92, 101, 97, 94, + 100, 107, 133, 110, 95, 102, 111, 103, 179, 104, + + 108, 109, 105, 115, 121, 112, 180, 125, 134, 113, + 116, 122, 126, 114, 59, 59, 139, 117, 141, 142, + 146, 163, 140, 159, 171, 173, 143, 189, 70, 147, + 172, 177, 183, 164, 207, 208, 148, 190, 160, 161, + 174, 193, 178, 184, 175, 194, 398, 125, 222, 214, + 224, 398, 126, 215, 248, 249, 60, 397, 396, 395, + 225, 394, 393, 223, 392, 391, 250, 53, 53, 53, + 53, 53, 53, 55, 55, 55, 55, 55, 55, 57, + 57, 57, 57, 57, 57, 65, 65, 123, 123, 123, + 390, 123, 123, 124, 124, 124, 124, 124, 124, 127, + + 127, 389, 127, 127, 127, 129, 388, 129, 129, 129, + 129, 387, 386, 385, 384, 383, 382, 381, 380, 379, + 378, 377, 376, 375, 374, 373, 372, 371, 370, 369, + 368, 367, 366, 365, 364, 363, 362, 361, 360, 359, + 358, 357, 356, 355, 354, 353, 352, 351, 350, 349, + 348, 347, 346, 345, 344, 343, 342, 341, 340, 339, + 338, 337, 336, 335, 334, 333, 332, 331, 330, 329, + 328, 327, 326, 325, 324, 323, 322, 321, 320, 319, + 318, 317, 316, 315, 314, 313, 312, 311, 310, 309, + 308, 307, 306, 305, 304, 303, 302, 301, 300, 299, + + 298, 297, 296, 295, 294, 293, 292, 291, 290, 289, + 288, 287, 286, 285, 284, 283, 282, 281, 280, 279, + 278, 277, 276, 275, 274, 273, 272, 271, 270, 269, + 268, 267, 266, 265, 264, 263, 262, 261, 260, 259, + 258, 257, 256, 255, 254, 253, 252, 251, 247, 246, + 245, 244, 243, 242, 241, 240, 239, 238, 237, 236, + 235, 234, 233, 232, 231, 230, 229, 228, 227, 226, + 221, 220, 219, 218, 217, 216, 213, 212, 211, 210, + 209, 206, 205, 204, 203, 202, 201, 200, 199, 198, + 197, 196, 131, 130, 128, 195, 192, 191, 188, 187, + + 186, 185, 182, 181, 176, 170, 169, 168, 167, 166, + 165, 162, 158, 157, 156, 155, 154, 153, 152, 151, + 150, 149, 145, 144, 138, 137, 136, 135, 132, 398, + 131, 130, 128, 120, 106, 98, 69, 66, 62, 61, + 398, 58, 58, 56, 56, 54, 54, 9, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398 + + } ; + +static yyconst flex_int16_t yy_chk[499] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 11, 11, 12, 12, 25, 32, 25, 28, 28, 33, + 32, 34, 48, 33, 48, 34, 33, 64, 34, 64, + 37, 34, 35, 36, 34, 36, 35, 37, 40, 33, + 37, 39, 35, 37, 42, 36, 37, 43, 40, 39, + 42, 45, 72, 46, 39, 43, 46, 43, 110, 43, + + 45, 45, 43, 47, 50, 46, 110, 54, 72, 46, + 47, 50, 54, 46, 59, 59, 78, 47, 79, 79, + 82, 97, 78, 94, 106, 107, 79, 118, 404, 82, + 106, 109, 113, 97, 145, 145, 82, 118, 94, 94, + 107, 121, 109, 113, 107, 121, 124, 125, 160, 151, + 161, 124, 125, 151, 188, 188, 402, 396, 395, 394, + 161, 393, 392, 160, 391, 389, 188, 399, 399, 399, + 399, 399, 399, 400, 400, 400, 400, 400, 400, 401, + 401, 401, 401, 401, 401, 403, 403, 405, 405, 405, + 388, 405, 405, 406, 406, 406, 406, 406, 406, 407, + + 407, 387, 407, 407, 407, 408, 386, 408, 
408, 408, + 408, 385, 384, 383, 382, 381, 380, 378, 377, 374, + 372, 370, 368, 367, 366, 364, 361, 359, 358, 357, + 356, 354, 352, 351, 348, 347, 345, 344, 343, 340, + 339, 336, 334, 333, 332, 331, 328, 327, 326, 324, + 323, 321, 319, 317, 316, 315, 313, 311, 307, 306, + 302, 301, 300, 299, 298, 297, 296, 294, 293, 291, + 290, 289, 288, 287, 286, 285, 284, 282, 281, 280, + 278, 276, 272, 271, 270, 269, 268, 267, 266, 264, + 263, 262, 260, 258, 256, 255, 254, 253, 252, 251, + + 250, 249, 248, 246, 245, 244, 243, 242, 241, 240, + 239, 238, 237, 235, 234, 233, 230, 226, 224, 223, + 222, 221, 220, 218, 217, 215, 213, 212, 210, 209, + 208, 207, 206, 205, 204, 203, 202, 201, 198, 197, + 196, 195, 194, 193, 192, 191, 190, 189, 187, 186, + 185, 183, 181, 179, 178, 177, 176, 175, 174, 173, + 172, 171, 169, 168, 167, 166, 165, 164, 163, 162, + 159, 158, 157, 155, 154, 153, 150, 149, 148, 147, + 146, 144, 143, 142, 141, 140, 139, 138, 137, 136, + 135, 134, 131, 130, 128, 122, 120, 119, 117, 116, + + 115, 114, 112, 111, 108, 105, 104, 103, 100, 99, + 98, 96, 92, 91, 90, 89, 88, 87, 86, 85, + 84, 83, 81, 80, 77, 75, 74, 73, 71, 65, + 63, 58, 56, 49, 44, 41, 30, 26, 24, 23, + 9, 8, 7, 6, 5, 4, 3, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398, 398, 398, + 398, 398, 398, 398, 398, 398, 398, 398 + + } ; + +static yy_state_type yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +static int yy_flex_debug; +static int yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +static char *yytext; +#line 1 "pars0lex.l" +/****************************************************** +SQL parser lexical analyzer: input file for the GNU Flex lexer generator + +(c) 1997 Innobase Oy + +Created 12/14/1997 Heikki Tuuri +Published under the GPL version 2 + +The InnoDB parser is frozen because MySQL takes care of SQL parsing. +Therefore we normally keep the InnoDB parser C files as they are, and do +not automatically generate them from pars0grm.y and pars0lex.l. + +How to make the InnoDB parser and lexer C files: + +1. Run ./make_flex.sh to generate lexer files. + +2. Run ./make_bison.sh to generate parser files. + +These instructions seem to work at least with bison-1.875d and flex-2.5.31 on +Linux. +*******************************************************/ +#define YY_NO_INPUT 1 +#define YY_NO_UNISTD_H 1 +#line 38 "pars0lex.l" +#define YYSTYPE que_node_t* + +#include "univ.i" +#include "pars0pars.h" +#include "pars0grm.h" +#include "pars0sym.h" +#include "mem0mem.h" +#include "os0proc.h" + +#define malloc(A) ut_malloc(A) +#define free(A) ut_free(A) +#define realloc(P, A) ut_realloc(P, A) +#define exit(A) ut_error + +#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size) + +/* String buffer for removing quotes */ +static ulint stringbuf_len_alloc = 0; /* Allocated length */ +static ulint stringbuf_len = 0; /* Current length */ +static char* stringbuf; /* Start of buffer */ +/* Appends a string to the buffer. 
*/
+static
+void
+string_append(
+/*==========*/
+	const char*	str,	/* in: string to be appended */
+	ulint		len)	/* in: length of the string */
+{
+	if (stringbuf == NULL) {
+		stringbuf = malloc(1);
+		stringbuf_len_alloc = 1;
+	}
+
+	if (stringbuf_len + len > stringbuf_len_alloc) {
+		while (stringbuf_len + len > stringbuf_len_alloc) {
+			stringbuf_len_alloc <<= 1;
+		}
+		stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+	}
+
+	memcpy(stringbuf + stringbuf_len, str, len);
+	stringbuf_len += len;
+}
+
+
+
+
+#line 759 "lexyy.c"
+
+#define INITIAL 0
+#define comment 1
+#define quoted 2
+#define id 3
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap (void );
+#else
+extern int yywrap (void );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int );
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * );
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (void );
+#else
+static int input (void );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#define YY_READ_BUF_SIZE 8192
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO (void) fwrite( yytext, yyleng, 1, yyout )
+#endif
+
+/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL,
+ * is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+		{ \
+		int c = '*'; \
+		size_t n; \
+		for ( n = 0; n < max_size && \
+			     (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+			buf[n] = (char) c; \
+		if ( c == '\n' ) \
+			buf[n++] = (char) c; \
+		if ( c == EOF && ferror( yyin ) ) \
+			YY_FATAL_ERROR( "input in flex scanner failed" ); \
+		result = n; \
+		} \
+	else \
+		{ \
+		errno=0; \
+		while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+			{ \
+			if( errno != EINTR) \
+				{ \
+				YY_FATAL_ERROR( "input in flex scanner failed" ); \
+				break; \
+				} \
+			errno=0; \
+			clearerr(yyin); \
+			} \
+		}\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg )
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +UNIV_INTERN int yylex (void); + +#define YY_DECL UNIV_INTERN int yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + +#line 92 "pars0lex.l" + + +#line 914 "lexyy.c" + + if ( (yy_init) ) + { + (yy_init) = 0; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_load_buffer_state( ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 399 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_current_state != 398 ); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 94 "pars0lex.l" +{ + yylval = sym_tab_add_int_lit(pars_sym_tab_global, + atoi(yytext)); + return(PARS_INT_LIT); +} + YY_BREAK +case 2: +YY_RULE_SETUP +#line 100 "pars0lex.l" +{ + ut_error; /* not implemented */ + + return(PARS_FLOAT_LIT); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 106 "pars0lex.l" +{ + ulint type; + + yylval = sym_tab_add_bound_lit(pars_sym_tab_global, + yytext + 1, &type); + + return((int) type); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 115 "pars0lex.l" +{ + yylval = sym_tab_add_bound_id(pars_sym_tab_global, + yytext + 1); + + return(PARS_ID_TOKEN); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 122 "pars0lex.l" +{ +/* Quoted character string literals are handled in an explicit +start state 'quoted'. This state is entered and the buffer for +the scanned string is emptied upon encountering a starting quote. + +In the state 'quoted', only two actions are possible (defined below). 
*/ + BEGIN(quoted); + stringbuf_len = 0; +} + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 131 "pars0lex.l" +{ + /* Got a sequence of characters other than "'": + append to string buffer */ + string_append(yytext, yyleng); +} + YY_BREAK +case 7: +YY_RULE_SETUP +#line 136 "pars0lex.l" +{ + /* Got a sequence of "'" characters: + append half of them to string buffer, + as "''" represents a single "'". + We apply truncating division, + so that "'''" will result in "'". */ + + string_append(yytext, yyleng / 2); + + /* If we got an odd number of quotes, then the + last quote we got is the terminating quote. + At the end of the string, we return to the + initial start state and report the scanned + string literal. */ + + if (yyleng % 2) { + BEGIN(INITIAL); + yylval = sym_tab_add_str_lit( + pars_sym_tab_global, + (byte*) stringbuf, stringbuf_len); + return(PARS_STR_LIT); + } +} + YY_BREAK +case 8: +YY_RULE_SETUP +#line 160 "pars0lex.l" +{ +/* Quoted identifiers are handled in an explicit start state 'id'. +This state is entered and the buffer for the scanned string is emptied +upon encountering a starting quote. + +In the state 'id', only two actions are possible (defined below). */ + BEGIN(id); + stringbuf_len = 0; +} + YY_BREAK +case 9: +/* rule 9 can match eol */ +YY_RULE_SETUP +#line 169 "pars0lex.l" +{ + /* Got a sequence of characters other than '"': + append to string buffer */ + string_append(yytext, yyleng); +} + YY_BREAK +case 10: +YY_RULE_SETUP +#line 174 "pars0lex.l" +{ + /* Got a sequence of '"' characters: + append half of them to string buffer, + as '""' represents a single '"'. + We apply truncating division, + so that '"""' will result in '"'. */ + + string_append(yytext, yyleng / 2); + + /* If we got an odd number of quotes, then the + last quote we got is the terminating quote. + At the end of the string, we return to the + initial start state and report the scanned + identifier. 
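+
+	Worked example (an editorial illustration, not part of the
+	original comment): in the input "a""b", the two adjacent
+	inner quotes form one match of length 2, so one '"' is
+	appended; the final lone quote is a match of odd length,
+	which terminates the identifier, yielding a"b.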
*/ + + if (yyleng % 2) { + BEGIN(INITIAL); + yylval = sym_tab_add_id( + pars_sym_tab_global, + (byte*) stringbuf, stringbuf_len); + + return(PARS_ID_TOKEN); + } +} + YY_BREAK +case 11: +YY_RULE_SETUP +#line 199 "pars0lex.l" +{ + yylval = sym_tab_add_null_lit(pars_sym_tab_global); + + return(PARS_NULL_LIT); +} + YY_BREAK +case 12: +YY_RULE_SETUP +#line 205 "pars0lex.l" +{ + /* Implicit cursor name */ + yylval = sym_tab_add_str_lit(pars_sym_tab_global, + (byte*) yytext, yyleng); + return(PARS_SQL_TOKEN); +} + YY_BREAK +case 13: +YY_RULE_SETUP +#line 212 "pars0lex.l" +{ + return(PARS_AND_TOKEN); +} + YY_BREAK +case 14: +YY_RULE_SETUP +#line 216 "pars0lex.l" +{ + return(PARS_OR_TOKEN); +} + YY_BREAK +case 15: +YY_RULE_SETUP +#line 220 "pars0lex.l" +{ + return(PARS_NOT_TOKEN); +} + YY_BREAK +case 16: +YY_RULE_SETUP +#line 224 "pars0lex.l" +{ + return(PARS_PROCEDURE_TOKEN); +} + YY_BREAK +case 17: +YY_RULE_SETUP +#line 228 "pars0lex.l" +{ + return(PARS_IN_TOKEN); +} + YY_BREAK +case 18: +YY_RULE_SETUP +#line 232 "pars0lex.l" +{ + return(PARS_OUT_TOKEN); +} + YY_BREAK +case 19: +YY_RULE_SETUP +#line 236 "pars0lex.l" +{ + return(PARS_BINARY_TOKEN); +} + YY_BREAK +case 20: +YY_RULE_SETUP +#line 240 "pars0lex.l" +{ + return(PARS_BLOB_TOKEN); +} + YY_BREAK +case 21: +YY_RULE_SETUP +#line 244 "pars0lex.l" +{ + return(PARS_INT_TOKEN); +} + YY_BREAK +case 22: +YY_RULE_SETUP +#line 248 "pars0lex.l" +{ + return(PARS_INT_TOKEN); +} + YY_BREAK +case 23: +YY_RULE_SETUP +#line 252 "pars0lex.l" +{ + return(PARS_FLOAT_TOKEN); +} + YY_BREAK +case 24: +YY_RULE_SETUP +#line 256 "pars0lex.l" +{ + return(PARS_CHAR_TOKEN); +} + YY_BREAK +case 25: +YY_RULE_SETUP +#line 260 "pars0lex.l" +{ + return(PARS_IS_TOKEN); +} + YY_BREAK +case 26: +YY_RULE_SETUP +#line 264 "pars0lex.l" +{ + return(PARS_BEGIN_TOKEN); +} + YY_BREAK +case 27: +YY_RULE_SETUP +#line 268 "pars0lex.l" +{ + return(PARS_END_TOKEN); +} + YY_BREAK +case 28: +YY_RULE_SETUP +#line 272 "pars0lex.l" +{ + return(PARS_IF_TOKEN); +} + YY_BREAK +case 29: +YY_RULE_SETUP +#line 276 "pars0lex.l" +{ + return(PARS_THEN_TOKEN); +} + YY_BREAK +case 30: +YY_RULE_SETUP +#line 280 "pars0lex.l" +{ + return(PARS_ELSE_TOKEN); +} + YY_BREAK +case 31: +YY_RULE_SETUP +#line 284 "pars0lex.l" +{ + return(PARS_ELSIF_TOKEN); +} + YY_BREAK +case 32: +YY_RULE_SETUP +#line 288 "pars0lex.l" +{ + return(PARS_LOOP_TOKEN); +} + YY_BREAK +case 33: +YY_RULE_SETUP +#line 292 "pars0lex.l" +{ + return(PARS_WHILE_TOKEN); +} + YY_BREAK +case 34: +YY_RULE_SETUP +#line 296 "pars0lex.l" +{ + return(PARS_RETURN_TOKEN); +} + YY_BREAK +case 35: +YY_RULE_SETUP +#line 300 "pars0lex.l" +{ + return(PARS_SELECT_TOKEN); +} + YY_BREAK +case 36: +YY_RULE_SETUP +#line 304 "pars0lex.l" +{ + return(PARS_SUM_TOKEN); +} + YY_BREAK +case 37: +YY_RULE_SETUP +#line 308 "pars0lex.l" +{ + return(PARS_COUNT_TOKEN); +} + YY_BREAK +case 38: +YY_RULE_SETUP +#line 312 "pars0lex.l" +{ + return(PARS_DISTINCT_TOKEN); +} + YY_BREAK +case 39: +YY_RULE_SETUP +#line 316 "pars0lex.l" +{ + return(PARS_FROM_TOKEN); +} + YY_BREAK +case 40: +YY_RULE_SETUP +#line 320 "pars0lex.l" +{ + return(PARS_WHERE_TOKEN); +} + YY_BREAK +case 41: +YY_RULE_SETUP +#line 324 "pars0lex.l" +{ + return(PARS_FOR_TOKEN); +} + YY_BREAK +case 42: +YY_RULE_SETUP +#line 328 "pars0lex.l" +{ + return(PARS_READ_TOKEN); +} + YY_BREAK +case 43: +YY_RULE_SETUP +#line 332 "pars0lex.l" +{ + return(PARS_ORDER_TOKEN); +} + YY_BREAK +case 44: +YY_RULE_SETUP +#line 336 "pars0lex.l" +{ + return(PARS_BY_TOKEN); +} + YY_BREAK +case 45: +YY_RULE_SETUP +#line 340 "pars0lex.l" +{ 
+ return(PARS_ASC_TOKEN); +} + YY_BREAK +case 46: +YY_RULE_SETUP +#line 344 "pars0lex.l" +{ + return(PARS_DESC_TOKEN); +} + YY_BREAK +case 47: +YY_RULE_SETUP +#line 348 "pars0lex.l" +{ + return(PARS_INSERT_TOKEN); +} + YY_BREAK +case 48: +YY_RULE_SETUP +#line 352 "pars0lex.l" +{ + return(PARS_INTO_TOKEN); +} + YY_BREAK +case 49: +YY_RULE_SETUP +#line 356 "pars0lex.l" +{ + return(PARS_VALUES_TOKEN); +} + YY_BREAK +case 50: +YY_RULE_SETUP +#line 360 "pars0lex.l" +{ + return(PARS_UPDATE_TOKEN); +} + YY_BREAK +case 51: +YY_RULE_SETUP +#line 364 "pars0lex.l" +{ + return(PARS_SET_TOKEN); +} + YY_BREAK +case 52: +YY_RULE_SETUP +#line 368 "pars0lex.l" +{ + return(PARS_DELETE_TOKEN); +} + YY_BREAK +case 53: +YY_RULE_SETUP +#line 372 "pars0lex.l" +{ + return(PARS_CURRENT_TOKEN); +} + YY_BREAK +case 54: +YY_RULE_SETUP +#line 376 "pars0lex.l" +{ + return(PARS_OF_TOKEN); +} + YY_BREAK +case 55: +YY_RULE_SETUP +#line 380 "pars0lex.l" +{ + return(PARS_CREATE_TOKEN); +} + YY_BREAK +case 56: +YY_RULE_SETUP +#line 384 "pars0lex.l" +{ + return(PARS_TABLE_TOKEN); +} + YY_BREAK +case 57: +YY_RULE_SETUP +#line 388 "pars0lex.l" +{ + return(PARS_INDEX_TOKEN); +} + YY_BREAK +case 58: +YY_RULE_SETUP +#line 392 "pars0lex.l" +{ + return(PARS_UNIQUE_TOKEN); +} + YY_BREAK +case 59: +YY_RULE_SETUP +#line 396 "pars0lex.l" +{ + return(PARS_CLUSTERED_TOKEN); +} + YY_BREAK +case 60: +YY_RULE_SETUP +#line 400 "pars0lex.l" +{ + return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN); +} + YY_BREAK +case 61: +YY_RULE_SETUP +#line 404 "pars0lex.l" +{ + return(PARS_ON_TOKEN); +} + YY_BREAK +case 62: +YY_RULE_SETUP +#line 408 "pars0lex.l" +{ + return(PARS_DECLARE_TOKEN); +} + YY_BREAK +case 63: +YY_RULE_SETUP +#line 412 "pars0lex.l" +{ + return(PARS_CURSOR_TOKEN); +} + YY_BREAK +case 64: +YY_RULE_SETUP +#line 416 "pars0lex.l" +{ + return(PARS_OPEN_TOKEN); +} + YY_BREAK +case 65: +YY_RULE_SETUP +#line 420 "pars0lex.l" +{ + return(PARS_FETCH_TOKEN); +} + YY_BREAK +case 66: +YY_RULE_SETUP +#line 424 "pars0lex.l" +{ + return(PARS_CLOSE_TOKEN); +} + YY_BREAK +case 67: +YY_RULE_SETUP +#line 428 "pars0lex.l" +{ + return(PARS_NOTFOUND_TOKEN); +} + YY_BREAK +case 68: +YY_RULE_SETUP +#line 432 "pars0lex.l" +{ + return(PARS_TO_CHAR_TOKEN); +} + YY_BREAK +case 69: +YY_RULE_SETUP +#line 436 "pars0lex.l" +{ + return(PARS_TO_NUMBER_TOKEN); +} + YY_BREAK +case 70: +YY_RULE_SETUP +#line 440 "pars0lex.l" +{ + return(PARS_TO_BINARY_TOKEN); +} + YY_BREAK +case 71: +YY_RULE_SETUP +#line 444 "pars0lex.l" +{ + return(PARS_BINARY_TO_NUMBER_TOKEN); +} + YY_BREAK +case 72: +YY_RULE_SETUP +#line 448 "pars0lex.l" +{ + return(PARS_SUBSTR_TOKEN); +} + YY_BREAK +case 73: +YY_RULE_SETUP +#line 452 "pars0lex.l" +{ + return(PARS_REPLSTR_TOKEN); +} + YY_BREAK +case 74: +YY_RULE_SETUP +#line 456 "pars0lex.l" +{ + return(PARS_CONCAT_TOKEN); +} + YY_BREAK +case 75: +YY_RULE_SETUP +#line 460 "pars0lex.l" +{ + return(PARS_INSTR_TOKEN); +} + YY_BREAK +case 76: +YY_RULE_SETUP +#line 464 "pars0lex.l" +{ + return(PARS_LENGTH_TOKEN); +} + YY_BREAK +case 77: +YY_RULE_SETUP +#line 468 "pars0lex.l" +{ + return(PARS_SYSDATE_TOKEN); +} + YY_BREAK +case 78: +YY_RULE_SETUP +#line 472 "pars0lex.l" +{ + return(PARS_PRINTF_TOKEN); +} + YY_BREAK +case 79: +YY_RULE_SETUP +#line 476 "pars0lex.l" +{ + return(PARS_ASSERT_TOKEN); +} + YY_BREAK +case 80: +YY_RULE_SETUP +#line 480 "pars0lex.l" +{ + return(PARS_RND_TOKEN); +} + YY_BREAK +case 81: +YY_RULE_SETUP +#line 484 "pars0lex.l" +{ + return(PARS_RND_STR_TOKEN); +} + YY_BREAK +case 82: +YY_RULE_SETUP +#line 488 "pars0lex.l" +{ + 
return(PARS_ROW_PRINTF_TOKEN); +} + YY_BREAK +case 83: +YY_RULE_SETUP +#line 492 "pars0lex.l" +{ + return(PARS_COMMIT_TOKEN); +} + YY_BREAK +case 84: +YY_RULE_SETUP +#line 496 "pars0lex.l" +{ + return(PARS_ROLLBACK_TOKEN); +} + YY_BREAK +case 85: +YY_RULE_SETUP +#line 500 "pars0lex.l" +{ + return(PARS_WORK_TOKEN); +} + YY_BREAK +case 86: +YY_RULE_SETUP +#line 504 "pars0lex.l" +{ + return(PARS_UNSIGNED_TOKEN); +} + YY_BREAK +case 87: +YY_RULE_SETUP +#line 508 "pars0lex.l" +{ + return(PARS_EXIT_TOKEN); +} + YY_BREAK +case 88: +YY_RULE_SETUP +#line 512 "pars0lex.l" +{ + return(PARS_FUNCTION_TOKEN); +} + YY_BREAK +case 89: +YY_RULE_SETUP +#line 516 "pars0lex.l" +{ + return(PARS_LOCK_TOKEN); +} + YY_BREAK +case 90: +YY_RULE_SETUP +#line 520 "pars0lex.l" +{ + return(PARS_SHARE_TOKEN); +} + YY_BREAK +case 91: +YY_RULE_SETUP +#line 524 "pars0lex.l" +{ + return(PARS_MODE_TOKEN); +} + YY_BREAK +case 92: +YY_RULE_SETUP +#line 528 "pars0lex.l" +{ + yylval = sym_tab_add_id(pars_sym_tab_global, + (byte*)yytext, + ut_strlen(yytext)); + return(PARS_ID_TOKEN); +} + YY_BREAK +case 93: +YY_RULE_SETUP +#line 535 "pars0lex.l" +{ + return(PARS_DDOT_TOKEN); +} + YY_BREAK +case 94: +YY_RULE_SETUP +#line 539 "pars0lex.l" +{ + return(PARS_ASSIGN_TOKEN); +} + YY_BREAK +case 95: +YY_RULE_SETUP +#line 543 "pars0lex.l" +{ + return(PARS_LE_TOKEN); +} + YY_BREAK +case 96: +YY_RULE_SETUP +#line 547 "pars0lex.l" +{ + return(PARS_GE_TOKEN); +} + YY_BREAK +case 97: +YY_RULE_SETUP +#line 551 "pars0lex.l" +{ + return(PARS_NE_TOKEN); +} + YY_BREAK +case 98: +YY_RULE_SETUP +#line 555 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 99: +YY_RULE_SETUP +#line 560 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 100: +YY_RULE_SETUP +#line 565 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 101: +YY_RULE_SETUP +#line 570 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 102: +YY_RULE_SETUP +#line 575 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 103: +YY_RULE_SETUP +#line 580 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 104: +YY_RULE_SETUP +#line 585 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 105: +YY_RULE_SETUP +#line 590 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 106: +YY_RULE_SETUP +#line 595 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 107: +YY_RULE_SETUP +#line 600 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 108: +YY_RULE_SETUP +#line 605 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 109: +YY_RULE_SETUP +#line 610 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 110: +YY_RULE_SETUP +#line 615 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 111: +YY_RULE_SETUP +#line 620 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 112: +YY_RULE_SETUP +#line 625 "pars0lex.l" +{ + + return((int)(*yytext)); +} + YY_BREAK +case 113: +YY_RULE_SETUP +#line 630 "pars0lex.l" +BEGIN(comment); /* eat up comment */ + YY_BREAK +case 114: +/* rule 114 can match eol */ +YY_RULE_SETUP +#line 632 "pars0lex.l" + + YY_BREAK +case 115: +/* rule 115 can match eol */ +YY_RULE_SETUP +#line 633 "pars0lex.l" + + YY_BREAK +case 116: +YY_RULE_SETUP +#line 634 "pars0lex.l" +BEGIN(INITIAL); + YY_BREAK +case 117: +/* rule 117 can match eol */ +YY_RULE_SETUP +#line 636 "pars0lex.l" +/* eat up whitespace */ + YY_BREAK +case 118: +YY_RULE_SETUP +#line 639 "pars0lex.l" +{ + fprintf(stderr,"Unrecognized character: %02x\n", + *yytext); + 
+ ut_error; + + return(0); +} + YY_BREAK +case 119: +YY_RULE_SETUP +#line 648 "pars0lex.l" +YY_FATAL_ERROR( "flex scanner jammed" ); + YY_BREAK +#line 1916 "lexyy.c" +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(comment): +case YY_STATE_EOF(quoted): +case YY_STATE_EOF(id): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + size_t num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
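+	 Note (editorial addition): in this scanner the YY_INPUT
+	 used here is not the default getc()/fread() macro defined
+	 above; it was redefined near the top of this file to call
+	 pars_get_lex_chars(), so the characters come from the
+	 InnoDB parser's SQL input rather than from the yyin stream.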
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart(yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 399 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + register char *yy_cp = (yy_c_buf_p); + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 399 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 398); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. */ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (int)((yy_c_buf_p) - (yytext_ptr)); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart(yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( ) ) + return EOF; + + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + static void yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_init_buffer(YY_CURRENT_BUFFER,input_file ); + yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + static YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * + */ + static void yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree((void *) b->yy_ch_buf ); + + yyfree((void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. 
+ */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + yy_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + static void yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +__attribute__((unused)) static void yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (void) +{ + int num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. 
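+	 Unlike the character buffer above, which is grown by
+	 doubling, the buffer stack grows linearly, by a fixed
+	 grow_size of 8 slots per resize (1 -> 9 -> 17 -> ...).
+	 (This note is an editorial addition.)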
*/ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg ) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +__attribute__((unused)) static int yyget_lineno (void) +{ + + return yylineno; +} + +/** Get the input stream. + * + */ +__attribute__((unused)) static FILE *yyget_in (void) +{ + return yyin; +} + +/** Get the output stream. + * + */ +__attribute__((unused)) static FILE *yyget_out (void) +{ + return yyout; +} + +/** Get the length of the current token. + * + */ +__attribute__((unused)) static int yyget_leng (void) +{ + return yyleng; +} + +/** Get the current token. + * + */ + +__attribute__((unused)) static char *yyget_text (void) +{ + return yytext; +} + +/** Set the current line number. + * @param line_number + * + */ +__attribute__((unused)) static void yyset_lineno (int line_number ) +{ + + yylineno = line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. + * + * @see yy_switch_to_buffer + */ +__attribute__((unused)) static void yyset_in (FILE * in_str ) +{ + yyin = in_str ; +} + +__attribute__((unused)) static void yyset_out (FILE * out_str ) +{ + yyout = out_str ; +} + +__attribute__((unused)) static int yyget_debug (void) +{ + return yy_flex_debug; +} + +__attribute__((unused)) static void yyset_debug (int bdebug ) +{ + yy_flex_debug = bdebug ; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +__attribute__((unused)) static int yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s ) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +static void *yyalloc (yy_size_t size ) +{ + return (void *) malloc( size ); +} + +static void *yyrealloc (void * ptr, yy_size_t size ) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. 
It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return (void *) realloc( (char *) ptr, size ); +} + +static void yyfree (void * ptr ) +{ + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef yytext_ptr +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif +#line 648 "pars0lex.l" + + + diff --git a/storage/xtradb/pars/make_bison.sh b/storage/xtradb/pars/make_bison.sh new file mode 100755 index 00000000000..09bb86e3106 --- /dev/null +++ b/storage/xtradb/pars/make_bison.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., 59 Temple +# Place, Suite 330, Boston, MA 02111-1307 USA +# +# generate parser files from bison input files. + +set -eu +TMPFILE=pars0grm.tab.c +OUTFILE=pars0grm.c + +bison -d pars0grm.y +mv pars0grm.tab.h ../include/pars0grm.h + +sed -e ' +s/'"$TMPFILE"'/'"$OUTFILE"'/; +s/^\(\(YYSTYPE\|int\) yy\(char\|nerrs\)\)/static \1/; +s/\(\(YYSTYPE\|int\) yy\(lval\|parse\)\)/UNIV_INTERN \1/; +' < "$TMPFILE" > "$OUTFILE" + +rm "$TMPFILE" diff --git a/storage/xtradb/pars/make_flex.sh b/storage/xtradb/pars/make_flex.sh new file mode 100755 index 00000000000..89308a6636f --- /dev/null +++ b/storage/xtradb/pars/make_flex.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., 59 Temple +# Place, Suite 330, Boston, MA 02111-1307 USA +# +# generate lexer files from flex input files. + +set -eu + +TMPFILE=_flex_tmp.c +OUTFILE=lexyy.c + +flex -o $TMPFILE pars0lex.l + +# AIX needs its includes done in a certain order, so include "univ.i" first +# to be sure we get it right. +echo '#include "univ.i"' > $OUTFILE + +# flex assigns a pointer to an int in one place without a cast, resulting in +# a warning on Win64. Add the cast. Also define some symbols as static. 
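+#
+# As an illustration (an editorial addition, not part of the original
+# script), the yylex rule below rewrites the generated declaration
+#
+#   extern int yylex (void);
+#
+# into
+#
+#   UNIV_INTERN int yylex (void);
+#
+# so the scanner entry point carries InnoDB's linkage macro instead of
+# plain extern linkage.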
+sed -e ' +s/'"$TMPFILE"'/'"$OUTFILE"'/; +s/\(int offset = \)\((yy_c_buf_p) - (yytext_ptr)\);/\1(int)(\2);/; +s/\(void yy\(restart\|_\(delete\|flush\)_buffer\)\)/static \1/; +s/\(void yy_switch_to_buffer\)/__attribute__((unused)) static \1/; +s/\(void yy\(push\|pop\)_buffer_state\)/__attribute__((unused)) static \1/; +s/\(YY_BUFFER_STATE yy_create_buffer\)/static \1/; +s/\(\(int\|void\) yy[gs]et_\)/__attribute__((unused)) static \1/; +s/\(void \*\?yy\(\(re\)\?alloc\|free\)\)/static \1/; +s/\(extern \)\?\(int yy\(leng\|lineno\|_flex_debug\)\)/static \2/; +s/\(int yylex_destroy\)/__attribute__((unused)) static \1/; +s/\(extern \)\?\(int yylex \)/UNIV_INTERN \2/; +s/^\(\(FILE\|char\) *\* *yyget\)/__attribute__((unused)) static \1/; +s/^\(extern \)\?\(\(FILE\|char\) *\* *yy\)/static \2/; +' < $TMPFILE >> $OUTFILE + +rm $TMPFILE diff --git a/storage/xtradb/pars/pars0grm.c b/storage/xtradb/pars/pars0grm.c new file mode 100644 index 00000000000..d667970735e --- /dev/null +++ b/storage/xtradb/pars/pars0grm.c @@ -0,0 +1,2601 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software +Foundation, Inc. + +As a special exception, when this file is copied by Bison into a +Bison output file, you may use that output file without restriction. +This special exception was added by the Free Software Foundation +in version 1.24 of Bison. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/* A Bison parser, made by GNU Bison 2.0. */ + +/* Written by Richard Stallman by simplifying the original so called + ``semantic'' parser. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output. */ +#define YYBISON 1 + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Using locations. */ +#define YYLSP_NEEDED 0 + + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. 
*/ + enum yytokentype { + PARS_INT_LIT = 258, + PARS_FLOAT_LIT = 259, + PARS_STR_LIT = 260, + PARS_FIXBINARY_LIT = 261, + PARS_BLOB_LIT = 262, + PARS_NULL_LIT = 263, + PARS_ID_TOKEN = 264, + PARS_AND_TOKEN = 265, + PARS_OR_TOKEN = 266, + PARS_NOT_TOKEN = 267, + PARS_GE_TOKEN = 268, + PARS_LE_TOKEN = 269, + PARS_NE_TOKEN = 270, + PARS_PROCEDURE_TOKEN = 271, + PARS_IN_TOKEN = 272, + PARS_OUT_TOKEN = 273, + PARS_BINARY_TOKEN = 274, + PARS_BLOB_TOKEN = 275, + PARS_INT_TOKEN = 276, + PARS_INTEGER_TOKEN = 277, + PARS_FLOAT_TOKEN = 278, + PARS_CHAR_TOKEN = 279, + PARS_IS_TOKEN = 280, + PARS_BEGIN_TOKEN = 281, + PARS_END_TOKEN = 282, + PARS_IF_TOKEN = 283, + PARS_THEN_TOKEN = 284, + PARS_ELSE_TOKEN = 285, + PARS_ELSIF_TOKEN = 286, + PARS_LOOP_TOKEN = 287, + PARS_WHILE_TOKEN = 288, + PARS_RETURN_TOKEN = 289, + PARS_SELECT_TOKEN = 290, + PARS_SUM_TOKEN = 291, + PARS_COUNT_TOKEN = 292, + PARS_DISTINCT_TOKEN = 293, + PARS_FROM_TOKEN = 294, + PARS_WHERE_TOKEN = 295, + PARS_FOR_TOKEN = 296, + PARS_DDOT_TOKEN = 297, + PARS_READ_TOKEN = 298, + PARS_ORDER_TOKEN = 299, + PARS_BY_TOKEN = 300, + PARS_ASC_TOKEN = 301, + PARS_DESC_TOKEN = 302, + PARS_INSERT_TOKEN = 303, + PARS_INTO_TOKEN = 304, + PARS_VALUES_TOKEN = 305, + PARS_UPDATE_TOKEN = 306, + PARS_SET_TOKEN = 307, + PARS_DELETE_TOKEN = 308, + PARS_CURRENT_TOKEN = 309, + PARS_OF_TOKEN = 310, + PARS_CREATE_TOKEN = 311, + PARS_TABLE_TOKEN = 312, + PARS_INDEX_TOKEN = 313, + PARS_UNIQUE_TOKEN = 314, + PARS_CLUSTERED_TOKEN = 315, + PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316, + PARS_ON_TOKEN = 317, + PARS_ASSIGN_TOKEN = 318, + PARS_DECLARE_TOKEN = 319, + PARS_CURSOR_TOKEN = 320, + PARS_SQL_TOKEN = 321, + PARS_OPEN_TOKEN = 322, + PARS_FETCH_TOKEN = 323, + PARS_CLOSE_TOKEN = 324, + PARS_NOTFOUND_TOKEN = 325, + PARS_TO_CHAR_TOKEN = 326, + PARS_TO_NUMBER_TOKEN = 327, + PARS_TO_BINARY_TOKEN = 328, + PARS_BINARY_TO_NUMBER_TOKEN = 329, + PARS_SUBSTR_TOKEN = 330, + PARS_REPLSTR_TOKEN = 331, + PARS_CONCAT_TOKEN = 332, + PARS_INSTR_TOKEN = 333, + PARS_LENGTH_TOKEN = 334, + PARS_SYSDATE_TOKEN = 335, + PARS_PRINTF_TOKEN = 336, + PARS_ASSERT_TOKEN = 337, + PARS_RND_TOKEN = 338, + PARS_RND_STR_TOKEN = 339, + PARS_ROW_PRINTF_TOKEN = 340, + PARS_COMMIT_TOKEN = 341, + PARS_ROLLBACK_TOKEN = 342, + PARS_WORK_TOKEN = 343, + PARS_UNSIGNED_TOKEN = 344, + PARS_EXIT_TOKEN = 345, + PARS_FUNCTION_TOKEN = 346, + PARS_LOCK_TOKEN = 347, + PARS_SHARE_TOKEN = 348, + PARS_MODE_TOKEN = 349, + NEG = 350 + }; +#endif +#define PARS_INT_LIT 258 +#define PARS_FLOAT_LIT 259 +#define PARS_STR_LIT 260 +#define PARS_FIXBINARY_LIT 261 +#define PARS_BLOB_LIT 262 +#define PARS_NULL_LIT 263 +#define PARS_ID_TOKEN 264 +#define PARS_AND_TOKEN 265 +#define PARS_OR_TOKEN 266 +#define PARS_NOT_TOKEN 267 +#define PARS_GE_TOKEN 268 +#define PARS_LE_TOKEN 269 +#define PARS_NE_TOKEN 270 +#define PARS_PROCEDURE_TOKEN 271 +#define PARS_IN_TOKEN 272 +#define PARS_OUT_TOKEN 273 +#define PARS_BINARY_TOKEN 274 +#define PARS_BLOB_TOKEN 275 +#define PARS_INT_TOKEN 276 +#define PARS_INTEGER_TOKEN 277 +#define PARS_FLOAT_TOKEN 278 +#define PARS_CHAR_TOKEN 279 +#define PARS_IS_TOKEN 280 +#define PARS_BEGIN_TOKEN 281 +#define PARS_END_TOKEN 282 +#define PARS_IF_TOKEN 283 +#define PARS_THEN_TOKEN 284 +#define PARS_ELSE_TOKEN 285 +#define PARS_ELSIF_TOKEN 286 +#define PARS_LOOP_TOKEN 287 +#define PARS_WHILE_TOKEN 288 +#define PARS_RETURN_TOKEN 289 +#define PARS_SELECT_TOKEN 290 +#define PARS_SUM_TOKEN 291 +#define PARS_COUNT_TOKEN 292 +#define PARS_DISTINCT_TOKEN 293 +#define PARS_FROM_TOKEN 294 +#define PARS_WHERE_TOKEN 295 +#define 
PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define NEG 350
+
+
+
+
+/* Copy the first part of user declarations. */
+#line 13 "pars0grm.y"
+
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h> /* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+
+
+/* Enabling traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages. */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+
+
+/* Copy the second part of user declarations. */
+
+
+/* Line 213 of yacc.c. */
+#line 297 "pars0grm.c"
+
+#if ! defined (yyoverflow) || YYERROR_VERBOSE
+
+# ifndef YYFREE
+# define YYFREE free
+# endif
+# ifndef YYMALLOC
+# define YYMALLOC malloc
+# endif
+
+/* The parser invokes alloca or malloc; define the necessary symbols. */
+
+# ifdef YYSTACK_USE_ALLOCA
+# if YYSTACK_USE_ALLOCA
+# ifdef __GNUC__
+# define YYSTACK_ALLOC __builtin_alloca
+# else
+# define YYSTACK_ALLOC alloca
+# endif
+# endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+ /* Pacify GCC's `empty if-body' warning. */
+# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0)
+# else
+# if defined (__STDC__) || defined (__cplusplus)
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# define YYSIZE_T size_t
+# endif
+# define YYSTACK_ALLOC YYMALLOC
+# define YYSTACK_FREE YYFREE
+# endif
+#endif /* ! defined (yyoverflow) || YYERROR_VERBOSE */
+
+
+#if (! defined (yyoverflow) \
+ && (! 
defined (__cplusplus) \
+ || (defined (YYSTYPE_IS_TRIVIAL) && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member. */
+union yyalloc
+{
+ short int yyss;
+ YYSTYPE yyvs;
+ };
+
+/* The size of the maximum gap between one aligned stack and the next. */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+ N elements. */
+# define YYSTACK_BYTES(N) \
+ ((N) * (sizeof (short int) + sizeof (YYSTYPE)) \
+ + YYSTACK_GAP_MAXIMUM)
+
+/* Copy COUNT objects from FROM to TO. The source and destination do
+ not overlap. */
+# ifndef YYCOPY
+# if defined (__GNUC__) && 1 < __GNUC__
+# define YYCOPY(To, From, Count) \
+ __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+# else
+# define YYCOPY(To, From, Count) \
+ do \
+ { \
+ register YYSIZE_T yyi; \
+ for (yyi = 0; yyi < (Count); yyi++) \
+ (To)[yyi] = (From)[yyi]; \
+ } \
+ while (0)
+# endif
+# endif
+
+/* Relocate STACK from its old location to the new one. The
+ local variables YYSIZE and YYSTACKSIZE give the old and new number of
+ elements in the stack, and YYPTR gives the new location of the
+ stack. Advance YYPTR to a properly aligned location for the next
+ stack. */
+# define YYSTACK_RELOCATE(Stack) \
+ do \
+ { \
+ YYSIZE_T yynewbytes; \
+ YYCOPY (&yyptr->Stack, Stack, yysize); \
+ Stack = &yyptr->Stack; \
+ yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+ yyptr += yynewbytes / sizeof (*yyptr); \
+ } \
+ while (0)
+
+#endif
+
+#if defined (__STDC__) || defined (__cplusplus)
+ typedef signed char yysigned_char;
+#else
+ typedef short int yysigned_char;
+#endif
+
+/* YYFINAL -- State number of the termination state. */
+#define YYFINAL 5
+/* YYLAST -- Last index in YYTABLE. */
+#define YYLAST 752
+
+/* YYNTOKENS -- Number of terminals. */
+#define YYNTOKENS 111
+/* YYNNTS -- Number of nonterminals. */
+#define YYNNTS 70
+/* YYNRULES -- Number of rules. */
+#define YYNRULES 175
+/* YYNSTATES -- Number of states. */
+#define YYNSTATES 339
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */
+#define YYUNDEFTOK 2
+#define YYMAXUTOK 350
+
+#define YYTRANSLATE(YYX) \
+ ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. 
*/ +static const unsigned char yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 103, 2, 2, + 105, 106, 100, 99, 108, 98, 2, 101, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 104, + 96, 95, 97, 107, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 109, 2, 110, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, + 102 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const unsigned short int yyprhs[] = +{ + 0, 0, 3, 6, 8, 11, 14, 17, 20, 23, + 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, + 56, 59, 62, 65, 68, 71, 73, 76, 78, 83, + 85, 87, 89, 91, 93, 95, 97, 101, 105, 109, + 113, 116, 120, 124, 128, 132, 136, 140, 144, 148, + 152, 155, 159, 163, 165, 167, 169, 171, 173, 175, + 177, 179, 181, 183, 185, 186, 188, 192, 199, 204, + 206, 208, 210, 214, 216, 220, 221, 223, 227, 228, + 230, 234, 236, 241, 247, 252, 253, 255, 259, 261, + 265, 267, 268, 271, 272, 275, 276, 281, 282, 284, + 286, 287, 292, 301, 305, 311, 314, 318, 320, 324, + 329, 334, 337, 340, 344, 347, 350, 353, 357, 362, + 364, 367, 368, 371, 373, 381, 388, 399, 401, 403, + 406, 409, 414, 419, 425, 427, 431, 432, 436, 437, + 439, 440, 443, 444, 446, 454, 456, 460, 461, 463, + 464, 466, 477, 480, 483, 485, 487, 489, 491, 493, + 497, 501, 502, 504, 508, 512, 513, 515, 518, 525, + 530, 532, 534, 535, 537, 540 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. 
*/ +static const short int yyrhs[] = +{ + 112, 0, -1, 180, 104, -1, 118, -1, 119, 104, + -1, 151, 104, -1, 152, 104, -1, 153, 104, -1, + 150, 104, -1, 154, 104, -1, 146, 104, -1, 133, + 104, -1, 135, 104, -1, 145, 104, -1, 143, 104, + -1, 144, 104, -1, 140, 104, -1, 141, 104, -1, + 155, 104, -1, 157, 104, -1, 156, 104, -1, 169, + 104, -1, 170, 104, -1, 164, 104, -1, 168, 104, + -1, 113, -1, 114, 113, -1, 9, -1, 116, 105, + 124, 106, -1, 3, -1, 4, -1, 5, -1, 6, + -1, 7, -1, 8, -1, 66, -1, 115, 99, 115, + -1, 115, 98, 115, -1, 115, 100, 115, -1, 115, + 101, 115, -1, 98, 115, -1, 105, 115, 106, -1, + 115, 95, 115, -1, 115, 96, 115, -1, 115, 97, + 115, -1, 115, 13, 115, -1, 115, 14, 115, -1, + 115, 15, 115, -1, 115, 10, 115, -1, 115, 11, + 115, -1, 12, 115, -1, 9, 103, 70, -1, 66, + 103, 70, -1, 71, -1, 72, -1, 73, -1, 74, + -1, 75, -1, 77, -1, 78, -1, 79, -1, 80, + -1, 83, -1, 84, -1, -1, 107, -1, 117, 108, + 107, -1, 109, 9, 105, 117, 106, 110, -1, 120, + 105, 124, 106, -1, 76, -1, 81, -1, 82, -1, + 9, 105, 106, -1, 9, -1, 122, 108, 9, -1, + -1, 9, -1, 123, 108, 9, -1, -1, 115, -1, + 124, 108, 115, -1, 115, -1, 37, 105, 100, 106, + -1, 37, 105, 38, 9, 106, -1, 36, 105, 115, + 106, -1, -1, 125, -1, 126, 108, 125, -1, 100, + -1, 126, 49, 123, -1, 126, -1, -1, 40, 115, + -1, -1, 41, 51, -1, -1, 92, 17, 93, 94, + -1, -1, 46, -1, 47, -1, -1, 44, 45, 9, + 131, -1, 35, 127, 39, 122, 128, 129, 130, 132, + -1, 48, 49, 9, -1, 134, 50, 105, 124, 106, + -1, 134, 133, -1, 9, 95, 115, -1, 136, -1, + 137, 108, 136, -1, 40, 54, 55, 9, -1, 51, + 9, 52, 137, -1, 139, 128, -1, 139, 138, -1, + 53, 39, 9, -1, 142, 128, -1, 142, 138, -1, + 85, 133, -1, 9, 63, 115, -1, 31, 115, 29, + 114, -1, 147, -1, 148, 147, -1, -1, 30, 114, + -1, 148, -1, 28, 115, 29, 114, 149, 27, 28, + -1, 33, 115, 32, 114, 27, 32, -1, 41, 9, + 17, 115, 42, 115, 32, 114, 27, 32, -1, 90, + -1, 34, -1, 67, 9, -1, 69, 9, -1, 68, + 9, 49, 123, -1, 68, 9, 49, 121, -1, 9, + 171, 160, 161, 162, -1, 158, -1, 159, 108, 158, + -1, -1, 105, 3, 106, -1, -1, 89, -1, -1, + 12, 8, -1, -1, 61, -1, 56, 57, 9, 105, + 159, 106, 163, -1, 9, -1, 165, 108, 9, -1, + -1, 59, -1, -1, 60, -1, 56, 166, 167, 58, + 9, 62, 9, 105, 165, 106, -1, 86, 88, -1, + 87, 88, -1, 21, -1, 22, -1, 24, -1, 19, + -1, 20, -1, 9, 17, 171, -1, 9, 18, 171, + -1, -1, 172, -1, 173, 108, 172, -1, 9, 171, + 104, -1, -1, 174, -1, 175, 174, -1, 64, 65, + 9, 25, 133, 104, -1, 64, 91, 9, 104, -1, + 176, -1, 177, -1, -1, 178, -1, 179, 178, -1, + 16, 9, 105, 173, 106, 25, 175, 179, 26, 114, + 27, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. 
*/ +static const unsigned short int yyrline[] = +{ + 0, 138, 138, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, + 158, 159, 160, 161, 162, 166, 167, 172, 173, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, + 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, + 196, 197, 199, 204, 205, 206, 207, 209, 210, 211, + 212, 213, 214, 215, 218, 220, 221, 225, 230, 235, + 236, 237, 241, 245, 246, 251, 252, 253, 258, 259, + 260, 264, 265, 270, 276, 283, 284, 285, 290, 292, + 294, 298, 299, 303, 304, 309, 310, 315, 316, 317, + 321, 322, 327, 337, 342, 344, 349, 353, 354, 359, + 365, 372, 377, 382, 388, 393, 398, 403, 408, 414, + 415, 420, 421, 423, 427, 434, 440, 448, 452, 456, + 462, 468, 470, 475, 480, 481, 486, 487, 492, 493, + 499, 500, 506, 507, 513, 519, 520, 525, 526, 530, + 531, 535, 543, 548, 553, 554, 555, 556, 557, 561, + 564, 570, 571, 572, 577, 581, 583, 584, 588, 594, + 599, 600, 603, 605, 606, 610 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE +/* YYTNME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT", + "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT", + "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN", + "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN", + "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN", + "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN", + "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN", + "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN", + "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN", + "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN", + "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN", + "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN", + "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN", + "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN", + "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN", + "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN", + "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN", + "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN", + "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN", + "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN", + "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN", + "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN", + "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN", + "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN", + "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN", + "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN", + "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN", + "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN", + "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN", + "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN", "'='", "'<'", + "'>'", "'-'", "'+'", "'*'", "'/'", "NEG", "'%'", "';'", "'('", "')'", + "'?'", "','", "'{'", "'}'", "$accept", "top_statement", "statement", + "statement_list", "exp", "function_name", "question_mark_list", + "stored_procedure_call", "predefined_procedure_call", + "predefined_procedure_name", "user_function_call", "table_list", + "variable_list", "exp_list", "select_item", "select_item_list", + "select_list", 
"search_condition", "for_update_clause", + "lock_shared_clause", "order_direction", "order_by_clause", + "select_statement", "insert_statement_start", "insert_statement", + "column_assignment", "column_assignment_list", "cursor_positioned", + "update_statement_start", "update_statement_searched", + "update_statement_positioned", "delete_statement_start", + "delete_statement_searched", "delete_statement_positioned", + "row_printf_statement", "assignment_statement", "elsif_element", + "elsif_list", "else_part", "if_statement", "while_statement", + "for_statement", "exit_statement", "return_statement", + "open_cursor_statement", "close_cursor_statement", "fetch_statement", + "column_def", "column_def_list", "opt_column_len", "opt_unsigned", + "opt_not_null", "not_fit_in_memory", "create_table", "column_list", + "unique_def", "clustered_def", "create_index", "commit_statement", + "rollback_statement", "type_name", "parameter_declaration", + "parameter_declaration_list", "variable_declaration", + "variable_declaration_list", "cursor_declaration", + "function_declaration", "declaration", "declaration_list", + "procedure_definition", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const unsigned short int yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, + 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, + 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 61, 60, 62, 45, 43, + 42, 47, 350, 37, 59, 40, 41, 63, 44, 123, + 125 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const unsigned char yyr1[] = +{ + 0, 111, 112, 113, 113, 113, 113, 113, 113, 113, + 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, + 113, 113, 113, 113, 113, 114, 114, 115, 115, 115, + 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, + 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, + 115, 115, 115, 116, 116, 116, 116, 116, 116, 116, + 116, 116, 116, 116, 117, 117, 117, 118, 119, 120, + 120, 120, 121, 122, 122, 123, 123, 123, 124, 124, + 124, 125, 125, 125, 125, 126, 126, 126, 127, 127, + 127, 128, 128, 129, 129, 130, 130, 131, 131, 131, + 132, 132, 133, 134, 135, 135, 136, 137, 137, 138, + 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 148, 149, 149, 149, 150, 151, 152, 153, 154, 155, + 156, 157, 157, 158, 159, 159, 160, 160, 161, 161, + 162, 162, 163, 163, 164, 165, 165, 166, 166, 167, + 167, 168, 169, 170, 171, 171, 171, 171, 171, 172, + 172, 173, 173, 173, 174, 175, 175, 175, 176, 177, + 178, 178, 179, 179, 179, 180 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. 
*/ +static const unsigned char yyr2[] = +{ + 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 1, 2, 1, 4, 1, + 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, + 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 1, 3, 6, 4, 1, + 1, 1, 3, 1, 3, 0, 1, 3, 0, 1, + 3, 1, 4, 5, 4, 0, 1, 3, 1, 3, + 1, 0, 2, 0, 2, 0, 4, 0, 1, 1, + 0, 4, 8, 3, 5, 2, 3, 1, 3, 4, + 4, 2, 2, 3, 2, 2, 2, 3, 4, 1, + 2, 0, 2, 1, 7, 6, 10, 1, 1, 2, + 2, 4, 4, 5, 1, 3, 0, 3, 0, 1, + 0, 2, 0, 1, 7, 1, 3, 0, 1, 0, + 1, 10, 2, 2, 1, 1, 1, 1, 1, 3, + 3, 0, 1, 3, 3, 0, 1, 2, 6, 4, + 1, 1, 0, 1, 2, 11 +}; + +/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state + STATE-NUM when YYTABLE doesn't specify something else to do. Zero + means the default is an error. */ +static const unsigned char yydefact[] = +{ + 0, 0, 0, 0, 0, 1, 2, 161, 0, 162, + 0, 0, 0, 0, 0, 157, 158, 154, 155, 156, + 159, 160, 165, 163, 0, 166, 172, 0, 0, 167, + 170, 171, 173, 0, 164, 0, 0, 0, 174, 0, + 0, 0, 0, 0, 128, 85, 0, 0, 0, 0, + 147, 0, 0, 0, 69, 70, 71, 0, 0, 0, + 127, 0, 25, 0, 3, 0, 0, 0, 0, 0, + 91, 0, 0, 91, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 169, 0, 29, 30, 31, 32, 33, 34, 27, + 0, 35, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 0, 0, 0, 0, 0, 0, 0, + 88, 81, 86, 90, 0, 0, 0, 0, 0, 0, + 148, 149, 129, 0, 130, 116, 152, 153, 0, 175, + 26, 4, 78, 11, 0, 105, 12, 0, 111, 112, + 16, 17, 114, 115, 14, 15, 13, 10, 8, 5, + 6, 7, 9, 18, 20, 19, 23, 24, 21, 22, + 0, 117, 0, 50, 0, 40, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 78, 0, 0, 0, 75, 0, 0, 0, 103, 0, + 113, 0, 150, 0, 75, 64, 79, 0, 78, 0, + 92, 168, 51, 52, 41, 48, 49, 45, 46, 47, + 121, 42, 43, 44, 37, 36, 38, 39, 0, 0, + 0, 0, 0, 76, 89, 87, 73, 91, 0, 0, + 107, 110, 0, 0, 76, 132, 131, 65, 0, 68, + 0, 0, 0, 0, 0, 119, 123, 0, 28, 0, + 84, 0, 82, 0, 0, 0, 93, 0, 0, 0, + 0, 134, 0, 0, 0, 0, 0, 80, 104, 109, + 122, 0, 120, 0, 125, 83, 77, 74, 0, 95, + 0, 106, 108, 136, 142, 0, 0, 72, 67, 66, + 0, 124, 94, 0, 100, 0, 0, 138, 143, 144, + 135, 0, 118, 0, 0, 102, 0, 0, 139, 140, + 0, 0, 0, 0, 137, 0, 133, 145, 0, 96, + 97, 126, 141, 151, 0, 98, 99, 101, 146 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const short int yydefgoto[] = +{ + -1, 2, 62, 63, 206, 116, 248, 64, 65, 66, + 245, 237, 234, 207, 122, 123, 124, 148, 289, 304, + 337, 315, 67, 68, 69, 240, 241, 149, 70, 71, + 72, 73, 74, 75, 76, 77, 255, 256, 257, 78, + 79, 80, 81, 82, 83, 84, 85, 271, 272, 307, + 319, 326, 309, 86, 328, 131, 203, 87, 88, 89, + 20, 9, 10, 25, 26, 30, 31, 32, 33, 3 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. 
*/ +#define YYPACT_NINF -177 +static const short int yypact[] = +{ + 28, 38, 54, -46, -29, -177, -177, 56, 50, -177, + -75, 8, 8, 46, 56, -177, -177, -177, -177, -177, + -177, -177, 63, -177, 8, -177, 2, -26, -51, -177, + -177, -177, -177, -13, -177, 71, 72, 587, -177, 57, + -21, 26, 272, 272, -177, 13, 91, 55, 96, 67, + -22, 99, 100, 103, -177, -177, -177, 75, 29, 35, + -177, 116, -177, 396, -177, 22, 23, 27, -9, 30, + 87, 31, 32, 87, 47, 49, 52, 58, 59, 60, + 61, 62, 65, 66, 74, 77, 78, 86, 89, 102, + 75, -177, 272, -177, -177, -177, -177, -177, -177, 39, + 272, 51, -177, -177, -177, -177, -177, -177, -177, -177, + -177, -177, -177, 272, 272, 361, 25, 489, 45, 90, + -177, 651, -177, -39, 93, 142, 124, 108, 152, 170, + -177, 131, -177, 143, -177, -177, -177, -177, 98, -177, + -177, -177, 272, -177, 110, -177, -177, 256, -177, -177, + -177, -177, -177, -177, -177, -177, -177, -177, -177, -177, + -177, -177, -177, -177, -177, -177, -177, -177, -177, -177, + 112, 651, 137, 101, 147, 204, 88, 272, 272, 272, + 272, 272, 587, 272, 272, 272, 272, 272, 272, 272, + 272, 587, 272, -30, 211, 168, 212, 272, -177, 213, + -177, 118, -177, 167, 217, 122, 651, -63, 272, 175, + 651, -177, -177, -177, -177, 101, 101, 21, 21, 651, + 332, 21, 21, 21, -6, -6, 204, 204, -60, 460, + 198, 222, 126, -177, 125, -177, -177, -33, 584, 140, + -177, 128, 228, 229, 139, -177, 125, -177, -53, -177, + 272, -49, 240, 587, 272, -177, 224, 226, -177, 225, + -177, 150, -177, 258, 272, 260, 230, 272, 272, 213, + 8, -177, -45, 208, 166, 164, 176, 651, -177, -177, + 587, 631, -177, 254, -177, -177, -177, -177, 234, 194, + 638, 651, -177, 182, 227, 228, 280, -177, -177, -177, + 587, -177, -177, 273, 247, 587, 289, 214, -177, -177, + -177, 195, 587, 209, 261, -177, 524, 199, -177, 295, + 292, 215, 299, 279, -177, 304, -177, -177, -44, -177, + -8, -177, -177, -177, 305, -177, -177, -177, -177 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const short int yypgoto[] = +{ + -177, -177, -62, -176, -40, -177, -177, -177, -177, -177, + -177, -177, 109, -166, 120, -177, -177, -69, -177, -177, + -177, -177, -34, -177, -177, 48, -177, 243, -177, -177, + -177, -177, -177, -177, -177, -177, 64, -177, -177, -177, + -177, -177, -177, -177, -177, -177, -177, 24, -177, -177, + -177, -177, -177, -177, -177, -177, -177, -177, -177, -177, + -12, 307, -177, 297, -177, -177, -177, 285, -177, -177 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If zero, do what YYDEFACT says. + If YYTABLE_NINF, syntax error. 
*/ +#define YYTABLE_NINF -1 +static const unsigned short int yytable[] = +{ + 21, 140, 115, 117, 152, 121, 220, 264, 231, 181, + 194, 24, 27, 37, 35, 229, 93, 94, 95, 96, + 97, 98, 99, 135, 228, 100, 45, 15, 16, 17, + 18, 13, 19, 14, 145, 129, 181, 130, 335, 336, + 36, 144, 251, 249, 1, 250, 258, 4, 250, 118, + 119, 28, 171, 275, 5, 276, 170, 278, 6, 250, + 173, 294, 333, 295, 334, 8, 28, 11, 12, 195, + 232, 22, 24, 175, 176, 265, 7, 280, 34, 101, + 39, 40, 90, 91, 102, 103, 104, 105, 106, 92, + 107, 108, 109, 110, 188, 189, 111, 112, 177, 178, + 125, 179, 180, 181, 126, 127, 128, 210, 132, 133, + 45, 113, 134, 120, 179, 180, 181, 136, 114, 186, + 187, 188, 189, 137, 312, 138, 141, 147, 142, 316, + 190, 143, 196, 198, 146, 150, 151, 215, 216, 217, + 218, 219, 172, 221, 222, 223, 224, 225, 226, 227, + 192, 154, 230, 155, 174, 121, 156, 238, 140, 197, + 199, 200, 157, 158, 159, 160, 161, 140, 266, 162, + 163, 93, 94, 95, 96, 97, 98, 99, 164, 201, + 100, 165, 166, 183, 184, 185, 186, 187, 188, 189, + 167, 202, 204, 168, 214, 193, 183, 184, 185, 186, + 187, 188, 189, 205, 118, 119, 169, 212, 177, 178, + 277, 179, 180, 181, 281, 208, 211, 213, 140, 181, + 233, 236, 239, 242, 210, 243, 244, 290, 291, 247, + 252, 261, 262, 263, 101, 268, 269, 270, 273, 102, + 103, 104, 105, 106, 274, 107, 108, 109, 110, 279, + 140, 111, 112, 283, 140, 254, 285, 284, 293, 93, + 94, 95, 96, 97, 98, 99, 113, 286, 100, 287, + 296, 288, 297, 114, 298, 93, 94, 95, 96, 97, + 98, 99, 301, 299, 100, 302, 303, 306, 308, 311, + 313, 314, 317, 183, 184, 185, 186, 187, 188, 189, + 320, 327, 321, 318, 260, 324, 322, 325, 330, 329, + 209, 331, 332, 246, 338, 235, 153, 292, 38, 310, + 282, 23, 101, 29, 0, 0, 0, 102, 103, 104, + 105, 106, 0, 107, 108, 109, 110, 0, 101, 111, + 112, 41, 0, 102, 103, 104, 105, 106, 0, 107, + 108, 109, 110, 0, 113, 111, 112, 0, 0, 0, + 42, 114, 253, 254, 0, 43, 44, 45, 0, 0, + 113, 177, 178, 46, 179, 180, 181, 114, 0, 0, + 47, 0, 0, 48, 0, 49, 0, 0, 50, 0, + 182, 0, 0, 0, 0, 0, 0, 0, 0, 51, + 52, 53, 0, 0, 0, 41, 0, 0, 54, 0, + 0, 0, 0, 55, 56, 0, 0, 57, 58, 59, + 0, 0, 60, 139, 42, 0, 0, 0, 0, 43, + 44, 45, 0, 0, 0, 0, 0, 46, 0, 0, + 0, 61, 0, 0, 47, 0, 0, 48, 0, 49, + 0, 0, 50, 0, 0, 0, 183, 184, 185, 186, + 187, 188, 189, 51, 52, 53, 0, 0, 0, 41, + 0, 0, 54, 0, 0, 0, 0, 55, 56, 0, + 0, 57, 58, 59, 0, 0, 60, 259, 42, 0, + 0, 0, 0, 43, 44, 45, 0, 0, 0, 177, + 178, 46, 179, 180, 181, 61, 0, 0, 47, 0, + 0, 48, 0, 49, 0, 0, 50, 0, 0, 0, + 0, 191, 0, 0, 0, 0, 0, 51, 52, 53, + 0, 0, 0, 41, 0, 0, 54, 0, 0, 0, + 0, 55, 56, 0, 0, 57, 58, 59, 0, 0, + 60, 323, 42, 0, 0, 0, 0, 43, 44, 45, + 0, 0, 0, 0, 0, 46, 0, 0, 0, 61, + 0, 0, 47, 0, 0, 48, 0, 49, 0, 0, + 50, 0, 0, 0, 183, 184, 185, 186, 187, 188, + 189, 51, 52, 53, 177, 178, 41, 179, 180, 181, + 54, 0, 0, 0, 0, 55, 56, 0, 0, 57, + 58, 59, 0, 0, 60, 42, 0, 0, 0, 0, + 43, 44, 45, 0, 0, 0, 267, 0, 46, 0, + 0, 0, 0, 61, 0, 47, 0, 0, 48, 0, + 49, 177, 178, 50, 179, 180, 181, 0, 177, 178, + 0, 179, 180, 181, 51, 52, 53, 0, 0, 0, + 300, 177, 178, 54, 179, 180, 181, 0, 55, 56, + 305, 0, 57, 58, 59, 0, 0, 60, 0, 183, + 184, 185, 186, 187, 188, 189, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 183, 184, 185, 186, + 187, 188, 189, 183, 184, 185, 186, 187, 188, 189, + 0, 0, 0, 0, 0, 0, 183, 184, 185, 186, + 187, 188, 189 +}; + +static const short int yycheck[] = +{ + 12, 63, 42, 43, 73, 45, 182, 40, 38, 15, + 49, 9, 24, 26, 65, 191, 3, 4, 5, 6, + 7, 8, 9, 57, 190, 12, 35, 19, 20, 21, + 
22, 106, 24, 108, 68, 57, 15, 59, 46, 47, + 91, 50, 208, 106, 16, 108, 106, 9, 108, 36, + 37, 64, 92, 106, 0, 108, 90, 106, 104, 108, + 100, 106, 106, 108, 108, 9, 64, 17, 18, 108, + 100, 25, 9, 113, 114, 108, 105, 253, 104, 66, + 9, 9, 25, 104, 71, 72, 73, 74, 75, 63, + 77, 78, 79, 80, 100, 101, 83, 84, 10, 11, + 9, 13, 14, 15, 49, 9, 39, 147, 9, 9, + 35, 98, 9, 100, 13, 14, 15, 88, 105, 98, + 99, 100, 101, 88, 300, 9, 104, 40, 105, 305, + 105, 104, 39, 9, 104, 104, 104, 177, 178, 179, + 180, 181, 103, 183, 184, 185, 186, 187, 188, 189, + 105, 104, 192, 104, 103, 195, 104, 197, 220, 17, + 52, 9, 104, 104, 104, 104, 104, 229, 237, 104, + 104, 3, 4, 5, 6, 7, 8, 9, 104, 9, + 12, 104, 104, 95, 96, 97, 98, 99, 100, 101, + 104, 60, 49, 104, 106, 105, 95, 96, 97, 98, + 99, 100, 101, 105, 36, 37, 104, 70, 10, 11, + 250, 13, 14, 15, 254, 105, 104, 70, 280, 15, + 9, 9, 9, 105, 264, 58, 9, 267, 268, 107, + 55, 9, 106, 108, 66, 95, 108, 9, 9, 71, + 72, 73, 74, 75, 105, 77, 78, 79, 80, 9, + 312, 83, 84, 27, 316, 31, 106, 32, 270, 3, + 4, 5, 6, 7, 8, 9, 98, 9, 12, 9, + 62, 41, 106, 105, 110, 3, 4, 5, 6, 7, + 8, 9, 28, 107, 12, 51, 92, 105, 61, 9, + 17, 44, 3, 95, 96, 97, 98, 99, 100, 101, + 105, 9, 93, 89, 106, 106, 45, 12, 9, 94, + 54, 32, 8, 204, 9, 195, 73, 269, 33, 295, + 256, 14, 66, 26, -1, -1, -1, 71, 72, 73, + 74, 75, -1, 77, 78, 79, 80, -1, 66, 83, + 84, 9, -1, 71, 72, 73, 74, 75, -1, 77, + 78, 79, 80, -1, 98, 83, 84, -1, -1, -1, + 28, 105, 30, 31, -1, 33, 34, 35, -1, -1, + 98, 10, 11, 41, 13, 14, 15, 105, -1, -1, + 48, -1, -1, 51, -1, 53, -1, -1, 56, -1, + 29, -1, -1, -1, -1, -1, -1, -1, -1, 67, + 68, 69, -1, -1, -1, 9, -1, -1, 76, -1, + -1, -1, -1, 81, 82, -1, -1, 85, 86, 87, + -1, -1, 90, 27, 28, -1, -1, -1, -1, 33, + 34, 35, -1, -1, -1, -1, -1, 41, -1, -1, + -1, 109, -1, -1, 48, -1, -1, 51, -1, 53, + -1, -1, 56, -1, -1, -1, 95, 96, 97, 98, + 99, 100, 101, 67, 68, 69, -1, -1, -1, 9, + -1, -1, 76, -1, -1, -1, -1, 81, 82, -1, + -1, 85, 86, 87, -1, -1, 90, 27, 28, -1, + -1, -1, -1, 33, 34, 35, -1, -1, -1, 10, + 11, 41, 13, 14, 15, 109, -1, -1, 48, -1, + -1, 51, -1, 53, -1, -1, 56, -1, -1, -1, + -1, 32, -1, -1, -1, -1, -1, 67, 68, 69, + -1, -1, -1, 9, -1, -1, 76, -1, -1, -1, + -1, 81, 82, -1, -1, 85, 86, 87, -1, -1, + 90, 27, 28, -1, -1, -1, -1, 33, 34, 35, + -1, -1, -1, -1, -1, 41, -1, -1, -1, 109, + -1, -1, 48, -1, -1, 51, -1, 53, -1, -1, + 56, -1, -1, -1, 95, 96, 97, 98, 99, 100, + 101, 67, 68, 69, 10, 11, 9, 13, 14, 15, + 76, -1, -1, -1, -1, 81, 82, -1, -1, 85, + 86, 87, -1, -1, 90, 28, -1, -1, -1, -1, + 33, 34, 35, -1, -1, -1, 42, -1, 41, -1, + -1, -1, -1, 109, -1, 48, -1, -1, 51, -1, + 53, 10, 11, 56, 13, 14, 15, -1, 10, 11, + -1, 13, 14, 15, 67, 68, 69, -1, -1, -1, + 29, 10, 11, 76, 13, 14, 15, -1, 81, 82, + 32, -1, 85, 86, 87, -1, -1, 90, -1, 95, + 96, 97, 98, 99, 100, 101, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 109, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 95, 96, 97, 98, + 99, 100, 101, 95, 96, 97, 98, 99, 100, 101, + -1, -1, -1, -1, -1, -1, 95, 96, 97, 98, + 99, 100, 101 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. 
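Before the table itself, a sketch of how the three tables above cooperate;
this mirrors the lookup in yybackup further down and is illustrative
pseudocode, not part of the generated parser:

	yyn = yypact[yystate];			base index into yytable
	if (yyn == YYPACT_NINF)			no look-ahead is consulted:
		reduce by yydefact[yystate];	take the default rule
	else {
		yyn += yytoken;
		if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
			reduce by yydefact[yystate];
		else
			act on yytable[yyn];	positive: shift that state;
						negative: reduce by -yytable[yyn];
						0 or YYTABLE_NINF: syntax error
	}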
*/ +static const unsigned char yystos[] = +{ + 0, 16, 112, 180, 9, 0, 104, 105, 9, 172, + 173, 17, 18, 106, 108, 19, 20, 21, 22, 24, + 171, 171, 25, 172, 9, 174, 175, 171, 64, 174, + 176, 177, 178, 179, 104, 65, 91, 26, 178, 9, + 9, 9, 28, 33, 34, 35, 41, 48, 51, 53, + 56, 67, 68, 69, 76, 81, 82, 85, 86, 87, + 90, 109, 113, 114, 118, 119, 120, 133, 134, 135, + 139, 140, 141, 142, 143, 144, 145, 146, 150, 151, + 152, 153, 154, 155, 156, 157, 164, 168, 169, 170, + 25, 104, 63, 3, 4, 5, 6, 7, 8, 9, + 12, 66, 71, 72, 73, 74, 75, 77, 78, 79, + 80, 83, 84, 98, 105, 115, 116, 115, 36, 37, + 100, 115, 125, 126, 127, 9, 49, 9, 39, 57, + 59, 166, 9, 9, 9, 133, 88, 88, 9, 27, + 113, 104, 105, 104, 50, 133, 104, 40, 128, 138, + 104, 104, 128, 138, 104, 104, 104, 104, 104, 104, + 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, + 133, 115, 103, 115, 103, 115, 115, 10, 11, 13, + 14, 15, 29, 95, 96, 97, 98, 99, 100, 101, + 105, 32, 105, 105, 49, 108, 39, 17, 9, 52, + 9, 9, 60, 167, 49, 105, 115, 124, 105, 54, + 115, 104, 70, 70, 106, 115, 115, 115, 115, 115, + 114, 115, 115, 115, 115, 115, 115, 115, 124, 114, + 115, 38, 100, 9, 123, 125, 9, 122, 115, 9, + 136, 137, 105, 58, 9, 121, 123, 107, 117, 106, + 108, 124, 55, 30, 31, 147, 148, 149, 106, 27, + 106, 9, 106, 108, 40, 108, 128, 42, 95, 108, + 9, 158, 159, 9, 105, 106, 108, 115, 106, 9, + 114, 115, 147, 27, 32, 106, 9, 9, 41, 129, + 115, 115, 136, 171, 106, 108, 62, 106, 110, 107, + 29, 28, 51, 92, 130, 32, 105, 160, 61, 163, + 158, 9, 114, 17, 44, 132, 114, 3, 89, 161, + 105, 93, 45, 27, 106, 12, 162, 9, 165, 94, + 9, 32, 8, 106, 108, 46, 47, 131, 9 +}; + +#if ! defined (YYSIZE_T) && defined (__SIZE_TYPE__) +# define YYSIZE_T __SIZE_TYPE__ +#endif +#if ! defined (YYSIZE_T) && defined (size_t) +# define YYSIZE_T size_t +#endif +#if ! defined (YYSIZE_T) +# if defined (__STDC__) || defined (__cplusplus) +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# endif +#endif +#if ! defined (YYSIZE_T) +# define YYSIZE_T unsigned int +#endif + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. */ + +#define YYFAIL goto yyerrlab + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + yytoken = YYTRANSLATE (yychar); \ + YYPOPSTACK; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror ("syntax error: cannot back up");\ + YYERROR; \ + } \ +while (0) + + +#define YYTERROR 1 +#define YYERRCODE 256 + + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined).
*/ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (N) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (0) +#endif + + +/* YY_LOCATION_PRINT -- Print the location on the stream. + This macro was not mandated originally: define only if we know + we won't break user code: when these are the locations we know. */ + +#ifndef YY_LOCATION_PRINT +# if YYLTYPE_IS_TRIVIAL +# define YY_LOCATION_PRINT(File, Loc) \ + fprintf (File, "%d.%d-%d.%d", \ + (Loc).first_line, (Loc).first_column, \ + (Loc).last_line, (Loc).last_column) +# else +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +# endif +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (YYLEX_PARAM) +#else +# define YYLEX yylex () +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include <stdio.h> /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (0) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yysymprint (stderr, \ + Type, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (0) + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yy_stack_print (short int *bottom, short int *top) +#else +static void +yy_stack_print (bottom, top) + short int *bottom; + short int *top; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (/* Nothing. */; bottom <= top; ++bottom) + YYFPRINTF (stderr, " %d", *bottom); + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (0) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yy_reduce_print (int yyrule) +#else +static void +yy_reduce_print (yyrule) + int yyrule; +#endif +{ + int yyi; + unsigned int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %u), ", + yyrule - 1, yylno); + /* Print the symbols being reduced, and their result. */ + for (yyi = yyprhs[yyrule]; 0 <= yyrhs[yyi]; yyi++) + YYFPRINTF (stderr, "%s ", yytname [yyrhs[yyi]]); + YYFPRINTF (stderr, "-> %s\n", yytname [yyr1[yyrule]]); +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (Rule); \ +} while (0) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist.
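To see that trace, compile the generated file with YYDEBUG defined to a
nonzero value and set the flag before parsing; a minimal, illustrative
fragment (within InnoDB the grammar is normally driven through pars_sql()
rather than by calling yyparse() directly):

	extern int yydebug;

	yydebug = 1;	echo every shift, reduction and state change
	yyparse();	the trace goes to stderr through YYFPRINTF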
*/ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + SIZE_MAX < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined (__GLIBC__) && defined (_STRING_H) +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +static YYSIZE_T +# if defined (__STDC__) || defined (__cplusplus) +yystrlen (const char *yystr) +# else +yystrlen (yystr) + const char *yystr; +# endif +{ + register const char *yys = yystr; + + while (*yys++ != '\0') + continue; + + return yys - yystr - 1; +} +# endif +# endif + +# ifndef yystpcpy +# if defined (__GLIBC__) && defined (_STRING_H) && defined (_GNU_SOURCE) +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +static char * +# if defined (__STDC__) || defined (__cplusplus) +yystpcpy (char *yydest, const char *yysrc) +# else +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +# endif +{ + register char *yyd = yydest; + register const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +#endif /* !YYERROR_VERBOSE */ + + + +#if YYDEBUG +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yysymprint (FILE *yyoutput, int yytype, YYSTYPE *yyvaluep) +#else +static void +yysymprint (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + /* Pacify ``unused variable'' warnings. */ + (void) yyvaluep; + + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# endif + switch (yytype) + { + default: + break; + } + YYFPRINTF (yyoutput, ")"); +} + +#endif /* ! YYDEBUG */ +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +#if defined (__STDC__) || defined (__cplusplus) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yymsg, yytype, yyvaluep) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + /* Pacify ``unused variable'' warnings. */ + (void) yyvaluep; + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + + default: + break; + } +} + + +/* Prevent warnings from -Wmissing-prototypes. */ + +#ifdef YYPARSE_PARAM +# if defined (__STDC__) || defined (__cplusplus) +UNIV_INTERN int yyparse (void *YYPARSE_PARAM); +# else +UNIV_INTERN int yyparse (); +# endif +#else /* ! YYPARSE_PARAM */ +#if defined (__STDC__) || defined (__cplusplus) +UNIV_INTERN int yyparse (void); +#else +UNIV_INTERN int yyparse (); +#endif +#endif /* ! 
YYPARSE_PARAM */ + + + +/* The look-ahead symbol. */ +static int yychar; + +/* The semantic value of the look-ahead symbol. */ +UNIV_INTERN YYSTYPE yylval; + +/* Number of syntax errors so far. */ +static int yynerrs; + + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +# if defined (__STDC__) || defined (__cplusplus) +UNIV_INTERN int yyparse (void *YYPARSE_PARAM) +# else +UNIV_INTERN int yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +# endif +#else /* ! YYPARSE_PARAM */ +#if defined (__STDC__) || defined (__cplusplus) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ + + register int yystate; + register int yyn; + int yyresult; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + /* Look-ahead token as an internal (translated) token number. */ + int yytoken = 0; + + /* Three stacks and their tools: + `yyss': related to states, + `yyvs': related to semantic values, + `yyls': related to locations. + + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + short int yyssa[YYINITDEPTH]; + short int *yyss = yyssa; + register short int *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs = yyvsa; + register YYSTYPE *yyvsp; + + + +#define YYPOPSTACK (yyvsp--, yyssp--) + + YYSIZE_T yystacksize = YYINITDEPTH; + + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + + + /* When reducing, the number of symbols on the RHS of the reduced + rule. */ + int yylen; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. */ + + yyssp = yyss; + yyvsp = yyvs; + + + yyvsp[0] = yylval; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. so pushing a state here evens the stacks. + */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + short int *yyss1 = yyss; + + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow ("parser stack overflow", + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyoverflowlab; +# else + /* Extend the stack our own way. 
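Worked through with the defaults defined above (YYINITDEPTH 200,
YYMAXDEPTH 10000), the doubling below grows the stacks through 200, 400,
800, 1600, 3200, 6400 and finally 10000 elements; once YYMAXDEPTH is
reached, a still deeper parse aborts through yyoverflowlab with
"parser stack overflow". (Illustrative arithmetic, not a quote from the
bison skeleton.)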
*/ + if (YYMAXDEPTH <= yystacksize) + goto yyoverflowlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + short int *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyoverflowlab; + YYSTACK_RELOCATE (yyss); + YYSTACK_RELOCATE (yyvs); + +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + +/* Do appropriate processing given the current state. */ +/* Read a look-ahead token if we need one and don't already have one. */ +/* yyresume: */ + + /* First try to decide what to do without reference to look-ahead token. */ + + yyn = yypact[yystate]; + if (yyn == YYPACT_NINF) + goto yydefault; + + /* Not known => get a look-ahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yyn == 0 || yyn == YYTABLE_NINF) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + if (yyn == YYFINAL) + YYACCEPT; + + /* Shift the look-ahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the token being shifted unless it is eof. */ + if (yychar != YYEOF) + yychar = YYEMPTY; + + *++yyvsp = yylval; + + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + yystate = yyn; + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. 
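In the `$N' notation of the grammar file, a rule with yylen right-hand-side
symbols has $1 at yyvsp[1 - yylen], $2 at yyvsp[2 - yylen], and so on up to
$n at yyvsp[0]; the assignment below is therefore exactly the default
action $$ = $1. An illustrative reading of one real action from this
parser: for rule 36, exp ::= exp '+' exp, case 36 below computes
pars_op('+', (yyvsp[-2]), (yyvsp[0])), i.e. pars_op('+', $1, $3).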
*/ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 25: +#line 166 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 26: +#line 168 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;} + break; + + case 27: +#line 172 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 28: +#line 174 "pars0grm.y" + { (yyval) = pars_func((yyvsp[-3]), (yyvsp[-1])); ;} + break; + + case 29: +#line 175 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 30: +#line 176 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 31: +#line 177 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 32: +#line 178 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 33: +#line 179 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 34: +#line 180 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 35: +#line 181 "pars0grm.y" + { (yyval) = (yyvsp[0]);;} + break; + + case 36: +#line 182 "pars0grm.y" + { (yyval) = pars_op('+', (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 37: +#line 183 "pars0grm.y" + { (yyval) = pars_op('-', (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 38: +#line 184 "pars0grm.y" + { (yyval) = pars_op('*', (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 39: +#line 185 "pars0grm.y" + { (yyval) = pars_op('/', (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 40: +#line 186 "pars0grm.y" + { (yyval) = pars_op('-', (yyvsp[0]), NULL); ;} + break; + + case 41: +#line 187 "pars0grm.y" + { (yyval) = (yyvsp[-1]); ;} + break; + + case 42: +#line 188 "pars0grm.y" + { (yyval) = pars_op('=', (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 43: +#line 189 "pars0grm.y" + { (yyval) = pars_op('<', (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 44: +#line 190 "pars0grm.y" + { (yyval) = pars_op('>', (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 45: +#line 191 "pars0grm.y" + { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 46: +#line 192 "pars0grm.y" + { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 47: +#line 193 "pars0grm.y" + { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 48: +#line 194 "pars0grm.y" + { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 49: +#line 195 "pars0grm.y" + { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 50: +#line 196 "pars0grm.y" + { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[0]), NULL); ;} + break; + + case 51: +#line 198 "pars0grm.y" + { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;} + break; + + case 52: +#line 200 "pars0grm.y" + { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;} + break; + + case 53: +#line 204 "pars0grm.y" + { (yyval) = &pars_to_char_token; ;} + break; + + case 54: +#line 205 "pars0grm.y" + { (yyval) = &pars_to_number_token; ;} + break; + + case 55: +#line 206 "pars0grm.y" + { (yyval) = &pars_to_binary_token; ;} + break; + + case 56: +#line 208 "pars0grm.y" + { (yyval) = &pars_binary_to_number_token; ;} + break; + + case 57: +#line 209 "pars0grm.y" + { (yyval) = &pars_substr_token; ;} + break; + + case 58: +#line 210 "pars0grm.y" + { (yyval) = &pars_concat_token; ;} + break; + + case 59: +#line 211 "pars0grm.y" + { (yyval) = &pars_instr_token; ;} + break; + + case 60: +#line 212 "pars0grm.y" + { (yyval) = &pars_length_token; ;} + break; + + case 61: +#line 213 "pars0grm.y" + { (yyval) = &pars_sysdate_token; ;} + break; + + case 62: +#line 214 
"pars0grm.y" + { (yyval) = &pars_rnd_token; ;} + break; + + case 63: +#line 215 "pars0grm.y" + { (yyval) = &pars_rnd_str_token; ;} + break; + + case 67: +#line 226 "pars0grm.y" + { (yyval) = pars_stored_procedure_call((yyvsp[-4])); ;} + break; + + case 68: +#line 231 "pars0grm.y" + { (yyval) = pars_procedure_call((yyvsp[-3]), (yyvsp[-1])); ;} + break; + + case 69: +#line 235 "pars0grm.y" + { (yyval) = &pars_replstr_token; ;} + break; + + case 70: +#line 236 "pars0grm.y" + { (yyval) = &pars_printf_token; ;} + break; + + case 71: +#line 237 "pars0grm.y" + { (yyval) = &pars_assert_token; ;} + break; + + case 72: +#line 241 "pars0grm.y" + { (yyval) = (yyvsp[-2]); ;} + break; + + case 73: +#line 245 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 74: +#line 247 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 75: +#line 251 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 76: +#line 252 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 77: +#line 254 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 78: +#line 258 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 79: +#line 259 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0]));;} + break; + + case 80: +#line 260 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 81: +#line 264 "pars0grm.y" + { (yyval) = (yyvsp[0]); ;} + break; + + case 82: +#line 266 "pars0grm.y" + { (yyval) = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + sym_tab_add_int_lit( + pars_sym_tab_global, 1))); ;} + break; + + case 83: +#line 271 "pars0grm.y" + { (yyval) = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + pars_func(&pars_distinct_token, + que_node_list_add_last( + NULL, (yyvsp[-1]))))); ;} + break; + + case 84: +#line 277 "pars0grm.y" + { (yyval) = pars_func(&pars_sum_token, + que_node_list_add_last(NULL, + (yyvsp[-1]))); ;} + break; + + case 85: +#line 283 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 86: +#line 284 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 87: +#line 286 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 88: +#line 290 "pars0grm.y" + { (yyval) = pars_select_list(&pars_star_denoter, + NULL); ;} + break; + + case 89: +#line 293 "pars0grm.y" + { (yyval) = pars_select_list((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 90: +#line 294 "pars0grm.y" + { (yyval) = pars_select_list((yyvsp[0]), NULL); ;} + break; + + case 91: +#line 298 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 92: +#line 299 "pars0grm.y" + { (yyval) = (yyvsp[0]); ;} + break; + + case 93: +#line 303 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 94: +#line 305 "pars0grm.y" + { (yyval) = &pars_update_token; ;} + break; + + case 95: +#line 309 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 96: +#line 311 "pars0grm.y" + { yyval = &pars_share_token; ;} + break; + + case 97: +#line 315 "pars0grm.y" + { (yyval) = &pars_asc_token; ;} + break; + + case 98: +#line 316 "pars0grm.y" + { (yyval) = &pars_asc_token; ;} + break; + + case 99: +#line 317 "pars0grm.y" + { (yyval) = &pars_desc_token; ;} + break; + + case 100: +#line 321 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 101: +#line 323 "pars0grm.y" + { (yyval) = pars_order_by((yyvsp[-1]), (yyvsp[0])); ;} + break; + + case 102: +#line 332 
"pars0grm.y" + { (yyval) = pars_select_statement((yyvsp[-6]), (yyvsp[-4]), (yyvsp[-3]), + (yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;} + break; + + case 103: +#line 338 "pars0grm.y" + { (yyval) = (yyvsp[0]); ;} + break; + + case 104: +#line 343 "pars0grm.y" + { (yyval) = pars_insert_statement((yyvsp[-4]), (yyvsp[-1]), NULL); ;} + break; + + case 105: +#line 345 "pars0grm.y" + { (yyval) = pars_insert_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;} + break; + + case 106: +#line 349 "pars0grm.y" + { (yyval) = pars_column_assignment((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 107: +#line 353 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 108: +#line 355 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 109: +#line 361 "pars0grm.y" + { (yyval) = (yyvsp[0]); ;} + break; + + case 110: +#line 367 "pars0grm.y" + { (yyval) = pars_update_statement_start(FALSE, + (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 111: +#line 373 "pars0grm.y" + { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;} + break; + + case 112: +#line 378 "pars0grm.y" + { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;} + break; + + case 113: +#line 383 "pars0grm.y" + { (yyval) = pars_update_statement_start(TRUE, + (yyvsp[0]), NULL); ;} + break; + + case 114: +#line 389 "pars0grm.y" + { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;} + break; + + case 115: +#line 394 "pars0grm.y" + { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;} + break; + + case 116: +#line 399 "pars0grm.y" + { (yyval) = pars_row_printf_statement((yyvsp[0])); ;} + break; + + case 117: +#line 404 "pars0grm.y" + { (yyval) = pars_assignment_statement((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 118: +#line 410 "pars0grm.y" + { (yyval) = pars_elsif_element((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 119: +#line 414 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 120: +#line 416 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;} + break; + + case 121: +#line 420 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 122: +#line 422 "pars0grm.y" + { (yyval) = (yyvsp[0]); ;} + break; + + case 123: +#line 423 "pars0grm.y" + { (yyval) = (yyvsp[0]); ;} + break; + + case 124: +#line 430 "pars0grm.y" + { (yyval) = pars_if_statement((yyvsp[-5]), (yyvsp[-3]), (yyvsp[-2])); ;} + break; + + case 125: +#line 436 "pars0grm.y" + { (yyval) = pars_while_statement((yyvsp[-4]), (yyvsp[-2])); ;} + break; + + case 126: +#line 444 "pars0grm.y" + { (yyval) = pars_for_statement((yyvsp[-8]), (yyvsp[-6]), (yyvsp[-4]), (yyvsp[-2])); ;} + break; + + case 127: +#line 448 "pars0grm.y" + { (yyval) = pars_exit_statement(); ;} + break; + + case 128: +#line 452 "pars0grm.y" + { (yyval) = pars_return_statement(); ;} + break; + + case 129: +#line 457 "pars0grm.y" + { (yyval) = pars_open_statement( + ROW_SEL_OPEN_CURSOR, (yyvsp[0])); ;} + break; + + case 130: +#line 463 "pars0grm.y" + { (yyval) = pars_open_statement( + ROW_SEL_CLOSE_CURSOR, (yyvsp[0])); ;} + break; + + case 131: +#line 469 "pars0grm.y" + { (yyval) = pars_fetch_statement((yyvsp[-2]), (yyvsp[0]), NULL); ;} + break; + + case 132: +#line 471 "pars0grm.y" + { (yyval) = pars_fetch_statement((yyvsp[-2]), NULL, (yyvsp[0])); ;} + break; + + case 133: +#line 476 "pars0grm.y" + { (yyval) = pars_column_def((yyvsp[-4]), (yyvsp[-3]), (yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;} + break; + + case 134: +#line 480 "pars0grm.y" 
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 135: +#line 482 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 136: +#line 486 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 137: +#line 488 "pars0grm.y" + { (yyval) = (yyvsp[-1]); ;} + break; + + case 138: +#line 492 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 139: +#line 494 "pars0grm.y" + { (yyval) = &pars_int_token; + /* pass any non-NULL pointer */ ;} + break; + + case 140: +#line 499 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 141: +#line 501 "pars0grm.y" + { (yyval) = &pars_int_token; + /* pass any non-NULL pointer */ ;} + break; + + case 142: +#line 506 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 143: +#line 508 "pars0grm.y" + { (yyval) = &pars_int_token; + /* pass any non-NULL pointer */ ;} + break; + + case 144: +#line 515 "pars0grm.y" + { (yyval) = pars_create_table((yyvsp[-4]), (yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 145: +#line 519 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 146: +#line 521 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 147: +#line 525 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 148: +#line 526 "pars0grm.y" + { (yyval) = &pars_unique_token; ;} + break; + + case 149: +#line 530 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 150: +#line 531 "pars0grm.y" + { (yyval) = &pars_clustered_token; ;} + break; + + case 151: +#line 539 "pars0grm.y" + { (yyval) = pars_create_index((yyvsp[-8]), (yyvsp[-7]), (yyvsp[-5]), (yyvsp[-3]), (yyvsp[-1])); ;} + break; + + case 152: +#line 544 "pars0grm.y" + { (yyval) = pars_commit_statement(); ;} + break; + + case 153: +#line 549 "pars0grm.y" + { (yyval) = pars_rollback_statement(); ;} + break; + + case 154: +#line 553 "pars0grm.y" + { (yyval) = &pars_int_token; ;} + break; + + case 155: +#line 554 "pars0grm.y" + { (yyval) = &pars_int_token; ;} + break; + + case 156: +#line 555 "pars0grm.y" + { (yyval) = &pars_char_token; ;} + break; + + case 157: +#line 556 "pars0grm.y" + { (yyval) = &pars_binary_token; ;} + break; + + case 158: +#line 557 "pars0grm.y" + { (yyval) = &pars_blob_token; ;} + break; + + case 159: +#line 562 "pars0grm.y" + { (yyval) = pars_parameter_declaration((yyvsp[-2]), + PARS_INPUT, (yyvsp[0])); ;} + break; + + case 160: +#line 565 "pars0grm.y" + { (yyval) = pars_parameter_declaration((yyvsp[-2]), + PARS_OUTPUT, (yyvsp[0])); ;} + break; + + case 161: +#line 570 "pars0grm.y" + { (yyval) = NULL; ;} + break; + + case 162: +#line 571 "pars0grm.y" + { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;} + break; + + case 163: +#line 573 "pars0grm.y" + { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;} + break; + + case 164: +#line 578 "pars0grm.y" + { (yyval) = pars_variable_declaration((yyvsp[-2]), (yyvsp[-1])); ;} + break; + + case 168: +#line 590 "pars0grm.y" + { (yyval) = pars_cursor_declaration((yyvsp[-3]), (yyvsp[-1])); ;} + break; + + case 169: +#line 595 "pars0grm.y" + { (yyval) = pars_function_declaration((yyvsp[-1])); ;} + break; + + case 175: +#line 616 "pars0grm.y" + { (yyval) = pars_procedure_definition((yyvsp[-9]), (yyvsp[-7]), + (yyvsp[-1])); ;} + break; + + + } + +/* Line 1010 of yacc.c. */ +#line 2345 "pars0grm.c" + + yyvsp -= yylen; + yyssp -= yylen; + + + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + + /* Now `shift' the result of the reduction. 
Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. +| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if YYERROR_VERBOSE + yyn = yypact[yystate]; + + if (YYPACT_NINF < yyn && yyn < YYLAST) + { + YYSIZE_T yysize = 0; + int yytype = YYTRANSLATE (yychar); + const char* yyprefix; + char *yymsg; + int yyx; + + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yycount = 0; + + yyprefix = ", expecting "; + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR) + { + yysize += yystrlen (yyprefix) + yystrlen (yytname [yyx]); + yycount += 1; + if (yycount == 5) + { + yysize = 0; + break; + } + } + yysize += (sizeof ("syntax error, unexpected ") + + yystrlen (yytname[yytype])); + yymsg = (char *) YYSTACK_ALLOC (yysize); + if (yymsg != 0) + { + char *yyp = yystpcpy (yymsg, "syntax error, unexpected "); + yyp = yystpcpy (yyp, yytname[yytype]); + + if (yycount < 5) + { + yyprefix = ", expecting "; + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR) + { + yyp = yystpcpy (yyp, yyprefix); + yyp = yystpcpy (yyp, yytname[yyx]); + yyprefix = " or "; + } + } + yyerror (yymsg); + YYSTACK_FREE (yymsg); + } + else + yyerror ("syntax error; also virtual memory exhausted"); + } + else +#endif /* YYERROR_VERBOSE */ + yyerror ("syntax error"); + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse look-ahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* If at end of input, pop the error token, + then the rest of the stack, then return failure. */ + if (yychar == YYEOF) + for (;;) + { + + YYPOPSTACK; + if (yyssp == yyss) + YYABORT; + yydestruct ("Error: popping", + yystos[*yyssp], yyvsp); + } + } + else + { + yydestruct ("Error: discarding", yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse look-ahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + +#ifdef __GNUC__ + /* Pacify GCC when the user code never invokes YYERROR and the label + yyerrorlab therefore never appears in user code. */ + if (0) + goto yyerrorlab; +#endif + +yyvsp -= yylen; + yyssp -= yylen; + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. 
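In other words, ordinary error messages resume only once three tokens in a
row have been shifted cleanly (yybackup above decrements yyerrstatus on
every shift). The loop below then pops states until it finds one that can
shift the error token; illustratively, it is looking for a state where

	yyn = yypact[yystate] + YYTERROR;
	0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR
		&& 0 < yytable[yyn]

holds, exactly the test written out in the loop body.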
*/ + + for (;;) + { + yyn = yypact[yystate]; + if (yyn != YYPACT_NINF) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", yystos[yystate], yyvsp); + YYPOPSTACK; + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + if (yyn == YYFINAL) + YYACCEPT; + + *++yyvsp = yylval; + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yydestruct ("Error: discarding lookahead", + yytoken, &yylval); + yychar = YYEMPTY; + yyresult = 1; + goto yyreturn; + +#ifndef yyoverflow +/*----------------------------------------------. +| yyoverflowlab -- parser overflow comes here. | +`----------------------------------------------*/ +yyoverflowlab: + yyerror ("parser stack overflow"); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif + return yyresult; +} + + +#line 620 "pars0grm.y" + + diff --git a/storage/xtradb/pars/pars0grm.y b/storage/xtradb/pars/pars0grm.y new file mode 100644 index 00000000000..14d64f1826f --- /dev/null +++ b/storage/xtradb/pars/pars0grm.y @@ -0,0 +1,635 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser: input file for the GNU Bison parser generator + +Look from pars0lex.l for instructions how to generate the C files for +the InnoDB parser. 
+ +Created 12/14/1997 Heikki Tuuri +*******************************************************/ + +%{ +/* The value of the semantic attribute is a pointer to a query tree node +que_node_t */ + +#include "univ.i" +#include <math.h> /* Can't be before univ.i */ +#include "pars0pars.h" +#include "mem0mem.h" +#include "que0types.h" +#include "que0que.h" +#include "row0sel.h" + +#define YYSTYPE que_node_t* + +/* #define __STDC__ */ + +int +yylex(void); +%} + +%token PARS_INT_LIT +%token PARS_FLOAT_LIT +%token PARS_STR_LIT +%token PARS_FIXBINARY_LIT +%token PARS_BLOB_LIT +%token PARS_NULL_LIT +%token PARS_ID_TOKEN +%token PARS_AND_TOKEN +%token PARS_OR_TOKEN +%token PARS_NOT_TOKEN +%token PARS_GE_TOKEN +%token PARS_LE_TOKEN +%token PARS_NE_TOKEN +%token PARS_PROCEDURE_TOKEN +%token PARS_IN_TOKEN +%token PARS_OUT_TOKEN +%token PARS_BINARY_TOKEN +%token PARS_BLOB_TOKEN +%token PARS_INT_TOKEN +%token PARS_INTEGER_TOKEN +%token PARS_FLOAT_TOKEN +%token PARS_CHAR_TOKEN +%token PARS_IS_TOKEN +%token PARS_BEGIN_TOKEN +%token PARS_END_TOKEN +%token PARS_IF_TOKEN +%token PARS_THEN_TOKEN +%token PARS_ELSE_TOKEN +%token PARS_ELSIF_TOKEN +%token PARS_LOOP_TOKEN +%token PARS_WHILE_TOKEN +%token PARS_RETURN_TOKEN +%token PARS_SELECT_TOKEN +%token PARS_SUM_TOKEN +%token PARS_COUNT_TOKEN +%token PARS_DISTINCT_TOKEN +%token PARS_FROM_TOKEN +%token PARS_WHERE_TOKEN +%token PARS_FOR_TOKEN +%token PARS_DDOT_TOKEN +%token PARS_READ_TOKEN +%token PARS_ORDER_TOKEN +%token PARS_BY_TOKEN +%token PARS_ASC_TOKEN +%token PARS_DESC_TOKEN +%token PARS_INSERT_TOKEN +%token PARS_INTO_TOKEN +%token PARS_VALUES_TOKEN +%token PARS_UPDATE_TOKEN +%token PARS_SET_TOKEN +%token PARS_DELETE_TOKEN +%token PARS_CURRENT_TOKEN +%token PARS_OF_TOKEN +%token PARS_CREATE_TOKEN +%token PARS_TABLE_TOKEN +%token PARS_INDEX_TOKEN +%token PARS_UNIQUE_TOKEN +%token PARS_CLUSTERED_TOKEN +%token PARS_DOES_NOT_FIT_IN_MEM_TOKEN +%token PARS_ON_TOKEN +%token PARS_ASSIGN_TOKEN +%token PARS_DECLARE_TOKEN +%token PARS_CURSOR_TOKEN +%token PARS_SQL_TOKEN +%token PARS_OPEN_TOKEN +%token PARS_FETCH_TOKEN +%token PARS_CLOSE_TOKEN +%token PARS_NOTFOUND_TOKEN +%token PARS_TO_CHAR_TOKEN +%token PARS_TO_NUMBER_TOKEN +%token PARS_TO_BINARY_TOKEN +%token PARS_BINARY_TO_NUMBER_TOKEN +%token PARS_SUBSTR_TOKEN +%token PARS_REPLSTR_TOKEN +%token PARS_CONCAT_TOKEN +%token PARS_INSTR_TOKEN +%token PARS_LENGTH_TOKEN +%token PARS_SYSDATE_TOKEN +%token PARS_PRINTF_TOKEN +%token PARS_ASSERT_TOKEN +%token PARS_RND_TOKEN +%token PARS_RND_STR_TOKEN +%token PARS_ROW_PRINTF_TOKEN +%token PARS_COMMIT_TOKEN +%token PARS_ROLLBACK_TOKEN +%token PARS_WORK_TOKEN +%token PARS_UNSIGNED_TOKEN +%token PARS_EXIT_TOKEN +%token PARS_FUNCTION_TOKEN +%token PARS_LOCK_TOKEN +%token PARS_SHARE_TOKEN +%token PARS_MODE_TOKEN + +%left PARS_AND_TOKEN PARS_OR_TOKEN +%left PARS_NOT_TOKEN +%left '=' '<' '>' PARS_GE_TOKEN PARS_LE_TOKEN +%left '-' '+' +%left '*' '/' +%left NEG /* negation--unary minus */ +%left '%' + +/* Grammar follows */ +%% + +top_statement: + procedure_definition ';' + +statement: + stored_procedure_call + | predefined_procedure_call ';' + | while_statement ';' + | for_statement ';' + | exit_statement ';' + | if_statement ';' + | return_statement ';' + | assignment_statement ';' + | select_statement ';' + | insert_statement ';' + | row_printf_statement ';' + | delete_statement_searched ';' + | delete_statement_positioned ';' + | update_statement_searched ';' + | update_statement_positioned ';' + | open_cursor_statement ';' + | fetch_statement ';' + | close_cursor_statement ';' + | 
commit_statement ';' + | rollback_statement ';' + | create_table ';' + | create_index ';' +; + +statement_list: + statement { $$ = que_node_list_add_last(NULL, $1); } + | statement_list statement + { $$ = que_node_list_add_last($1, $2); } +; + +exp: + PARS_ID_TOKEN { $$ = $1;} + | function_name '(' exp_list ')' + { $$ = pars_func($1, $3); } + | PARS_INT_LIT { $$ = $1;} + | PARS_FLOAT_LIT { $$ = $1;} + | PARS_STR_LIT { $$ = $1;} + | PARS_FIXBINARY_LIT { $$ = $1;} + | PARS_BLOB_LIT { $$ = $1;} + | PARS_NULL_LIT { $$ = $1;} + | PARS_SQL_TOKEN { $$ = $1;} + | exp '+' exp { $$ = pars_op('+', $1, $3); } + | exp '-' exp { $$ = pars_op('-', $1, $3); } + | exp '*' exp { $$ = pars_op('*', $1, $3); } + | exp '/' exp { $$ = pars_op('/', $1, $3); } + | '-' exp %prec NEG { $$ = pars_op('-', $2, NULL); } + | '(' exp ')' { $$ = $2; } + | exp '=' exp { $$ = pars_op('=', $1, $3); } + | exp '<' exp { $$ = pars_op('<', $1, $3); } + | exp '>' exp { $$ = pars_op('>', $1, $3); } + | exp PARS_GE_TOKEN exp { $$ = pars_op(PARS_GE_TOKEN, $1, $3); } + | exp PARS_LE_TOKEN exp { $$ = pars_op(PARS_LE_TOKEN, $1, $3); } + | exp PARS_NE_TOKEN exp { $$ = pars_op(PARS_NE_TOKEN, $1, $3); } + | exp PARS_AND_TOKEN exp{ $$ = pars_op(PARS_AND_TOKEN, $1, $3); } + | exp PARS_OR_TOKEN exp { $$ = pars_op(PARS_OR_TOKEN, $1, $3); } + | PARS_NOT_TOKEN exp { $$ = pars_op(PARS_NOT_TOKEN, $2, NULL); } + | PARS_ID_TOKEN '%' PARS_NOTFOUND_TOKEN + { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); } + | PARS_SQL_TOKEN '%' PARS_NOTFOUND_TOKEN + { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); } +; + +function_name: + PARS_TO_CHAR_TOKEN { $$ = &pars_to_char_token; } + | PARS_TO_NUMBER_TOKEN { $$ = &pars_to_number_token; } + | PARS_TO_BINARY_TOKEN { $$ = &pars_to_binary_token; } + | PARS_BINARY_TO_NUMBER_TOKEN + { $$ = &pars_binary_to_number_token; } + | PARS_SUBSTR_TOKEN { $$ = &pars_substr_token; } + | PARS_CONCAT_TOKEN { $$ = &pars_concat_token; } + | PARS_INSTR_TOKEN { $$ = &pars_instr_token; } + | PARS_LENGTH_TOKEN { $$ = &pars_length_token; } + | PARS_SYSDATE_TOKEN { $$ = &pars_sysdate_token; } + | PARS_RND_TOKEN { $$ = &pars_rnd_token; } + | PARS_RND_STR_TOKEN { $$ = &pars_rnd_str_token; } +; + +question_mark_list: + /* Nothing */ + | '?' + | question_mark_list ',' '?' 
+; + +stored_procedure_call: + '{' PARS_ID_TOKEN '(' question_mark_list ')' '}' + { $$ = pars_stored_procedure_call($2); } +; + +predefined_procedure_call: + predefined_procedure_name '(' exp_list ')' + { $$ = pars_procedure_call($1, $3); } +; + +predefined_procedure_name: + PARS_REPLSTR_TOKEN { $$ = &pars_replstr_token; } + | PARS_PRINTF_TOKEN { $$ = &pars_printf_token; } + | PARS_ASSERT_TOKEN { $$ = &pars_assert_token; } +; + +user_function_call: + PARS_ID_TOKEN '(' ')' { $$ = $1; } +; + +table_list: + PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); } + | table_list ',' PARS_ID_TOKEN + { $$ = que_node_list_add_last($1, $3); } +; + +variable_list: + /* Nothing */ { $$ = NULL; } + | PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); } + | variable_list ',' PARS_ID_TOKEN + { $$ = que_node_list_add_last($1, $3); } +; + +exp_list: + /* Nothing */ { $$ = NULL; } + | exp { $$ = que_node_list_add_last(NULL, $1);} + | exp_list ',' exp { $$ = que_node_list_add_last($1, $3); } +; + +select_item: + exp { $$ = $1; } + | PARS_COUNT_TOKEN '(' '*' ')' + { $$ = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + sym_tab_add_int_lit( + pars_sym_tab_global, 1))); } + | PARS_COUNT_TOKEN '(' PARS_DISTINCT_TOKEN PARS_ID_TOKEN ')' + { $$ = pars_func(&pars_count_token, + que_node_list_add_last(NULL, + pars_func(&pars_distinct_token, + que_node_list_add_last( + NULL, $4)))); } + | PARS_SUM_TOKEN '(' exp ')' + { $$ = pars_func(&pars_sum_token, + que_node_list_add_last(NULL, + $3)); } +; + +select_item_list: + /* Nothing */ { $$ = NULL; } + | select_item { $$ = que_node_list_add_last(NULL, $1); } + | select_item_list ',' select_item + { $$ = que_node_list_add_last($1, $3); } +; + +select_list: + '*' { $$ = pars_select_list(&pars_star_denoter, + NULL); } + | select_item_list PARS_INTO_TOKEN variable_list + { $$ = pars_select_list($1, $3); } + | select_item_list { $$ = pars_select_list($1, NULL); } +; + +search_condition: + /* Nothing */ { $$ = NULL; } + | PARS_WHERE_TOKEN exp { $$ = $2; } +; + +for_update_clause: + /* Nothing */ { $$ = NULL; } + | PARS_FOR_TOKEN PARS_UPDATE_TOKEN + { $$ = &pars_update_token; } +; + +lock_shared_clause: + /* Nothing */ { $$ = NULL; } + | PARS_LOCK_TOKEN PARS_IN_TOKEN PARS_SHARE_TOKEN PARS_MODE_TOKEN + { $$ = &pars_share_token; } +; + +order_direction: + /* Nothing */ { $$ = &pars_asc_token; } + | PARS_ASC_TOKEN { $$ = &pars_asc_token; } + | PARS_DESC_TOKEN { $$ = &pars_desc_token; } +; + +order_by_clause: + /* Nothing */ { $$ = NULL; } + | PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction + { $$ = pars_order_by($3, $4); } +; + +select_statement: + PARS_SELECT_TOKEN select_list + PARS_FROM_TOKEN table_list + search_condition + for_update_clause + lock_shared_clause + order_by_clause { $$ = pars_select_statement($2, $4, $5, + $6, $7, $8); } +; + +insert_statement_start: + PARS_INSERT_TOKEN PARS_INTO_TOKEN + PARS_ID_TOKEN { $$ = $3; } +; + +insert_statement: + insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')' + { $$ = pars_insert_statement($1, $4, NULL); } + | insert_statement_start select_statement + { $$ = pars_insert_statement($1, NULL, $2); } +; + +column_assignment: + PARS_ID_TOKEN '=' exp { $$ = pars_column_assignment($1, $3); } +; + +column_assignment_list: + column_assignment { $$ = que_node_list_add_last(NULL, $1); } + | column_assignment_list ',' column_assignment + { $$ = que_node_list_add_last($1, $3); } +; + +cursor_positioned: + PARS_WHERE_TOKEN + PARS_CURRENT_TOKEN PARS_OF_TOKEN + PARS_ID_TOKEN { $$ = $4; } +; + 
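As an illustration (not part of the patch itself; all names are invented), the select and cursor rules above, together with the control-flow and cursor statements defined next, accept procedures in InnoDB's internal SQL dialect. A minimal sketch, assuming a hypothetical table SYS_SAMPLE and a hypothetical bound literal :start_id; note that LOCK IN SHARE MODE precedes ORDER BY, in the clause order fixed by the select_statement rule, and that plain variable declarations precede DECLARE CURSOR, as required by procedure_definition:

	PROCEDURE SAMPLE_PROC () IS
		sample_id INT;
	DECLARE CURSOR cur IS
		SELECT ID FROM SYS_SAMPLE
		WHERE ID >= :start_id
		LOCK IN SHARE MODE
		ORDER BY ID ASC;
	BEGIN
		OPEN cur;
		WHILE 1 = 1 LOOP
			FETCH cur INTO sample_id;
			IF cur % NOTFOUND THEN
				EXIT;
			END IF;
		END LOOP;
		CLOSE cur;
	END;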
+update_statement_start: + PARS_UPDATE_TOKEN PARS_ID_TOKEN + PARS_SET_TOKEN + column_assignment_list { $$ = pars_update_statement_start(FALSE, + $2, $4); } +; + +update_statement_searched: + update_statement_start + search_condition { $$ = pars_update_statement($1, NULL, $2); } +; + +update_statement_positioned: + update_statement_start + cursor_positioned { $$ = pars_update_statement($1, $2, NULL); } +; + +delete_statement_start: + PARS_DELETE_TOKEN PARS_FROM_TOKEN + PARS_ID_TOKEN { $$ = pars_update_statement_start(TRUE, + $3, NULL); } +; + +delete_statement_searched: + delete_statement_start + search_condition { $$ = pars_update_statement($1, NULL, $2); } +; + +delete_statement_positioned: + delete_statement_start + cursor_positioned { $$ = pars_update_statement($1, $2, NULL); } +; + +row_printf_statement: + PARS_ROW_PRINTF_TOKEN select_statement + { $$ = pars_row_printf_statement($2); } +; + +assignment_statement: + PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp + { $$ = pars_assignment_statement($1, $3); } +; + +elsif_element: + PARS_ELSIF_TOKEN + exp PARS_THEN_TOKEN statement_list + { $$ = pars_elsif_element($2, $4); } +; + +elsif_list: + elsif_element { $$ = que_node_list_add_last(NULL, $1); } + | elsif_list elsif_element + { $$ = que_node_list_add_last($1, $2); } +; + +else_part: + /* Nothing */ { $$ = NULL; } + | PARS_ELSE_TOKEN statement_list + { $$ = $2; } + | elsif_list { $$ = $1; } +; + +if_statement: + PARS_IF_TOKEN exp PARS_THEN_TOKEN statement_list + else_part + PARS_END_TOKEN PARS_IF_TOKEN + { $$ = pars_if_statement($2, $4, $5); } +; + +while_statement: + PARS_WHILE_TOKEN exp PARS_LOOP_TOKEN statement_list + PARS_END_TOKEN PARS_LOOP_TOKEN + { $$ = pars_while_statement($2, $4); } +; + +for_statement: + PARS_FOR_TOKEN PARS_ID_TOKEN PARS_IN_TOKEN + exp PARS_DDOT_TOKEN exp + PARS_LOOP_TOKEN statement_list + PARS_END_TOKEN PARS_LOOP_TOKEN + { $$ = pars_for_statement($2, $4, $6, $8); } +; + +exit_statement: + PARS_EXIT_TOKEN { $$ = pars_exit_statement(); } +; + +return_statement: + PARS_RETURN_TOKEN { $$ = pars_return_statement(); } +; + +open_cursor_statement: + PARS_OPEN_TOKEN PARS_ID_TOKEN + { $$ = pars_open_statement( + ROW_SEL_OPEN_CURSOR, $2); } +; + +close_cursor_statement: + PARS_CLOSE_TOKEN PARS_ID_TOKEN + { $$ = pars_open_statement( + ROW_SEL_CLOSE_CURSOR, $2); } +; + +fetch_statement: + PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list + { $$ = pars_fetch_statement($2, $4, NULL); } + | PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call + { $$ = pars_fetch_statement($2, NULL, $4); } +; + +column_def: + PARS_ID_TOKEN type_name opt_column_len opt_unsigned opt_not_null + { $$ = pars_column_def($1, $2, $3, $4, $5); } +; + +column_def_list: + column_def { $$ = que_node_list_add_last(NULL, $1); } + | column_def_list ',' column_def + { $$ = que_node_list_add_last($1, $3); } +; + +opt_column_len: + /* Nothing */ { $$ = NULL; } + | '(' PARS_INT_LIT ')' + { $$ = $2; } +; + +opt_unsigned: + /* Nothing */ { $$ = NULL; } + | PARS_UNSIGNED_TOKEN + { $$ = &pars_int_token; + /* pass any non-NULL pointer */ } +; + +opt_not_null: + /* Nothing */ { $$ = NULL; } + | PARS_NOT_TOKEN PARS_NULL_LIT + { $$ = &pars_int_token; + /* pass any non-NULL pointer */ } +; + +not_fit_in_memory: + /* Nothing */ { $$ = NULL; } + | PARS_DOES_NOT_FIT_IN_MEM_TOKEN + { $$ = &pars_int_token; + /* pass any non-NULL pointer */ } +; + +create_table: + PARS_CREATE_TOKEN PARS_TABLE_TOKEN + PARS_ID_TOKEN '(' column_def_list ')' + not_fit_in_memory { $$ = pars_create_table($3, $5, $7); } +; + 
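A further hypothetical sketch (table, index, and column names invented): InnoDB drives its own dictionary DDL through this dialect (see dict0crea.c), and the create_table rule above, combined with the create_index and commit_statement rules defined next, accepts procedures of roughly this shape:

	PROCEDURE CREATE_SAMPLE () IS
	BEGIN
		CREATE TABLE SYS_SAMPLE(ID CHAR, N_REFS INT NOT NULL);
		CREATE UNIQUE CLUSTERED INDEX SAMPLE_IND ON SYS_SAMPLE(ID);
		COMMIT WORK;
	END;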
+column_list: + PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); } + | column_list ',' PARS_ID_TOKEN + { $$ = que_node_list_add_last($1, $3); } +; + +unique_def: + /* Nothing */ { $$ = NULL; } + | PARS_UNIQUE_TOKEN { $$ = &pars_unique_token; } +; + +clustered_def: + /* Nothing */ { $$ = NULL; } + | PARS_CLUSTERED_TOKEN { $$ = &pars_clustered_token; } +; + +create_index: + PARS_CREATE_TOKEN unique_def + clustered_def + PARS_INDEX_TOKEN + PARS_ID_TOKEN PARS_ON_TOKEN PARS_ID_TOKEN + '(' column_list ')' { $$ = pars_create_index($2, $3, $5, $7, $9); } +; + +commit_statement: + PARS_COMMIT_TOKEN PARS_WORK_TOKEN + { $$ = pars_commit_statement(); } +; + +rollback_statement: + PARS_ROLLBACK_TOKEN PARS_WORK_TOKEN + { $$ = pars_rollback_statement(); } +; + +type_name: + PARS_INT_TOKEN { $$ = &pars_int_token; } + | PARS_INTEGER_TOKEN { $$ = &pars_int_token; } + | PARS_CHAR_TOKEN { $$ = &pars_char_token; } + | PARS_BINARY_TOKEN { $$ = &pars_binary_token; } + | PARS_BLOB_TOKEN { $$ = &pars_blob_token; } +; + +parameter_declaration: + PARS_ID_TOKEN PARS_IN_TOKEN type_name + { $$ = pars_parameter_declaration($1, + PARS_INPUT, $3); } + | PARS_ID_TOKEN PARS_OUT_TOKEN type_name + { $$ = pars_parameter_declaration($1, + PARS_OUTPUT, $3); } +; + +parameter_declaration_list: + /* Nothing */ { $$ = NULL; } + | parameter_declaration { $$ = que_node_list_add_last(NULL, $1); } + | parameter_declaration_list ',' parameter_declaration + { $$ = que_node_list_add_last($1, $3); } +; + +variable_declaration: + PARS_ID_TOKEN type_name ';' + { $$ = pars_variable_declaration($1, $2); } +; + +variable_declaration_list: + /* Nothing */ + | variable_declaration + | variable_declaration_list variable_declaration +; + +cursor_declaration: + PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN + PARS_IS_TOKEN select_statement ';' + { $$ = pars_cursor_declaration($3, $5); } +; + +function_declaration: + PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';' + { $$ = pars_function_declaration($3); } +; + +declaration: + cursor_declaration + | function_declaration +; + +declaration_list: + /* Nothing */ + | declaration + | declaration_list declaration +; + +procedure_definition: + PARS_PROCEDURE_TOKEN PARS_ID_TOKEN '(' parameter_declaration_list ')' + PARS_IS_TOKEN + variable_declaration_list + declaration_list + PARS_BEGIN_TOKEN + statement_list + PARS_END_TOKEN { $$ = pars_procedure_definition($2, $4, + $10); } +; + +%% diff --git a/storage/xtradb/pars/pars0lex.l b/storage/xtradb/pars/pars0lex.l new file mode 100644 index 00000000000..38cb744bd44 --- /dev/null +++ b/storage/xtradb/pars/pars0lex.l @@ -0,0 +1,663 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%option nostdinit
+%option 8bit
+%option warn
+%option pointer
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyy_scan_buffer
+%option noyy_scan_bytes
+%option noyy_scan_string
+%option nounistd
+
+%{
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "os0proc.h"
+
+#define malloc(A) ut_malloc(A)
+#define free(A) ut_free(A)
+#define realloc(P, A) ut_realloc(P, A)
+#define exit(A) ut_error
+
+#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+
+/* String buffer for removing quotes */
+static ulint stringbuf_len_alloc = 0; /* Allocated length */
+static ulint stringbuf_len = 0; /* Current length */
+static char* stringbuf; /* Start of buffer */
+/* Appends a string to the buffer. */
+static
+void
+string_append(
+/*==========*/
+ const char* str, /* in: string to be appended */
+ ulint len) /* in: length of the string */
+{
+ if (stringbuf == NULL) {
+ stringbuf = malloc(1);
+ stringbuf_len_alloc = 1;
+ }
+
+ if (stringbuf_len + len > stringbuf_len_alloc) {
+ while (stringbuf_len + len > stringbuf_len_alloc) {
+ stringbuf_len_alloc <<= 1;
+ }
+ stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+ }
+
+ memcpy(stringbuf + stringbuf_len, str, len);
+ stringbuf_len += len;
+}
+
+%}
+
+DIGIT [0-9]
+ID [a-z_A-Z][a-z_A-Z0-9]*
+BOUND_LIT \:[a-z_A-Z0-9]+
+BOUND_ID \$[a-z_A-Z0-9]+
+
+%x comment
+%x quoted
+%x id
+%%
+
+{DIGIT}+ {
+ yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+ atoi(yytext));
+ return(PARS_INT_LIT);
+}
+
+{DIGIT}+"."{DIGIT}* {
+ ut_error; /* not implemented */
+
+ return(PARS_FLOAT_LIT);
+}
+
+{BOUND_LIT} {
+ ulint type;
+
+ yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+ yytext + 1, &type);
+
+ return((int) type);
+}
+
+{BOUND_ID} {
+ yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+ yytext + 1);
+
+ return(PARS_ID_TOKEN);
+}
+
+"'" {
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'. This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+ BEGIN(quoted);
+ stringbuf_len = 0;
+}
+<quoted>[^\']+ {
+ /* Got a sequence of characters other than "'":
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<quoted>"'"+ {
+ /* Got a sequence of "'" characters:
+ append half of them to string buffer,
+ as "''" represents a single "'".
+ We apply truncating division,
+ so that "'''" will result in "'". */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ string literal. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_str_lit(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+ return(PARS_STR_LIT);
+ }
+}
+
+\" {
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+ BEGIN(id);
+ stringbuf_len = 0;
+}
+<id>[^\"]+ {
+ /* Got a sequence of characters other than '"':
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<id>\"+ {
+ /* Got a sequence of '"' characters:
+ append half of them to string buffer,
+ as '""' represents a single '"'.
+ We apply truncating division,
+ so that '"""' will result in '"'. */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ identifier. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_id(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+
+ return(PARS_ID_TOKEN);
+ }
+}
+
+"NULL" {
+ yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+ return(PARS_NULL_LIT);
+}
+
+"SQL" {
+ /* Implicit cursor name */
+ yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+ (byte*) yytext, yyleng);
+ return(PARS_SQL_TOKEN);
+}
+
+"AND" {
+ return(PARS_AND_TOKEN);
+}
+
+"OR" {
+ return(PARS_OR_TOKEN);
+}
+
+"NOT" {
+ return(PARS_NOT_TOKEN);
+}
+
+"PROCEDURE" {
+ return(PARS_PROCEDURE_TOKEN);
+}
+
+"IN" {
+ return(PARS_IN_TOKEN);
+}
+
+"OUT" {
+ return(PARS_OUT_TOKEN);
+}
+
+"BINARY" {
+ return(PARS_BINARY_TOKEN);
+}
+
+"BLOB" {
+ return(PARS_BLOB_TOKEN);
+}
+
+"INT" {
+ return(PARS_INT_TOKEN);
+}
+
+"INTEGER" {
+ return(PARS_INT_TOKEN);
+}
+
+"FLOAT" {
+ return(PARS_FLOAT_TOKEN);
+}
+
+"CHAR" {
+ return(PARS_CHAR_TOKEN);
+}
+
+"IS" {
+ return(PARS_IS_TOKEN);
+}
+
+"BEGIN" {
+ return(PARS_BEGIN_TOKEN);
+}
+
+"END" {
+ return(PARS_END_TOKEN);
+}
+
+"IF" {
+ return(PARS_IF_TOKEN);
+}
+
+"THEN" {
+ return(PARS_THEN_TOKEN);
+}
+
+"ELSE" {
+ return(PARS_ELSE_TOKEN);
+}
+
+"ELSIF" {
+ return(PARS_ELSIF_TOKEN);
+}
+
+"LOOP" {
+ return(PARS_LOOP_TOKEN);
+}
+
+"WHILE" {
+ return(PARS_WHILE_TOKEN);
+}
+
+"RETURN" {
+ return(PARS_RETURN_TOKEN);
+}
+
+"SELECT" {
+ return(PARS_SELECT_TOKEN);
+}
+
+"SUM" {
+ return(PARS_SUM_TOKEN);
+}
+
+"COUNT" {
+ return(PARS_COUNT_TOKEN);
+}
+
+"DISTINCT" {
+ return(PARS_DISTINCT_TOKEN);
+}
+
+"FROM" {
+ return(PARS_FROM_TOKEN);
+}
+
+"WHERE" {
+ return(PARS_WHERE_TOKEN);
+}
+
+"FOR" {
+ return(PARS_FOR_TOKEN);
+}
+
+"READ" {
+ return(PARS_READ_TOKEN);
+}
+
+"ORDER" {
+ return(PARS_ORDER_TOKEN);
+}
+
+"BY" {
+ return(PARS_BY_TOKEN);
+}
+
+"ASC" {
+ return(PARS_ASC_TOKEN);
+}
+
+"DESC" {
+ return(PARS_DESC_TOKEN);
+}
+
+"INSERT" {
+ return(PARS_INSERT_TOKEN);
+}
+
+"INTO" {
+ return(PARS_INTO_TOKEN);
+}
+
+"VALUES" {
+ return(PARS_VALUES_TOKEN);
+}
+
+"UPDATE" {
+ return(PARS_UPDATE_TOKEN);
+}
+
+"SET" {
+ return(PARS_SET_TOKEN);
+}
+
+"DELETE" {
+ return(PARS_DELETE_TOKEN);
+}
+
+"CURRENT" {
+ return(PARS_CURRENT_TOKEN);
+}
+
+"OF" {
+ return(PARS_OF_TOKEN);
+}
+
+"CREATE" {
+ return(PARS_CREATE_TOKEN);
+}
+
+"TABLE" {
+ return(PARS_TABLE_TOKEN);
+}
+
+"INDEX" {
+ return(PARS_INDEX_TOKEN);
+}
+
+"UNIQUE" {
+ return(PARS_UNIQUE_TOKEN);
+}
+
+"CLUSTERED" {
+ return(PARS_CLUSTERED_TOKEN);
+}
+
+"DOES_NOT_FIT_IN_MEMORY" {
+ return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+}
+
+"ON" {
+ return(PARS_ON_TOKEN);
+}
+
+"DECLARE" {
+ return(PARS_DECLARE_TOKEN);
+}
+
+"CURSOR" {
+ return(PARS_CURSOR_TOKEN);
+}
+
+"OPEN" {
+ return(PARS_OPEN_TOKEN);
+}
+
+"FETCH" {
+ return(PARS_FETCH_TOKEN);
+}
+
+"CLOSE" {
+ return(PARS_CLOSE_TOKEN);
+}
+
+"NOTFOUND" {
+ return(PARS_NOTFOUND_TOKEN);
+}
+
+"TO_CHAR" {
+ return(PARS_TO_CHAR_TOKEN);
+}
+
+"TO_NUMBER" {
+ return(PARS_TO_NUMBER_TOKEN);
+}
+
+"TO_BINARY" {
+ return(PARS_TO_BINARY_TOKEN);
+}
+
+"BINARY_TO_NUMBER" {
+ return(PARS_BINARY_TO_NUMBER_TOKEN);
+}
+
+"SUBSTR" {
+ return(PARS_SUBSTR_TOKEN);
+}
+
+"REPLSTR" {
+ return(PARS_REPLSTR_TOKEN);
+}
+
+"CONCAT" {
+ return(PARS_CONCAT_TOKEN);
+}
+
+"INSTR" {
+ return(PARS_INSTR_TOKEN);
+}
+
+"LENGTH" {
+ return(PARS_LENGTH_TOKEN);
+}
+
+"SYSDATE" {
+ return(PARS_SYSDATE_TOKEN);
+}
+
+"PRINTF" {
+ return(PARS_PRINTF_TOKEN);
+}
+
+"ASSERT" {
+ return(PARS_ASSERT_TOKEN);
+}
+
+"RND" {
+ return(PARS_RND_TOKEN);
+}
+
+"RND_STR" {
+ return(PARS_RND_STR_TOKEN);
+}
+
+"ROW_PRINTF" {
+ return(PARS_ROW_PRINTF_TOKEN);
+}
+
+"COMMIT" {
+ return(PARS_COMMIT_TOKEN);
+}
+
+"ROLLBACK" {
+ return(PARS_ROLLBACK_TOKEN);
+}
+
+"WORK" {
+ return(PARS_WORK_TOKEN);
+}
+
+"UNSIGNED" {
+ return(PARS_UNSIGNED_TOKEN);
+}
+
+"EXIT" {
+ return(PARS_EXIT_TOKEN);
+}
+
+"FUNCTION" {
+ return(PARS_FUNCTION_TOKEN);
+}
+
+"LOCK" {
+ return(PARS_LOCK_TOKEN);
+}
+
+"SHARE" {
+ return(PARS_SHARE_TOKEN);
+}
+
+"MODE" {
+ return(PARS_MODE_TOKEN);
+}
+
+{ID} {
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*)yytext,
+ ut_strlen(yytext));
+ return(PARS_ID_TOKEN);
+}
+
+".." {
+ return(PARS_DDOT_TOKEN);
+}
+
+":=" {
+ return(PARS_ASSIGN_TOKEN);
+}
+
+"<=" {
+ return(PARS_LE_TOKEN);
+}
+
+">=" {
+ return(PARS_GE_TOKEN);
+}
+
+"<>" {
+ return(PARS_NE_TOKEN);
+}
+
+"(" {
+
+ return((int)(*yytext));
+}
+
+"=" {
+
+ return((int)(*yytext));
+}
+
+">" {
+
+ return((int)(*yytext));
+}
+
+"<" {
+
+ return((int)(*yytext));
+}
+
+"," {
+
+ return((int)(*yytext));
+}
+
+";" {
+
+ return((int)(*yytext));
+}
+
+")" {
+
+ return((int)(*yytext));
+}
+
+"+" {
+
+ return((int)(*yytext));
+}
+
+"-" {
+
+ return((int)(*yytext));
+}
+
+"*" {
+
+ return((int)(*yytext));
+}
+
+"/" {
+
+ return((int)(*yytext));
+}
+
+"%" {
+
+ return((int)(*yytext));
+}
+
+"{" {
+
+ return((int)(*yytext));
+}
+
+"}" {
+
+ return((int)(*yytext));
+}
+
+"?" {
+
+ return((int)(*yytext));
+}
+
+"/*" BEGIN(comment); /* eat up comment */
+
+<comment>[^*]*
+<comment>"*"+[^*/]*
+<comment>"*"+"/" BEGIN(INITIAL);
+
+[ \t\n]+ /* eat up whitespace */
+
+
+. {
+ fprintf(stderr,"Unrecognized character: %02x\n",
+ *yytext);
+
+ ut_error;
+
+ return(0);
+}
+
+%%
diff --git a/storage/xtradb/pars/pars0opt.c b/storage/xtradb/pars/pars0opt.c
new file mode 100644
index 00000000000..34246929c53
--- /dev/null
+++ b/storage/xtradb/pars/pars0opt.c
@@ -0,0 +1,1224 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Simple SQL optimizer + +Created 12/21/1997 Heikki Tuuri +*******************************************************/ + +#include "pars0opt.h" + +#ifdef UNIV_NONINL +#include "pars0opt.ic" +#endif + +#include "row0sel.h" +#include "row0ins.h" +#include "row0upd.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "que0que.h" +#include "pars0grm.h" +#include "pars0pars.h" +#include "lock0lock.h" + +#define OPT_EQUAL 1 /* comparison by = */ +#define OPT_COMPARISON 2 /* comparison by <, >, <=, or >= */ + +#define OPT_NOT_COND 1 +#define OPT_END_COND 2 +#define OPT_TEST_COND 3 +#define OPT_SCROLL_COND 4 + + +/*********************************************************************** +Inverts a comparison operator. */ +static +int +opt_invert_cmp_op( +/*==============*/ + /* out: the equivalent operator when the order of + the arguments is switched */ + int op) /* in: operator */ +{ + if (op == '<') { + return('>'); + } else if (op == '>') { + return('<'); + } else if (op == '=') { + return('='); + } else if (op == PARS_LE_TOKEN) { + return(PARS_GE_TOKEN); + } else if (op == PARS_GE_TOKEN) { + return(PARS_LE_TOKEN); + } else { + ut_error; + } + + return(0); +} + +/*********************************************************************** +Checks if the value of an expression can be calculated BEFORE the nth table +in a join is accessed. If this is the case, it can possibly be used in an +index search for the nth table. */ +static +ibool +opt_check_exp_determined_before( +/*============================*/ + /* out: TRUE if already determined */ + que_node_t* exp, /* in: expression */ + sel_node_t* sel_node, /* in: select node */ + ulint nth_table) /* in: nth table will be accessed */ +{ + func_node_t* func_node; + sym_node_t* sym_node; + dict_table_t* table; + que_node_t* arg; + ulint i; + + ut_ad(exp && sel_node); + + if (que_node_get_type(exp) == QUE_NODE_FUNC) { + func_node = exp; + + arg = func_node->args; + + while (arg) { + if (!opt_check_exp_determined_before(arg, sel_node, + nth_table)) { + return(FALSE); + } + + arg = que_node_get_next(arg); + } + + return(TRUE); + } + + ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL); + + sym_node = exp; + + if (sym_node->token_type != SYM_COLUMN) { + + return(TRUE); + } + + for (i = 0; i < nth_table; i++) { + + table = sel_node_get_nth_plan(sel_node, i)->table; + + if (sym_node->table == table) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*********************************************************************** +Looks in a comparison condition if a column value is already restricted by +it BEFORE the nth table is accessed. 
*/ +static +que_node_t* +opt_look_for_col_in_comparison_before( +/*==================================*/ + /* out: expression restricting the + value of the column, or NULL if not + known */ + ulint cmp_type, /* in: OPT_EQUAL, OPT_COMPARISON */ + ulint col_no, /* in: column number */ + func_node_t* search_cond, /* in: comparison condition */ + sel_node_t* sel_node, /* in: select node */ + ulint nth_table, /* in: nth table in a join (a query + from a single table is considered a + join of 1 table) */ + ulint* op) /* out: comparison operator ('=', + PARS_GE_TOKEN, ... ); this is inverted + if the column appears on the right + side */ +{ + sym_node_t* sym_node; + dict_table_t* table; + que_node_t* exp; + que_node_t* arg; + + ut_ad(search_cond); + + ut_a((search_cond->func == '<') + || (search_cond->func == '>') + || (search_cond->func == '=') + || (search_cond->func == PARS_GE_TOKEN) + || (search_cond->func == PARS_LE_TOKEN)); + + table = sel_node_get_nth_plan(sel_node, nth_table)->table; + + if ((cmp_type == OPT_EQUAL) && (search_cond->func != '=')) { + + return(NULL); + + } else if ((cmp_type == OPT_COMPARISON) + && (search_cond->func != '<') + && (search_cond->func != '>') + && (search_cond->func != PARS_GE_TOKEN) + && (search_cond->func != PARS_LE_TOKEN)) { + + return(NULL); + } + + arg = search_cond->args; + + if (que_node_get_type(arg) == QUE_NODE_SYMBOL) { + sym_node = arg; + + if ((sym_node->token_type == SYM_COLUMN) + && (sym_node->table == table) + && (sym_node->col_no == col_no)) { + + /* sym_node contains the desired column id */ + + /* Check if the expression on the right side of the + operator is already determined */ + + exp = que_node_get_next(arg); + + if (opt_check_exp_determined_before(exp, sel_node, + nth_table)) { + *op = search_cond->func; + + return(exp); + } + } + } + + exp = search_cond->args; + arg = que_node_get_next(arg); + + if (que_node_get_type(arg) == QUE_NODE_SYMBOL) { + sym_node = arg; + + if ((sym_node->token_type == SYM_COLUMN) + && (sym_node->table == table) + && (sym_node->col_no == col_no)) { + + if (opt_check_exp_determined_before(exp, sel_node, + nth_table)) { + *op = opt_invert_cmp_op(search_cond->func); + + return(exp); + } + } + } + + return(NULL); +} + +/*********************************************************************** +Looks in a search condition if a column value is already restricted by the +search condition BEFORE the nth table is accessed. Takes into account that +if we will fetch in an ascending order, we cannot utilize an upper limit for +a column value; in a descending order, respectively, a lower limit. */ +static +que_node_t* +opt_look_for_col_in_cond_before( +/*============================*/ + /* out: expression restricting the + value of the column, or NULL if not + known */ + ulint cmp_type, /* in: OPT_EQUAL, OPT_COMPARISON */ + ulint col_no, /* in: column number */ + func_node_t* search_cond, /* in: search condition or NULL */ + sel_node_t* sel_node, /* in: select node */ + ulint nth_table, /* in: nth table in a join (a query + from a single table is considered a + join of 1 table) */ + ulint* op) /* out: comparison operator ('=', + PARS_GE_TOKEN, ... 
) */
+{
+ func_node_t* new_cond;
+ que_node_t* exp;
+
+ if (search_cond == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+ ut_a(search_cond->func != PARS_OR_TOKEN);
+ ut_a(search_cond->func != PARS_NOT_TOKEN);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = search_cond->args;
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ if (exp) {
+
+ return(exp);
+ }
+
+ new_cond = que_node_get_next(new_cond);
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ return(exp);
+ }
+
+ exp = opt_look_for_col_in_comparison_before(cmp_type, col_no,
+ search_cond, sel_node,
+ nth_table, op);
+ if (exp == NULL) {
+
+ return(NULL);
+ }
+
+ /* If we will fetch in an ascending order, we cannot utilize an upper
+ limit for a column value; in a descending order, respectively, a lower
+ limit */
+
+ if (sel_node->asc && ((*op == '<') || (*op == PARS_LE_TOKEN))) {
+
+ return(NULL);
+
+ } else if (!sel_node->asc
+ && ((*op == '>') || (*op == PARS_GE_TOKEN))) {
+
+ return(NULL);
+ }
+
+ return(exp);
+}
+
+/***********************************************************************
+Calculates the goodness for an index according to a select node. The
+goodness is 4 times the number of first fields in the index whose values we
+already know exactly in the query. If we have a comparison condition for
+an additional field, 2 points are added. If the index is unique, and we know
+all the unique fields for the index, we add 1024 points. For a clustered index
+we add 1 point. */
+static
+ulint
+opt_calc_index_goodness(
+/*====================*/
+ /* out: goodness */
+ dict_index_t* index, /* in: index */
+ sel_node_t* sel_node, /* in: parsed select node */
+ ulint nth_table, /* in: nth table in a join */
+ que_node_t** index_plan, /* in/out: comparison expressions for
+ this index */
+ ulint* last_op) /* out: last comparison operator, if
+ goodness > 1 */
+{
+ que_node_t* exp;
+ ulint goodness;
+ ulint n_fields;
+ ulint col_no;
+ ulint op;
+ ulint j;
+
+ goodness = 0;
+
+ /* Note that as higher level node pointers in the B-tree contain
+ page addresses as the last field, we must not put more fields in
+ the search tuple than dict_index_get_n_unique_in_tree(index); see
+ the note in btr_cur_search_to_nth_level. */
+
+ n_fields = dict_index_get_n_unique_in_tree(index);
+
+ for (j = 0; j < n_fields; j++) {
+
+ col_no = dict_index_get_nth_col_no(index, j);
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_EQUAL, col_no, sel_node->search_cond,
+ sel_node, nth_table, &op);
+ if (exp) {
+ /* The value for this column is exactly known already
+ at this stage of the join */
+
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 4;
+ } else {
+ /* Look for non-equality comparisons */
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_COMPARISON, col_no, sel_node->search_cond,
+ sel_node, nth_table, &op);
+ if (exp) {
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 2;
+ }
+
+ break;
+ }
+ }
+
+ if (goodness >= 4 * dict_index_get_n_unique(index)) {
+ goodness += 1024;
+
+ if (dict_index_is_clust(index)) {
+
+ goodness += 1024;
+ }
+ }
+
+ /* We have to test for goodness here, as last_op may not be set */
+ if (goodness && dict_index_is_clust(index)) {
+
+ goodness++;
+ }
+
+ return(goodness);
+}
+
+/***********************************************************************
+Calculates the number of matched fields based on an index goodness. */
+UNIV_INLINE
+ulint
+opt_calc_n_fields_from_goodness(
+/*============================*/
+ /* out: number of exactly or partially matched
+ fields */
+ ulint goodness) /* in: goodness */
+{
+ return(((goodness % 1024) + 2) / 4);
+}
+
+/***********************************************************************
+Converts a comparison operator to the corresponding search mode PAGE_CUR_GE,
+... */
+UNIV_INLINE
+ulint
+opt_op_to_search_mode(
+/*==================*/
+ /* out: search mode */
+ ibool asc, /* in: TRUE if the rows should be fetched in an
+ ascending order */
+ ulint op) /* in: operator '=', PARS_GE_TOKEN, ... */
+{
+ if (op == '=') {
+ if (asc) {
+ return(PAGE_CUR_GE);
+ } else {
+ return(PAGE_CUR_LE);
+ }
+ } else if (op == '<') {
+ ut_a(!asc);
+ return(PAGE_CUR_L);
+ } else if (op == '>') {
+ ut_a(asc);
+ return(PAGE_CUR_G);
+ } else if (op == PARS_GE_TOKEN) {
+ ut_a(asc);
+ return(PAGE_CUR_GE);
+ } else if (op == PARS_LE_TOKEN) {
+ ut_a(!asc);
+ return(PAGE_CUR_LE);
+ } else {
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************
+Determines if a node is an argument node of a function node. */
+static
+ibool
+opt_is_arg(
+/*=======*/
+ /* out: TRUE if is an argument */
+ que_node_t* arg_node, /* in: possible argument node */
+ func_node_t* func_node) /* in: function node */
+{
+ que_node_t* arg;
+
+ arg = func_node->args;
+
+ while (arg) {
+ if (arg == arg_node) {
+
+ return(TRUE);
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************
+Decides if the fetching of rows should be made in a descending order, and
+also checks that the chosen query plan produces a result which satisfies
+the order-by. */
+static
+void
+opt_check_order_by(
+/*===============*/
+ sel_node_t* sel_node) /* in: select node; asserts an error
+ if the plan does not agree with the
+ order-by */
+{
+ order_node_t* order_node;
+ dict_table_t* order_table;
+ ulint order_col_no;
+ plan_t* plan;
+ ulint i;
+
+ if (!sel_node->order_by) {
+
+ return;
+ }
+
+ order_node = sel_node->order_by;
+ order_col_no = order_node->column->col_no;
+ order_table = order_node->column->table;
+
+ /* If there is an order-by clause, the first non-exactly matched field
+ in the index used for the last table in the table list should be the
+ column defined in the order-by clause, and for all the other tables
+ we should get only at most a single row, otherwise we cannot presently
+ calculate the order-by, as we have no sort utility */
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (i < sel_node->n_tables - 1) {
+ ut_a(dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match);
+ } else {
+ ut_a(plan->table == order_table);
+
+ ut_a((dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match)
+ || (dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match)
+ == order_col_no));
+ }
+ }
+}
+
+/***********************************************************************
+Optimizes a select. Decides which indexes to tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement.
*/ +static +void +opt_search_plan_for_table( +/*======================*/ + sel_node_t* sel_node, /* in: parsed select node */ + ulint i, /* in: this is the ith table */ + dict_table_t* table) /* in: table */ +{ + plan_t* plan; + dict_index_t* index; + dict_index_t* best_index; + ulint n_fields; + ulint goodness; + ulint last_op = 75946965; /* Eliminate a Purify + warning */ + ulint best_goodness; + ulint best_last_op = 0; /* remove warning */ + que_node_t* index_plan[256]; + que_node_t* best_index_plan[256]; + + plan = sel_node_get_nth_plan(sel_node, i); + + plan->table = table; + plan->asc = sel_node->asc; + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + + /* Calculate goodness for each index of the table */ + + index = dict_table_get_first_index(table); + best_index = index; /* Eliminate compiler warning */ + best_goodness = 0; + + /* should be do ... until ? comment by Jani */ + while (index) { + goodness = opt_calc_index_goodness(index, sel_node, i, + index_plan, &last_op); + if (goodness > best_goodness) { + + best_index = index; + best_goodness = goodness; + n_fields = opt_calc_n_fields_from_goodness(goodness); + + ut_memcpy(best_index_plan, index_plan, + n_fields * sizeof(void*)); + best_last_op = last_op; + } + + index = dict_table_get_next_index(index); + } + + plan->index = best_index; + + n_fields = opt_calc_n_fields_from_goodness(best_goodness); + + if (n_fields == 0) { + plan->tuple = NULL; + plan->n_exact_match = 0; + } else { + plan->tuple = dtuple_create(pars_sym_tab_global->heap, + n_fields); + dict_index_copy_types(plan->tuple, plan->index, n_fields); + + plan->tuple_exps = mem_heap_alloc(pars_sym_tab_global->heap, + n_fields * sizeof(void*)); + + ut_memcpy(plan->tuple_exps, best_index_plan, + n_fields * sizeof(void*)); + if (best_last_op == '=') { + plan->n_exact_match = n_fields; + } else { + plan->n_exact_match = n_fields - 1; + } + + plan->mode = opt_op_to_search_mode(sel_node->asc, + best_last_op); + } + + if (dict_index_is_clust(best_index) + && (plan->n_exact_match >= dict_index_get_n_unique(best_index))) { + + plan->unique_search = TRUE; + } else { + plan->unique_search = FALSE; + } + + plan->old_vers_heap = NULL; + + btr_pcur_init(&(plan->pcur)); + btr_pcur_init(&(plan->clust_pcur)); +} + +/*********************************************************************** +Looks at a comparison condition and decides if it can, and need, be tested for +a table AFTER the table has been accessed. 
*/
+static
+ulint
+opt_classify_comparison(
+/*====================*/
+ /* out: OPT_NOT_COND if not for this
+ table, else OPT_END_COND,
+ OPT_TEST_COND, or OPT_SCROLL_COND,
+ where the last means that the
+ condition need not be tested, except
+ when scroll cursors are used */
+ sel_node_t* sel_node, /* in: select node */
+ ulint i, /* in: ith table in the join */
+ func_node_t* cond) /* in: comparison condition */
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint op;
+ ulint j;
+
+ ut_ad(cond && sel_node);
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ /* Check if the condition is determined after the ith table has been
+ accessed, but not after the i - 1:th */
+
+ if (!opt_check_exp_determined_before(cond, sel_node, i + 1)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ if ((i > 0) && opt_check_exp_determined_before(cond, sel_node, i)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ /* If the condition is an exact match condition used in constructing
+ the search tuple, it is classified as OPT_END_COND */
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ for (j = 0; j < plan->n_exact_match; j++) {
+
+ if (opt_is_arg(plan->tuple_exps[j], cond)) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* If the condition is a non-exact match condition used in
+ constructing the search tuple, it is classified as OPT_SCROLL_COND.
+ When the cursor is positioned, and if a non-scroll cursor is used,
+ there is no need to test this condition; if a scroll cursor is used
+ the testing is necessary when the cursor is reversed. */
+
+ if ((n_fields > plan->n_exact_match)
+ && opt_is_arg(plan->tuple_exps[n_fields - 1], cond)) {
+
+ return(OPT_SCROLL_COND);
+ }
+
+ /* If the condition is a non-exact match condition on the first field
+ in the index for which there is no exact match, and it limits the search
+ range from the opposite side of the search tuple already BEFORE we
+ access the table, it is classified as OPT_END_COND */
+
+ if ((dict_index_get_n_fields(plan->index) > plan->n_exact_match)
+ && opt_look_for_col_in_comparison_before(
+ OPT_COMPARISON,
+ dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match),
+ cond, sel_node, i, &op)) {
+
+ if (sel_node->asc && ((op == '<') || (op == PARS_LE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+
+ if (!sel_node->asc && ((op == '>') || (op == PARS_GE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* Otherwise, cond is classified as OPT_TEST_COND */
+
+ return(OPT_TEST_COND);
+}
+
+/***********************************************************************
+Recursively looks for test conditions for a table in a join.
*/ +static +void +opt_find_test_conds( +/*================*/ + sel_node_t* sel_node, /* in: select node */ + ulint i, /* in: ith table in the join */ + func_node_t* cond) /* in: conjunction of search + conditions or NULL */ +{ + func_node_t* new_cond; + ulint class; + plan_t* plan; + + if (cond == NULL) { + + return; + } + + if (cond->func == PARS_AND_TOKEN) { + new_cond = cond->args; + + opt_find_test_conds(sel_node, i, new_cond); + + new_cond = que_node_get_next(new_cond); + + opt_find_test_conds(sel_node, i, new_cond); + + return; + } + + plan = sel_node_get_nth_plan(sel_node, i); + + class = opt_classify_comparison(sel_node, i, cond); + + if (class == OPT_END_COND) { + UT_LIST_ADD_LAST(cond_list, plan->end_conds, cond); + + } else if (class == OPT_TEST_COND) { + UT_LIST_ADD_LAST(cond_list, plan->other_conds, cond); + + } +} + +/*********************************************************************** +Normalizes a list of comparison conditions so that a column of the table +appears on the left side of the comparison if possible. This is accomplished +by switching the arguments of the operator. */ +static +void +opt_normalize_cmp_conds( +/*====================*/ + func_node_t* cond, /* in: first in a list of comparison + conditions, or NULL */ + dict_table_t* table) /* in: table */ +{ + que_node_t* arg1; + que_node_t* arg2; + sym_node_t* sym_node; + + while (cond) { + arg1 = cond->args; + arg2 = que_node_get_next(arg1); + + if (que_node_get_type(arg2) == QUE_NODE_SYMBOL) { + + sym_node = arg2; + + if ((sym_node->token_type == SYM_COLUMN) + && (sym_node->table == table)) { + + /* Switch the order of the arguments */ + + cond->args = arg2; + que_node_list_add_last(NULL, arg2); + que_node_list_add_last(arg2, arg1); + + /* Invert the operator */ + cond->func = opt_invert_cmp_op(cond->func); + } + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } +} + +/*********************************************************************** +Finds out the search condition conjuncts we can, and need, to test as the ith +table in a join is accessed. The search tuple can eliminate the need to test +some conjuncts. */ +static +void +opt_determine_and_normalize_test_conds( +/*===================================*/ + sel_node_t* sel_node, /* in: select node */ + ulint i) /* in: ith table in the join */ +{ + plan_t* plan; + + plan = sel_node_get_nth_plan(sel_node, i); + + UT_LIST_INIT(plan->end_conds); + UT_LIST_INIT(plan->other_conds); + + /* Recursively go through the conjuncts and classify them */ + + opt_find_test_conds(sel_node, i, sel_node->search_cond); + + opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds), + plan->table); + + ut_a(UT_LIST_GET_LEN(plan->end_conds) >= plan->n_exact_match); +} + +/*********************************************************************** +Looks for occurrences of the columns of the table in the query subgraph and +adds them to the list of columns if an occurrence of the same column does not +already exist in the list. If the column is already in the list, puts a value +indirection to point to the occurrence in the column list, except if the +column occurrence we are looking at is in the column list, in which case +nothing is done. 
*/
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /* in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /* in: index of the table to use */
+ sym_node_list_t* col_list, /* in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /* in: plan or NULL */
+ que_node_t* exp) /* in: expression or condition or
+ NULL */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* col_node;
+ ulint col_pos;
+
+ if (exp == NULL) {
+
+ return;
+ }
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = exp;
+
+ arg = func_node->args;
+
+ while (arg) {
+ opt_find_all_cols(copy_val, index, col_list, plan,
+ arg);
+ arg = que_node_get_next(arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = exp;
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return;
+ }
+
+ if (sym_node->table != index->table) {
+
+ return;
+ }
+
+ /* Look for an occurrence of the same column in the plan column
+ list */
+
+ col_node = UT_LIST_GET_FIRST(*col_list);
+
+ while (col_node) {
+ if (col_node->col_no == sym_node->col_no) {
+
+ if (col_node == sym_node) {
+ /* sym_node was already in a list: do
+ nothing */
+
+ return;
+ }
+
+ /* Put an indirection */
+ sym_node->indirection = col_node;
+ sym_node->alias = col_node;
+
+ return;
+ }
+
+ col_node = UT_LIST_GET_NEXT(col_var_list, col_node);
+ }
+
+ /* The same column did not occur in the list: add it */
+
+ UT_LIST_ADD_LAST(col_var_list, *col_list, sym_node);
+
+ sym_node->copy_val = copy_val;
+
+ /* Fill in the field_no fields in sym_node */
+
+ sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos(
+ dict_table_get_first_index(index->table), sym_node->col_no);
+ if (!dict_index_is_clust(index)) {
+
+ ut_a(plan);
+
+ col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no);
+
+ if (col_pos == ULINT_UNDEFINED) {
+
+ plan->must_get_clust = TRUE;
+ }
+
+ sym_node->field_nos[SYM_SEC_FIELD_NO] = col_pos;
+ }
+}
+
+/***********************************************************************
+Looks for occurrences of the columns of the table in conditions which are
+not yet determined AFTER the join operation has fetched a row in the ith
+table. The values for these columns must be copied to dynamic memory for
+later use. */
+static
+void
+opt_find_copy_cols(
+/*===============*/
+ sel_node_t* sel_node, /* in: select node */
+ ulint i, /* in: ith table in the join */
+ func_node_t* search_cond) /* in: search condition or NULL */
+{
+ func_node_t* new_cond;
+ plan_t* plan;
+
+ if (search_cond == NULL) {
+
+ return;
+ }
+
+ ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = search_cond->args;
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ new_cond = que_node_get_next(new_cond);
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ return;
+ }
+
+ if (!opt_check_exp_determined_before(search_cond, sel_node, i + 1)) {
+
+ /* Any ith table columns occurring in search_cond should be
+ copied, as this condition cannot be tested already on the
+ fetch from the ith table */
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
+ search_cond);
+ }
+}
+
+/***********************************************************************
+Classifies the table columns according to whether we use the column only while
+holding the latch on the page, or whether we have to copy the column value to
+dynamic memory.
Puts the first occurrence of a column to either list in the +plan node, and puts indirections to later occurrences of the column. */ +static +void +opt_classify_cols( +/*==============*/ + sel_node_t* sel_node, /* in: select node */ + ulint i) /* in: ith table in the join */ +{ + plan_t* plan; + que_node_t* exp; + + plan = sel_node_get_nth_plan(sel_node, i); + + /* The final value of the following field will depend on the + environment of the select statement: */ + + plan->must_get_clust = FALSE; + + UT_LIST_INIT(plan->columns); + + /* All select list columns should be copied: therefore TRUE as the + first argument */ + + exp = sel_node->select_list; + + while (exp) { + opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan, + exp); + exp = que_node_get_next(exp); + } + + opt_find_copy_cols(sel_node, i, sel_node->search_cond); + + /* All remaining columns in the search condition are temporary + columns: therefore FALSE */ + + opt_find_all_cols(FALSE, plan->index, &(plan->columns), plan, + sel_node->search_cond); +} + +/*********************************************************************** +Fills in the info in plan which is used in accessing a clustered index +record. The columns must already be classified for the plan node. */ +static +void +opt_clust_access( +/*=============*/ + sel_node_t* sel_node, /* in: select node */ + ulint n) /* in: nth table in select */ +{ + plan_t* plan; + dict_table_t* table; + dict_index_t* clust_index; + dict_index_t* index; + mem_heap_t* heap; + ulint n_fields; + ulint pos; + ulint i; + + plan = sel_node_get_nth_plan(sel_node, n); + + index = plan->index; + + /* The final value of the following field depends on the environment + of the select statement: */ + + plan->no_prefetch = FALSE; + + if (dict_index_is_clust(index)) { + plan->clust_map = NULL; + plan->clust_ref = NULL; + + return; + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + n_fields = dict_index_get_n_unique(clust_index); + + heap = pars_sym_tab_global->heap; + + plan->clust_ref = dtuple_create(heap, n_fields); + + dict_index_copy_types(plan->clust_ref, clust_index, n_fields); + + plan->clust_map = mem_heap_alloc(heap, n_fields * sizeof(ulint)); + + for (i = 0; i < n_fields; i++) { + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + /* We optimize here only queries to InnoDB's internal system + tables, and they should not contain column prefix indexes. */ + + if (dict_index_get_nth_field(index, pos)->prefix_len != 0 + || dict_index_get_nth_field(clust_index, i) + ->prefix_len != 0) { + fprintf(stderr, + "InnoDB: Error in pars0opt.c:" + " table %s has prefix_len != 0\n", + index->table_name); + } + + *(plan->clust_map + i) = pos; + + ut_ad(pos != ULINT_UNDEFINED); + } +} + +/*********************************************************************** +Optimizes a select. Decides which indexes to tables to use. The tables +are accessed in the order that they were written to the FROM part in the +select statement. 
*/ +UNIV_INTERN +void +opt_search_plan( +/*============*/ + sel_node_t* sel_node) /* in: parsed select node */ +{ + sym_node_t* table_node; + dict_table_t* table; + order_node_t* order_by; + ulint i; + + sel_node->plans = mem_heap_alloc(pars_sym_tab_global->heap, + sel_node->n_tables * sizeof(plan_t)); + + /* Analyze the search condition to find out what we know at each + join stage about the conditions that the columns of a table should + satisfy */ + + table_node = sel_node->table_list; + + if (sel_node->order_by == NULL) { + sel_node->asc = TRUE; + } else { + order_by = sel_node->order_by; + + sel_node->asc = order_by->asc; + } + + for (i = 0; i < sel_node->n_tables; i++) { + + table = table_node->table; + + /* Choose index through which to access the table */ + + opt_search_plan_for_table(sel_node, i, table); + + /* Determine the search condition conjuncts we can test at + this table; normalize the end conditions */ + + opt_determine_and_normalize_test_conds(sel_node, i); + + table_node = que_node_get_next(table_node); + } + + table_node = sel_node->table_list; + + for (i = 0; i < sel_node->n_tables; i++) { + + /* Classify the table columns into those we only need to access + but not copy, and to those we must copy to dynamic memory */ + + opt_classify_cols(sel_node, i); + + /* Calculate possible info for accessing the clustered index + record */ + + opt_clust_access(sel_node, i); + + table_node = que_node_get_next(table_node); + } + + /* Check that the plan obeys a possible order-by clause: if not, + an assertion error occurs */ + + opt_check_order_by(sel_node); + +#ifdef UNIV_SQL_DEBUG + opt_print_query_plan(sel_node); +#endif +} + +/************************************************************************ +Prints info of a query plan. */ +UNIV_INTERN +void +opt_print_query_plan( +/*=================*/ + sel_node_t* sel_node) /* in: select node */ +{ + plan_t* plan; + ulint n_fields; + ulint i; + + fputs("QUERY PLAN FOR A SELECT NODE\n", stderr); + + fputs(sel_node->asc ? "Asc. search; " : "Desc. search; ", stderr); + + if (sel_node->set_x_locks) { + fputs("sets row x-locks; ", stderr); + ut_a(sel_node->row_lock_mode == LOCK_X); + ut_a(!sel_node->consistent_read); + } else if (sel_node->consistent_read) { + fputs("consistent read; ", stderr); + } else { + ut_a(sel_node->row_lock_mode == LOCK_S); + fputs("sets row s-locks; ", stderr); + } + + putc('\n', stderr); + + for (i = 0; i < sel_node->n_tables; i++) { + plan = sel_node_get_nth_plan(sel_node, i); + + if (plan->tuple) { + n_fields = dtuple_get_n_fields(plan->tuple); + } else { + n_fields = 0; + } + + fputs("Table ", stderr); + dict_index_name_print(stderr, NULL, plan->index); + fprintf(stderr,"; exact m. %lu, match %lu, end conds %lu\n", + (unsigned long) plan->n_exact_match, + (unsigned long) n_fields, + (unsigned long) UT_LIST_GET_LEN(plan->end_conds)); + } +} diff --git a/storage/xtradb/pars/pars0pars.c b/storage/xtradb/pars/pars0pars.c new file mode 100644 index 00000000000..62ae3b3d09b --- /dev/null +++ b/storage/xtradb/pars/pars0pars.c @@ -0,0 +1,2205 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser + +Created 11/19/1996 Heikki Tuuri +*******************************************************/ + +/* Historical note: Innobase executed its first SQL string (CREATE TABLE) +on 1/27/1998 */ + +#include "pars0pars.h" + +#ifdef UNIV_NONINL +#include "pars0pars.ic" +#endif + +#include "row0sel.h" +#include "row0ins.h" +#include "row0upd.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "dict0crea.h" +#include "que0que.h" +#include "pars0grm.h" +#include "pars0opt.h" +#include "data0data.h" +#include "data0type.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "lock0lock.h" +#include "eval0eval.h" + +#ifdef UNIV_SQL_DEBUG +/* If the following is set TRUE, the lexer will print the SQL string +as it tokenizes it */ + +ibool pars_print_lexed = FALSE; +#endif /* UNIV_SQL_DEBUG */ + +/* Global variable used while parsing a single procedure or query : the code is +NOT re-entrant */ +UNIV_INTERN sym_tab_t* pars_sym_tab_global; + +/* Global variables used to denote certain reserved words, used in +constructing the parsing tree */ + +UNIV_INTERN pars_res_word_t pars_to_char_token = {PARS_TO_CHAR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_to_number_token = {PARS_TO_NUMBER_TOKEN}; +UNIV_INTERN pars_res_word_t pars_to_binary_token = {PARS_TO_BINARY_TOKEN}; +UNIV_INTERN pars_res_word_t pars_binary_to_number_token = {PARS_BINARY_TO_NUMBER_TOKEN}; +UNIV_INTERN pars_res_word_t pars_substr_token = {PARS_SUBSTR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_replstr_token = {PARS_REPLSTR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_concat_token = {PARS_CONCAT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_instr_token = {PARS_INSTR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_length_token = {PARS_LENGTH_TOKEN}; +UNIV_INTERN pars_res_word_t pars_sysdate_token = {PARS_SYSDATE_TOKEN}; +UNIV_INTERN pars_res_word_t pars_printf_token = {PARS_PRINTF_TOKEN}; +UNIV_INTERN pars_res_word_t pars_assert_token = {PARS_ASSERT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_rnd_token = {PARS_RND_TOKEN}; +UNIV_INTERN pars_res_word_t pars_rnd_str_token = {PARS_RND_STR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_count_token = {PARS_COUNT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_sum_token = {PARS_SUM_TOKEN}; +UNIV_INTERN pars_res_word_t pars_distinct_token = {PARS_DISTINCT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_binary_token = {PARS_BINARY_TOKEN}; +UNIV_INTERN pars_res_word_t pars_blob_token = {PARS_BLOB_TOKEN}; +UNIV_INTERN pars_res_word_t pars_int_token = {PARS_INT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_char_token = {PARS_CHAR_TOKEN}; +UNIV_INTERN pars_res_word_t pars_float_token = {PARS_FLOAT_TOKEN}; +UNIV_INTERN pars_res_word_t pars_update_token = {PARS_UPDATE_TOKEN}; +UNIV_INTERN pars_res_word_t pars_asc_token = {PARS_ASC_TOKEN}; +UNIV_INTERN pars_res_word_t pars_desc_token = {PARS_DESC_TOKEN}; +UNIV_INTERN pars_res_word_t pars_open_token = {PARS_OPEN_TOKEN}; +UNIV_INTERN pars_res_word_t pars_close_token = {PARS_CLOSE_TOKEN}; +UNIV_INTERN 
pars_res_word_t pars_share_token = {PARS_SHARE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_unique_token = {PARS_UNIQUE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_clustered_token = {PARS_CLUSTERED_TOKEN};
+
+/* Global variable used to denote the '*' in SELECT * FROM.. */
+#define PARS_STAR_DENOTER 12345678
+UNIV_INTERN ulint pars_star_denoter = PARS_STAR_DENOTER;
+
+
+/*************************************************************************
+Determines the class of a function code. */
+static
+ulint
+pars_func_get_class(
+/*================*/
+ /* out: function class: PARS_FUNC_ARITH, ... */
+ int func) /* in: function code: '=', PARS_GE_TOKEN, ... */
+{
+ switch (func) {
+ case '+': case '-': case '*': case '/':
+ return(PARS_FUNC_ARITH);
+
+ case '=': case '<': case '>':
+ case PARS_GE_TOKEN: case PARS_LE_TOKEN: case PARS_NE_TOKEN:
+ return(PARS_FUNC_CMP);
+
+ case PARS_AND_TOKEN: case PARS_OR_TOKEN: case PARS_NOT_TOKEN:
+ return(PARS_FUNC_LOGICAL);
+
+ case PARS_COUNT_TOKEN: case PARS_SUM_TOKEN:
+ return(PARS_FUNC_AGGREGATE);
+
+ case PARS_TO_CHAR_TOKEN:
+ case PARS_TO_NUMBER_TOKEN:
+ case PARS_TO_BINARY_TOKEN:
+ case PARS_BINARY_TO_NUMBER_TOKEN:
+ case PARS_SUBSTR_TOKEN:
+ case PARS_CONCAT_TOKEN:
+ case PARS_LENGTH_TOKEN:
+ case PARS_INSTR_TOKEN:
+ case PARS_SYSDATE_TOKEN:
+ case PARS_NOTFOUND_TOKEN:
+ case PARS_PRINTF_TOKEN:
+ case PARS_ASSERT_TOKEN:
+ case PARS_RND_TOKEN:
+ case PARS_RND_STR_TOKEN:
+ case PARS_REPLSTR_TOKEN:
+ return(PARS_FUNC_PREDEFINED);
+
+ default:
+ return(PARS_FUNC_OTHER);
+ }
+}
+
+/*************************************************************************
+Parses an operator or predefined function expression. */
+static
+func_node_t*
+pars_func_low(
+/*==========*/
+ /* out, own: function node in a query tree */
+ int func, /* in: function token code */
+ que_node_t* arg) /* in: first argument in the argument list */
+{
+ func_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t));
+
+ node->common.type = QUE_NODE_FUNC;
+ dfield_set_data(&(node->common.val), NULL, 0);
+ node->common.val_buf_size = 0;
+
+ node->func = func;
+
+ node->class = pars_func_get_class(func);
+
+ node->args = arg;
+
+ UT_LIST_ADD_LAST(func_node_list, pars_sym_tab_global->func_node_list,
+ node);
+ return(node);
+}
+
+/*************************************************************************
+Parses a function expression. */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+ /* out, own: function node in a query tree */
+ que_node_t* res_word,/* in: function name reserved word */
+ que_node_t* arg) /* in: first argument in the argument list */
+{
+ return(pars_func_low(((pars_res_word_t*)res_word)->code, arg));
+}
+
+/*************************************************************************
+Parses an operator expression. */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+ /* out, own: function node in a query tree */
+ int func, /* in: operator token code */
+ que_node_t* arg1, /* in: first argument */
+ que_node_t* arg2) /* in: second argument or NULL for a unary
+ operator */
+{
+ que_node_list_add_last(NULL, arg1);
+
+ if (arg2) {
+ que_node_list_add_last(arg1, arg2);
+ }
+
+ return(pars_func_low(func, arg1));
+}
+
+/*************************************************************************
+Parses an ORDER BY clause. Order by a single column only is supported.
*/ +UNIV_INTERN +order_node_t* +pars_order_by( +/*==========*/ + /* out, own: order-by node in a query tree */ + sym_node_t* column, /* in: column name */ + pars_res_word_t* asc) /* in: &pars_asc_token or pars_desc_token */ +{ + order_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(order_node_t)); + + node->common.type = QUE_NODE_ORDER; + + node->column = column; + + if (asc == &pars_asc_token) { + node->asc = TRUE; + } else { + ut_a(asc == &pars_desc_token); + node->asc = FALSE; + } + + return(node); +} + +/************************************************************************* +Determine if a data type is a built-in string data type of the InnoDB +SQL parser. */ +static +ibool +pars_is_string_type( +/*================*/ + /* out: TRUE if string data type */ + ulint mtype) /* in: main data type */ +{ + switch (mtype) { + case DATA_VARCHAR: case DATA_CHAR: + case DATA_FIXBINARY: case DATA_BINARY: + return(TRUE); + } + + return(FALSE); +} + +/************************************************************************* +Resolves the data type of a function in an expression. The argument data +types must already be resolved. */ +static +void +pars_resolve_func_data_type( +/*========================*/ + func_node_t* node) /* in: function node */ +{ + que_node_t* arg; + + ut_a(que_node_get_type(node) == QUE_NODE_FUNC); + + arg = node->args; + + switch (node->func) { + case PARS_SUM_TOKEN: + case '+': case '-': case '*': case '/': + /* Inherit the data type from the first argument (which must + not be the SQL null literal whose type is DATA_ERROR) */ + + dtype_copy(que_node_get_data_type(node), + que_node_get_data_type(arg)); + + ut_a(dtype_get_mtype(que_node_get_data_type(node)) + == DATA_INT); + break; + + case PARS_COUNT_TOKEN: + ut_a(arg); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_TO_CHAR_TOKEN: + case PARS_RND_STR_TOKEN: + ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT); + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + DATA_ENGLISH, 0); + break; + + case PARS_TO_BINARY_TOKEN: + if (dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT) { + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + DATA_ENGLISH, 0); + } else { + dtype_set(que_node_get_data_type(node), DATA_BINARY, + 0, 0); + } + break; + + case PARS_TO_NUMBER_TOKEN: + case PARS_BINARY_TO_NUMBER_TOKEN: + case PARS_LENGTH_TOKEN: + case PARS_INSTR_TOKEN: + ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype)); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_SYSDATE_TOKEN: + ut_a(arg == NULL); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_SUBSTR_TOKEN: + case PARS_CONCAT_TOKEN: + ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype)); + dtype_set(que_node_get_data_type(node), DATA_VARCHAR, + DATA_ENGLISH, 0); + break; + + case '>': case '<': case '=': + case PARS_GE_TOKEN: + case PARS_LE_TOKEN: + case PARS_NE_TOKEN: + case PARS_AND_TOKEN: + case PARS_OR_TOKEN: + case PARS_NOT_TOKEN: + case PARS_NOTFOUND_TOKEN: + + /* We currently have no iboolean type: use integer type */ + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + case PARS_RND_TOKEN: + ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT); + dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4); + break; + + default: + ut_error; + } +} + +/************************************************************************* +Resolves the meaning of variables in an expression and 
the data types of +functions. It is an error if some identifier cannot be resolved here. */ +static +void +pars_resolve_exp_variables_and_types( +/*=================================*/ + sel_node_t* select_node, /* in: select node or NULL; if + this is not NULL then the variable + sym nodes are added to the + copy_variables list of select_node */ + que_node_t* exp_node) /* in: expression */ +{ + func_node_t* func_node; + que_node_t* arg; + sym_node_t* sym_node; + sym_node_t* node; + + ut_a(exp_node); + + if (que_node_get_type(exp_node) == QUE_NODE_FUNC) { + func_node = exp_node; + + arg = func_node->args; + + while (arg) { + pars_resolve_exp_variables_and_types(select_node, arg); + + arg = que_node_get_next(arg); + } + + pars_resolve_func_data_type(func_node); + + return; + } + + ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL); + + sym_node = exp_node; + + if (sym_node->resolved) { + + return; + } + + /* Not resolved yet: look in the symbol table for a variable + or a cursor or a function with the same name */ + + node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list); + + while (node) { + if (node->resolved + && ((node->token_type == SYM_VAR) + || (node->token_type == SYM_CURSOR) + || (node->token_type == SYM_FUNCTION)) + && node->name + && (sym_node->name_len == node->name_len) + && (ut_memcmp(sym_node->name, node->name, + node->name_len) == 0)) { + + /* Found a variable or a cursor declared with + the same name */ + + break; + } + + node = UT_LIST_GET_NEXT(sym_list, node); + } + + if (!node) { + fprintf(stderr, "PARSER ERROR: Unresolved identifier %s\n", + sym_node->name); + } + + ut_a(node); + + sym_node->resolved = TRUE; + sym_node->token_type = SYM_IMPLICIT_VAR; + sym_node->alias = node; + sym_node->indirection = node; + + if (select_node) { + UT_LIST_ADD_LAST(col_var_list, select_node->copy_variables, + sym_node); + } + + dfield_set_type(que_node_get_val(sym_node), + que_node_get_data_type(node)); +} + +/************************************************************************* +Resolves the meaning of variables in an expression list. It is an error if +some identifier cannot be resolved here. Resolves also the data types of +functions. */ +static +void +pars_resolve_exp_list_variables_and_types( +/*======================================*/ + sel_node_t* select_node, /* in: select node or NULL */ + que_node_t* exp_node) /* in: expression list first node, or + NULL */ +{ + while (exp_node) { + pars_resolve_exp_variables_and_types(select_node, exp_node); + + exp_node = que_node_get_next(exp_node); + } +} + +/************************************************************************* +Resolves the columns in an expression. 
*/ +static +void +pars_resolve_exp_columns( +/*=====================*/ + sym_node_t* table_node, /* in: first node in a table list */ + que_node_t* exp_node) /* in: expression */ +{ + func_node_t* func_node; + que_node_t* arg; + sym_node_t* sym_node; + dict_table_t* table; + sym_node_t* t_node; + ulint n_cols; + ulint i; + + ut_a(exp_node); + + if (que_node_get_type(exp_node) == QUE_NODE_FUNC) { + func_node = exp_node; + + arg = func_node->args; + + while (arg) { + pars_resolve_exp_columns(table_node, arg); + + arg = que_node_get_next(arg); + } + + return; + } + + ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL); + + sym_node = exp_node; + + if (sym_node->resolved) { + + return; + } + + /* Not resolved yet: look in the table list for a column with the + same name */ + + t_node = table_node; + + while (t_node) { + table = t_node->table; + + n_cols = dict_table_get_n_cols(table); + + for (i = 0; i < n_cols; i++) { + const dict_col_t* col + = dict_table_get_nth_col(table, i); + const char* col_name + = dict_table_get_col_name(table, i); + + if ((sym_node->name_len == ut_strlen(col_name)) + && (0 == ut_memcmp(sym_node->name, col_name, + sym_node->name_len))) { + /* Found */ + sym_node->resolved = TRUE; + sym_node->token_type = SYM_COLUMN; + sym_node->table = table; + sym_node->col_no = i; + sym_node->prefetch_buf = NULL; + + dict_col_copy_type( + col, + dfield_get_type(&sym_node + ->common.val)); + + return; + } + } + + t_node = que_node_get_next(t_node); + } +} + +/************************************************************************* +Resolves the meaning of columns in an expression list. */ +static +void +pars_resolve_exp_list_columns( +/*==========================*/ + sym_node_t* table_node, /* in: first node in a table list */ + que_node_t* exp_node) /* in: expression list first node, or + NULL */ +{ + while (exp_node) { + pars_resolve_exp_columns(table_node, exp_node); + + exp_node = que_node_get_next(exp_node); + } +} + +/************************************************************************* +Retrieves the table definition for a table name id. */ +static +void +pars_retrieve_table_def( +/*====================*/ + sym_node_t* sym_node) /* in: table node */ +{ + const char* table_name; + + ut_a(sym_node); + ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + sym_node->resolved = TRUE; + sym_node->token_type = SYM_TABLE; + + table_name = (const char*) sym_node->name; + + sym_node->table = dict_table_get_low(table_name); + + ut_a(sym_node->table); +} + +/************************************************************************* +Retrieves the table definitions for a list of table name ids. */ +static +ulint +pars_retrieve_table_list_defs( +/*==========================*/ + /* out: number of tables */ + sym_node_t* sym_node) /* in: first table node in list */ +{ + ulint count = 0; + + if (sym_node == NULL) { + + return(count); + } + + while (sym_node) { + pars_retrieve_table_def(sym_node); + + count++; + + sym_node = que_node_get_next(sym_node); + } + + return(count); +} + +/************************************************************************* +Adds all columns to the select list if the query is SELECT * FROM ... 
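+(During parsing, the '*' itself is represented by the special
+pars_star_denoter value stored in the select_list pointer, see
+PARS_STAR_DENOTER above; pars_select_statement() checks for it and calls
+this function to expand the column list.)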
*/
+static
+void
+pars_select_all_columns(
+/*====================*/
+	sel_node_t*	select_node)	/* in: select node already containing
+					the table list */
+{
+	sym_node_t*	col_node;
+	sym_node_t*	table_node;
+	dict_table_t*	table;
+	ulint		i;
+
+	select_node->select_list = NULL;
+
+	table_node = select_node->table_list;
+
+	while (table_node) {
+		table = table_node->table;
+
+		for (i = 0; i < dict_table_get_n_user_cols(table); i++) {
+			const char*	col_name = dict_table_get_col_name(
+				table, i);
+
+			col_node = sym_tab_add_id(pars_sym_tab_global,
+						  (byte*)col_name,
+						  ut_strlen(col_name));
+
+			select_node->select_list = que_node_list_add_last(
+				select_node->select_list, col_node);
+		}
+
+		table_node = que_node_get_next(table_node);
+	}
+}
+
+/*************************************************************************
+Parses a select list; creates a query graph node for the whole SELECT
+statement. */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+					/* out, own: select node in a query
+					tree */
+	que_node_t*	select_list,	/* in: select list */
+	sym_node_t*	into_list)	/* in: variables list or NULL */
+{
+	sel_node_t*	node;
+
+	node = sel_node_create(pars_sym_tab_global->heap);
+
+	node->select_list = select_list;
+	node->into_list = into_list;
+
+	pars_resolve_exp_list_variables_and_types(NULL, into_list);
+
+	return(node);
+}
+
+/*************************************************************************
+Checks if the query is an aggregate query, in which case the select list must
+contain only aggregate function items. */
+static
+void
+pars_check_aggregate(
+/*=================*/
+	sel_node_t*	select_node)	/* in: select node already containing
+					the select list */
+{
+	que_node_t*	exp_node;
+	func_node_t*	func_node;
+	ulint		n_nodes			= 0;
+	ulint		n_aggregate_nodes	= 0;
+
+	exp_node = select_node->select_list;
+
+	while (exp_node) {
+
+		n_nodes++;
+
+		if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+
+			func_node = exp_node;
+
+			if (func_node->class == PARS_FUNC_AGGREGATE) {
+
+				n_aggregate_nodes++;
+			}
+		}
+
+		exp_node = que_node_get_next(exp_node);
+	}
+
+	if (n_aggregate_nodes > 0) {
+		ut_a(n_nodes == n_aggregate_nodes);
+
+		select_node->is_aggregate = TRUE;
+	} else {
+		select_node->is_aggregate = FALSE;
+	}
+}
+
+/*************************************************************************
+Parses a select statement. */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+					/* out, own: select node in a query
+					tree */
+	sel_node_t*	select_node,	/* in: select node already containing
+					the select list */
+	sym_node_t*	table_list,	/* in: table list */
+	que_node_t*	search_cond,	/* in: search condition or NULL */
+	pars_res_word_t* for_update,	/* in: NULL or &pars_update_token */
+	pars_res_word_t* lock_shared,	/* in: NULL or &pars_share_token */
+	order_node_t*	order_by)	/* in: NULL or an order-by node */
+{
+	select_node->state = SEL_NODE_OPEN;
+
+	select_node->table_list = table_list;
+	select_node->n_tables = pars_retrieve_table_list_defs(table_list);
+
+	if (select_node->select_list == &pars_star_denoter) {
+
+		/* SELECT * FROM ... */
+		pars_select_all_columns(select_node);
+	}
+
+	if (select_node->into_list) {
+		ut_a(que_node_list_get_len(select_node->into_list)
+		     == que_node_list_get_len(select_node->select_list));
+	}
+
+	UT_LIST_INIT(select_node->copy_variables);
+
+	pars_resolve_exp_list_columns(table_list, select_node->select_list);
+	pars_resolve_exp_list_variables_and_types(select_node,
+						  select_node->select_list);
+	pars_check_aggregate(select_node);
+
+	select_node->search_cond = search_cond;
+
+	if (search_cond) {
+		pars_resolve_exp_columns(table_list, search_cond);
+		pars_resolve_exp_variables_and_types(select_node, search_cond);
+	}
+
+	if (for_update) {
+		ut_a(!lock_shared);
+
+		select_node->set_x_locks = TRUE;
+		select_node->row_lock_mode = LOCK_X;
+
+		select_node->consistent_read = FALSE;
+		select_node->read_view = NULL;
+	} else if (lock_shared) {
+		select_node->set_x_locks = FALSE;
+		select_node->row_lock_mode = LOCK_S;
+
+		select_node->consistent_read = FALSE;
+		select_node->read_view = NULL;
+	} else {
+		select_node->set_x_locks = FALSE;
+		select_node->row_lock_mode = LOCK_S;
+
+		select_node->consistent_read = TRUE;
+	}
+
+	select_node->order_by = order_by;
+
+	if (order_by) {
+		pars_resolve_exp_columns(table_list, order_by->column);
+	}
+
+	/* The final values of the following fields depend on the environment
+	where the select statement appears: */
+
+	select_node->can_get_updated = FALSE;
+	select_node->explicit_cursor = NULL;
+
+	opt_search_plan(select_node);
+
+	return(select_node);
+}
+
+/*************************************************************************
+Parses a cursor declaration. */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+					/* out: sym_node */
+	sym_node_t*	sym_node,	/* in: cursor id node in the symbol
+					table */
+	sel_node_t*	select_node)	/* in: select node */
+{
+	sym_node->resolved = TRUE;
+	sym_node->token_type = SYM_CURSOR;
+	sym_node->cursor_def = select_node;
+
+	select_node->state = SEL_NODE_CLOSED;
+	select_node->explicit_cursor = sym_node;
+
+	return(sym_node);
+}
+
+/*************************************************************************
+Parses a function declaration. */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+					/* out: sym_node */
+	sym_node_t*	sym_node)	/* in: function id node in the symbol
+					table */
+{
+	sym_node->resolved = TRUE;
+	sym_node->token_type = SYM_FUNCTION;
+
+	/* Check that the function exists. */
+	ut_a(pars_info_get_user_func(pars_sym_tab_global->info,
+				     sym_node->name));
+
+	return(sym_node);
+}
+
+/*************************************************************************
+Parses a delete or update statement start. */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+					/* out, own: update node in a query
+					tree */
+	ibool		is_delete,	/* in: TRUE if delete */
+	sym_node_t*	table_sym,	/* in: table name node */
+	col_assign_node_t* col_assign_list)/* in: column assignment list, NULL
+					if delete */
+{
+	upd_node_t*	node;
+
+	node = upd_node_create(pars_sym_tab_global->heap);
+
+	node->is_delete = is_delete;
+
+	node->table_sym = table_sym;
+	node->col_assign_list = col_assign_list;
+
+	return(node);
+}
+
+/*************************************************************************
+Parses a column assignment in an update.
*/ +UNIV_INTERN +col_assign_node_t* +pars_column_assignment( +/*===================*/ + /* out: column assignment node */ + sym_node_t* column, /* in: column to assign */ + que_node_t* exp) /* in: value to assign */ +{ + col_assign_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, + sizeof(col_assign_node_t)); + node->common.type = QUE_NODE_COL_ASSIGNMENT; + + node->col = column; + node->val = exp; + + return(node); +} + +/************************************************************************* +Processes an update node assignment list. */ +static +void +pars_process_assign_list( +/*=====================*/ + upd_node_t* node) /* in: update node */ +{ + col_assign_node_t* col_assign_list; + sym_node_t* table_sym; + col_assign_node_t* assign_node; + upd_field_t* upd_field; + dict_index_t* clust_index; + sym_node_t* col_sym; + ulint changes_ord_field; + ulint changes_field_size; + ulint n_assigns; + ulint i; + + table_sym = node->table_sym; + col_assign_list = node->col_assign_list; + clust_index = dict_table_get_first_index(node->table); + + assign_node = col_assign_list; + n_assigns = 0; + + while (assign_node) { + pars_resolve_exp_columns(table_sym, assign_node->col); + pars_resolve_exp_columns(table_sym, assign_node->val); + pars_resolve_exp_variables_and_types(NULL, assign_node->val); +#if 0 + ut_a(dtype_get_mtype( + dfield_get_type(que_node_get_val( + assign_node->col))) + == dtype_get_mtype( + dfield_get_type(que_node_get_val( + assign_node->val)))); +#endif + + /* Add to the update node all the columns found in assignment + values as columns to copy: therefore, TRUE */ + + opt_find_all_cols(TRUE, clust_index, &(node->columns), NULL, + assign_node->val); + n_assigns++; + + assign_node = que_node_get_next(assign_node); + } + + node->update = upd_create(n_assigns, pars_sym_tab_global->heap); + + assign_node = col_assign_list; + + changes_field_size = UPD_NODE_NO_SIZE_CHANGE; + + for (i = 0; i < n_assigns; i++) { + upd_field = upd_get_nth_field(node->update, i); + + col_sym = assign_node->col; + + upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos( + clust_index, col_sym->col_no), + clust_index, NULL); + upd_field->exp = assign_node->val; + + if (!dict_col_get_fixed_size( + dict_index_get_nth_col(clust_index, + upd_field->field_no))) { + changes_field_size = 0; + } + + assign_node = que_node_get_next(assign_node); + } + + /* Find out if the update can modify an ordering field in any index */ + + changes_ord_field = UPD_NODE_NO_ORD_CHANGE; + + if (row_upd_changes_some_index_ord_field_binary(node->table, + node->update)) { + changes_ord_field = 0; + } + + node->cmpl_info = changes_ord_field | changes_field_size; +} + +/************************************************************************* +Parses an update or delete statement. 
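+(Both searched updates and deletes, driven by a WHERE condition, and
+positioned ones, operating on the current row of an explicit cursor,
+end up here; the cursor_sym argument below distinguishes the two cases.)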
*/
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+					/* out, own: update node in a query
+					tree */
+	upd_node_t*	node,		/* in: update node */
+	sym_node_t*	cursor_sym,	/* in: pointer to a cursor entry in
+					the symbol table or NULL */
+	que_node_t*	search_cond)	/* in: search condition or NULL */
+{
+	sym_node_t*	table_sym;
+	sel_node_t*	sel_node;
+	plan_t*		plan;
+
+	table_sym = node->table_sym;
+
+	pars_retrieve_table_def(table_sym);
+	node->table = table_sym->table;
+
+	UT_LIST_INIT(node->columns);
+
+	/* Make the single table node into a list of table nodes of length 1 */
+
+	que_node_list_add_last(NULL, table_sym);
+
+	if (cursor_sym) {
+		pars_resolve_exp_variables_and_types(NULL, cursor_sym);
+
+		sel_node = cursor_sym->alias->cursor_def;
+
+		node->searched_update = FALSE;
+	} else {
+		sel_node = pars_select_list(NULL, NULL);
+
+		pars_select_statement(sel_node, table_sym, search_cond, NULL,
+				      &pars_share_token, NULL);
+		node->searched_update = TRUE;
+		sel_node->common.parent = node;
+	}
+
+	node->select = sel_node;
+
+	ut_a(!node->is_delete || (node->col_assign_list == NULL));
+	ut_a(node->is_delete || (node->col_assign_list != NULL));
+
+	if (node->is_delete) {
+		node->cmpl_info = 0;
+	} else {
+		pars_process_assign_list(node);
+	}
+
+	if (node->searched_update) {
+		node->has_clust_rec_x_lock = TRUE;
+		sel_node->set_x_locks = TRUE;
+		sel_node->row_lock_mode = LOCK_X;
+	} else {
+		node->has_clust_rec_x_lock = sel_node->set_x_locks;
+	}
+
+	ut_a(sel_node->n_tables == 1);
+	ut_a(sel_node->consistent_read == FALSE);
+	ut_a(sel_node->order_by == NULL);
+	ut_a(sel_node->is_aggregate == FALSE);
+
+	sel_node->can_get_updated = TRUE;
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	plan = sel_node_get_nth_plan(sel_node, 0);
+
+	plan->no_prefetch = TRUE;
+
+	if (!dict_index_is_clust(plan->index)) {
+
+		plan->must_get_clust = TRUE;
+
+		node->pcur = &(plan->clust_pcur);
+	} else {
+		node->pcur = &(plan->pcur);
+	}
+
+	return(node);
+}
+
+/*************************************************************************
+Parses an insert statement. */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+					/* out, own: insert node in a query
+					tree */
+	sym_node_t*	table_sym,	/* in: table name node */
+	que_node_t*	values_list,	/* in: value expression list or NULL */
+	sel_node_t*	select)		/* in: select statement or NULL */
+{
+	ins_node_t*	node;
+	dtuple_t*	row;
+	ulint		ins_type;
+
+	ut_a(values_list || select);
+	ut_a(!values_list || !select);
+
+	if (values_list) {
+		ins_type = INS_VALUES;
+	} else {
+		ins_type = INS_SEARCHED;
+	}
+
+	pars_retrieve_table_def(table_sym);
+
+	node = ins_node_create(ins_type, table_sym->table,
+			       pars_sym_tab_global->heap);
+
+	row = dtuple_create(pars_sym_tab_global->heap,
+			    dict_table_get_n_cols(node->table));
+
+	dict_table_copy_types(row, table_sym->table);
+
+	ins_node_set_new_row(node, row);
+
+	node->select = select;
+
+	if (select) {
+		select->common.parent = node;
+
+		ut_a(que_node_list_get_len(select->select_list)
+		     == dict_table_get_n_user_cols(table_sym->table));
+	}
+
+	node->values_list = values_list;
+
+	if (node->values_list) {
+		pars_resolve_exp_list_variables_and_types(NULL, values_list);
+
+		ut_a(que_node_list_get_len(values_list)
+		     == dict_table_get_n_user_cols(table_sym->table));
+	}
+
+	return(node);
+}
+
+/*************************************************************************
+Sets the type of a dfield.
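+(Summary of the mapping implemented below: INT maps to DATA_INT of width
+4; CHAR to DATA_VARCHAR with DATA_ENGLISH collation; BINARY(len) to
+DATA_FIXBINARY of the given length; BLOB to DATA_BLOB. The NOT NULL and
+UNSIGNED attributes set the DATA_NOT_NULL and DATA_UNSIGNED flag bits.)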
*/ +static +void +pars_set_dfield_type( +/*=================*/ + dfield_t* dfield, /* in: dfield */ + pars_res_word_t* type, /* in: pointer to a type + token */ + ulint len, /* in: length, or 0 */ + ibool is_unsigned, /* in: if TRUE, column is + UNSIGNED. */ + ibool is_not_null) /* in: if TRUE, column is + NOT NULL. */ +{ + ulint flags = 0; + + if (is_not_null) { + flags |= DATA_NOT_NULL; + } + + if (is_unsigned) { + flags |= DATA_UNSIGNED; + } + + if (type == &pars_int_token) { + ut_a(len == 0); + + dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4); + + } else if (type == &pars_char_token) { + ut_a(len == 0); + + dtype_set(dfield_get_type(dfield), DATA_VARCHAR, + DATA_ENGLISH | flags, 0); + } else if (type == &pars_binary_token) { + ut_a(len != 0); + + dtype_set(dfield_get_type(dfield), DATA_FIXBINARY, + DATA_BINARY_TYPE | flags, len); + } else if (type == &pars_blob_token) { + ut_a(len == 0); + + dtype_set(dfield_get_type(dfield), DATA_BLOB, + DATA_BINARY_TYPE | flags, 0); + } else { + ut_error; + } +} + +/************************************************************************* +Parses a variable declaration. */ +UNIV_INTERN +sym_node_t* +pars_variable_declaration( +/*======================*/ + /* out, own: symbol table node of type + SYM_VAR */ + sym_node_t* node, /* in: symbol table node allocated for the + id of the variable */ + pars_res_word_t* type) /* in: pointer to a type token */ +{ + node->resolved = TRUE; + node->token_type = SYM_VAR; + + node->param_type = PARS_NOT_PARAM; + + pars_set_dfield_type(que_node_get_val(node), type, 0, FALSE, FALSE); + + return(node); +} + +/************************************************************************* +Parses a procedure parameter declaration. */ +UNIV_INTERN +sym_node_t* +pars_parameter_declaration( +/*=======================*/ + /* out, own: symbol table node of type + SYM_VAR */ + sym_node_t* node, /* in: symbol table node allocated for the + id of the parameter */ + ulint param_type, + /* in: PARS_INPUT or PARS_OUTPUT */ + pars_res_word_t* type) /* in: pointer to a type token */ +{ + ut_a((param_type == PARS_INPUT) || (param_type == PARS_OUTPUT)); + + pars_variable_declaration(node, type); + + node->param_type = param_type; + + return(node); +} + +/************************************************************************* +Sets the parent field in a query node list. */ +static +void +pars_set_parent_in_list( +/*====================*/ + que_node_t* node_list, /* in: first node in a list */ + que_node_t* parent) /* in: parent value to set in all + nodes of the list */ +{ + que_common_t* common; + + common = node_list; + + while (common) { + common->parent = parent; + + common = que_node_get_next(common); + } +} + +/************************************************************************* +Parses an elsif element. */ +UNIV_INTERN +elsif_node_t* +pars_elsif_element( +/*===============*/ + /* out: elsif node */ + que_node_t* cond, /* in: if-condition */ + que_node_t* stat_list) /* in: statement list */ +{ + elsif_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(elsif_node_t)); + + node->common.type = QUE_NODE_ELSIF; + + node->cond = cond; + + pars_resolve_exp_variables_and_types(NULL, cond); + + node->stat_list = stat_list; + + return(node); +} + +/************************************************************************* +Parses an if-statement. 
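+(Illustrative only: the surrounding grammar accepts the form
+
+	IF cond THEN stmts;
+	ELSIF cond THEN stmts;
+	ELSE stmts;
+	END IF;
+
+where the ELSIF list and the ELSE part are optional.)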
*/ +UNIV_INTERN +if_node_t* +pars_if_statement( +/*==============*/ + /* out: if-statement node */ + que_node_t* cond, /* in: if-condition */ + que_node_t* stat_list, /* in: statement list */ + que_node_t* else_part) /* in: else-part statement list + or elsif element list */ +{ + if_node_t* node; + elsif_node_t* elsif_node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(if_node_t)); + + node->common.type = QUE_NODE_IF; + + node->cond = cond; + + pars_resolve_exp_variables_and_types(NULL, cond); + + node->stat_list = stat_list; + + if (else_part && (que_node_get_type(else_part) == QUE_NODE_ELSIF)) { + + /* There is a list of elsif conditions */ + + node->else_part = NULL; + node->elsif_list = else_part; + + elsif_node = else_part; + + while (elsif_node) { + pars_set_parent_in_list(elsif_node->stat_list, node); + + elsif_node = que_node_get_next(elsif_node); + } + } else { + node->else_part = else_part; + node->elsif_list = NULL; + + pars_set_parent_in_list(else_part, node); + } + + pars_set_parent_in_list(stat_list, node); + + return(node); +} + +/************************************************************************* +Parses a while-statement. */ +UNIV_INTERN +while_node_t* +pars_while_statement( +/*=================*/ + /* out: while-statement node */ + que_node_t* cond, /* in: while-condition */ + que_node_t* stat_list) /* in: statement list */ +{ + while_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(while_node_t)); + + node->common.type = QUE_NODE_WHILE; + + node->cond = cond; + + pars_resolve_exp_variables_and_types(NULL, cond); + + node->stat_list = stat_list; + + pars_set_parent_in_list(stat_list, node); + + return(node); +} + +/************************************************************************* +Parses a for-loop-statement. */ +UNIV_INTERN +for_node_t* +pars_for_statement( +/*===============*/ + /* out: for-statement node */ + sym_node_t* loop_var, /* in: loop variable */ + que_node_t* loop_start_limit,/* in: loop start expression */ + que_node_t* loop_end_limit, /* in: loop end expression */ + que_node_t* stat_list) /* in: statement list */ +{ + for_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t)); + + node->common.type = QUE_NODE_FOR; + + pars_resolve_exp_variables_and_types(NULL, loop_var); + pars_resolve_exp_variables_and_types(NULL, loop_start_limit); + pars_resolve_exp_variables_and_types(NULL, loop_end_limit); + + node->loop_var = loop_var->indirection; + + ut_a(loop_var->indirection); + + node->loop_start_limit = loop_start_limit; + node->loop_end_limit = loop_end_limit; + + node->stat_list = stat_list; + + pars_set_parent_in_list(stat_list, node); + + return(node); +} + +/************************************************************************* +Parses an exit statement. */ +UNIV_INTERN +exit_node_t* +pars_exit_statement(void) +/*=====================*/ + /* out: exit statement node */ +{ + exit_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t)); + node->common.type = QUE_NODE_EXIT; + + return(node); +} + +/************************************************************************* +Parses a return-statement. 
*/ +UNIV_INTERN +return_node_t* +pars_return_statement(void) +/*=======================*/ + /* out: return-statement node */ +{ + return_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, + sizeof(return_node_t)); + node->common.type = QUE_NODE_RETURN; + + return(node); +} + +/************************************************************************* +Parses an assignment statement. */ +UNIV_INTERN +assign_node_t* +pars_assignment_statement( +/*======================*/ + /* out: assignment statement node */ + sym_node_t* var, /* in: variable to assign */ + que_node_t* val) /* in: value to assign */ +{ + assign_node_t* node; + + node = mem_heap_alloc(pars_sym_tab_global->heap, + sizeof(assign_node_t)); + node->common.type = QUE_NODE_ASSIGNMENT; + + node->var = var; + node->val = val; + + pars_resolve_exp_variables_and_types(NULL, var); + pars_resolve_exp_variables_and_types(NULL, val); + + ut_a(dtype_get_mtype(dfield_get_type(que_node_get_val(var))) + == dtype_get_mtype(dfield_get_type(que_node_get_val(val)))); + + return(node); +} + +/************************************************************************* +Parses a procedure call. */ +UNIV_INTERN +func_node_t* +pars_procedure_call( +/*================*/ + /* out: function node */ + que_node_t* res_word,/* in: procedure name reserved word */ + que_node_t* args) /* in: argument list */ +{ + func_node_t* node; + + node = pars_func(res_word, args); + + pars_resolve_exp_list_variables_and_types(NULL, args); + + return(node); +} + +/************************************************************************* +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. */ +UNIV_INTERN +fetch_node_t* +pars_fetch_statement( +/*=================*/ + /* out: fetch statement node */ + sym_node_t* cursor, /* in: cursor node */ + sym_node_t* into_list, /* in: variables to set, or NULL */ + sym_node_t* user_func) /* in: user function name, or NULL */ +{ + sym_node_t* cursor_decl; + fetch_node_t* node; + + /* Logical XOR. */ + ut_a(!into_list != !user_func); + + node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(fetch_node_t)); + + node->common.type = QUE_NODE_FETCH; + + pars_resolve_exp_variables_and_types(NULL, cursor); + + if (into_list) { + pars_resolve_exp_list_variables_and_types(NULL, into_list); + node->into_list = into_list; + node->func = NULL; + } else { + pars_resolve_exp_variables_and_types(NULL, user_func); + + node->func = pars_info_get_user_func(pars_sym_tab_global->info, + user_func->name); + ut_a(node->func); + + node->into_list = NULL; + } + + cursor_decl = cursor->alias; + + ut_a(cursor_decl->token_type == SYM_CURSOR); + + node->cursor_def = cursor_decl->cursor_def; + + if (into_list) { + ut_a(que_node_list_get_len(into_list) + == que_node_list_get_len(node->cursor_def->select_list)); + } + + return(node); +} + +/************************************************************************* +Parses an open or close cursor statement. 
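+(Illustrative syntax: OPEN cursor_name; and CLOSE cursor_name; the type
+argument below selects between the two operations.)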
*/
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+				/* out: open-statement node */
+	ulint		type,	/* in: ROW_SEL_OPEN_CURSOR
+				or ROW_SEL_CLOSE_CURSOR */
+	sym_node_t*	cursor)	/* in: cursor node */
+{
+	sym_node_t*	cursor_decl;
+	open_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(open_node_t));
+
+	node->common.type = QUE_NODE_OPEN;
+
+	pars_resolve_exp_variables_and_types(NULL, cursor);
+
+	cursor_decl = cursor->alias;
+
+	ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+	node->op_type = type;
+	node->cursor_def = cursor_decl->cursor_def;
+
+	return(node);
+}
+
+/*************************************************************************
+Parses a row_printf-statement. */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+					/* out: row_printf-statement node */
+	sel_node_t*	sel_node)	/* in: select node */
+{
+	row_printf_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap,
+			      sizeof(row_printf_node_t));
+	node->common.type = QUE_NODE_ROW_PRINTF;
+
+	node->sel_node = sel_node;
+
+	sel_node->common.parent = node;
+
+	return(node);
+}
+
+/*************************************************************************
+Parses a commit statement. */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void)
+/*=======================*/
+{
+	return(commit_node_create(pars_sym_tab_global->heap));
+}
+
+/*************************************************************************
+Parses a rollback statement. */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void)
+/*=========================*/
+{
+	return(roll_node_create(pars_sym_tab_global->heap));
+}
+
+/*************************************************************************
+Parses a column definition at a table creation. */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+						/* out: column sym table
+						node */
+	sym_node_t*		sym_node,	/* in: column node in the
+						symbol table */
+	pars_res_word_t*	type,		/* in: data type */
+	sym_node_t*		len,		/* in: length of column, or
+						NULL */
+	void*			is_unsigned,	/* in: if not NULL, column
+						is of type UNSIGNED. */
+	void*			is_not_null)	/* in: if not NULL, column
+						is of type NOT NULL. */
+{
+	ulint	len2;
+
+	if (len) {
+		len2 = eval_node_get_int_val(len);
+	} else {
+		len2 = 0;
+	}
+
+	pars_set_dfield_type(que_node_get_val(sym_node), type, len2,
+			     is_unsigned != NULL, is_not_null != NULL);
+
+	return(sym_node);
+}
+
+/*************************************************************************
+Parses a table creation operation. */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+					/* out: table create subgraph */
+	sym_node_t*	table_sym,	/* in: table name node in the symbol
+					table */
+	sym_node_t*	column_defs,	/* in: list of column names */
+	void*		not_fit_in_memory __attribute__((unused)))
+					/* in: a non-NULL pointer means that
+					this is a table which in simulations
+					should be simulated as not fitting
+					in memory; thread is put to sleep
+					to simulate disk accesses; NOTE that
+					this flag is not stored to the data
+					dictionary on disk, and the database
+					will forget about non-NULL value if
+					it has to reload the table definition
+					from disk */
+{
+	dict_table_t*	table;
+	sym_node_t*	column;
+	tab_node_t*	node;
+	const dtype_t*	dtype;
+	ulint		n_cols;
+
+	n_cols = que_node_list_get_len(column_defs);
+
+	/* As the InnoDB SQL parser is for internal use only,
+	for creating some system tables, this function will only
+	create tables in the old (not compact) record format.
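+
+	For illustration only, internal SQL of the following shape (the
+	table name is hypothetical) is parsed into this subgraph:
+
+		CREATE TABLE SYS_FOO (ID INT, NAME CHAR);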
*/ + table = dict_mem_table_create(table_sym->name, 0, n_cols, 0); + +#ifdef UNIV_DEBUG + if (not_fit_in_memory != NULL) { + table->does_not_fit_in_memory = TRUE; + } +#endif /* UNIV_DEBUG */ + column = column_defs; + + while (column) { + dtype = dfield_get_type(que_node_get_val(column)); + + dict_mem_table_add_col(table, table->heap, + column->name, dtype->mtype, + dtype->prtype, dtype->len); + column->resolved = TRUE; + column->token_type = SYM_COLUMN; + + column = que_node_get_next(column); + } + + node = tab_create_graph_create(table, pars_sym_tab_global->heap); + + table_sym->resolved = TRUE; + table_sym->token_type = SYM_TABLE; + + return(node); +} + +/************************************************************************* +Parses an index creation operation. */ +UNIV_INTERN +ind_node_t* +pars_create_index( +/*==============*/ + /* out: index create subgraph */ + pars_res_word_t* unique_def, /* in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /* in: not NULL if a clustered index */ + sym_node_t* index_sym, /* in: index name node in the symbol + table */ + sym_node_t* table_sym, /* in: table name node in the symbol + table */ + sym_node_t* column_list) /* in: list of column names */ +{ + dict_index_t* index; + sym_node_t* column; + ind_node_t* node; + ulint n_fields; + ulint ind_type; + + n_fields = que_node_list_get_len(column_list); + + ind_type = 0; + + if (unique_def) { + ind_type = ind_type | DICT_UNIQUE; + } + + if (clustered_def) { + ind_type = ind_type | DICT_CLUSTERED; + } + + index = dict_mem_index_create(table_sym->name, index_sym->name, 0, + ind_type, n_fields); + column = column_list; + + while (column) { + dict_mem_index_add_field(index, column->name, 0); + + column->resolved = TRUE; + column->token_type = SYM_COLUMN; + + column = que_node_get_next(column); + } + + node = ind_create_graph_create(index, pars_sym_tab_global->heap); + + table_sym->resolved = TRUE; + table_sym->token_type = SYM_TABLE; + + index_sym->resolved = TRUE; + index_sym->token_type = SYM_TABLE; + + return(node); +} + +/************************************************************************* +Parses a procedure definition. */ +UNIV_INTERN +que_fork_t* +pars_procedure_definition( +/*======================*/ + /* out: query fork node */ + sym_node_t* sym_node, /* in: procedure id node in the symbol + table */ + sym_node_t* param_list, /* in: parameter declaration list */ + que_node_t* stat_list) /* in: statement list */ +{ + proc_node_t* node; + que_fork_t* fork; + que_thr_t* thr; + mem_heap_t* heap; + + heap = pars_sym_tab_global->heap; + + fork = que_fork_create(NULL, NULL, QUE_FORK_PROCEDURE, heap); + fork->trx = NULL; + + thr = que_thr_create(fork, heap); + + node = mem_heap_alloc(heap, sizeof(proc_node_t)); + + node->common.type = QUE_NODE_PROC; + node->common.parent = thr; + + sym_node->token_type = SYM_PROCEDURE_NAME; + sym_node->resolved = TRUE; + + node->proc_id = sym_node; + node->param_list = param_list; + node->stat_list = stat_list; + + pars_set_parent_in_list(stat_list, node); + + node->sym_tab = pars_sym_tab_global; + + thr->child = node; + + pars_sym_tab_global->query_graph = fork; + + return(fork); +} + +/***************************************************************** +Parses a stored procedure call, when this is not within another stored +procedure, that is, the client issues a procedure call directly. +In MySQL/InnoDB, stored InnoDB procedures are invoked via the +parsed procedure tree, not via InnoDB SQL, so this function is not used. 
*/ +UNIV_INTERN +que_fork_t* +pars_stored_procedure_call( +/*=======================*/ + /* out: query graph */ + sym_node_t* sym_node __attribute__((unused))) + /* in: stored procedure name */ +{ + ut_error; + return(NULL); +} + +/***************************************************************** +Retrieves characters to the lexical analyzer. */ +UNIV_INTERN +void +pars_get_lex_chars( +/*===============*/ + char* buf, /* in/out: buffer where to copy */ + int* result, /* out: number of characters copied or EOF */ + int max_size) /* in: maximum number of characters which fit + in the buffer */ +{ + int len; + + len = pars_sym_tab_global->string_len + - pars_sym_tab_global->next_char_pos; + if (len == 0) { +#ifdef YYDEBUG + /* fputs("SQL string ends\n", stderr); */ +#endif + *result = 0; + + return; + } + + if (len > max_size) { + len = max_size; + } + +#ifdef UNIV_SQL_DEBUG + if (pars_print_lexed) { + + if (len >= 5) { + len = 5; + } + + fwrite(pars_sym_tab_global->sql_string + + pars_sym_tab_global->next_char_pos, + 1, len, stderr); + } +#endif /* UNIV_SQL_DEBUG */ + + ut_memcpy(buf, pars_sym_tab_global->sql_string + + pars_sym_tab_global->next_char_pos, len); + *result = len; + + pars_sym_tab_global->next_char_pos += len; +} + +/***************************************************************** +Called by yyparse on error. */ +UNIV_INTERN +void +yyerror( +/*====*/ + const char* s __attribute__((unused))) + /* in: error message string */ +{ + ut_ad(s); + + fputs("PARSER ERROR: Syntax error in SQL string\n", stderr); + + ut_error; +} + +/***************************************************************** +Parses an SQL string returning the query graph. */ +UNIV_INTERN +que_t* +pars_sql( +/*=====*/ + /* out, own: the query graph */ + pars_info_t* info, /* in: extra information, or NULL */ + const char* str) /* in: SQL string */ +{ + sym_node_t* sym_node; + mem_heap_t* heap; + que_t* graph; + + ut_ad(str); + + heap = mem_heap_create(256); + + /* Currently, the parser is not reentrant: */ + ut_ad(mutex_own(&(dict_sys->mutex))); + + pars_sym_tab_global = sym_tab_create(heap); + + pars_sym_tab_global->string_len = strlen(str); + pars_sym_tab_global->sql_string = mem_heap_dup( + heap, str, pars_sym_tab_global->string_len + 1); + pars_sym_tab_global->next_char_pos = 0; + pars_sym_tab_global->info = info; + + yyparse(); + + sym_node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list); + + while (sym_node) { + ut_a(sym_node->resolved); + + sym_node = UT_LIST_GET_NEXT(sym_list, sym_node); + } + + graph = pars_sym_tab_global->query_graph; + + graph->sym_tab = pars_sym_tab_global; + graph->info = info; + + /* fprintf(stderr, "SQL graph size %lu\n", mem_heap_get_size(heap)); */ + + return(graph); +} + +/********************************************************************** +Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. The fork created is of +type QUE_FORK_MYSQL_INTERFACE. 
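+
+As an illustration (a sketch modelled on internal callers such as the
+rollback code, not a prescription), the returned query thread is
+typically driven like this:
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);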
*/ +UNIV_INTERN +que_thr_t* +pars_complete_graph_for_exec( +/*=========================*/ + /* out: query thread node to run */ + que_node_t* node, /* in: root node for an incomplete + query graph */ + trx_t* trx, /* in: transaction handle */ + mem_heap_t* heap) /* in: memory heap from which allocated */ +{ + que_fork_t* fork; + que_thr_t* thr; + + fork = que_fork_create(NULL, NULL, QUE_FORK_MYSQL_INTERFACE, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + + thr->child = node; + + que_node_set_parent(node, thr); + + trx->graph = NULL; + + return(thr); +} + +/******************************************************************** +Create parser info struct.*/ +UNIV_INTERN +pars_info_t* +pars_info_create(void) +/*==================*/ + /* out, own: info struct */ +{ + pars_info_t* info; + mem_heap_t* heap; + + heap = mem_heap_create(512); + + info = mem_heap_alloc(heap, sizeof(*info)); + + info->heap = heap; + info->funcs = NULL; + info->bound_lits = NULL; + info->bound_ids = NULL; + info->graph_owns_us = TRUE; + + return(info); +} + +/******************************************************************** +Free info struct and everything it contains.*/ +UNIV_INTERN +void +pars_info_free( +/*===========*/ + pars_info_t* info) /* in: info struct */ +{ + mem_heap_free(info->heap); +} + +/******************************************************************** +Add bound literal. */ +UNIV_INTERN +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype) /* in: precise type, e.g. + DATA_UNSIGNED */ +{ + pars_bound_lit_t* pbl; + + ut_ad(!pars_info_get_bound_lit(info, name)); + + pbl = mem_heap_alloc(info->heap, sizeof(*pbl)); + + pbl->name = name; + pbl->address = address; + pbl->length = length; + pbl->type = type; + pbl->prtype = prtype; + + if (!info->bound_lits) { + info->bound_lits = ib_vector_create(info->heap, 8); + } + + ib_vector_push(info->bound_lits, pbl); +} + +/******************************************************************** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). */ +UNIV_INTERN +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const char* str) /* in: string */ +{ + pars_info_add_literal(info, name, str, strlen(str), + DATA_VARCHAR, DATA_ENGLISH); +} + +/******************************************************************** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +UNIV_INTERN +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + lint val) /* in: value */ +{ + byte* buf = mem_heap_alloc(info->heap, 4); + + mach_write_to_4(buf, val); + pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); +} + +/******************************************************************** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. 
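+
+For illustration, bound literals are typically used like this (a sketch
+only; the names are hypothetical):
+
+	info = pars_info_create();
+	pars_info_add_str_literal(info, "table_name", name);
+	graph = pars_sql(info, "PROCEDURE P () IS BEGIN ... END;");
+
+where :table_name inside the SQL string then refers to the bound value.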
*/ +UNIV_INTERN +void +pars_info_add_dulint_literal( +/*=========================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + dulint val) /* in: value */ +{ + byte* buf = mem_heap_alloc(info->heap, 8); + + mach_write_to_8(buf, val); + + pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); +} + +/******************************************************************** +Add user function. */ +UNIV_INTERN +void +pars_info_add_function( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: function name */ + pars_user_func_cb_t func, /* in: function address */ + void* arg) /* in: user-supplied argument */ +{ + pars_user_func_t* puf; + + ut_ad(!pars_info_get_user_func(info, name)); + + puf = mem_heap_alloc(info->heap, sizeof(*puf)); + + puf->name = name; + puf->func = func; + puf->arg = arg; + + if (!info->funcs) { + info->funcs = ib_vector_create(info->heap, 8); + } + + ib_vector_push(info->funcs, puf); +} + +/******************************************************************** +Add bound id. */ +UNIV_INTERN +void +pars_info_add_id( +/*=============*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const char* id) /* in: id */ +{ + pars_bound_id_t* bid; + + ut_ad(!pars_info_get_bound_id(info, name)); + + bid = mem_heap_alloc(info->heap, sizeof(*bid)); + + bid->name = name; + bid->id = id; + + if (!info->bound_ids) { + info->bound_ids = ib_vector_create(info->heap, 8); + } + + ib_vector_push(info->bound_ids, bid); +} + +/******************************************************************** +Get user function with the given name.*/ +UNIV_INTERN +pars_user_func_t* +pars_info_get_user_func( +/*====================*/ + /* out: user func, or NULL if not + found */ + pars_info_t* info, /* in: info struct */ + const char* name) /* in: function name to find*/ +{ + ulint i; + ib_vector_t* vec; + + if (!info || !info->funcs) { + return(NULL); + } + + vec = info->funcs; + + for (i = 0; i < ib_vector_size(vec); i++) { + pars_user_func_t* puf = ib_vector_get(vec, i); + + if (strcmp(puf->name, name) == 0) { + return(puf); + } + } + + return(NULL); +} + +/******************************************************************** +Get bound literal with the given name.*/ +UNIV_INTERN +pars_bound_lit_t* +pars_info_get_bound_lit( +/*====================*/ + /* out: bound literal, or NULL if + not found */ + pars_info_t* info, /* in: info struct */ + const char* name) /* in: bound literal name to find */ +{ + ulint i; + ib_vector_t* vec; + + if (!info || !info->bound_lits) { + return(NULL); + } + + vec = info->bound_lits; + + for (i = 0; i < ib_vector_size(vec); i++) { + pars_bound_lit_t* pbl = ib_vector_get(vec, i); + + if (strcmp(pbl->name, name) == 0) { + return(pbl); + } + } + + return(NULL); +} + +/******************************************************************** +Get bound id with the given name.*/ +UNIV_INTERN +pars_bound_id_t* +pars_info_get_bound_id( +/*===================*/ + /* out: bound id, or NULL if not + found */ + pars_info_t* info, /* in: info struct */ + const char* name) /* in: bound id name to find */ +{ + ulint i; + ib_vector_t* vec; + + if (!info || !info->bound_ids) { + return(NULL); + } + + vec = info->bound_ids; + + for (i = 0; i < ib_vector_size(vec); i++) { + pars_bound_id_t* bid = ib_vector_get(vec, i); + + if (strcmp(bid->name, name) == 0) { + return(bid); + } + } + + return(NULL); +} diff --git a/storage/xtradb/pars/pars0sym.c b/storage/xtradb/pars/pars0sym.c new file mode 
100644 index 00000000000..fb23547e767 --- /dev/null +++ b/storage/xtradb/pars/pars0sym.c @@ -0,0 +1,370 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#include "pars0sym.h" + +#ifdef UNIV_NONINL +#include "pars0sym.ic" +#endif + +#include "mem0mem.h" +#include "data0type.h" +#include "data0data.h" +#include "pars0grm.h" +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" +#include "row0sel.h" + +/********************************************************************** +Creates a symbol table for a single stored procedure or query. */ +UNIV_INTERN +sym_tab_t* +sym_tab_create( +/*===========*/ + /* out, own: symbol table */ + mem_heap_t* heap) /* in: memory heap where to create */ +{ + sym_tab_t* sym_tab; + + sym_tab = mem_heap_alloc(heap, sizeof(sym_tab_t)); + + UT_LIST_INIT(sym_tab->sym_list); + UT_LIST_INIT(sym_tab->func_node_list); + + sym_tab->heap = heap; + + return(sym_tab); +} + +/********************************************************************** +Frees the memory allocated dynamically AFTER parsing phase for variables +etc. in the symbol table. Does not free the mem heap where the table was +originally created. Frees also SQL explicit cursor definitions. */ +UNIV_INTERN +void +sym_tab_free_private( +/*=================*/ + sym_tab_t* sym_tab) /* in, own: symbol table */ +{ + sym_node_t* sym; + func_node_t* func; + + sym = UT_LIST_GET_FIRST(sym_tab->sym_list); + + while (sym) { + eval_node_free_val_buf(sym); + + if (sym->prefetch_buf) { + sel_col_prefetch_buf_free(sym->prefetch_buf); + } + + if (sym->cursor_def) { + que_graph_free_recursive(sym->cursor_def); + } + + sym = UT_LIST_GET_NEXT(sym_list, sym); + } + + func = UT_LIST_GET_FIRST(sym_tab->func_node_list); + + while (func) { + eval_node_free_val_buf(func); + + func = UT_LIST_GET_NEXT(func_node_list, func); + } +} + +/********************************************************************** +Adds an integer literal to a symbol table. 
*/ +UNIV_INTERN +sym_node_t* +sym_tab_add_int_lit( +/*================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + ulint val) /* in: integer value */ +{ + sym_node_t* node; + byte* data; + + node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)); + + node->common.type = QUE_NODE_SYMBOL; + + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + dtype_set(dfield_get_type(&node->common.val), DATA_INT, 0, 4); + + data = mem_heap_alloc(sym_tab->heap, 4); + mach_write_to_4(data, val); + + dfield_set_data(&(node->common.val), data, 4); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + node->sym_table = sym_tab; + + return(node); +} + +/********************************************************************** +Adds a string literal to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_str_lit( +/*================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + byte* str, /* in: string with no quotes around + it */ + ulint len) /* in: string length */ +{ + sym_node_t* node; + byte* data; + + node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)); + + node->common.type = QUE_NODE_SYMBOL; + + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + dtype_set(dfield_get_type(&node->common.val), + DATA_VARCHAR, DATA_ENGLISH, 0); + + if (len) { + data = mem_heap_alloc(sym_tab->heap, len); + ut_memcpy(data, str, len); + } else { + data = NULL; + } + + dfield_set_data(&(node->common.val), data, len); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + node->sym_table = sym_tab; + + return(node); +} + +/********************************************************************** +Add a bound literal to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_lit( +/*==================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + const char* name, /* in: name of bound literal */ + ulint* lit_type) /* out: type of literal (PARS_*_LIT) */ +{ + sym_node_t* node; + pars_bound_lit_t* blit; + ulint len = 0; + + blit = pars_info_get_bound_lit(sym_tab->info, name); + ut_a(blit); + + node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)); + + node->common.type = QUE_NODE_SYMBOL; + + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + switch (blit->type) { + case DATA_FIXBINARY: + len = blit->length; + *lit_type = PARS_FIXBINARY_LIT; + break; + + case DATA_BLOB: + *lit_type = PARS_BLOB_LIT; + break; + + case DATA_VARCHAR: + *lit_type = PARS_STR_LIT; + break; + + case DATA_CHAR: + ut_a(blit->length > 0); + + len = blit->length; + *lit_type = PARS_STR_LIT; + break; + + case DATA_INT: + ut_a(blit->length > 0); + ut_a(blit->length <= 8); + + len = blit->length; + *lit_type = PARS_INT_LIT; + break; + + default: + ut_error; + } + + dtype_set(dfield_get_type(&node->common.val), + blit->type, blit->prtype, len); + + dfield_set_data(&(node->common.val), blit->address, blit->length); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + node->sym_table = sym_tab; + + return(node); +} + +/********************************************************************** +Adds an SQL null literal to a symbol table. 
*/ +UNIV_INTERN +sym_node_t* +sym_tab_add_null_lit( +/*=================*/ + /* out: symbol table node */ + sym_tab_t* sym_tab) /* in: symbol table */ +{ + sym_node_t* node; + + node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)); + + node->common.type = QUE_NODE_SYMBOL; + + node->resolved = TRUE; + node->token_type = SYM_LIT; + + node->indirection = NULL; + + dfield_get_type(&node->common.val)->mtype = DATA_ERROR; + + dfield_set_null(&node->common.val); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + node->sym_table = sym_tab; + + return(node); +} + +/********************************************************************** +Adds an identifier to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_id( +/*===========*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + byte* name, /* in: identifier name */ + ulint len) /* in: identifier length */ +{ + sym_node_t* node; + + node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)); + + node->common.type = QUE_NODE_SYMBOL; + + node->resolved = FALSE; + node->indirection = NULL; + + node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len); + node->name_len = len; + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + dfield_set_null(&node->common.val); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + node->sym_table = sym_tab; + + return(node); +} + +/********************************************************************** +Add a bound identifier to a symbol table. */ +UNIV_INTERN +sym_node_t* +sym_tab_add_bound_id( +/*===========*/ + /* out: symbol table node */ + sym_tab_t* sym_tab, /* in: symbol table */ + const char* name) /* in: name of bound id */ +{ + sym_node_t* node; + pars_bound_id_t* bid; + + bid = pars_info_get_bound_id(sym_tab->info, name); + ut_a(bid); + + node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)); + + node->common.type = QUE_NODE_SYMBOL; + + node->resolved = FALSE; + node->indirection = NULL; + + node->name = mem_heap_strdup(sym_tab->heap, bid->id); + node->name_len = strlen(node->name); + + UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node); + + dfield_set_null(&node->common.val); + + node->common.val_buf_size = 0; + node->prefetch_buf = NULL; + node->cursor_def = NULL; + + node->sym_table = sym_tab; + + return(node); +} diff --git a/storage/xtradb/plug.in b/storage/xtradb/plug.in new file mode 100644 index 00000000000..4d5e792bfb1 --- /dev/null +++ b/storage/xtradb/plug.in @@ -0,0 +1,80 @@ +# +# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., 59 Temple +# Place, Suite 330, Boston, MA 02111-1307 USA +# + +MYSQL_STORAGE_ENGINE(innobase, innodb, [InnoDB Storage Engine], + [Transactional Tables using InnoDB], [max,max-no-ndb]) +MYSQL_PLUGIN_DIRECTORY(innobase, [storage/xtradb]) +MYSQL_PLUGIN_STATIC(innobase, [libinnobase.a]) +MYSQL_PLUGIN_DYNAMIC(innobase, [ha_innodb.la]) +MYSQL_PLUGIN_ACTIONS(innobase, [ + AC_CHECK_HEADERS(sched.h) + AC_CHECK_SIZEOF(int, 4) + AC_CHECK_SIZEOF(long, 4) + AC_CHECK_SIZEOF(void*, 4) + AC_CHECK_FUNCS(sched_yield fdatasync localtime_r) + AC_C_BIGENDIAN + case "$target_os" in + lin*) + CFLAGS="$CFLAGS -DUNIV_LINUX";; + hpux10*) + CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX -DUNIV_HPUX10";; + hp*) + CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX";; + aix*) + CFLAGS="$CFLAGS -DUNIV_AIX";; + irix*|osf*|sysv5uw7*|openbsd*) + CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE";; + *solaris*|*SunOS*) + CFLAGS="$CFLAGS -DUNIV_SOLARIS";; + esac + INNODB_DYNAMIC_CFLAGS="-DMYSQL_DYNAMIC_PLUGIN" + case "$target_cpu" in + x86_64) + # The AMD64 ABI forbids absolute addresses in shared libraries + ;; + *86) + # Use absolute addresses on IA-32 + INNODB_DYNAMIC_CFLAGS="$INNODB_DYNAMIC_CFLAGS -prefer-non-pic" + ;; + esac + AC_SUBST(INNODB_DYNAMIC_CFLAGS) + AC_MSG_CHECKING(whether pthread_t can be used by GCC atomic builtins) + AC_TRY_RUN( + [ + #include <pthread.h> + + int main(int argc, char** argv) { + pthread_t x1; + pthread_t x2; + pthread_t x3; + + __sync_bool_compare_and_swap(&x1, x2, x3); + + return(0); + } + ], + [ + AC_DEFINE([HAVE_ATOMIC_PTHREAD_T], [1], + [pthread_t can be used by GCC atomic builtins]) + AC_MSG_RESULT(yes) + ], + [ + AC_MSG_RESULT(no) + ] + ) + ]) + +# vim: set ft=config: diff --git a/storage/xtradb/que/que0que.c b/storage/xtradb/que/que0que.c new file mode 100644 index 00000000000..91a9d30ec4c --- /dev/null +++ b/storage/xtradb/que/que0que.c @@ -0,0 +1,1461 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" + +#ifdef UNIV_NONINL +#include "que0que.ic" +#endif + +#include "srv0que.h" +#include "usr0sess.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "row0undo.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0purge.h" +#include "dict0crea.h" +#include "log0log.h" +#include "eval0proc.h" +#include "eval0eval.h" +#include "pars0types.h" + +#define QUE_PARALLELIZE_LIMIT (64 * 256 * 256 * 256) +#define QUE_ROUND_ROBIN_LIMIT (64 * 256 * 256 * 256) +#define QUE_MAX_LOOPS_WITHOUT_CHECK 16 + +#ifdef UNIV_DEBUG +/* If the following flag is set TRUE, the module will print trace info +of SQL execution in the UNIV_SQL_DEBUG version */ +UNIV_INTERN ibool que_trace_on = FALSE; +#endif /* UNIV_DEBUG */ + +/* Short introduction to query graphs + ================================== + +A query graph consists of nodes linked to each other in various ways. The +execution starts at que_run_threads() which takes a que_thr_t parameter. +que_thr_t contains two fields that control query graph execution: run_node +and prev_node. run_node is the next node to execute and prev_node is the +last node executed. + +Each node has a pointer to a 'next' statement, i.e., its brother, and a +pointer to its parent node. The next pointer is NULL in the last statement +of a block. + +Loop nodes contain a link to the first statement of the enclosed statement +list. While the loop runs, que_thr_step() checks whether execution arrived at +the loop node from its parent or from one of the statement nodes in the loop. If +it came from the parent of the loop node it starts executing the first +statement node in the loop. If it came from one of the statement nodes in +the loop, then it checks whether the statement node has another statement node +following it, and runs it if so. + +To signify loop ending, the loop statements (see e.g. while_step()) set +que_thr_t->run_node to the loop node's parent node. This is noticed on the +next call of que_thr_step() and execution proceeds to the node pointed to by +the loop node's 'next' pointer. + +For example, the code: + +X := 1; +WHILE X < 5 LOOP + X := X + 1; + X := X + 1; +X := 5 + +will result in the following node hierarchy, with the X-axis indicating +'next' links and the Y-axis indicating parent/child links: + +A - W - A + | + | + A - A + +A = assign_node_t, W = while_node_t. */ + +/* How is a stored procedure containing COMMIT or ROLLBACK commands +executed? + +The commit or rollback can be seen as a subprocedure call. +The problem is that if there are several query threads +currently running within the transaction, their actions could +interfere with the commit or rollback operation. Or, at the least, the +operation would be difficult to visualize and keep under control. + +Therefore the query thread requesting a commit or a rollback +sends the transaction a signal, which moves the transaction +to the TRX_QUE_SIGNALED state. All running query threads of the +transaction will eventually notice that the transaction is now in +this state and voluntarily suspend themselves.
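The traversal rules in the 'Short introduction to query graphs' above condense into a compilable toy (not InnoDB code; the node struct, the explicit x variable, and the top-level termination rule are simplifications of the real que_node_t machinery) that walks the A - W - A hierarchy with a run_node/prev_node pair:

#include <stdio.h>

struct node {
	const char*	name;
	struct node*	parent;	/* enclosing node, NULL at top level */
	struct node*	next;	/* 'brother' in the statement list */
	struct node*	child;	/* first enclosed statement (loop nodes) */
};

int main(void)
{
	/* Top level: A - W - A; the while node W encloses A - A. */
	struct node w  = {"WHILE X < 5", NULL, NULL, NULL};
	struct node a0 = {"X := 1",      NULL, &w,   NULL};
	struct node a3 = {"X := 5",      NULL, NULL, NULL};
	struct node a1 = {"X := X + 1",  &w,   NULL, NULL};
	struct node a2 = {"X := X + 1",  &w,   NULL, NULL};

	struct node*	run_node = &a0;
	struct node*	prev_node = NULL;
	int		x = 0;

	w.next = &a3;
	w.child = &a1;
	a1.next = &a2;

	while (run_node) {
		struct node* node = run_node;

		printf("run: %-12s prev: %-12s x = %d\n", node->name,
		       prev_node ? prev_node->name : "(start)", x);

		if (node == &w) {
			/* Loop node: enter the enclosed list while the
			condition holds, else pass control to 'next'. */
			run_node = (x < 5) ? w.child : w.next;
		} else {
			if (node == &a0) x = 1;
			if (node == &a1 || node == &a2) x = x + 1;
			if (node == &a3) x = 5;

			/* Statement node: run the next brother if there
			is one, else return control to the parent. */
			run_node = node->next ? node->next : node->parent;
		}

		prev_node = node;
	}

	return(0);
}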
Only the last +query thread which suspends itself will trigger handling of +the signal. + +When the transaction starts to handle a rollback or commit +signal, it builds a query graph which, when executed, will +roll back or commit the incomplete transaction. The transaction +is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state. +If specified, the SQL cursors opened by the transaction are closed. +When the execution of the graph completes, it is like returning +from a subprocedure: the query thread which requested the operation +starts running again. */ + +/************************************************************************** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction. +***NOTE***: This is the only function in which such a transition is allowed +to happen! */ +static +void +que_thr_move_to_run_state( +/*======================*/ + que_thr_t* thr); /* in: a query thread */ + +/*************************************************************************** +Adds a query graph to the session's list of graphs. */ +UNIV_INTERN +void +que_graph_publish( +/*==============*/ + que_t* graph, /* in: graph */ + sess_t* sess) /* in: session */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + UT_LIST_ADD_LAST(graphs, sess->graphs, graph); +} + +/*************************************************************************** +Creates a query graph fork node. */ +UNIV_INTERN +que_fork_t* +que_fork_create( +/*============*/ + /* out, own: fork node */ + que_t* graph, /* in: graph, if NULL then this + fork node is assumed to be the + graph root */ + que_node_t* parent, /* in: parent node */ + ulint fork_type, /* in: fork type */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + que_fork_t* fork; + + ut_ad(heap); + + fork = mem_heap_alloc(heap, sizeof(que_fork_t)); + + fork->common.type = QUE_NODE_FORK; + fork->n_active_thrs = 0; + + fork->state = QUE_FORK_COMMAND_WAIT; + + if (graph != NULL) { + fork->graph = graph; + } else { + fork->graph = fork; + } + + fork->common.parent = parent; + fork->fork_type = fork_type; + + fork->caller = NULL; + + UT_LIST_INIT(fork->thrs); + + fork->sym_tab = NULL; + fork->info = NULL; + + fork->heap = heap; + + return(fork); +} + +/*************************************************************************** +Creates a query graph thread node. */ +UNIV_INTERN +que_thr_t* +que_thr_create( +/*===========*/ + /* out, own: query thread node */ + que_fork_t* parent, /* in: parent node, i.e., a fork node */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + que_thr_t* thr; + + ut_ad(parent && heap); + + thr = mem_heap_alloc(heap, sizeof(que_thr_t)); + + thr->common.type = QUE_NODE_THR; + thr->common.parent = parent; + + thr->magic_n = QUE_THR_MAGIC_N; + + thr->graph = parent->graph; + + thr->state = QUE_THR_COMMAND_WAIT; + + thr->is_active = FALSE; + + thr->run_node = NULL; + thr->resource = 0; + thr->lock_state = QUE_THR_LOCK_NOLOCK; + + UT_LIST_ADD_LAST(thrs, parent->thrs, thr); + + return(thr); +} + +/************************************************************************** +Moves a suspended query thread to the QUE_THR_RUNNING state and may release +a single worker thread to execute it. This function should be used to end +the wait state of a query thread waiting for a lock or a stored procedure +completion.
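The suspension rule stated above (only the last query thread to suspend itself triggers handling of a pending signal) can be sketched standalone. This is a toy, not InnoDB code; the toy_trx struct and its counters are invented stand-ins for the corresponding trx_t fields:

#include <stdio.h>

struct toy_trx {
	int n_active_thrs;	/* running query threads */
	int n_signals;		/* pending COMMIT/ROLLBACK signals */
};

static void toy_handle_signal(struct toy_trx* trx)
{
	printf("handling signal: building commit/rollback graph\n");
	trx->n_signals--;
}

/* Called by each query thread when it notices the signaled state. */
static void toy_thr_suspend(struct toy_trx* trx, int thr_id)
{
	trx->n_active_thrs--;
	printf("thread %d suspended, %d still active\n",
	       thr_id, trx->n_active_thrs);

	/* Only the thread that drops the count to zero starts the
	signal handling; the others just park themselves. */
	if (trx->n_active_thrs == 0 && trx->n_signals > 0) {
		toy_handle_signal(trx);
	}
}

int main(void)
{
	struct toy_trx	trx = {3, 1};
	int		i;

	for (i = 1; i <= 3; i++) {
		toy_thr_suspend(&trx, i);
	}
	return(0);
}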
*/ +UNIV_INTERN +void +que_thr_end_wait( +/*=============*/ + que_thr_t* thr, /* in: query thread in the + QUE_THR_LOCK_WAIT, + or QUE_THR_PROCEDURE_WAIT, or + QUE_THR_SIG_REPLY_WAIT state */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if NULL is passed + as the parameter, it is ignored */ +{ + ibool was_active; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(thr); + ut_ad((thr->state == QUE_THR_LOCK_WAIT) + || (thr->state == QUE_THR_PROCEDURE_WAIT) + || (thr->state == QUE_THR_SIG_REPLY_WAIT)); + ut_ad(thr->run_node); + + thr->prev_node = thr->run_node; + + was_active = thr->is_active; + + que_thr_move_to_run_state(thr); + + if (was_active) { + + return; + } + + if (next_thr && *next_thr == NULL) { + *next_thr = thr; + } else { + ut_a(0); + srv_que_task_enqueue_low(thr); + } +} + +/************************************************************************** +Same as que_thr_end_wait, but without the next_thr parameter. */ +UNIV_INTERN +void +que_thr_end_wait_no_next_thr( +/*=========================*/ + que_thr_t* thr) /* in: query thread in the QUE_THR_LOCK_WAIT, + or QUE_THR_PROCEDURE_WAIT, or + QUE_THR_SIG_REPLY_WAIT state */ +{ + ibool was_active; + + ut_a(thr->state == QUE_THR_LOCK_WAIT); /* In MySQL this is the + only possible state here */ + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(thr); + ut_ad((thr->state == QUE_THR_LOCK_WAIT) + || (thr->state == QUE_THR_PROCEDURE_WAIT) + || (thr->state == QUE_THR_SIG_REPLY_WAIT)); + + was_active = thr->is_active; + + que_thr_move_to_run_state(thr); + + if (was_active) { + + return; + } + + /* In MySQL we let the OS thread (not just the query thread) wait + for the lock to be released: */ + + srv_release_mysql_thread_if_suspended(thr); + + /* srv_que_task_enqueue_low(thr); */ +} + +/************************************************************************** +Inits a query thread for a command. */ +UNIV_INLINE +void +que_thr_init_command( +/*=================*/ + que_thr_t* thr) /* in: query thread */ +{ + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + que_thr_move_to_run_state(thr); +} + +/************************************************************************** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. */ +UNIV_INTERN +que_thr_t* +que_fork_start_command( +/*===================*/ + /* out: a query thread of the graph moved to + QUE_THR_RUNNING state, or NULL; the query + thread should be executed by que_run_threads + by the caller */ + que_fork_t* fork) /* in: a query fork */ +{ + que_thr_t* thr; + que_thr_t* suspended_thr = NULL; + que_thr_t* completed_thr = NULL; + + fork->state = QUE_FORK_ACTIVE; + + fork->last_sel_node = NULL; + + suspended_thr = NULL; + completed_thr = NULL; + + /* Choose the query thread to run: usually there is just one thread, + but in a parallelized select, which necessarily is non-scrollable, + there may be several to choose from */ + + /* First we try to find a query thread in the QUE_THR_COMMAND_WAIT + state.
Then we try to find a query thread in the QUE_THR_SUSPENDED + state, finally we try to find a query thread in the QUE_THR_COMPLETED + state */ + + thr = UT_LIST_GET_FIRST(fork->thrs); + + /* We make a single pass over the thr list within which we note which + threads are ready to run. */ + while (thr) { + switch (thr->state) { + case QUE_THR_COMMAND_WAIT: + + /* We have to send the initial message to query thread + to start it */ + + que_thr_init_command(thr); + + return(thr); + + case QUE_THR_SUSPENDED: + /* In this case the execution of the thread was + suspended: no initial message is needed because + execution can continue from where it was left */ + if (!suspended_thr) { + suspended_thr = thr; + } + + break; + + case QUE_THR_COMPLETED: + if (!completed_thr) { + completed_thr = thr; + } + + break; + + case QUE_THR_LOCK_WAIT: + ut_error; + + } + + thr = UT_LIST_GET_NEXT(thrs, thr); + } + + if (suspended_thr) { + + thr = suspended_thr; + que_thr_move_to_run_state(thr); + + } else if (completed_thr) { + + thr = completed_thr; + que_thr_init_command(thr); + } + + return(thr); +} + +/************************************************************************** +After signal handling is finished, returns control to a query graph error +handling routine. (Currently, just returns the control to the root of the +graph so that the graph can communicate an error message to the client.) */ +UNIV_INTERN +void +que_fork_error_handle( +/*==================*/ + trx_t* trx __attribute__((unused)), /* in: trx */ + que_t* fork) /* in: query graph which was run before signal + handling started, NULL not allowed */ +{ + que_thr_t* thr; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->sess->state == SESS_ERROR); + ut_ad(UT_LIST_GET_LEN(trx->reply_signals) == 0); + ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + thr = UT_LIST_GET_FIRST(fork->thrs); + + while (thr != NULL) { + ut_ad(!thr->is_active); + ut_ad(thr->state != QUE_THR_SIG_REPLY_WAIT); + ut_ad(thr->state != QUE_THR_LOCK_WAIT); + + thr->run_node = thr; + thr->prev_node = thr->child; + thr->state = QUE_THR_COMPLETED; + + thr = UT_LIST_GET_NEXT(thrs, thr); + } + + thr = UT_LIST_GET_FIRST(fork->thrs); + + que_thr_move_to_run_state(thr); + + ut_a(0); + srv_que_task_enqueue_low(thr); +} + +/******************************************************************** +Tests if all the query threads in the same fork have a given state. */ +UNIV_INLINE +ibool +que_fork_all_thrs_in_state( +/*=======================*/ + /* out: TRUE if all the query threads in the + same fork were in the given state */ + que_fork_t* fork, /* in: query fork */ + ulint state) /* in: state */ +{ + que_thr_t* thr_node; + + thr_node = UT_LIST_GET_FIRST(fork->thrs); + + while (thr_node != NULL) { + if (thr_node->state != state) { + + return(FALSE); + } + + thr_node = UT_LIST_GET_NEXT(thrs, thr_node); + } + + return(TRUE); +} + +/************************************************************************** +Calls que_graph_free_recursive for statements in a statement list. */ +static +void +que_graph_free_stat_list( +/*=====================*/ + que_node_t* node) /* in: first query graph node in the list */ +{ + while (node) { + que_graph_free_recursive(node); + + node = que_node_get_next(node); + } +} + +/************************************************************************** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations, they are freed in que_graph_free. 
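The selection order used by que_fork_start_command() above can be restated as a small standalone function (a sketch, not InnoDB code; an array of states stands in for the linked thr list):

#include <stdio.h>

enum toy_state { COMMAND_WAIT, SUSPENDED, COMPLETED, LOCK_WAIT };

static int pick_thread(const enum toy_state* thrs, int n)
{
	int suspended = -1;
	int completed = -1;
	int i;

	for (i = 0; i < n; i++) {
		switch (thrs[i]) {
		case COMMAND_WAIT:
			return(i);	/* best match: take it at once */
		case SUSPENDED:
			if (suspended < 0) suspended = i;
			break;
		case COMPLETED:
			if (completed < 0) completed = i;
			break;
		case LOCK_WAIT:
			break;		/* que_fork_start_command()
					asserts this never happens */
		}
	}

	return(suspended >= 0 ? suspended : completed);	/* -1 if none */
}

int main(void)
{
	enum toy_state thrs[] = {COMPLETED, SUSPENDED, COMPLETED};

	printf("chosen thread: %d\n", pick_thread(thrs, 3)); /* prints 1 */
	return(0);
}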
*/ +UNIV_INTERN +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node) /* in: query graph node */ +{ + que_fork_t* fork; + que_thr_t* thr; + undo_node_t* undo; + sel_node_t* sel; + ins_node_t* ins; + upd_node_t* upd; + tab_node_t* cre_tab; + ind_node_t* cre_ind; + + if (node == NULL) { + + return; + } + + switch (que_node_get_type(node)) { + + case QUE_NODE_FORK: + fork = node; + + thr = UT_LIST_GET_FIRST(fork->thrs); + + while (thr) { + que_graph_free_recursive(thr); + + thr = UT_LIST_GET_NEXT(thrs, thr); + } + + break; + case QUE_NODE_THR: + + thr = node; + + if (thr->magic_n != QUE_THR_MAGIC_N) { + fprintf(stderr, + "que_thr struct appears corrupt;" + " magic n %lu\n", + (unsigned long) thr->magic_n); + mem_analyze_corruption(thr); + ut_error; + } + + thr->magic_n = QUE_THR_MAGIC_FREED; + + que_graph_free_recursive(thr->child); + + break; + case QUE_NODE_UNDO: + + undo = node; + + mem_heap_free(undo->heap); + + break; + case QUE_NODE_SELECT: + + sel = node; + + sel_node_free_private(sel); + + break; + case QUE_NODE_INSERT: + + ins = node; + + que_graph_free_recursive(ins->select); + + mem_heap_free(ins->entry_sys_heap); + + break; + case QUE_NODE_UPDATE: + + upd = node; + + if (upd->in_mysql_interface) { + + btr_pcur_free_for_mysql(upd->pcur); + } + + que_graph_free_recursive(upd->cascade_node); + + if (upd->cascade_heap) { + mem_heap_free(upd->cascade_heap); + } + + que_graph_free_recursive(upd->select); + + mem_heap_free(upd->heap); + + break; + case QUE_NODE_CREATE_TABLE: + cre_tab = node; + + que_graph_free_recursive(cre_tab->tab_def); + que_graph_free_recursive(cre_tab->col_def); + que_graph_free_recursive(cre_tab->commit_node); + + mem_heap_free(cre_tab->heap); + + break; + case QUE_NODE_CREATE_INDEX: + cre_ind = node; + + que_graph_free_recursive(cre_ind->ind_def); + que_graph_free_recursive(cre_ind->field_def); + que_graph_free_recursive(cre_ind->commit_node); + + mem_heap_free(cre_ind->heap); + + break; + case QUE_NODE_PROC: + que_graph_free_stat_list(((proc_node_t*)node)->stat_list); + + break; + case QUE_NODE_IF: + que_graph_free_stat_list(((if_node_t*)node)->stat_list); + que_graph_free_stat_list(((if_node_t*)node)->else_part); + que_graph_free_stat_list(((if_node_t*)node)->elsif_list); + + break; + case QUE_NODE_ELSIF: + que_graph_free_stat_list(((elsif_node_t*)node)->stat_list); + + break; + case QUE_NODE_WHILE: + que_graph_free_stat_list(((while_node_t*)node)->stat_list); + + break; + case QUE_NODE_FOR: + que_graph_free_stat_list(((for_node_t*)node)->stat_list); + + break; + + case QUE_NODE_ASSIGNMENT: + case QUE_NODE_EXIT: + case QUE_NODE_RETURN: + case QUE_NODE_COMMIT: + case QUE_NODE_ROLLBACK: + case QUE_NODE_LOCK: + case QUE_NODE_FUNC: + case QUE_NODE_ORDER: + case QUE_NODE_ROW_PRINTF: + case QUE_NODE_OPEN: + case QUE_NODE_FETCH: + /* No need to do anything */ + + break; + default: + fprintf(stderr, + "que_node struct appears corrupt; type %lu\n", + (unsigned long) que_node_get_type(node)); + mem_analyze_corruption(node); + ut_error; + } +} + +/************************************************************************** +Frees a query graph. */ +UNIV_INTERN +void +que_graph_free( +/*===========*/ + que_t* graph) /* in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ +{ + ut_ad(graph); + + if (graph->sym_tab) { + /* The following call frees dynamic memory allocated + for variables etc. during execution. 
It also frees explicit + cursor definitions. */ + + sym_tab_free_private(graph->sym_tab); + } + + if (graph->info && graph->info->graph_owns_us) { + pars_info_free(graph->info); + } + + que_graph_free_recursive(graph); + + mem_heap_free(graph->heap); +} + +/************************************************************************** +Checks if the query graph is in a state where it should be freed, and +frees it in that case. If the session is in a state where it should be +closed, that is done as well. */ +UNIV_INTERN +ibool +que_graph_try_free( +/*===============*/ + /* out: TRUE if freed */ + que_t* graph) /* in: query graph */ +{ + sess_t* sess; + + ut_ad(mutex_own(&kernel_mutex)); + + sess = (graph->trx)->sess; + + if ((graph->state == QUE_FORK_BEING_FREED) + && (graph->n_active_thrs == 0)) { + + UT_LIST_REMOVE(graphs, sess->graphs, graph); + que_graph_free(graph); + + sess_try_close(sess); + + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************** +Performs an execution step on a thr node. */ +static +que_thr_t* +que_thr_node_step( +/*==============*/ + /* out: query thread to run next, or NULL + if none */ + que_thr_t* thr) /* in: query thread where run_node must + be the thread node itself */ +{ + ut_ad(thr->run_node == thr); + + if (thr->prev_node == thr->common.parent) { + /* If control to the node came from above, it is just passed + on */ + + thr->run_node = thr->child; + + return(thr); + } + + mutex_enter(&kernel_mutex); + + if (que_thr_peek_stop(thr)) { + + mutex_exit(&kernel_mutex); + + return(thr); + } + + /* Thread execution completed */ + + thr->state = QUE_THR_COMPLETED; + + mutex_exit(&kernel_mutex); + + return(NULL); +} + +/************************************************************************** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction if thr was +not active. +***NOTE***: This and ..._mysql are the only functions in which such a +transition is allowed to happen! */ +static +void +que_thr_move_to_run_state( +/*======================*/ + que_thr_t* thr) /* in: a query thread */ +{ + trx_t* trx; + + ut_ad(thr->state != QUE_THR_RUNNING); + + trx = thr_get_trx(thr); + + if (!thr->is_active) { + + (thr->graph)->n_active_thrs++; + + trx->n_active_thrs++; + + thr->is_active = TRUE; + + ut_ad((thr->graph)->n_active_thrs == 1); + ut_ad(trx->n_active_thrs == 1); + } + + thr->state = QUE_THR_RUNNING; +} + +/************************************************************************** +Decrements the query thread reference counts in the query graph and the +transaction. May start signal handling, e.g., a rollback. +*** NOTE ***: +This and que_thr_stop_for_mysql are the only functions where the reference +count can be decremented and this function may only be called from inside +que_run_threads or que_thr_check_if_switch! These restrictions exist to make +the rollback code easier to maintain.
*/ +static +void +que_thr_dec_refer_count( +/*====================*/ + que_thr_t* thr, /* in: query thread */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + que_fork_t* fork; + trx_t* trx; + ulint fork_type; + ibool stopped; + + fork = thr->common.parent; + trx = thr_get_trx(thr); + + mutex_enter(&kernel_mutex); + + ut_a(thr->is_active); + + if (thr->state == QUE_THR_RUNNING) { + + stopped = que_thr_stop(thr); + + if (!stopped) { + /* The reason for the thr suspension or wait was + already canceled before we came here: continue + running the thread */ + + /* fputs("!!!!!!!! Wait already ended: continue thr\n", + stderr); */ + + if (next_thr && *next_thr == NULL) { + /* Normally srv_suspend_mysql_thread resets + the state to DB_SUCCESS before waiting, but + in this case we have to do it here, + otherwise nobody does it. */ + trx->error_state = DB_SUCCESS; + + *next_thr = thr; + } else { + ut_error; + srv_que_task_enqueue_low(thr); + } + + mutex_exit(&kernel_mutex); + + return; + } + } + + ut_ad(fork->n_active_thrs == 1); + ut_ad(trx->n_active_thrs == 1); + + fork->n_active_thrs--; + trx->n_active_thrs--; + + thr->is_active = FALSE; + + if (trx->n_active_thrs > 0) { + + mutex_exit(&kernel_mutex); + + return; + } + + fork_type = fork->fork_type; + + /* Check if all query threads in the same fork are completed */ + + if (que_fork_all_thrs_in_state(fork, QUE_THR_COMPLETED)) { + + switch (fork_type) { + case QUE_FORK_ROLLBACK: + /* This is really the undo graph used in rollback, + no roll_node in this graph */ + + ut_ad(UT_LIST_GET_LEN(trx->signals) > 0); + ut_ad(trx->handling_signals == TRUE); + + trx_finish_rollback_off_kernel(fork, trx, next_thr); + break; + + case QUE_FORK_PURGE: + case QUE_FORK_RECOVERY: + case QUE_FORK_MYSQL_INTERFACE: + + /* Do nothing */ + break; + + default: + ut_error; /* not used in MySQL */ + } + } + + if (UT_LIST_GET_LEN(trx->signals) > 0 && trx->n_active_thrs == 0) { + + /* If the trx is signaled and its query thread count drops to + zero, then we start processing a signal; from it we may get + a new query thread to run */ + + trx_sig_start_handle(trx, next_thr); + } + + if (trx->handling_signals && UT_LIST_GET_LEN(trx->signals) == 0) { + + trx_end_signal_handling(trx); + } + + mutex_exit(&kernel_mutex); +} + +/************************************************************************** +Stops a query thread if graph or trx is in a state requiring it. The +conditions are tested in the order (1) graph, (2) trx. The kernel mutex has +to be reserved. 
*/ +UNIV_INTERN +ibool +que_thr_stop( +/*=========*/ + /* out: TRUE if stopped */ + que_thr_t* thr) /* in: query thread */ +{ + trx_t* trx; + que_t* graph; + ibool ret = TRUE; + + ut_ad(mutex_own(&kernel_mutex)); + + graph = thr->graph; + trx = graph->trx; + + if (graph->state == QUE_FORK_COMMAND_WAIT) { + thr->state = QUE_THR_SUSPENDED; + + } else if (trx->que_state == TRX_QUE_LOCK_WAIT) { + + UT_LIST_ADD_FIRST(trx_thrs, trx->wait_thrs, thr); + thr->state = QUE_THR_LOCK_WAIT; + + } else if (trx->error_state != DB_SUCCESS + && trx->error_state != DB_LOCK_WAIT) { + + /* Error handling built for the MySQL interface */ + thr->state = QUE_THR_COMPLETED; + + } else if (UT_LIST_GET_LEN(trx->signals) > 0 + && graph->fork_type != QUE_FORK_ROLLBACK) { + + thr->state = QUE_THR_SUSPENDED; + } else { + ut_ad(graph->state == QUE_FORK_ACTIVE); + + ret = FALSE; + } + + return(ret); +} + +/************************************************************************** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The +query thread is stopped and made inactive, except in the case where +it was put into the lock wait state in lock0lock.c, but the lock has already +been granted or the transaction chosen as a victim in deadlock resolution. */ +UNIV_INTERN +void +que_thr_stop_for_mysql( +/*===================*/ + que_thr_t* thr) /* in: query thread */ +{ + trx_t* trx; + + trx = thr_get_trx(thr); + + mutex_enter(&kernel_mutex); + + if (thr->state == QUE_THR_RUNNING) { + + if (trx->error_state != DB_SUCCESS + && trx->error_state != DB_LOCK_WAIT) { + + /* Error handling built for the MySQL interface */ + thr->state = QUE_THR_COMPLETED; + } else { + /* It must have been a lock wait but the lock was + already released, or this transaction was chosen + as a victim in selective deadlock resolution */ + + mutex_exit(&kernel_mutex); + + return; + } + } + + ut_ad(thr->is_active == TRUE); + ut_ad(trx->n_active_thrs == 1); + ut_ad(thr->graph->n_active_thrs == 1); + + thr->is_active = FALSE; + (thr->graph)->n_active_thrs--; + + trx->n_active_thrs--; + + mutex_exit(&kernel_mutex); +} + +/************************************************************************** +Moves a thread from another state to the QUE_THR_RUNNING state. Increments +the n_active_thrs counters of the query graph and transaction if thr was +not active. */ +UNIV_INTERN +void +que_thr_move_to_run_state_for_mysql( +/*================================*/ + que_thr_t* thr, /* in: a query thread */ + trx_t* trx) /* in: transaction */ +{ + if (thr->magic_n != QUE_THR_MAGIC_N) { + fprintf(stderr, + "que_thr struct appears corrupt; magic n %lu\n", + (unsigned long) thr->magic_n); + + mem_analyze_corruption(thr); + + ut_error; + } + + if (!thr->is_active) { + + thr->graph->n_active_thrs++; + + trx->n_active_thrs++; + + thr->is_active = TRUE; + } + + thr->state = QUE_THR_RUNNING; +} + +/************************************************************************** +A patch for MySQL used to 'stop' a dummy query thread used in MySQL +select, when there is no error or lock wait.
*/ +UNIV_INTERN +void +que_thr_stop_for_mysql_no_error( +/*============================*/ + que_thr_t* thr, /* in: query thread */ + trx_t* trx) /* in: transaction */ +{ + ut_ad(thr->state == QUE_THR_RUNNING); + ut_ad(thr->is_active == TRUE); + ut_ad(trx->n_active_thrs == 1); + ut_ad(thr->graph->n_active_thrs == 1); + + if (thr->magic_n != QUE_THR_MAGIC_N) { + fprintf(stderr, + "que_thr struct appears corrupt; magic n %lu\n", + (unsigned long) thr->magic_n); + + mem_analyze_corruption(thr); + + ut_error; + } + + thr->state = QUE_THR_COMPLETED; + + thr->is_active = FALSE; + (thr->graph)->n_active_thrs--; + + trx->n_active_thrs--; +} + +/******************************************************************** +Get the first containing loop node (e.g. while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. */ +UNIV_INTERN +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + /* out: containing loop node, or NULL. */ + que_node_t* node) /* in: node */ +{ + ut_ad(node); + + for (;;) { + ulint type; + + node = que_node_get_parent(node); + + if (!node) { + break; + } + + type = que_node_get_type(node); + + if ((type == QUE_NODE_FOR) || (type == QUE_NODE_WHILE)) { + break; + } + } + + return(node); +} + +/************************************************************************** +Prints info of an SQL query graph node. */ +UNIV_INTERN +void +que_node_print_info( +/*================*/ + que_node_t* node) /* in: query graph node */ +{ + ulint type; + const char* str; + + type = que_node_get_type(node); + + if (type == QUE_NODE_SELECT) { + str = "SELECT"; + } else if (type == QUE_NODE_INSERT) { + str = "INSERT"; + } else if (type == QUE_NODE_UPDATE) { + str = "UPDATE"; + } else if (type == QUE_NODE_WHILE) { + str = "WHILE"; + } else if (type == QUE_NODE_ASSIGNMENT) { + str = "ASSIGNMENT"; + } else if (type == QUE_NODE_IF) { + str = "IF"; + } else if (type == QUE_NODE_FETCH) { + str = "FETCH"; + } else if (type == QUE_NODE_OPEN) { + str = "OPEN"; + } else if (type == QUE_NODE_PROC) { + str = "STORED PROCEDURE"; + } else if (type == QUE_NODE_FUNC) { + str = "FUNCTION"; + } else if (type == QUE_NODE_LOCK) { + str = "LOCK"; + } else if (type == QUE_NODE_THR) { + str = "QUERY THREAD"; + } else if (type == QUE_NODE_COMMIT) { + str = "COMMIT"; + } else if (type == QUE_NODE_UNDO) { + str = "UNDO ROW"; + } else if (type == QUE_NODE_PURGE) { + str = "PURGE ROW"; + } else if (type == QUE_NODE_ROLLBACK) { + str = "ROLLBACK"; + } else if (type == QUE_NODE_CREATE_TABLE) { + str = "CREATE TABLE"; + } else if (type == QUE_NODE_CREATE_INDEX) { + str = "CREATE INDEX"; + } else if (type == QUE_NODE_FOR) { + str = "FOR LOOP"; + } else if (type == QUE_NODE_RETURN) { + str = "RETURN"; + } else if (type == QUE_NODE_EXIT) { + str = "EXIT"; + } else { + str = "UNKNOWN NODE TYPE"; + } + + fprintf(stderr, "Node type %lu: %s, address %p\n", + (ulong) type, str, (void*) node); +} + +/************************************************************************** +Performs an execution step on a query thread. 
*/ +UNIV_INLINE +que_thr_t* +que_thr_step( +/*=========*/ + /* out: query thread to run next: it may + differ from the input parameter if, e.g., a + subprocedure call is made */ + que_thr_t* thr) /* in: query thread */ +{ + que_node_t* node; + que_thr_t* old_thr; + trx_t* trx; + ulint type; + + trx = thr_get_trx(thr); + + ut_ad(thr->state == QUE_THR_RUNNING); + ut_a(trx->error_state == DB_SUCCESS); + + thr->resource++; + + node = thr->run_node; + type = que_node_get_type(node); + + old_thr = thr; + +#ifdef UNIV_DEBUG + if (que_trace_on) { + fputs("To execute: ", stderr); + que_node_print_info(node); + } +#endif + if (type & QUE_NODE_CONTROL_STAT) { + if ((thr->prev_node != que_node_get_parent(node)) + && que_node_get_next(thr->prev_node)) { + + /* The control statements, like WHILE, always pass the + control to the next child statement if there is any + child left */ + + thr->run_node = que_node_get_next(thr->prev_node); + + } else if (type == QUE_NODE_IF) { + if_step(thr); + } else if (type == QUE_NODE_FOR) { + for_step(thr); + } else if (type == QUE_NODE_PROC) { + + /* We can access trx->undo_no without reserving + trx->undo_mutex, because there cannot be active query + threads doing updating or inserting at the moment! */ + + if (thr->prev_node == que_node_get_parent(node)) { + trx->last_sql_stat_start.least_undo_no + = trx->undo_no; + } + + proc_step(thr); + } else if (type == QUE_NODE_WHILE) { + while_step(thr); + } else { + ut_error; + } + } else if (type == QUE_NODE_ASSIGNMENT) { + assign_step(thr); + } else if (type == QUE_NODE_SELECT) { + thr = row_sel_step(thr); + } else if (type == QUE_NODE_INSERT) { + thr = row_ins_step(thr); + } else if (type == QUE_NODE_UPDATE) { + thr = row_upd_step(thr); + } else if (type == QUE_NODE_FETCH) { + thr = fetch_step(thr); + } else if (type == QUE_NODE_OPEN) { + thr = open_step(thr); + } else if (type == QUE_NODE_FUNC) { + proc_eval_step(thr); + + } else if (type == QUE_NODE_LOCK) { + + ut_error; + /* + thr = que_lock_step(thr); + */ + } else if (type == QUE_NODE_THR) { + thr = que_thr_node_step(thr); + } else if (type == QUE_NODE_COMMIT) { + thr = trx_commit_step(thr); + } else if (type == QUE_NODE_UNDO) { + thr = row_undo_step(thr); + } else if (type == QUE_NODE_PURGE) { + thr = row_purge_step(thr); + } else if (type == QUE_NODE_RETURN) { + thr = return_step(thr); + } else if (type == QUE_NODE_EXIT) { + thr = exit_step(thr); + } else if (type == QUE_NODE_ROLLBACK) { + thr = trx_rollback_step(thr); + } else if (type == QUE_NODE_CREATE_TABLE) { + thr = dict_create_table_step(thr); + } else if (type == QUE_NODE_CREATE_INDEX) { + thr = dict_create_index_step(thr); + } else if (type == QUE_NODE_ROW_PRINTF) { + thr = row_printf_step(thr); + } else { + ut_error; + } + + if (type == QUE_NODE_EXIT) { + old_thr->prev_node = que_node_get_containing_loop_node(node); + } else { + old_thr->prev_node = node; + } + + if (thr) { + ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS); + } + + return(thr); +} + +/************************************************************************** +Run a query thread until it finishes or encounters e.g. a lock wait. 
*/ +static +void +que_run_threads_low( +/*================*/ + que_thr_t* thr) /* in: query thread */ +{ + que_thr_t* next_thr; + ulint cumul_resource; + ulint loop_count; + + ut_ad(thr->state == QUE_THR_RUNNING); + ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS); + ut_ad(!mutex_own(&kernel_mutex)); + + /* cumul_resource counts how many resources the OS thread (NOT the + query thread) has spent in this function */ + + loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK; + cumul_resource = 0; +loop: + /* Check that there is enough space in the log to accommodate + possible log entries by this query step; if the operation can touch + more than about 4 pages, checks must also be made within the query + step! */ + + log_free_check(); + + /* Perform the actual query step: note that the query thread + may change if, e.g., a subprocedure call is made */ + + /*-------------------------*/ + next_thr = que_thr_step(thr); + /*-------------------------*/ + + ut_a(!next_thr || (thr_get_trx(next_thr)->error_state == DB_SUCCESS)); + + loop_count++; + + if (next_thr != thr) { + ut_a(next_thr == NULL); + + /* This can change next_thr to a non-NULL value if there was + a lock wait that already completed. */ + que_thr_dec_refer_count(thr, &next_thr); + + if (next_thr == NULL) { + + return; + } + + loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK; + + thr = next_thr; + } + + goto loop; +} + +/************************************************************************** +Run a query thread. Handles lock waits. */ +UNIV_INTERN +void +que_run_threads( +/*============*/ + que_thr_t* thr) /* in: query thread */ +{ +loop: + ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS); + que_run_threads_low(thr); + + mutex_enter(&kernel_mutex); + + switch (thr->state) { + + case QUE_THR_RUNNING: + /* There probably was a lock wait, but it already ended + before we came here: continue running thr */ + + mutex_exit(&kernel_mutex); + + goto loop; + + case QUE_THR_LOCK_WAIT: + mutex_exit(&kernel_mutex); + + /* The ..._mysql_... function also works for InnoDB's + internal threads. Let us wait until the lock wait ends. */ + + srv_suspend_mysql_thread(thr); + + if (thr_get_trx(thr)->error_state != DB_SUCCESS) { + /* thr was chosen as a deadlock victim or there was + a lock wait timeout */ + + que_thr_dec_refer_count(thr, NULL); + + return; + } + + goto loop; + + case QUE_THR_COMPLETED: + case QUE_THR_COMMAND_WAIT: + /* Do nothing */ + break; + + default: + ut_error; + } + + mutex_exit(&kernel_mutex); +} + +/************************************************************************* +Evaluate the given SQL. */ +UNIV_INTERN +ulint +que_eval_sql( +/*=========*/ + /* out: error code or DB_SUCCESS */ + pars_info_t* info, /* in: info struct, or NULL */ + const char* sql, /* in: SQL string */ + ibool reserve_dict_mutex, + /* in: if TRUE, acquire/release + dict_sys->mutex around call to pars_sql.
*/ + trx_t* trx) /* in: trx */ +{ + que_thr_t* thr; + que_t* graph; + + ut_a(trx->error_state == DB_SUCCESS); + + if (reserve_dict_mutex) { + mutex_enter(&dict_sys->mutex); + } + + graph = pars_sql(info, sql); + + if (reserve_dict_mutex) { + mutex_exit(&dict_sys->mutex); + } + + ut_a(graph); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + que_graph_free(graph); + + return(trx->error_state); +} diff --git a/storage/xtradb/read/read0read.c b/storage/xtradb/read/read0read.c new file mode 100644 index 00000000000..e3e5ee5d623 --- /dev/null +++ b/storage/xtradb/read/read0read.c @@ -0,0 +1,538 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#include "read0read.h" + +#ifdef UNIV_NONINL +#include "read0read.ic" +#endif + +#include "srv0srv.h" +#include "trx0sys.h" + +/* +------------------------------------------------------------------------------- +FACT A: Cursor read view on a secondary index sees only committed versions +------- +of the records in the secondary index, or those versions of rows created +by the transaction which created the cursor before the cursor was created, even +if the transaction which created the cursor has changed that clustered index page. + +PROOF: We must show that the read always goes to the clustered index record +to see whether the record is visible in the cursor read view. Consider e.g. the +following table and SQL clauses: + +create table t1(a int not null, b int, primary key(a), index(b)); +insert into t1 values (1,1),(2,2); +commit; + +Now consider that we have a cursor for the query + +select b from t1 where b >= 1; + +This query will use the secondary key on the table t1. Now, after the first fetch +on this cursor, if we do an update: + +update t1 set b = 5 where b = 2; + +Now the second fetch of the cursor should not see record (2,5); instead it should +see record (2,2). + +We should also show that if we execute delete t1 where b = 5; we can still +see record (2,2). + +When we access a secondary key record, the maximum transaction id is fetched +from this record and this trx_id is compared to up_limit_id in the view. +If the trx_id in the record is greater than or equal to up_limit_id in the view, +the clustered record is accessed. Because the trx_id of the creating +transaction was stored, when this view was created, in the list of +trx_ids not seen by this read view, a previous version of the +record is requested to be built. This is built using the clustered record.
+If the secondary key record is delete-marked, its corresponding +clustered record can already have been purged only if the record's +trx_id < low_limit_no. Purge can't remove any record deleted by a +transaction which was active when the cursor was created. We may thus still +have a deleted secondary key record but no clustered record. However, +this is not a problem, because this case is handled in the +row_sel_get_clust_rec() function, which is called +whenever we note that this read view does not see the trx_id in the +record. Thus, we see the correct version. Q. E. D. + +------------------------------------------------------------------------------- +FACT B: Cursor read view on a clustered index sees only committed versions +------- +of the records in the clustered index, or those versions of rows created +by the transaction which created the cursor before the cursor was created, even +if the transaction which created the cursor has changed that clustered index page. + +PROOF: Consider e.g. the following table and SQL clauses: + +create table t1(a int not null, b int, primary key(a)); +insert into t1 values (1),(2); +commit; + +Now consider that we have a cursor for the query + +select a from t1 where a >= 1; + +This query will use the clustered key on the table t1. Now, after the first fetch +on this cursor, if we do an update: + +update t1 set a = 5 where a = 2; + +Now the second fetch of the cursor should not see record (5); instead it should +see record (2). + +We should also show that if we execute delete t1 where a = 5; after +the cursor is opened, we can still see record (2). + +When accessing a clustered record, we always check whether this read view sees +the trx_id stored in the clustered record. By default we don't see any changes +if the record trx_id >= low_limit_id, i.e., the change was made by a transaction +which started after the transaction which created the cursor. If the row +was changed by such a future transaction, a previous version of the +clustered record is created. Thus we see only a committed version in +this case. We see all changes made by committed transactions, i.e., +record trx_id < up_limit_id. In this case we don't need to do anything, +we already see the correct version of the record. We don't see any changes +made by an active transaction except the creating transaction. We have stored +the trx_id of the creating transaction in the list of trx_ids when this view was +created. Thus we can easily see whether this record was changed by the +creating transaction. Because we already have the clustered record, we can +access its roll_ptr. Using this roll_ptr we can fetch the undo record. +We can now check that the undo_no of the undo record is less than the undo_no of the +transaction which created the view when the cursor was created. We see this +clustered record only when the record undo_no is less than the undo_no +in the view. If this is not true, we build a previous version of the record +based on the undo_rec. This record is found because purge can't remove +records accessed by an active transaction. Thus we see the correct version. Q. E. D. +------------------------------------------------------------------------------- +FACT C: Purge does not remove any delete-marked row that is visible +------- +to a cursor read view. + +TODO: prove this + +*/ + +/************************************************************************* +Creates a read view object.
*/ +UNIV_INLINE +read_view_t* +read_view_create_low( +/*=================*/ + /* out, own: read view struct */ + ulint n, /* in: number of cells in the trx_ids array */ + mem_heap_t* heap) /* in: memory heap from which allocated */ +{ + read_view_t* view; + + view = mem_heap_alloc(heap, sizeof(read_view_t)); + + view->n_trx_ids = n; + view->trx_ids = mem_heap_alloc(heap, n * sizeof(dulint)); + + return(view); +} + +/************************************************************************* +Makes a copy of the oldest existing read view, with the exception that also +the creating trx of the oldest view is set as not visible in the 'copied' +view. Opens a new view if no views currently exist. The view must be closed +with ..._close. This is used in purge. */ +UNIV_INTERN +read_view_t* +read_view_oldest_copy_or_open_new( +/*==============================*/ + /* out, own: read view struct */ + dulint cr_trx_id, /* in: trx_id of creating + transaction, or (0, 0) used in purge*/ + mem_heap_t* heap) /* in: memory heap from which + allocated */ +{ + read_view_t* old_view; + read_view_t* view_copy; + ibool needs_insert = TRUE; + ulint insert_done = 0; + ulint n; + ulint i; + + ut_ad(mutex_own(&kernel_mutex)); + + old_view = UT_LIST_GET_LAST(trx_sys->view_list); + + if (old_view == NULL) { + + return(read_view_open_now(cr_trx_id, heap)); + } + + n = old_view->n_trx_ids; + + if (!ut_dulint_is_zero(old_view->creator_trx_id)) { + n++; + } else { + needs_insert = FALSE; + } + + view_copy = read_view_create_low(n, heap); + + /* Insert the id of the creator in the right place of the descending + array of ids, if needs_insert is TRUE: */ + + i = 0; + while (i < n) { + if (needs_insert + && (i >= old_view->n_trx_ids + || ut_dulint_cmp(old_view->creator_trx_id, + read_view_get_nth_trx_id(old_view, i)) + > 0)) { + + read_view_set_nth_trx_id(view_copy, i, + old_view->creator_trx_id); + needs_insert = FALSE; + insert_done = 1; + } else { + read_view_set_nth_trx_id(view_copy, i, + read_view_get_nth_trx_id( + old_view, + i - insert_done)); + } + + i++; + } + + view_copy->creator_trx_id = cr_trx_id; + + view_copy->low_limit_no = old_view->low_limit_no; + view_copy->low_limit_id = old_view->low_limit_id; + + + if (n > 0) { + /* The last active transaction has the smallest id: */ + view_copy->up_limit_id = read_view_get_nth_trx_id( + view_copy, n - 1); + } else { + view_copy->up_limit_id = old_view->up_limit_id; + } + + UT_LIST_ADD_LAST(view_list, trx_sys->view_list, view_copy); + + return(view_copy); +} + +/************************************************************************* +Opens a read view where exactly the transactions serialized before this +point in time are seen in the view. 
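The copy step of read_view_oldest_copy_or_open_new() above, i.e., inserting the creator's id into its place in the descending id array while copying, can be exercised standalone (a sketch, not InnoDB code; plain unsigned long stands in for dulint and 0 for the (0, 0) purge id):

#include <stdio.h>

static void copy_with_insert(const unsigned long* old_ids, int old_n,
			     unsigned long creator_id, unsigned long* out)
{
	int	needs_insert = (creator_id != 0);
	int	insert_done = 0;
	int	n = old_n + (needs_insert ? 1 : 0);
	int	i;

	for (i = 0; i < n; i++) {
		if (needs_insert
		    && (i >= old_n || creator_id > old_ids[i])) {
			/* The creator's id belongs here in the
			descending order. */
			out[i] = creator_id;
			needs_insert = 0;
			insert_done = 1;
		} else {
			/* After the insertion point the source index
			lags the destination index by one. */
			out[i] = old_ids[i - insert_done];
		}
	}
}

int main(void)
{
	unsigned long	old_ids[] = {90, 70, 50};	/* descending */
	unsigned long	out[4];
	int		i;

	copy_with_insert(old_ids, 3, 80, out);

	for (i = 0; i < 4; i++) {
		printf("%lu ", out[i]);			/* 90 80 70 50 */
	}
	printf("\n");
	return(0);
}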
*/ +UNIV_INTERN +read_view_t* +read_view_open_now( +/*===============*/ + /* out, own: read view struct */ + dulint cr_trx_id, /* in: trx_id of creating + transaction, or (0, 0) used in + purge */ + mem_heap_t* heap) /* in: memory heap from which + allocated */ +{ + read_view_t* view; + trx_t* trx; + ulint n; + + ut_ad(mutex_own(&kernel_mutex)); + + view = read_view_create_low(UT_LIST_GET_LEN(trx_sys->trx_list), heap); + + view->creator_trx_id = cr_trx_id; + view->type = VIEW_NORMAL; + view->undo_no = ut_dulint_zero; + + /* No future transactions should be visible in the view */ + + view->low_limit_no = trx_sys->max_trx_id; + view->low_limit_id = view->low_limit_no; + + n = 0; + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + /* No active transaction should be visible, except cr_trx */ + + while (trx) { + if (ut_dulint_cmp(trx->id, cr_trx_id) != 0 + && (trx->conc_state == TRX_ACTIVE + || trx->conc_state == TRX_PREPARED)) { + + read_view_set_nth_trx_id(view, n, trx->id); + + n++; + + /* NOTE that a transaction whose trx number is < + trx_sys->max_trx_id can still be active, if it is + in the middle of its commit! Note that when a + transaction starts, we initialize trx->no to + ut_dulint_max. */ + + if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) { + + view->low_limit_no = trx->no; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + view->n_trx_ids = n; + + if (n > 0) { + /* The last active transaction has the smallest id: */ + view->up_limit_id = read_view_get_nth_trx_id(view, n - 1); + } else { + view->up_limit_id = view->low_limit_id; + } + + + UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view); + + return(view); +} + +/************************************************************************* +Closes a read view. */ +UNIV_INTERN +void +read_view_close( +/*============*/ + read_view_t* view) /* in: read view */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + UT_LIST_REMOVE(view_list, trx_sys->view_list, view); +} + +/************************************************************************* +Closes a consistent read view for MySQL. This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ +UNIV_INTERN +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx) /* in: trx which has a read view */ +{ + ut_a(trx->global_read_view); + + mutex_enter(&kernel_mutex); + + read_view_close(trx->global_read_view); + + mem_heap_empty(trx->global_read_view_heap); + + trx->read_view = NULL; + trx->global_read_view = NULL; + + mutex_exit(&kernel_mutex); +} + +/************************************************************************* +Prints a read view to stderr. 
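The limits set up by read_view_open_now() above imply the following visibility test. This is a standalone sketch, not InnoDB code; it is assumed to mirror read_view_sees_trx_id() in read0read.ic, with unsigned long standing in for dulint:

#include <stdio.h>

struct toy_view {
	unsigned long		low_limit_id;	/* no trx >= this is seen */
	unsigned long		up_limit_id;	/* every trx < this is seen */
	const unsigned long*	trx_ids;	/* active ids, descending */
	int			n_trx_ids;
};

static int toy_view_sees(const struct toy_view* v, unsigned long trx_id)
{
	int i;

	if (trx_id < v->up_limit_id) {
		return(1);	/* committed before the view was opened */
	}
	if (trx_id >= v->low_limit_id) {
		return(0);	/* started after the view was opened */
	}
	for (i = 0; i < v->n_trx_ids; i++) {
		if (v->trx_ids[i] == trx_id) {
			return(0);	/* was active: changes invisible */
		}
	}
	return(1);	/* committed while other trxs were active */
}

int main(void)
{
	const unsigned long	active[] = {90, 70};	/* descending */
	struct toy_view		v = {100, 70, active, 2};

	printf("%d %d %d %d\n",
	       toy_view_sees(&v, 60),	/* 1: old committed trx */
	       toy_view_sees(&v, 70),	/* 0: still active */
	       toy_view_sees(&v, 80),	/* 1: committed in between */
	       toy_view_sees(&v, 100));	/* 0: future trx */
	return(0);
}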
*/ +UNIV_INTERN +void +read_view_print( +/*============*/ + read_view_t* view) /* in: read view */ +{ + ulint n_ids; + ulint i; + + if (view->type == VIEW_HIGH_GRANULARITY) { + fprintf(stderr, + "High-granularity read view undo_n:o %lu %lu\n", + (ulong) ut_dulint_get_high(view->undo_no), + (ulong) ut_dulint_get_low(view->undo_no)); + } else { + fprintf(stderr, "Normal read view\n"); + } + + fprintf(stderr, "Read view low limit trx n:o %lu %lu\n", + (ulong) ut_dulint_get_high(view->low_limit_no), + (ulong) ut_dulint_get_low(view->low_limit_no)); + + fprintf(stderr, "Read view up limit trx id " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF(view->up_limit_id)); + + fprintf(stderr, "Read view low limit trx id " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF(view->low_limit_id)); + + fprintf(stderr, "Read view individually stored trx ids:\n"); + + n_ids = view->n_trx_ids; + + for (i = 0; i < n_ids; i++) { + fprintf(stderr, "Read view trx id " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF( + read_view_get_nth_trx_id(view, i))); + } +} + +/************************************************************************* +Create a high-granularity consistent cursor view for mysql to be used +in cursors. In this consistent read view modifications done by the +creating transaction after the cursor is created or future transactions +are not visible. */ +UNIV_INTERN +cursor_view_t* +read_cursor_view_create_for_mysql( +/*==============================*/ + trx_t* cr_trx) /* in: trx where cursor view is created */ +{ + cursor_view_t* curview; + read_view_t* view; + mem_heap_t* heap; + trx_t* trx; + ulint n; + + ut_a(cr_trx); + + /* Use larger heap than in trx_create when creating a read_view + because cursors are quite long. */ + + heap = mem_heap_create(512); + + curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(cursor_view_t)); + curview->heap = heap; + + /* Drop cursor tables from consideration when evaluating the need of + auto-commit */ + curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use; + cr_trx->n_mysql_tables_in_use = 0; + + mutex_enter(&kernel_mutex); + + curview->read_view = read_view_create_low( + UT_LIST_GET_LEN(trx_sys->trx_list), curview->heap); + + view = curview->read_view; + view->creator_trx_id = cr_trx->id; + view->type = VIEW_HIGH_GRANULARITY; + view->undo_no = cr_trx->undo_no; + + /* No future transactions should be visible in the view */ + + view->low_limit_no = trx_sys->max_trx_id; + view->low_limit_id = view->low_limit_no; + + n = 0; + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + /* No active transaction should be visible */ + + while (trx) { + + if (trx->conc_state == TRX_ACTIVE + || trx->conc_state == TRX_PREPARED) { + + read_view_set_nth_trx_id(view, n, trx->id); + + n++; + + /* NOTE that a transaction whose trx number is < + trx_sys->max_trx_id can still be active, if it is + in the middle of its commit! Note that when a + transaction starts, we initialize trx->no to + ut_dulint_max. 
*/ + + if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) { + + view->low_limit_no = trx->no; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + view->n_trx_ids = n; + + if (n > 0) { + /* The last active transaction has the smallest id: */ + view->up_limit_id = read_view_get_nth_trx_id(view, n - 1); + } else { + view->up_limit_id = view->low_limit_id; + } + + UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view); + + mutex_exit(&kernel_mutex); + + return(curview); +} + +/************************************************************************* +Close a given consistent cursor view for mysql and restore global read view +back to a transaction read view. */ +UNIV_INTERN +void +read_cursor_view_close_for_mysql( +/*=============================*/ + trx_t* trx, /* in: trx */ + cursor_view_t* curview)/* in: cursor view to be closed */ +{ + ut_a(curview); + ut_a(curview->read_view); + ut_a(curview->heap); + + /* Add cursor's tables to the global count of active tables that + belong to this transaction */ + trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use; + + mutex_enter(&kernel_mutex); + + read_view_close(curview->read_view); + trx->read_view = trx->global_read_view; + + mutex_exit(&kernel_mutex); + + mem_heap_free(curview->heap); +} + +/************************************************************************* +This function sets a given consistent cursor view to a transaction +read view if given consistent cursor view is not NULL. Otherwise, function +restores a global read view to a transaction read view. */ +UNIV_INTERN +void +read_cursor_set_for_mysql( +/*======================*/ + trx_t* trx, /* in: transaction where cursor is set */ + cursor_view_t* curview)/* in: consistent cursor view to be set */ +{ + ut_a(trx); + + mutex_enter(&kernel_mutex); + + if (UNIV_LIKELY(curview != NULL)) { + trx->read_view = curview->read_view; + } else { + trx->read_view = trx->global_read_view; + } + + mutex_exit(&kernel_mutex); +} diff --git a/storage/xtradb/rem/rem0cmp.c b/storage/xtradb/rem/rem0cmp.c new file mode 100644 index 00000000000..39fcb6f19dd --- /dev/null +++ b/storage/xtradb/rem/rem0cmp.c @@ -0,0 +1,1234 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/*********************************************************************** +Comparison services for records + +Created 7/1/1994 Heikki Tuuri +************************************************************************/ + +#include "rem0cmp.h" + +#ifdef UNIV_NONINL +#include "rem0cmp.ic" +#endif + +#include "srv0srv.h" + +/* ALPHABETICAL ORDER + ================== + +The records are put into alphabetical order in the following +way: let F be the first field where two records disagree. 
+If there is a character in some position n where the +records disagree, the order is determined by comparison of +the characters at position n, possibly after +collating transformation. If there is no such character, +but the corresponding fields have different lengths, then, +if the data type of the fields is paddable, the +shorter field is padded with a padding character. If the +data type is not paddable, the longer field is considered greater. +Finally, the SQL null is bigger than any other value. + +At present, the comparison functions return 0 in the case +where two records disagree only in that one +has more fields than the other. */ + +#ifdef UNIV_DEBUG +/***************************************************************** +Used in debug checking of cmp_dtuple_... . +This function is used to compare a data tuple to a physical record. If +dtuple has n fields then rec must have either m >= n fields, or it must +differ from dtuple in some of the m fields rec has. */ +static +int +cmp_debug_dtuple_rec_with_match( +/*============================*/ + /* out: 1, 0, -1, if dtuple is greater, equal, + less than rec, respectively, when only the + common first fields are compared */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint* matched_fields);/* in/out: number of already + completely matched fields; when function + returns, contains the value for current + comparison */ +#endif /* UNIV_DEBUG */ +#ifndef UNIV_HOTBACKUP +/***************************************************************** +This function is used to compare two data fields for which the data type +is such that we must use MySQL code to compare them. The prototype here +must be a copy of the +one in ha_innobase.cc! */ +extern +int +innobase_mysql_cmp( +/*===============*/ + /* out: 1, 0, -1, if a is greater, + equal, less than b, respectively */ + int mysql_type, /* in: MySQL type */ + uint charset_number, /* in: number of the charset */ + const unsigned char* a, /* in: data field */ + unsigned int a_length, /* in: data field length, + not UNIV_SQL_NULL */ + const unsigned char* b, /* in: data field */ + unsigned int b_length); /* in: data field length, + not UNIV_SQL_NULL */ +#endif /* !UNIV_HOTBACKUP */ +/************************************************************************* +Transforms the character code so that it is ordered appropriately for the +language. This is only used for the latin1 char set. MySQL does the +comparisons for other char sets. */ +UNIV_INLINE +ulint +cmp_collate( +/*========*/ + /* out: collation order position */ + ulint code) /* in: code of a character stored in database record */ +{ + return((ulint) srv_latin1_ordering[code]); +} + +/***************************************************************** +Returns TRUE if two columns are equal for comparison purposes.
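The padding rule described above can be demonstrated standalone (a sketch, not InnoDB code; it fixes the pad character to ' ' and omits the latin1 collating transformation that cmp_collate() would apply):

#include <stdio.h>

#define PAD_CHAR ' '	/* pad character for a paddable type */

static int cmp_padded(const unsigned char* a, size_t a_len,
		      const unsigned char* b, size_t b_len)
{
	size_t	n = (a_len > b_len) ? a_len : b_len;
	size_t	i;

	for (i = 0; i < n; i++) {
		/* Read past the end of the shorter field as if it
		were padded out with PAD_CHAR. */
		unsigned int ca = (i < a_len) ? a[i] : PAD_CHAR;
		unsigned int cb = (i < b_len) ? b[i] : PAD_CHAR;

		if (ca != cb) {
			return(ca < cb ? -1 : 1);
		}
	}
	return(0);
}

int main(void)
{
	printf("%d\n", cmp_padded((const unsigned char*) "abc", 3,
				  (const unsigned char*) "abc  ", 5)); /* 0 */
	printf("%d\n", cmp_padded((const unsigned char*) "abc", 3,
				  (const unsigned char*) "abcd", 4)); /* -1 */
	return(0);
}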
*/ +UNIV_INTERN +ibool +cmp_cols_are_equal( +/*===============*/ + /* out: TRUE if the columns are + considered equal in comparisons */ + const dict_col_t* col1, /* in: column 1 */ + const dict_col_t* col2, /* in: column 2 */ + ibool check_charsets) + /* in: whether to check charsets */ +{ + if (dtype_is_non_binary_string_type(col1->mtype, col1->prtype) + && dtype_is_non_binary_string_type(col2->mtype, col2->prtype)) { + + /* Both are non-binary string types: they can be compared if + and only if the charset-collation is the same */ + + if (check_charsets) { + return(dtype_get_charset_coll(col1->prtype) + == dtype_get_charset_coll(col2->prtype)); + } else { + return(TRUE); + } + } + + if (dtype_is_binary_string_type(col1->mtype, col1->prtype) + && dtype_is_binary_string_type(col2->mtype, col2->prtype)) { + + /* Both are binary string types: they can be compared */ + + return(TRUE); + } + + if (col1->mtype != col2->mtype) { + + return(FALSE); + } + + if (col1->mtype == DATA_INT + && (col1->prtype & DATA_UNSIGNED) + != (col2->prtype & DATA_UNSIGNED)) { + + /* The storage format of an unsigned integer is different + from a signed integer: in a signed integer we OR + 0x8000... to the value of positive integers. */ + + return(FALSE); + } + + return(col1->mtype != DATA_INT || col1->len == col2->len); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************** +Innobase uses this function to compare two data fields for which the data type +is such that we must compare whole fields or call MySQL to do the comparison */ +static +int +cmp_whole_field( +/*============*/ + /* out: 1, 0, -1, if a is greater, + equal, less than b, respectively */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + const byte* a, /* in: data field */ + unsigned int a_length, /* in: data field length, + not UNIV_SQL_NULL */ + const byte* b, /* in: data field */ + unsigned int b_length) /* in: data field length, + not UNIV_SQL_NULL */ +{ + float f_1; + float f_2; + double d_1; + double d_2; + int swap_flag = 1; + + switch (mtype) { + + case DATA_DECIMAL: + /* Remove preceding spaces */ + for (; a_length && *a == ' '; a++, a_length--); + for (; b_length && *b == ' '; b++, b_length--); + + if (*a == '-') { + if (*b != '-') { + return(-1); + } + + a++; b++; + a_length--; + b_length--; + + swap_flag = -1; + + } else if (*b == '-') { + + return(1); + } + + while (a_length > 0 && (*a == '+' || *a == '0')) { + a++; a_length--; + } + + while (b_length > 0 && (*b == '+' || *b == '0')) { + b++; b_length--; + } + + if (a_length != b_length) { + if (a_length < b_length) { + return(-swap_flag); + } + + return(swap_flag); + } + + while (a_length > 0 && *a == *b) { + + a++; b++; a_length--; + } + + if (a_length == 0) { + + return(0); + } + + if (*a > *b) { + return(swap_flag); + } + + return(-swap_flag); + case DATA_DOUBLE: + d_1 = mach_double_read(a); + d_2 = mach_double_read(b); + + if (d_1 > d_2) { + return(1); + } else if (d_2 > d_1) { + return(-1); + } + + return(0); + + case DATA_FLOAT: + f_1 = mach_float_read(a); + f_2 = mach_float_read(b); + + if (f_1 > f_2) { + return(1); + } else if (f_2 > f_1) { + return(-1); + } + + return(0); + case DATA_BLOB: + if (prtype & DATA_BINARY_TYPE) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: comparing a binary BLOB" + " with a character set sensitive\n" + "InnoDB: comparison!\n"); + } + /* fall through */ + case DATA_VARMYSQL: + case DATA_MYSQL: + return(innobase_mysql_cmp( + (int)(prtype & 
DATA_MYSQL_TYPE_MASK),
+			(uint)dtype_get_charset_coll(prtype),
+			a, a_length, b, b_length));
+	default:
+		fprintf(stderr,
+			"InnoDB: unknown type number %lu\n",
+			(ulong) mtype);
+		ut_error;
+	}
+
+	return(0);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+				/* out: 1, 0, -1, if data1 is greater, equal,
+				less than data2, respectively */
+	ulint		mtype,	/* in: main type */
+	ulint		prtype,	/* in: precise type */
+	const byte*	data1,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/* in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/* in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2)	/* in: data field length or UNIV_SQL_NULL */
+{
+#ifndef UNIV_HOTBACKUP
+	ulint	data1_byte;
+	ulint	data2_byte;
+	ulint	cur_bytes;
+
+	if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) {
+
+		if (len1 == len2) {
+
+			return(0);
+		}
+
+		if (len1 == UNIV_SQL_NULL) {
+			/* We define the SQL null to be the smallest possible
+			value of a field in the alphabetical order */
+
+			return(-1);
+		}
+
+		return(1);
+	}
+
+	if (mtype >= DATA_FLOAT
+	    || (mtype == DATA_BLOB
+		&& 0 == (prtype & DATA_BINARY_TYPE)
+		&& dtype_get_charset_coll(prtype)
+		!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+		return(cmp_whole_field(mtype, prtype,
+				       data1, (unsigned) len1,
+				       data2, (unsigned) len2));
+	}
+
+	/* Compare then the fields */
+
+	cur_bytes = 0;
+
+	for (;;) {
+		if (len1 <= cur_bytes) {
+			if (len2 <= cur_bytes) {
+
+				return(0);
+			}
+
+			data1_byte = dtype_get_pad_char(mtype, prtype);
+
+			if (data1_byte == ULINT_UNDEFINED) {
+
+				return(-1);
+			}
+		} else {
+			data1_byte = *data1;
+		}
+
+		if (len2 <= cur_bytes) {
+			data2_byte = dtype_get_pad_char(mtype, prtype);
+
+			if (data2_byte == ULINT_UNDEFINED) {
+
+				return(1);
+			}
+		} else {
+			data2_byte = *data2;
+		}
+
+		if (data1_byte == data2_byte) {
+			/* If the bytes are equal, they will remain such even
+			after the collation transformation below */
+
+			goto next_byte;
+		}
+
+		if (mtype <= DATA_CHAR
+		    || (mtype == DATA_BLOB
+			&& 0 == (prtype & DATA_BINARY_TYPE))) {
+
+			data1_byte = cmp_collate(data1_byte);
+			data2_byte = cmp_collate(data2_byte);
+		}
+
+		if (data1_byte > data2_byte) {
+
+			return(1);
+		} else if (data1_byte < data2_byte) {
+
+			return(-1);
+		}
+next_byte:
+		/* Next byte */
+		cur_bytes++;
+		data1++;
+		data2++;
+	}
+#else /* !UNIV_HOTBACKUP */
+	/* This function depends on MySQL code that is not included in
+	InnoDB Hot Backup builds. Besides, this function should never
+	be called in InnoDB Hot Backup. */
+	ut_error;
+#endif /* !UNIV_HOTBACKUP */
+
+	return(0);	/* Not reached */
+}
+
+/*****************************************************************
+This function is used to compare a data tuple to a physical record.
+Only the first dtuple->n_fields_cmp fields of the data tuple are
+taken into account! If we denote by n = n_fields_cmp, then rec must
+have either m >= n fields, or it must differ from dtuple in some of
+the m fields rec has. If rec has an externally stored field we do not
+compare it but return with value 0 if such a comparison should be
+made.
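+
+A minimal usage sketch (hypothetical caller): the two counters let a
+caller resume the comparison where an earlier call stopped, e.g.
+
+	ulint	matched_fields = 0;
+	ulint	matched_bytes = 0;
+	int	cmp = cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+						&matched_fields,
+						&matched_bytes);
+
+After the call the counters describe the longest matched prefix and
+can be passed unchanged to a later call on another record.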
*/
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match(
+/*======================*/
+				/* out: 1, 0, -1, if dtuple is greater, equal,
+				less than rec, respectively, when only the
+				common first fields are compared, or
+				until the first externally stored field in
+				rec */
+	const dtuple_t*	dtuple,	/* in: data tuple */
+	const rec_t*	rec,	/* in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
+	ulint*		matched_fields, /* in/out: number of already completely
+				matched fields; when function returns,
+				contains the value for current comparison */
+	ulint*		matched_bytes) /* in/out: number of already matched
+				bytes within the first field not completely
+				matched; when function returns, contains the
+				value for current comparison */
+{
+#ifndef UNIV_HOTBACKUP
+	const dfield_t*	dtuple_field;	/* current field in logical record */
+	ulint		dtuple_f_len;	/* the length of the current field
+					in the logical record */
+	const byte*	dtuple_b_ptr;	/* pointer to the current byte in
+					logical field data */
+	ulint		dtuple_byte;	/* value of current byte to be compared
+					in dtuple */
+	ulint		rec_f_len;	/* length of current field in rec */
+	const byte*	rec_b_ptr;	/* pointer to the current byte in
+					rec field */
+	ulint		rec_byte;	/* value of current byte to be
+					compared in rec */
+	ulint		cur_field;	/* current field number */
+	ulint		cur_bytes;	/* number of already matched bytes
+					in current field */
+	int		ret = 3333;	/* return value */
+
+	ut_ad(dtuple && rec && matched_fields && matched_bytes);
+	ut_ad(dtuple_check_typed(dtuple));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	cur_field = *matched_fields;
+	cur_bytes = *matched_bytes;
+
+	ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple));
+	ut_ad(cur_field <= rec_offs_n_fields(offsets));
+
+	if (cur_bytes == 0 && cur_field == 0) {
+		ulint	rec_info = rec_get_info_bits(rec,
+						     rec_offs_comp(offsets));
+		ulint	tup_info = dtuple_get_info_bits(dtuple);
+
+		if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {
+			ret = !(tup_info & REC_INFO_MIN_REC_FLAG);
+			goto order_resolved;
+		} else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {
+			ret = -1;
+			goto order_resolved;
+		}
+	}
+
+	/* Match fields in a loop; stop if we run out of fields in dtuple
+	or find an externally stored field */
+
+	while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+
+		ulint	mtype;
+		ulint	prtype;
+
+		dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+		{
+			const dtype_t*	type
+				= dfield_get_type(dtuple_field);
+
+			mtype = type->mtype;
+			prtype = type->prtype;
+		}
+
+		dtuple_f_len = dfield_get_len(dtuple_field);
+
+		rec_b_ptr = rec_get_nth_field(rec, offsets,
+					      cur_field, &rec_f_len);
+
+		/* If we have matched 0 bytes so far, it may be that one or
+		both the fields are SQL null, or the record or dtuple may be
+		the predefined minimum record, or the field is externally
+		stored */
+
+		if (UNIV_LIKELY(cur_bytes == 0)) {
+			if (rec_offs_nth_extern(offsets, cur_field)) {
+				/* We do not compare to an externally
+				stored field */
+
+				ret = 0;
+
+				goto order_resolved;
+			}
+
+			if (dtuple_f_len == UNIV_SQL_NULL) {
+				if (rec_f_len == UNIV_SQL_NULL) {
+
+					goto next_field;
+				}
+
+				ret = -1;
+				goto order_resolved;
+			} else if (rec_f_len == UNIV_SQL_NULL) {
+				/* We define the SQL null to be the
+				smallest possible value of a field
+				in the alphabetical order */
+
+				ret = 1;
+				goto order_resolved;
+			}
+		}
+
+		if (mtype >= DATA_FLOAT
+		    || (mtype
== DATA_BLOB + && 0 == (prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + + ret = cmp_whole_field(mtype, prtype, + dfield_get_data(dtuple_field), + (unsigned) dtuple_f_len, + rec_b_ptr, (unsigned) rec_f_len); + + if (ret != 0) { + cur_bytes = 0; + + goto order_resolved; + } else { + goto next_field; + } + } + + /* Set the pointers at the current byte */ + + rec_b_ptr = rec_b_ptr + cur_bytes; + dtuple_b_ptr = (byte*)dfield_get_data(dtuple_field) + + cur_bytes; + /* Compare then the fields */ + + for (;;) { + if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) { + if (dtuple_f_len <= cur_bytes) { + + goto next_field; + } + + rec_byte = dtype_get_pad_char(mtype, prtype); + + if (rec_byte == ULINT_UNDEFINED) { + ret = 1; + + goto order_resolved; + } + } else { + rec_byte = *rec_b_ptr; + } + + if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) { + dtuple_byte = dtype_get_pad_char(mtype, + prtype); + + if (dtuple_byte == ULINT_UNDEFINED) { + ret = -1; + + goto order_resolved; + } + } else { + dtuple_byte = *dtuple_b_ptr; + } + + if (dtuple_byte == rec_byte) { + /* If the bytes are equal, they will + remain such even after the collation + transformation below */ + + goto next_byte; + } + + if (mtype <= DATA_CHAR + || (mtype == DATA_BLOB + && !(prtype & DATA_BINARY_TYPE))) { + + rec_byte = cmp_collate(rec_byte); + dtuple_byte = cmp_collate(dtuple_byte); + } + + ret = (int) (dtuple_byte - rec_byte); + if (UNIV_LIKELY(ret)) { + if (ret < 0) { + ret = -1; + goto order_resolved; + } else { + ret = 1; + goto order_resolved; + } + } +next_byte: + /* Next byte */ + cur_bytes++; + rec_b_ptr++; + dtuple_b_ptr++; + } + +next_field: + cur_field++; + cur_bytes = 0; + } + + ut_ad(cur_bytes == 0); + + ret = 0; /* If we ran out of fields, dtuple was equal to rec + up to the common fields */ +order_resolved: + ut_ad((ret >= - 1) && (ret <= 1)); + ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets, + matched_fields)); + ut_ad(*matched_fields == cur_field); /* In the debug version, the + above cmp_debug_... sets + *matched_fields to a value */ + *matched_fields = cur_field; + *matched_bytes = cur_bytes; + + return(ret); +#else /* !UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; + return(0); +#endif /* !UNIV_HOTBACKUP */ +} + +/****************************************************************** +Compares a data tuple to a physical record. */ +UNIV_INTERN +int +cmp_dtuple_rec( +/*===========*/ + /* out: 1, 0, -1, if dtuple is greater, equal, + less than rec, respectively; see the comments + for cmp_dtuple_rec_with_match */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint matched_fields = 0; + ulint matched_bytes = 0; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + return(cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes)); +} + +/****************************************************************** +Checks if a dtuple is a prefix of a record. The last field in dtuple +is allowed to be a prefix of the corresponding field in the record. 
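+
+For example (schematically): the tuple ("AA", "B") is a prefix of a
+record ("AA", "BBB"), since its last field "B" is a prefix of the
+corresponding field "BBB", whereas the tuple ("AA", "C") is not.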
*/ +UNIV_INTERN +ibool +cmp_dtuple_is_prefix_of_rec( +/*========================*/ + /* out: TRUE if prefix */ + const dtuple_t* dtuple, /* in: data tuple */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint n_fields; + ulint matched_fields = 0; + ulint matched_bytes = 0; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + n_fields = dtuple_get_n_fields(dtuple); + + if (n_fields > rec_offs_n_fields(offsets)) { + + return(FALSE); + } + + cmp_dtuple_rec_with_match(dtuple, rec, offsets, + &matched_fields, &matched_bytes); + if (matched_fields == n_fields) { + + return(TRUE); + } + + if (matched_fields == n_fields - 1 + && matched_bytes == dfield_get_len( + dtuple_get_nth_field(dtuple, n_fields - 1))) { + return(TRUE); + } + + return(FALSE); +} + +#ifndef UNIV_HOTBACKUP +/***************************************************************** +Compare two physical records that contain the same number of columns, +none of which are stored externally. */ +UNIV_INTERN +int +cmp_rec_rec_simple( +/*===============*/ + /* out: 1, 0 , -1 if rec1 is greater, + equal, less, respectively, than rec2 */ + const rec_t* rec1, /* in: physical record */ + const rec_t* rec2, /* in: physical record */ + const ulint* offsets1,/* in: rec_get_offsets(rec1, index) */ + const ulint* offsets2,/* in: rec_get_offsets(rec2, index) */ + const dict_index_t* index) /* in: data dictionary index */ +{ + ulint rec1_f_len; /* length of current field in rec1 */ + const byte* rec1_b_ptr; /* pointer to the current byte + in rec1 field */ + ulint rec1_byte; /* value of current byte to be + compared in rec1 */ + ulint rec2_f_len; /* length of current field in rec2 */ + const byte* rec2_b_ptr; /* pointer to the current byte + in rec2 field */ + ulint rec2_byte; /* value of current byte to be + compared in rec2 */ + ulint cur_field; /* current field number */ + ulint n_uniq; + + n_uniq = dict_index_get_n_unique(index); + ut_ad(rec_offs_n_fields(offsets1) >= n_uniq); + ut_ad(rec_offs_n_fields(offsets2) >= n_uniq); + + ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2)); + + for (cur_field = 0; cur_field < n_uniq; cur_field++) { + + ulint cur_bytes; + ulint mtype; + ulint prtype; + + { + const dict_col_t* col + = dict_index_get_nth_col(index, cur_field); + + mtype = col->mtype; + prtype = col->prtype; + } + + ut_ad(!rec_offs_nth_extern(offsets1, cur_field)); + ut_ad(!rec_offs_nth_extern(offsets2, cur_field)); + + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, + cur_field, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, + cur_field, &rec2_f_len); + + if (rec1_f_len == UNIV_SQL_NULL + || rec2_f_len == UNIV_SQL_NULL) { + + if (rec1_f_len == rec2_f_len) { + + goto next_field; + + } else if (rec2_f_len == UNIV_SQL_NULL) { + + /* We define the SQL null to be the + smallest possible value of a field + in the alphabetical order */ + + return(1); + } else { + return(-1); + } + } + + if (mtype >= DATA_FLOAT + || (mtype == DATA_BLOB + && 0 == (prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + int ret = cmp_whole_field(mtype, prtype, + rec1_b_ptr, + (unsigned) rec1_f_len, + rec2_b_ptr, + (unsigned) rec2_f_len); + if (ret) { + return(ret); + } + + goto next_field; + } + + /* Compare the fields */ + for (cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) { + if (rec2_f_len <= cur_bytes) { + + if (rec1_f_len <= cur_bytes) { + + goto next_field; + } + + rec2_byte = dtype_get_pad_char(mtype, 
prtype);
+
+				if (rec2_byte == ULINT_UNDEFINED) {
+					return(1);
+				}
+			} else {
+				rec2_byte = *rec2_b_ptr;
+			}
+
+			if (rec1_f_len <= cur_bytes) {
+				rec1_byte = dtype_get_pad_char(mtype, prtype);
+
+				if (rec1_byte == ULINT_UNDEFINED) {
+					return(-1);
+				}
+			} else {
+				rec1_byte = *rec1_b_ptr;
+			}
+
+			if (rec1_byte == rec2_byte) {
+				/* If the bytes are equal, they will remain
+				such even after the collation transformation
+				below */
+
+				continue;
+			}
+
+			if (mtype <= DATA_CHAR
+			    || (mtype == DATA_BLOB
+				&& !(prtype & DATA_BINARY_TYPE))) {
+
+				rec1_byte = cmp_collate(rec1_byte);
+				rec2_byte = cmp_collate(rec2_byte);
+			}
+
+			if (rec1_byte < rec2_byte) {
+				return(-1);
+			} else if (rec1_byte > rec2_byte) {
+				return(1);
+			}
+		}
+next_field:
+		continue;
+	}
+
+	/* If we ran out of fields, rec1 was equal to rec2. */
+	return(0);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned. */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+				/* out: 1, 0, -1 if rec1 is greater, equal,
+				less, respectively, than rec2; only the common
+				first fields are compared */
+	const rec_t*	rec1,	/* in: physical record */
+	const rec_t*	rec2,	/* in: physical record */
+	const ulint*	offsets1,/* in: rec_get_offsets(rec1, index) */
+	const ulint*	offsets2,/* in: rec_get_offsets(rec2, index) */
+	dict_index_t*	index,	/* in: data dictionary index */
+	ulint*		matched_fields, /* in/out: number of already completely
+				matched fields; when the function returns,
+				contains the value for the current
+				comparison */
+	ulint*		matched_bytes) /* in/out: number of already matched
+				bytes within the first field not completely
+				matched; when the function returns, contains
+				the value for the current comparison */
+{
+#ifndef UNIV_HOTBACKUP
+	ulint		rec1_n_fields;	/* the number of fields in rec1 */
+	ulint		rec1_f_len;	/* length of current field in rec1 */
+	const byte*	rec1_b_ptr;	/* pointer to the current byte
+					in rec1 field */
+	ulint		rec1_byte;	/* value of current byte to be
+					compared in rec1 */
+	ulint		rec2_n_fields;	/* the number of fields in rec2 */
+	ulint		rec2_f_len;	/* length of current field in rec2 */
+	const byte*	rec2_b_ptr;	/* pointer to the current byte
+					in rec2 field */
+	ulint		rec2_byte;	/* value of current byte to be
+					compared in rec2 */
+	ulint		cur_field;	/* current field number */
+	ulint		cur_bytes;	/* number of already matched
+					bytes in current field */
+	int		ret = 0;	/* return value */
+	ulint		comp;
+
+	ut_ad(rec1 && rec2 && index);
+	ut_ad(rec_offs_validate(rec1, index, offsets1));
+	ut_ad(rec_offs_validate(rec2, index, offsets2));
+	ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+	comp = rec_offs_comp(offsets1);
+	rec1_n_fields = rec_offs_n_fields(offsets1);
+	rec2_n_fields = rec_offs_n_fields(offsets2);
+
+	cur_field = *matched_fields;
+	cur_bytes = *matched_bytes;
+
+	/* Match fields in a loop */
+
+	while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) {
+
+		ulint	mtype;
+		ulint	prtype;
+
+		if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+			/* This is for the insert buffer B-tree.
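+			(In such a universal index the data
+			dictionary does not know the real column
+			types, so every field is compared as a
+			binary string.)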
*/ + mtype = DATA_BINARY; + prtype = 0; + } else { + const dict_col_t* col + = dict_index_get_nth_col(index, cur_field); + + mtype = col->mtype; + prtype = col->prtype; + } + + rec1_b_ptr = rec_get_nth_field(rec1, offsets1, + cur_field, &rec1_f_len); + rec2_b_ptr = rec_get_nth_field(rec2, offsets2, + cur_field, &rec2_f_len); + + if (cur_bytes == 0) { + if (cur_field == 0) { + /* Test if rec is the predefined minimum + record */ + if (UNIV_UNLIKELY(rec_get_info_bits(rec1, comp) + & REC_INFO_MIN_REC_FLAG)) { + + if (!(rec_get_info_bits(rec2, comp) + & REC_INFO_MIN_REC_FLAG)) { + ret = -1; + } + + goto order_resolved; + + } else if (UNIV_UNLIKELY + (rec_get_info_bits(rec2, comp) + & REC_INFO_MIN_REC_FLAG)) { + + ret = 1; + + goto order_resolved; + } + } + + if (rec_offs_nth_extern(offsets1, cur_field) + || rec_offs_nth_extern(offsets2, cur_field)) { + /* We do not compare to an externally + stored field */ + + goto order_resolved; + } + + if (rec1_f_len == UNIV_SQL_NULL + || rec2_f_len == UNIV_SQL_NULL) { + + if (rec1_f_len == rec2_f_len) { + + goto next_field; + + } else if (rec2_f_len == UNIV_SQL_NULL) { + + /* We define the SQL null to be the + smallest possible value of a field + in the alphabetical order */ + + ret = 1; + } else { + ret = -1; + } + + goto order_resolved; + } + } + + if (mtype >= DATA_FLOAT + || (mtype == DATA_BLOB + && 0 == (prtype & DATA_BINARY_TYPE) + && dtype_get_charset_coll(prtype) + != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) { + + ret = cmp_whole_field(mtype, prtype, + rec1_b_ptr, + (unsigned) rec1_f_len, + rec2_b_ptr, + (unsigned) rec2_f_len); + if (ret != 0) { + cur_bytes = 0; + + goto order_resolved; + } else { + goto next_field; + } + } + + /* Set the pointers at the current byte */ + rec1_b_ptr = rec1_b_ptr + cur_bytes; + rec2_b_ptr = rec2_b_ptr + cur_bytes; + + /* Compare then the fields */ + for (;;) { + if (rec2_f_len <= cur_bytes) { + + if (rec1_f_len <= cur_bytes) { + + goto next_field; + } + + rec2_byte = dtype_get_pad_char(mtype, prtype); + + if (rec2_byte == ULINT_UNDEFINED) { + ret = 1; + + goto order_resolved; + } + } else { + rec2_byte = *rec2_b_ptr; + } + + if (rec1_f_len <= cur_bytes) { + rec1_byte = dtype_get_pad_char(mtype, prtype); + + if (rec1_byte == ULINT_UNDEFINED) { + ret = -1; + + goto order_resolved; + } + } else { + rec1_byte = *rec1_b_ptr; + } + + if (rec1_byte == rec2_byte) { + /* If the bytes are equal, they will remain + such even after the collation transformation + below */ + + goto next_byte; + } + + if (mtype <= DATA_CHAR + || (mtype == DATA_BLOB + && !(prtype & DATA_BINARY_TYPE))) { + + rec1_byte = cmp_collate(rec1_byte); + rec2_byte = cmp_collate(rec2_byte); + } + + if (rec1_byte < rec2_byte) { + ret = -1; + goto order_resolved; + } else if (rec1_byte > rec2_byte) { + ret = 1; + goto order_resolved; + } +next_byte: + /* Next byte */ + + cur_bytes++; + rec1_b_ptr++; + rec2_b_ptr++; + } + +next_field: + cur_field++; + cur_bytes = 0; + } + + ut_ad(cur_bytes == 0); + + /* If we ran out of fields, rec1 was equal to rec2 up + to the common fields */ + ut_ad(ret == 0); +order_resolved: + + ut_ad((ret >= - 1) && (ret <= 1)); + + *matched_fields = cur_field; + *matched_bytes = cur_bytes; + + return(ret); +#else /* !UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. 
*/
+	ut_error;
+	return(0);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+#ifdef UNIV_DEBUG
+/*****************************************************************
+Used in debug checking of cmp_dtuple_... .
+This function is used to compare a data tuple to a physical record. If
+dtuple has n fields then rec must have either m >= n fields, or it must
+differ from dtuple in some of the m fields rec has. If it encounters an
+externally stored field, it returns 0. */
+static
+int
+cmp_debug_dtuple_rec_with_match(
+/*============================*/
+				/* out: 1, 0, -1, if dtuple is greater, equal,
+				less than rec, respectively, when only the
+				common first fields are compared */
+	const dtuple_t*	dtuple,	/* in: data tuple */
+	const rec_t*	rec,	/* in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
+	ulint*		matched_fields)	/* in/out: number of already
+				completely matched fields; when function
+				returns, contains the value for current
+				comparison */
+{
+	const dfield_t*	dtuple_field;	/* current field in logical record */
+	ulint		dtuple_f_len;	/* the length of the current field
+					in the logical record */
+	const byte*	dtuple_f_data;	/* pointer to the current logical
+					field data */
+	ulint		rec_f_len;	/* length of current field in rec */
+	const byte*	rec_f_data;	/* pointer to the current rec field */
+	int		ret = 3333;	/* return value */
+	ulint		cur_field;	/* current field number */
+
+	ut_ad(dtuple && rec && matched_fields);
+	ut_ad(dtuple_check_typed(dtuple));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple));
+	ut_ad(*matched_fields <= rec_offs_n_fields(offsets));
+
+	cur_field = *matched_fields;
+
+	if (cur_field == 0) {
+		if (UNIV_UNLIKELY
+		    (rec_get_info_bits(rec, rec_offs_comp(offsets))
+		     & REC_INFO_MIN_REC_FLAG)) {
+
+			ret = !(dtuple_get_info_bits(dtuple)
+				& REC_INFO_MIN_REC_FLAG);
+
+			goto order_resolved;
+		}
+
+		if (UNIV_UNLIKELY
+		    (dtuple_get_info_bits(dtuple) & REC_INFO_MIN_REC_FLAG)) {
+			ret = -1;
+
+			goto order_resolved;
+		}
+	}
+
+	/* Match fields in a loop; stop if we run out of fields in dtuple */
+
+	while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+
+		ulint	mtype;
+		ulint	prtype;
+
+		dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+		{
+			const dtype_t*	type
+				= dfield_get_type(dtuple_field);
+
+			mtype = type->mtype;
+			prtype = type->prtype;
+		}
+
+		dtuple_f_data = dfield_get_data(dtuple_field);
+		dtuple_f_len = dfield_get_len(dtuple_field);
+
+		rec_f_data = rec_get_nth_field(rec, offsets,
+					       cur_field, &rec_f_len);
+
+		if (rec_offs_nth_extern(offsets, cur_field)) {
+			/* We do not compare to an externally stored field */
+
+			ret = 0;
+
+			goto order_resolved;
+		}
+
+		ret = cmp_data_data(mtype, prtype, dtuple_f_data, dtuple_f_len,
+				    rec_f_data, rec_f_len);
+		if (ret != 0) {
+			goto order_resolved;
+		}
+
+		cur_field++;
+	}
+
+	ret = 0;	/* If we ran out of fields, dtuple was equal to rec
+			up to the common fields */
+order_resolved:
+	ut_ad((ret >= - 1) && (ret <= 1));
+
+	*matched_fields = cur_field;
+
+	return(ret);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/rem/rem0rec.c b/storage/xtradb/rem/rem0rec.c
new file mode 100644
index 00000000000..d6899c810e3
--- /dev/null
+++ b/storage/xtradb/rem/rem0rec.c
@@ -0,0 +1,1720 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/************************************************************************
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "rem0rec.h"
+
+#ifdef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+
+/*			PHYSICAL RECORD (OLD STYLE)
+			===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(lower addresses and more significant bits inside a byte are
+represented below on a higher text line):
+
+| offset of the end of the last field of data, the most significant
+  bit is set to 1 if and only if the field is SQL-null,
+  if the offset is 2-byte, then the second most significant
+  bit is set to 1 if the field is stored on another page:
+  mostly this will occur in the case of big BLOB fields |
+...
+| offset of the end of the first field of data + the SQL-null bit |
+| 4 bits used to delete mark a record, and mark a predefined
+  minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+  (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+  heap of the index page |
+| 10 bits giving the number of fields in this record |
+| 1 bit which is set to 1 if the offsets above are given in
+  one byte format, 0 if in two byte format |
+| two bytes giving an absolute pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in an inverted
+order because then the offsets of the first fields are near the
+origin, possibly giving a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte
+(if there are less than 127 bytes of data in the record)
+or two-byte unsigned integers. The most significant bit
+is not part of the offset, instead it indicates the SQL-null
+if the bit is set to 1. */
+
+/*			PHYSICAL RECORD (NEW STYLE)
+			===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(lower addresses and more significant bits inside a byte are
+represented below on a higher text line):
+
+| length of the last non-null variable-length field of data:
+  if the maximum length is 255, one byte; otherwise,
+  0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes,
+  length=128..16383, extern storage flag) |
+...
+| length of first variable-length field of data |
+| SQL-null flags (1 bit per nullable field), padded to full bytes |
+| 4 bits used to delete mark a record, and mark a predefined
+  minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+  (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+  heap of the index page |
+| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree),
+  010=infimum, 011=supremum, 1xx=reserved |
+| two bytes giving a relative pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in an inverted
+order because then the offsets of the first fields are near the
+origin, possibly giving a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte
+(if there are less than 127 bytes of data in the record)
+or two-byte unsigned integers. The most significant bit
+is not part of the offset, instead it indicates the SQL-null
+if the bit is set to 1. */
+
+/* CANONICAL COORDINATES. A record can be seen as a single
+string of 'characters' in the following way: catenate the bytes
+in each field, in the order of fields. An SQL-null field
+is taken to be an empty sequence of bytes. Then after
+the position of each field insert in the string
+the 'character' <FIELD-END>, except that after an SQL-null field
+insert <NULL-FIELD-END>. Now the ordinal position of each
+byte in this canonical string is its canonical coordinate.
+So, for the record ("AA", SQL-NULL, "BB", ""), the canonical
+string is "AA<FIELD-END><NULL-FIELD-END>BB<FIELD-END><FIELD-END>".
+We identify prefixes (= initial segments) of a record
+with prefixes of the canonical string. The canonical
+length of the prefix is the length of the corresponding
+prefix of the canonical string. The canonical length of
+a record is the length of its canonical string.
+
+For example, the maximal common prefix of records
+("AA", SQL-NULL, "BB", "C") and ("AA", SQL-NULL, "B", "C")
+is "AA<FIELD-END><NULL-FIELD-END>B", and its canonical
+length is 5.
+
+A complete-field prefix of a record is a prefix which ends at the
+end of some field (containing also <FIELD-END>).
+A record is a complete-field prefix of another record, if
+the corresponding canonical strings have the same property. */
+
+/* this is used to fool the compiler in rec_validate */
+UNIV_INTERN ulint	rec_dummy;
+
+/*******************************************************************
+Validates the consistency of an old-style physical record. */
+static
+ibool
+rec_validate_old(
+/*=============*/
+				/* out: TRUE if ok */
+	const rec_t*	rec);	/* in: physical record */
+
+/**********************************************************
+Determine how many of the first n columns in a compact
+physical record are stored externally.
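+
+A worked example of the compact length encoding scanned below (the
+byte values are illustrative): a first length byte 0xc1 followed by
+0x02 is the two-byte form (bit 0x80 set), with the extern storage
+flag on (bit 0x40) and a stored length of 0x0102 = 258 bytes; a
+single byte 0x7f is the one-byte form, length 127, never extern.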
*/ +UNIV_INTERN +ulint +rec_get_n_extern_new( +/*=================*/ + /* out: number of externally stored columns */ + const rec_t* rec, /* in: compact physical record */ + dict_index_t* index, /* in: record descriptor */ + ulint n) /* in: number of columns to scan */ +{ + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint n_extern; + ulint i; + + ut_ad(dict_table_is_comp(index->table)); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index)); + + if (n == ULINT_UNDEFINED) { + n = dict_index_get_n_fields(index); + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + null_mask = 1; + n_extern = 0; + i = 0; + + /* read the lengths of fields 0..n */ + do { + ulint len; + + field = dict_index_get_nth_field(index, i); + if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. */ + continue; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + const dict_col_t* col + = dict_field_get_col(field); + len = *lens--; + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + if (len & 0x40) { + n_extern++; + } + lens--; + } + } + } + } while (++i < n); + + return(n_extern); +} + +/********************************************************** +Determine the offset to each field in a leaf-page record +in ROW_FORMAT=COMPACT. This is a special case of +rec_init_offsets() and rec_get_offsets_func(). */ +UNIV_INTERN +void +rec_init_offsets_comp_ordinary( +/*===========================*/ + const rec_t* rec, /* in: physical record in + ROW_FORMAT=COMPACT */ + ulint extra, /* in: number of bytes to reserve + between the record header and + the data payload + (usually REC_N_NEW_EXTRA_BYTES) */ + const dict_index_t* index, /* in: record descriptor */ + ulint* offsets)/* in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + ulint i = 0; + ulint offs = 0; + ulint any_ext = 0; + const byte* nulls = rec - (extra + 1); + const byte* lens = nulls + - UT_BITS_IN_BYTES(index->n_nullable); + dict_field_t* field; + ulint null_mask = 1; + +#ifdef UNIV_DEBUG + /* We cannot invoke rec_offs_make_valid() here, because it can hold + that extra != REC_N_NEW_EXTRA_BYTES. Similarly, rec_offs_validate() + will fail in that case, because it invokes rec_get_status(). */ + offsets[2] = (ulint) rec; + offsets[3] = (ulint) index; +#endif /* UNIV_DEBUG */ + + /* read the lengths of fields 0..n */ + do { + ulint len; + + field = dict_index_get_nth_field(index, i); + if (!(dict_field_get_col(field)->prtype + & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. 
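+				(The value stored below is the end
+				offset of the previous field with
+				REC_OFFS_SQL_NULL ORed in, so the
+				field decodes to zero length.)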
*/ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + const dict_col_t* col + = dict_field_get_col(field); + len = *lens--; + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype + == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens--; + + offs += len & 0x3fff; + if (UNIV_UNLIKELY(len + & 0x4000)) { + ut_ad(dict_index_is_clust + (index)); + any_ext = REC_OFFS_EXTERNAL; + len = offs + | REC_OFFS_EXTERNAL; + } else { + len = offs; + } + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } +resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + *rec_offs_base(offsets) + = (rec - (lens + 1)) | REC_OFFS_COMPACT | any_ext; +} + +/********************************************************** +The following function determines the offsets to each field in the +record. The offsets are written to a previously allocated array of +ulint, where rec_offs_n_fields(offsets) has been initialized to the +number of fields in the record. The rest of the array will be +initialized by this function. rec_offs_base(offsets)[0] will be set +to the extra size (if REC_OFFS_COMPACT is set, the record is in the +new format; if REC_OFFS_EXTERNAL is set, the record contains externally +stored columns), and rec_offs_base(offsets)[1..n_fields] will be set to +offsets past the end of fields 0..n_fields, or to the beginning of +fields 1..n_fields+1. When the high-order bit of the offset at [i+1] +is set (REC_OFFS_SQL_NULL), the field i is NULL. When the second +high-order bit of the offset at [i+1] is set (REC_OFFS_EXTERNAL), the +field i is being stored externally. */ +static +void +rec_init_offsets( +/*=============*/ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index, /* in: record descriptor */ + ulint* offsets)/* in/out: array of offsets; + in: n=rec_offs_n_fields(offsets) */ +{ + ulint i = 0; + ulint offs; + + rec_offs_make_valid(rec, index, offsets); + + if (dict_table_is_comp(index->table)) { + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint status = rec_get_status(rec); + ulint n_node_ptr_field = ULINT_UNDEFINED; + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* the field is 8 bytes long */ + rec_offs_base(offsets)[0] + = REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT; + rec_offs_base(offsets)[1] = 8; + return; + case REC_STATUS_NODE_PTR: + n_node_ptr_field + = dict_index_get_n_unique_in_tree(index); + break; + case REC_STATUS_ORDINARY: + rec_init_offsets_comp_ordinary(rec, + REC_N_NEW_EXTRA_BYTES, + index, offsets); + return; + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + offs = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + do { + ulint len; + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + len = offs += 4; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + if (!(dict_field_get_col(field)->prtype + & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. 
*/ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + const dict_col_t* col + = dict_field_get_col(field); + len = *lens--; + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype + == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + + len <<= 8; + len |= *lens--; + + /* B-tree node pointers + must not contain externally + stored columns. Thus + the "e" flag must be 0. */ + ut_a(!(len & 0x4000)); + offs += len & 0x3fff; + len = offs; + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } +resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + *rec_offs_base(offsets) + = (rec - (lens + 1)) | REC_OFFS_COMPACT; + } else { + /* Old-style record: determine extra size and end offsets */ + offs = REC_N_OLD_EXTRA_BYTES; + if (rec_get_1byte_offs_flag(rec)) { + offs += rec_offs_n_fields(offsets); + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + do { + offs = rec_1_get_field_end_info(rec, i); + if (offs & REC_1BYTE_SQL_NULL_MASK) { + offs &= ~REC_1BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + rec_offs_base(offsets)[1 + i] = offs; + } while (++i < rec_offs_n_fields(offsets)); + } else { + offs += 2 * rec_offs_n_fields(offsets); + *rec_offs_base(offsets) = offs; + /* Determine offsets to fields */ + do { + offs = rec_2_get_field_end_info(rec, i); + if (offs & REC_2BYTE_SQL_NULL_MASK) { + offs &= ~REC_2BYTE_SQL_NULL_MASK; + offs |= REC_OFFS_SQL_NULL; + } + if (offs & REC_2BYTE_EXTERN_MASK) { + offs &= ~REC_2BYTE_EXTERN_MASK; + offs |= REC_OFFS_EXTERNAL; + *rec_offs_base(offsets) |= REC_OFFS_EXTERNAL; + } + rec_offs_base(offsets)[1 + i] = offs; + } while (++i < rec_offs_n_fields(offsets)); + } + } +} + +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously returned array. 
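+
+A typical call pattern (a sketch mirroring the UNIV_DEBUG block in
+rec_convert_dtuple_to_rec() below; callers normally use the
+rec_get_offsets() macro, which fills in the file name and line):
+
+	mem_heap_t*	heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+	...
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}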
*/ +UNIV_INTERN +ulint* +rec_get_offsets_func( +/*=================*/ + /* out: the new offsets */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index, /* in: record descriptor */ + ulint* offsets,/* in/out: array consisting of + offsets[0] allocated elements, + or an array from rec_get_offsets(), + or NULL */ + ulint n_fields,/* in: maximum number of + initialized fields + (ULINT_UNDEFINED if all fields) */ + mem_heap_t** heap, /* in/out: memory heap */ + const char* file, /* in: file name where called */ + ulint line) /* in: line number where called */ +{ + ulint n; + ulint size; + + ut_ad(rec); + ut_ad(index); + ut_ad(heap); + + if (dict_table_is_comp(index->table)) { + switch (UNIV_EXPECT(rec_get_status(rec), + REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + n = dict_index_get_n_fields(index); + break; + case REC_STATUS_NODE_PTR: + n = dict_index_get_n_unique_in_tree(index) + 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record */ + n = 1; + break; + default: + ut_error; + return(NULL); + } + } else { + n = rec_get_n_fields_old(rec); + } + + if (UNIV_UNLIKELY(n_fields < n)) { + n = n_fields; + } + + size = n + (1 + REC_OFFS_HEADER_SIZE); + + if (UNIV_UNLIKELY(!offsets) + || UNIV_UNLIKELY(rec_offs_get_n_alloc(offsets) < size)) { + if (UNIV_UNLIKELY(!*heap)) { + *heap = mem_heap_create_func(size * sizeof(ulint), + MEM_HEAP_DYNAMIC, + file, line); + } + offsets = mem_heap_alloc(*heap, size * sizeof(ulint)); + rec_offs_set_n_alloc(offsets, size); + } + + rec_offs_set_n_fields(offsets, n); + rec_init_offsets(rec, index, offsets); + return(offsets); +} + +/********************************************************** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. */ +UNIV_INTERN +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /* in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /* in: record descriptor */ + ulint node_ptr,/* in: nonzero=node pointer, + 0=leaf node */ + ulint* offsets)/* in/out: array consisting of + offsets[0] allocated elements */ +{ + ulint n; + ulint i; + ulint offs; + ulint any_ext; + const byte* nulls; + const byte* lens; + dict_field_t* field; + ulint null_mask; + ulint n_node_ptr_field; + + ut_ad(extra); + ut_ad(index); + ut_ad(offsets); + ut_ad(dict_table_is_comp(index->table)); + + if (UNIV_UNLIKELY(node_ptr)) { + n_node_ptr_field = dict_index_get_n_unique_in_tree(index); + n = n_node_ptr_field + 1; + } else { + n_node_ptr_field = ULINT_UNDEFINED; + n = dict_index_get_n_fields(index); + } + + ut_a(rec_offs_get_n_alloc(offsets) >= n + (1 + REC_OFFS_HEADER_SIZE)); + rec_offs_set_n_fields(offsets, n); + + nulls = extra; + lens = nulls + UT_BITS_IN_BYTES(index->n_nullable); + i = offs = 0; + null_mask = 1; + any_ext = 0; + + /* read the lengths of fields 0..n */ + do { + ulint len; + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + len = offs += 4; + goto resolved; + } + + field = dict_index_get_nth_field(index, i); + if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls++; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + /* No length is stored for NULL fields. + We do not advance offs, and we set + the length to zero and enable the + SQL NULL flag in offsets[]. 
*/ + len = offs | REC_OFFS_SQL_NULL; + goto resolved; + } + null_mask <<= 1; + } + + if (UNIV_UNLIKELY(!field->fixed_len)) { + /* Variable-length field: read the length */ + const dict_col_t* col + = dict_field_get_col(field); + len = *lens++; + if (UNIV_UNLIKELY(col->len > 255) + || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) { + if (len & 0x80) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens++; + + offs += len & 0x3fff; + if (UNIV_UNLIKELY(len & 0x4000)) { + any_ext = REC_OFFS_EXTERNAL; + len = offs | REC_OFFS_EXTERNAL; + } else { + len = offs; + } + + goto resolved; + } + } + + len = offs += len; + } else { + len = offs += field->fixed_len; + } +resolved: + rec_offs_base(offsets)[i + 1] = len; + } while (++i < rec_offs_n_fields(offsets)); + + ut_ad(lens >= extra); + *rec_offs_base(offsets) = (lens - extra + REC_N_NEW_EXTRA_BYTES) + | REC_OFFS_COMPACT | any_ext; +} + +/**************************************************************** +The following function is used to get the offset to the nth +data field in an old-style record. */ +UNIV_INTERN +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + /* out: offset to the field */ + const rec_t* rec, /* in: record */ + ulint n, /* in: index of the field */ + ulint* len) /* out: length of the field; + UNIV_SQL_NULL if SQL null */ +{ + ulint os; + ulint next_os; + + ut_ad(rec && len); + ut_ad(n < rec_get_n_fields_old(rec)); + + if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) { + fprintf(stderr, "Error: trying to access field %lu in rec\n", + (ulong) n); + ut_error; + } + + if (UNIV_UNLIKELY(rec == NULL)) { + fputs("Error: rec is NULL pointer\n", stderr); + ut_error; + } + + if (rec_get_1byte_offs_flag(rec)) { + os = rec_1_get_field_start_offs(rec, n); + + next_os = rec_1_get_field_end_info(rec, n); + + if (next_os & REC_1BYTE_SQL_NULL_MASK) { + *len = UNIV_SQL_NULL; + + return(os); + } + + next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK; + } else { + os = rec_2_get_field_start_offs(rec, n); + + next_os = rec_2_get_field_end_info(rec, n); + + if (next_os & REC_2BYTE_SQL_NULL_MASK) { + *len = UNIV_SQL_NULL; + + return(os); + } + + next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK + | REC_2BYTE_EXTERN_MASK); + } + + *len = next_os - os; + + ut_ad(*len < UNIV_PAGE_SIZE); + + return(os); +} + +/************************************************************** +Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT. */ +UNIV_INTERN +ulint +rec_get_converted_size_comp_prefix( +/*===============================*/ + /* out: total size */ + const dict_index_t* index, /* in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + const dfield_t* fields, /* in: array of data fields */ + ulint n_fields,/* in: number of data fields */ + ulint* extra) /* out: extra size */ +{ + ulint extra_size; + ulint data_size; + ulint i; + ut_ad(index); + ut_ad(fields); + ut_ad(n_fields > 0); + ut_ad(n_fields <= dict_index_get_n_fields(index)); + + extra_size = REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(index->n_nullable); + data_size = 0; + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + const dict_field_t* field; + ulint len; + const dict_col_t* col; + + field = dict_index_get_nth_field(index, i); + len = dfield_get_len(&fields[i]); + col = dict_field_get_col(field); + + ut_ad(dict_col_type_assert_equal(col, + dfield_get_type(&fields[i]))); + + if (dfield_is_null(&fields[i])) { + /* No length is stored for NULL fields. 
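+			Such fields are represented only in the
+			SQL-null bitmap, so they add nothing to
+			either the extra size or the data size.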
*/ + ut_ad(!(col->prtype & DATA_NOT_NULL)); + continue; + } + + ut_ad(len <= col->len || col->mtype == DATA_BLOB); + + if (field->fixed_len) { + ut_ad(len == field->fixed_len); + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || field->fixed_len == field->prefix_len); + } else if (dfield_is_ext(&fields[i])) { + extra_size += 2; + } else if (len < 128 + || (col->len < 256 && col->mtype != DATA_BLOB)) { + extra_size++; + } else { + /* For variable-length columns, we look up the + maximum length from the column itself. If this + is a prefix index column shorter than 256 bytes, + this will waste one byte. */ + extra_size += 2; + } + data_size += len; + } + + if (UNIV_LIKELY_NULL(extra)) { + *extra = extra_size; + } + + return(extra_size + data_size); +} + +/************************************************************** +Determines the size of a data tuple in ROW_FORMAT=COMPACT. */ +UNIV_INTERN +ulint +rec_get_converted_size_comp( +/*========================*/ + /* out: total size */ + const dict_index_t* index, /* in: record descriptor; + dict_table_is_comp() is + assumed to hold, even if + it does not */ + ulint status, /* in: status bits of the record */ + const dfield_t* fields, /* in: array of data fields */ + ulint n_fields,/* in: number of data fields */ + ulint* extra) /* out: extra size */ +{ + ulint size; + ut_ad(index); + ut_ad(fields); + ut_ad(n_fields > 0); + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields == dict_index_get_n_fields(index)); + size = 0; + break; + case REC_STATUS_NODE_PTR: + n_fields--; + ut_ad(n_fields == dict_index_get_n_unique_in_tree(index)); + ut_ad(dfield_get_len(&fields[n_fields]) == REC_NODE_PTR_SIZE); + size = REC_NODE_PTR_SIZE; /* child page number */ + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record, 8 data bytes */ + if (UNIV_LIKELY_NULL(extra)) { + *extra = REC_N_NEW_EXTRA_BYTES; + } + return(REC_N_NEW_EXTRA_BYTES + 8); + default: + ut_error; + return(ULINT_UNDEFINED); + } + + return(size + rec_get_converted_size_comp_prefix(index, fields, + n_fields, extra)); +} + +/*************************************************************** +Sets the value of the ith field SQL null bit of an old-style record. */ +UNIV_INTERN +void +rec_set_nth_field_null_bit( +/*=======================*/ + rec_t* rec, /* in: record */ + ulint i, /* in: ith field */ + ibool val) /* in: value to set */ +{ + ulint info; + + if (rec_get_1byte_offs_flag(rec)) { + + info = rec_1_get_field_end_info(rec, i); + + if (val) { + info = info | REC_1BYTE_SQL_NULL_MASK; + } else { + info = info & ~REC_1BYTE_SQL_NULL_MASK; + } + + rec_1_set_field_end_info(rec, i, info); + + return; + } + + info = rec_2_get_field_end_info(rec, i); + + if (val) { + info = info | REC_2BYTE_SQL_NULL_MASK; + } else { + info = info & ~REC_2BYTE_SQL_NULL_MASK; + } + + rec_2_set_field_end_info(rec, i, info); +} + +/*************************************************************** +Sets an old-style record field to SQL null. +The physical size of the field is not changed. 
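+The data bytes of the field are overwritten with zero bytes
+(data_write_sql_null()) and the SQL-null bit in the field end info is
+set; the record layout stays intact.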
*/ +UNIV_INTERN +void +rec_set_nth_field_sql_null( +/*=======================*/ + rec_t* rec, /* in: record */ + ulint n) /* in: index of the field */ +{ + ulint offset; + + offset = rec_get_field_start_offs(rec, n); + + data_write_sql_null(rec + offset, rec_get_nth_field_size(rec, n)); + + rec_set_nth_field_null_bit(rec, n, TRUE); +} + +/************************************************************* +Builds an old-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. */ +static +rec_t* +rec_convert_dtuple_to_rec_old( +/*==========================*/ + /* out: pointer to the origin of + physical record */ + byte* buf, /* in: start address of the physical record */ + const dtuple_t* dtuple, /* in: data tuple */ + ulint n_ext) /* in: number of externally stored columns */ +{ + const dfield_t* field; + ulint n_fields; + ulint data_size; + rec_t* rec; + ulint end_offset; + ulint ored_offset; + ulint len; + ulint i; + + ut_ad(buf && dtuple); + ut_ad(dtuple_validate(dtuple)); + ut_ad(dtuple_check_typed(dtuple)); + + n_fields = dtuple_get_n_fields(dtuple); + data_size = dtuple_get_data_size(dtuple); + + ut_ad(n_fields > 0); + + /* Calculate the offset of the origin in the physical record */ + + rec = buf + rec_get_converted_extra_size(data_size, n_fields, n_ext); +#ifdef UNIV_DEBUG + /* Suppress Valgrind warnings of ut_ad() + in mach_write_to_1(), mach_write_to_2() et al. */ + memset(buf, 0xff, rec - buf + data_size); +#endif /* UNIV_DEBUG */ + /* Store the number of fields */ + rec_set_n_fields_old(rec, n_fields); + + /* Set the info bits of the record */ + rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple) + & REC_INFO_BITS_MASK); + + /* Store the data and the offsets */ + + end_offset = 0; + + if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) { + + rec_set_1byte_offs_flag(rec, TRUE); + + for (i = 0; i < n_fields; i++) { + + field = dtuple_get_nth_field(dtuple, i); + + if (dfield_is_null(field)) { + len = dtype_get_sql_null_size( + dfield_get_type(field)); + data_write_sql_null(rec + end_offset, len); + + end_offset += len; + ored_offset = end_offset + | REC_1BYTE_SQL_NULL_MASK; + } else { + /* If the data is not SQL null, store it */ + len = dfield_get_len(field); + + memcpy(rec + end_offset, + dfield_get_data(field), len); + + end_offset += len; + ored_offset = end_offset; + } + + rec_1_set_field_end_info(rec, i, ored_offset); + } + } else { + rec_set_1byte_offs_flag(rec, FALSE); + + for (i = 0; i < n_fields; i++) { + + field = dtuple_get_nth_field(dtuple, i); + + if (dfield_is_null(field)) { + len = dtype_get_sql_null_size( + dfield_get_type(field)); + data_write_sql_null(rec + end_offset, len); + + end_offset += len; + ored_offset = end_offset + | REC_2BYTE_SQL_NULL_MASK; + } else { + /* If the data is not SQL null, store it */ + len = dfield_get_len(field); + + memcpy(rec + end_offset, + dfield_get_data(field), len); + + end_offset += len; + ored_offset = end_offset; + + if (dfield_is_ext(field)) { + ored_offset |= REC_2BYTE_EXTERN_MASK; + } + } + + rec_2_set_field_end_info(rec, i, ored_offset); + } + } + + return(rec); +} + +/************************************************************* +Builds a ROW_FORMAT=COMPACT record out of a data tuple. 
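+
+The caller is assumed to have reserved a buffer of at least the size
+computed by rec_get_converted_size_comp(); a sketch of the intended
+call sequence (cf. rec_convert_dtuple_to_rec_new() below):
+
+	rec_get_converted_size_comp(index, status,
+				    fields, n_fields, &extra_size);
+	rec = buf + extra_size;
+	rec_convert_dtuple_to_rec_comp(rec, REC_N_NEW_EXTRA_BYTES,
+				       index, status, fields, n_fields);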
*/ +UNIV_INTERN +void +rec_convert_dtuple_to_rec_comp( +/*===========================*/ + rec_t* rec, /* in: origin of record */ + ulint extra, /* in: number of bytes to + reserve between the record + header and the data payload + (normally REC_N_NEW_EXTRA_BYTES) */ + const dict_index_t* index, /* in: record descriptor */ + ulint status, /* in: status bits of the record */ + const dfield_t* fields, /* in: array of data fields */ + ulint n_fields)/* in: number of data fields */ +{ + const dfield_t* field; + const dtype_t* type; + byte* end; + byte* nulls; + byte* lens; + ulint len; + ulint i; + ulint n_node_ptr_field; + ulint fixed_len; + ulint null_mask = 1; + ut_ad(extra == 0 || dict_table_is_comp(index->table)); + ut_ad(extra == 0 || extra == REC_N_NEW_EXTRA_BYTES); + ut_ad(n_fields > 0); + + switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + n_node_ptr_field = ULINT_UNDEFINED; + break; + case REC_STATUS_NODE_PTR: + ut_ad(n_fields == dict_index_get_n_unique_in_tree(index) + 1); + n_node_ptr_field = n_fields - 1; + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad(n_fields == 1); + n_node_ptr_field = ULINT_UNDEFINED; + break; + default: + ut_error; + return; + } + + end = rec; + nulls = rec - (extra + 1); + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + /* clear the SQL-null flags */ + memset(lens + 1, 0, nulls - lens); + + /* Store the data and the offsets */ + + for (i = 0, field = fields; i < n_fields; i++, field++) { + type = dfield_get_type(field); + len = dfield_get_len(field); + + if (UNIV_UNLIKELY(i == n_node_ptr_field)) { + ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); + ut_ad(len == 4); + memcpy(end, dfield_get_data(field), len); + end += 4; + break; + } + + if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) { + /* nullable field */ + ut_ad(index->n_nullable > 0); + + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + ut_ad(*nulls < null_mask); + + /* set the null flag if necessary */ + if (dfield_is_null(field)) { + *nulls |= null_mask; + null_mask <<= 1; + continue; + } + + null_mask <<= 1; + } + /* only nullable fields can be null */ + ut_ad(!dfield_is_null(field)); + + fixed_len = dict_index_get_nth_field(index, i)->fixed_len; + + if (fixed_len) { + ut_ad(len == fixed_len); + ut_ad(!dfield_is_ext(field)); + } else if (dfield_is_ext(field)) { + ut_ad(len <= REC_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE); + *lens-- = (byte) (len >> 8) | 0xc0; + *lens-- = (byte) len; + } else { + ut_ad(len <= dtype_get_len(type) + || dtype_get_mtype(type) == DATA_BLOB); + if (len < 128 + || (dtype_get_len(type) < 256 + && dtype_get_mtype(type) != DATA_BLOB)) { + + *lens-- = (byte) len; + } else { + ut_ad(len < 16384); + *lens-- = (byte) (len >> 8) | 0x80; + *lens-- = (byte) len; + } + } + + memcpy(end, dfield_get_data(field), len); + end += len; + } +} + +/************************************************************* +Builds a new-style physical record out of a data tuple and +stores it beginning from the start of the given buffer. 
*/ +static +rec_t* +rec_convert_dtuple_to_rec_new( +/*==========================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of + the physical record */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* dtuple) /* in: data tuple */ +{ + ulint extra_size; + ulint status; + rec_t* rec; + + status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK; + rec_get_converted_size_comp(index, status, + dtuple->fields, dtuple->n_fields, + &extra_size); + rec = buf + extra_size; + + rec_convert_dtuple_to_rec_comp( + rec, REC_N_NEW_EXTRA_BYTES, index, status, + dtuple->fields, dtuple->n_fields); + + /* Set the info bits of the record */ + rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple)); + + return(rec); +} + +/************************************************************* +Builds a physical record out of a data tuple and +stores it beginning from the start of the given buffer. */ +UNIV_INTERN +rec_t* +rec_convert_dtuple_to_rec( +/*======================*/ + /* out: pointer to the origin + of physical record */ + byte* buf, /* in: start address of the + physical record */ + const dict_index_t* index, /* in: record descriptor */ + const dtuple_t* dtuple, /* in: data tuple */ + ulint n_ext) /* in: number of + externally stored columns */ +{ + rec_t* rec; + + ut_ad(buf && index && dtuple); + ut_ad(dtuple_validate(dtuple)); + ut_ad(dtuple_check_typed(dtuple)); + + if (dict_table_is_comp(index->table)) { + rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple); + } else { + rec = rec_convert_dtuple_to_rec_old(buf, dtuple, n_ext); + } + +#ifdef UNIV_DEBUG + { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, + offsets_, ULINT_UNDEFINED, &heap); + ut_ad(rec_validate(rec, offsets)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +#endif /* UNIV_DEBUG */ + return(rec); +} + +/****************************************************************** +Copies the first n fields of a physical record to a data tuple. The fields +are copied to the memory heap. */ +UNIV_INTERN +void +rec_copy_prefix_to_dtuple( +/*======================*/ + dtuple_t* tuple, /* out: data tuple */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields + to copy */ + mem_heap_t* heap) /* in: memory heap */ +{ + ulint i; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap); + + ut_ad(rec_validate(rec, offsets)); + ut_ad(dtuple_check_typed(tuple)); + + dtuple_set_info_bits(tuple, rec_get_info_bits( + rec, dict_table_is_comp(index->table))); + + for (i = 0; i < n_fields; i++) { + dfield_t* field; + const byte* data; + ulint len; + + field = dtuple_get_nth_field(tuple, i); + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + dfield_set_data(field, + mem_heap_dup(heap, data, len), len); + ut_ad(!rec_offs_nth_extern(offsets, i)); + } else { + dfield_set_null(field); + } + } +} + +/****************************************************************** +Copies the first n fields of an old-style physical record +to a new physical record in a buffer. 
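+
+[Editorial aside, not part of the original patch: both prefix-copy
+routines below reuse a caller-owned buffer and reallocate only when the
+prefix would not fit. A sketch of that idiom with malloc/free standing
+in for mem_alloc2/mem_free (an assumption: the real allocator also
+reports the rounded-up block size); buf_reserve_sketch is hypothetical.] */
+#if 0	/* illustrative only; deliberately compiled out */
+#include <stdlib.h>
+#include <string.h>
+
+/* Make sure *buf can hold size bytes, reallocating the way
+rec_copy_prefix_to_buf_old() does. Sketch: OOM is not handled. */
+static unsigned char*
+buf_reserve_sketch(unsigned char** buf, size_t* buf_size, size_t size)
+{
+	if (*buf == NULL || *buf_size < size) {
+		free(*buf);		/* free(NULL) is a no-op */
+		*buf = malloc(size);
+		*buf_size = size;
+	}
+	return(*buf);
+}
+
+int
+main(void)
+{
+	unsigned char*	buf = NULL;
+	size_t		buf_size = 0;
+
+	buf_reserve_sketch(&buf, &buf_size, 64);	/* allocates */
+	buf_reserve_sketch(&buf, &buf_size, 32);	/* reuses */
+	memcpy(buf, "prefix", 7);
+	free(buf);
+	return(0);
+}
+#endif
+
+/*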
*/ +static +rec_t* +rec_copy_prefix_to_buf_old( +/*=======================*/ + /* out, own: copied record */ + const rec_t* rec, /* in: physical record */ + ulint n_fields, /* in: number of fields to copy */ + ulint area_end, /* in: end of the prefix data */ + byte** buf, /* in/out: memory buffer for + the copied prefix, or NULL */ + ulint* buf_size) /* in/out: buffer size */ +{ + rec_t* copy_rec; + ulint area_start; + ulint prefix_len; + + if (rec_get_1byte_offs_flag(rec)) { + area_start = REC_N_OLD_EXTRA_BYTES + n_fields; + } else { + area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields; + } + + prefix_len = area_start + area_end; + + if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf != NULL) { + mem_free(*buf); + } + + *buf = mem_alloc2(prefix_len, buf_size); + } + + ut_memcpy(*buf, rec - area_start, prefix_len); + + copy_rec = *buf + area_start; + + rec_set_n_fields_old(copy_rec, n_fields); + + return(copy_rec); +} + +/****************************************************************** +Copies the first n fields of a physical record to a new physical record in +a buffer. */ +UNIV_INTERN +rec_t* +rec_copy_prefix_to_buf( +/*===================*/ + /* out, own: copied record */ + const rec_t* rec, /* in: physical record */ + const dict_index_t* index, /* in: record descriptor */ + ulint n_fields, /* in: number of fields + to copy */ + byte** buf, /* in/out: memory buffer + for the copied prefix, + or NULL */ + ulint* buf_size) /* in/out: buffer size */ +{ + const byte* nulls; + const byte* lens; + ulint i; + ulint prefix_len; + ulint null_mask; + ulint status; + + UNIV_PREFETCH_RW(*buf); + + if (!dict_table_is_comp(index->table)) { + ut_ad(rec_validate_old(rec)); + return(rec_copy_prefix_to_buf_old( + rec, n_fields, + rec_get_field_start_offs(rec, n_fields), + buf, buf_size)); + } + + status = rec_get_status(rec); + + switch (status) { + case REC_STATUS_ORDINARY: + ut_ad(n_fields <= dict_index_get_n_fields(index)); + break; + case REC_STATUS_NODE_PTR: + /* it doesn't make sense to copy the child page number field */ + ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index)); + break; + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + /* infimum or supremum record: no sense to copy anything */ + default: + ut_error; + return(NULL); + } + + nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1); + lens = nulls - UT_BITS_IN_BYTES(index->n_nullable); + UNIV_PREFETCH_R(lens); + prefix_len = 0; + null_mask = 1; + + /* read the lengths of fields 0..n */ + for (i = 0; i < n_fields; i++) { + const dict_field_t* field; + const dict_col_t* col; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + + if (!(col->prtype & DATA_NOT_NULL)) { + /* nullable field => read the null flag */ + if (UNIV_UNLIKELY(!(byte) null_mask)) { + nulls--; + null_mask = 1; + } + + if (*nulls & null_mask) { + null_mask <<= 1; + continue; + } + + null_mask <<= 1; + } + + if (field->fixed_len) { + prefix_len += field->fixed_len; + } else { + ulint len = *lens--; + if (col->len > 255 || col->mtype == DATA_BLOB) { + if (len & 0x80) { + /* 1exxxxxx */ + len &= 0x3f; + len <<= 8; + len |= *lens--; + UNIV_PREFETCH_R(lens); + } + } + prefix_len += len; + } + } + + UNIV_PREFETCH_R(rec + prefix_len); + + prefix_len += rec - (lens + 1); + + if ((*buf == NULL) || (*buf_size < prefix_len)) { + if (*buf != NULL) { + mem_free(*buf); + } + + *buf = mem_alloc2(prefix_len, buf_size); + } + + memcpy(*buf, lens + 1, prefix_len); + + return(*buf + (rec - (lens + 1))); +} + 
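+/* [Editorial aside, not part of the original patch] The validators below
+read the last byte of every stored field ("memory trap") and park the sum
+in rec_dummy so the compiler cannot discard the loads; a volatile sink
+gives the same guarantee by language rules. A minimal sketch, deliberately
+compiled out; all _sketch names are hypothetical and fields are assumed
+non-empty. */
+#if 0
+#include <stddef.h>
+
+static volatile unsigned char rec_byte_sink_sketch;
+
+/* Touch the last byte of each field so a corrupt length faults here,
+close to the culprit, instead of somewhere later. */
+static void
+rec_touch_fields_sketch(const unsigned char* data,
+			const size_t* lens, size_t n_fields)
+{
+	size_t	i;
+	size_t	off = 0;
+
+	for (i = 0; i < n_fields; i++) {
+		off += lens[i];
+		rec_byte_sink_sketch = data[off - 1];	/* never elided */
+	}
+}
+#endif
+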
+/******************************************************************* +Validates the consistency of an old-style physical record. */ +static +ibool +rec_validate_old( +/*=============*/ + /* out: TRUE if ok */ + const rec_t* rec) /* in: physical record */ +{ + const byte* data; + ulint len; + ulint n_fields; + ulint len_sum = 0; + ulint sum = 0; + ulint i; + + ut_a(rec); + n_fields = rec_get_n_fields_old(rec); + + if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + (ulong) n_fields); + return(FALSE); + } + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field_old(rec, i, &len); + + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", + (ulong) i, + (ulong) len); + return(FALSE); + } + + if (len != UNIV_SQL_NULL) { + len_sum += len; + sum += *(data + len -1); /* dereference the + end of the field to + cause a memory trap + if possible */ + } else { + len_sum += rec_get_nth_field_size(rec, i); + } + } + + if (len_sum != rec_get_data_size_old(rec)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + (ulong) len_sum, + rec_get_data_size_old(rec)); + return(FALSE); + } + + rec_dummy = sum; /* This is here only to fool the compiler */ + + return(TRUE); +} + +/******************************************************************* +Validates the consistency of a physical record. */ +UNIV_INTERN +ibool +rec_validate( +/*=========*/ + /* out: TRUE if ok */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + const byte* data; + ulint len; + ulint n_fields; + ulint len_sum = 0; + ulint sum = 0; + ulint i; + + ut_a(rec); + n_fields = rec_offs_n_fields(offsets); + + if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) { + fprintf(stderr, "InnoDB: Error: record has %lu fields\n", + (ulong) n_fields); + return(FALSE); + } + + ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec)); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) { + fprintf(stderr, + "InnoDB: Error: record field %lu len %lu\n", + (ulong) i, + (ulong) len); + return(FALSE); + } + + if (len != UNIV_SQL_NULL) { + len_sum += len; + sum += *(data + len -1); /* dereference the + end of the field to + cause a memory trap + if possible */ + } else if (!rec_offs_comp(offsets)) { + len_sum += rec_get_nth_field_size(rec, i); + } + } + + if (len_sum != rec_offs_data_size(offsets)) { + fprintf(stderr, + "InnoDB: Error: record len should be %lu, len %lu\n", + (ulong) len_sum, + (ulong) rec_offs_data_size(offsets)); + return(FALSE); + } + + rec_dummy = sum; /* This is here only to fool the compiler */ + + if (!rec_offs_comp(offsets)) { + ut_a(rec_validate_old(rec)); + } + + return(TRUE); +} + +/******************************************************************* +Prints an old-style physical record. */ +UNIV_INTERN +void +rec_print_old( +/*==========*/ + FILE* file, /* in: file where to print */ + const rec_t* rec) /* in: physical record */ +{ + const byte* data; + ulint len; + ulint n; + ulint i; + + ut_ad(rec); + + n = rec_get_n_fields_old(rec); + + fprintf(file, "PHYSICAL RECORD: n_fields %lu;" + " %u-byte offsets; info bits %lu\n", + (ulong) n, + rec_get_1byte_offs_flag(rec) ? 
1 : 2, + (ulong) rec_get_info_bits(rec, FALSE)); + + for (i = 0; i < n; i++) { + + data = rec_get_nth_field_old(rec, i, &len); + + fprintf(file, " %lu:", (ulong) i); + + if (len != UNIV_SQL_NULL) { + if (len <= 30) { + + ut_print_buf(file, data, len); + } else { + ut_print_buf(file, data, 30); + + fprintf(file, " (total %lu bytes)", + (ulong) len); + } + } else { + fprintf(file, " SQL NULL, size %lu ", + rec_get_nth_field_size(rec, i)); + } + + putc(';', file); + putc('\n', file); + } + + rec_validate_old(rec); +} + +/******************************************************************* +Prints a physical record in ROW_FORMAT=COMPACT. Ignores the +record header. */ +UNIV_INTERN +void +rec_print_comp( +/*===========*/ + FILE* file, /* in: file where to print */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ulint i; + + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + const byte* data; + ulint len; + + data = rec_get_nth_field(rec, offsets, i, &len); + + fprintf(file, " %lu:", (ulong) i); + + if (len != UNIV_SQL_NULL) { + if (len <= 30) { + + ut_print_buf(file, data, len); + } else { + ut_print_buf(file, data, 30); + + fprintf(file, " (total %lu bytes)", + (ulong) len); + } + } else { + fputs(" SQL NULL", file); + } + putc(';', file); + putc('\n', file); + } +} + +/******************************************************************* +Prints a physical record. */ +UNIV_INTERN +void +rec_print_new( +/*==========*/ + FILE* file, /* in: file where to print */ + const rec_t* rec, /* in: physical record */ + const ulint* offsets)/* in: array returned by rec_get_offsets() */ +{ + ut_ad(rec); + ut_ad(offsets); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (!rec_offs_comp(offsets)) { + rec_print_old(file, rec); + return; + } + + fprintf(file, "PHYSICAL RECORD: n_fields %lu;" + " compact format; info bits %lu\n", + (ulong) rec_offs_n_fields(offsets), + (ulong) rec_get_info_bits(rec, TRUE)); + + rec_print_comp(file, rec, offsets); + rec_validate(rec, offsets); +} + +/******************************************************************* +Prints a physical record. */ +UNIV_INTERN +void +rec_print( +/*======*/ + FILE* file, /* in: file where to print */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index) /* in: record descriptor */ +{ + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + rec_print_old(file, rec); + return; + } else { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + rec_print_new(file, rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +} diff --git a/storage/xtradb/row/row0ext.c b/storage/xtradb/row/row0ext.c new file mode 100644 index 00000000000..83dfa024ffc --- /dev/null +++ b/storage/xtradb/row/row0ext.c @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "row0ext.h" + +#ifdef UNIV_NONINL +#include "row0ext.ic" +#endif + +#include "btr0cur.h" + +/************************************************************************ +Fills the column prefix cache of an externally stored column. */ +static +void +row_ext_cache_fill( +/*===============*/ + row_ext_t* ext, /* in/out: column prefix cache */ + ulint i, /* in: index of ext->ext[] */ + ulint zip_size,/* compressed page size in bytes, or 0 */ + const dfield_t* dfield) /* in: data field */ +{ + const byte* field = dfield_get_data(dfield); + ulint f_len = dfield_get_len(dfield); + byte* buf = ext->buf + i * REC_MAX_INDEX_COL_LEN; + + ut_ad(i < ext->n_ext); + ut_ad(dfield_is_ext(dfield)); + ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY(!memcmp(field_ref_zero, + field + f_len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* The BLOB pointer is not set: we cannot fetch it */ + ext->len[i] = 0; + } else { + /* Fetch at most REC_MAX_INDEX_COL_LEN of the column. + The column should be non-empty. However, + trx_rollback_or_clean_all_recovered() may try to + access a half-deleted BLOB if the server previously + crashed during the execution of + btr_free_externally_stored_field(). */ + ext->len[i] = btr_copy_externally_stored_field_prefix( + buf, REC_MAX_INDEX_COL_LEN, zip_size, field, f_len); + } +} + +/************************************************************************ +Creates a cache of column prefixes of externally stored columns. */ +UNIV_INTERN +row_ext_t* +row_ext_create( +/*===========*/ + /* out,own: column prefix cache */ + ulint n_ext, /* in: number of externally stored columns */ + const ulint* ext, /* in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + const dtuple_t* tuple, /* in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). 
*/ + ulint zip_size,/* compressed page size in bytes, or 0 */ + mem_heap_t* heap) /* in: heap where created */ +{ + ulint i; + row_ext_t* ret = mem_heap_alloc(heap, (sizeof *ret) + + (n_ext - 1) * sizeof ret->len); + + ut_ad(ut_is_2pow(zip_size)); + ut_ad(zip_size <= UNIV_PAGE_SIZE); + + ret->n_ext = n_ext; + ret->ext = ext; + ret->buf = mem_heap_alloc(heap, n_ext * REC_MAX_INDEX_COL_LEN); +#ifdef UNIV_DEBUG + memset(ret->buf, 0xaa, n_ext * REC_MAX_INDEX_COL_LEN); + UNIV_MEM_ALLOC(ret->buf, n_ext * REC_MAX_INDEX_COL_LEN); +#endif + + /* Fetch the BLOB prefixes */ + for (i = 0; i < n_ext; i++) { + const dfield_t* dfield; + + dfield = dtuple_get_nth_field(tuple, ext[i]); + row_ext_cache_fill(ret, i, zip_size, dfield); + } + + return(ret); +} diff --git a/storage/xtradb/row/row0ins.c b/storage/xtradb/row/row0ins.c new file mode 100644 index 00000000000..ceb0f7d75a2 --- /dev/null +++ b/storage/xtradb/row/row0ins.c @@ -0,0 +1,2558 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Insert into a table + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0ins.h" + +#ifdef UNIV_NONINL +#include "row0ins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0row.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "log0log.h" +#include "eval0eval.h" +#include "data0data.h" +#include "usr0sess.h" +#include "buf0lru.h" + +#define ROW_INS_PREV 1 +#define ROW_INS_NEXT 2 + + +/********************************************************************* +This prototype is copied from /mysql/sql/ha_innodb.cc. +Invalidates the MySQL query cache for the table. +NOTE that the exact prototype of this function has to be in +/xtradb/row/row0ins.c! */ +extern +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /* in: transaction which modifies the table */ + char* full_name, /* in: concatenation of database name, null + char '\0', table name, null char'\0'; + NOTE that in Windows this is always + in LOWER CASE! */ + ulint full_name_len); /* in: full name length where also the null + chars count */ + +/************************************************************************* +Creates an insert node struct. */ +UNIV_INTERN +ins_node_t* +ins_node_create( +/*============*/ + /* out, own: insert node struct */ + ulint ins_type, /* in: INS_VALUES, ... 
*/ + dict_table_t* table, /* in: table where to insert */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + ins_node_t* node; + + node = mem_heap_alloc(heap, sizeof(ins_node_t)); + + node->common.type = QUE_NODE_INSERT; + + node->ins_type = ins_type; + + node->state = INS_NODE_SET_IX_LOCK; + node->table = table; + node->index = NULL; + node->entry = NULL; + + node->select = NULL; + + node->trx_id = ut_dulint_zero; + + node->entry_sys_heap = mem_heap_create(128); + + node->magic_n = INS_NODE_MAGIC_N; + + return(node); +} + +/*************************************************************** +Creates an entry template for each index of a table. */ +UNIV_INTERN +void +ins_node_create_entry_list( +/*=======================*/ + ins_node_t* node) /* in: row insert node */ +{ + dict_index_t* index; + dtuple_t* entry; + + ut_ad(node->entry_sys_heap); + + UT_LIST_INIT(node->entry_list); + + index = dict_table_get_first_index(node->table); + + while (index != NULL) { + entry = row_build_index_entry(node->row, NULL, index, + node->entry_sys_heap); + UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry); + + index = dict_table_get_next_index(index); + } +} + +/********************************************************************* +Adds system field buffers to a row. */ +static +void +row_ins_alloc_sys_fields( +/*=====================*/ + ins_node_t* node) /* in: insert node */ +{ + dtuple_t* row; + dict_table_t* table; + mem_heap_t* heap; + const dict_col_t* col; + dfield_t* dfield; + byte* ptr; + + row = node->row; + table = node->table; + heap = node->entry_sys_heap; + + ut_ad(row && table && heap); + ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table)); + + /* 1. Allocate buffer for row id */ + + col = dict_table_get_sys_col(table, DATA_ROW_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + + ptr = mem_heap_alloc(heap, DATA_ROW_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN); + + node->row_id_buf = ptr; + + /* 3. Allocate buffer for trx id */ + + col = dict_table_get_sys_col(table, DATA_TRX_ID); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + + dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN); + + node->trx_id_buf = ptr; + + /* 4. Allocate buffer for roll ptr */ + + col = dict_table_get_sys_col(table, DATA_ROLL_PTR); + + dfield = dtuple_get_nth_field(row, dict_col_get_no(col)); + ptr = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + + dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN); +} + +/************************************************************************* +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. 
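+
+[Editorial aside, not part of the original patch: the reset below relies
+on the mem_heap arena idiom, in which mem_heap_empty() invalidates every
+earlier allocation at once, which is why the entry list and the system
+field buffers must all be rebuilt afterwards. A toy single-block sketch
+of the idiom; the real heap chains and grows blocks, and all _sketch
+names are hypothetical.] */
+#if 0	/* illustrative only; deliberately compiled out */
+#include <stddef.h>
+
+typedef struct {
+	unsigned char*	base;	/* backing block */
+	size_t		cap;	/* capacity in bytes */
+	size_t		used;	/* bump pointer */
+} arena_sketch_t;
+
+static void*
+arena_alloc_sketch(arena_sketch_t* a, size_t n)
+{
+	void*	p = a->base + a->used;
+
+	a->used += n;		/* sketch: no capacity check */
+	return(p);
+}
+
+/* mem_heap_empty(): every pointer handed out so far becomes invalid. */
+static void
+arena_empty_sketch(arena_sketch_t* a)
+{
+	a->used = 0;
+}
+#endif
+
+/*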
*/ +UNIV_INTERN +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /* in: insert node */ + dtuple_t* row) /* in: new row (or first row) for the node */ +{ + node->state = INS_NODE_SET_IX_LOCK; + node->index = NULL; + node->entry = NULL; + + node->row = row; + + mem_heap_empty(node->entry_sys_heap); + + /* Create templates for index entries */ + + ins_node_create_entry_list(node); + + /* Allocate from entry_sys_heap buffers for sys fields */ + + row_ins_alloc_sys_fields(node); + + /* As we allocated a new trx id buf, the trx id should be written + there again: */ + + node->trx_id = ut_dulint_zero; +} + +/*********************************************************************** +Does an insert operation by updating a delete-marked existing record +in the index. This situation can occur if the delete-marked record is +kept in the index for consistent reads. */ +static +ulint +row_ins_sec_index_entry_by_modify( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /* in: B-tree cursor */ + const dtuple_t* entry, /* in: index entry to insert */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ +{ + big_rec_t* dummy_big_rec; + mem_heap_t* heap; + upd_t* update; + rec_t* rec; + ulint err; + + rec = btr_cur_get_rec(cursor); + + ut_ad(!dict_index_is_clust(cursor->index)); + ut_ad(rec_get_deleted_flag(rec, + dict_table_is_comp(cursor->index->table))); + + /* We know that in the alphabetical ordering, entry and rec are + identified. But in their binary form there may be differences if + there are char fields in them. Therefore we have to calculate the + difference. */ + + heap = mem_heap_create(1024); + + update = row_upd_build_sec_rec_difference_binary( + cursor->index, entry, rec, thr_get_trx(thr), heap); + if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor, + update, 0, thr, mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + + goto func_exit; + } + + err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor, + &heap, &dummy_big_rec, update, + 0, thr, mtr); + ut_ad(!dummy_big_rec); + } +func_exit: + mem_heap_free(heap); + + return(err); +} + +/*********************************************************************** +Does an insert operation by delete unmarking and updating a delete marked +existing record in the index. This situation can occur if the delete marked +record is kept in the index for consistent reads. 
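+
+[Editorial aside, not part of the original patch: like the secondary-index
+variant above, this function first attempts a cheap in-page (optimistic)
+update and folds every "does not fit" outcome into DB_FAIL so the caller
+can retry with the tree-latched (pessimistic) path. A minimal sketch of
+that control flow with hypothetical names and error codes.] */
+#if 0	/* illustrative only; deliberately compiled out */
+enum sk_err { SK_SUCCESS, SK_FAIL, SK_OVERFLOW, SK_UNDERFLOW };
+
+static enum sk_err
+update_retry_sketch(enum sk_err (*optimistic)(void),
+		    enum sk_err (*pessimistic)(void))
+{
+	switch (optimistic()) {
+	case SK_SUCCESS:
+		return(SK_SUCCESS);
+	case SK_OVERFLOW:	/* record would not fit in the page */
+	case SK_UNDERFLOW:	/* page would become too empty */
+	case SK_FAIL:
+		break;		/* escalate to the slow path */
+	}
+	return(pessimistic());
+}
+#endif
+
+/*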
*/ +static +ulint +row_ins_clust_index_entry_by_modify( +/*================================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether mtr holds just a leaf + latch or also a tree latch */ + btr_cur_t* cursor, /* in: B-tree cursor */ + mem_heap_t** heap, /* in/out: pointer to memory heap, or NULL */ + big_rec_t** big_rec,/* out: possible big rec vector of fields + which have to be stored externally by the + caller */ + const dtuple_t* entry, /* in: index entry to insert */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ +{ + rec_t* rec; + upd_t* update; + ulint err; + + ut_ad(dict_index_is_clust(cursor->index)); + + *big_rec = NULL; + + rec = btr_cur_get_rec(cursor); + + ut_ad(rec_get_deleted_flag(rec, + dict_table_is_comp(cursor->index->table))); + + if (!*heap) { + *heap = mem_heap_create(1024); + } + + /* Build an update vector containing all the fields to be modified; + NOTE that this vector may NOT contain system columns trx_id or + roll_ptr */ + + update = row_upd_build_difference_binary(cursor->index, entry, rec, + thr_get_trx(thr), *heap); + if (mode == BTR_MODIFY_LEAF) { + /* Try optimistic updating of the record, keeping changes + within the page */ + + err = btr_cur_optimistic_update(0, cursor, update, 0, thr, + mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + + } + err = btr_cur_pessimistic_update(0, cursor, + heap, big_rec, update, + 0, thr, mtr); + } + + return(err); +} + +/************************************************************************* +Returns TRUE if in a cascaded update/delete an ancestor node of node +updates (not DELETE, but UPDATE) table. */ +static +ibool +row_ins_cascade_ancestor_updates_table( +/*===================================*/ + /* out: TRUE if an ancestor updates table */ + que_node_t* node, /* in: node in a query graph */ + dict_table_t* table) /* in: table */ +{ + que_node_t* parent; + upd_node_t* upd_node; + + parent = que_node_get_parent(node); + + while (que_node_get_type(parent) == QUE_NODE_UPDATE) { + + upd_node = parent; + + if (upd_node->table == table && upd_node->is_delete == FALSE) { + + return(TRUE); + } + + parent = que_node_get_parent(parent); + + ut_a(parent); + } + + return(FALSE); +} + +/************************************************************************* +Returns the number of ancestor UPDATE or DELETE nodes of a +cascaded update/delete node. */ +static +ulint +row_ins_cascade_n_ancestors( +/*========================*/ + /* out: number of ancestors */ + que_node_t* node) /* in: node in a query graph */ +{ + que_node_t* parent; + ulint n_ancestors = 0; + + parent = que_node_get_parent(node); + + while (que_node_get_type(parent) == QUE_NODE_UPDATE) { + n_ancestors++; + + parent = que_node_get_parent(parent); + + ut_a(parent); + } + + return(n_ancestors); +} + +/********************************************************************** +Calculates the update vector node->cascade->update for a child table in +a cascaded update. 
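+
+[Editorial aside, not part of the original patch: the two helpers above
+walk que_node_get_parent() chains; the caller below uses them to refuse
+cyclic cascaded updates and to cap the cascade depth at 15. The same walk
+over a plain parent-linked structure, as a sketch with hypothetical
+names:] */
+#if 0	/* illustrative only; deliberately compiled out */
+typedef struct node_sketch {
+	struct node_sketch*	parent;		/* NULL at the root */
+	int			is_update;	/* QUE_NODE_UPDATE? */
+} node_sketch_t;
+
+/* Count consecutive UPDATE ancestors, as row_ins_cascade_n_ancestors()
+does in the query graph (which asserts instead of stopping at NULL). */
+static unsigned
+n_update_ancestors_sketch(const node_sketch_t* node)
+{
+	unsigned		n = 0;
+	const node_sketch_t*	p = node->parent;
+
+	while (p != NULL && p->is_update) {
+		n++;
+		p = p->parent;
+	}
+	return(n);
+}
+#endif
+
+/*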
*/ +static +ulint +row_ins_cascade_calc_update_vec( +/*============================*/ + /* out: number of fields in the + calculated update vector; the value + can also be 0 if no foreign key + fields changed; the returned value + is ULINT_UNDEFINED if the column + type in the child table is too short + to fit the new value in the parent + table: that means the update fails */ + upd_node_t* node, /* in: update node of the parent + table */ + dict_foreign_t* foreign, /* in: foreign key constraint whose + type is != 0 */ + mem_heap_t* heap) /* in: memory heap to use as + temporary storage */ +{ + upd_node_t* cascade = node->cascade_node; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index = foreign->foreign_index; + upd_t* update; + upd_field_t* ufield; + dict_table_t* parent_table; + dict_index_t* parent_index; + upd_t* parent_update; + upd_field_t* parent_ufield; + ulint n_fields_updated; + ulint parent_field_no; + ulint i; + ulint j; + + ut_a(node); + ut_a(foreign); + ut_a(cascade); + ut_a(table); + ut_a(index); + + /* Calculate the appropriate update vector which will set the fields + in the child index record to the same value (possibly padded with + spaces if the column is a fixed length CHAR or FIXBINARY column) as + the referenced index record will get in the update. */ + + parent_table = node->table; + ut_a(parent_table == foreign->referenced_table); + parent_index = foreign->referenced_index; + parent_update = node->update; + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + + n_fields_updated = 0; + + for (i = 0; i < foreign->n_fields; i++) { + + parent_field_no = dict_table_get_nth_col_pos( + parent_table, + dict_index_get_nth_col_no(parent_index, i)); + + for (j = 0; j < parent_update->n_fields; j++) { + parent_ufield = parent_update->fields + j; + + if (parent_ufield->field_no == parent_field_no) { + + ulint min_size; + const dict_col_t* col; + ulint ufield_len; + + col = dict_index_get_nth_col(index, i); + + /* A field in the parent index record is + updated. Let us make the update vector + field for the child table. */ + + ufield = update->fields + n_fields_updated; + + ufield->field_no + = dict_table_get_nth_col_pos( + table, dict_col_get_no(col)); + ufield->exp = NULL; + + ufield->new_val = parent_ufield->new_val; + ufield_len = dfield_get_len(&ufield->new_val); + + /* Clear the "external storage" flag */ + dfield_set_len(&ufield->new_val, ufield_len); + + /* Do not allow a NOT NULL column to be + updated as NULL */ + + if (dfield_is_null(&ufield->new_val) + && (col->prtype & DATA_NOT_NULL)) { + + return(ULINT_UNDEFINED); + } + + /* If the new value would not fit in the + column, do not allow the update */ + + if (!dfield_is_null(&ufield->new_val) + && dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + col->len, + ufield_len, + dfield_get_data(&ufield->new_val)) + < ufield_len) { + + return(ULINT_UNDEFINED); + } + + /* If the parent column type has a different + length than the child column type, we may + need to pad with spaces the new value of the + child column */ + + min_size = dict_col_get_min_size(col); + + /* Because UNIV_SQL_NULL (the marker + of SQL NULL values) exceeds all possible + values of min_size, the test below will + not hold for SQL NULL columns. 
*/ + + if (min_size > ufield_len) { + + char* pad_start; + const char* pad_end; + char* padded_data + = mem_heap_alloc( + heap, min_size); + pad_start = padded_data + ufield_len; + pad_end = padded_data + min_size; + + memcpy(padded_data, + dfield_get_data(&ufield + ->new_val), + dfield_get_len(&ufield + ->new_val)); + + switch (UNIV_EXPECT(col->mbminlen,1)) { + default: + ut_error; + return(ULINT_UNDEFINED); + case 1: + if (UNIV_UNLIKELY + (dtype_get_charset_coll( + col->prtype) + == DATA_MYSQL_BINARY_CHARSET_COLL)) { + /* Do not pad BINARY + columns. */ + return(ULINT_UNDEFINED); + } + + /* space=0x20 */ + memset(pad_start, 0x20, + pad_end - pad_start); + break; + case 2: + /* space=0x0020 */ + ut_a(!(ufield_len % 2)); + ut_a(!(min_size % 2)); + do { + *pad_start++ = 0x00; + *pad_start++ = 0x20; + } while (pad_start < pad_end); + break; + } + + dfield_set_data(&ufield->new_val, + padded_data, min_size); + } + + n_fields_updated++; + } + } + } + + update->n_fields = n_fields_updated; + + return(n_fields_updated); +} + +/************************************************************************* +Set detailed error message associated with foreign key errors for +the given transaction. */ +static +void +row_ins_set_detailed( +/*=================*/ + trx_t* trx, /* in: transaction */ + dict_foreign_t* foreign) /* in: foreign key constraint */ +{ + mutex_enter(&srv_misc_tmpfile_mutex); + rewind(srv_misc_tmpfile); + + if (os_file_set_eof(srv_misc_tmpfile)) { + ut_print_name(srv_misc_tmpfile, trx, TRUE, + foreign->foreign_table_name); + dict_print_info_on_foreign_key_in_create_format( + srv_misc_tmpfile, trx, foreign, FALSE); + trx_set_detailed_error_from_file(trx, srv_misc_tmpfile); + } else { + trx_set_detailed_error(trx, "temp file operation failed"); + } + + mutex_exit(&srv_misc_tmpfile_mutex); +} + +/************************************************************************* +Reports a foreign key error associated with an update or a delete of a +parent table index entry. 
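+
+[Editorial aside, not part of the original patch: the cascade code above
+pads CHAR values out to the child column's minimum size with 0x20 bytes
+for charsets whose minimum character width is one byte, and with
+0x00 0x20 pairs for UCS-2. A standalone sketch of just that padding
+step; pad_spaces_sketch is hypothetical.] */
+#if 0	/* illustrative only; deliberately compiled out */
+#include <assert.h>
+#include <string.h>
+
+static void
+pad_spaces_sketch(unsigned char* buf, size_t len, size_t min_size,
+		  unsigned mbminlen)
+{
+	unsigned char*	p = buf + len;
+	unsigned char*	end = buf + min_size;
+
+	if (mbminlen == 1) {
+		memset(p, 0x20, end - p);	/* space = 0x20 */
+	} else {
+		assert(mbminlen == 2);
+		assert(!(len % 2) && !(min_size % 2));
+		while (p < end) {		/* space = 0x0020 */
+			*p++ = 0x00;
+			*p++ = 0x20;
+		}
+	}
+}
+#endif
+
+/*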
*/ +static +void +row_ins_foreign_report_err( +/*=======================*/ + const char* errstr, /* in: error string from the viewpoint + of the parent table */ + que_thr_t* thr, /* in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /* in: foreign key constraint */ + const rec_t* rec, /* in: a matching index record in the + child table */ + const dtuple_t* entry) /* in: index entry in the parent + table */ +{ + FILE* ef = dict_foreign_err_file; + trx_t* trx = thr_get_trx(thr); + + row_ins_set_detailed(trx, foreign); + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx, 600); + + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign, + TRUE); + putc('\n', ef); + fputs(errstr, ef); + fputs(" in parent table, in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->referenced_index->name); + if (entry) { + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + } + fputs("\nBut in child table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->foreign_index->name); + if (rec) { + fputs(", there is a record:\n", ef); + rec_print(ef, rec, foreign->foreign_index); + } else { + fputs(", the record is not available\n", ef); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/************************************************************************* +Reports a foreign key error to dict_foreign_err_file when we are trying +to add an index entry to a child table. Note that the adding may be the result +of an update, too. */ +static +void +row_ins_foreign_report_add_err( +/*===========================*/ + trx_t* trx, /* in: transaction */ + dict_foreign_t* foreign, /* in: foreign key constraint */ + const rec_t* rec, /* in: a record in the parent table: + it does not match entry because we + have an error! */ + const dtuple_t* entry) /* in: index entry to insert in the + child table */ +{ + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx, 600); + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign, + TRUE); + fputs("\nTrying to add in child table, in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->foreign_index->name); + if (entry) { + fputs(" tuple:\n", ef); + /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized. + It would be better to only display the user columns. */ + dtuple_print(ef, entry); + } + fputs("\nBut in parent table ", ef); + ut_print_name(ef, trx, TRUE, foreign->referenced_table_name); + fputs(", in index ", ef); + ut_print_name(ef, trx, FALSE, foreign->referenced_index->name); + fputs(",\nthe closest match we can find is record:\n", ef); + if (rec && page_rec_is_supremum(rec)) { + /* If the cursor ended on a supremum record, it is better + to report the previous record in the error message, so that + the user gets a more descriptive error message. 
*/ + rec = page_rec_get_prev_const(rec); + } + + if (rec) { + rec_print(ef, rec, foreign->referenced_index); + } + putc('\n', ef); + + mutex_exit(&dict_foreign_err_mutex); +} + +/************************************************************************* +Invalidate the query cache for the given table. */ +static +void +row_ins_invalidate_query_cache( +/*===========================*/ + que_thr_t* thr, /* in: query thread whose run_node + is an update node */ + const char* name) /* in: table name prefixed with + database name and a '/' character */ +{ + char* buf; + char* ptr; + ulint len = strlen(name) + 1; + + buf = mem_strdupl(name, len); + + ptr = strchr(buf, '/'); + ut_a(ptr); + *ptr = '\0'; + + /* We call a function in ha_innodb.cc */ +#ifndef UNIV_HOTBACKUP + innobase_invalidate_query_cache(thr_get_trx(thr), buf, len); +#endif + mem_free(buf); +} + +/************************************************************************* +Perform referential actions or checks when a parent row is deleted or updated +and the constraint had an ON DELETE or ON UPDATE condition which was not +RESTRICT. */ +static +ulint +row_ins_foreign_check_on_constraint( +/*================================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + or error code */ + que_thr_t* thr, /* in: query thread whose run_node + is an update node */ + dict_foreign_t* foreign, /* in: foreign key constraint whose + type is != 0 */ + btr_pcur_t* pcur, /* in: cursor placed on a matching + index record in the child table */ + dtuple_t* entry, /* in: index entry in the parent + table */ + mtr_t* mtr) /* in: mtr holding the latch of pcur + page */ +{ + upd_node_t* node; + upd_node_t* cascade; + dict_table_t* table = foreign->foreign_table; + dict_index_t* index; + dict_index_t* clust_index; + dtuple_t* ref; + mem_heap_t* upd_vec_heap = NULL; + const rec_t* rec; + const rec_t* clust_rec; + const buf_block_t* clust_block; + upd_t* update; + ulint n_to_update; + ulint err; + ulint i; + trx_t* trx; + mem_heap_t* tmp_heap = NULL; + + ut_a(thr); + ut_a(foreign); + ut_a(pcur); + ut_a(mtr); + + trx = thr_get_trx(thr); + + /* Since we are going to delete or update a row, we have to invalidate + the MySQL query cache for table. A deadlock of threads is not possible + here because the caller of this function does not hold any latches with + the sync0sync.h rank above the kernel mutex. The query cache mutex has + a rank just above the kernel mutex. */ + + row_ins_invalidate_query_cache(thr, table->name); + + node = thr->run_node; + + if (node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_DELETE_CASCADE + | DICT_FOREIGN_ON_DELETE_SET_NULL))) { + + row_ins_foreign_report_err("Trying to delete", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (!node->is_delete && 0 == (foreign->type + & (DICT_FOREIGN_ON_UPDATE_CASCADE + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* This is an UPDATE */ + + row_ins_foreign_report_err("Trying to update", + thr, foreign, + btr_pcur_get_rec(pcur), entry); + + return(DB_ROW_IS_REFERENCED); + } + + if (node->cascade_node == NULL) { + /* Extend our query graph by creating a child to current + update node. The child is used in the cascade or set null + operation. */ + + node->cascade_heap = mem_heap_create(128); + node->cascade_node = row_create_update_node_for_mysql( + table, node->cascade_heap); + que_node_set_parent(node->cascade_node, node); + } + + /* Initialize cascade_node to do the operation we want. 
Note that we + use the SAME cascade node to do all foreign key operations of the + SQL DELETE: the table of the cascade node may change if there are + several child tables to the table where the delete is done! */ + + cascade = node->cascade_node; + + cascade->table = table; + + cascade->foreign = foreign; + + if (node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) { + cascade->is_delete = TRUE; + } else { + cascade->is_delete = FALSE; + + if (foreign->n_fields > cascade->update_n_fields) { + /* We have to make the update vector longer */ + + cascade->update = upd_create(foreign->n_fields, + node->cascade_heap); + cascade->update_n_fields = foreign->n_fields; + } + } + + /* We do not allow cyclic cascaded updating (DELETE is allowed, + but not UPDATE) of the same table, as this can lead to an infinite + cycle. Check that we are not updating the same table which is + already being modified in this cascade chain. We have to check + this also because the modification of the indexes of a 'parent' + table may still be incomplete, and we must avoid seeing the indexes + of the parent table in an inconsistent state! */ + + if (!cascade->is_delete + && row_ins_cascade_ancestor_updates_table(cascade, table)) { + + /* We do not know if this would break foreign key + constraints, but play safe and return an error */ + + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying an update, possibly causing a cyclic" + " cascaded update\n" + "in the child table,", thr, foreign, + btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (row_ins_cascade_n_ancestors(cascade) >= 15) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a too deep cascaded delete or update\n", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + index = btr_pcur_get_btr_cur(pcur)->index; + + ut_a(index == foreign->foreign_index); + + rec = btr_pcur_get_rec(pcur); + + if (dict_index_is_clust(index)) { + /* pcur is already positioned in the clustered index of + the child table */ + + clust_index = index; + clust_rec = rec; + clust_block = btr_pcur_get_block(pcur); + } else { + /* We have to look for the record in the clustered index + in the child table */ + + clust_index = dict_table_get_first_index(table); + + tmp_heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, + tmp_heap); + btr_pcur_open_with_no_init(clust_index, ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + cascade->pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(cascade->pcur); + clust_block = btr_pcur_get_block(cascade->pcur); + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(cascade->pcur) + < dict_index_get_n_unique(clust_index)) { + + fputs("InnoDB: error in cascade of a foreign key op\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + fputs("\n" + "InnoDB: clustered record ", stderr); + rec_print(stderr, clust_rec, clust_index); + fputs("\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com\n", stderr); + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Set an X-lock on the row to delete or update in the child table */ + + err = lock_table(0, table, LOCK_IX, thr); + + if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + + err = 
lock_clust_rec_read_check_and_lock_alt( + 0, clust_block, clust_rec, clust_index, + LOCK_X, LOCK_REC_NOT_GAP, thr); + } + + if (err != DB_SUCCESS) { + + goto nonstandard_exit_func; + } + + if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) { + /* This can happen if there is a circular reference of + rows such that cascading delete comes to delete a row + already in the process of being delete marked */ + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + + if ((node->is_delete + && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)) + || (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + + /* Build the appropriate update vector which sets + foreign->n_fields first fields in rec to SQL NULL */ + + update = cascade->update; + + update->info_bits = 0; + update->n_fields = foreign->n_fields; + + for (i = 0; i < foreign->n_fields; i++) { + upd_field_t* ufield = &update->fields[i]; + + ufield->field_no = dict_table_get_nth_col_pos( + table, + dict_index_get_nth_col_no(index, i)); + ufield->orig_len = 0; + ufield->exp = NULL; + dfield_set_null(&ufield->new_val); + } + } + + if (!node->is_delete + && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) { + + /* Build the appropriate update vector which sets changing + foreign->n_fields first fields in rec to new values */ + + upd_vec_heap = mem_heap_create(256); + + n_to_update = row_ins_cascade_calc_update_vec(node, foreign, + upd_vec_heap); + if (n_to_update == ULINT_UNDEFINED) { + err = DB_ROW_IS_REFERENCED; + + row_ins_foreign_report_err( + "Trying a cascaded update where the" + " updated value in the child\n" + "table would not fit in the length" + " of the column, or the value would\n" + "be NULL and the column is" + " declared as not NULL in the child table,", + thr, foreign, btr_pcur_get_rec(pcur), entry); + + goto nonstandard_exit_func; + } + + if (cascade->update->n_fields == 0) { + + /* The update does not change any columns referred + to in this foreign key constraint: no need to do + anything */ + + err = DB_SUCCESS; + + goto nonstandard_exit_func; + } + } + + /* Store pcur position and initialize or store the cascade node + pcur stored position */ + + btr_pcur_store_position(pcur, mtr); + + if (index == clust_index) { + btr_pcur_copy_stored_position(cascade->pcur, pcur); + } else { + btr_pcur_store_position(cascade->pcur, mtr); + } + + mtr_commit(mtr); + + ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON); + + cascade->state = UPD_NODE_UPDATE_CLUSTERED; + + err = row_update_cascade_for_mysql(thr, cascade, + foreign->foreign_table); + + if (foreign->foreign_table->n_foreign_key_checks_running == 0) { + fprintf(stderr, + "InnoDB: error: table %s has the counter 0" + " though there is\n" + "InnoDB: a FOREIGN KEY check running on it.\n", + foreign->foreign_table->name); + } + + /* Release the data dictionary latch for a while, so that we do not + starve other threads from doing CREATE TABLE etc. if we have a huge + cascaded operation running. The counter n_foreign_key_checks_running + will prevent other users from dropping or ALTERing the table when we + release the latch. 
*/
+
+	row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+	row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+	mtr_start(mtr);
+
+	/* Restore pcur position */
+
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	if (upd_vec_heap) {
+		mem_heap_free(upd_vec_heap);
+	}
+
+	return(err);
+
+nonstandard_exit_func:
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	if (upd_vec_heap) {
+		mem_heap_free(upd_vec_heap);
+	}
+
+	btr_pcur_store_position(pcur, mtr);
+
+	mtr_commit(mtr);
+	mtr_start(mtr);
+
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+	return(err);
+}
+
+/*************************************************************************
+Sets a shared lock on a record. Used in locking possible duplicate key
+records and also in checking foreign key constraints. */
+static
+ulint
+row_ins_set_shared_rec_lock(
+/*========================*/
+				/* out: DB_SUCCESS or error code */
+	ulint			type,	/* in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP type lock */
+	const buf_block_t*	block,	/* in: buffer block of rec */
+	const rec_t*		rec,	/* in: record */
+	dict_index_t*		index,	/* in: index */
+	const ulint*		offsets,/* in: rec_get_offsets(rec, index) */
+	que_thr_t*		thr)	/* in: query thread */
+{
+	ulint	err;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (dict_index_is_clust(index)) {
+		err = lock_clust_rec_read_check_and_lock(
+			0, block, rec, index, offsets, LOCK_S, type, thr);
+	} else {
+		err = lock_sec_rec_read_check_and_lock(
+			0, block, rec, index, offsets, LOCK_S, type, thr);
+	}
+
+	return(err);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************************
+Sets an exclusive lock on a record. Used in locking possible duplicate key
+records. */
+static
+ulint
+row_ins_set_exclusive_rec_lock(
+/*===========================*/
+				/* out: DB_SUCCESS or error code */
+	ulint			type,	/* in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP type lock */
+	const buf_block_t*	block,	/* in: buffer block of rec */
+	const rec_t*		rec,	/* in: record */
+	dict_index_t*		index,	/* in: index */
+	const ulint*		offsets,/* in: rec_get_offsets(rec, index) */
+	que_thr_t*		thr)	/* in: query thread */
+{
+	ulint	err;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (dict_index_is_clust(index)) {
+		err = lock_clust_rec_read_check_and_lock(
+			0, block, rec, index, offsets, LOCK_X, type, thr);
+	} else {
+		err = lock_sec_rec_read_check_and_lock(
+			0, block, rec, index, offsets, LOCK_X, type, thr);
+	}
+
+	return(err);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_operation_lock.
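+
+[Editorial aside, not part of the original patch: during the scan below,
+each visited record is locked only as strongly as needed: a next-key
+(ordinary) lock on the supremum or on a delete-marked match, a pure
+record lock on a live match, and a gap lock once the cursor has passed
+the key. A sketch of that decision with hypothetical names.] */
+#if 0	/* illustrative only; deliberately compiled out */
+enum sk_lock { SK_ORDINARY, SK_REC_NOT_GAP, SK_GAP };
+
+static enum sk_lock
+fk_scan_lock_type_sketch(int cmp, int rec_deleted, int rec_is_supremum)
+{
+	if (rec_is_supremum || (cmp == 0 && rec_deleted)) {
+		return(SK_ORDINARY);	/* next-key lock */
+	}
+	if (cmp == 0) {
+		return(SK_REC_NOT_GAP);	/* lock only the matching row */
+	}
+	return(SK_GAP);			/* cmp < 0: protect the gap */
+}
+#endif
+
+/*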
*/
+UNIV_INTERN
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+				/* out: DB_SUCCESS,
+				DB_NO_REFERENCED_ROW,
+				or DB_ROW_IS_REFERENCED */
+	ibool		check_ref,/* in: TRUE if we want to check that
+				the referenced table is ok, FALSE if we
+				want to check the foreign key table */
+	dict_foreign_t*	foreign,/* in: foreign constraint; NOTE that the
+				tables mentioned in it must be in the
+				dictionary cache if they exist at all */
+	dict_table_t*	table,	/* in: if check_ref is TRUE, then the foreign
+				table, else the referenced table */
+	dtuple_t*	entry,	/* in: index entry for index */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	upd_node_t*	upd_node;
+	dict_table_t*	check_table;
+	dict_index_t*	check_index;
+	ulint		n_fields_cmp;
+	btr_pcur_t	pcur;
+	ibool		moved;
+	int		cmp;
+	ulint		err;
+	ulint		i;
+	mtr_t		mtr;
+	trx_t*		trx		= thr_get_trx(thr);
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+run_again:
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	err = DB_SUCCESS;
+
+	if (trx->check_foreigns == FALSE) {
+		/* The user has suppressed foreign key checks currently for
+		this session */
+		goto exit_func;
+	}
+
+	/* If any of the foreign key fields in entry is SQL NULL, we
+	suppress the foreign key check: this is compatible with Oracle,
+	for example */
+
+	for (i = 0; i < foreign->n_fields; i++) {
+		if (UNIV_SQL_NULL == dfield_get_len(
+			    dtuple_get_nth_field(entry, i))) {
+
+			goto exit_func;
+		}
+	}
+
+	if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+		upd_node = thr->run_node;
+
+		if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
+			/* If a cascaded update is done as defined by a
+			foreign key constraint, do not check that
+			constraint for the child row. In ON UPDATE CASCADE
+			the update of the parent row is only half done when
+			we come here: if we checked the constraint here
+			for the child row it would fail.
+
+			A QUESTION remains: if in the child table there are
+			several constraints which refer to the same parent
+			table, we should merge all updates to the child as
+			one update? And the updates can be contradictory!
+			Currently we just perform the update associated
+			with each foreign key constraint, one after
+			another, and the user has problems predicting in
+			which order they are performed.
*/ + + goto exit_func; + } + } + + if (check_ref) { + check_table = foreign->referenced_table; + check_index = foreign->referenced_index; + } else { + check_table = foreign->foreign_table; + check_index = foreign->foreign_index; + } + + if (check_table == NULL || check_table->ibd_file_missing) { + if (check_ref) { + FILE* ef = dict_foreign_err_file; + + row_ins_set_detailed(trx, foreign); + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Transaction:\n", ef); + trx_print(ef, trx, 600); + fputs("Foreign key constraint fails for table ", ef); + ut_print_name(ef, trx, TRUE, + foreign->foreign_table_name); + fputs(":\n", ef); + dict_print_info_on_foreign_key_in_create_format( + ef, trx, foreign, TRUE); + fputs("\nTrying to add to index ", ef); + ut_print_name(ef, trx, FALSE, + foreign->foreign_index->name); + fputs(" tuple:\n", ef); + dtuple_print(ef, entry); + fputs("\nBut the parent table ", ef); + ut_print_name(ef, trx, TRUE, + foreign->referenced_table_name); + fputs("\nor its .ibd file does" + " not currently exist!\n", ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_NO_REFERENCED_ROW; + } + + goto exit_func; + } + + ut_a(check_table); + ut_a(check_index); + + if (check_table != table) { + /* We already have a LOCK_IX on table, but not necessarily + on check_table */ + + err = lock_table(0, check_table, LOCK_IS, thr); + + if (err != DB_SUCCESS) { + + goto do_possible_lock_wait; + } + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, foreign->n_fields); + + btr_pcur_open(check_index, entry, PAGE_CUR_GE, + BTR_SEARCH_LEAF, &pcur, &mtr); + + /* Scan index records and check if there is a matching record */ + + for (;;) { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + + if (page_rec_is_infimum(rec)) { + + goto next_rec; + } + + offsets = rec_get_offsets(rec, check_index, + offsets, ULINT_UNDEFINED, &heap); + + if (page_rec_is_supremum(rec)) { + + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block, + rec, check_index, + offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + + goto next_rec; + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (rec_get_deleted_flag(rec, + rec_offs_comp(offsets))) { + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, block, + rec, check_index, offsets, thr); + if (err != DB_SUCCESS) { + + break; + } + } else { + /* Found a matching record. Lock only + a record because we can allow inserts + into gaps */ + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, block, + rec, check_index, offsets, thr); + + if (err != DB_SUCCESS) { + + break; + } + + if (check_ref) { + err = DB_SUCCESS; + + break; + } else if (foreign->type != 0) { + /* There is an ON UPDATE or ON DELETE + condition: check them in a separate + function */ + + err = row_ins_foreign_check_on_constraint( + thr, foreign, &pcur, entry, + &mtr); + if (err != DB_SUCCESS) { + /* Since reporting a plain + "duplicate key" error + message to the user in + cases where a long CASCADE + operation would lead to a + duplicate key in some + other table is very + confusing, map duplicate + key errors resulting from + FK constraints to a + separate error code. 
*/
+
+					if (err == DB_DUPLICATE_KEY) {
+						err = DB_FOREIGN_DUPLICATE_KEY;
+					}
+
+					break;
+				}
+
+				/* row_ins_foreign_check_on_constraint
+				may have repositioned pcur on a
+				different block */
+				block = btr_pcur_get_block(&pcur);
+				} else {
+					row_ins_foreign_report_err(
+						"Trying to delete or update",
+						thr, foreign, rec, entry);
+
+					err = DB_ROW_IS_REFERENCED;
+					break;
+				}
+			}
+		}
+
+		if (cmp < 0) {
+			err = row_ins_set_shared_rec_lock(
+				LOCK_GAP, block,
+				rec, check_index, offsets, thr);
+			if (err != DB_SUCCESS) {
+
+				break;
+			}
+
+			if (check_ref) {
+				err = DB_NO_REFERENCED_ROW;
+				row_ins_foreign_report_add_err(
+					trx, foreign, rec, entry);
+			} else {
+				err = DB_SUCCESS;
+			}
+
+			break;
+		}
+
+		ut_a(cmp == 0);
+next_rec:
+		moved = btr_pcur_move_to_next(&pcur, &mtr);
+
+		if (!moved) {
+			if (check_ref) {
+				rec = btr_pcur_get_rec(&pcur);
+				row_ins_foreign_report_add_err(
+					trx, foreign, rec, entry);
+				err = DB_NO_REFERENCED_ROW;
+			} else {
+				err = DB_SUCCESS;
+			}
+
+			break;
+		}
+	}
+
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	/* Restore old value */
+	dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+	if (err == DB_LOCK_WAIT) {
+		trx->error_state = err;
+
+		que_thr_stop_for_mysql(thr);
+
+		srv_suspend_mysql_thread(thr);
+
+		if (trx->error_state == DB_SUCCESS) {
+
+			goto run_again;
+		}
+
+		err = trx->error_state;
+	}
+
+exit_func:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/*******************************************************************
+Checks if foreign key constraints fail for an index entry. If index
+is not mentioned in any constraint, this function does nothing.
+Otherwise, it searches the indexes of referenced tables and
+sets shared locks which lock either the success or the failure of
+a constraint. */
+static
+ulint
+row_ins_check_foreign_constraints(
+/*==============================*/
+				/* out: DB_SUCCESS or error code */
+	dict_table_t*	table,	/* in: table */
+	dict_index_t*	index,	/* in: index */
+	dtuple_t*	entry,	/* in: index entry for index */
+	que_thr_t*	thr)	/* in: query thread */
+{
+	dict_foreign_t*	foreign;
+	ulint		err;
+	trx_t*		trx;
+	ibool		got_s_lock	= FALSE;
+
+	trx = thr_get_trx(thr);
+
+	foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+	while (foreign) {
+		if (foreign->foreign_index == index) {
+
+			if (foreign->referenced_table == NULL) {
+				dict_table_get(foreign->referenced_table_name,
+					       FALSE);
+			}
+
+			if (0 == trx->dict_operation_lock_mode) {
+				got_s_lock = TRUE;
+
+				row_mysql_freeze_data_dictionary(trx);
+			}
+
+			if (foreign->referenced_table) {
+				mutex_enter(&(dict_sys->mutex));
+
+				(foreign->referenced_table
+				 ->n_foreign_key_checks_running)++;
+
+				mutex_exit(&(dict_sys->mutex));
+			}
+
+			/* NOTE that if the thread ends up waiting for a lock
+			we will release dict_operation_lock temporarily!
+			But the counter on the table protects the referenced
+			table from being dropped while the check is running.
*/ + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, entry, thr); + + if (foreign->referenced_table) { + mutex_enter(&(dict_sys->mutex)); + + ut_a(foreign->referenced_table + ->n_foreign_key_checks_running > 0); + (foreign->referenced_table + ->n_foreign_key_checks_running)--; + + mutex_exit(&(dict_sys->mutex)); + } + + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + if (err != DB_SUCCESS) { + return(err); + } + } + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + return(DB_SUCCESS); +} + +#ifndef UNIV_HOTBACKUP +/******************************************************************* +Checks if a unique key violation to rec would occur at the index entry +insert. */ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + /* out: TRUE if error */ + const rec_t* rec, /* in: user record; NOTE that we assume + that the caller already has a record lock on + the record! */ + const dtuple_t* entry, /* in: entry to insert */ + dict_index_t* index, /* in: index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint matched_fields; + ulint matched_bytes; + ulint n_unique; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + matched_bytes = 0; + + cmp_dtuple_rec_with_match(entry, rec, offsets, + &matched_fields, &matched_bytes); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!dict_index_is_clust(index)) { + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); +} +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************************* +Scans a unique non-clustered index at a given index entry to determine +whether a uniqueness violation has occurred for the key value of the entry. +Set shared locks on possible duplicate records. 
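+
+For example, with UNIQUE KEY (a,b), an existing entry (1, NULL) does
+not conflict with inserting another (1, NULL): we define NULL != NULL
+in this case, so the scan below is skipped entirely whenever any of
+the n_unique first fields of the tuple is NULL.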
*/ +static +ulint +row_ins_scan_sec_index_for_duplicate( +/*=================================*/ + /* out: DB_SUCCESS, DB_DUPLICATE_KEY, or + DB_LOCK_WAIT */ + dict_index_t* index, /* in: non-clustered unique index */ + dtuple_t* entry, /* in: index entry */ + que_thr_t* thr) /* in: query thread */ +{ +#ifndef UNIV_HOTBACKUP + ulint n_unique; + ulint i; + int cmp; + ulint n_fields_cmp; + btr_pcur_t pcur; + ulint err = DB_SUCCESS; + unsigned allow_duplicates; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique(index); + + /* If the secondary index is unique, but one of the fields in the + n_unique first fields is NULL, a unique key violation cannot occur, + since we define NULL != NULL in this case */ + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(DB_SUCCESS); + } + } + + mtr_start(&mtr); + + /* Store old value on n_fields_cmp */ + + n_fields_cmp = dtuple_get_n_fields_cmp(entry); + + dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index)); + + btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); + + allow_duplicates = thr_get_trx(thr)->duplicates & TRX_DUP_IGNORE; + + /* Scan index records and check if there is a duplicate */ + + do { + const rec_t* rec = btr_pcur_get_rec(&pcur); + const buf_block_t* block = btr_pcur_get_block(&pcur); + + if (page_rec_is_infimum(rec)) { + + continue; + } + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (allow_duplicates) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_ORDINARY, block, + rec, index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, block, + rec, index, offsets, thr); + } + + if (err != DB_SUCCESS) { + + break; + } + + if (page_rec_is_supremum(rec)) { + + continue; + } + + cmp = cmp_dtuple_rec(entry, rec, offsets); + + if (cmp == 0) { + if (row_ins_dupl_error_with_rec(rec, entry, + index, offsets)) { + err = DB_DUPLICATE_KEY; + + thr_get_trx(thr)->error_info = index; + + break; + } + } + + if (cmp < 0) { + break; + } + + ut_a(cmp == 0); + } while (btr_pcur_move_to_next(&pcur, &mtr)); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + mtr_commit(&mtr); + + /* Restore old value */ + dtuple_set_n_fields_cmp(entry, n_fields_cmp); + + return(err); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; + return(DB_FAIL); +#endif /* UNIV_HOTBACKUP */ +} + +/******************************************************************* +Checks if a unique key violation error would occur at an index entry +insert. Sets shared locks on possible duplicate records. Works only +for a clustered index! 
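+(Unique secondary indexes are handled by
+row_ins_scan_sec_index_for_duplicate() above. In the clustered index
+the only possible duplicates are the record that low_match points to
+and its successor, so no scan is needed here.)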
*/ +static +ulint +row_ins_duplicate_error_in_clust( +/*=============================*/ + /* out: DB_SUCCESS if no error, + DB_DUPLICATE_KEY if error, DB_LOCK_WAIT if we + have to wait for a lock on a possible + duplicate record */ + btr_cur_t* cursor, /* in: B-tree cursor */ + dtuple_t* entry, /* in: entry to insert */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ +#ifndef UNIV_HOTBACKUP + ulint err; + rec_t* rec; + ulint n_unique; + trx_t* trx = thr_get_trx(thr); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + UT_NOT_USED(mtr); + + ut_a(dict_index_is_clust(cursor->index)); + ut_ad(dict_index_is_unique(cursor->index)); + + /* NOTE: For unique non-clustered indexes there may be any number + of delete marked records with the same value for the non-clustered + index key (remember multiversioning), and which differ only in + the row reference part of the index record, containing the + clustered index key fields. For such a secondary index record, + to avoid a race condition, we must FIRST do the insertion and after + that check that the uniqueness condition is not breached! */ + + /* NOTE: A problem is that in the B-tree node pointers on an + upper level may match more to the entry than the actual existing + user records on the leaf level. So, even if low_match would suggest + that a duplicate key violation may occur, this may not be the case. */ + + n_unique = dict_index_get_n_unique(cursor->index); + + if (cursor->low_match >= n_unique) { + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_infimum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + /* We set a lock on the possible duplicate: this + is needed in logical logging of MySQL to make + sure that in roll-forward we get the same duplicate + errors as in original execution */ + + if (trx->duplicates & TRX_DUP_IGNORE) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE). */ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), rec, + cursor->index, offsets, thr); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index, offsets)) { + trx->error_info = cursor->index; + err = DB_DUPLICATE_KEY; + goto func_exit; + } + } + } + + if (cursor->up_match >= n_unique) { + + rec = page_rec_get_next(btr_cur_get_rec(cursor)); + + if (!page_rec_is_supremum(rec)) { + offsets = rec_get_offsets(rec, cursor->index, offsets, + ULINT_UNDEFINED, &heap); + + if (trx->duplicates & TRX_DUP_IGNORE) { + + /* If the SQL-query will update or replace + duplicate key we will take X-lock for + duplicates ( REPLACE, LOAD DATAFILE REPLACE, + INSERT ON DUPLICATE KEY UPDATE).
*/ + + err = row_ins_set_exclusive_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } else { + + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + btr_cur_get_block(cursor), + rec, cursor->index, offsets, thr); + } + + if (err != DB_SUCCESS) { + goto func_exit; + } + + if (row_ins_dupl_error_with_rec( + rec, entry, cursor->index, offsets)) { + trx->error_info = cursor->index; + err = DB_DUPLICATE_KEY; + goto func_exit; + } + } + + ut_a(!dict_index_is_clust(cursor->index)); + /* This should never happen */ + } + + err = DB_SUCCESS; +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; + return(DB_FAIL); +#endif /* UNIV_HOTBACKUP */ +} + +/******************************************************************* +Checks if an index entry has long enough common prefix with an existing +record so that the intended insert of the entry must be changed to a modify of +the existing record. In the case of a clustered index, the prefix must be +n_unique fields long, and in the case of a secondary index, all fields must be +equal. */ +UNIV_INLINE +ulint +row_ins_must_modify( +/*================*/ + /* out: 0 if no update, ROW_INS_PREV if + previous should be updated; currently we + do the search so that only the low_match + record can match enough to the search tuple, + not the next record */ + btr_cur_t* cursor) /* in: B-tree cursor */ +{ + ulint enough_match; + rec_t* rec; + + /* NOTE: (compare to the note in row_ins_duplicate_error) Because node + pointers on upper levels of the B-tree may match more to entry than + to actual user records on the leaf level, we have to check if the + candidate record is actually a user record. In a clustered index + node pointers contain index->n_unique first fields, and in the case + of a secondary index, all fields of the index. */ + + enough_match = dict_index_get_n_unique_in_tree(cursor->index); + + if (cursor->low_match >= enough_match) { + + rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_infimum(rec)) { + + return(ROW_INS_PREV); + } + } + + return(0); +} + +/******************************************************************* +Tries to insert an index entry to an index. If the index is clustered +and a record with the same unique key is found, the other record is +necessarily marked deleted by a committed transaction, or a unique key +violation error occurs. The delete marked record is then updated to an +existing record, and we must write an undo log record on the delete +marked record. If the index is secondary, and a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. 
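+
+In outline, the flow below is:
+
+	search the B-tree down to a leaf (PAGE_CUR_LE, with a buffered
+	insert if possible);
+	if a unique index matches on all n_unique fields, check for
+	duplicates (row_ins_duplicate_error_in_clust() for a clustered
+	index, row_ins_scan_sec_index_for_duplicate() for a secondary
+	index);
+	if row_ins_must_modify(), convert the insert into a modify of
+	the matching record;
+	otherwise do btr_cur_optimistic_insert() or
+	btr_cur_pessimistic_insert(), depending on mode.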
*/ +static +ulint +row_ins_index_entry_low( +/*====================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL + if pessimistic retry needed, or error code */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + ulint n_ext, /* in: number of externally stored columns */ + que_thr_t* thr) /* in: query thread */ +{ + btr_cur_t cursor; + ulint ignore_sec_unique = 0; + ulint modify = 0; /* remove warning */ + rec_t* insert_rec; + rec_t* rec; + ulint err; + ulint n_unique; + big_rec_t* big_rec = NULL; + mtr_t mtr; + mem_heap_t* heap = NULL; + + log_free_check(); + + mtr_start(&mtr); + + cursor.thr = thr; + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (!(thr_get_trx(thr)->check_unique_secondary)) { + ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE; + } + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + mode | BTR_INSERT | ignore_sec_unique, + &cursor, 0, &mtr); + + if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { + /* The insertion was made to the insert buffer already during + the search: we are done */ + + err = DB_SUCCESS; + + goto function_exit; + } + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } +#endif + + n_unique = dict_index_get_n_unique(index); + + if (dict_index_is_unique(index) && (cursor.up_match >= n_unique + || cursor.low_match >= n_unique)) { + + if (dict_index_is_clust(index)) { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust( + &cursor, entry, thr, &mtr); + if (err != DB_SUCCESS) { + + goto function_exit; + } + } else { + mtr_commit(&mtr); + err = row_ins_scan_sec_index_for_duplicate( + index, entry, thr); + mtr_start(&mtr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion. 
*/ + + btr_cur_search_to_nth_level(index, 0, entry, + PAGE_CUR_LE, + mode | BTR_INSERT, + &cursor, 0, &mtr); + } + } + + modify = row_ins_must_modify(&cursor); + + if (modify != 0) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + + if (modify == ROW_INS_NEXT) { + rec = page_rec_get_next(btr_cur_get_rec(&cursor)); + + btr_cur_position(index, rec, + btr_cur_get_block(&cursor),&cursor); + } + + if (dict_index_is_clust(index)) { + err = row_ins_clust_index_entry_by_modify( + mode, &cursor, &heap, &big_rec, entry, + thr, &mtr); + } else { + ut_ad(!n_ext); + err = row_ins_sec_index_entry_by_modify( + mode, &cursor, entry, thr, &mtr); + } + } else { + if (mode == BTR_MODIFY_LEAF) { + err = btr_cur_optimistic_insert( + 0, &cursor, entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } else { + ut_a(mode == BTR_MODIFY_TREE); + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + + goto function_exit; + } + err = btr_cur_pessimistic_insert( + 0, &cursor, entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } + } + +function_exit: + mtr_commit(&mtr); + + if (UNIV_LIKELY_NULL(big_rec)) { + rec_t* rec; + ulint* offsets; + mtr_start(&mtr); + + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, NULL, + ULINT_UNDEFINED, &heap); + + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr); + + if (modify) { + dtuple_big_rec_free(big_rec); + } else { + dtuple_convert_back_big_rec(index, entry, big_rec); + } + + mtr_commit(&mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/******************************************************************* +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. */ +UNIV_INTERN +ulint +row_ins_index_entry( +/*================*/ + /* out: DB_SUCCESS, DB_LOCK_WAIT, + DB_DUPLICATE_KEY, or some other error code */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to insert */ + ulint n_ext, /* in: number of externally stored columns */ + ibool foreign,/* in: TRUE=check foreign key constraints */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) { + err = row_ins_check_foreign_constraints(index->table, index, + entry, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } + + /* Try first optimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, + n_ext, thr); + if (err != DB_FAIL) { + + return(err); + } + + /* Try then pessimistic descent to the B-tree */ + + err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, + n_ext, thr); + return(err); +} + +/*************************************************************** +Sets the values of the dtuple fields in entry from the values of appropriate +columns in row. 
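+For a column prefix index such as INDEX(c(10)), only the first 10
+characters of the column value are copied; the length is computed in
+characters, not bytes (see dtype_get_at_most_n_mbchars() below).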
*/ +static +void +row_ins_index_entry_set_vals( +/*=========================*/ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry to make */ + const dtuple_t* row) /* in: row */ +{ + ulint n_fields; + ulint i; + + ut_ad(entry && row); + + n_fields = dtuple_get_n_fields(entry); + + for (i = 0; i < n_fields; i++) { + dict_field_t* ind_field; + dfield_t* field; + const dfield_t* row_field; + ulint len; + + field = dtuple_get_nth_field(entry, i); + ind_field = dict_index_get_nth_field(index, i); + row_field = dtuple_get_nth_field(row, ind_field->col->ind); + len = dfield_get_len(row_field); + + /* Check column prefix indexes */ + if (ind_field->prefix_len > 0 + && dfield_get_len(row_field) != UNIV_SQL_NULL) { + + const dict_col_t* col + = dict_field_get_col(ind_field); + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ind_field->prefix_len, + len, dfield_get_data(row_field)); + + ut_ad(!dfield_is_ext(row_field)); + } + + dfield_set_data(field, dfield_get_data(row_field), len); + if (dfield_is_ext(row_field)) { + ut_ad(dict_index_is_clust(index)); + dfield_set_ext(field); + } + } +} + +/*************************************************************** +Inserts a single index entry to the table. */ +static +ulint +row_ins_index_entry_step( +/*=====================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + ins_node_t* node, /* in: row insert node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(dtuple_check_typed(node->row)); + + row_ins_index_entry_set_vals(node->index, node->entry, node->row); + + ut_ad(dtuple_check_typed(node->entry)); + + err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr); + + return(err); +} + +/*************************************************************** +Allocates a row id for row and inits the node->index field. */ +UNIV_INLINE +void +row_ins_alloc_row_id_step( +/*======================*/ + ins_node_t* node) /* in: row insert node */ +{ + dulint row_id; + + ut_ad(node->state == INS_NODE_ALLOC_ROW_ID); + + if (dict_index_is_unique(dict_table_get_first_index(node->table))) { + + /* No row id is stored if the clustered index is unique */ + + return; + } + + /* Fill in row id value to row */ + + row_id = dict_sys_get_new_row_id(); + + dict_sys_write_row_id(node->row_id_buf, row_id); +} + +/*************************************************************** +Gets a row to insert from the values list. */ +UNIV_INLINE +void +row_ins_get_row_from_values( +/*========================*/ + ins_node_t* node) /* in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->values_list; + + while (list_node) { + eval_exp(list_node); + + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/*************************************************************** +Gets a row to insert from the select list. 
*/ +UNIV_INLINE +void +row_ins_get_row_from_select( +/*========================*/ + ins_node_t* node) /* in: row insert node */ +{ + que_node_t* list_node; + dfield_t* dfield; + dtuple_t* row; + ulint i; + + /* The field values are copied in the buffers of the select node and + it is safe to use them until we fetch from select again: therefore + we can just copy the pointers */ + + row = node->row; + + i = 0; + list_node = node->select->select_list; + + while (list_node) { + dfield = dtuple_get_nth_field(row, i); + dfield_copy_data(dfield, que_node_get_val(list_node)); + + i++; + list_node = que_node_get_next(list_node); + } +} + +/*************************************************************** +Inserts a row to a table. */ +static +ulint +row_ins( +/*====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + ins_node_t* node, /* in: row insert node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(node && thr); + + if (node->state == INS_NODE_ALLOC_ROW_ID) { + + row_ins_alloc_row_id_step(node); + + node->index = dict_table_get_first_index(node->table); + node->entry = UT_LIST_GET_FIRST(node->entry_list); + + if (node->ins_type == INS_SEARCHED) { + + row_ins_get_row_from_select(node); + + } else if (node->ins_type == INS_VALUES) { + + row_ins_get_row_from_values(node); + } + + node->state = INS_NODE_INSERT_ENTRIES; + } + + ut_ad(node->state == INS_NODE_INSERT_ENTRIES); + + while (node->index != NULL) { + err = row_ins_index_entry_step(node, thr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + } + + ut_ad(node->entry == NULL); + + node->state = INS_NODE_ALLOC_ROW_ID; + + return(DB_SUCCESS); +} + +/*************************************************************** +Inserts a row to a table. This is a high-level function used in SQL execution +graphs. */ +UNIV_INTERN +que_thr_t* +row_ins_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ins_node_t* node; + que_node_t* parent; + sel_node_t* sel_node; + trx_t* trx; + ulint err; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started(trx); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_INSERT); + + parent = que_node_get_parent(node); + sel_node = node->select; + + if (thr->prev_node == parent) { + node->state = INS_NODE_SET_IX_LOCK; + } + + /* If this is the first time this node is executed (or when + execution resumes after wait for the table IX lock), set an + IX lock on the table and reset the possible select node. MySQL's + partitioned table code may also call an insert within the same + SQL statement AFTER it has used this table handle to do a search. + This happens, for example, when a row update moves it to another + partition. In that case, we have already set the IX lock on the + table during the search operation, and there is no need to set + it again here. But we must write trx->id to node->trx_id_buf. 
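+
+The insert node states, as driven below and in row_ins():
+INS_NODE_SET_IX_LOCK (IX-lock the table, reset the select cursor),
+then INS_NODE_ALLOC_ROW_ID (assign a row id if the clustered index
+is not unique, and fetch or evaluate the row), then
+INS_NODE_INSERT_ENTRIES (make and insert an entry into each index of
+the table in turn).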
*/ + + trx_write_trx_id(node->trx_id_buf, trx->id); + + if (node->state == INS_NODE_SET_IX_LOCK) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + if (UT_DULINT_EQ(trx->id, node->trx_id)) { + /* No need to do IX-locking */ + + goto same_trx; + } + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + + node->trx_id = trx->id; +same_trx: + node->state = INS_NODE_ALLOC_ROW_ID; + + if (node->ins_type == INS_SEARCHED) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to insert */ + + thr->run_node = sel_node; + + return(thr); + } + } + + if ((node->ins_type == INS_SEARCHED) + && (sel_node->state != SEL_NODE_FETCH)) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to insert */ + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_ins(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* err == DB_LOCK_WAIT or SQL error detected */ + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->ins_type == INS_SEARCHED) { + /* Fetch a row to insert */ + + thr->run_node = sel_node; + } else { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} diff --git a/storage/xtradb/row/row0merge.c b/storage/xtradb/row/row0merge.c new file mode 100644 index 00000000000..efed3d26e5b --- /dev/null +++ b/storage/xtradb/row/row0merge.c @@ -0,0 +1,2358 @@ +/***************************************************************************** + +Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +New index creation routines using a merge sort + +Created 12/4/2005 Jan Lindstrom +Completed by Sunny Bains and Marko Makela +*******************************************************/ + +#include "row0merge.h" +#include "row0ext.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0ins.h" +#include "row0sel.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "dict0load.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "os0file.h" +#include "lock0lock.h" +#include "data0data.h" +#include "data0type.h" +#include "que0que.h" +#include "pars0pars.h" +#include "mem0mem.h" +#include "log0log.h" +#include "ut0sort.h" +#include "handler0alter.h" + +#ifdef UNIV_DEBUG +/* Set these in order to enable debug printout.
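+For example, in a debug build one can flip these at run time from a
+debugger (assuming the variables have not been optimized away):
+
+	(gdb) set variable row_merge_print_read = 1
+	(gdb) set variable row_merge_print_write = 1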
*/ +static ibool row_merge_print_cmp; +static ibool row_merge_print_read; +static ibool row_merge_print_write; +#endif /* UNIV_DEBUG */ + +/* Block size for I/O operations in merge sort. The minimum is +UNIV_PAGE_SIZE, or page_get_free_space_of_empty() rounded to a power of 2. + +When not creating a PRIMARY KEY that contains column prefixes, this +can be set as small as UNIV_PAGE_SIZE / 2. See the comment above +ut_ad(data_size < sizeof(row_merge_block_t)). */ + +typedef byte row_merge_block_t[1048576]; + +/* Secondary buffer for I/O operations of merge records. This buffer +is used for writing or reading a record that spans two row_merge_block_t. +Thus, it must be able to hold one merge record, whose maximum size is +the same as the minimum size of row_merge_block_t. */ + +typedef byte mrec_buf_t[UNIV_PAGE_SIZE]; + +/* Merge record in row_merge_block_t. The format is the same as a +record in ROW_FORMAT=COMPACT with the exception that the +REC_N_NEW_EXTRA_BYTES are omitted. */ +typedef byte mrec_t; + +/* Buffer for sorting in main memory. */ +struct row_merge_buf_struct { + mem_heap_t* heap; /* memory heap where allocated */ + dict_index_t* index; /* the index the tuples belong to */ + ulint total_size; /* total amount of data bytes */ + ulint n_tuples; /* number of data tuples */ + ulint max_tuples; /* maximum number of data tuples */ + const dfield_t**tuples; /* array of pointers to + arrays of fields that form + the data tuples */ + const dfield_t**tmp_tuples; /* temporary copy of tuples, + for sorting */ +}; + +typedef struct row_merge_buf_struct row_merge_buf_t; + +/* Information about temporary files used in merge sort are stored +to this structure */ + +struct merge_file_struct { + int fd; /* File descriptor */ + ulint offset; /* File offset */ +}; + +typedef struct merge_file_struct merge_file_t; + +#ifdef UNIV_DEBUG +/********************************************************** +Display a merge tuple. */ +static +void +row_merge_tuple_print( +/*==================*/ + FILE* f, /* in: output stream */ + const dfield_t* entry, /* in: tuple to print */ + ulint n_fields)/* in: number of fields in the tuple */ +{ + ulint j; + + for (j = 0; j < n_fields; j++) { + const dfield_t* field = &entry[j]; + + if (dfield_is_null(field)) { + fputs("\n NULL;", f); + } else { + ulint field_len = dfield_get_len(field); + ulint len = ut_min(field_len, 20); + if (dfield_is_ext(field)) { + fputs("\nE", f); + } else { + fputs("\n ", f); + } + ut_print_buf(f, dfield_get_data(field), len); + if (len != field_len) { + fprintf(f, " (total %lu bytes)", field_len); + } + } + } + putc('\n', f); +} +#endif /* UNIV_DEBUG */ + +/********************************************************** +Allocate a sort buffer. */ +static +row_merge_buf_t* +row_merge_buf_create_low( +/*=====================*/ + /* out,own: sort buffer */ + mem_heap_t* heap, /* in: heap where allocated */ + dict_index_t* index, /* in: secondary index */ + ulint max_tuples, /* in: maximum number of data tuples */ + ulint buf_size) /* in: size of the buffer, in bytes */ +{ + row_merge_buf_t* buf; + + ut_ad(max_tuples > 0); + ut_ad(max_tuples <= sizeof(row_merge_block_t)); + ut_ad(max_tuples < buf_size); + + buf = mem_heap_zalloc(heap, buf_size); + buf->heap = heap; + buf->index = index; + buf->max_tuples = max_tuples; + buf->tuples = mem_heap_alloc(heap, + 2 * max_tuples * sizeof *buf->tuples); + buf->tmp_tuples = buf->tuples + max_tuples; + + return(buf); +} + +/********************************************************** +Allocate a sort buffer. 
*/ +static +row_merge_buf_t* +row_merge_buf_create( +/*=================*/ + /* out,own: sort buffer */ + dict_index_t* index) /* in: secondary index */ +{ + row_merge_buf_t* buf; + ulint max_tuples; + ulint buf_size; + mem_heap_t* heap; + + max_tuples = sizeof(row_merge_block_t) + / ut_max(1, dict_index_get_min_size(index)); + + buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples; + + heap = mem_heap_create(buf_size + sizeof(row_merge_block_t)); + + buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size); + + return(buf); +} + +/********************************************************** +Empty a sort buffer. */ +static +row_merge_buf_t* +row_merge_buf_empty( +/*================*/ + /* out: sort buffer */ + row_merge_buf_t* buf) /* in,own: sort buffer */ +{ + ulint buf_size; + ulint max_tuples = buf->max_tuples; + mem_heap_t* heap = buf->heap; + dict_index_t* index = buf->index; + + buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples; + + mem_heap_empty(heap); + + return(row_merge_buf_create_low(heap, index, max_tuples, buf_size)); +} + +/********************************************************** +Deallocate a sort buffer. */ +static +void +row_merge_buf_free( +/*===============*/ + row_merge_buf_t* buf) /* in,own: sort buffer, to be freed */ +{ + mem_heap_free(buf->heap); +} + +/********************************************************** +Insert a data tuple into a sort buffer. */ +static +ibool +row_merge_buf_add( +/*==============*/ + /* out: TRUE if added, + FALSE if out of space */ + row_merge_buf_t* buf, /* in/out: sort buffer */ + const dtuple_t* row, /* in: row in clustered index */ + const row_ext_t* ext) /* in: cache of externally stored + column prefixes, or NULL */ +{ + ulint i; + ulint n_fields; + ulint data_size; + ulint extra_size; + const dict_index_t* index; + dfield_t* entry; + dfield_t* field; + + if (buf->n_tuples >= buf->max_tuples) { + return(FALSE); + } + + UNIV_PREFETCH_R(row->fields); + + index = buf->index; + + n_fields = dict_index_get_n_fields(index); + + entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry); + buf->tuples[buf->n_tuples] = entry; + field = entry; + + data_size = 0; + extra_size = UT_BITS_IN_BYTES(index->n_nullable); + + for (i = 0; i < n_fields; i++, field++) { + const dict_field_t* ifield; + const dict_col_t* col; + ulint col_no; + const dfield_t* row_field; + ulint len; + + ifield = dict_index_get_nth_field(index, i); + col = ifield->col; + col_no = dict_col_get_no(col); + row_field = dtuple_get_nth_field(row, col_no); + dfield_copy(field, row_field); + len = dfield_get_len(field); + + if (dfield_is_null(field)) { + ut_ad(!(col->prtype & DATA_NOT_NULL)); + continue; + } else if (UNIV_LIKELY(!ext)) { + } else if (dict_index_is_clust(index)) { + /* Flag externally stored fields. 
*/ + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + if (i < dict_index_get_n_unique(index)) { + dfield_set_data(field, buf, len); + } else { + dfield_set_ext(field); + len = dfield_get_len(field); + } + } + } else { + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + ut_a(buf != field_ref_zero); + dfield_set_data(field, buf, len); + } + } + + /* If a column prefix index, take only the prefix */ + + if (ifield->prefix_len) { + len = dtype_get_at_most_n_mbchars( + col->prtype, + col->mbminlen, col->mbmaxlen, + ifield->prefix_len, + len, dfield_get_data(field)); + dfield_set_len(field, len); + } + + ut_ad(len <= col->len || col->mtype == DATA_BLOB); + + if (ifield->fixed_len) { + ut_ad(len == ifield->fixed_len); + ut_ad(!dfield_is_ext(field)); + } else if (dfield_is_ext(field)) { + extra_size += 2; + } else if (len < 128 + || (col->len < 256 && col->mtype != DATA_BLOB)) { + extra_size++; + } else { + /* For variable-length columns, we look up the + maximum length from the column itself. If this + is a prefix index column shorter than 256 bytes, + this will waste one byte. */ + extra_size += 2; + } + data_size += len; + } + +#ifdef UNIV_DEBUG + { + ulint size; + ulint extra; + + size = rec_get_converted_size_comp(index, + REC_STATUS_ORDINARY, + entry, n_fields, &extra); + + ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size); + ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra); + } +#endif /* UNIV_DEBUG */ + + /* Add to the total size of the record in row_merge_block_t + the encoded length of extra_size and the extra bytes (extra_size). + See row_merge_buf_write() for the variable-length encoding + of extra_size. */ + data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); + + /* The following assertion may fail if row_merge_block_t is + declared very small and a PRIMARY KEY is being created with + many prefix columns. In that case, the record may exceed the + page_zip_rec_needs_ext() limit. However, no further columns + will be moved to external storage until the record is inserted + to the clustered index B-tree. */ + ut_ad(data_size < sizeof(row_merge_block_t)); + + /* Reserve one byte for the end marker of row_merge_block_t. */ + if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) { + return(FALSE); + } + + buf->total_size += data_size; + buf->n_tuples++; + + field = entry; + + /* Copy the data fields. */ + + do { + dfield_dup(field++, buf->heap); + } while (--n_fields); + + return(TRUE); +} + +/* Structure for reporting duplicate records. */ +struct row_merge_dup_struct { + const dict_index_t* index; /* index being sorted */ + TABLE* table; /* MySQL table object */ + ulint n_dup; /* number of duplicates */ +}; + +typedef struct row_merge_dup_struct row_merge_dup_t; + +/***************************************************************** +Report a duplicate key. */ +static +void +row_merge_dup_report( +/*=================*/ + row_merge_dup_t* dup, /* in/out: for reporting duplicates */ + const dfield_t* entry) /* in: duplicate index entry */ +{ + mrec_buf_t buf; + const dtuple_t* tuple; + dtuple_t tuple_store; + const rec_t* rec; + const dict_index_t* index = dup->index; + ulint n_fields= dict_index_get_n_fields(index); + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + ulint n_ext; + + if (dup->n_dup++) { + /* Only report the first duplicate record, + but count all duplicate records. 
*/ + return; + } + + rec_offs_init(offsets_); + + /* Convert the tuple to a record and then to MySQL format. */ + + tuple = dtuple_from_fields(&tuple_store, entry, n_fields); + n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0; + + rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext); + offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, + &heap); + + innobase_rec_to_mysql(dup->table, rec, index, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/***************************************************************** +Compare two tuples. */ +static +int +row_merge_tuple_cmp( +/*================*/ + /* out: 1, 0, -1 if a is greater, + equal, less, respectively, than b */ + ulint n_field,/* in: number of fields */ + const dfield_t* a, /* in: first tuple to be compared */ + const dfield_t* b, /* in: second tuple to be compared */ + row_merge_dup_t* dup) /* in/out: for reporting duplicates */ +{ + int cmp; + const dfield_t* field = a; + + /* Compare the fields of the tuples until a difference is + found or we run out of fields to compare. If !cmp at the + end, the tuples are equal. */ + do { + cmp = cmp_dfield_dfield(a++, b++); + } while (!cmp && --n_field); + + if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) { + /* Report a duplicate value error if the tuples are + logically equal. NULL columns are logically inequal, + although they are equal in the sorting order. Find + out if any of the fields are NULL. */ + for (b = field; b != a; b++) { + if (dfield_is_null(b)) { + + goto func_exit; + } + } + + row_merge_dup_report(dup, field); + } + +func_exit: + return(cmp); +} + +/************************************************************************** +Merge sort the tuple buffer in main memory. */ +static +void +row_merge_tuple_sort( +/*=================*/ + ulint n_field,/* in: number of fields */ + row_merge_dup_t* dup, /* in/out: for reporting duplicates */ + const dfield_t** tuples, /* in/out: tuples */ + const dfield_t** aux, /* in/out: work area */ + ulint low, /* in: lower bound of the + sorting area, inclusive */ + ulint high) /* in: upper bound of the + sorting area, exclusive */ +{ +#define row_merge_tuple_sort_ctx(a,b,c,d) \ + row_merge_tuple_sort(n_field, dup, a, b, c, d) +#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup) + + UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx, + tuples, aux, low, high, row_merge_tuple_cmp_ctx); +} + +/********************************************************** +Sort a buffer. */ +static +void +row_merge_buf_sort( +/*===============*/ + row_merge_buf_t* buf, /* in/out: sort buffer */ + row_merge_dup_t* dup) /* in/out: for reporting duplicates */ +{ + row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup, + buf->tuples, buf->tmp_tuples, 0, buf->n_tuples); +} + +/********************************************************** +Write a buffer to a block. 
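+
+Records are laid out back to back in the block, each preceded by the
+encoded value of extra_size + 1 (0 is reserved for the end-of-chunk
+marker) and followed by the record's extra bytes and data bytes. The
+length is encoded in one byte if the value is below 0x80, otherwise
+in two bytes with the high bit of the first byte set. For instance,
+extra_size = 5 is encoded as the single byte 0x06, and
+extra_size = 0x90 as the two bytes 0x80 0x91.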
*/ +static +void +row_merge_buf_write( +/*================*/ + const row_merge_buf_t* buf, /* in: sorted buffer */ +#ifdef UNIV_DEBUG + const merge_file_t* of, /* in: output file */ +#endif /* UNIV_DEBUG */ + row_merge_block_t* block) /* out: buffer for writing to file */ +#ifndef UNIV_DEBUG +# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block) +#endif /* !UNIV_DEBUG */ +{ + const dict_index_t* index = buf->index; + ulint n_fields= dict_index_get_n_fields(index); + byte* b = &(*block)[0]; + + ulint i; + + for (i = 0; i < buf->n_tuples; i++) { + ulint size; + ulint extra_size; + const dfield_t* entry = buf->tuples[i]; + + size = rec_get_converted_size_comp(index, + REC_STATUS_ORDINARY, + entry, n_fields, + &extra_size); + ut_ad(size > extra_size); + ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES); + extra_size -= REC_N_NEW_EXTRA_BYTES; + size -= REC_N_NEW_EXTRA_BYTES; + + /* Encode extra_size + 1 */ + if (extra_size + 1 < 0x80) { + *b++ = (byte) (extra_size + 1); + } else { + ut_ad((extra_size + 1) < 0x8000); + *b++ = (byte) (0x80 | ((extra_size + 1) >> 8)); + *b++ = (byte) (extra_size + 1); + } + + ut_ad(b + size < block[1]); + + rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index, + REC_STATUS_ORDINARY, + entry, n_fields); + + b += size; + +#ifdef UNIV_DEBUG + if (row_merge_print_write) { + fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu", + (void*) b, of->fd, (ulong) of->offset, + (ulong) i); + row_merge_tuple_print(stderr, entry, n_fields); + } +#endif /* UNIV_DEBUG */ + } + + /* Write an "end-of-chunk" marker. */ + ut_a(b < block[1]); + ut_a(b == block[0] + buf->total_size); + *b++ = 0; +#ifdef UNIV_DEBUG_VALGRIND + /* The rest of the block is uninitialized. Initialize it + to avoid bogus warnings. */ + memset(b, 0xff, block[1] - b); +#endif /* UNIV_DEBUG_VALGRIND */ +#ifdef UNIV_DEBUG + if (row_merge_print_write) { + fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n", + (void*) b, of->fd, (ulong) of->offset); + } +#endif /* UNIV_DEBUG */ +} + +/********************************************************** +Create a memory heap and allocate space for row_merge_rec_offsets(). */ +static +mem_heap_t* +row_merge_heap_create( +/*==================*/ + /* out: memory heap */ + const dict_index_t* index, /* in: record descriptor */ + ulint** offsets1, /* out: offsets */ + ulint** offsets2) /* out: offsets */ +{ + ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1); + + *offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1); + *offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2); + + (*offsets1)[0] = (*offsets2)[0] = i; + (*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index); + + return(heap); +} + +/************************************************************************** +Search an index object by name and column names. If several indexes match, +return the index with the max id. 
*/ +static +dict_index_t* +row_merge_dict_table_get_index( +/*===========================*/ + /* out: matching index, + NULL if not found */ + dict_table_t* table, /* in: table */ + const merge_index_def_t*index_def) /* in: index definition */ +{ + ulint i; + dict_index_t* index; + const char** column_names; + + column_names = mem_alloc(index_def->n_fields * sizeof *column_names); + + for (i = 0; i < index_def->n_fields; ++i) { + column_names[i] = index_def->fields[i].field_name; + } + + index = dict_table_get_index_by_max_id( + table, index_def->name, column_names, index_def->n_fields); + + mem_free((void*) column_names); + + return(index); +} + +/************************************************************************ +Read a merge block from the file system. */ +static +ibool +row_merge_read( +/*===========*/ + /* out: TRUE if request was + successful, FALSE if fail */ + int fd, /* in: file descriptor */ + ulint offset, /* in: offset where to read */ + row_merge_block_t* buf) /* out: data */ +{ + ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf; + ibool success; + + success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf, + (ulint) (ofs & 0xFFFFFFFF), + (ulint) (ofs >> 32), + sizeof *buf); + if (UNIV_UNLIKELY(!success)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: failed to read merge block at %llu\n", ofs); + } + + return(UNIV_LIKELY(success)); +} + +/************************************************************************ +Write a merge block to the file system. */ +static +ibool +row_merge_write( +/*============*/ + /* out: TRUE if request was + successful, FALSE if fail */ + int fd, /* in: file descriptor */ + ulint offset, /* in: offset where to write */ + const void* buf) /* in: data */ +{ + ib_uint64_t ofs = ((ib_uint64_t) offset) + * sizeof(row_merge_block_t); + + return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, + (ulint) (ofs & 0xFFFFFFFF), + (ulint) (ofs >> 32), + sizeof(row_merge_block_t)))); +} + +/************************************************************************ +Read a merge record. */ +static +const byte* +row_merge_read_rec( +/*===============*/ + /* out: pointer to next record, + or NULL on I/O error + or end of list */ + row_merge_block_t* block, /* in/out: file buffer */ + mrec_buf_t* buf, /* in/out: secondary buffer */ + const byte* b, /* in: pointer to record */ + const dict_index_t* index, /* in: index of the record */ + int fd, /* in: file descriptor */ + ulint* foffs, /* in/out: file offset */ + const mrec_t** mrec, /* out: pointer to merge record, + or NULL on end of list + (non-NULL on I/O error) */ + ulint* offsets)/* out: offsets of mrec */ +{ + ulint extra_size; + ulint data_size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= block[0]); + ut_ad(b < block[1]); + ut_ad(index); + ut_ad(foffs); + ut_ad(mrec); + ut_ad(offsets); + + ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index)); + + extra_size = *b++; + + if (UNIV_UNLIKELY(!extra_size)) { + /* End of list */ + *mrec = NULL; +#ifdef UNIV_DEBUG + if (row_merge_print_read) { + fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n", + (const void*) b, (const void*) block, + fd, (ulong) *foffs); + } +#endif /* UNIV_DEBUG */ + return(NULL); + } + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + if (UNIV_UNLIKELY(b >= block[1])) { + if (!row_merge_read(fd, ++(*foffs), block)) { +err_exit: + /* Signal I/O error.
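+(Note that *mrec is left non-NULL here, so that the caller can tell
+an I/O error apart from the end of the list, where *mrec is set to
+NULL.)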
*/ + *mrec = b; + return(NULL); + } + + /* Wrap around to the beginning of the buffer. */ + b = block[0]; + } + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *b++; + } + + /* Normalize extra_size. Above, value 0 signals "end of list". */ + extra_size--; + + /* Read the extra bytes. */ + + if (UNIV_UNLIKELY(b + extra_size >= block[1])) { + /* The record spans two blocks. Copy the entire record + to the auxiliary buffer and handle this as a special + case. */ + + avail_size = block[1] - b; + + memcpy(*buf, b, avail_size); + + if (!row_merge_read(fd, ++(*foffs), block)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = block[0]; + + /* Copy the record. */ + memcpy(*buf + avail_size, b, extra_size - avail_size); + b += extra_size - avail_size; + + *mrec = *buf + extra_size; + + rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets); + + data_size = rec_offs_data_size(offsets); + + /* These overflows should be impossible given that + records are much smaller than either buffer, and + the record starts near the beginning of each buffer. */ + ut_a(extra_size + data_size < sizeof *buf); + ut_a(b + data_size < block[1]); + + /* Copy the data bytes. */ + memcpy(*buf + extra_size, b, data_size); + b += data_size; + + goto func_exit; + } + + *mrec = b + extra_size; + + rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets); + + data_size = rec_offs_data_size(offsets); + ut_ad(extra_size + data_size < sizeof *buf); + + b += extra_size + data_size; + + if (UNIV_LIKELY(b < block[1])) { + /* The record fits entirely in the block. + This is the normal case. */ + goto func_exit; + } + + /* The record spans two blocks. Copy it to buf. */ + + b -= extra_size + data_size; + avail_size = block[1] - b; + memcpy(*buf, b, avail_size); + *mrec = *buf + extra_size; + rec_offs_make_valid(*mrec, index, offsets); + + if (!row_merge_read(fd, ++(*foffs), block)) { + + goto err_exit; + } + + /* Wrap around to the beginning of the buffer. */ + b = block[0]; + + /* Copy the rest of the record. */ + memcpy(*buf + avail_size, b, extra_size + data_size - avail_size); + b += extra_size + data_size - avail_size; + +func_exit: +#ifdef UNIV_DEBUG + if (row_merge_print_read) { + fprintf(stderr, "row_merge_read %p,%p,%d,%lu ", + (const void*) b, (const void*) block, + fd, (ulong) *foffs); + rec_print_comp(stderr, *mrec, offsets); + putc('\n', stderr); + } +#endif /* UNIV_DEBUG */ + + return(b); +} + +/************************************************************************ +Write a merge record. 
*/ +static +void +row_merge_write_rec_low( +/*====================*/ + byte* b, /* out: buffer */ + ulint e, /* in: encoded extra_size */ +#ifdef UNIV_DEBUG + ulint size, /* in: total size to write */ + int fd, /* in: file descriptor */ + ulint foffs, /* in: file offset */ +#endif /* UNIV_DEBUG */ + const mrec_t* mrec, /* in: record to write */ + const ulint* offsets)/* in: offsets of mrec */ +#ifndef UNIV_DEBUG +# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \ + row_merge_write_rec_low(b, e, mrec, offsets) +#endif /* !UNIV_DEBUG */ +{ +#ifdef UNIV_DEBUG + const byte* const end = b + size; + ut_ad(e == rec_offs_extra_size(offsets) + 1); + + if (row_merge_print_write) { + fprintf(stderr, "row_merge_write %p,%d,%lu ", + (void*) b, fd, (ulong) foffs); + rec_print_comp(stderr, mrec, offsets); + putc('\n', stderr); + } +#endif /* UNIV_DEBUG */ + + if (e < 0x80) { + *b++ = (byte) e; + } else { + *b++ = (byte) (0x80 | (e >> 8)); + *b++ = (byte) e; + } + + memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets)); + ut_ad(b + rec_offs_size(offsets) == end); +} + +/************************************************************************ +Write a merge record. */ +static +byte* +row_merge_write_rec( +/*================*/ + /* out: pointer to end of block, + or NULL on error */ + row_merge_block_t* block, /* in/out: file buffer */ + mrec_buf_t* buf, /* in/out: secondary buffer */ + byte* b, /* in: pointer to end of block */ + int fd, /* in: file descriptor */ + ulint* foffs, /* in/out: file offset */ + const mrec_t* mrec, /* in: record to write */ + const ulint* offsets)/* in: offsets of mrec */ +{ + ulint extra_size; + ulint size; + ulint avail_size; + + ut_ad(block); + ut_ad(buf); + ut_ad(b >= block[0]); + ut_ad(b < block[1]); + ut_ad(mrec); + ut_ad(foffs); + ut_ad(mrec < block[0] || mrec > block[1]); + ut_ad(mrec < buf[0] || mrec > buf[1]); + + /* Normalize extra_size. Value 0 signals "end of list". */ + extra_size = rec_offs_extra_size(offsets) + 1; + + size = extra_size + (extra_size >= 0x80) + + rec_offs_data_size(offsets); + + if (UNIV_UNLIKELY(b + size >= block[1])) { + /* The record spans two blocks. + Copy it to the temporary buffer first. */ + avail_size = block[1] - b; + + row_merge_write_rec_low(buf[0], + extra_size, size, fd, *foffs, + mrec, offsets); + + /* Copy the head of the temporary buffer, write + the completed block, and copy the tail of the + record to the head of the new block. */ + memcpy(b, buf[0], avail_size); + + if (!row_merge_write(fd, (*foffs)++, block)) { + return(NULL); + } + + UNIV_MEM_INVALID(block[0], sizeof block[0]); + + /* Copy the rest. */ + b = block[0]; + memcpy(b, buf[0] + avail_size, size - avail_size); + b += size - avail_size; + } else { + row_merge_write_rec_low(b, extra_size, size, fd, *foffs, + mrec, offsets); + b += size; + } + + return(b); +} + +/************************************************************************ +Write an end-of-list marker. 
*/ +static +byte* +row_merge_write_eof( +/*================*/ + /* out: pointer to end of block, + or NULL on error */ + row_merge_block_t* block, /* in/out: file buffer */ + byte* b, /* in: pointer to end of block */ + int fd, /* in: file descriptor */ + ulint* foffs) /* in/out: file offset */ +{ + ut_ad(block); + ut_ad(b >= block[0]); + ut_ad(b < block[1]); + ut_ad(foffs); +#ifdef UNIV_DEBUG + if (row_merge_print_write) { + fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n", + (void*) b, (void*) block, fd, (ulong) *foffs); + } +#endif /* UNIV_DEBUG */ + + *b++ = 0; + UNIV_MEM_ASSERT_RW(block[0], b - block[0]); + UNIV_MEM_ASSERT_W(block[0], sizeof block[0]); +#ifdef UNIV_DEBUG_VALGRIND + /* The rest of the block is uninitialized. Initialize it + to avoid bogus warnings. */ + memset(b, 0xff, block[1] - b); +#endif /* UNIV_DEBUG_VALGRIND */ + + if (!row_merge_write(fd, (*foffs)++, block)) { + return(NULL); + } + + UNIV_MEM_INVALID(block[0], sizeof block[0]); + return(block[0]); +} + +/***************************************************************** +Compare two merge records. */ +static +int +row_merge_cmp( +/*==========*/ + /* out: 1, 0, -1 if + mrec1 is greater, equal, less, + respectively, than mrec2 */ + const mrec_t* mrec1, /* in: first merge + record to be compared */ + const mrec_t* mrec2, /* in: second merge + record to be compared */ + const ulint* offsets1, /* in: first record offsets */ + const ulint* offsets2, /* in: second record offsets */ + const dict_index_t* index) /* in: index */ +{ + int cmp; + + cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index); + +#ifdef UNIV_DEBUG + if (row_merge_print_cmp) { + fputs("row_merge_cmp1 ", stderr); + rec_print_comp(stderr, mrec1, offsets1); + fputs("\nrow_merge_cmp2 ", stderr); + rec_print_comp(stderr, mrec2, offsets2); + fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp); + } +#endif /* UNIV_DEBUG */ + + return(cmp); +} + +/************************************************************************ +Reads the clustered index of the table and creates temporary files +containing the index entries for the indexes to be built.
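+
+In outline: a single pass over the clustered index builds a sort
+buffer for every index being created; each buffer is sorted in main
+memory (reporting duplicates if the index is unique) and flushed to
+its temporary file one row_merge_block_t at a time, to be
+merge-sorted later by row_merge().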
*/ +static +ulint +row_merge_read_clustered_index( +/*===========================*/ + /* out: DB_SUCCESS or error */ + trx_t* trx, /* in: transaction */ + TABLE* table, /* in/out: MySQL table object, + for reporting erroneous records */ + const dict_table_t* old_table,/* in: table where rows are + read from */ + const dict_table_t* new_table,/* in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + dict_index_t** index, /* in: indexes to be created */ + merge_file_t* files, /* in: temporary files */ + ulint n_index,/* in: number of indexes to create */ + row_merge_block_t* block) /* in/out: file buffer */ +{ + dict_index_t* clust_index; /* Clustered index */ + mem_heap_t* row_heap; /* Heap memory to create + clustered index records */ + row_merge_buf_t** merge_buf; /* Temporary list for records*/ + btr_pcur_t pcur; /* Persistent cursor on the + clustered index */ + mtr_t mtr; /* Mini transaction */ + ulint err = DB_SUCCESS;/* Return code */ + ulint i; + ulint n_nonnull = 0; /* number of columns + changed to NOT NULL */ + ulint* nonnull = NULL; /* NOT NULL columns */ + + trx->op_info = "reading clustered index"; + + ut_ad(trx); + ut_ad(old_table); + ut_ad(new_table); + ut_ad(index); + ut_ad(files); + + /* Create and initialize memory for record buffers */ + + merge_buf = mem_alloc(n_index * sizeof *merge_buf); + + for (i = 0; i < n_index; i++) { + merge_buf[i] = row_merge_buf_create(index[i]); + } + + mtr_start(&mtr); + + /* Find the clustered index and create a persistent cursor + based on that. */ + + clust_index = dict_table_get_first_index(old_table); + + btr_pcur_open_at_index_side( + TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + + if (UNIV_UNLIKELY(old_table != new_table)) { + ulint n_cols = dict_table_get_n_cols(old_table); + + /* A primary key will be created. Identify the + columns that were flagged NOT NULL in the new table, + so that we can quickly check that the records in the + (old) clustered index do not violate the added NOT + NULL constraints. */ + + ut_a(n_cols == dict_table_get_n_cols(new_table)); + + nonnull = mem_alloc(n_cols * sizeof *nonnull); + + for (i = 0; i < n_cols; i++) { + if (dict_table_get_nth_col(old_table, i)->prtype + & DATA_NOT_NULL) { + + continue; + } + + if (dict_table_get_nth_col(new_table, i)->prtype + & DATA_NOT_NULL) { + + nonnull[n_nonnull++] = i; + } + } + + if (!n_nonnull) { + mem_free(nonnull); + nonnull = NULL; + } + } + + row_heap = mem_heap_create(sizeof(mrec_buf_t)); + + /* Scan the clustered index. */ + for (;;) { + const rec_t* rec; + ulint* offsets; + dtuple_t* row = NULL; + row_ext_t* ext; + ibool has_next = TRUE; + + btr_pcur_move_to_next_on_page(&pcur); + + /* When switching pages, commit the mini-transaction + in order to release the latch on the old page. */ + + if (btr_pcur_is_after_last_on_page(&pcur)) { + btr_pcur_store_position(&pcur, &mtr); + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_SEARCH_LEAF, + &pcur, &mtr); + has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + if (UNIV_LIKELY(has_next)) { + rec = btr_pcur_get_rec(&pcur); + offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &row_heap); + + /* Skip delete marked records. */ + if (rec_get_deleted_flag( + rec, dict_table_is_comp(old_table))) { + continue; + } + + srv_n_rows_inserted++; + + /* Build a row based on the clustered index. 
*/ + + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, offsets, + new_table, &ext, row_heap); + + if (UNIV_LIKELY_NULL(nonnull)) { + for (i = 0; i < n_nonnull; i++) { + dfield_t* field + = &row->fields[nonnull[i]]; + dtype_t* field_type + = dfield_get_type(field); + + ut_a(!(field_type->prtype + & DATA_NOT_NULL)); + + if (dfield_is_null(field)) { + err = DB_PRIMARY_KEY_IS_NULL; + i = 0; + goto err_exit; + } + + field_type->prtype |= DATA_NOT_NULL; + } + } + } + + /* Build all entries for all the indexes to be created + in a single scan of the clustered index. */ + + for (i = 0; i < n_index; i++) { + row_merge_buf_t* buf = merge_buf[i]; + merge_file_t* file = &files[i]; + const dict_index_t* index = buf->index; + + if (UNIV_LIKELY + (row && row_merge_buf_add(buf, row, ext))) { + continue; + } + + /* The buffer must be sufficiently large + to hold at least one record. */ + ut_ad(buf->n_tuples || !has_next); + + /* We have enough data tuples to form a block. + Sort them and write to disk. */ + + if (buf->n_tuples) { + if (dict_index_is_unique(index)) { + row_merge_dup_t dup; + dup.index = buf->index; + dup.table = table; + dup.n_dup = 0; + + row_merge_buf_sort(buf, &dup); + + if (dup.n_dup) { + err = DB_DUPLICATE_KEY; +err_exit: + trx->error_key_num = i; + goto func_exit; + } + } else { + row_merge_buf_sort(buf, NULL); + } + } + + row_merge_buf_write(buf, file, block); + + if (!row_merge_write(file->fd, file->offset++, + block)) { + err = DB_OUT_OF_FILE_SPACE; + goto err_exit; + } + + UNIV_MEM_INVALID(block[0], sizeof block[0]); + merge_buf[i] = row_merge_buf_empty(buf); + + /* Try writing the record again, now that + the buffer has been written out and emptied. */ + + if (UNIV_UNLIKELY + (row && !row_merge_buf_add(buf, row, ext))) { + /* An empty buffer should have enough + room for at least one record. */ + ut_error; + } + } + + mem_heap_empty(row_heap); + + if (UNIV_UNLIKELY(!has_next)) { + goto func_exit; + } + } + +func_exit: + btr_pcur_close(&pcur); + mtr_commit(&mtr); + mem_heap_free(row_heap); + + if (UNIV_LIKELY_NULL(nonnull)) { + mem_free(nonnull); + } + + for (i = 0; i < n_index; i++) { + row_merge_buf_free(merge_buf[i]); + } + + mem_free(merge_buf); + + trx->op_info = ""; + + return(err); +} + +/***************************************************************** +Merge two blocks of linked lists on disk and write a bigger block. */ +static +ulint +row_merge_blocks( +/*=============*/ + /* out: DB_SUCCESS or error code */ + const dict_index_t* index, /* in: index being created */ + merge_file_t* file, /* in/out: file containing + index entries */ + row_merge_block_t* block, /* in/out: 3 buffers */ + ulint* foffs0, /* in/out: offset of first + source list in the file */ + ulint* foffs1, /* in/out: offset of second + source list in the file */ + merge_file_t* of, /* in/out: output file */ + TABLE* table) /* in/out: MySQL table, for + reporting erroneous key value + if applicable */ +{ + mem_heap_t* heap; /* memory heap for offsets0, offsets1 */ + + mrec_buf_t buf[3]; /* buffer for handling split mrec in block[] */ + const byte* b0; /* pointer to block[0] */ + const byte* b1; /* pointer to block[1] */ + byte* b2; /* pointer to block[2] */ + const mrec_t* mrec0; /* merge rec, points to block[0] or buf[0] */ + const mrec_t* mrec1; /* merge rec, points to block[1] or buf[1] */ + ulint* offsets0;/* offsets of mrec0 */ + ulint* offsets1;/* offsets of mrec1 */ + + heap = row_merge_heap_create(index, &offsets0, &offsets1); + + /* Write a record and read the next record. 
Split the output + file in two halves, which can be merged on the following pass. */ +#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \ + do { \ + b2 = row_merge_write_rec(&block[2], &buf[2], b2, \ + of->fd, &of->offset, \ + mrec##N, offsets##N); \ + if (UNIV_UNLIKELY(!b2)) { \ + goto corrupt; \ + } \ + b##N = row_merge_read_rec(&block[N], &buf[N], \ + b##N, index, \ + file->fd, foffs##N, \ + &mrec##N, offsets##N); \ + if (UNIV_UNLIKELY(!b##N)) { \ + if (mrec##N) { \ + goto corrupt; \ + } \ + AT_END; \ + } \ + } while (0) + + if (!row_merge_read(file->fd, *foffs0, &block[0]) + || !row_merge_read(file->fd, *foffs1, &block[1])) { +corrupt: + mem_heap_free(heap); + return(DB_CORRUPTION); + } + + b0 = block[0]; + b1 = block[1]; + b2 = block[2]; + + b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd, + foffs0, &mrec0, offsets0); + b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd, + foffs1, &mrec1, offsets1); + if (UNIV_UNLIKELY(!b0 && mrec0) + || UNIV_UNLIKELY(!b1 && mrec1)) { + + goto corrupt; + } + + while (mrec0 && mrec1) { + switch (row_merge_cmp(mrec0, mrec1, + offsets0, offsets1, index)) { + case 0: + if (UNIV_UNLIKELY + (dict_index_is_unique(index))) { + innobase_rec_to_mysql(table, mrec0, + index, offsets0); + mem_heap_free(heap); + return(DB_DUPLICATE_KEY); + } + /* fall through */ + case -1: + ROW_MERGE_WRITE_GET_NEXT(0, goto merged); + break; + case 1: + ROW_MERGE_WRITE_GET_NEXT(1, goto merged); + break; + default: + ut_error; + } + + } + +merged: + if (mrec0) { + /* append all mrec0 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(0, goto done0); + } + } +done0: + if (mrec1) { + /* append all mrec1 to output */ + for (;;) { + ROW_MERGE_WRITE_GET_NEXT(1, goto done1); + } + } +done1: + + mem_heap_free(heap); + b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset); + return(b2 ? DB_SUCCESS : DB_CORRUPTION); +} + +/***************************************************************** +Merge disk files. */ +static +ulint +row_merge( +/*======*/ + /* out: DB_SUCCESS or error code */ + const dict_index_t* index, /* in: index being created */ + merge_file_t* file, /* in/out: file containing + index entries */ + ulint half, /* in: half the file */ + row_merge_block_t* block, /* in/out: 3 buffers */ + int* tmpfd, /* in/out: temporary file handle */ + TABLE* table) /* in/out: MySQL table, for + reporting erroneous key value + if applicable */ +{ + ulint foffs0; /* first input offset */ + ulint foffs1; /* second input offset */ + ulint error; /* error code */ + merge_file_t of; /* output file */ + + UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]); + ut_ad(half > 0); + + of.fd = *tmpfd; + of.offset = 0; + + /* Merge blocks to the output file. */ + foffs0 = 0; + foffs1 = half; + + for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) { + error = row_merge_blocks(index, file, block, + &foffs0, &foffs1, &of, table); + + if (error != DB_SUCCESS) { + return(error); + } + } + + /* Copy the last block, if there is one. */ + while (foffs0 < half) { + if (!row_merge_read(file->fd, foffs0++, block) + || !row_merge_write(of.fd, of.offset++, block)) { + return(DB_CORRUPTION); + } + } + while (foffs1 < file->offset) { + if (!row_merge_read(file->fd, foffs1++, block) + || !row_merge_write(of.fd, of.offset++, block)) { + return(DB_CORRUPTION); + } + } + + /* Swap file descriptors for the next pass. 
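+
+The effect is that the input file and the scratch file ping-pong:
+the output of one merge pass becomes the input of the next, and the
+old input descriptor is recycled for the next pass's output.  As an
+illustrative sketch, the caller row_merge_sort() (below) drives the
+passes like this:
+
+	for (blksz = 1; blksz < file->offset; blksz *= 2) {
+		half = ut_2pow_round((file->offset + (blksz - 1)) / 2,
+				     blksz);
+		error = row_merge(index, file, half, block, tmpfd, table);
+		/* on return, *tmpfd and file->fd have been swapped */
+	}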
*/
+	*tmpfd = file->fd;
+	*file = of;
+
+	UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************
+Merge disk files by repeated passes until the whole file is sorted. */
+static
+ulint
+row_merge_sort(
+/*===========*/
+				/* out: DB_SUCCESS or error code */
+	const dict_index_t*	index,	/* in: index being created */
+	merge_file_t*		file,	/* in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/* in/out: 3 buffers */
+	int*			tmpfd,	/* in/out: temporary file handle */
+	TABLE*			table)	/* in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+{
+	ulint	blksz;	/* block size */
+
+	for (blksz = 1; blksz < file->offset; blksz *= 2) {
+		ulint	half;
+		ulint	error;
+
+		ut_ad(ut_is_2pow(blksz));
+		half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
+		error = row_merge(index, file, half, block, tmpfd, table);
+
+		if (error != DB_SUCCESS) {
+			return(error);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************
+Copy externally stored columns to the data tuple. */
+static
+void
+row_merge_copy_blobs(
+/*=================*/
+	const mrec_t*	mrec,	/* in: merge record */
+	const ulint*	offsets,/* in: offsets of mrec */
+	ulint		zip_size,/* in: compressed page size in bytes, or 0 */
+	dtuple_t*	tuple,	/* in/out: data tuple */
+	mem_heap_t*	heap)	/* in/out: memory heap */
+{
+	ulint	i;
+	ulint	n_fields = dtuple_get_n_fields(tuple);
+
+	for (i = 0; i < n_fields; i++) {
+		ulint		len;
+		const void*	data;
+		dfield_t*	field = dtuple_get_nth_field(tuple, i);
+
+		if (!dfield_is_ext(field)) {
+			continue;
+		}
+
+		ut_ad(!dfield_is_null(field));
+
+		/* The table is locked during index creation.
+		Therefore, externally stored columns cannot possibly
+		be freed between the time the BLOB pointers are read
+		(row_merge_read_clustered_index()) and dereferenced
+		(below).
*/
+		data = btr_rec_copy_externally_stored_field(
+			mrec, offsets, zip_size, i, &len, heap);
+
+		dfield_set_data(field, data, len);
+	}
+}
+
+/************************************************************************
+Read the sorted file containing index data tuples and insert these
+data tuples into the index. */
+static
+ulint
+row_merge_insert_index_tuples(
+/*==========================*/
+					/* out: DB_SUCCESS or error number */
+	trx_t*			trx,	/* in: transaction */
+	dict_index_t*		index,	/* in: index */
+	dict_table_t*		table,	/* in: new table */
+	ulint			zip_size,/* in: compressed page size of
+					 the old table, or 0 if uncompressed */
+	int			fd,	/* in: file descriptor */
+	row_merge_block_t*	block)	/* in/out: file buffer */
+{
+	mrec_buf_t		buf;
+	const byte*		b;
+	que_thr_t*		thr;
+	ins_node_t*		node;
+	mem_heap_t*		tuple_heap;
+	mem_heap_t*		graph_heap;
+	ulint			error = DB_SUCCESS;
+	ulint			foffs = 0;
+	ulint*			offsets;
+
+	ut_ad(trx);
+	ut_ad(index);
+	ut_ad(table);
+
+	/* We use the insert query graph as the dummy graph
+	needed in the row module call */
+
+	trx->op_info = "inserting index entries";
+
+	graph_heap = mem_heap_create(500);
+	node = ins_node_create(INS_DIRECT, table, graph_heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, graph_heap);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	tuple_heap = mem_heap_create(1000);
+
+	{
+		ulint i	= 1 + REC_OFFS_HEADER_SIZE
+			+ dict_index_get_n_fields(index);
+		offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
+		offsets[0] = i;
+		offsets[1] = dict_index_get_n_fields(index);
+	}
+
+	b = *block;
+
+	if (!row_merge_read(fd, foffs, block)) {
+		error = DB_CORRUPTION;
+	} else {
+		for (;;) {
+			const mrec_t*	mrec;
+			dtuple_t*	dtuple;
+			ulint		n_ext;
+
+			b = row_merge_read_rec(block, &buf, b, index,
+					       fd, &foffs, &mrec, offsets);
+			if (UNIV_UNLIKELY(!b)) {
+				/* End of list, or I/O error */
+				if (mrec) {
+					error = DB_CORRUPTION;
+				}
+				break;
+			}
+
+			dtuple = row_rec_to_index_entry_low(
+				mrec, index, offsets, &n_ext, tuple_heap);
+
+			if (UNIV_UNLIKELY(n_ext)) {
+				row_merge_copy_blobs(mrec, offsets, zip_size,
+						     dtuple, tuple_heap);
+			}
+
+			node->row = dtuple;
+			node->table = table;
+			node->trx_id = trx->id;
+
+			ut_ad(dtuple_validate(dtuple));
+
+			do {
+				thr->run_node = thr;
+				thr->prev_node = thr->common.parent;
+
+				error = row_ins_index_entry(index, dtuple,
+							    0, FALSE, thr);
+
+				if (UNIV_LIKELY(error == DB_SUCCESS)) {
+
+					goto next_rec;
+				}
+
+				thr->lock_state = QUE_THR_LOCK_ROW;
+				trx->error_state = error;
+				que_thr_stop_for_mysql(thr);
+				thr->lock_state = QUE_THR_LOCK_NOLOCK;
+			} while (row_mysql_handle_errors(&error, trx,
+							 thr, NULL));
+
+			goto err_exit;
+next_rec:
+			mem_heap_empty(tuple_heap);
+		}
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+err_exit:
+	que_graph_free(thr->graph);
+
+	trx->op_info = "";
+
+	mem_heap_free(tuple_heap);
+
+	return(error);
+}
+
+/*************************************************************************
+Sets an exclusive or shared lock on a table, for the duration of
+creating indexes.
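+
+A typical call from the handler layer looks like this (sketch; the
+caller context is hypothetical, but mode must be LOCK_X or LOCK_S):
+
+	err = row_merge_lock_table(trx, innodb_table, LOCK_S);
+
+	if (err != DB_SUCCESS) {
+		/* do not start building the indexes */
+	}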
*/ +UNIV_INTERN +ulint +row_merge_lock_table( +/*=================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in/out: transaction */ + dict_table_t* table, /* in: table to lock */ + enum lock_mode mode) /* in: LOCK_X or LOCK_S */ +{ + mem_heap_t* heap; + que_thr_t* thr; + ulint err; + sel_node_t* node; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(mode == LOCK_X || mode == LOCK_S); + + heap = mem_heap_create(512); + + trx->op_info = "setting table lock for creating or dropping index"; + + node = sel_node_create(heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + thr->graph->state = QUE_FORK_ACTIVE; + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(que_node_get_parent(thr)); + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + err = lock_table(0, table, mode, thr); + + trx->error_state = err; + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + que_thr_stop_for_mysql_no_error(thr, trx); + } else { + que_thr_stop_for_mysql(thr); + + if (err != DB_QUE_THR_SUSPENDED) { + ibool was_lock_wait; + + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + } else { + que_thr_t* run_thr; + que_node_t* parent; + + parent = que_node_get_parent(thr); + run_thr = que_fork_start_command(parent); + + ut_a(run_thr == thr); + + /* There was a lock wait but the thread was not + in a ready to run or running state. */ + trx->error_state = DB_LOCK_WAIT; + + goto run_again; + } + } + + que_graph_free(thr->graph); + trx->op_info = ""; + + return(err); +} + +/************************************************************************* +Drop an index from the InnoDB system tables. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. */ +UNIV_INTERN +void +row_merge_drop_index( +/*=================*/ + dict_index_t* index, /* in: index to be removed */ + dict_table_t* table, /* in: table */ + trx_t* trx) /* in: transaction handle */ +{ + ulint err; + pars_info_t* info = pars_info_create(); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. */ + + static const char str1[] = + "PROCEDURE DROP_INDEX_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID = :indexid\n" + " AND TABLE_ID = :tableid;\n" + "END;\n"; + + ut_ad(index && table && trx); + + pars_info_add_dulint_literal(info, "indexid", index->id); + pars_info_add_dulint_literal(info, "tableid", table->id); + + trx_start_if_not_started(trx); + trx->op_info = "dropping index"; + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + err = que_eval_sql(info, str1, FALSE, trx); + + ut_a(err == DB_SUCCESS); + + /* Replace this index with another equivalent index for all + foreign key constraints on this table where this index is used */ + + dict_table_replace_index_in_foreign_list(table, index); + dict_index_remove_from_cache(table, index); + + trx->op_info = ""; +} + +/************************************************************************* +Drop those indexes which were created before an error occurred when +building an index. 
The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be
+committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+	trx_t*		trx,		/* in: transaction */
+	dict_table_t*	table,		/* in: table containing the indexes */
+	dict_index_t**	index,		/* in: indexes to drop */
+	ulint		num_created)	/* in: number of elements in index[] */
+{
+	ulint	key_num;
+
+	for (key_num = 0; key_num < num_created; key_num++) {
+		row_merge_drop_index(index[key_num], table, trx);
+	}
+}
+
+/*************************************************************************
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void)
+/*=============================*/
+{
+	trx_t*		trx;
+	ulint		err;
+
+	/* We use the private SQL parser of Innobase to generate the
+	query graphs needed in deleting the dictionary data from system
+	tables in Innobase. Deleting a row from SYS_INDEXES table also
+	frees the file segments of the B-tree associated with the index. */
+#if TEMP_INDEX_PREFIX != '\377'
+# error "TEMP_INDEX_PREFIX != '\377'"
+#endif
+	static const char drop_temp_indexes[] =
+		"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
+		"indexid CHAR;\n"
+		"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
+		"WHERE SUBSTR(NAME,0,1)='\377';\n"
+		"BEGIN\n"
+		"\tOPEN c;\n"
+		"\tWHILE 1=1 LOOP\n"
+		"\t\tFETCH c INTO indexid;\n"
+		"\t\tIF (SQL % NOTFOUND) THEN\n"
+		"\t\t\tEXIT;\n"
+		"\t\tEND IF;\n"
+		"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
+		"\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
+		"\tEND LOOP;\n"
+		"\tCLOSE c;\n"
+		"\tCOMMIT WORK;\n"
+		"END;\n";
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "dropping partially created indexes";
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Incomplete transactions may be holding some locks on the
+	data dictionary tables.  However, they should never have been
+	able to lock the records corresponding to the partially
+	created indexes that we are attempting to delete, because the
+	table was locked when the indexes were being created.  We will
+	drop the partially created indexes before the rollback of
+	incomplete transactions is initiated.  Thus, this should not
+	interfere with the incomplete transactions. */
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+	err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
+	ut_a(err == DB_SUCCESS);
+
+	row_mysql_unlock_data_dictionary(trx);
+	trx_free_for_background(trx);
+}
+
+/*************************************************************************
+Create a merge file. */
+static
+void
+row_merge_file_create(
+/*==================*/
+	merge_file_t*	merge_file)	/* out: merge file structure */
+{
+	merge_file->fd = innobase_mysql_tmpfile();
+	merge_file->offset = 0;
+}
+
+/*************************************************************************
+Destroy a merge file. */
+static
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/* in/out: merge file structure */
+{
+	if (merge_file->fd != -1) {
+		close(merge_file->fd);
+		merge_file->fd = -1;
+	}
+}
+
+/*************************************************************************
+Determine the precise type of a column that is added to a temporary
+table; in particular, determine if the column must be constrained
+NOT NULL.
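+
+For example, if the new PRIMARY KEY is (a, b) and column "a" was
+nullable in the old table, then (illustrative sketch; col_a stands
+for the dict_col_t of "a"):
+
+	prtype = row_merge_col_prtype(col_a, "a", index_def);
+	/* == col_a->prtype | DATA_NOT_NULL, because "a" is named in
+	index_def->fields[]; columns outside the PRIMARY KEY keep
+	their prtype unchanged */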
*/ +UNIV_INLINE +ulint +row_merge_col_prtype( +/*=================*/ + /* out: col->prtype, possibly + ORed with DATA_NOT_NULL */ + const dict_col_t* col, /* in: column */ + const char* col_name, /* in: name of the column */ + const merge_index_def_t*index_def) /* in: the index definition + of the primary key */ +{ + ulint prtype = col->prtype; + ulint i; + + ut_ad(index_def->ind_type & DICT_CLUSTERED); + + if (prtype & DATA_NOT_NULL) { + + return(prtype); + } + + /* All columns that are included + in the PRIMARY KEY must be NOT NULL. */ + + for (i = 0; i < index_def->n_fields; i++) { + if (!strcmp(col_name, index_def->fields[i].field_name)) { + return(prtype | DATA_NOT_NULL); + } + } + + return(prtype); +} + +/************************************************************************* +Create a temporary table for creating a primary key, using the definition +of an existing table. */ +UNIV_INTERN +dict_table_t* +row_merge_create_temporary_table( +/*=============================*/ + /* out: table, + or NULL on error */ + const char* table_name, /* in: new table name */ + const merge_index_def_t*index_def, /* in: the index definition + of the primary key */ + const dict_table_t* table, /* in: old table definition */ + trx_t* trx) /* in/out: transaction + (sets error_state) */ +{ + ulint i; + dict_table_t* new_table = NULL; + ulint n_cols = dict_table_get_n_user_cols(table); + ulint error; + mem_heap_t* heap = mem_heap_create(1000); + + ut_ad(table_name); + ut_ad(index_def); + ut_ad(table); + ut_ad(mutex_own(&dict_sys->mutex)); + + new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags); + + for (i = 0; i < n_cols; i++) { + const dict_col_t* col; + const char* col_name; + + col = dict_table_get_nth_col(table, i); + col_name = dict_table_get_col_name(table, i); + + dict_mem_table_add_col(new_table, heap, col_name, col->mtype, + row_merge_col_prtype(col, col_name, + index_def), + col->len); + } + + error = row_create_table_for_mysql(new_table, trx); + mem_heap_free(heap); + + if (error != DB_SUCCESS) { + trx->error_state = error; + new_table = NULL; + } + + return(new_table); +} + +/************************************************************************* +Rename the temporary indexes in the dictionary to permanent ones. The +data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +UNIV_INTERN +ulint +row_merge_rename_indexes( +/*=====================*/ + /* out: DB_SUCCESS if all OK */ + trx_t* trx, /* in/out: transaction */ + dict_table_t* table) /* in/out: table with new indexes */ +{ + ulint err = DB_SUCCESS; + pars_info_t* info = pars_info_create(); + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. 
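+
+For example, an index created under the temporary name "\377idx"
+(TEMP_INDEX_PREFIX followed by the user-visible name) becomes "idx":
+the SQL below strips the first byte with
+SUBSTR(NAME, 1, LENGTH(NAME) - 1), and the dictionary cache entry is
+fixed up by simply advancing the name pointer (index->name++).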
*/ + +#if TEMP_INDEX_PREFIX != '\377' +# error "TEMP_INDEX_PREFIX != '\377'" +#endif + + static const char rename_indexes[] = + "PROCEDURE RENAME_INDEXES_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" + "WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='\377';\n" + "END;\n"; + + ut_ad(table); + ut_ad(trx); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + trx->op_info = "renaming indexes"; + + pars_info_add_dulint_literal(info, "tableid", table->id); + + err = que_eval_sql(info, rename_indexes, FALSE, trx); + + if (err == DB_SUCCESS) { + dict_index_t* index = dict_table_get_first_index(table); + do { + if (*index->name == TEMP_INDEX_PREFIX) { + index->name++; + } + index = dict_table_get_next_index(index); + } while (index); + } + + trx->op_info = ""; + + return(err); +} + +/************************************************************************* +Rename the tables in the data dictionary. The data dictionary must +have been locked exclusively by the caller, because the transaction +will not be committed. */ +UNIV_INTERN +ulint +row_merge_rename_tables( +/*====================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* old_table, /* in/out: old table, renamed to + tmp_name */ + dict_table_t* new_table, /* in/out: new table, renamed to + old_table->name */ + const char* tmp_name, /* in: new name for old_table */ + trx_t* trx) /* in: transaction handle */ +{ + ulint err = DB_ERROR; + pars_info_t* info; + const char* old_name= old_table->name; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(old_table != new_table); + ut_ad(mutex_own(&dict_sys->mutex)); + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + trx->op_info = "renaming tables"; + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data in system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_name", new_table->name); + pars_info_add_str_literal(info, "old_name", old_name); + pars_info_add_str_literal(info, "tmp_name", tmp_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLES () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET NAME = :tmp_name\n" + " WHERE NAME = :old_name;\n" + "UPDATE SYS_TABLES SET NAME = :old_name\n" + " WHERE NAME = :new_name;\n" + "END;\n", FALSE, trx); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + /* The following calls will also rename the .ibd data files if + the tables are stored in a single-table tablespace */ + + if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE) + || !dict_table_rename_in_cache(new_table, old_name, FALSE)) { + + err = DB_ERROR; + goto err_exit; + } + + err = dict_load_foreigns(old_name, TRUE); + + if (err != DB_SUCCESS) { +err_exit: + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; + + return(err); +} + +/************************************************************************* +Create and execute a query graph for creating an index. 
*/
+static
+ulint
+row_merge_create_index_graph(
+/*=========================*/
+					/* out: DB_SUCCESS or error code */
+	trx_t*		trx,		/* in: trx */
+	dict_table_t*	table,		/* in: table */
+	dict_index_t*	index)		/* in: index */
+{
+	ind_node_t*	node;		/* Index creation node */
+	mem_heap_t*	heap;		/* Memory heap */
+	que_thr_t*	thr;		/* Query thread */
+	ulint		err;
+
+	ut_ad(trx);
+	ut_ad(table);
+	ut_ad(index);
+
+	heap = mem_heap_create(512);
+
+	index->table = table;
+	node = ind_create_graph_create(index, heap);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+	return(err);
+}
+
+/*************************************************************************
+Create the index and load it into the data dictionary. */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+					/* out: index, or NULL on error */
+	trx_t*			trx,	/* in/out: trx (sets error_state) */
+	dict_table_t*		table,	/* in: the index is on this table */
+	const merge_index_def_t*	/* in: the index definition */
+				index_def)
+{
+	dict_index_t*	index;
+	ulint		err;
+	ulint		n_fields = index_def->n_fields;
+	ulint		i;
+
+	/* Create the index prototype, using the passed in def, this is not
+	a persistent operation. We pass 0 as the space id, and determine at
+	a lower level the space id where to store the table. */
+
+	index = dict_mem_index_create(table->name, index_def->name,
+				      0, index_def->ind_type, n_fields);
+
+	ut_a(index);
+
+	for (i = 0; i < n_fields; i++) {
+		merge_index_field_t*	ifield = &index_def->fields[i];
+
+		dict_mem_index_add_field(index, ifield->field_name,
+					 ifield->prefix_len);
+	}
+
+	/* Add the index to SYS_INDEXES, using the index prototype. */
+	err = row_merge_create_index_graph(trx, table, index);
+
+	if (err == DB_SUCCESS) {
+
+		index = row_merge_dict_table_get_index(
+			table, index_def);
+
+		ut_a(index);
+
+#ifdef ROW_MERGE_IS_INDEX_USABLE
+		/* Note the id of the transaction that created this
+		index, we use it to restrict readers from accessing
+		this index, to ensure read consistency. */
+		index->trx_id = trx->id;
+#endif /* ROW_MERGE_IS_INDEX_USABLE */
+	} else {
+		index = NULL;
+	}
+
+	return(index);
+}
+
+#ifdef ROW_MERGE_IS_INDEX_USABLE
+/*************************************************************************
+Check if a transaction can use an index. */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/* in: transaction */
+	const dict_index_t*	index)	/* in: index to check */
+{
+	if (!trx->read_view) {
+		return(TRUE);
+	}
+
+	return(ut_dulint_cmp(index->trx_id, trx->read_view->low_limit_id) < 0);
+}
+#endif /* ROW_MERGE_IS_INDEX_USABLE */
+
+/*************************************************************************
+Drop the old table. */
+UNIV_INTERN
+ulint
+row_merge_drop_table(
+/*=================*/
+					/* out: DB_SUCCESS or error code */
+	trx_t*		trx,		/* in: transaction */
+	dict_table_t*	table)		/* in: table to drop */
+{
+	/* There must be no open transactions on the table. */
+	ut_a(table->n_mysql_handles_opened == 0);
+
+	return(row_drop_table_for_mysql(table->name, trx, FALSE));
+}
+
+/*************************************************************************
+Build indexes on a table by reading its clustered index, creating
+temporary files containing the index entries, merge-sorting these
+index entries and inserting the sorted index entries into the indexes.
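+
+The three phases, end to end (simplified sketch; error handling and
+merge file management are omitted here but present in the body below):
+
+	error = row_merge_read_clustered_index(
+		trx, table, old_table, new_table, indexes,
+		merge_files, n_indexes, block);
+
+	for (i = 0; i < n_indexes; i++) {
+		error = row_merge_sort(indexes[i], &merge_files[i],
+				       block, &tmpfd, table);
+		error = row_merge_insert_index_tuples(
+			trx, indexes[i], new_table,
+			dict_table_zip_size(old_table),
+			merge_files[i].fd, block);
+	}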
*/ +UNIV_INTERN +ulint +row_merge_build_indexes( +/*====================*/ + /* out: DB_SUCCESS or error code */ + trx_t* trx, /* in: transaction */ + dict_table_t* old_table, /* in: table where rows are + read from */ + dict_table_t* new_table, /* in: table where indexes are + created; identical to old_table + unless creating a PRIMARY KEY */ + dict_index_t** indexes, /* in: indexes to be created */ + ulint n_indexes, /* in: size of indexes[] */ + TABLE* table) /* in/out: MySQL table, for + reporting erroneous key value + if applicable */ +{ + merge_file_t* merge_files; + row_merge_block_t* block; + ulint block_size; + ulint i; + ulint error; + int tmpfd; + + ut_ad(trx); + ut_ad(old_table); + ut_ad(new_table); + ut_ad(indexes); + ut_ad(n_indexes); + + trx_start_if_not_started(trx); + + /* Allocate memory for merge file data structure and initialize + fields */ + + merge_files = mem_alloc(n_indexes * sizeof *merge_files); + block_size = 3 * sizeof *block; + block = os_mem_alloc_large(&block_size); + + for (i = 0; i < n_indexes; i++) { + + row_merge_file_create(&merge_files[i]); + } + + tmpfd = innobase_mysql_tmpfile(); + + /* Reset the MySQL row buffer that is used when reporting + duplicate keys. */ + innobase_rec_reset(table); + + /* Read clustered index of the table and create files for + secondary index entries for merge sort */ + + error = row_merge_read_clustered_index( + trx, table, old_table, new_table, indexes, + merge_files, n_indexes, block); + + if (error != DB_SUCCESS) { + + goto func_exit; + } + + /* Now we have files containing index entries ready for + sorting and inserting. */ + + for (i = 0; i < n_indexes; i++) { + error = row_merge_sort(indexes[i], &merge_files[i], + block, &tmpfd, table); + + if (error == DB_SUCCESS) { + error = row_merge_insert_index_tuples( + trx, indexes[i], new_table, + dict_table_zip_size(old_table), + merge_files[i].fd, block); + } + + /* Close the temporary file to free up space. */ + row_merge_file_destroy(&merge_files[i]); + + if (error != DB_SUCCESS) { + trx->error_key_num = i; + goto func_exit; + } + } + +func_exit: + close(tmpfd); + + for (i = 0; i < n_indexes; i++) { + row_merge_file_destroy(&merge_files[i]); + } + + mem_free(merge_files); + os_mem_free_large(block, block_size); + + return(error); +} diff --git a/storage/xtradb/row/row0mysql.c b/storage/xtradb/row/row0mysql.c new file mode 100644 index 00000000000..8204285cc38 --- /dev/null +++ b/storage/xtradb/row/row0mysql.c @@ -0,0 +1,4209 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Interface between Innobase row operations and MySQL. +Contains also create table and other data dictionary operations. 
+ +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0mysql.ic" +#endif + +#include "row0ins.h" +#include "row0sel.h" +#include "row0upd.h" +#include "row0row.h" +#include "que0que.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "dict0crea.h" +#include "dict0load.h" +#include "dict0boot.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "trx0undo.h" +#include "lock0lock.h" +#include "rem0cmp.h" +#include "log0log.h" +#include "btr0sea.h" +#include "fil0fil.h" +#include "ibuf0ibuf.h" + +/* Provide optional 4.x backwards compatibility for 5.0 and above */ +UNIV_INTERN ibool row_rollback_on_timeout = FALSE; + +/* List of tables we should drop in background. ALTER TABLE in MySQL requires +that the table handler can drop the table in background when there are no +queries to it any more. Protected by the kernel mutex. */ +typedef struct row_mysql_drop_struct row_mysql_drop_t; +struct row_mysql_drop_struct{ + char* table_name; + UT_LIST_NODE_T(row_mysql_drop_t) row_mysql_drop_list; +}; + +static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list; +static ibool row_mysql_drop_list_inited = FALSE; + +/* Magic table names for invoking various monitor threads */ +static const char S_innodb_monitor[] = "innodb_monitor"; +static const char S_innodb_lock_monitor[] = "innodb_lock_monitor"; +static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor"; +static const char S_innodb_table_monitor[] = "innodb_table_monitor"; +static const char S_innodb_mem_validate[] = "innodb_mem_validate"; + +/* Evaluates to true if str1 equals str2_onstack, used for comparing +the above strings. */ +#define STR_EQ(str1, str1_len, str2_onstack) \ + ((str1_len) == sizeof(str2_onstack) \ + && memcmp(str1, str2_onstack, sizeof(str2_onstack)) == 0) + +#ifndef UNIV_HOTBACKUP +/*********************************************************************** +Determine if the given name is a name reserved for MySQL system tables. */ +static +ibool +row_mysql_is_system_table( +/*======================*/ + /* out: TRUE if name is a MySQL + system table name */ + const char* name) +{ + if (strncmp(name, "mysql/", 6) != 0) { + + return(FALSE); + } + + return(0 == strcmp(name + 6, "host") + || 0 == strcmp(name + 6, "user") + || 0 == strcmp(name + 6, "db")); +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************************* +If a table is not yet in the drop list, adds the table to the list of tables +which the master thread drops in background. We need this on Unix because in +ALTER TABLE MySQL may call drop table even if the table has running queries on +it. Also, if there are running foreign key checks on the table, we drop the +table lazily. */ +static +ibool +row_add_table_to_background_drop_list( +/*==================================*/ + /* out: TRUE if the table was not yet in the + drop list, and was added there */ + const char* name); /* in: table name */ + +/*********************************************************************** +Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */ +static +void +row_mysql_delay_if_needed(void) +/*===========================*/ +{ + if (srv_dml_needed_delay) { + os_thread_sleep(srv_dml_needed_delay); + } +} + +/*********************************************************************** +Frees the blob heap in prebuilt when no longer needed. 
*/ +UNIV_INTERN +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct of a + ha_innobase:: table handle */ +{ + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; +} + +/*********************************************************************** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. */ +UNIV_INTERN +byte* +row_mysql_store_true_var_len( +/*=========================*/ + /* out: pointer to the data, we skip the 1 or 2 bytes + at the start that are used to store the len */ + byte* dest, /* in: where to store */ + ulint len, /* in: length, must fit in two bytes */ + ulint lenlen) /* in: storage length of len: either 1 or 2 bytes */ +{ + if (lenlen == 2) { + ut_a(len < 256 * 256); + + mach_write_to_2_little_endian(dest, len); + + return(dest + 2); + } + + ut_a(lenlen == 1); + ut_a(len < 256); + + mach_write_to_1(dest, len); + + return(dest + 1); +} + +/*********************************************************************** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. */ +UNIV_INTERN +const byte* +row_mysql_read_true_varchar( +/*========================*/ + /* out: pointer to the data, we skip + the 1 or 2 bytes at the start that are + used to store the len */ + ulint* len, /* out: variable-length field length */ + const byte* field, /* in: field in the MySQL format */ + ulint lenlen) /* in: storage length of len: either 1 + or 2 bytes */ +{ + if (lenlen == 2) { + *len = mach_read_from_2_little_endian(field); + + return(field + 2); + } + + ut_a(lenlen == 1); + + *len = mach_read_from_1(field); + + return(field + 1); +} + +/*********************************************************************** +Stores a reference to a BLOB in the MySQL format. */ +UNIV_INTERN +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /* in: where to store */ + ulint col_len,/* in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /* in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len) /* in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +{ + /* MySQL might assume the field is set to zero except the length and + the pointer fields */ + + memset(dest, '\0', col_len); + + /* In dest there are 1 - 4 bytes reserved for the BLOB length, + and after that 8 bytes reserved for the pointer to the data. + In 32-bit architectures we only use the first 4 bytes of the pointer + slot. */ + + ut_a(col_len - 8 > 1 || len < 256); + ut_a(col_len - 8 > 2 || len < 256 * 256); + ut_a(col_len - 8 > 3 || len < 256 * 256 * 256); + + mach_write_to_n_little_endian(dest, col_len - 8, len); + + memcpy(dest + col_len - 8, &data, sizeof data); +} + +/*********************************************************************** +Reads a reference to a BLOB in the MySQL format. 
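+
+For example, with col_len = 12 the reference layout is (values
+illustrative):
+
+	bytes 0..3	BLOB data length, little-endian (col_len - 8)
+	bytes 4..11	pointer to the data, copied in with memcpy
+
+so reading it back amounts to (sketch):
+
+	ulint		len;
+	const byte*	data = row_mysql_read_blob_ref(&len, ref, 12);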
*/ +UNIV_INTERN +const byte* +row_mysql_read_blob_ref( +/*====================*/ + /* out: pointer to BLOB data */ + ulint* len, /* out: BLOB length */ + const byte* ref, /* in: BLOB reference in the + MySQL format */ + ulint col_len) /* in: BLOB reference length + (not BLOB length) */ +{ + byte* data; + + *len = mach_read_from_n_little_endian(ref, col_len - 8); + + memcpy(&data, ref + col_len - 8, sizeof data); + + return(data); +} + +/****************************************************************** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.c. */ +UNIV_INTERN +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + /* out: up to which byte we used + buf in the conversion */ + dfield_t* dfield, /* in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /* in/out: buffer for a converted + integer value; this must be at least + col_len long then! */ + ibool row_format_col, /* TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /* in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /* in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp) /* in: nonzero=compact format */ +{ + const byte* ptr = mysql_data; + const dtype_t* dtype; + ulint type; + ulint lenlen; + + dtype = dfield_get_type(dfield); + + type = dtype->mtype; + + if (type == DATA_INT) { + /* Store integer data in Innobase in a big-endian format, + sign bit negated if the data is a signed integer. In MySQL, + integers are stored in a little-endian format. */ + + byte* p = buf + col_len; + + for (;;) { + p--; + *p = *mysql_data; + if (p == buf) { + break; + } + mysql_data++; + } + + if (!(dtype->prtype & DATA_UNSIGNED)) { + + *buf ^= 128; + } + + ptr = buf; + buf += col_len; + } else if ((type == DATA_VARCHAR + || type == DATA_VARMYSQL + || type == DATA_BINARY)) { + + if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) { + /* The length of the actual data is stored to 1 or 2 + bytes at the start of the field */ + + if (row_format_col) { + if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) { + lenlen = 2; + } else { + lenlen = 1; + } + } else { + /* In a MySQL key value, lenlen is always 2 */ + lenlen = 2; + } + + ptr = row_mysql_read_true_varchar(&col_len, mysql_data, + lenlen); + } else { + /* Remove trailing spaces from old style VARCHAR + columns. */ + + /* Handle UCS2 strings differently. */ + ulint mbminlen = dtype_get_mbminlen(dtype); + + ptr = mysql_data; + + if (mbminlen == 2) { + /* space=0x0020 */ + /* Trim "half-chars", just in case. 
*/ + col_len &= ~1; + + while (col_len >= 2 && ptr[col_len - 2] == 0x00 + && ptr[col_len - 1] == 0x20) { + col_len -= 2; + } + } else { + ut_a(mbminlen == 1); + /* space=0x20 */ + while (col_len > 0 + && ptr[col_len - 1] == 0x20) { + col_len--; + } + } + } + } else if (comp && type == DATA_MYSQL + && dtype_get_mbminlen(dtype) == 1 + && dtype_get_mbmaxlen(dtype) > 1) { + /* In some cases we strip trailing spaces from UTF-8 and other + multibyte charsets, from FIXED-length CHAR columns, to save + space. UTF-8 would otherwise normally use 3 * the string length + bytes to store an ASCII string! */ + + /* We assume that this CHAR field is encoded in a + variable-length character set where spaces have + 1:1 correspondence to 0x20 bytes, such as UTF-8. + + Consider a CHAR(n) field, a field of n characters. + It will contain between n * mbminlen and n * mbmaxlen bytes. + We will try to truncate it to n bytes by stripping + space padding. If the field contains single-byte + characters only, it will be truncated to n characters. + Consider a CHAR(5) field containing the string ".a " + where "." denotes a 3-byte character represented by + the bytes "$%&". After our stripping, the string will + be stored as "$%&a " (5 bytes). The string ".abc " + will be stored as "$%&abc" (6 bytes). + + The space padding will be restored in row0sel.c, function + row_sel_field_store_in_mysql_format(). */ + + ulint n_chars; + + ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype))); + + n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype); + + /* Strip space padding. */ + while (col_len > n_chars && ptr[col_len - 1] == 0x20) { + col_len--; + } + } else if (type == DATA_BLOB && row_format_col) { + + ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len); + } + + dfield_set_data(dfield, ptr, col_len); + + return(buf); +} + +/****************************************************************** +Convert a row in the MySQL format to a row in the Innobase format. Note that +the function to convert a MySQL format key value to an InnoDB dtuple is +row_sel_convert_mysql_key_to_innobase() in row0sel.c. */ +static +void +row_mysql_convert_row_to_innobase( +/*==============================*/ + dtuple_t* row, /* in/out: Innobase row where the + field type information is already + copied there! */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct where template + must be of type ROW_MYSQL_WHOLE_ROW */ + byte* mysql_rec) /* in: row in the MySQL format; + NOTE: do not discard as long as + row is used, as row may contain + pointers to this record! */ +{ + mysql_row_templ_t* templ; + dfield_t* dfield; + ulint i; + + ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); + ut_ad(prebuilt->mysql_template); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + dfield = dtuple_get_nth_field(row, i); + + if (templ->mysql_null_bit_mask != 0) { + /* Column may be SQL NULL */ + + if (mysql_rec[templ->mysql_null_byte_offset] + & (byte) (templ->mysql_null_bit_mask)) { + + /* It is SQL NULL */ + + dfield_set_null(dfield); + + goto next_column; + } + } + + row_mysql_store_col_in_innobase_format( + dfield, + prebuilt->ins_upd_rec_buff + templ->mysql_col_offset, + TRUE, /* MySQL row format data */ + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, + dict_table_is_comp(prebuilt->table)); +next_column: + ; + } +} + +/******************************************************************** +Handles user errors and lock waits detected by the database engine. 
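+
+Callers throughout this file follow a common retry pattern; a minimal
+sketch (do_step() is a placeholder for e.g. row_ins_step() or
+lock_table()):
+
+run_again:
+	do_step(thr);
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+			/* it was a lock wait: retry the step */
+			goto run_again;
+		}
+	}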
*/ +UNIV_INTERN +ibool +row_mysql_handle_errors( +/*====================*/ + /* out: TRUE if it was a lock wait and + we should continue running the query thread */ + ulint* new_err,/* out: possible new error encountered in + lock wait, or if no new error, the value + of trx->error_state at the entry of this + function */ + trx_t* trx, /* in: transaction */ + que_thr_t* thr, /* in: query thread */ + trx_savept_t* savept) /* in: savepoint or NULL */ +{ +#ifndef UNIV_HOTBACKUP + ulint err; + +handle_new_error: + err = trx->error_state; + + ut_a(err != DB_SUCCESS); + + trx->error_state = DB_SUCCESS; + + switch (err) { + case DB_LOCK_WAIT_TIMEOUT: + if (row_rollback_on_timeout) { + trx_general_rollback_for_mysql(trx, FALSE, NULL); + break; + } + /* fall through */ + case DB_DUPLICATE_KEY: + case DB_FOREIGN_DUPLICATE_KEY: + case DB_TOO_BIG_RECORD: + case DB_ROW_IS_REFERENCED: + case DB_NO_REFERENCED_ROW: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_TOO_MANY_CONCURRENT_TRXS: + case DB_OUT_OF_FILE_SPACE: + if (savept) { + /* Roll back the latest, possibly incomplete + insertion or update */ + + trx_general_rollback_for_mysql(trx, TRUE, savept); + } + /* MySQL will roll back the latest SQL statement */ + break; + case DB_LOCK_WAIT: + srv_suspend_mysql_thread(thr); + + if (trx->error_state != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + goto handle_new_error; + } + + *new_err = err; + + return(TRUE); + + case DB_DEADLOCK: + case DB_LOCK_TABLE_FULL: + /* Roll back the whole transaction; this resolution was added + to version 3.23.43 */ + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + break; + + case DB_MUST_GET_MORE_FILE_SPACE: + fputs("InnoDB: The database cannot continue" + " operation because of\n" + "InnoDB: lack of space. You must add" + " a new data file to\n" + "InnoDB: my.cnf and restart the database.\n", stderr); + + exit(1); + + case DB_CORRUPTION: + fputs("InnoDB: We detected index corruption" + " in an InnoDB type table.\n" + "InnoDB: You have to dump + drop + reimport" + " the table or, in\n" + "InnoDB: a case of widespread corruption," + " dump all InnoDB\n" + "InnoDB: tables and recreate the" + " whole InnoDB tablespace.\n" + "InnoDB: If the mysqld server crashes" + " after the startup or when\n" + "InnoDB: you dump the tables, look at\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "forcing-recovery.html" + " for help.\n", stderr); + break; + default: + fprintf(stderr, "InnoDB: unknown error code %lu\n", + (ulong) err); + ut_error; + } + + if (trx->error_state != DB_SUCCESS) { + *new_err = trx->error_state; + } else { + *new_err = err; + } + + trx->error_state = DB_SUCCESS; + + return(FALSE); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; + return(FALSE); +#endif /* UNIV_HOTBACKUP */ +} + +/************************************************************************ +Create a prebuilt struct for a MySQL table handle. 
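+
+Lifecycle sketch (the handler code shown is hypothetical):
+
+	row_prebuilt_t*	prebuilt = row_create_prebuilt(ib_table);
+
+	/* ... use prebuilt for inserts, selects and updates ... */
+
+	row_prebuilt_free(prebuilt, FALSE);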
*/ +UNIV_INTERN +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + /* out, own: a prebuilt struct */ + dict_table_t* table) /* in: Innobase table handle */ +{ + row_prebuilt_t* prebuilt; + mem_heap_t* heap; + dict_index_t* clust_index; + dtuple_t* ref; + ulint ref_len; + + heap = mem_heap_create(sizeof *prebuilt + 128); + + prebuilt = mem_heap_zalloc(heap, sizeof *prebuilt); + + prebuilt->magic_n = ROW_PREBUILT_ALLOCATED; + prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED; + + prebuilt->table = table; + + prebuilt->sql_stat_start = TRUE; + prebuilt->heap = heap; + + prebuilt->pcur = btr_pcur_create_for_mysql(); + prebuilt->clust_pcur = btr_pcur_create_for_mysql(); + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = 99999999; + + prebuilt->search_tuple = dtuple_create( + heap, 2 * dict_table_get_n_cols(table)); + + clust_index = dict_table_get_first_index(table); + + /* Make sure that search_tuple is long enough for clustered index */ + ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + prebuilt->clust_ref = ref; + + prebuilt->autoinc_error = 0; + prebuilt->autoinc_offset = 0; + + /* Default to 1, we will set the actual value later in + ha_innobase::get_auto_increment(). */ + prebuilt->autoinc_increment = 1; + + prebuilt->autoinc_last_value = 0; + + return(prebuilt); +} + +/************************************************************************ +Free a prebuilt struct for a MySQL table handle. */ +UNIV_INTERN +void +row_prebuilt_free( +/*==============*/ + row_prebuilt_t* prebuilt, /* in, own: prebuilt struct */ + ibool dict_locked) /* in: TRUE=data dictionary locked */ +{ + ulint i; + + if (UNIV_UNLIKELY + (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED + || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED)) { + + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. 
Magic n %lu," + " magic n2 %lu, table name ", + (ulong) prebuilt->magic_n, + (ulong) prebuilt->magic_n2); + ut_print_name(stderr, NULL, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + prebuilt->magic_n = ROW_PREBUILT_FREED; + prebuilt->magic_n2 = ROW_PREBUILT_FREED; + + btr_pcur_free_for_mysql(prebuilt->pcur); + btr_pcur_free_for_mysql(prebuilt->clust_pcur); + + if (prebuilt->mysql_template) { + mem_free(prebuilt->mysql_template); + } + + if (prebuilt->ins_graph) { + que_graph_free_recursive(prebuilt->ins_graph); + } + + if (prebuilt->sel_graph) { + que_graph_free_recursive(prebuilt->sel_graph); + } + + if (prebuilt->upd_graph) { + que_graph_free_recursive(prebuilt->upd_graph); + } + + if (prebuilt->blob_heap) { + mem_heap_free(prebuilt->blob_heap); + } + + if (prebuilt->old_vers_heap) { + mem_heap_free(prebuilt->old_vers_heap); + } + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + if (prebuilt->fetch_cache[i] != NULL) { + + if ((ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4( + (prebuilt->fetch_cache[i]) - 4)) + || (ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4( + (prebuilt->fetch_cache[i]) + + prebuilt->mysql_row_len))) { + fputs("InnoDB: Error: trying to free" + " a corrupt fetch buffer.\n", stderr); + + mem_analyze_corruption( + prebuilt->fetch_cache[i]); + + ut_error; + } + + mem_free((prebuilt->fetch_cache[i]) - 4); + } + } + + dict_table_decrement_handle_count(prebuilt->table, dict_locked); + + mem_heap_free(prebuilt->heap); +} + +/************************************************************************* +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +UNIV_INTERN +void +row_update_prebuilt_trx( +/*====================*/ + /* out: prebuilt dtuple */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + trx_t* trx) /* in: transaction handle */ +{ + if (trx->magic_n != TRX_MAGIC_N) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: trx handle. Magic n %lu\n", + (ulong) trx->magic_n); + + mem_analyze_corruption(trx); + + ut_error; + } + + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + fprintf(stderr, + "InnoDB: Error: trying to use a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + prebuilt->trx = trx; + + if (prebuilt->ins_graph) { + prebuilt->ins_graph->trx = trx; + } + + if (prebuilt->upd_graph) { + prebuilt->upd_graph->trx = trx; + } + + if (prebuilt->sel_graph) { + prebuilt->sel_graph->trx = trx; + } +} + +/************************************************************************* +Gets pointer to a prebuilt dtuple used in insertions. If the insert graph +has not yet been built in the prebuilt struct, then this function first +builds it. 
*/ +static +dtuple_t* +row_get_prebuilt_insert_row( +/*========================*/ + /* out: prebuilt dtuple; the column + type information is also set in it */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + ins_node_t* node; + dtuple_t* row; + dict_table_t* table = prebuilt->table; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->ins_node == NULL) { + + /* Not called before for this handle: create an insert node + and query graph to the prebuilt struct */ + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == NULL) { + prebuilt->ins_upd_rec_buff = mem_heap_alloc( + prebuilt->heap, prebuilt->mysql_row_len); + } + + row = dtuple_create(prebuilt->heap, + dict_table_get_n_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + + prebuilt->ins_graph = que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->ins_node->row); +} + +/************************************************************************* +Updates the table modification counter and calculates new estimates +for table and index statistics if necessary. */ +UNIV_INLINE +void +row_update_statistics_if_needed( +/*============================*/ + dict_table_t* table) /* in: table */ +{ + ulint counter; + + counter = table->stat_modified_counter; + + table->stat_modified_counter = counter + 1; + + /* Calculate new statistics if 1 / 16 of table has been modified + since the last time a statistics batch was run, or if + stat_modified_counter > 2 000 000 000 (to avoid wrap-around). + We calculate statistics at most every 16th round, since we may have + a counter table which is very small and updated very often. */ + + if (counter > 2000000000 + || ((ib_int64_t)counter > 16 + table->stat_n_rows / 16)) { + + dict_update_statistics(table); + } +} + +/************************************************************************* +Unlocks AUTO_INC type locks that were possibly reserved by a trx. */ +UNIV_INTERN +void +row_unlock_table_autoinc_for_mysql( +/*===============================*/ + trx_t* trx) /* in/out: transaction */ +{ + mutex_enter(&kernel_mutex); + + lock_release_autoinc_locks(trx); + + mutex_exit(&kernel_mutex); +} + +/************************************************************************* +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. */ +UNIV_INTERN +int +row_lock_table_autoinc_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in the MySQL + table handle */ +{ + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + const dict_table_t* table = prebuilt->table; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + /* If we already hold an AUTOINC lock on the table then do nothing. + Note: We peek at the value of the current owner without acquiring + the kernel mutex. 
**/ + if (trx == table->autoinc_trx) { + + return(DB_SUCCESS); + } + + trx->op_info = "setting auto-inc lock"; + + if (node == NULL) { + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + } + + /* We use the insert query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(trx); + + err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Sets a table lock on the table mentioned in prebuilt. */ +UNIV_INTERN +int +row_lock_table_for_mysql( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in the MySQL + table handle */ + dict_table_t* table, /* in: table to lock, or NULL + if prebuilt->table should be + locked as + prebuilt->select_lock_type */ + ulint mode) /* in: lock mode of table + (ignored if table==NULL) */ +{ + trx_t* trx = prebuilt->trx; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "setting table lock"; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + + /* We use the select query graph as the dummy graph needed + in the lock module call */ + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(trx); + + if (table) { + err = lock_table(0, table, mode, thr); + } else { + err = lock_table(0, prebuilt->table, + prebuilt->select_lock_type, thr); + } + + trx->error_state = err; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Does an insert for MySQL. 
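+
+Called from the handler layer roughly as (sketch):
+
+	error = row_insert_for_mysql((byte*) record, prebuilt);
+
+where record is the row in the MySQL format; on DB_SUCCESS the row
+has been inserted and the table statistics have been bumped.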
*/ +UNIV_INTERN +int +row_insert_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: row in the MySQL format */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + que_thr_t* thr; + ulint err; + ibool was_lock_wait; + trx_t* trx = prebuilt->trx; + ins_node_t* node = prebuilt->ins_node; + + ut_ad(trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you" + " used DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that" + " newraw is replaced\n" + "InnoDB: with raw, and innodb_force_... is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "inserting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started(trx); + + if (node == NULL) { + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; + } + + row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec); + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->ins_graph); + + if (prebuilt->sql_stat_start) { + node->state = INS_NODE_SET_IX_LOCK; + prebuilt->sql_stat_start = FALSE; + } else { + node->state = INS_NODE_ALLOC_ROW_ID; + } + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_ins_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + /* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW; + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + prebuilt->table->stat_n_rows++; + + srv_n_rows_inserted++; + + if (prebuilt->table->stat_n_rows == 0) { + /* Avoid wrap-over */ + prebuilt->table->stat_n_rows--; + } + + row_update_statistics_if_needed(prebuilt->table); + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Builds a dummy query graph used in selects. 
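+The graph never executes a real SELECT: it only supplies the dummy +query thread that the lock module calls require, as in +row_lock_table_for_mysql() above.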
*/ +UNIV_INTERN +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + sel_node_t* node; + + ut_ad(prebuilt && prebuilt->trx); + + if (prebuilt->sel_graph == NULL) { + + node = sel_node_create(prebuilt->heap); + + prebuilt->sel_graph = que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + + prebuilt->sel_graph->state = QUE_FORK_ACTIVE; + } +} + +/************************************************************************* +Creates a query graph node of 'update' type to be used in the MySQL +interface. */ +UNIV_INTERN +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + /* out, own: update node */ + dict_table_t* table, /* in: table to update */ + mem_heap_t* heap) /* in: mem heap from which allocated */ +{ + upd_node_t* node; + + node = upd_node_create(heap); + + node->in_mysql_interface = TRUE; + node->is_delete = FALSE; + node->searched_update = FALSE; + node->select = NULL; + node->pcur = btr_pcur_create_for_mysql(); + node->table = table; + + node->update = upd_create(dict_table_get_n_cols(table), heap); + + node->update_n_fields = dict_table_get_n_cols(table); + + UT_LIST_INIT(node->columns); + node->has_clust_rec_x_lock = TRUE; + node->cmpl_info = 0; + + node->table_sym = NULL; + node->col_assign_list = NULL; + + return(node); +} + +/************************************************************************* +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. */ +UNIV_INTERN +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + /* out: prebuilt update vector */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + upd_node_t* node; + + ut_ad(prebuilt && table && prebuilt->trx); + + if (prebuilt->upd_node == NULL) { + + /* Not called before for this handle: create an update node + and a query graph, and store them in the prebuilt struct */ + + node = row_create_update_node_for_mysql(table, prebuilt->heap); + + prebuilt->upd_node = node; + + prebuilt->upd_graph = que_node_get_parent( + pars_complete_graph_for_exec(node, + prebuilt->trx, + prebuilt->heap)); + prebuilt->upd_graph->state = QUE_FORK_ACTIVE; + } + + return(prebuilt->upd_node->update); +} + +/************************************************************************* +Does an update or delete of a row for MySQL.
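+The row to be updated or deleted must already have been located: the +function does not search by mysql_rec, but reuses the cursor position +saved in prebuilt->pcur or prebuilt->clust_pcur by the preceding +search.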
*/ +UNIV_INTERN +int +row_update_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + byte* mysql_rec, /* in: the row to be updated, in + the MySQL format */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + trx_savept_t savept; + ulint err; + que_thr_t* thr; + ibool was_lock_wait; + dict_index_t* clust_index; + /* ulint ref_len; */ + upd_node_t* node; + dict_table_t* table = prebuilt->table; + trx_t* trx = prebuilt->trx; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + UT_NOT_USED(mysql_rec); + + if (prebuilt->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you" + " used DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + return(DB_ERROR); + } + + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + + if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw" + " is replaced\n" + "InnoDB: with raw, and innodb_force_... 
is removed.\n", + stderr); + + return(DB_ERROR); + } + + trx->op_info = "updating or deleting"; + + row_mysql_delay_if_needed(); + + trx_start_if_not_started(trx); + + node = prebuilt->upd_node; + + clust_index = dict_table_get_first_index(table); + + if (prebuilt->pcur->btr_cur.index == clust_index) { + btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur); + } else { + btr_pcur_copy_stored_position(node->pcur, + prebuilt->clust_pcur); + } + + ut_a(node->pcur->rel_pos == BTR_PCUR_ON); + + /* MySQL seems to call rnd_pos before updating each row it + has cached: we can get the correct cursor position from + prebuilt->pcur; NOTE that we cannot build the row reference + from mysql_rec if the clustered index was automatically + generated for the table: MySQL does not know anything about + the row id used as the clustered index key */ + + savept = trx_savept_take(trx); + + thr = que_fork_get_first_thr(prebuilt->upd_graph); + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + ut_ad(!prebuilt->sql_stat_start); + + que_thr_move_to_run_state_for_mysql(thr, trx); + +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + que_thr_stop_for_mysql(thr); + + if (err == DB_RECORD_NOT_FOUND) { + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + + return((int) err); + } + + thr->lock_state= QUE_THR_LOCK_ROW; + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, + &savept); + thr->lock_state= QUE_THR_LOCK_NOLOCK; + + if (was_lock_wait) { + goto run_again; + } + + trx->op_info = ""; + + return((int) err); + } + + que_thr_stop_for_mysql_no_error(thr, trx); + + if (node->is_delete) { + if (prebuilt->table->stat_n_rows > 0) { + prebuilt->table->stat_n_rows--; + } + + srv_n_rows_deleted++; + } else { + srv_n_rows_updated++; + } + + row_update_statistics_if_needed(prebuilt->table); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +This can only be used when srv_locks_unsafe_for_binlog is TRUE or +this session is using a READ COMMITTED isolation level. Before +calling this function we must use trx_reset_new_rec_lock_info() and +trx_register_new_rec_lock() to store the information which new record locks +really were set. This function removes a newly set lock under prebuilt->pcur, +and also under prebuilt->clust_pcur. Currently, this is only used and tested +in the case of an UPDATE or a DELETE statement, where the row lock is of the +LOCK_X type. +Thus, this implements a 'mini-rollback' that releases the latest record +locks we set. */ +UNIV_INTERN +int +row_unlock_for_mysql( +/*=================*/ + /* out: error code or DB_SUCCESS */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL + handle */ + ibool has_latches_on_recs)/* TRUE if called so that we have + the latches on the records under pcur + and clust_pcur, and we do not need to + reposition the cursors. 
*/ +{ + dict_index_t* index; + btr_pcur_t* pcur = prebuilt->pcur; + btr_pcur_t* clust_pcur = prebuilt->clust_pcur; + trx_t* trx = prebuilt->trx; + rec_t* rec; + mtr_t mtr; + + ut_ad(prebuilt && trx); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (UNIV_UNLIKELY + (!srv_locks_unsafe_for_binlog + && trx->isolation_level != TRX_ISO_READ_COMMITTED)) { + + fprintf(stderr, + "InnoDB: Error: calling row_unlock_for_mysql though\n" + "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n" + "InnoDB: this session is not using" + " READ COMMITTED isolation level.\n"); + + return(DB_SUCCESS); + } + + trx->op_info = "unlock_row"; + + index = btr_pcur_get_btr_cur(pcur)->index; + + if (index != NULL && trx_new_rec_locks_contain(trx, index)) { + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr); + } + + rec = btr_pcur_get_rec(pcur); + + lock_rec_unlock(trx, btr_pcur_get_block(pcur), + rec, prebuilt->select_lock_type); + + mtr_commit(&mtr); + + /* If the search was done through the clustered index, then + we have not used clust_pcur at all, and we must NOT try to + reset locks on clust_pcur. The values in clust_pcur may be + garbage! */ + + if (dict_index_is_clust(index)) { + + goto func_exit; + } + } + + index = btr_pcur_get_btr_cur(clust_pcur)->index; + + if (index != NULL && trx_new_rec_locks_contain(trx, index)) { + + mtr_start(&mtr); + + /* Restore the cursor position and find the record */ + + if (!has_latches_on_recs) { + btr_pcur_restore_position(BTR_SEARCH_LEAF, clust_pcur, + &mtr); + } + + rec = btr_pcur_get_rec(clust_pcur); + + lock_rec_unlock(trx, btr_pcur_get_block(clust_pcur), + rec, prebuilt->select_lock_type); + + mtr_commit(&mtr); + } + +func_exit: + trx->op_info = ""; + + return(DB_SUCCESS); +} + +/************************************************************************** +Does a cascaded delete or set null in a foreign key operation. */ +UNIV_INTERN +ulint +row_update_cascade_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + que_thr_t* thr, /* in: query thread */ + upd_node_t* node, /* in: update node used in the cascade + or set null operation */ + dict_table_t* table) /* in: table where we do the operation */ +{ + ulint err; + trx_t* trx; + + trx = thr_get_trx(thr); +run_again: + thr->run_node = node; + thr->prev_node = node; + + row_upd_step(thr); + + err = trx->error_state; + + /* Note that the cascade node is a subnode of another InnoDB + query graph node. We do a normal lock wait in this node, but + all errors are handled by the parent node. */ + + if (err == DB_LOCK_WAIT) { + /* Handle lock wait here */ + + que_thr_stop_for_mysql(thr); + + srv_suspend_mysql_thread(thr); + + /* Note that a lock wait may also end in a lock wait timeout, + or this transaction is picked as a victim in selective + deadlock resolution */ + + if (trx->error_state != DB_SUCCESS) { + + return(trx->error_state); + } + + /* Retry operation after a normal lock wait */ + + goto run_again; + } + + if (err != DB_SUCCESS) { + + return(err); + } + + if (node->is_delete) { + if (table->stat_n_rows > 0) { + table->stat_n_rows--; + } + + srv_n_rows_deleted++; + } else { + srv_n_rows_updated++; + } + + row_update_statistics_if_needed(table); + + return(err); +} + +/************************************************************************* +Checks if a table is such that we automatically created a clustered +index on it (on row id). 
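+Such a generated index is not visible to MySQL, so it must be skipped +when mapping an InnoDB index position to a MySQL key number; see +row_get_mysql_key_number_for_index() below, which decrements the +position by one in that case, so that e.g. the first secondary index +gets MySQL key number 0.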
*/ +UNIV_INTERN +ibool +row_table_got_default_clust_index( +/*==============================*/ + const dict_table_t* table) +{ + const dict_index_t* clust_index; + + clust_index = dict_table_get_first_index(table); + + return(dict_index_get_nth_col(clust_index, 0)->mtype == DATA_SYS); +} + +/************************************************************************* +Calculates the key number used inside MySQL for an Innobase index. We have +to take into account if we generated a default clustered index for the table */ +UNIV_INTERN +ulint +row_get_mysql_key_number_for_index( +/*===============================*/ + const dict_index_t* index) +{ + const dict_index_t* ind; + ulint i; + + ut_a(index); + + i = 0; + ind = dict_table_get_first_index(index->table); + + while (index != ind) { + ind = dict_table_get_next_index(ind); + i++; + } + + if (row_table_got_default_clust_index(index->table)) { + ut_a(i > 0); + i--; + } + + return(i); +} + +/************************************************************************* +Locks the data dictionary in shared mode from modifications, for performing +foreign key check, rollback, or other operation invisible to MySQL. */ +UNIV_INTERN +void +row_mysql_freeze_data_dictionary_func( +/*==================================*/ + trx_t* trx, /* in/out: transaction */ + const char* file, /* in: file name */ + ulint line) /* in: line number */ +{ + ut_a(trx->dict_operation_lock_mode == 0); + + rw_lock_s_lock_func(&dict_operation_lock, 0, file, line); + + trx->dict_operation_lock_mode = RW_S_LATCH; +} + +/************************************************************************* +Unlocks the data dictionary shared lock. */ +UNIV_INTERN +void +row_mysql_unfreeze_data_dictionary( +/*===============================*/ + trx_t* trx) /* in/out: transaction */ +{ + ut_a(trx->dict_operation_lock_mode == RW_S_LATCH); + + rw_lock_s_unlock(&dict_operation_lock); + + trx->dict_operation_lock_mode = 0; +} + +/************************************************************************* +Locks the data dictionary exclusively for performing a table create or other +data dictionary modification operation. */ +UNIV_INTERN +void +row_mysql_lock_data_dictionary_func( +/*================================*/ + trx_t* trx, /* in/out: transaction */ + const char* file, /* in: file name */ + ulint line) /* in: line number */ +{ + ut_a(trx->dict_operation_lock_mode == 0 + || trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks or lock waits can occur then in these operations */ + + rw_lock_x_lock_func(&dict_operation_lock, 0, file, line); + trx->dict_operation_lock_mode = RW_X_LATCH; + + mutex_enter(&(dict_sys->mutex)); +} + +/************************************************************************* +Unlocks the data dictionary exclusive lock. */ +UNIV_INTERN +void +row_mysql_unlock_data_dictionary( +/*=============================*/ + trx_t* trx) /* in/out: transaction */ +{ + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + mutex_exit(&(dict_sys->mutex)); + rw_lock_x_unlock(&dict_operation_lock); + + trx->dict_operation_lock_mode = 0; +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************* +Creates a table for MySQL. 
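+The definition is executed through a dictionary create graph; on +failure the transaction is rolled back and the passed-in table +definition is freed.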
If the name of the table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also start the printing of monitor +output by the master thread. If the table name ends in "innodb_mem_validate", +InnoDB will try to invoke mem_validate(). */ +UNIV_INTERN +int +row_create_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in, own: table definition + (will be freed) */ + trx_t* trx) /* in: transaction handle */ +{ + tab_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + const char* table_name; + ulint table_name_len; + ulint err; + ulint i; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); +err_exit: + dict_mem_table_free(table); + trx_commit_for_mysql(trx); + + return(DB_ERROR); + } + + trx->op_info = "creating table"; + + if (row_mysql_is_system_table(table->name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL system" + " table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be" + " of the MyISAM type!\n", + table->name); + goto err_exit; + } + + /* Check that no reserved column names are used. */ + for (i = 0; i < dict_table_get_n_user_cols(table); i++) { + if (dict_col_name_is_reserved( + dict_table_get_col_name(table, i))) { + + goto err_exit; + } + } + + trx_start_if_not_started(trx); + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. 
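+ For example, a table created as innodb_monitor in any database + turns the monitor prints on: only the part of the name after + the '/' is compared below.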
*/ + table_name = strchr(table->name, '/'); + ut_a(table_name); + table_name++; + table_name_len = strlen(table_name) + 1; + + if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) { + + /* Table equals "innodb_monitor": + start monitor prints */ + + srv_print_innodb_monitor = TRUE; + + /* The lock timeout monitor thread also takes care + of InnoDB monitor prints */ + + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_lock_monitor)) { + + srv_print_innodb_monitor = TRUE; + srv_print_innodb_lock_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + } else if (STR_EQ(table_name, table_name_len, + S_innodb_mem_validate)) { + /* We define here a debugging feature intended for + developers */ + + fputs("Validating InnoDB memory:\n" + "to use this feature you must compile InnoDB with\n" + "UNIV_MEM_DEBUG defined in univ.i and" + " the server must be\n" + "quiet because allocation from a mem heap" + " is not protected\n" + "by any semaphore.\n", stderr); +#ifdef UNIV_MEM_DEBUG + ut_a(mem_validate()); + fputs("Memory validated\n", stderr); +#else /* UNIV_MEM_DEBUG */ + fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n", + stderr); +#endif /* UNIV_MEM_DEBUG */ + } + + heap = mem_heap_create(512); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + node = tab_create_graph_create(table, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + err = trx->error_state; + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + } + + switch (err) { + case DB_OUT_OF_FILE_SPACE: + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: cannot create table ", + stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" because tablespace full\n", stderr); + + if (dict_table_get_low(table->name)) { + + row_drop_table_for_mysql(table->name, trx, FALSE); + trx_commit_for_mysql(trx); + } + break; + + case DB_DUPLICATE_KEY: + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" already exists in InnoDB internal\n" + "InnoDB: data dictionary. Have you deleted" + " the .frm file\n" + "InnoDB: and not used DROP TABLE?" 
+ " Have you used DROP DATABASE\n" + "InnoDB: for InnoDB tables in" + " MySQL version <= 3.23.43?\n" + "InnoDB: See the Restrictions section" + " of the InnoDB manual.\n" + "InnoDB: You can drop the orphaned table" + " inside InnoDB by\n" + "InnoDB: creating an InnoDB table with" + " the same name in another\n" + "InnoDB: database and copying the .frm file" + " to the current database.\n" + "InnoDB: Then MySQL thinks the table exists," + " and DROP TABLE will\n" + "InnoDB: succeed.\n" + "InnoDB: You can look for further help from\n" + "InnoDB: " + "http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n", stderr); + + /* We may also get err == DB_ERROR if the .ibd file for the + table already exists */ + + break; + } + + que_graph_free((que_t*) que_node_get_parent(thr)); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Does an index creation operation for MySQL. TODO: currently failure +to create an index results in dropping the whole table! This is no problem +currently as all indexes must be created at the same time as the table. */ +UNIV_INTERN +int +row_create_index_for_mysql( +/*=======================*/ + /* out: error number or DB_SUCCESS */ + dict_index_t* index, /* in, own: index definition + (will be freed) */ + trx_t* trx, /* in: transaction handle */ + const ulint* field_lengths) /* in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ +{ + ind_node_t* node; + mem_heap_t* heap; + que_thr_t* thr; + ulint err; + ulint i; + ulint len; + char* table_name; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "creating index"; + + /* Copy the table name because we may want to drop the + table later, after the index object is freed (inside + que_run_threads()) and thus index->table_name is not available. */ + table_name = mem_strdup(index->table_name); + + trx_start_if_not_started(trx); + + /* Check that the same column does not appear twice in the index. + Starting from 4.0.14, InnoDB should be able to cope with that, but + safer not to allow them. */ + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + ulint j; + + for (j = 0; j < i; j++) { + if (0 == ut_strcmp( + dict_index_get_nth_field(index, j)->name, + dict_index_get_nth_field(index, i)->name)) { + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: column ", stderr); + ut_print_name(stderr, trx, FALSE, + dict_index_get_nth_field( + index, i)->name); + fputs(" appears twice in ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: This is not allowed" + " in InnoDB.\n", stderr); + + err = DB_COL_APPEARS_TWICE_IN_INDEX; + + goto error_handling; + } + } + + /* Check also that prefix_len and actual length + < DICT_MAX_INDEX_COL_LEN */ + + len = dict_index_get_nth_field(index, i)->prefix_len; + + if (field_lengths) { + len = ut_max(len, field_lengths[i]); + } + + if (len >= DICT_MAX_INDEX_COL_LEN) { + err = DB_TOO_BIG_RECORD; + + goto error_handling; + } + } + + heap = mem_heap_create(512); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + /* Note that the space id where we store the index is inherited from + the table in dict_build_index_def_step() in dict0crea.c. 
*/ + + node = ind_create_graph_create(index, heap); + + thr = pars_complete_graph_for_exec(node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + err = trx->error_state; + + que_graph_free((que_t*) que_node_get_parent(thr)); + +error_handling: + if (err != DB_SUCCESS) { + /* We have special error handling here */ + + trx->error_state = DB_SUCCESS; + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + row_drop_table_for_mysql(table_name, trx, FALSE); + + trx_commit_for_mysql(trx); + + trx->error_state = DB_SUCCESS; + } + + trx->op_info = ""; + + mem_free(table_name); + + return((int) err); +} + +/************************************************************************* +Scans a table create SQL string and adds to the data dictionary +the foreign key constraints declared in the string. This function +should be called after the indexes for a table have been created. +Each foreign key constraint must be accompanied by indexes in +both participating tables. The indexes are allowed to contain more +fields than mentioned in the constraint. The function also checks +that foreign key constraints which reference this table are ok. */ +UNIV_INTERN +int +row_table_add_foreign_constraints( +/*==============================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction */ + const char* sql_string, /* in: table create statement where + foreign keys are declared like: + FOREIGN KEY (a, b) REFERENCES table2(c, d), + table2 can be written also with the + database name before it: test.table2 */ + const char* name, /* in: table full name in the + normalized form + database_name/table_name */ + ibool reject_fks) /* in: if TRUE, fail with error + code DB_CANNOT_ADD_CONSTRAINT if + any foreign keys are found. */ +{ + ulint err; + + ut_ad(mutex_own(&(dict_sys->mutex))); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_a(sql_string); + + trx->op_info = "adding foreign keys"; + + trx_start_if_not_started(trx); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + err = dict_create_foreign_constraints(trx, sql_string, name, + reject_fks); +#ifndef UNIV_HOTBACKUP + if (err == DB_SUCCESS) { + /* Check that also referencing constraints are ok */ + err = dict_load_foreigns(name, TRUE); + } +#endif /* !UNIV_HOTBACKUP */ + if (err != DB_SUCCESS) { + /* We have special error handling here */ + + trx->error_state = DB_SUCCESS; + + trx_general_rollback_for_mysql(trx, FALSE, NULL); + + row_drop_table_for_mysql(name, trx, FALSE); + + trx_commit_for_mysql(trx); + + trx->error_state = DB_SUCCESS; + } + + return((int) err); +} + +/************************************************************************* +Drops a table for MySQL as a background operation. In ALTER TABLE on +Unix, MySQL relies on the fact that the table handler does not remove +the table before all handles to it have been closed. Furthermore, +MySQL's call to drop a table must be non-blocking. Therefore we do the +drop table as a background operation, which is taken care of by the +master thread in srv0srv.c.
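+The flow is: row_drop_table_for_mysql() notices open handles or a +running foreign key check, calls row_add_table_to_background_drop_list() +and returns DB_SUCCESS to MySQL; the master thread later calls +row_drop_tables_for_mysql_in_background(), which drops the queued tables +one by one through this function.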
*/ +static +int +row_drop_table_for_mysql_in_background( +/*===================================*/ + /* out: error code or DB_SUCCESS */ + const char* name) /* in: table name */ +{ + ulint error; + trx_t* trx; + + trx = trx_allocate_for_background(); + + /* If the original transaction was dropping a table referenced by + foreign keys, we must set the following to be able to drop the + table: */ + + trx->check_foreigns = FALSE; + + /* fputs("InnoDB: Error: Dropping table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(" in background drop list\n", stderr); */ + + /* Try to drop the table in InnoDB */ + + error = row_drop_table_for_mysql(name, trx, FALSE); + + /* Flush the log to reduce probability that the .frm files and + the InnoDB data dictionary get out-of-sync if the user runs + with innodb_flush_log_at_trx_commit = 0 */ + + log_buffer_flush_to_disk(); + + trx_commit_for_mysql(trx); + + trx_free_for_background(trx); + + return((int) error); +} + +/************************************************************************* +The master thread in srv0srv.c calls this regularly to drop tables which +we must drop in background after queries to them have ended. Such lazy +dropping of tables is needed in ALTER TABLE on Unix. */ +UNIV_INTERN +ulint +row_drop_tables_for_mysql_in_background(void) +/*=========================================*/ + /* out: how many tables dropped + + remaining tables in list */ +{ + row_mysql_drop_t* drop; + dict_table_t* table; + ulint n_tables; + ulint n_tables_dropped = 0; +loop: + mutex_enter(&kernel_mutex); + + if (!row_mysql_drop_list_inited) { + + UT_LIST_INIT(row_mysql_drop_list); + row_mysql_drop_list_inited = TRUE; + } + + drop = UT_LIST_GET_FIRST(row_mysql_drop_list); + + n_tables = UT_LIST_GET_LEN(row_mysql_drop_list); + + mutex_exit(&kernel_mutex); + + if (drop == NULL) { + /* All tables dropped */ + + return(n_tables + n_tables_dropped); + } + + mutex_enter(&(dict_sys->mutex)); + table = dict_table_get_low(drop->table_name); + mutex_exit(&(dict_sys->mutex)); + + if (table == NULL) { + /* If for some reason the table has already been dropped + through some other mechanism, do not try to drop it */ + + goto already_dropped; + } + + if (DB_SUCCESS != row_drop_table_for_mysql_in_background( + drop->table_name)) { + /* If the DROP fails for some table, we return, and let the + main thread retry later */ + + return(n_tables + n_tables_dropped); + } + + n_tables_dropped++; + +already_dropped: + mutex_enter(&kernel_mutex); + + UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Dropped table ", stderr); + ut_print_name(stderr, NULL, TRUE, drop->table_name); + fputs(" in background drop queue.\n", stderr); + + mem_free(drop->table_name); + + mem_free(drop); + + mutex_exit(&kernel_mutex); + + goto loop; +} + +/************************************************************************* +Get the background drop list length. NOTE: the caller must own the kernel +mutex! 
*/ +UNIV_INTERN +ulint +row_get_background_drop_list_len_low(void) +/*======================================*/ + /* out: how many tables in list */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + if (!row_mysql_drop_list_inited) { + + UT_LIST_INIT(row_mysql_drop_list); + row_mysql_drop_list_inited = TRUE; + } + + return(UT_LIST_GET_LEN(row_mysql_drop_list)); +} + +/************************************************************************* +If a table is not yet in the drop list, adds the table to the list of tables +which the master thread drops in the background. We need this on Unix because +in ALTER TABLE MySQL may call drop table even if the table has running queries +on it. Also, if there are running foreign key checks on the table, we drop the +table lazily. */ +static +ibool +row_add_table_to_background_drop_list( +/*==================================*/ + /* out: TRUE if the table was not yet in the + drop list, and was added there */ + const char* name) /* in: table name */ +{ + row_mysql_drop_t* drop; + + mutex_enter(&kernel_mutex); + + if (!row_mysql_drop_list_inited) { + + UT_LIST_INIT(row_mysql_drop_list); + row_mysql_drop_list_inited = TRUE; + } + + /* Check whether the table is already in the drop list */ + drop = UT_LIST_GET_FIRST(row_mysql_drop_list); + + while (drop != NULL) { + if (strcmp(drop->table_name, name) == 0) { + /* Already in the list */ + + mutex_exit(&kernel_mutex); + + return(FALSE); + } + + drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop); + } + + drop = mem_alloc(sizeof(row_mysql_drop_t)); + + drop->table_name = mem_strdup(name); + + UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop); + + /* fputs("InnoDB: Adding table ", stderr); + ut_print_name(stderr, trx, TRUE, drop->table_name); + fputs(" to background drop list\n", stderr); */ + + mutex_exit(&kernel_mutex); + + return(TRUE); +} + +/************************************************************************* +Discards the tablespace of a table which is stored in an .ibd file. +Discarding means that this function deletes the .ibd file and assigns a new +table id for the table. The flag table->ibd_file_missing is also set to +TRUE. */ +UNIV_INTERN +int +row_discard_tablespace_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_foreign_t* foreign; + dulint new_id; + dict_table_t* table; + ibool success; + ulint err; + pars_info_t* info = NULL; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. + + 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive + MySQL table lock on the table before we can do DISCARD + TABLESPACE. Then there are no running queries on the table. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. + + 3) Insert buffer: we remove all entries for the tablespace in + the insert buffer tree; as long as the tablespace mem object + does not exist, ongoing insert buffer page merges are + discarded in buf0rea.c. If we recreate the tablespace mem + object with IMPORT TABLESPACE later, then the tablespace will + have the same id, but the tablespace_version field in the mem + object is different, and ongoing old insert buffer page merges + get discarded.
+ + 4) Linear readahead and random readahead: we use the same + method as in 3) to discard ongoing operations. + + 5) FOREIGN KEY operations: if + table->n_foreign_key_checks_running > 0, we do not allow the + discard. We also reserve the data dictionary latch. */ + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = "discarding tablespace"; + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (table->space == 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: is in the system tablespace 0" + " which cannot be discarded\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + if (table->n_foreign_key_checks_running > 0) { + + ut_print_timestamp(stderr); + fputs(" InnoDB: You are trying to DISCARD table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n" + "InnoDB: though there is a foreign key check" + " running on it.\n" + "InnoDB: Cannot discard the table.\n", + stderr); + + err = DB_ERROR; + + goto funct_exit; + } + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns) { + + FILE* ef = dict_foreign_err_file; + + /* We only allow discarding a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + err = DB_CANNOT_DROP_CONSTRAINT; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot DISCARD table ", ef); + ut_print_name(ef, trx, TRUE, name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + goto funct_exit; + } + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + /* Remove all locks except the table-level S and X locks.
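+ Presumably the table-level locks must survive because the session + doing the DISCARD itself holds an exclusive MySQL table lock on + the table (see 1) above); only record locks and weaker table locks + are removed.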
*/ + lock_remove_all_on_table(table, FALSE); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "table_name", name); + pars_info_add_dulint_literal(info, "new_id", new_id); + + err = que_eval_sql(info, + "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "BEGIN\n" + "SELECT ID INTO old_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " COMMIT WORK;\n" + " RETURN;\n" + "END IF;\n" + "UPDATE SYS_TABLES SET ID = :new_id\n" + " WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + dict_table_change_id_in_cache(table, new_id); + + success = fil_discard_tablespace(table->space); + + if (!success) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + + err = DB_ERROR; + } else { + /* Set the flag which tells that now it is legal to + IMPORT a tablespace for this table */ + table->tablespace_discarded = TRUE; + table->ibd_file_missing = TRUE; + } + } + +funct_exit: + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return((int) err); +} + +/********************************************************************* +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. */ +UNIV_INTERN +int +row_import_tablespace_for_mysql( +/*============================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + ibool success; + ib_uint64_t current_lsn; + ulint err = DB_SUCCESS; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx_start_if_not_started(trx); + + trx->op_info = "importing tablespace"; + + current_lsn = log_get_lsn(); + + /* It is possible, though very improbable, that the lsn's in the + tablespace to be imported have risen above the current system lsn, if + a lengthy purge, ibuf merge, or rollback was performed on a backup + taken with ibbackup. If that is the case, reset page lsn's in the + file. We assume that mysqld was shut down after it performed these + cleanup operations on the .ibd file, so that it stamped the latest lsn + to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file. + + TODO: reset also the trx id's in clustered index records and write + a new space id to each data page. That would allow us to import clean + .ibd files from another MySQL installation. */ + + success = fil_reset_too_high_lsns(name, current_lsn); + + if (!success) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: in ALTER TABLE ... 
IMPORT TABLESPACE\n", + stderr); + + err = DB_ERROR; + + row_mysql_lock_data_dictionary(trx); + + goto funct_exit; + } + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + ut_print_timestamp(stderr); + fputs(" InnoDB: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: does not exist in the InnoDB data dictionary\n" + "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", + stderr); + + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (table->space == 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: is in the system tablespace 0" + " which cannot be imported\n", stderr); + err = DB_ERROR; + + goto funct_exit; + } + + if (!table->tablespace_discarded) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: you are trying to" + " IMPORT a tablespace\n" + "InnoDB: ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(", though you have not called DISCARD on it yet\n" + "InnoDB: during the lifetime of the mysqld process!\n", + stderr); + + err = DB_ERROR; + + goto funct_exit; + } + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + success = fil_open_single_table_tablespace( + TRUE, table->space, + table->flags == DICT_TF_COMPACT ? 0 : table->flags, + table->name); + if (success) { + table->ibd_file_missing = FALSE; + table->tablespace_discarded = FALSE; + } else { + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fputs(" InnoDB: cannot find or open in the" + " database directory the .ibd file of\n" + "InnoDB: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", + stderr); + } + + err = DB_ERROR; + } + +funct_exit: + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return((int) err); +} + +/************************************************************************* +Truncates a table for MySQL. */ +UNIV_INTERN +int +row_truncate_table_for_mysql( +/*=========================*/ + /* out: error code or DB_SUCCESS */ + dict_table_t* table, /* in: table handle */ + trx_t* trx) /* in: transaction handle */ +{ + dict_foreign_t* foreign; + ulint err; + mem_heap_t* heap; + byte* buf; + dtuple_t* tuple; + dfield_t* dfield; + dict_index_t* sys_index; + btr_pcur_t pcur; + mtr_t mtr; + dulint new_id; + ulint recreate_space = 0; + pars_info_t* info = NULL; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. + + 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive + MySQL table lock on the table before we can do TRUNCATE + TABLE. Then there are no running queries on the table. This is + guaranteed, because in ha_innobase::store_lock(), we do not + weaken the TL_WRITE lock requested by MySQL when executing + SQLCOM_TRUNCATE. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. 
+ + 3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE, + so we do not have to remove insert buffer records, as the + insert buffer works at a low level. If a freed page is later + reallocated, the allocator will remove the ibuf entries for + it. + + When we truncate *.ibd files by recreating them (analogous to + DISCARD TABLESPACE), we remove all entries for the table in the + insert buffer tree. This is not strictly necessary, because + in 6) we will assign a new tablespace identifier, but we can + free up some space in the system tablespace. + + 4) Linear readahead and random readahead: we use the same + method as in 3) to discard ongoing operations. (This is only + relevant for TRUNCATE TABLE by DISCARD TABLESPACE.) + + 5) FOREIGN KEY operations: if + table->n_foreign_key_checks_running > 0, we do not allow the + TRUNCATE. We also reserve the data dictionary latch. + + 6) Crash recovery: To prevent the application of pre-truncation + redo log records on the truncated tablespace, we will assign + a new tablespace identifier to the truncated tablespace. */ + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(table); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); + + return(DB_ERROR); + } + + trx->op_info = "truncating table"; + + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + ut_a(trx->dict_operation_lock_mode == 0); + /* Prevent foreign key checks etc. while we are truncating the + table */ + + row_mysql_lock_data_dictionary(trx); + + ut_ad(mutex_own(&(dict_sys->mutex))); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns) { + FILE* ef = dict_foreign_err_file; + + /* We only allow truncating a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot truncate table ", ef); + ut_print_name(ef, trx, TRUE, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because it is referenced by ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + err = DB_ERROR; + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been truncated here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot truncate table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" by DROP+CREATE\n" + "InnoDB: because there is a foreign key check" + " running on it.\n", + stderr); + err = DB_ERROR; + + goto funct_exit; + } + + /* Remove all locks except the table-level S and X locks. 
*/ + lock_remove_all_on_table(table, FALSE); + + trx->table_id = table->id; + + if (table->space && !table->dir_path_of_temp_table) { + /* Discard and create the single-table tablespace. */ + ulint space = table->space; + ulint flags = fil_space_get_flags(space); + + if (flags != ULINT_UNDEFINED + && fil_discard_tablespace(space)) { + + dict_index_t* index; + + space = 0; + + if (fil_create_new_single_table_tablespace( + &space, table->name, FALSE, flags, + FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: TRUNCATE TABLE %s failed to" + " create a new tablespace\n", + table->name); + table->ibd_file_missing = 1; + err = DB_ERROR; + goto funct_exit; + } + + recreate_space = space; + + /* Replace the space_id in the data dictionary cache. + The persistent data dictionary (SYS_TABLES.SPACE + and SYS_INDEXES.SPACE) is updated later in this + function. */ + table->space = space; + index = dict_table_get_first_index(table); + do { + index->space = space; + index = dict_table_get_next_index(index); + } while (index); + + mtr_start(&mtr); + fsp_header_init(space, + FIL_IBD_FILE_INITIAL_SIZE, &mtr); + mtr_commit(&mtr); + } + } + + /* scan SYS_INDEXES for all indexes of the table */ + heap = mem_heap_create(800); + + tuple = dtuple_create(heap, 1); + dfield = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 8); + mach_write_to_8(buf, table->id); + + dfield_set_data(dfield, buf, 8); + sys_index = dict_table_get_first_index(dict_sys->sys_indexes); + dict_index_copy_types(tuple, sys_index, 1); + + mtr_start(&mtr); + btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE, + BTR_MODIFY_LEAF, &pcur, &mtr); + for (;;) { + rec_t* rec; + const byte* field; + ulint len; + ulint root_page_no; + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* The end of SYS_INDEXES has been reached. */ + break; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, 0, &len); + ut_ad(len == 8); + + if (memcmp(buf, field, len) != 0) { + /* End of indexes for the table (TABLE_ID mismatch). */ + break; + } + + if (rec_get_deleted_flag(rec, FALSE)) { + /* The index has been dropped. */ + goto next_rec; + } + + /* This call may commit and restart mtr + and reposition pcur. */ + root_page_no = dict_truncate_index_tree(table, recreate_space, + &pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + if (root_page_no != FIL_NULL) { + page_rec_write_index_page_no( + rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, + root_page_no, &mtr); + /* We will need to commit and restart the + mini-transaction in order to avoid deadlocks. + The dict_truncate_index_tree() call has allocated + a page in this mini-transaction, and the rest of + this loop could latch another index page.
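+ Committing the mini-transaction releases all page latches; the + position stored in pcur survives and is re-established below + with btr_pcur_restore_position().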
*/ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_LEAF, + &pcur, &mtr); + } + +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + mem_heap_free(heap); + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + info = pars_info_create(); + + pars_info_add_int4_literal(info, "space", (lint) table->space); + pars_info_add_dulint_literal(info, "old_id", table->id); + pars_info_add_dulint_literal(info, "new_id", new_id); + + err = que_eval_sql(info, + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES" + " SET ID = :new_id, SPACE = :space\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES" + " SET TABLE_ID = :new_id, SPACE = :space\n" + " WHERE TABLE_ID = :old_id;\n" + "COMMIT WORK;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); + fputs(" InnoDB: Unable to assign a new identifier to table ", + stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n" + "InnoDB: after truncating it. Background processes" + " may corrupt the table!\n", stderr); + err = DB_ERROR; + } else { + dict_table_change_id_in_cache(table, new_id); + } + + /* MySQL calls ha_innobase::reset_auto_increment() which does + the same thing. */ + dict_table_autoinc_lock(table); + dict_table_autoinc_initialize(table, 1); + dict_table_autoinc_unlock(table); + dict_update_statistics(table); + + trx_commit_for_mysql(trx); + +funct_exit: + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + srv_wake_master_thread(); + + return((int) err); +} + +/************************************************************************* +Drops a table for MySQL. If the name of the dropped table ends in +one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor", +"innodb_table_monitor", then this will also stop the printing of monitor +output by the master thread. If the data dictionary was not already locked +by the transaction, the transaction will be committed. Otherwise, the +data dictionary will remain locked. */ +UNIV_INTERN +int +row_drop_table_for_mysql( +/*=====================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: table name */ + trx_t* trx, /* in: transaction handle */ + ibool drop_db)/* in: TRUE=dropping whole database */ +{ + dict_foreign_t* foreign; + dict_table_t* table; + ulint space_id; + ulint err; + const char* table_name; + ulint namelen; + ibool locked_dictionary = FALSE; + pars_info_t* info = NULL; + + ut_a(name != NULL); + + if (srv_created_new_raw) { + fputs("InnoDB: A new raw disk partition was initialized:\n" + "InnoDB: we do not allow database modifications" + " by the user.\n" + "InnoDB: Shut down mysqld and edit my.cnf so that newraw" + " is replaced with raw.\n", stderr); + + return(DB_ERROR); + } + + trx->op_info = "dropping table"; + + trx_start_if_not_started(trx); + + /* The table name is prefixed with the database name and a '/'. + Certain table names starting with 'innodb_' have their special + meaning regardless of the database name. Thus, we need to + ignore the database name prefix in the comparisons. 
*/ + table_name = strchr(name, '/'); + ut_a(table_name); + table_name++; + namelen = strlen(table_name) + 1; + + if (namelen == sizeof S_innodb_monitor + && !memcmp(table_name, S_innodb_monitor, + sizeof S_innodb_monitor)) { + + /* Table name equals "innodb_monitor": + stop monitor prints */ + + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_lock_monitor + && !memcmp(table_name, S_innodb_lock_monitor, + sizeof S_innodb_lock_monitor)) { + srv_print_innodb_monitor = FALSE; + srv_print_innodb_lock_monitor = FALSE; + } else if (namelen == sizeof S_innodb_tablespace_monitor + && !memcmp(table_name, S_innodb_tablespace_monitor, + sizeof S_innodb_tablespace_monitor)) { + + srv_print_innodb_tablespace_monitor = FALSE; + } else if (namelen == sizeof S_innodb_table_monitor + && !memcmp(table_name, S_innodb_table_monitor, + sizeof S_innodb_table_monitor)) { + + srv_print_innodb_table_monitor = FALSE; + } + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + /* Prevent foreign key checks etc. while we are dropping the + table */ + + row_mysql_lock_data_dictionary(trx); + + locked_dictionary = TRUE; + } + + ut_ad(mutex_own(&(dict_sys->mutex))); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is" + " trying to drop it.\n" + "InnoDB: Have you copied the .frm file" + " of the table to the\n" + "InnoDB: MySQL database directory" + " from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + } + + /* Check if the table is referenced by foreign key constraints from + some other table (not the table itself) */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign && foreign->foreign_table == table) { +check_next_foreign: + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + if (foreign && trx->check_foreigns + && !(drop_db && dict_tables_have_same_db( + name, foreign->foreign_table_name))) { + FILE* ef = dict_foreign_err_file; + + /* We only allow dropping a referenced table if + FOREIGN_KEY_CHECKS is set to 0 */ + + err = DB_CANNOT_DROP_CONSTRAINT; + + mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + + fputs(" Cannot drop table ", ef); + ut_print_name(ef, trx, TRUE, name); + fputs("\n" + "because it is referenced by ", ef); + ut_print_name(ef, trx, TRUE, foreign->foreign_table_name); + putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); + + goto funct_exit; + } + + if (foreign && trx->check_foreigns) { + goto check_next_foreign; + } + + if (table->n_mysql_handles_opened > 0) { + ibool added; + + added = row_add_table_to_background_drop_list(table->name); + + if (added) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: MySQL is" + " trying to drop table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs("\n" + "InnoDB: though there are still" + " open handles to it.\n" + "InnoDB: Adding the table to the" + " background drop queue.\n", + stderr); + + 
/* We return DB_SUCCESS to MySQL though the drop will + happen lazily later */ + err = DB_SUCCESS; + } else { + /* The table is already in the background drop list */ + err = DB_ERROR; + } + + goto funct_exit; + } + + /* TODO: could we replace the counter n_foreign_key_checks_running + with lock checks on the table? Acquire here an exclusive lock on the + table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that + they can cope with the table having been dropped here? Foreign key + checks take an IS or IX lock on the table. */ + + if (table->n_foreign_key_checks_running > 0) { + + const char* table_name = table->name; + ibool added; + + added = row_add_table_to_background_drop_list(table_name); + + if (added) { + ut_print_timestamp(stderr); + fputs(" InnoDB: You are trying to drop table ", + stderr); + ut_print_name(stderr, trx, TRUE, table_name); + fputs("\n" + "InnoDB: though there is a" + " foreign key check running on it.\n" + "InnoDB: Adding the table to" + " the background drop queue.\n", + stderr); + + /* We return DB_SUCCESS to MySQL though the drop will + happen lazily later */ + + err = DB_SUCCESS; + } else { + /* The table is already in the background drop list */ + err = DB_ERROR; + } + + goto funct_exit; + } + + /* Remove all locks there are on the table or its records */ + lock_remove_all_on_table(table, TRUE); + + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->table_id = table->id; + + /* We use the private SQL parser of Innobase to generate the + query graphs needed in deleting the dictionary data from system + tables in Innobase. Deleting a row from SYS_INDEXES table also + frees the file segments of the B-tree associated with the index. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "table_name", name); + + err = que_eval_sql(info, + "PROCEDURE DROP_TABLE_PROC () IS\n" + "sys_foreign_id CHAR;\n" + "table_id CHAR;\n" + "index_id CHAR;\n" + "foreign_id CHAR;\n" + "found INT;\n" + "BEGIN\n" + "SELECT ID INTO table_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " RETURN;\n" + "END IF;\n" + "found := 1;\n" + "SELECT ID INTO sys_foreign_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = 'SYS_FOREIGN'\n" + "LOCK IN SHARE MODE;\n" + "IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (:table_name = 'SYS_FOREIGN') THEN\n" + " found := 0;\n" + "END IF;\n" + "IF (:table_name = 'SYS_FOREIGN_COLS') THEN\n" + " found := 0;\n" + "END IF;\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = :table_name\n" + " AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:table_name)\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FOREIGN_COLS\n" + " WHERE ID = foreign_id;\n" + " DELETE FROM SYS_FOREIGN\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + "END LOOP;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO index_id\n" + " FROM SYS_INDEXES\n" + " WHERE TABLE_ID = table_id\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS\n" + " WHERE INDEX_ID = index_id;\n" + " DELETE FROM SYS_INDEXES\n" + " WHERE ID = index_id\n" + " AND TABLE_ID = table_id;\n" + " END IF;\n" + "END LOOP;\n" + "DELETE FROM SYS_COLUMNS\n" + "WHERE TABLE_ID = table_id;\n" + "DELETE FROM SYS_TABLES\n" + "WHERE ID = table_id;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + ut_a(err == 
DB_OUT_OF_FILE_SPACE); + + err = DB_MUST_GET_MORE_FILE_SPACE; + + row_mysql_handle_errors(&err, trx, NULL, NULL); + + ut_error; + } else { + ibool is_path; + const char* name_or_path; + mem_heap_t* heap; + + heap = mem_heap_create(200); + + /* Clone the name, in case it has been allocated + from table->heap, which will be freed by + dict_table_remove_from_cache(table) below. */ + name = mem_heap_strdup(heap, name); + space_id = table->space; + + if (table->dir_path_of_temp_table != NULL) { + is_path = TRUE; + name_or_path = mem_heap_strdup( + heap, table->dir_path_of_temp_table); + } else { + is_path = FALSE; + name_or_path = name; + } + + dict_table_remove_from_cache(table); + + if (dict_load_table(name) != NULL) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: not able to remove table ", + stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs(" from the dictionary cache!\n", stderr); + err = DB_ERROR; + } + + /* Do not drop possible .ibd tablespace if something went + wrong: we do not want to delete valuable data of the user */ + + if (err == DB_SUCCESS && space_id > 0) { + if (!fil_space_for_table_exists_in_mem(space_id, + name_or_path, + is_path, + FALSE, TRUE)) { + err = DB_SUCCESS; + + fprintf(stderr, + "InnoDB: We removed now the InnoDB" + " internal data dictionary entry\n" + "InnoDB: of table "); + ut_print_name(stderr, trx, TRUE, name); + fprintf(stderr, ".\n"); + } else if (!fil_delete_tablespace(space_id)) { + fprintf(stderr, + "InnoDB: We removed now the InnoDB" + " internal data dictionary entry\n" + "InnoDB: of table "); + ut_print_name(stderr, trx, TRUE, name); + fprintf(stderr, ".\n"); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: not able to" + " delete tablespace %lu of table ", + (ulong) space_id); + ut_print_name(stderr, trx, TRUE, name); + fputs("!\n", stderr); + err = DB_ERROR; + } + } + + mem_heap_free(heap); + } +funct_exit: + + if (locked_dictionary) { + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + } + + trx->op_info = ""; + +#ifndef UNIV_HOTBACKUP + srv_wake_master_thread(); +#endif /* !UNIV_HOTBACKUP */ + + return((int) err); +} + +/*********************************************************************** +Drop all foreign keys in a database, see Bug#18942. +Called at the end of row_drop_database_for_mysql(). 
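+
+The cursor below walks SYS_FOREIGN in FOR_NAME order, starting at the
+first id >= the database name, and stops at the first row whose
+FOR_NAME no longer begins with that name. A minimal sketch of the
+prefix test that the TABLE_NOT_IN_THIS_DB condition expresses, in
+plain C (illustrative only):
+
+	// dbname includes the trailing '/', e.g. "mydb/"
+	ibool	in_this_db = (strncmp(for_name, dbname,
+				      strlen(dbname)) == 0);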
*/ +static +ulint +drop_all_foreign_keys_in_db( +/*========================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: database name which ends to '/' */ + trx_t* trx) /* in: transaction handle */ +{ + pars_info_t* pinfo; + ulint err; + + ut_a(name[strlen(name) - 1] == '/'); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "dbname", name); + +/* true if for_name is not prefixed with dbname */ +#define TABLE_NOT_IN_THIS_DB \ +"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname" + + err = que_eval_sql(pinfo, + "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n" + "foreign_id CHAR;\n" + "for_name CHAR;\n" + "found INT;\n" + "DECLARE CURSOR cur IS\n" + "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n" + "WHERE FOR_NAME >= :dbname\n" + "LOCK IN SHARE MODE\n" + "ORDER BY FOR_NAME;\n" + "BEGIN\n" + "found := 1;\n" + "OPEN cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH cur INTO foreign_id, for_name;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n" + " found := 0;\n" + " ELSIF (1=1) THEN\n" + " DELETE FROM SYS_FOREIGN_COLS\n" + " WHERE ID = foreign_id;\n" + " DELETE FROM SYS_FOREIGN\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE cur;\n" + "COMMIT WORK;\n" + "END;\n", + FALSE, /* do not reserve dict mutex, + we are already holding it */ + trx); + + return(err); +} + +/************************************************************************* +Drops a database for MySQL. */ +UNIV_INTERN +int +row_drop_database_for_mysql( +/*========================*/ + /* out: error code or DB_SUCCESS */ + const char* name, /* in: database name which ends to '/' */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + char* table_name; + int err = DB_SUCCESS; + ulint namelen = strlen(name); + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(name != NULL); + ut_a(name[namelen - 1] == '/'); + + trx->op_info = "dropping database"; + + trx_start_if_not_started(trx); +loop: + row_mysql_lock_data_dictionary(trx); + + while ((table_name = dict_get_first_table_name_in_db(name))) { + ut_a(memcmp(table_name, name, namelen) == 0); + + table = dict_table_get_low(table_name); + + ut_a(table); + + /* Wait until MySQL does not have any queries running on + the table */ + + if (table->n_mysql_handles_opened > 0) { + row_mysql_unlock_data_dictionary(trx); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: MySQL is trying to" + " drop database ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fputs("\n" + "InnoDB: though there are still" + " open handles to table ", stderr); + ut_print_name(stderr, trx, TRUE, table_name); + fputs(".\n", stderr); + + os_thread_sleep(1000000); + + mem_free(table_name); + + goto loop; + } + + err = row_drop_table_for_mysql(table_name, trx, TRUE); + trx_commit_for_mysql(trx); + + if (err != DB_SUCCESS) { + fputs("InnoDB: DROP DATABASE ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fprintf(stderr, " failed with error %lu for table ", + (ulint) err); + ut_print_name(stderr, trx, TRUE, table_name); + putc('\n', stderr); + mem_free(table_name); + break; + } + + mem_free(table_name); + } + + if (err == DB_SUCCESS) { + /* after dropping all tables try to drop all leftover + foreign keys in case orphaned ones exist */ + err = (int) drop_all_foreign_keys_in_db(name, trx); + + if (err != DB_SUCCESS) { + fputs("InnoDB: DROP DATABASE ", stderr); + ut_print_name(stderr, trx, TRUE, name); + fprintf(stderr, " failed with error %d while " + "dropping all foreign 
keys", err); + } + } + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return(err); +} + +/************************************************************************* +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. */ +static +ibool +row_is_mysql_tmp_table_name( +/*========================*/ + /* out: TRUE if temporary table */ + const char* name) /* in: table name in the form + 'database/tablename' */ +{ + return(strstr(name, "/#sql") != NULL); + /* return(strstr(name, "/@0023sql") != NULL); */ +} + +/******************************************************************** +Delete a single constraint. */ +static +int +row_delete_constraint_low( +/*======================*/ + /* out: error code or DB_SUCCESS */ + const char* id, /* in: constraint id */ + trx_t* trx) /* in: transaction handle */ +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", id); + + return((int) que_eval_sql(info, + "PROCEDURE DELETE_CONSTRAINT () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n" + "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n" + "END;\n" + , FALSE, trx)); +} + +/******************************************************************** +Delete a single constraint. */ +static +int +row_delete_constraint( +/*==================*/ + /* out: error code or DB_SUCCESS */ + const char* id, /* in: constraint id */ + const char* database_name, /* in: database name, with the + trailing '/' */ + mem_heap_t* heap, /* in: memory heap */ + trx_t* trx) /* in: transaction handle */ +{ + ulint err; + + /* New format constraints have ids /. */ + err = row_delete_constraint_low( + mem_heap_strcat(heap, database_name, id), trx); + + if ((err == DB_SUCCESS) && !strchr(id, '/')) { + /* Old format < 4.0.18 constraints have constraint ids + _. We only try deleting them if the + constraint name does not contain a '/' character, otherwise + deleting a new format constraint named 'foo/bar' from + database 'baz' would remove constraint 'bar' from database + 'foo', if it existed. */ + + err = row_delete_constraint_low(id, trx); + } + + return((int) err); +} + +/************************************************************************* +Renames a table for MySQL. */ +UNIV_INTERN +ulint +row_rename_table_for_mysql( +/*=======================*/ + /* out: error code or DB_SUCCESS */ + const char* old_name, /* in: old table name */ + const char* new_name, /* in: new table name */ + trx_t* trx, /* in: transaction handle */ + ibool commit) /* in: if TRUE then commit trx */ +{ + dict_table_t* table; + ulint err = DB_ERROR; + mem_heap_t* heap = NULL; + const char** constraints_to_drop = NULL; + ulint n_constraints_to_drop = 0; + ibool old_is_tmp, new_is_tmp; + pars_info_t* info = NULL; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_a(old_name != NULL); + ut_a(new_name != NULL); + + if (srv_created_new_raw || srv_force_recovery) { + fputs("InnoDB: A new raw disk partition was initialized or\n" + "InnoDB: innodb_force_recovery is on: we do not allow\n" + "InnoDB: database modifications by the user. Shut down\n" + "InnoDB: mysqld and edit my.cnf so that newraw" + " is replaced\n" + "InnoDB: with raw, and innodb_force_... 
is removed.\n", + stderr); + + goto funct_exit; + } else if (row_mysql_is_system_table(new_name)) { + + fprintf(stderr, + "InnoDB: Error: trying to create a MySQL" + " system table %s of type InnoDB.\n" + "InnoDB: MySQL system tables must be" + " of the MyISAM type!\n", + new_name); + + goto funct_exit; + } + + trx->op_info = "renaming table"; + trx_start_if_not_started(trx); + + old_is_tmp = row_is_mysql_tmp_table_name(old_name); + new_is_tmp = row_is_mysql_tmp_table_name(new_name); + + table = dict_table_get_low(old_name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is" + " trying to rename the table.\n" + "InnoDB: Have you copied the .frm file" + " of the table to the\n" + "InnoDB: MySQL database directory" + " from another database?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + } else if (table->ibd_file_missing) { + err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fputs(" InnoDB: Error: table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" does not have an .ibd file" + " in the database directory.\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n", + stderr); + goto funct_exit; + } else if (new_is_tmp) { + /* MySQL is doing an ALTER TABLE command and it renames the + original table to a temporary table name. We want to preserve + the original foreign key constraint definitions despite the + name change. An exception is those constraints for which + the ALTER TABLE contained DROP FOREIGN KEY .*/ + + heap = mem_heap_create(100); + + err = dict_foreign_parse_drop_constraints( + heap, trx, table, &n_constraints_to_drop, + &constraints_to_drop); + + if (err != DB_SUCCESS) { + + goto funct_exit; + } + } + + /* We use the private SQL parser of Innobase to generate the query + graphs needed in updating the dictionary data from system tables. */ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + + err = que_eval_sql(info, + "PROCEDURE RENAME_TABLE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET NAME = :new_table_name\n" + " WHERE NAME = :old_table_name;\n" + "END;\n" + , FALSE, trx); + + if (err != DB_SUCCESS) { + + goto end; + } else if (!new_is_tmp) { + /* Rename all constraints. 
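+
+For example (illustrative values): renaming 'test/t1' to 'test/t2'
+turns the auto-generated id 'test/t1_ibfk_1' into 'test/t2_ibfk_1',
+because the '_ibfk_' suffix is carried over unchanged, while a
+user-named id such as 'test/fk_child' keeps its constraint name and
+only has its database component rewritten, so a cross-database rename
+to 'test2/t1' would produce 'test2/fk_child'.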
*/ + + info = pars_info_create(); + + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "old_table_name", old_name); + + err = que_eval_sql( + info, + "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n" + "gen_constr_prefix CHAR;\n" + "new_db_name CHAR;\n" + "foreign_id CHAR;\n" + "new_foreign_id CHAR;\n" + "old_db_name_len INT;\n" + "old_t_name_len INT;\n" + "new_db_name_len INT;\n" + "id_len INT;\n" + "found INT;\n" + "BEGIN\n" + "found := 1;\n" + "old_db_name_len := INSTR(:old_table_name, '/')-1;\n" + "new_db_name_len := INSTR(:new_table_name, '/')-1;\n" + "new_db_name := SUBSTR(:new_table_name, 0,\n" + " new_db_name_len);\n" + "old_t_name_len := LENGTH(:old_table_name);\n" + "gen_constr_prefix := CONCAT(:old_table_name,\n" + " '_ibfk_');\n" + "WHILE found = 1 LOOP\n" + " SELECT ID INTO foreign_id\n" + " FROM SYS_FOREIGN\n" + " WHERE FOR_NAME = :old_table_name\n" + " AND TO_BINARY(FOR_NAME)\n" + " = TO_BINARY(:old_table_name)\n" + " LOCK IN SHARE MODE;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " UPDATE SYS_FOREIGN\n" + " SET FOR_NAME = :new_table_name\n" + " WHERE ID = foreign_id;\n" + " id_len := LENGTH(foreign_id);\n" + " IF (INSTR(foreign_id, '/') > 0) THEN\n" + " IF (INSTR(foreign_id,\n" + " gen_constr_prefix) > 0)\n" + " THEN\n" + " new_foreign_id :=\n" + " CONCAT(:new_table_name,\n" + " SUBSTR(foreign_id, old_t_name_len,\n" + " id_len - old_t_name_len));\n" + " ELSE\n" + " new_foreign_id :=\n" + " CONCAT(new_db_name,\n" + " SUBSTR(foreign_id,\n" + " old_db_name_len,\n" + " id_len - old_db_name_len));\n" + " END IF;\n" + " UPDATE SYS_FOREIGN\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " UPDATE SYS_FOREIGN_COLS\n" + " SET ID = new_foreign_id\n" + " WHERE ID = foreign_id;\n" + " END IF;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n" + "WHERE REF_NAME = :old_table_name\n" + " AND TO_BINARY(REF_NAME)\n" + " = TO_BINARY(:old_table_name);\n" + "END;\n" + , FALSE, trx); + + } else if (n_constraints_to_drop > 0) { + /* Drop some constraints of tmp tables. 
*/ + + ulint db_name_len = dict_get_db_name_len(old_name) + 1; + char* db_name = mem_heap_strdupl(heap, old_name, + db_name_len); + ulint i; + + for (i = 0; i < n_constraints_to_drop; i++) { + err = row_delete_constraint(constraints_to_drop[i], + db_name, heap, trx); + + if (err != DB_SUCCESS) { + break; + } + } + } + +end: + if (err != DB_SUCCESS) { + if (err == DB_DUPLICATE_KEY) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error; possible reasons:\n" + "InnoDB: 1) Table rename would cause" + " two FOREIGN KEY constraints\n" + "InnoDB: to have the same internal name" + " in case-insensitive comparison.\n" + "InnoDB: 2) table ", stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs(" exists in the InnoDB internal data\n" + "InnoDB: dictionary though MySQL is" + " trying to rename table ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fputs(" to it.\n" + "InnoDB: Have you deleted the .frm file" + " and not used DROP TABLE?\n" + "InnoDB: You can look for further help from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n" + "InnoDB: If table ", stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs(" is a temporary table #sql..., then" + " it can be that\n" + "InnoDB: there are still queries running" + " on the table, and it will be\n" + "InnoDB: dropped automatically when" + " the queries end.\n" + "InnoDB: You can drop the orphaned table" + " inside InnoDB by\n" + "InnoDB: creating an InnoDB table with" + " the same name in another\n" + "InnoDB: database and copying the .frm file" + " to the current database.\n" + "InnoDB: Then MySQL thinks the table exists," + " and DROP TABLE will\n" + "InnoDB: succeed.\n", stderr); + } + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + /* The following call will also rename the .ibd data file if + the table is stored in a single-table tablespace */ + + if (!dict_table_rename_in_cache(table, new_name, + !new_is_tmp)) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + goto funct_exit; + } + + /* We only want to switch off some of the type checking in + an ALTER, not in a RENAME. 
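+
+Concretely, the second argument to dict_load_foreigns() below,
+!old_is_tmp || trx->check_foreigns, works out as follows
+(illustrative summary):
+
+	// old_is_tmp  check_foreigns  => charset/type check
+	//   FALSE        any          => TRUE  (plain RENAME TABLE)
+	//   TRUE         TRUE         => TRUE  (ALTER, FK checks on)
+	//   TRUE         FALSE        => FALSE (ALTER, FOREIGN_KEY_CHECKS=0)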
*/ + + err = dict_load_foreigns( + new_name, !old_is_tmp || trx->check_foreigns); + + if (err != DB_SUCCESS) { + ut_print_timestamp(stderr); + + if (old_is_tmp) { + fputs(" InnoDB: Error: in ALTER TABLE ", + stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs("\n" + "InnoDB: has or is referenced" + " in foreign key constraints\n" + "InnoDB: which are not compatible" + " with the new table definition.\n", + stderr); + } else { + fputs(" InnoDB: Error: in RENAME TABLE" + " table ", + stderr); + ut_print_name(stderr, trx, TRUE, new_name); + fputs("\n" + "InnoDB: is referenced in" + " foreign key constraints\n" + "InnoDB: which are not compatible" + " with the new table definition.\n", + stderr); + } + + ut_a(dict_table_rename_in_cache(table, + old_name, FALSE)); + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } + } + +funct_exit: + + if (commit) { + trx_commit_for_mysql(trx); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + trx->op_info = ""; + + return(err); +} + +/************************************************************************* +Checks that the index contains entries in an ascending order, unique +constraint is not broken, and calculates the number of index entries +in the read view of the current transaction. */ +static +ibool +row_scan_and_check_index( +/*=====================*/ + /* out: TRUE if ok */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct in MySQL */ + dict_index_t* index, /* in: index */ + ulint* n_rows) /* out: number of entries seen in the + current consistent read */ +{ + dtuple_t* prev_entry = NULL; + ulint matched_fields; + ulint matched_bytes; + byte* buf; + ulint ret; + rec_t* rec; + ibool is_ok = TRUE; + int cmp; + ibool contains_null; + ulint i; + ulint cnt; + mem_heap_t* heap = NULL; + ulint n_ext; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + rec_offs_init(offsets_); + + *n_rows = 0; + + buf = mem_alloc(UNIV_PAGE_SIZE); + heap = mem_heap_create(100); + + /* Make a dummy template in prebuilt, which we will use + in scanning the index entries */ + + prebuilt->index = index; + prebuilt->sql_stat_start = TRUE; + prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE; + prebuilt->n_template = 0; + prebuilt->need_to_access_clustered = FALSE; + + dtuple_set_n_fields(prebuilt->search_tuple, 0); + + prebuilt->select_lock_type = LOCK_NONE; + cnt = 1000; + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0); +loop: + /* Check thd->killed every 1,000 scanned rows */ + if (--cnt == 0) { + if (trx_is_interrupted(prebuilt->trx)) { + goto func_exit; + } + cnt = 1000; + } + if (ret != DB_SUCCESS) { +func_exit: + mem_free(buf); + mem_heap_free(heap); + + return(is_ok); + } + + *n_rows = *n_rows + 1; + + /* row_search... 
returns the index record in buf, record origin offset + within buf stored in the first 4 bytes, because we have built a dummy + template */ + + rec = buf + mach_read_from_4(buf); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + if (prev_entry != NULL) { + matched_fields = 0; + matched_bytes = 0; + + cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets, + &matched_fields, + &matched_bytes); + contains_null = FALSE; + + /* In a unique secondary index we allow equal key values if + they contain SQL NULLs */ + + for (i = 0; + i < dict_index_get_n_ordering_defined_by_user(index); + i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(prev_entry, i))) { + + contains_null = TRUE; + } + } + + if (cmp > 0) { + fputs("InnoDB: index records in a wrong order in ", + stderr); +not_ok: + dict_index_name_print(stderr, + prebuilt->trx, index); + fputs("\n" + "InnoDB: prev record ", stderr); + dtuple_print(stderr, prev_entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + is_ok = FALSE; + } else if (dict_index_is_unique(index) + && !contains_null + && matched_fields + >= dict_index_get_n_ordering_defined_by_user( + index)) { + + fputs("InnoDB: duplicate key in ", stderr); + goto not_ok; + } + } + + { + mem_heap_t* tmp_heap = NULL; + + /* Empty the heap on each round. But preserve offsets[] + for the row_rec_to_index_entry() call, by copying them + into a separate memory heap when needed. */ + if (UNIV_UNLIKELY(offsets != offsets_)) { + ulint size = rec_offs_get_n_alloc(offsets) + * sizeof *offsets; + + tmp_heap = mem_heap_create(size); + offsets = mem_heap_dup(tmp_heap, offsets, size); + } + + mem_heap_empty(heap); + + prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, + index, offsets, + &n_ext, heap); + + if (UNIV_LIKELY_NULL(tmp_heap)) { + mem_heap_free(tmp_heap); + } + } + + ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT); + + goto loop; +} + +/************************************************************************* +Checks a table for corruption. */ +UNIV_INTERN +ulint +row_check_table_for_mysql( +/*======================*/ + /* out: DB_ERROR or DB_SUCCESS */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL + handle */ +{ + dict_table_t* table = prebuilt->table; + dict_index_t* index; + ulint n_rows; + ulint n_rows_in_table = ULINT_UNDEFINED; + ulint ret = DB_SUCCESS; + ulint old_isolation_level; + + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you" + " used DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + table->name); + return(DB_ERROR); + } + + prebuilt->trx->op_info = "checking table"; + + old_isolation_level = prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we use always + REPEATABLE READ here */ + + prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + /* Enlarge the fatal lock wait timeout during CHECK TABLE. 
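+
+The threshold is what the error monitor uses to decide that a
+semaphore wait has hung and the server must be killed; a long CHECK
+TABLE can legitimately exceed the default (on the order of 600
+seconds). The adjustment is symmetric and kernel_mutex-protected, so
+concurrent CHECK TABLE statements each add and later subtract their
+own 7200 s; a sketch of the invariant (illustrative only):
+
+	ulint	before = srv_fatal_semaphore_wait_threshold;
+	row_check_table_for_mysql(prebuilt);
+	// restored even when the scan is interrupted early
+	ut_ad(srv_fatal_semaphore_wait_threshold == before);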
*/ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + index = dict_table_get_first_index(table); + + while (index != NULL) { + /* fputs("Validating index ", stderr); + ut_print_name(stderr, trx, FALSE, index->name); + putc('\n', stderr); */ + + if (!btr_validate_index(index, prebuilt->trx)) { + ret = DB_ERROR; + } else { + if (!row_scan_and_check_index(prebuilt,index, &n_rows)){ + ret = DB_ERROR; + } + + if (trx_is_interrupted(prebuilt->trx)) { + break; + } + + /* fprintf(stderr, "%lu entries in index %s\n", n_rows, + index->name); */ + + if (index == dict_table_get_first_index(table)) { + n_rows_in_table = n_rows; + } else if (n_rows != n_rows_in_table) { + + ret = DB_ERROR; + + fputs("Error: ", stderr); + dict_index_name_print(stderr, + prebuilt->trx, index); + fprintf(stderr, + " contains %lu entries," + " should be %lu\n", + (ulong) n_rows, + (ulong) n_rows_in_table); + } + } + + index = dict_table_get_next_index(index); + } + + /* Restore the original isolation level */ + prebuilt->trx->isolation_level = old_isolation_level; + + /* We validate also the whole adaptive hash index for all tables + at every CHECK TABLE */ + + if (!btr_search_validate()) { + + ret = DB_ERROR; + } + + /* Restore the fatal lock wait timeout after CHECK TABLE. */ + mutex_enter(&kernel_mutex); + srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */ + mutex_exit(&kernel_mutex); + + prebuilt->trx->op_info = ""; + + return(ret); +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************************* +Determines if a table is a magic monitor table. */ +UNIV_INTERN +ibool +row_is_magic_monitor_table( +/*=======================*/ + /* out: TRUE if monitor table */ + const char* table_name) /* in: name of the table, in the + form database/table_name */ +{ + const char* name; /* table_name without database/ */ + ulint len; + + name = strchr(table_name, '/'); + ut_a(name != NULL); + name++; + len = strlen(name) + 1; + + if (STR_EQ(name, len, S_innodb_monitor) + || STR_EQ(name, len, S_innodb_lock_monitor) + || STR_EQ(name, len, S_innodb_tablespace_monitor) + || STR_EQ(name, len, S_innodb_table_monitor) + || STR_EQ(name, len, S_innodb_mem_validate)) { + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/xtradb/row/row0purge.c b/storage/xtradb/row/row0purge.c new file mode 100644 index 00000000000..8c3f9b993ba --- /dev/null +++ b/storage/xtradb/row/row0purge.c @@ -0,0 +1,690 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#include "row0purge.h" + +#ifdef UNIV_NONINL +#include "row0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "row0vers.h" +#include "row0mysql.h" +#include "log0log.h" + +/************************************************************************ +Creates a purge node to a query graph. */ +UNIV_INTERN +purge_node_t* +row_purge_node_create( +/*==================*/ + /* out, own: purge node */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + purge_node_t* node; + + ut_ad(parent && heap); + + node = mem_heap_alloc(heap, sizeof(purge_node_t)); + + node->common.type = QUE_NODE_PURGE; + node->common.parent = parent; + + node->heap = mem_heap_create(256); + + return(node); +} + +/*************************************************************** +Repositions the pcur in the purge node on the clustered index record, +if found. */ +static +ibool +row_purge_reposition_pcur( +/*======================*/ + /* out: TRUE if the record was found */ + ulint mode, /* in: latching mode */ + purge_node_t* node, /* in: row purge node */ + mtr_t* mtr) /* in: mtr */ +{ + ibool found; + + if (node->found_clust) { + found = btr_pcur_restore_position(mode, &(node->pcur), mtr); + + return(found); + } + + found = row_search_on_row_ref(&(node->pcur), mode, node->table, + node->ref, mtr); + node->found_clust = found; + + if (found) { + btr_pcur_store_position(&(node->pcur), mtr); + } + + return(found); +} + +/*************************************************************** +Removes a delete marked clustered index record if possible. 
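+
+Callers first invoke this with BTR_MODIFY_LEAF, which is cheap but can
+fail when the deletion would require restructuring the tree, and only
+then fall back to BTR_MODIFY_TREE. A minimal sketch of that calling
+pattern, simplified from row_purge_remove_clust_if_poss() below (the
+real code bounds the retries with BTR_CUR_RETRY_DELETE_N_TIMES):
+
+	if (!row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) {
+		// pessimistic path; fails only on out-of-file-space
+		while (!row_purge_remove_clust_if_poss_low(
+				node, BTR_MODIFY_TREE)) {
+			os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+		}
+	}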
*/ +static +ibool +row_purge_remove_clust_if_poss_low( +/*===============================*/ + /* out: TRUE if success, or if not found, or + if modified after the delete marking */ + purge_node_t* node, /* in: row purge node */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ibool success; + ulint err; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + index = dict_table_get_first_index(node->table); + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + mtr_start(&mtr); + + success = row_purge_reposition_pcur(mode, node, &mtr); + + if (!success) { + /* The record is already removed */ + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + rec = btr_pcur_get_rec(pcur); + + if (0 != ut_dulint_cmp(node->roll_ptr, row_get_rec_roll_ptr( + rec, index, rec_get_offsets( + rec, index, offsets_, + ULINT_UNDEFINED, &heap)))) { + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + /* Someone else has modified the record later: do not remove */ + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(TRUE); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + RB_NONE, &mtr); + + if (err == DB_SUCCESS) { + success = TRUE; + } else if (err == DB_OUT_OF_FILE_SPACE) { + success = FALSE; + } else { + ut_error; + } + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + return(success); +} + +/*************************************************************** +Removes a clustered index record if it has not been modified after the delete +marking. */ +static +void +row_purge_remove_clust_if_poss( +/*===========================*/ + purge_node_t* node) /* in: row purge node */ +{ + ibool success; + ulint n_tries = 0; + + /* fputs("Purge: Removing clustered record\n", stderr); */ + + success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/*************************************************************** +Removes a secondary index entry if possible. */ +static +ibool +row_purge_remove_sec_if_poss_low( +/*=============================*/ + /* out: TRUE if success or if not found */ + purge_node_t* node, /* in: row purge node */ + dict_index_t* index, /* in: index */ + const dtuple_t* entry, /* in: index entry */ + ulint mode) /* in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has = 0; /* remove warning */ + ibool found; + ulint err; + mtr_t mtr; + mtr_t mtr_vers; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + if (!found) { + /* Not found. This is a legitimate condition. In a + rollback, InnoDB will remove secondary recs that would + be purged anyway. Then the actual purge will not find + the secondary index record. 
Also, the purge itself is + eager: if it comes to consider a secondary index + record, and notices it does not need to exist in the + index, it will remove it. Then if/when the purge + comes to consider the secondary index record a second + time, it will not exist any more in the index. */ + + /* fputs("PURGE:........sec entry not found\n", stderr); */ + /* dtuple_print(stderr, entry); */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(TRUE); + } + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + /* We should remove the index record if no later version of the row, + which cannot be purged yet, requires its existence. If some requires, + we should do nothing. */ + + mtr_start(&mtr_vers); + + success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers); + + if (success) { + old_has = row_vers_old_has_index_entry( + TRUE, btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + + if (!success || !old_has) { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + RB_NONE, &mtr); + success = err == DB_SUCCESS; + ut_a(success || err == DB_OUT_OF_FILE_SPACE); + } + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(success); +} + +/*************************************************************** +Removes a secondary index entry if possible. */ +UNIV_INLINE +void +row_purge_remove_sec_if_poss( +/*=========================*/ + purge_node_t* node, /* in: row purge node */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + ibool success; + ulint n_tries = 0; + + /* fputs("Purge: Removing secondary record\n", stderr); */ + + success = row_purge_remove_sec_if_poss_low(node, index, entry, + BTR_MODIFY_LEAF); + if (success) { + + return; + } +retry: + success = row_purge_remove_sec_if_poss_low(node, index, entry, + BTR_MODIFY_TREE); + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + ut_a(success); +} + +/*************************************************************** +Purges a delete marking of a record. */ +static +void +row_purge_del_mark( +/*===============*/ + purge_node_t* node) /* in: row purge node */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + + ut_ad(node); + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + /* Build the index entry */ + entry = row_build_index_entry(node->row, NULL, index, heap); + ut_a(entry); + row_purge_remove_sec_if_poss(node, index, entry); + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + row_purge_remove_clust_if_poss(node); +} + +/*************************************************************** +Purges an update of an existing record. Also purges an update of a delete +marked record if that record contained an externally stored field. 
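+
+For the externally stored case, each changed field of the update
+vector still points into the undo log record, so the file address of
+the BLOB reference can be recomputed from the roll pointer. A minimal
+sketch of the arithmetic used below, written with the function's own
+local variables (illustrative; ufield stands for one upd_field_t of
+node->update):
+
+	trx_undo_decode_roll_ptr(node->roll_ptr, &is_insert,
+				 &rseg_id, &page_no, &offset);
+	// byte offset of the field data inside the undo log record
+	internal_offset = ((const byte*) dfield_get_data(&ufield->new_val))
+		- node->undo_rec;
+	// the BLOB reference then lives at:
+	//	undo page frame + offset + internal_offset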
*/ +static +void +row_purge_upd_exist_or_extern( +/*==========================*/ + purge_node_t* node) /* in: row purge node */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ibool is_insert; + ulint rseg_id; + ulint page_no; + ulint offset; + ulint i; + mtr_t mtr; + + ut_ad(node); + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + goto skip_secondaries; + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field_binary(NULL, node->index, + node->update)) { + /* Build the older version of the index entry */ + entry = row_build_index_entry(node->row, NULL, + index, heap); + ut_a(entry); + row_purge_remove_sec_if_poss(node, index, entry); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + +skip_secondaries: + /* Free possible externally stored fields */ + for (i = 0; i < upd_get_n_fields(node->update); i++) { + + const upd_field_t* ufield + = upd_get_nth_field(node->update, i); + + if (dfield_is_ext(&ufield->new_val)) { + buf_block_t* block; + ulint internal_offset; + byte* data_field; + + /* We use the fact that new_val points to + node->undo_rec and get thus the offset of + dfield data inside the undo record. Then we + can calculate from node->roll_ptr the file + address of the new_val data */ + + internal_offset + = ((const byte*) + dfield_get_data(&ufield->new_val)) + - node->undo_rec; + + ut_a(internal_offset < UNIV_PAGE_SIZE); + + trx_undo_decode_roll_ptr(node->roll_ptr, + &is_insert, &rseg_id, + &page_no, &offset); + mtr_start(&mtr); + + /* We have to acquire an X-latch to the clustered + index tree */ + + index = dict_table_get_first_index(node->table); + + mtr_x_lock(dict_index_get_lock(index), &mtr); + + /* NOTE: we must also acquire an X-latch to the + root page of the tree. We will need it when we + free pages from the tree. If the tree is of height 1, + the tree X-latch does NOT protect the root page, + because it is also a leaf page. Since we will have a + latch on an undo log page, we would break the + latching order if we would only later latch the + root page of such a tree! */ + + btr_root_get(index, &mtr); + + /* We assume in purge of externally stored fields + that the space id of the undo log record is 0! */ + + block = buf_page_get(0, 0, page_no, RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + data_field = buf_block_get_frame(block) + + offset + internal_offset; + + ut_a(dfield_get_len(&ufield->new_val) + >= BTR_EXTERN_FIELD_REF_SIZE); + btr_free_externally_stored_field( + index, + data_field + dfield_get_len(&ufield->new_val) + - BTR_EXTERN_FIELD_REF_SIZE, + NULL, NULL, NULL, 0, RB_NONE, &mtr); + mtr_commit(&mtr); + } + } +} + +/*************************************************************** +Parses the row reference and other info in a modify undo log record. */ +static +ibool +row_purge_parse_undo_rec( +/*=====================*/ + /* out: TRUE if purge operation required: + NOTE that then the CALLER must unfreeze + data dictionary! 
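+
+A sketch of the caller's side of that contract (cf. row_purge()
+below; illustrative only):
+
+	if (row_purge_parse_undo_rec(node, &updated_extern, thr)) {
+		// ... perform the purge ...
+		row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+	}
+	// on a FALSE return the dictionary is not left frozen, so the
+	// caller must not unfreeze it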
*/ + purge_node_t* node, /* in: row undo node */ + ibool* updated_extern, + /* out: TRUE if an externally stored field + was updated */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + trx_t* trx; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + + ut_ad(node && thr); + + trx = thr_get_trx(thr); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + updated_extern, &undo_no, &table_id); + node->rec_type = type; + + if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) { + + return(FALSE); + } + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + node->table = NULL; + + if (type == TRX_UNDO_UPD_EXIST_REC + && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) { + + /* Purge requires no changes to indexes: we may return */ + + return(FALSE); + } + + /* Prevent DROP TABLE etc. from running when we are doing the purge + for this row */ + + row_mysql_freeze_data_dictionary(trx); + + mutex_enter(&(dict_sys->mutex)); + + node->table = dict_table_get_on_id_low(table_id); + + mutex_exit(&(dict_sys->mutex)); + + if (node->table == NULL) { + /* The table has been dropped: no need to do purge */ +err_exit: + row_mysql_unfreeze_data_dictionary(trx); + return(FALSE); + } + + if (node->table->ibd_file_missing) { + /* We skip purge of missing .ibd files */ + + node->table = NULL; + + goto err_exit; + } + + clust_index = dict_table_get_first_index(node->table); + + if (clust_index == NULL) { + /* The table was corrupt in the data dictionary */ + + goto err_exit; + } + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, trx, + node->heap, &(node->update)); + + /* Read to the partial row the fields that occur in indexes */ + + if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + ptr = trx_undo_rec_get_partial_row( + ptr, clust_index, &node->row, + type == TRX_UNDO_UPD_DEL_REC, + node->heap); + } + + return(TRUE); +} + +/*************************************************************** +Fetches an undo log record and does the purge for the recorded operation. +If none left, or the current purge completed, returns the control to the +parent node, which is always a query thread node. */ +static +ulint +row_purge( +/*======*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + purge_node_t* node, /* in: row purge node */ + que_thr_t* thr) /* in: query thread */ +{ + dulint roll_ptr; + ibool purge_needed; + ibool updated_extern; + trx_t* trx; + + ut_ad(node && thr); + + trx = thr_get_trx(thr); + + node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr, + &(node->reservation), + node->heap); + if (!node->undo_rec) { + /* Purge completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + + if (node->undo_rec == &trx_purge_dummy_rec) { + purge_needed = FALSE; + } else { + purge_needed = row_purge_parse_undo_rec(node, &updated_extern, + thr); + /* If purge_needed == TRUE, we must also remember to unfreeze + data dictionary! 
*/ + } + + if (purge_needed) { + node->found_clust = FALSE; + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + if (node->rec_type == TRX_UNDO_DEL_MARK_REC) { + row_purge_del_mark(node); + + } else if (updated_extern + || node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + + row_purge_upd_exist_or_extern(node); + } + + if (node->found_clust) { + btr_pcur_close(&(node->pcur)); + } + + row_mysql_unfreeze_data_dictionary(trx); + } + + /* Do some cleanup */ + trx_purge_rec_release(node->reservation); + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(DB_SUCCESS); +} + +/*************************************************************** +Does the purge operation for a single undo log record. This is a high-level +function used in an SQL execution graph. */ +UNIV_INTERN +que_thr_t* +row_purge_step( +/*===========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + purge_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + + err = row_purge(node, thr); + + ut_ad(err == DB_SUCCESS); + + return(thr); +} diff --git a/storage/xtradb/row/row0row.c b/storage/xtradb/row/row0row.c new file mode 100644 index 00000000000..4343ee2b009 --- /dev/null +++ b/storage/xtradb/row/row0row.c @@ -0,0 +1,1241 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "row0row.h" + +#ifdef UNIV_NONINL +#include "row0row.ic" +#endif + +#include "data0type.h" +#include "dict0dict.h" +#include "btr0btr.h" +#include "ha_prototypes.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0ext.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "ut0mem.h" + +/************************************************************************* +Gets the offset of trx id field, in bytes relative to the origin of +a clustered index record. 
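+
+A minimal usage sketch (illustrative; trx_read_trx_id() is assumed to
+be the helper that reads the DATA_TRX_ID value from a field pointer):
+
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets;
+	mem_heap_t*	heap = NULL;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, clust_index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+	trx_id = trx_read_trx_id(
+		rec + row_get_trx_id_offset(rec, clust_index, offsets));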
*/ +UNIV_INTERN +ulint +row_get_trx_id_offset( +/*==================*/ + /* out: offset of DATA_TRX_ID */ + const rec_t* rec __attribute__((unused)), + /* in: record */ + dict_index_t* index, /* in: clustered index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + ulint pos; + ulint offset; + ulint len; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offset = rec_get_nth_field_offs(offsets, pos, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/********************************************************************* +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. */ +UNIV_INTERN +dtuple_t* +row_build_index_entry( +/*==================*/ + /* out: index entry which should be + inserted or purged, or NULL if the + externally stored columns in the + clustered index record are unavailable + and ext != NULL */ + const dtuple_t* row, /* in: row which should be + inserted or purged */ + row_ext_t* ext, /* in: externally stored column prefixes, + or NULL */ + dict_index_t* index, /* in: index on the table */ + mem_heap_t* heap) /* in: memory heap from which the memory for + the index entry is allocated */ +{ + dtuple_t* entry; + ulint entry_len; + ulint i; + + ut_ad(row && index && heap); + ut_ad(dtuple_check_typed(row)); + + entry_len = dict_index_get_n_fields(index); + entry = dtuple_create(heap, entry_len); + + if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) { + dtuple_set_n_fields_cmp(entry, entry_len); + /* There may only be externally stored columns + in a clustered index B-tree of a user table. */ + ut_a(!ext); + } else { + dtuple_set_n_fields_cmp( + entry, dict_index_get_n_unique_in_tree(index)); + } + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = ind_field->col; + ulint col_no + = dict_col_get_no(col); + dfield_t* dfield + = dtuple_get_nth_field(entry, i); + const dfield_t* dfield2 + = dtuple_get_nth_field(row, col_no); + ulint len + = dfield_get_len(dfield2); + + dfield_copy(dfield, dfield2); + + if (dfield_is_null(dfield) || ind_field->prefix_len == 0) { + continue; + } + + /* If a column prefix index, take only the prefix. + Prefix-indexed columns may be externally stored. */ + ut_ad(col->ord_part); + + if (UNIV_LIKELY_NULL(ext)) { + /* See if the column is stored externally. */ + const byte* buf = row_ext_lookup(ext, col_no, + &len); + if (UNIV_LIKELY_NULL(buf)) { + if (UNIV_UNLIKELY(buf == field_ref_zero)) { + return(NULL); + } + dfield_set_data(dfield, buf, len); + } + } else if (dfield_is_ext(dfield)) { + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + len -= BTR_EXTERN_FIELD_REF_SIZE; + ut_a(ind_field->prefix_len <= len + || dict_index_is_clust(index)); + } + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ind_field->prefix_len, len, dfield_get_data(dfield)); + dfield_set_len(dfield, len); + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*********************************************************************** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. */ +UNIV_INTERN +dtuple_t* +row_build( +/*======*/ + /* out, own: row built; + see the NOTE below! 
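+
+A typical call (illustrative): passing NULL for offsets and col_table
+lets the function compute the offsets itself and consult index->table:
+
+	row_ext_t*	ext;
+	dtuple_t*	row = row_build(ROW_COPY_DATA, clust_index, rec,
+					NULL, NULL, &ext, heap);
+
+With ROW_COPY_DATA the record is first copied into heap, so the page
+latch need not be held after the call.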
*/ + ulint type, /* in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /* in: clustered index */ + const rec_t* rec, /* in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const ulint* offsets,/* in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /* in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead */ + row_ext_t** ext, /* out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap) /* in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* row; + const dict_table_t* table; + ulint n_fields; + ulint n_ext_cols; + ulint* ext_cols = NULL; /* remove warning */ + ulint len; + ulint row_len; + byte* buf; + ulint i; + ulint j; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_ad(index && rec && heap); + ut_ad(dict_index_is_clust(index)); + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &tmp_heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + if (type != ROW_COPY_POINTERS) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, (ulint*) offsets); + } + + table = index->table; + row_len = dict_table_get_n_cols(table); + + row = dtuple_create(heap, row_len); + + dict_table_copy_types(row, table); + + dtuple_set_info_bits(row, rec_get_info_bits( + rec, dict_table_is_comp(table))); + + n_fields = rec_offs_n_fields(offsets); + n_ext_cols = rec_offs_n_extern(offsets); + if (n_ext_cols) { + ext_cols = mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols); + } + + for (i = j = 0; i < n_fields; i++) { + dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(ind_field); + ulint col_no + = dict_col_get_no(col); + dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + + if (ind_field->prefix_len == 0) { + + const byte* field = rec_get_nth_field( + rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + } + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + + if (UNIV_LIKELY_NULL(col_table)) { + ut_a(col_no + < dict_table_get_n_cols(col_table)); + col = dict_table_get_nth_col( + col_table, col_no); + } + + if (col->ord_part) { + /* We will have to fetch prefixes of + externally stored columns that are + referenced by column prefixes. */ + ext_cols[j++] = col_no; + } + } + } + + ut_ad(dtuple_check_typed(row)); + + if (j) { + *ext = row_ext_create(j, ext_cols, row, + dict_table_zip_size(index->table), + heap); + } else { + *ext = NULL; + } + + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(row); +} + +/*********************************************************************** +Converts an index record to a typed data tuple. 
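+
+The fields of the returned tuple point directly into rec, and *n_ext
+reports how many of them carry the externally-stored flag. A minimal
+usage sketch (illustrative only):
+
+	ulint		n_ext;
+	dtuple_t*	entry = row_rec_to_index_entry_low(
+		rec, index, offsets, &n_ext, heap);
+	// entry stays valid only while the page holding rec is latched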
*/ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + /* out: index entry built; does not + set info_bits, and the data fields in + the entry will point directly to rec */ + const rec_t* rec, /* in: record in the index */ + const dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + ulint* n_ext, /* out: number of externally + stored columns */ + mem_heap_t* heap) /* in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + ulint i; + const byte* field; + ulint len; + ulint rec_len; + + ut_ad(rec && heap && index); + /* Because this function may be invoked by row0merge.c + on a record whose header is in different format, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + ut_ad(n_ext); + *n_ext = 0; + + rec_len = rec_offs_n_fields(offsets); + + entry = dtuple_create(heap, rec_len); + + dtuple_set_n_fields_cmp(entry, + dict_index_get_n_unique_in_tree(index)); + ut_ad(rec_len == dict_index_get_n_fields(index)); + + dict_index_copy_types(entry, index, rec_len); + + for (i = 0; i < rec_len; i++) { + + dfield = dtuple_get_nth_field(entry, i); + field = rec_get_nth_field(rec, offsets, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_offs_nth_extern(offsets, i)) { + dfield_set_ext(dfield); + (*n_ext)++; + } + } + + ut_ad(dtuple_check_typed(entry)); + + return(entry); +} + +/*********************************************************************** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. */ +UNIV_INTERN +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + /* out, own: index entry + built; see the NOTE below! */ + ulint type, /* in: ROW_COPY_DATA, or + ROW_COPY_POINTERS: the former + copies also the data fields to + heap as the latter only places + pointers to data fields on the + index page */ + const rec_t* rec, /* in: record in the index; + NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the dtuple is used! */ + const dict_index_t* index, /* in: index */ + ulint* offsets,/* in/out: rec_get_offsets(rec) */ + ulint* n_ext, /* out: number of externally + stored columns */ + mem_heap_t* heap) /* in: memory heap from which + the memory needed is allocated */ +{ + dtuple_t* entry; + byte* buf; + + ut_ad(rec && heap && index); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); + } + + entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap); + + dtuple_set_info_bits(entry, + rec_get_info_bits(rec, rec_offs_comp(offsets))); + + return(entry); +} + +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +dtuple_t* +row_build_row_ref( +/*==============*/ + /* out, own: row reference built; see the + NOTE below! 
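+
+Typically the reference is then used to look up the clustered index
+record, e.g. (illustrative):
+
+	dtuple_t*	ref = row_build_row_ref(ROW_COPY_DATA, sec_index,
+						rec, heap);
+	// ref has dict_index_get_n_unique(clust_index) fields and can
+	// be passed to row_search_on_row_ref() to position a cursor
+	// on the clustered index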
*/ + ulint type, /* in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /* in: secondary index */ + const rec_t* rec, /* in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + dict_table_t* table; + dict_index_t* clust_index; + dfield_t* dfield; + dtuple_t* ref; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + byte* buf; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* tmp_heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(index && rec && heap); + ut_ad(!dict_index_is_clust(index)); + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &tmp_heap); + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + + if (type == ROW_COPY_DATA) { + /* Take a copy of rec to heap */ + + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + + rec = rec_copy(buf, rec, offsets); + /* Avoid a debug assertion in rec_offs_validate(). */ + rec_offs_make_valid(rec, index, offsets); + } + + table = index->table; + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (tmp_heap) { + mem_heap_free(tmp_heap); + } + + return(ref); +} + +/*********************************************************************** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INTERN +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /* in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! 
*/ + const dict_index_t* index, /* in: secondary index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) + or NULL */ + trx_t* trx) /* in: transaction */ +{ + const dict_index_t* clust_index; + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint pos; + ulint clust_col_prefix_len; + ulint i; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + ut_a(ref); + ut_a(index); + ut_a(rec); + ut_ad(!dict_index_is_clust(index)); + + if (UNIV_UNLIKELY(!index->table)) { + fputs("InnoDB: table ", stderr); +notfound: + ut_print_name(stderr, trx, TRUE, index->table_name); + fputs(" for index ", stderr); + ut_print_name(stderr, trx, FALSE, index->name); + fputs(" not found\n", stderr); + ut_error; + } + + clust_index = dict_table_get_first_index(index->table); + + if (UNIV_UNLIKELY(!clust_index)) { + fputs("InnoDB: clust index for table ", stderr); + goto notfound; + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } + + /* Secondary indexes must not contain externally stored columns. */ + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + dict_index_copy_types(ref, clust_index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + pos = dict_index_get_nth_field_pos(index, clust_index, i); + + ut_a(pos != ULINT_UNDEFINED); + + field = rec_get_nth_field(rec, offsets, pos, &len); + + dfield_set_data(dfield, field, len); + + /* If the primary key contains a column prefix, then the + secondary index may contain a longer prefix of the same + column, or the full column, and we must adjust the length + accordingly. */ + + clust_col_prefix_len = dict_index_get_nth_field( + clust_index, i)->prefix_len; + + if (clust_col_prefix_len > 0) { + if (len != UNIV_SQL_NULL) { + + const dtype_t* dtype + = dfield_get_type(dfield); + + dfield_set_len(dfield, + dtype_get_at_most_n_mbchars( + dtype->prtype, + dtype->mbminlen, + dtype->mbmaxlen, + clust_col_prefix_len, + len, (char*) field)); + } + } + } + + ut_ad(dtuple_check_typed(ref)); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/*********************************************************************** +From a row build a row reference with which we can search the clustered +index record. */ +UNIV_INTERN +void +row_build_row_ref_from_row( +/*=======================*/ + dtuple_t* ref, /* in/out: row reference built; + see the NOTE below! + ref must have the right number + of fields! 
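+
+A minimal sketch of a hypothetical caller (variable names illustrative);
+ref must be allocated with exactly as many fields as the clustered index
+has unique fields:
+
+	clust_index = dict_table_get_first_index(table);
+	ref = dtuple_create(heap, dict_index_get_n_unique(clust_index));
+	row_build_row_ref_from_row(ref, table, row);
+
+The resulting ref can then be passed to e.g. row_search_on_row_ref()
+below to position a cursor on the clustered index record.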
*/ + const dict_table_t* table, /* in: table */ + const dtuple_t* row) /* in: row + NOTE: the data fields in ref will point + directly into data of this row */ +{ + const dict_index_t* clust_index; + ulint ref_len; + ulint i; + + ut_ad(ref && table && row); + + clust_index = dict_table_get_first_index(table); + + ref_len = dict_index_get_n_unique(clust_index); + + ut_ad(ref_len == dtuple_get_n_fields(ref)); + + for (i = 0; i < ref_len; i++) { + const dict_col_t* col; + const dict_field_t* field; + dfield_t* dfield; + const dfield_t* dfield2; + + dfield = dtuple_get_nth_field(ref, i); + + field = dict_index_get_nth_field(clust_index, i); + + col = dict_field_get_col(field); + + dfield2 = dtuple_get_nth_field(row, dict_col_get_no(col)); + + dfield_copy(dfield, dfield2); + ut_ad(!dfield_is_ext(dfield)); + + if (field->prefix_len > 0 && !dfield_is_null(dfield)) { + + ulint len = dfield_get_len(dfield); + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + field->prefix_len, + len, dfield_get_data(dfield)); + + dfield_set_len(dfield, len); + } + } + + ut_ad(dtuple_check_typed(ref)); +} + +/******************************************************************* +Searches the clustered index record for a row, if we have the row reference. */ +UNIV_INTERN +ibool +row_search_on_row_ref( +/*==================*/ + /* out: TRUE if found */ + btr_pcur_t* pcur, /* out: persistent cursor, which must + be closed by the caller */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /* in: table */ + const dtuple_t* ref, /* in: row reference */ + mtr_t* mtr) /* in/out: mtr */ +{ + ulint low_match; + rec_t* rec; + dict_index_t* index; + + ut_ad(dtuple_check_typed(ref)); + + index = dict_table_get_first_index(table); + + ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index)); + + btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr); + + low_match = btr_pcur_get_low_match(pcur); + + rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_infimum(rec)) { + + return(FALSE); + } + + if (low_match != dtuple_get_n_fields(ref)) { + + return(FALSE); + } + + return(TRUE); +} + +/************************************************************************* +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. */ +UNIV_INTERN +rec_t* +row_get_clust_rec( +/*==============*/ + /* out: record or NULL, if no record found */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index, /* in: secondary index */ + dict_index_t** clust_index,/* out: clustered index */ + mtr_t* mtr) /* in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* ref; + dict_table_t* table; + btr_pcur_t pcur; + ibool found; + rec_t* clust_rec; + + ut_ad(!dict_index_is_clust(index)); + + table = index->table; + + heap = mem_heap_create(256); + + ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap); + + found = row_search_on_row_ref(&pcur, mode, table, ref, mtr); + + clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL; + + mem_heap_free(heap); + + btr_pcur_close(&pcur); + + *clust_index = dict_table_get_first_index(table); + + return(clust_rec); +} + +/******************************************************************* +Searches an index record. */ +UNIV_INTERN +ibool +row_search_index_entry( +/*===================*/ + /* out: TRUE if found */ + dict_index_t* index, /* in: index */ + const dtuple_t* entry, /* in: index entry */ + ulint mode, /* in: BTR_MODIFY_LEAF, ... 
*/
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor, which must
+				be closed by the caller */
+	mtr_t*		mtr)	/* in: mtr */
+{
+	ulint	n_fields;
+	ulint	low_match;
+	rec_t*	rec;
+
+	ut_ad(dtuple_check_typed(entry));
+
+	btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+	low_match = btr_pcur_get_low_match(pcur);
+
+	rec = btr_pcur_get_rec(pcur);
+
+	n_fields = dtuple_get_n_fields(entry);
+
+	return(!page_rec_is_infimum(rec) && low_match == n_fields);
+}
+
+#ifndef UNIV_HOTBACKUP
+
+#include
+
+/***********************************************************************
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_INT using "prtype" and writes the result to "buf".
+If the data is in unknown format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0'). */
+static
+ulint
+row_raw_format_int(
+/*===============*/
+					/* out: number of bytes
+					that were written */
+	const char*	data,		/* in: raw data */
+	ulint		data_len,	/* in: raw data length
+					in bytes */
+	ulint		prtype,		/* in: precise type */
+	char*		buf,		/* out: output buffer */
+	ulint		buf_size,	/* in: output buffer size
+					in bytes */
+	ibool*		format_in_hex)	/* out: should the data be
+					formatted in hex */
+{
+	ulint	ret;
+
+	if (data_len <= sizeof(ullint)) {
+
+		ullint	value;
+		ibool	unsigned_type = prtype & DATA_UNSIGNED;
+
+		value = mach_read_int_type((const byte*) data,
+					   data_len, unsigned_type);
+
+		if (unsigned_type) {
+
+			ret = ut_snprintf(buf, buf_size, "%llu",
+					  value) + 1;
+		} else {
+
+			ret = ut_snprintf(buf, buf_size, "%lld",
+					  (long long) value) + 1;
+		}
+
+	} else {
+
+		*format_in_hex = TRUE;
+		ret = 0;
+	}
+
+	return(ut_min(ret, buf_size));
+}
+
+/***********************************************************************
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the
+result to "buf".
+If the data is in binary format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
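+
+For example (illustrative; the exact output is determined by
+ut_str_sql_format() and innobase_raw_format()): in the UTF-8 case a
+value "abc" would be SQL-quoted to 'abc' and 6 returned (five
+characters plus the terminating '\0'), while a value in the MySQL
+binary collation writes nothing, sets *format_in_hex to TRUE and
+returns 0, so that the caller can emit a 0x... hex dump instead.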
*/ +static +ulint +row_raw_format_str( +/*===============*/ + /* out: number of bytes + that were written */ + const char* data, /* in: raw data */ + ulint data_len, /* in: raw data length + in bytes */ + ulint prtype, /* in: precise type */ + char* buf, /* out: output buffer */ + ulint buf_size, /* in: output buffer size + in bytes */ + ibool* format_in_hex) /* out: should the data be + formated in hex */ +{ + ulint charset_coll; + + if (buf_size == 0) { + + return(0); + } + + /* we assume system_charset_info is UTF-8 */ + + charset_coll = dtype_get_charset_coll(prtype); + + if (UNIV_LIKELY(dtype_is_utf8(prtype))) { + + return(ut_str_sql_format(data, data_len, buf, buf_size)); + } + /* else */ + + if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) { + + *format_in_hex = TRUE; + return(0); + } + /* else */ + + return(innobase_raw_format(data, data_len, charset_coll, + buf, buf_size)); +} + +/*********************************************************************** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". +Not more than "buf_size" bytes are written to "buf". +The result is always '\0'-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating '\0'). */ +UNIV_INTERN +ulint +row_raw_format( +/*===========*/ + /* out: number of bytes + that were written */ + const char* data, /* in: raw data */ + ulint data_len, /* in: raw data length + in bytes */ + const dict_field_t* dict_field, /* in: index field */ + char* buf, /* out: output buffer */ + ulint buf_size) /* in: output buffer size + in bytes */ +{ + ulint mtype; + ulint prtype; + ulint ret; + ibool format_in_hex; + + if (buf_size == 0) { + + return(0); + } + + if (data_len == UNIV_SQL_NULL) { + + ret = ut_snprintf((char*) buf, buf_size, "NULL") + 1; + + return(ut_min(ret, buf_size)); + } + + mtype = dict_field->col->mtype; + prtype = dict_field->col->prtype; + + format_in_hex = FALSE; + + switch (mtype) { + case DATA_INT: + + ret = row_raw_format_int(data, data_len, prtype, + buf, buf_size, &format_in_hex); + break; + case DATA_CHAR: + case DATA_VARCHAR: + case DATA_MYSQL: + case DATA_VARMYSQL: + + ret = row_raw_format_str(data, data_len, prtype, + buf, buf_size, &format_in_hex); + break; + /* XXX support more data types */ + default: + + format_in_hex = TRUE; + } + + if (format_in_hex) { + + if (UNIV_LIKELY(buf_size > 2)) { + + memcpy(buf, "0x", 2); + buf += 2; + buf_size -= 2; + ret = 2 + ut_raw_to_hex(data, data_len, + buf, buf_size); + } else { + + buf[0] = '\0'; + ret = 1; + } + } + + return(ret); +} + +#endif /* !UNIV_HOTBACKUP */ + +#ifdef UNIV_COMPILE_TEST_FUNCS + +#include "ut0dbg.h" + +void +test_row_raw_format_int() +{ + ulint ret; + char buf[128]; + ibool format_in_hex; + +#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\ + ret_expected, buf_expected, format_in_hex_expected)\ + do {\ + ibool ok = TRUE;\ + ulint i;\ + memset(buf, 'x', 10);\ + buf[10] = '\0';\ + format_in_hex = FALSE;\ + fprintf(stderr, "TESTING \"\\x");\ + for (i = 0; i < data_len; i++) {\ + fprintf(stderr, "%02hhX", data[i]);\ + }\ + fprintf(stderr, "\", %lu, %lu, %lu\n",\ + (ulint) data_len, (ulint) prtype,\ + (ulint) buf_size);\ + ret = row_raw_format_int(data, data_len, prtype,\ + buf, buf_size, &format_in_hex);\ + if (ret != ret_expected) {\ + fprintf(stderr, "expected ret %lu, got %lu\n",\ + (ulint) ret_expected, ret);\ + ok = FALSE;\ + }\ + if (strcmp((char*) buf, buf_expected) != 0) {\ + fprintf(stderr, 
"expected buf \"%s\", got \"%s\"\n",\ + buf_expected, buf);\ + ok = FALSE;\ + }\ + if (format_in_hex != format_in_hex_expected) {\ + fprintf(stderr, "expected format_in_hex %d, got %d\n",\ + (int) format_in_hex_expected,\ + (int) format_in_hex);\ + ok = FALSE;\ + }\ + if (ok) {\ + fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\ + (ulint) ret, buf, (int) format_in_hex);\ + } else {\ + return;\ + }\ + } while (0) + +#if 1 + /* min values for signed 1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, 0, + buf, sizeof(buf), 5, "-128", 0); + + CALL_AND_TEST("\x00\x00", 2, 0, + buf, sizeof(buf), 7, "-32768", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, 0, + buf, sizeof(buf), 9, "-8388608", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, 0, + buf, sizeof(buf), 12, "-2147483648", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0, + buf, sizeof(buf), 14, "-549755813888", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0, + buf, sizeof(buf), 17, "-140737488355328", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0, + buf, sizeof(buf), 19, "-36028797018963968", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0, + buf, sizeof(buf), 21, "-9223372036854775808", 0); + + /* min values for unsigned 1-8 byte integers */ + + CALL_AND_TEST("\x00", 1, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED, + buf, sizeof(buf), 2, "0", 0); + + /* max values for signed 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, 0, + buf, sizeof(buf), 4, "127", 0); + + CALL_AND_TEST("\xFF\xFF", 2, 0, + buf, sizeof(buf), 6, "32767", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, 0, + buf, sizeof(buf), 8, "8388607", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0, + buf, sizeof(buf), 11, "2147483647", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0, + buf, sizeof(buf), 13, "549755813887", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0, + buf, sizeof(buf), 16, "140737488355327", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0, + buf, sizeof(buf), 18, "36028797018963967", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0, + buf, sizeof(buf), 20, "9223372036854775807", 0); + + /* max values for unsigned 1-8 byte integers */ + + CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED, + buf, sizeof(buf), 4, "255", 0); + + CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "65535", 0); + + CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED, + buf, sizeof(buf), 9, "16777215", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED, + buf, sizeof(buf), 11, "4294967295", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED, + buf, sizeof(buf), 14, "1099511627775", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED, + buf, sizeof(buf), 16, "281474976710655", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED, + buf, sizeof(buf), 18, "72057594037927935", 0); + + CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED, + buf, sizeof(buf), 21, 
"18446744073709551615", 0); + + /* some random values */ + + CALL_AND_TEST("\x52", 1, 0, + buf, sizeof(buf), 4, "-46", 0); + + CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED, + buf, sizeof(buf), 3, "14", 0); + + CALL_AND_TEST("\x62\xCE", 2, 0, + buf, sizeof(buf), 6, "-7474", 0); + + CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED, + buf, sizeof(buf), 6, "10710", 0); + + CALL_AND_TEST("\x7F\xFF\x90", 3, 0, + buf, sizeof(buf), 5, "-112", 0); + + CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED, + buf, sizeof(buf), 6, "41238", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0, + buf, sizeof(buf), 3, "-9", 0); + + CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED, + buf, sizeof(buf), 3, "92", 0); + + CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0, + buf, sizeof(buf), 6, "-9117", 0); + + CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED, + buf, sizeof(buf), 6, "91234", 0); +#endif + + /* speed test */ + + speedo_t speedo; + ulint i; + + speedo_reset(&speedo); + + for (i = 0; i < 1000000; i++) { + row_raw_format_int("\x23", 1, + 0, buf, sizeof(buf), + &format_in_hex); + row_raw_format_int("\x23", 1, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + 0, buf, sizeof(buf), + &format_in_hex); + row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8, + DATA_UNSIGNED, buf, sizeof(buf), + &format_in_hex); + } + + speedo_show(&speedo); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/row/row0sel.c b/storage/xtradb/row/row0sel.c new file mode 100644 index 00000000000..ebd7bf4a2ce --- /dev/null +++ b/storage/xtradb/row/row0sel.c @@ -0,0 +1,4736 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************* +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" + +#ifdef UNIV_NONINL +#include "row0sel.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" +#include "read0read.h" +#include "buf0lru.h" +#include "ha_prototypes.h" + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/************************************************************************ +Returns TRUE if the user-defined column in a secondary index record +is alphabetically the same as the corresponding BLOB column in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! */ +static +ibool +row_sel_sec_rec_is_for_blob( +/*========================*/ + /* out: TRUE if the columns + are equal */ + ulint mtype, /* in: main type */ + ulint prtype, /* in: precise type */ + ulint mbminlen, /* in: minimum length of a + multi-byte character */ + ulint mbmaxlen, /* in: maximum length of a + multi-byte character */ + const byte* clust_field, /* in: the locally stored part of + the clustered index column, including + the BLOB pointer; the clustered + index record must be covered by + a lock or a page latch to protect it + against deletion (rollback or purge) */ + ulint clust_len, /* in: length of clust_field */ + const byte* sec_field, /* in: column in secondary index */ + ulint sec_len, /* in: length of sec_field */ + ulint zip_size) /* in: compressed page size, or 0 */ +{ + ulint len; + byte buf[DICT_MAX_INDEX_COL_LEN]; + + len = btr_copy_externally_stored_field_prefix(buf, sizeof buf, + zip_size, + clust_field, clust_len); + + if (UNIV_UNLIKELY(len == 0)) { + /* The BLOB was being deleted as the server crashed. + There should not be any secondary index records + referring to this clustered index record, because + btr_free_externally_stored_field() is called after all + secondary index entries of the row have been purged. 
*/ + return(FALSE); + } + + len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen, + sec_len, len, (const char*) buf); + + return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len)); +} + +/************************************************************************ +Returns TRUE if the user-defined column values in a secondary index record +are alphabetically the same as the corresponding columns in the clustered +index record. +NOTE: the comparison is NOT done as a binary comparison, but character +fields are compared with collation! */ +static +ibool +row_sel_sec_rec_is_for_clust_rec( +/*=============================*/ + /* out: TRUE if the secondary + record is equal to the corresponding + fields in the clustered record, + when compared with collation */ + const rec_t* sec_rec, /* in: secondary index record */ + dict_index_t* sec_index, /* in: secondary index */ + const rec_t* clust_rec, /* in: clustered index record; + must be protected by a lock or + a page latch against deletion + in rollback or purge */ + dict_index_t* clust_index) /* in: clustered index */ +{ + const byte* sec_field; + ulint sec_len; + const byte* clust_field; + ulint n; + ulint i; + mem_heap_t* heap = NULL; + ulint clust_offsets_[REC_OFFS_NORMAL_SIZE]; + ulint sec_offsets_[REC_OFFS_SMALL_SIZE]; + ulint* clust_offs = clust_offsets_; + ulint* sec_offs = sec_offsets_; + ibool is_equal = TRUE; + + rec_offs_init(clust_offsets_); + rec_offs_init(sec_offsets_); + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(clust_index->table))) { + + /* The clustered index record is delete-marked; + it is not visible in the read view. Besides, + if there are any externally stored columns, + some of them may have already been purged. */ + return(FALSE); + } + + clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, + ULINT_UNDEFINED, &heap); + sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs, + ULINT_UNDEFINED, &heap); + + n = dict_index_get_n_ordering_defined_by_user(sec_index); + + for (i = 0; i < n; i++) { + const dict_field_t* ifield; + const dict_col_t* col; + ulint clust_pos; + ulint clust_len; + ulint len; + + ifield = dict_index_get_nth_field(sec_index, i); + col = dict_field_get_col(ifield); + clust_pos = dict_col_get_clust_pos(col, clust_index); + + clust_field = rec_get_nth_field( + clust_rec, clust_offs, clust_pos, &clust_len); + sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len); + + len = clust_len; + + if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) { + + if (rec_offs_nth_extern(clust_offs, clust_pos)) { + len -= BTR_EXTERN_FIELD_REF_SIZE; + } + + len = dtype_get_at_most_n_mbchars( + col->prtype, col->mbminlen, col->mbmaxlen, + ifield->prefix_len, len, (char*) clust_field); + + if (rec_offs_nth_extern(clust_offs, clust_pos) + && len < sec_len) { + if (!row_sel_sec_rec_is_for_blob( + col->mtype, col->prtype, + col->mbminlen, col->mbmaxlen, + clust_field, clust_len, + sec_field, sec_len, + dict_table_zip_size( + clust_index->table))) { + goto inequal; + } + + continue; + } + } + + if (0 != cmp_data_data(col->mtype, col->prtype, + clust_field, len, + sec_field, sec_len)) { +inequal: + is_equal = FALSE; + goto func_exit; + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(is_equal); +} + +/************************************************************************* +Creates a select node struct. 
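+
+A minimal life-cycle sketch (hypothetical caller; the heap size is
+illustrative):
+
+	heap = mem_heap_create(256);
+	node = sel_node_create(heap);
+	(... build plans, attach the node to a query graph, run it ...)
+	sel_node_free_private(node);
+	mem_heap_free(heap);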
*/ +UNIV_INTERN +sel_node_t* +sel_node_create( +/*============*/ + /* out, own: select node struct */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + sel_node_t* node; + + node = mem_heap_alloc(heap, sizeof(sel_node_t)); + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->plans = NULL; + + return(node); +} + +/************************************************************************* +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +UNIV_INTERN +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /* in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/************************************************************************* +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/************************************************************************* +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... */ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /* in: first variable in a list of variables */ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + exp = node->select_list; + + while (var) { + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + var = que_node_get_next(var); + } +} + +/************************************************************************* +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /* in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + func_node = node->select_list; + + while (func_node) { + eval_node_set_int_val(func_node, 0); + + func_node = que_node_get_next(func_node); + } + + node->aggregate_already_fetched = FALSE; +} + +/************************************************************************* +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /* in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/************************************************************************* +Fetches the column values from a record. 
*/
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+	dict_index_t*	index,	/* in: record index */
+	const rec_t*	rec,	/* in: record in a clustered or non-clustered
+				index; must be protected by a page latch */
+	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
+	sym_node_t*	column)	/* in: first column in a column list, or
+				NULL */
+{
+	dfield_t*	val;
+	ulint		index_type;
+	ulint		field_no;
+	const byte*	data;
+	ulint		len;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (dict_index_is_clust(index)) {
+		index_type = SYM_CLUST_FIELD_NO;
+	} else {
+		index_type = SYM_SEC_FIELD_NO;
+	}
+
+	while (column) {
+		mem_heap_t*	heap = NULL;
+		ibool		needs_copy;
+
+		field_no = column->field_nos[index_type];
+
+		if (field_no != ULINT_UNDEFINED) {
+
+			if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
+							      field_no))) {
+
+				/* Copy an externally stored field to the
+				temporary heap */
+
+				heap = mem_heap_create(1);
+
+				data = btr_rec_copy_externally_stored_field(
+					rec, offsets,
+					dict_table_zip_size(index->table),
+					field_no, &len, heap);
+
+				ut_a(len != UNIV_SQL_NULL);
+
+				needs_copy = TRUE;
+			} else {
+				data = rec_get_nth_field(rec, offsets,
+							 field_no, &len);
+
+				needs_copy = column->copy_val;
+			}
+
+			if (needs_copy) {
+				eval_node_copy_and_alloc_val(column, data,
+							     len);
+			} else {
+				val = que_node_get_val(column);
+				dfield_set_data(val, data, len);
+			}
+
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+		}
+
+		column = UT_LIST_GET_NEXT(col_var_list, column);
+	}
+}
+
+/*************************************************************************
+Allocates a prefetch buffer for a column when prefetching is done for the
+first time. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+	sym_node_t*	column)	/* in: symbol table node for a column */
+{
+	sel_buf_t*	sel_buf;
+	ulint		i;
+
+	ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+	column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
+					 * sizeof(sel_buf_t));
+	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+		sel_buf = column->prefetch_buf + i;
+
+		sel_buf->data = NULL;
+
+		sel_buf->val_buf_size = 0;
+	}
+}
+
+/*************************************************************************
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+	sel_buf_t*	prefetch_buf)	/* in, own: prefetch buffer */
+{
+	sel_buf_t*	sel_buf;
+	ulint		i;
+
+	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+		sel_buf = prefetch_buf + i;
+
+		if (sel_buf->val_buf_size > 0) {
+
+			mem_free(sel_buf->data);
+		}
+	}
+}
+
+/*************************************************************************
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them to the val fields in the column nodes.
*/ +static +void +sel_pop_prefetched_row( +/*===================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint val_buf_size; + + ut_ad(plan->n_rows_prefetched > 0); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + val = que_node_get_val(column); + + if (!column->copy_val) { + /* We did not really push any value for the + column */ + + ut_ad(!column->prefetch_buf); + ut_ad(que_node_get_val_buf_size(column) == 0); + ut_d(dfield_set_null(val)); + + goto next_col; + } + + ut_ad(column->prefetch_buf); + ut_ad(!dfield_is_ext(val)); + + sel_buf = column->prefetch_buf + plan->first_prefetched; + + data = sel_buf->data; + len = sel_buf->len; + val_buf_size = sel_buf->val_buf_size; + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + sel_buf->data = dfield_get_data(val); + sel_buf->len = dfield_get_len(val); + sel_buf->val_buf_size = que_node_get_val_buf_size(column); + + dfield_set_data(val, data, len); + que_node_set_val_buf_size(column, val_buf_size); +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } + + plan->n_rows_prefetched--; + + plan->first_prefetched++; +} + +/************************************************************************* +Pushes the column values for a prefetched, cached row to the column prefetch +buffers from the val fields in the column nodes. */ +UNIV_INLINE +void +sel_push_prefetched_row( +/*====================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint pos; + ulint val_buf_size; + + if (plan->n_rows_prefetched == 0) { + pos = 0; + plan->first_prefetched = 0; + } else { + pos = plan->n_rows_prefetched; + + /* We have the convention that pushing new rows starts only + after the prefetch stack has been emptied: */ + + ut_ad(plan->first_prefetched == 0); + } + + plan->n_rows_prefetched++; + + ut_ad(pos < SEL_MAX_N_PREFETCH); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + if (!column->copy_val) { + /* There is no sense to push pointers to database + page fields when we do not keep latch on the page! 
*/ + + goto next_col; + } + + if (!column->prefetch_buf) { + /* Allocate a new prefetch buffer */ + + sel_col_prefetch_buf_alloc(column); + } + + sel_buf = column->prefetch_buf + pos; + + val = que_node_get_val(column); + + data = dfield_get_data(val); + len = dfield_get_len(val); + val_buf_size = que_node_get_val_buf_size(column); + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + dfield_set_data(val, sel_buf->data, sel_buf->len); + que_node_set_val_buf_size(column, sel_buf->val_buf_size); + + sel_buf->data = data; + sel_buf->len = len; + sel_buf->val_buf_size = val_buf_size; +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers( +/*====================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + dict_index_t* index, /* in: plan node for table */ + rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, plan->index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ + mem_heap_t** old_vers_heap, /* out: old version heap to use */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (*old_vers_heap) { + mem_heap_empty(*old_vers_heap); + } else { + *old_vers_heap = mem_heap_create(512); + } + + err = row_vers_build_for_consistent_read( + rec, mtr, index, offsets, read_view, offset_heap, + *old_vers_heap, old_vers); + return(err); +} + +/************************************************************************* +Builds the last committed version of a clustered index record for a +semi-consistent read. */ +static +ulint +row_sel_build_committed_vers_for_mysql( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + dict_index_t* clust_index, /* in: clustered index */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + const rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ + const rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_semi_consistent_read( + rec, mtr, clust_index, offsets, offset_heap, + prebuilt->old_vers_heap, old_vers); + return(err); +} + +/************************************************************************* +Tests the conditions which determine when the index segment we are searching +through has been exhausted. 
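+
+For example (illustrative): a scan generated from a condition such as
+"a <= 10" on an index whose first column is a places that comparison
+in plan->end_conds; the first row for which eval_cmp() returns FALSE
+ends the scan of this index segment.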
*/ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(cond->args); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Tests the other conditions. */ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. */ +static +ulint +row_sel_get_clust_rec( +/*==================*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select_node */ + plan_t* plan, /* in: plan node for table */ + rec_t* rec, /* in: record in a non-clustered index */ + que_thr_t* thr, /* in: query thread */ + rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + *out_rec = NULL; + + offsets = rec_get_offsets(rec, + btr_pcur_get_btr_cur(&plan->pcur)->index, + offsets, ULINT_UNDEFINED, &heap); + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets); + + index = dict_table_get_first_index(plan->table); + + btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE, + BTR_SEARCH_LEAF, &plan->clust_pcur, + 0, mtr); + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(&(plan->clust_pcur)) + < dict_index_get_n_unique(index)) { + + ut_a(rec_get_deleted_flag(rec, + dict_table_is_comp(plan->table))); + ut_a(node->read_view); + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. 
*/ + + goto func_exit; + } + + offsets = rec_get_offsets(clust_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation level + we lock only the record, i.e., next-key locking is + not used. */ + ulint lock_type; + trx_t* trx; + + trx = thr_get_trx(thr); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) { + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(&plan->clust_pcur), + clust_rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers( + node->read_view, index, clust_rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + goto err_exit; + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + goto func_exit; + } + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. */ + + if ((old_vers + || rec_get_deleted_flag(rec, dict_table_is_comp( + plan->table))) + && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index, + clust_rec, index)) { + goto func_exit; + } + } + + /* Fetch the columns needed in test conditions. The clustered + index record is protected by a page latch that was acquired + when plan->clust_pcur was positioned. The latch will not be + released until mtr_commit(mtr). */ + + row_sel_fetch_columns(index, clust_rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + *out_rec = clust_rec; +func_exit: + err = DB_SUCCESS; +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/************************************************************************* +Sets a lock on a record. 
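+
+A sketch of the call pattern used in this file (the lock_type choice
+mirrors the isolation-level logic of the callers below):
+
+	if (srv_locks_unsafe_for_binlog
+	    || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
+		lock_type = LOCK_REC_NOT_GAP;	(record-only lock)
+	} else {
+		lock_type = LOCK_ORDINARY;	(next-key lock)
+	}
+	err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+			       rec, index, offsets,
+			       node->row_lock_mode, lock_type, thr);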
*/
+UNIV_INLINE
+ulint
+sel_set_rec_lock(
+/*=============*/
+					/* out: DB_SUCCESS or error code */
+	const buf_block_t*	block,	/* in: buffer block of rec */
+	const rec_t*		rec,	/* in: record */
+	dict_index_t*		index,	/* in: index */
+	const ulint*		offsets,/* in: rec_get_offsets(rec, index) */
+	ulint			mode,	/* in: lock mode */
+	ulint			type,	/* in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr)	/* in: query thread */
+{
+	trx_t*	trx;
+	ulint	err;
+
+	trx = thr_get_trx(thr);
+
+	if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
+		if (buf_LRU_buf_pool_running_out()) {
+
+			return(DB_LOCK_TABLE_FULL);
+		}
+	}
+
+	if (dict_index_is_clust(index)) {
+		err = lock_clust_rec_read_check_and_lock(
+			0, block, rec, index, offsets, mode, type, thr);
+	} else {
+		err = lock_sec_rec_read_check_and_lock(
+			0, block, rec, index, offsets, mode, type, thr);
+	}
+
+	return(err);
+}
+
+/*************************************************************************
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+	plan_t*	plan,	/* in: table plan */
+	ibool	search_latch_locked,
+			/* in: TRUE if the thread currently
+			has the search latch locked in
+			s-mode */
+	mtr_t*	mtr)	/* in: mtr */
+{
+	dict_index_t*	index;
+	func_node_t*	cond;
+	que_node_t*	exp;
+	ulint		n_fields;
+	ulint		has_search_latch = 0;	/* RW_S_LATCH or 0 */
+	ulint		i;
+
+	if (search_latch_locked) {
+		has_search_latch = RW_S_LATCH;
+	}
+
+	index = plan->index;
+
+	/* Calculate the value of the search tuple: the exact match columns
+	get their expressions evaluated when we evaluate the right sides of
+	end_conds */
+
+	cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+	while (cond) {
+		eval_exp(que_node_get_next(cond->args));
+
+		cond = UT_LIST_GET_NEXT(cond_list, cond);
+	}
+
+	if (plan->tuple) {
+		n_fields = dtuple_get_n_fields(plan->tuple);
+
+		if (plan->n_exact_match < n_fields) {
+			/* There is a non-exact match field which must be
+			evaluated separately */
+
+			eval_exp(plan->tuple_exps[n_fields - 1]);
+		}
+
+		for (i = 0; i < n_fields; i++) {
+			exp = plan->tuple_exps[i];
+
+			dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+					 que_node_get_val(exp));
+		}
+
+		/* Open pcur to the index */
+
+		btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+					   BTR_SEARCH_LEAF, &plan->pcur,
+					   has_search_latch, mtr);
+	} else {
+		/* Open the cursor to the start or the end of the index
+		(FALSE: no init) */
+
+		btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
+					    &(plan->pcur), FALSE, mtr);
+	}
+
+	ut_ad(plan->n_rows_prefetched == 0);
+	ut_ad(plan->n_rows_fetched == 0);
+	ut_ad(plan->cursor_at_end == FALSE);
+
+	plan->pcur_is_open = TRUE;
+}
+
+/*************************************************************************
+Restores a stored pcur position to a table index.
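+
+A summary of the case analysis below ("advance" means the caller must
+move onwards: to the next record for an ascending cursor, and to the
+previous record for a descending one):
+
+	rel_pos	equal_position	ascending	descending
+	BEFORE	-		(not allowed)	no move
+	AFTER	-		no move		advance
+	ON	FALSE		advance		no move
+	ON	TRUE		advance only if the stored cursor
+				record was already processed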
*/ +static +ibool +row_sel_restore_pcur_pos( +/*=====================*/ + /* out: TRUE if the cursor should be moved to + the next record after we return from this + function (moved to the previous, in the case + of a descending cursor) without processing + again the current cursor record */ + plan_t* plan, /* in: table plan */ + mtr_t* mtr) /* in: mtr */ +{ + ibool equal_position; + ulint relative_position; + + ut_ad(!plan->cursor_at_end); + + relative_position = btr_pcur_get_rel_pos(&(plan->pcur)); + + equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF, + &(plan->pcur), mtr); + + /* If the cursor is traveling upwards, and relative_position is + + (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock + yet on the successor of the page infimum; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + not yet processed the cursor record: no need to move the cursor to the + next record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we must move to the next record; + (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the next + record, else there is no need to move the cursor. */ + + if (plan->asc) { + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(FALSE); + } + + /* If the cursor is traveling downwards, and relative_position is + + (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on + the last record LESS than the successor of a page infimum; we have not + processed the cursor record: no need to move the cursor; + (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the + first record GREATER than the predecessor of a page supremum; we have + processed the cursor record: we should move the cursor to the previous + record; + (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the + last record LESS or EQUAL to the old stored user record; (a) if + equal_position is FALSE, this means that the cursor is now on a record + less than the old user record, and we need not move to the previous + record; (b) if equal_position is TRUE, then if + plan->stored_cursor_rec_processed is TRUE, we must move to the previous + record, else there is no need to move the cursor. */ + + if (relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + + return(FALSE); + } + + if (relative_position == BTR_PCUR_ON) { + + if (equal_position) { + + return(plan->stored_cursor_rec_processed); + } + + return(FALSE); + } + + ut_ad(relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE); + + return(TRUE); +} + +/************************************************************************* +Resets a plan cursor to a closed state. 
*/ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /* in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +/************************************************************************* +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ + sel_node_t* node, /* in: select node for a consistent read */ + plan_t* plan, /* in: plan for a unique search in clustered + index */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + ulint ret; + rec_offs_init(offsets_); + + index = plan->index; + + ut_ad(node->read_view); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + row_sel_open_pcur(plan, TRUE, mtr); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (dict_index_is_clust(index)) { + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + ret = SEL_RETRY; + goto func_exit; + } + } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) { + + ret = SEL_RETRY; + goto func_exit; + } + + /* Test the deleted flag. */ + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + /* Fetch the columns needed in test conditions. The index + record is protected by a page latch that was acquired when + plan->pcur was positioned. The latch will not be released + until mtr_commit(mtr). */ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + ret = SEL_EXHAUSTED; + goto func_exit; + } + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + plan->n_rows_fetched++; + ret = SEL_FOUND; +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(ret); +} + +/************************************************************************* +Performs a select step. 
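+
+An outline of the control flow below, using the labels and phase
+numbers from the function body:
+
+	table_loop:	pick the plan for node->fetch_table; serve
+			prefetched rows if any; open the plan cursor
+			or restore its stored position
+	rec_loop:	PHASE 1: set a record lock if specified
+			PHASE 2: check a unique-search match
+			PHASE 3: in a consistent read, fetch the
+				 previous version of the record if
+				 needed
+			PHASE 4: test search end conditions and the
+				 delete mark
+			(...)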
*/ +static +ulint +row_sel( +/*====*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + ibool search_latch_locked; + ibool consistent_read; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + ulint found_flag; + ulint err; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(thr->run_node == node); + + search_latch_locked = FALSE; + + if (node->read_view) { + /* In consistent reads, we try to do with the hash index and + not to use the buffer page get. This is to reduce memory bus + load resulting from semaphore operations. The search latch + will be s-locked when we access an index with a unique search + condition, but not locked when we access an index with a + less selective search condition. */ + + consistent_read = TRUE; + } else { + consistent_read = FALSE; + } + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr_start(&mtr); + + if (consistent_read && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust + && !plan->table->big_rows) { + if (!search_latch_locked) { + rw_lock_s_lock(&btr_search_latch); + + search_latch_locked = TRUE; + } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often + kept for some 10 searches before being released, + a waiting x-latch request would block other threads + from acquiring an s-latch for a long time, lowering + performance significantly in multiprocessors. 
*/ + + rw_lock_s_unlock(&btr_search_latch); + rw_lock_s_lock(&btr_search_latch); + } + + found_flag = row_sel_try_search_shortcut(node, plan, &mtr); + + if (found_flag == SEL_FOUND) { + + goto next_table; + + } else if (found_flag == SEL_EXHAUSTED) { + + goto table_exhausted; + } + + ut_ad(found_flag == SEL_RETRY); + + plan_reset_cursor(plan); + + mtr_commit(&mtr); + mtr_start(&mtr); + } + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + + search_latch_locked = FALSE; + } + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + + row_sel_open_pcur(plan, search_latch_locked, &mtr); + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. */ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && !page_rec_is_supremum(rec)) { + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. */ + + if (!consistent_read) { + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation + level, we lock only the record, i.e., next-key + locking is not used. */ + + rec_t* next_rec = page_rec_get_next(rec); + ulint lock_type; + trx_t* trx; + + trx = thr_get_trx(thr); + + offsets = rec_get_offsets(next_rec, index, offsets, + ULINT_UNDEFINED, &heap); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level + == TRX_ISO_READ_COMMITTED) { + + if (page_rec_is_supremum(next_rec)) { + + goto skip_lock; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur), + next_rec, index, offsets, + node->row_lock_mode, + lock_type, thr); + + if (err != DB_SUCCESS) { + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + + goto lock_wait_or_error; + } + } + } + +skip_lock: + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. */ + + cost_counter++; + + goto next_rec; + } + + if (!consistent_read) { + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using READ COMMITTED isolation level, + we lock only the record, i.e., next-key locking is + not used. 
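Concretely: if an index leaf holds user records with keys 5 and 10, a LOCK_ORDINARY (next-key) lock on the record with key 10 covers both the record and the gap (5,10), so phantom inserts into that gap block, while LOCK_REC_NOT_GAP covers the record alone. The choice the surrounding code makes condenses to the following restatement (not new logic):

    lock_type = (srv_locks_unsafe_for_binlog
                 || trx->isolation_level == TRX_ISO_READ_COMMITTED)
            ? LOCK_REC_NOT_GAP   /* lock the row only */
            : LOCK_ORDINARY;     /* next-key: row plus the gap before it */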
*/ + + ulint lock_type; + trx_t* trx; + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + + trx = thr_get_trx(thr); + + if (srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) { + + if (page_rec_is_supremum(rec)) { + + goto next_rec; + } + + lock_type = LOCK_REC_NOT_GAP; + } else { + lock_type = LOCK_ORDINARY; + } + + err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur), + rec, index, offsets, + node->row_lock_mode, lock_type, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + if (page_rec_is_supremum(rec)) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + cons_read_requires_clust_rec = FALSE; + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (consistent_read) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (dict_index_is_clust(index)) { + + if (!lock_clust_rec_cons_read_sees(rec, index, offsets, + node->read_view)) { + + err = row_sel_build_prev_vers( + node->read_view, index, rec, + &offsets, &heap, &plan->old_vers_heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &heap); + + /* Fetch the columns needed in + test conditions. The clustered + index record is protected by a + page latch that was acquired + by row_sel_open_pcur() or + row_sel_restore_pcur_pos(). + The latch will not be released + until mtr_commit(mtr). */ + + row_sel_fetch_columns( + index, rec, offsets, + UT_LIST_GET_FIRST( + plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, + node->read_view)) { + cons_read_requires_clust_rec = TRUE; + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions. The record is + protected by a page latch that was acquired by + row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch + will not be released until mtr_commit(mtr). 
*/ + + row_sel_fetch_columns(index, rec, offsets, + UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table)) + && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(consistent_read); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec, + dict_table_is_comp(plan->table))) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF); + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch + || plan->table->big_rows) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_push_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_pop_prefetched_row(plan); + + goto next_table; + } + +next_rec: + ut_ad(!search_latch_locked); + + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. 
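The commit_mtr_for_a_while escape used here always follows one fixed sequence: persist the cursor position, commit the mini-transaction (which releases every page latch it holds), and later re-latch by restoring the position. Condensed from the code at that label and from the restore path earlier in this function:

    plan->stored_cursor_rec_processed = TRUE;   /* the current rec is done */
    btr_pcur_store_position(&(plan->pcur), &mtr);
    mtr_commit(&mtr);                           /* drops all latches of &mtr */
    mtr_has_extra_clust_latch = FALSE;

    /* ...then, back in the table loop: */
    mtr_start(&mtr);
    must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);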
*/ + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&plan->pcur)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr_commit(&mtr); + + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + err = DB_SUCCESS; + goto func_exit; + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + +table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr_commit(&mtr); + + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + err = DB_SUCCESS; + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + } else { + node->state = SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + } + + goto func_exit; + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. */ + + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_gen(TRUE)); +#endif /* UNIV_SYNC_DEBUG */ + err = DB_SUCCESS; + goto func_exit; + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. 
*/ + + plan->stored_cursor_rec_processed = TRUE; + + ut_ad(!search_latch_locked); + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + mtr_has_extra_clust_latch = FALSE; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_gen(TRUE)); +#endif /* UNIV_SYNC_DEBUG */ + + goto table_loop; + +lock_wait_or_error: + /* See the note at stop_for_a_while: the same holds for this case */ + + ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc); + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + +#ifdef UNIV_SYNC_DEBUG + ut_ad(sync_thread_levels_empty_gen(TRUE)); +#endif /* UNIV_SYNC_DEBUG */ + +func_exit: + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); +} + +/************************************************************************** +Performs a select step. This is a high-level function used in SQL execution +graphs. */ +UNIV_INTERN +que_thr_t* +row_sel_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ulint i_lock_mode; + sym_node_t* table_node; + sel_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_SELECT); + + /* If this is a new time this node is executed (or when execution + resumes after wait for a table intention lock), set intention locks + on the tables, or assign a read view */ + + if (node->into_list && (thr->prev_node == que_node_get_parent(node))) { + + node->state = SEL_NODE_OPEN; + } + + if (node->state == SEL_NODE_OPEN) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(thr_get_trx(thr)); + + plan_reset_cursor(sel_node_get_nth_plan(node, 0)); + + if (node->consistent_read) { + /* Assign a read view for the query */ + node->read_view = trx_assign_read_view( + thr_get_trx(thr)); + } else { + if (node->set_x_locks) { + i_lock_mode = LOCK_IX; + } else { + i_lock_mode = LOCK_IS; + } + + table_node = node->table_list; + + while (table_node) { + err = lock_table(0, table_node->table, + i_lock_mode, thr); + if (err != DB_SUCCESS) { + thr_get_trx(thr)->error_state = err; + + return(NULL); + } + + table_node = que_node_get_next(table_node); + } + } + + /* If this is an explicit cursor, copy stored procedure + variable values, so that the values cannot change between + fetches (currently, we copy them also for non-explicit + cursors) */ + + if (node->explicit_cursor + && UT_LIST_GET_FIRST(node->copy_variables)) { + + row_sel_copy_input_variable_vals(node); + } + + node->state = SEL_NODE_FETCH; + node->fetch_table = 0; + + if (node->is_aggregate) { + /* Reset the aggregate total values */ + sel_reset_aggregate_vals(node); + } + } + + err = row_sel(node, thr); + + /* NOTE! if queries are parallelized, the following assignment may + have problems; the assignment should be made only if thr is the + only top-level thr in the graph: */ + + thr->graph->last_sel_node = node; + + if (err != DB_SUCCESS) { + thr_get_trx(thr)->error_state = err; + + return(NULL); + } + + return(thr); +} + +/************************************************************************** +Performs a fetch for a cursor. 
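fetch_step() below either assigns the selected row into an INTO list or hands it to a user-supplied callback, and a callback that returns NULL stops the fetch loop; that is how row_fetch_store_uint4() further down limits itself to a single row. A hypothetical callback under those conventions (row_fetch_count is illustrative and not part of this patch):

    static void*
    row_fetch_count(void* row, void* user_arg)
    {
        ulint*  n_rows = user_arg;

        UT_NOT_USED(row);       /* the sel_node_t* is not needed here */

        (*n_rows)++;

        return(user_arg);       /* non-NULL: keep fetching rows */
    }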
*/ +UNIV_INTERN +que_thr_t* +fetch_step( +/*=======*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + sel_node_t* sel_node; + fetch_node_t* node; + + ut_ad(thr); + + node = thr->run_node; + sel_node = node->cursor_def; + + ut_ad(que_node_get_type(node) == QUE_NODE_FETCH); + + if (thr->prev_node != que_node_get_parent(node)) { + + if (sel_node->state != SEL_NODE_NO_MORE_ROWS) { + + if (node->into_list) { + sel_assign_into_var_values(node->into_list, + sel_node); + } else { + void* ret = (*node->func->func)( + sel_node, node->func->arg); + + if (!ret) { + sel_node->state + = SEL_NODE_NO_MORE_ROWS; + } + } + } + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + /* Make the fetch node the parent of the cursor definition for + the time of the fetch, so that execution knows to return to this + fetch node after a row has been selected or we know that there is + no row left */ + + sel_node->common.parent = node; + + if (sel_node->state == SEL_NODE_CLOSED) { + fprintf(stderr, + "InnoDB: Error: fetch called on a closed cursor\n"); + + thr_get_trx(thr)->error_state = DB_ERROR; + + return(NULL); + } + + thr->run_node = sel_node; + + return(thr); +} + +/******************************************************************** +Sample callback function for fetch that prints each row.*/ +UNIV_INTERN +void* +row_fetch_print( +/*============*/ + /* out: always returns non-NULL */ + void* row, /* in: sel_node_t* */ + void* user_arg) /* in: not used */ +{ + sel_node_t* node = row; + que_node_t* exp; + ulint i = 0; + + UT_NOT_USED(user_arg); + + fprintf(stderr, "row_fetch_print: row %p\n", row); + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + const dtype_t* type = dfield_get_type(dfield); + + fprintf(stderr, " column %lu:\n", (ulong)i); + + dtype_print(type); + putc('\n', stderr); + + if (dfield_get_len(dfield) != UNIV_SQL_NULL) { + ut_print_buf(stderr, dfield_get_data(dfield), + dfield_get_len(dfield)); + putc('\n', stderr); + } else { + fputs(" ;\n", stderr); + } + + exp = que_node_get_next(exp); + i++; + } + + return((void*)42); +} + +/******************************************************************** +Callback function for fetch that stores an unsigned 4 byte integer to the +location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length += 4. */ +UNIV_INTERN +void* +row_fetch_store_uint4( +/*==================*/ + /* out: always returns NULL */ + void* row, /* in: sel_node_t* */ + void* user_arg) /* in: data pointer */ +{ + sel_node_t* node = row; + ib_uint32_t* val = user_arg; + ulint tmp; + + dfield_t* dfield = que_node_get_val(node->select_list); + const dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(dtype_get_prtype(type) & DATA_UNSIGNED); + ut_a(len == 4); + + tmp = mach_read_from_4(dfield_get_data(dfield)); + *val = (ib_uint32_t) tmp; + + return(NULL); +} + +/*************************************************************** +Prints a row in a select result. 
*/ +UNIV_INTERN +que_thr_t* +row_printf_step( +/*============*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = thr->run_node; + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + fputs(" ::: ", stderr); + + arg = que_node_get_next(arg); + } + + putc('\n', stderr); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/******************************************************************** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. But currently we do not allow search keys where the +last field is only a prefix of the full key field len and print a warning if +such appears. A counterpart of this function is +ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ +UNIV_INTERN +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /* in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /* in: buffer to use in field + conversions */ + ulint buf_len, /* in: buffer length */ + dict_index_t* index, /* in: index of the key value */ + const byte* key_ptr, /* in: MySQL key value */ + ulint key_len, /* in: MySQL key value length */ + trx_t* trx) /* in: transaction */ +{ + byte* original_buf = buf; + const byte* original_key_ptr = key_ptr; + dict_field_t* field; + dfield_t* dfield; + ulint data_offset; + ulint data_len; + ulint data_field_len; + ibool is_null; + const byte* key_end; + ulint n_fields = 0; + + /* For documentation of the key value storage format in MySQL, see + ha_innobase::store_key_val_for_row() in ha_innodb.cc. */ + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + field = dict_index_get_nth_field(index, 0); + + if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) { + /* A special case: we are looking for a position in the + generated clustered index which InnoDB automatically added + to a table with no primary key: the first and the only + ordering column is ROW_ID which InnoDB stored to the key_ptr + buffer. 
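For the ordinary key parts handled in the loop below, the MySQL key buffer layout is: an optional SQL NULL marker byte, then, for BLOB/TEXT prefixes and true VARCHARs, a 2-byte little-endian length, then the value padded to its full length. A worked decode under those rules for a nullable prefix column whose buffer starts with the bytes 00 03 00 61 62 63:

    is_null  = (key_ptr[0] != 0);               /* 0x00: a value is present */
    data_len = key_ptr[1] + 256 * key_ptr[2];   /* 3 + 256 * 0 = 3 bytes    */
    /* the data "abc" starts at key_ptr + 3, padded out to prefix_len */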
*/ + + ut_a(key_len == DATA_ROW_ID_LEN); + + dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN); + + dtuple_set_n_fields(tuple, 1); + + return; + } + + while (key_ptr < key_end) { + + ulint type = dfield_get_type(dfield)->mtype; + ut_a(field->col->mtype == type); + + data_offset = 0; + is_null = FALSE; + + if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { + /* The first byte in the field tells if this is + an SQL NULL value */ + + data_offset = 1; + + if (*key_ptr != 0) { + dfield_set_null(dfield); + + is_null = TRUE; + } + } + + /* Calculate data length and data field total length */ + + if (type == DATA_BLOB) { + /* The key field is a column prefix of a BLOB or + TEXT */ + + ut_a(field->prefix_len > 0); + + /* MySQL stores the actual data length to the first 2 + bytes after the optional SQL NULL marker byte. The + storage format is little-endian, that is, the most + significant byte at a higher address. In UTF-8, MySQL + seems to reserve field->prefix_len bytes for + storing this field in the key value buffer, even + though the actual value only takes data_len bytes + from the start. */ + + data_len = key_ptr[data_offset] + + 256 * key_ptr[data_offset + 1]; + data_field_len = data_offset + 2 + field->prefix_len; + + data_offset += 2; + + /* Now that we know the length, we store the column + value like it would be a fixed char field */ + + } else if (field->prefix_len > 0) { + /* Looks like MySQL pads unused end bytes in the + prefix with space. Therefore, also in UTF-8, it is ok + to compare with a prefix containing full prefix_len + bytes, and no need to take at most prefix_len / 3 + UTF-8 characters from the start. + If the prefix is used as the upper end of a LIKE + 'abc%' query, then MySQL pads the end with chars + 0xff. TODO: in that case does it any harm to compare + with the full prefix_len bytes. How do characters + 0xff in UTF-8 behave? */ + + data_len = field->prefix_len; + data_field_len = data_offset + data_len; + } else { + data_len = dfield_get_type(dfield)->len; + data_field_len = data_offset + data_len; + } + + if (UNIV_UNLIKELY + (dtype_get_mysql_type(dfield_get_type(dfield)) + == DATA_MYSQL_TRUE_VARCHAR) + && UNIV_LIKELY(type != DATA_INT)) { + /* In a MySQL key value format, a true VARCHAR is + always preceded by 2 bytes of a length field. + dfield_get_type(dfield)->len returns the maximum + 'payload' len in bytes. That does not include the + 2 bytes that tell the actual data length. + + We added the check != DATA_INT to make sure we do + not treat MySQL ENUM or SET as a true VARCHAR! */ + + data_len += 2; + data_field_len += 2; + } + + /* Storing may use at most data_len bytes of buf */ + + if (UNIV_LIKELY(!is_null)) { + row_mysql_store_col_in_innobase_format( + dfield, buf, + FALSE, /* MySQL key value format col */ + key_ptr + data_offset, data_len, + dict_table_is_comp(index->table)); + buf += data_len; + } + + key_ptr += data_field_len; + + if (UNIV_UNLIKELY(key_ptr > key_end)) { + /* The last field in key was not a complete key field + but a prefix of it. + + Print a warning about this! HA_READ_PREFIX_LAST does + not currently work in InnoDB with partial-field key + value prefixes. Since MySQL currently uses a padding + trick to calculate LIKE 'abc%' type queries there + should never be partial-field prefixes in searches. */ + + ut_print_timestamp(stderr); + + fputs(" InnoDB: Warning: using a partial-field" + " key prefix in search.\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, ". 
Last data field length %lu bytes,\n" + "InnoDB: key ptr now exceeds" + " key end by %lu bytes.\n" + "InnoDB: Key value in the MySQL format:\n", + (ulong) data_field_len, + (ulong) (key_ptr - key_end)); + fflush(stderr); + ut_print_buf(stderr, original_key_ptr, key_len); + putc('\n', stderr); + + if (!is_null) { + ulint len = dfield_get_len(dfield); + dfield_set_len(dfield, len + - (ulint) (key_ptr - key_end)); + } + } + + n_fields++; + field++; + dfield++; + } + + ut_a(buf <= original_buf + buf_len); + + /* We set the length of tuple to n_fields: we assume that the memory + area allocated for it is big enough (usually bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/****************************************************************** +Stores the row id to the prebuilt struct. */ +static +void +row_sel_store_row_id_to_prebuilt( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in/out: prebuilt */ + const rec_t* index_rec, /* in: record */ + const dict_index_t* index, /* in: index of the record */ + const ulint* offsets) /* in: rec_get_offsets + (index_rec, index) */ +{ + const byte* data; + ulint len; + + ut_ad(rec_offs_validate(index_rec, index, offsets)); + + data = rec_get_nth_field( + index_rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); + + if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) { + fprintf(stderr, + "InnoDB: Error: Row id field is" + " wrong length %lu in ", (ulong) len); + dict_index_name_print(stderr, prebuilt->trx, index); + fprintf(stderr, "\n" + "InnoDB: Field number %lu, record:\n", + (ulong) dict_index_get_sys_col_pos(index, + DATA_ROW_ID)); + rec_print_new(stderr, index_rec, offsets); + putc('\n', stderr); + ut_error; + } + + ut_memcpy(prebuilt->row_id, data, len); +} + +/****************************************************************** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */ +static +void +row_sel_field_store_in_mysql_format( +/*================================*/ + byte* dest, /* in/out: buffer where to store; NOTE + that BLOBs are not in themselves + stored here: the caller must allocate + and copy the BLOB into buffer before, + and pass the pointer to the BLOB in + 'data' */ + const mysql_row_templ_t* templ, + /* in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, + mbminlen, mbmaxlen */ + const byte* data, /* in: data to store */ + ulint len) /* in: length of the data */ +{ + byte* ptr; + byte* field_end; + byte* pad_ptr; + + ut_ad(len != UNIV_SQL_NULL); + + switch (templ->type) { + case DATA_INT: + /* Convert integer data from Innobase to a little-endian + format, sign bit restored to normal */ + + ptr = dest + len; + + for (;;) { + ptr--; + *ptr = *data; + if (ptr == dest) { + break; + } + data++; + } + + if (!templ->is_unsigned) { + dest[len - 1] = (byte) (dest[len - 1] ^ 128); + } + + ut_ad(templ->mysql_col_len == len); + break; + + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field_end = dest + templ->mysql_col_len; + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, templ->mysql_length_bytes); + } + + /* Copy the actual data */ + ut_memcpy(dest, data, len); + + /* Pad with trailing spaces. 
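It may help to restate the integer branch above before the string cases continue: InnoDB stores integers big-endian with the sign bit inverted, so that memcmp() order equals numeric order, and the branch undoes both transformations. A standalone sketch in plain C with a worked example: the signed 4-byte value 1 is stored as 80 00 00 01; byte reversal gives 01 00 00 80, and flipping the sign bit of the last byte yields 01 00 00 00, little-endian 1.

    #include <stddef.h>
    #include <stdint.h>

    /* InnoDB on-page integer -> MySQL little-endian integer. */
    static void
    innodb_int_to_mysql(uint8_t* dest, const uint8_t* data,
                        size_t len, int is_unsigned)
    {
        size_t i;

        for (i = 0; i < len; i++) {
            dest[len - 1 - i] = data[i];   /* reverse the byte order */
        }

        if (!is_unsigned) {
            dest[len - 1] ^= 128;          /* restore the sign bit */
        }
    }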
We pad with spaces also the + unused end of a >= 5.0.3 true VARCHAR column, just in case + MySQL expects its contents to be deterministic. */ + + pad_ptr = dest + len; + + ut_ad(templ->mbminlen <= templ->mbmaxlen); + + /* We handle UCS2 charset strings differently. */ + if (templ->mbminlen == 2) { + /* A space char is two bytes, 0x0020 in UCS2 */ + + if (len & 1) { + /* A 0x20 has been stripped from the column. + Pad it back. */ + + if (pad_ptr < field_end) { + *pad_ptr = 0x20; + pad_ptr++; + } + } + + /* Pad the rest of the string with 0x0020 */ + + while (pad_ptr < field_end) { + *pad_ptr = 0x00; + pad_ptr++; + *pad_ptr = 0x20; + pad_ptr++; + } + } else { + ut_ad(templ->mbminlen == 1); + /* space=0x20 */ + + memset(pad_ptr, 0x20, field_end - pad_ptr); + } + break; + + case DATA_BLOB: + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, templ->mysql_col_len, data, + len); + break; + + case DATA_MYSQL: + memcpy(dest, data, len); + + ut_ad(templ->mysql_col_len >= len); + ut_ad(templ->mbmaxlen >= templ->mbminlen); + + ut_ad(templ->mbmaxlen > templ->mbminlen + || templ->mysql_col_len == len); + /* The following assertion would fail for old tables + containing UTF-8 ENUM columns due to Bug #9526. */ + ut_ad(!templ->mbmaxlen + || !(templ->mysql_col_len % templ->mbmaxlen)); + ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len); + + if (templ->mbminlen != templ->mbmaxlen) { + /* Pad with spaces. This undoes the stripping + done in row0mysql.ic, function + row_mysql_store_col_in_innobase_format(). */ + + memset(dest + len, 0x20, templ->mysql_col_len - len); + } + break; + + default: +#ifdef UNIV_DEBUG + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Above are the valid column types for MySQL data. */ +#endif /* UNIV_DEBUG */ + ut_ad(templ->mysql_col_len == len); + memcpy(dest, data, len); + } +} + +/****************************************************************** +Convert a row in the Innobase format to a row in the MySQL format. +Note that the template in prebuilt may advise us to copy only a few +columns to mysql_rec, other columns are left blank. All columns may not +be needed in the query. 
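Each entry of prebuilt->mysql_template maps one needed column between the two row formats. A hypothetical template entry for a nullable 4-byte INT column, using only fields the function below actually reads (the concrete values are illustrative):

    templ->rec_field_no           = 2;     /* field position in the InnoDB record */
    templ->type                   = DATA_INT;
    templ->mysql_col_offset       = 9;     /* byte offset inside mysql_rec        */
    templ->mysql_col_len          = 4;
    templ->mysql_null_byte_offset = 0;     /* byte of the NULL bitmap...          */
    templ->mysql_null_bit_mask    = 0x04;  /* ...and this column's bit in it      */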
*/ +static +ibool +row_sel_store_mysql_rec( +/*====================*/ + /* out: TRUE if success, FALSE if + could not allocate memory for a BLOB + (though we may also assert in that + case) */ + byte* mysql_rec, /* out: row in the MySQL format */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + const rec_t* rec, /* in: Innobase record in the index + which was described in prebuilt's + template; must be protected by + a page latch */ + const ulint* offsets) /* in: array returned by + rec_get_offsets() */ +{ + mysql_row_templ_t* templ; + mem_heap_t* extern_field_heap = NULL; + mem_heap_t* heap; + const byte* data; + ulint len; + ulint i; + + ut_ad(prebuilt->mysql_template); + ut_ad(prebuilt->default_rec); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) { + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; + } + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + + if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, + templ->rec_field_no))) { + + /* Copy an externally stored field to the temporary + heap */ + + ut_a(!prebuilt->trx->has_search_latch); + + if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) { + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + UNIV_PAGE_SIZE); + } + + heap = prebuilt->blob_heap; + } else { + extern_field_heap + = mem_heap_create(UNIV_PAGE_SIZE); + + heap = extern_field_heap; + } + + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + + data = btr_rec_copy_externally_stored_field( + rec, offsets, + dict_table_zip_size(prebuilt->table), + templ->rec_field_no, &len, heap); + + ut_a(len != UNIV_SQL_NULL); + } else { + /* Field is stored in the row. */ + + data = rec_get_nth_field(rec, offsets, + templ->rec_field_no, &len); + + if (UNIV_UNLIKELY(templ->type == DATA_BLOB) + && len != UNIV_SQL_NULL) { + + /* It is a BLOB field locally stored in the + InnoDB record: we MUST copy its contents to + prebuilt->blob_heap here because later code + assumes all BLOB values have been copied to a + safe place. */ + + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = mem_heap_create( + UNIV_PAGE_SIZE); + } + + data = memcpy(mem_heap_alloc( + prebuilt->blob_heap, len), + data, len); + } + } + + if (len != UNIV_SQL_NULL) { + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ, data, len); + + /* Cleanup */ + if (extern_field_heap) { + mem_heap_free(extern_field_heap); + extern_field_heap = NULL; + } + + if (templ->mysql_null_bit_mask) { + /* It is a nullable column with a non-NULL + value */ + mysql_rec[templ->mysql_null_byte_offset] + &= ~(byte) templ->mysql_null_bit_mask; + } + } else { + /* MySQL assumes that the field for an SQL + NULL value is set to the default value. 
*/ + + mysql_rec[templ->mysql_null_byte_offset] + |= (byte) templ->mysql_null_bit_mask; + memcpy(mysql_rec + templ->mysql_col_offset, + prebuilt->default_rec + templ->mysql_col_offset, + templ->mysql_col_len); + } + } + + return(TRUE); +} + +/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + dict_index_t* clust_index, /* in: clustered index */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + const rec_t* rec, /* in: record in a clustered index */ + ulint** offsets, /* in/out: offsets returned by + rec_get_offsets(rec, clust_index) */ + mem_heap_t** offset_heap, /* in/out: memory heap from which + the offsets are allocated */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_consistent_read( + rec, mtr, clust_index, offsets, read_view, offset_heap, + prebuilt->old_vers_heap, old_vers); + return(err); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. */ +static +ulint +row_sel_get_clust_rec_for_mysql( +/*============================*/ + /* out: DB_SUCCESS or error code */ + row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */ + dict_index_t* sec_index,/* in: secondary index where rec resides */ + const rec_t* rec, /* in: record in a non-clustered index; if + this is a locking read, then rec is not + allowed to be delete-marked, and that would + not make sense either */ + que_thr_t* thr, /* in: query thread */ + const rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + ulint** offsets,/* in: offsets returned by + rec_get_offsets(rec, sec_index); + out: offsets returned by + rec_get_offsets(out_rec, clust_index) */ + mem_heap_t** offset_heap,/* in/out: memory heap from which + the offsets are allocated */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + const rec_t* clust_rec; + rec_t* old_vers; + ulint err; + trx_t* trx; + + *out_rec = NULL; + trx = thr_get_trx(thr); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, + sec_index, *offsets, trx); + + clust_index = dict_table_get_first_index(sec_index->table); + + btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + + prebuilt->clust_pcur->trx_if_known = trx; + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) + || btr_pcur_get_low_match(prebuilt->clust_pcur) + < dict_index_get_n_unique(clust_index)) { + + /* In a rare case it is possible that no clust rec is found + for a delete-marked secondary 
index record: if in row0umod.c + in row_undo_mod_remove_clust_low() we have already removed + the clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case we know that the + clustered index record did not exist in the read view of + trx. */ + + if (!rec_get_deleted_flag(rec, + dict_table_is_comp(sec_index->table)) + || prebuilt->select_lock_type != LOCK_NONE) { + ut_print_timestamp(stderr); + fputs(" InnoDB: error clustered record" + " for sec rec not found\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, sec_index); + fputs("\n" + "InnoDB: sec index record ", stderr); + rec_print(stderr, rec, sec_index); + fputs("\n" + "InnoDB: clust index record ", stderr); + rec_print(stderr, clust_rec, clust_index); + putc('\n', stderr); + trx_print(stderr, trx, 600); + + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + } + + clust_rec = NULL; + + goto func_exit; + } + + *offsets = rec_get_offsets(clust_rec, clust_index, *offsets, + ULINT_UNDEFINED, offset_heap); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP type lock */ + + err = lock_clust_rec_read_check_and_lock( + 0, btr_pcur_get_block(prebuilt->clust_pcur), + clust_rec, clust_index, *offsets, + prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr); + if (err != DB_SUCCESS) { + + goto err_exit; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + old_vers = NULL; + + /* If the isolation level allows reading of uncommitted data, + then we never look for an earlier version */ + + if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && !lock_clust_rec_cons_read_sees( + clust_rec, clust_index, *offsets, + trx->read_view)) { + + /* The following call returns 'offsets' associated with + 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, prebuilt, + clust_rec, offsets, offset_heap, &old_vers, + mtr); + + if (err != DB_SUCCESS || old_vers == NULL) { + + goto err_exit; + } + + clust_rec = old_vers; + } + + /* If we had to go to an earlier version of row or the + secondary index record is delete marked, then it may be that + the secondary index record corresponding to clust_rec + (or old_vers) is not rec; in that case we must ignore + such row because in our snapshot rec would not have existed. + Remember that from rec we cannot see directly which transaction + id corresponds to it: we have to go to the clustered index + record. A query where we want to fetch all rows where + the secondary index value is in some interval would return + a wrong result if we would not drop rows which we come to + visit through secondary index records that would not really + exist in our snapshot. 
*/ + + if (clust_rec + && (old_vers + || rec_get_deleted_flag(rec, dict_table_is_comp( + sec_index->table))) + && !row_sel_sec_rec_is_for_clust_rec( + rec, sec_index, clust_rec, clust_index)) { + clust_rec = NULL; +#ifdef UNIV_SEARCH_DEBUG + } else { + ut_a(clust_rec == NULL + || row_sel_sec_rec_is_for_clust_rec( + rec, sec_index, clust_rec, clust_index)); +#endif + } + } + +func_exit: + *out_rec = clust_rec; + + if (prebuilt->select_lock_type == LOCK_X) { + /* We may use the cursor in update: store its position */ + + btr_pcur_store_position(prebuilt->clust_pcur, mtr); + } + + err = DB_SUCCESS; +err_exit: + return(err); +} + +/************************************************************************ +Restores cursor position after it has been stored. We have to take into +account that the record the cursor was positioned on may have been deleted. +Then we may have to move the cursor one step up or down. */ +static +ibool +sel_restore_position_for_mysql( +/*===========================*/ + /* out: TRUE if we may need to + process the record the cursor is + now positioned on (i.e. we should + not go to the next record yet) */ + ibool* same_user_rec, /* out: TRUE if we were able to restore + the cursor on a user record with the + same ordering prefix in the + B-tree index */ + ulint latch_mode, /* in: latch mode wished in + restoration */ + btr_pcur_t* pcur, /* in: cursor whose position + has been stored */ + ibool moves_up, /* in: TRUE if the cursor moves up + in the index */ + mtr_t* mtr) /* in: mtr; CAUTION: may commit + mtr temporarily! */ +{ + ibool success; + ulint relative_position; + + relative_position = pcur->rel_pos; + + success = btr_pcur_restore_position(latch_mode, pcur, mtr); + + *same_user_rec = success; + + if (relative_position == BTR_PCUR_ON) { + if (success) { + return(FALSE); + } + + if (moves_up) { + btr_pcur_move_to_next(pcur, mtr); + } + + return(TRUE); + } + + if (relative_position == BTR_PCUR_AFTER + || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) { + + if (moves_up) { + return(TRUE); + } + + if (btr_pcur_is_on_user_rec(pcur)) { + btr_pcur_move_to_prev(pcur, mtr); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_BEFORE + || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE); + + if (moves_up && btr_pcur_is_on_user_rec(pcur)) { + btr_pcur_move_to_next(pcur, mtr); + } + + return(TRUE); +} + +/************************************************************************ +Pops a cached row for MySQL from the fetch cache. 
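The case analysis in sel_restore_position_for_mysql() above is dense; for an ascending scan (moves_up == TRUE) it reduces to the following summary (a restatement of the code, not new behavior):

    /* stored rel_pos     situation after restore        result
       BTR_PCUR_ON        same user rec found            return FALSE (rec already processed)
       BTR_PCUR_ON        rec no longer exists           step to next, return TRUE
       BTR_PCUR_AFTER     cursor already past the rec    return TRUE
       BTR_PCUR_BEFORE    step to next if on a user rec  return TRUE */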
*/ +UNIV_INLINE +void +row_sel_pop_cached_row_for_mysql( +/*=============================*/ + byte* buf, /* in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct */ +{ + ulint i; + mysql_row_templ_t* templ; + byte* cached_rec; + ut_ad(prebuilt->n_fetch_cached > 0); + ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len); + + if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) { + /* Copy cache record field by field, don't touch fields that + are not covered by current key */ + cached_rec = prebuilt->fetch_cache[ + prebuilt->fetch_cache_first]; + + for (i = 0; i < prebuilt->n_template; i++) { + templ = prebuilt->mysql_template + i; + ut_memcpy(buf + templ->mysql_col_offset, + cached_rec + templ->mysql_col_offset, + templ->mysql_col_len); + /* Copy NULL bit of the current field from cached_rec + to buf */ + if (templ->mysql_null_bit_mask) { + buf[templ->mysql_null_byte_offset] + ^= (buf[templ->mysql_null_byte_offset] + ^ cached_rec[templ->mysql_null_byte_offset]) + & (byte)templ->mysql_null_bit_mask; + } + } + } + else { + ut_memcpy(buf, + prebuilt->fetch_cache[prebuilt->fetch_cache_first], + prebuilt->mysql_prefix_len); + } + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/************************************************************************ +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_push_cache_row_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + const rec_t* rec, /* in: record to push; must + be protected by a page latch */ + const ulint* offsets) /* in: rec_get_offsets() */ +{ + byte* buf; + ulint i; + + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_a(!prebuilt->templ_contains_blob); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + + /* A user has reported memory corruption in these + buffers in Linux. Put magic numbers there to help + to track a possible bug. */ + + buf = mem_alloc(prebuilt->mysql_row_len + 8); + + prebuilt->fetch_cache[i] = buf + 4; + + mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N); + mach_write_to_4(buf + 4 + prebuilt->mysql_row_len, + ROW_PREBUILT_FETCH_MAGIC_N); + } + } + + ut_ad(prebuilt->fetch_cache_first == 0); + + if (UNIV_UNLIKELY(!row_sel_store_mysql_rec( + prebuilt->fetch_cache[ + prebuilt->n_fetch_cached], + prebuilt, rec, offsets))) { + ut_error; + } + + prebuilt->n_fetch_cached++; +} + +/************************************************************************* +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). We assume that the search +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, +btr search latch has been locked in S-mode. 
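Those preconditions are not re-checked here; they come from the only caller, row_search_for_mysql(), whose guard (shown in full further below, with an additional read-view test inside) amounts to:

    if (direction == 0                       /* opening the cursor          */
        && unique_search                     /* at most one row can match   */
        && dict_index_is_clust(index)
        && !prebuilt->templ_contains_blob    /* no externally stored fields */
        && !prebuilt->used_in_HANDLER
        && prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8) {
        /* s-lock btr_search_latch and try the shortcut */
    }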
*/ +static +ulint +row_sel_try_search_shortcut_for_mysql( +/*==================================*/ + /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ + const rec_t** out_rec,/* out: record if found */ + row_prebuilt_t* prebuilt,/* in: prebuilt struct */ + ulint** offsets,/* in/out: for rec_get_offsets(*out_rec) */ + mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */ + mtr_t* mtr) /* in: started mtr */ +{ + dict_index_t* index = prebuilt->index; + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + const rec_t* rec; + + ut_ad(dict_index_is_clust(index)); + ut_ad(!prebuilt->templ_contains_blob); + + btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, + BTR_SEARCH_LEAF, pcur, +#ifndef UNIV_SEARCH_DEBUG + RW_S_LATCH, +#else + 0, +#endif + mtr); + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + *offsets = rec_get_offsets(rec, index, *offsets, + ULINT_UNDEFINED, heap); + + if (!lock_clust_rec_cons_read_sees(rec, index, + *offsets, trx->read_view)) { + + return(SEL_RETRY); + } + + if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) { + + return(SEL_EXHAUSTED); + } + + *out_rec = rec; + + return(SEL_FOUND); +} + +/************************************************************************ +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be tried to the cursor! */ +UNIV_INTERN +ulint +row_search_for_mysql( +/*=================*/ + /* out: DB_SUCCESS, + DB_RECORD_NOT_FOUND, + DB_END_OF_INDEX, DB_DEADLOCK, + DB_LOCK_TABLE_FULL, DB_CORRUPTION, + or DB_TOO_BIG_RECORD */ + byte* buf, /* in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /* in: search mode PAGE_CUR_L, ... */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /* in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /* in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. 
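A hypothetical handler-side scan under the calling conventions just described (a full-index ascending read with an empty search tuple, so match_mode is 0; error handling elided):

    /* the first call, with direction == 0, opens the cursor */
    err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);

    while (err == DB_SUCCESS) {
        /* buf now holds one row in the MySQL row format */
        err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt,
                                   0, ROW_SEL_NEXT);
    }
    /* the scan ends with DB_RECORD_NOT_FOUND or DB_END_OF_INDEX */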
*/ +{ + dict_index_t* index = prebuilt->index; + ibool comp = dict_table_is_comp(index->table); + const dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + const rec_t* rec; + const rec_t* result_rec; + const rec_t* clust_rec; + ulint err = DB_SUCCESS; + ibool unique_search = FALSE; + ibool unique_search_from_clust_index = FALSE; + ibool mtr_has_extra_clust_latch = FALSE; + ibool moves_up = FALSE; + ibool set_also_gap_locks = TRUE; + /* if the query is a plain locking SELECT, and the isolation level + is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */ + ibool did_semi_consistent_read = FALSE; + /* if the returned record was locked and we did a semi-consistent + read (fetch the newest committed version), then this is set to + TRUE */ +#ifdef UNIV_SEARCH_DEBUG + ulint cnt = 0; +#endif /* UNIV_SEARCH_DEBUG */ + ulint next_offs; + ibool same_user_rec; + mtr_t mtr; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + ut_ad(index && pcur && search_tuple); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error:\n" + "InnoDB: MySQL is trying to use a table handle" + " but the .ibd file for\n" + "InnoDB: table %s does not exist.\n" + "InnoDB: Have you deleted the .ibd file" + " from the database directory under\n" + "InnoDB: the MySQL datadir, or have you used" + " DISCARD TABLESPACE?\n" + "InnoDB: Look from\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "innodb-troubleshooting.html\n" + "InnoDB: how you can resolve the problem.\n", + prebuilt->table->name); + + return(DB_ERROR); + } + + if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + fprintf(stderr, + "InnoDB: Error: trying to free a corrupt\n" + "InnoDB: table handle. Magic n %lu, table name ", + (ulong) prebuilt->magic_n); + ut_print_name(stderr, trx, TRUE, prebuilt->table->name); + putc('\n', stderr); + + mem_analyze_corruption(prebuilt); + + ut_error; + } + +#if 0 + /* August 19, 2005 by Heikki: temporarily disable this error + print until the cursor lock count is done correctly. + See bugs #12263 and #12456!*/ + + if (trx->n_mysql_tables_in_use == 0 + && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) { + /* Note that if MySQL uses an InnoDB temp table that it + created inside LOCK TABLES, then n_mysql_tables_in_use can + be zero; in that case select_lock_type is set to LOCK_X in + ::start_stmt. 
*/ + + fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n" + "InnoDB: but it has not locked" + " any tables in ::external_lock()!\n", + stderr); + trx_print(stderr, trx, 600); + fputc('\n', stderr); + } +#endif + +#if 0 + fprintf(stderr, "Match mode %lu\n search tuple ", + (ulong) match_mode); + dtuple_print(search_tuple); + fprintf(stderr, "N tables locked %lu\n", + (ulong) trx->mysql_n_tables_locked); +#endif + /*-------------------------------------------------------------*/ + /* PHASE 0: Release a possible s-latch we are holding on the + adaptive hash index latch if there is someone waiting behind */ + + if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED) + && trx->has_search_latch) { + + /* There is an x-latch request on the adaptive hash index: + release the s-latch to reduce starvation and wait for + BTR_SEA_TIMEOUT rounds before trying to keep it again over + calls from MySQL */ + + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + } + + /* Reset the new record lock info if srv_locks_unsafe_for_binlog + is set or session is using a READ COMMITED isolation level. Then + we are able to remove the record locks set here on an individual + row. */ + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + trx_reset_new_rec_lock_info(trx); + } + + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + + if (UNIV_UNLIKELY(direction == 0)) { + trx->op_info = "starting index read"; + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + trx->op_info = "fetching rows"; + + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) { + if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) { + ut_error; + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! */ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + + srv_n_rows_read++; + err = DB_SUCCESS; + goto func_exit; + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { + + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a + non-delete-marked matching record. + + Note that in a unique secondary index there may be different + delete-marked versions of a record where only the primary key + values differ: thus in a secondary index we must use next-key + locks when locking delete-marked records. 
*/ + + if (match_mode == ROW_SEL_EXACT + && dict_index_is_unique(index) + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index) + && (dict_index_is_clust(index) + || !dtuple_contains_null(search_tuple))) { + + /* Note above that a UNIQUE secondary index can contain many + rows with the same key value if one of the columns is the SQL + null. A clustered index under MySQL can never contain null + columns because we demand that all the columns in primary key + are non-null. */ + + unique_search = TRUE; + + /* Even if the condition is unique, MySQL seems to try to + retrieve also a second row if a primary key contains more than + 1 column. Return immediately if this is not a HANDLER + command. */ + + if (UNIV_UNLIKELY(direction != 0 + && !prebuilt->used_in_HANDLER)) { + + err = DB_RECORD_NOT_FOUND; + goto func_exit; + } + } + + mtr_start(&mtr); + + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (UNIV_UNLIKELY(direction == 0) + && unique_search + && dict_index_is_clust(index) + && !prebuilt->templ_contains_blob + && !prebuilt->used_in_HANDLER + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { + + mode = PAGE_CUR_GE; + + unique_search_from_clust_index = TRUE; + + if (trx->mysql_n_tables_locked == 0 + && prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view) { + + /* This is a SELECT query done as a consistent read, + and the read view has already been allocated: + let us try a search shortcut through the hash + index. + NOTE that we must also test that + mysql_n_tables_locked == 0, because this might + also be INSERT INTO ... SELECT ... or + CREATE TABLE ... SELECT ... . Our algorithm is + NOT prepared to inserts interleaved with the SELECT, + and if we try that, we can deadlock on the adaptive + hash index semaphore! */ + +#ifndef UNIV_SEARCH_DEBUG + if (!trx->has_search_latch) { + rw_lock_s_lock(&btr_search_latch); + trx->has_search_latch = TRUE; + } +#endif + switch (row_sel_try_search_shortcut_for_mysql( + &rec, prebuilt, &offsets, &heap, + &mtr)) { + case SEL_FOUND: +#ifdef UNIV_SEARCH_DEBUG + ut_a(0 == cmp_dtuple_rec(search_tuple, + rec, offsets)); +#endif + /* At this point, rec is protected by + a page latch that was acquired by + row_sel_try_search_shortcut_for_mysql(). + The latch will not be released until + mtr_commit(&mtr). 
*/ + + if (!row_sel_store_mysql_rec(buf, prebuilt, + rec, offsets)) { + err = DB_TOO_BIG_RECORD; + + /* We let the main loop do the + error handling */ + goto shortcut_fails_too_big_rec; + } + + mtr_commit(&mtr); + + /* ut_print_name(stderr, index->name); + fputs(" shortcut\n", stderr); */ + + srv_n_rows_read++; + + err = DB_SUCCESS; + goto release_search_latch_if_needed; + + case SEL_EXHAUSTED: + mtr_commit(&mtr); + + /* ut_print_name(stderr, index->name); + fputs(" record not found 2\n", stderr); */ + + err = DB_RECORD_NOT_FOUND; +release_search_latch_if_needed: + if (trx->search_latch_timeout > 0 + && trx->has_search_latch) { + + trx->search_latch_timeout--; + + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + } + + /* NOTE that we do NOT store the cursor + position */ + goto func_exit; + + case SEL_RETRY: + break; + + default: + ut_ad(0); + } +shortcut_fails_too_big_rec: + mtr_commit(&mtr); + mtr_start(&mtr); + } + } + + /*-------------------------------------------------------------*/ + /* PHASE 3: Open or restore index cursor position */ + + if (trx->has_search_latch) { + rw_lock_s_unlock(&btr_search_latch); + trx->has_search_latch = FALSE; + } + + trx_start_if_not_started(trx); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE + && trx->mysql_thd != NULL + && thd_is_select(trx->mysql_thd)) { + /* It is a plain locking SELECT and the isolation + level is low: do not lock gaps */ + + set_also_gap_locks = FALSE; + } + + /* Note that if the search mode was GE or G, then the cursor + naturally moves upward (in fetch next) in alphabetical order, + otherwise downward */ + + if (UNIV_UNLIKELY(direction == 0)) { + if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) { + moves_up = TRUE; + } + } else if (direction == ROW_SEL_NEXT) { + moves_up = TRUE; + } + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + clust_index = dict_table_get_first_index(index->table); + + if (UNIV_LIKELY(direction != 0)) { + ibool need_to_process = sel_restore_position_for_mysql( + &same_user_rec, BTR_SEARCH_LEAF, + pcur, moves_up, &mtr); + + if (UNIV_UNLIKELY(need_to_process)) { + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + /* We did a semi-consistent read, + but the record was removed in + the meantime. */ + prebuilt->row_read_type + = ROW_READ_TRY_SEMI_CONSISTENT; + } + } else if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_DID_SEMI_CONSISTENT)) { + + /* The cursor was positioned on the record + that we returned previously. If we need + to repeat a semi-consistent read as a + pessimistic locking read, the record + cannot be skipped. */ + + goto next_rec; + } + + } else if (dtuple_get_n_fields(search_tuple) > 0) { + + btr_pcur_open_with_no_init(index, search_tuple, mode, + BTR_SEARCH_LEAF, + pcur, 0, &mtr); + + pcur->trx_if_known = trx; + + rec = btr_pcur_get_rec(pcur); + + if (!moves_up + && !page_rec_is_supremum(rec) + && set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a gap lock on the next index record + to prevent phantoms in ORDER BY ...
DESC queries */ + const rec_t* next = page_rec_get_next_const(rec); + + offsets = rec_get_offsets(next, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + next, index, offsets, + prebuilt->select_lock_type, + LOCK_GAP, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + } else { + if (mode == PAGE_CUR_G) { + btr_pcur_open_at_index_side( + TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE, + &mtr); + } else if (mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side( + FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE, + &mtr); + } + } + + if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + + if (trx->read_view == NULL + && prebuilt->select_lock_type == LOCK_NONE) { + + fputs("InnoDB: Error: MySQL is trying to" + " perform a consistent read\n" + "InnoDB: but the read view is not assigned!\n", + stderr); + trx_print(stderr, trx, 600); + fputc('\n', stderr); + ut_a(0); + } + } else if (prebuilt->select_lock_type == LOCK_NONE) { + /* This is a consistent read */ + /* Assign a read view for the query */ + + trx_assign_read_view(trx); + prebuilt->sql_stat_start = FALSE; + } else { + ulint lock_mode; + if (prebuilt->select_lock_type == LOCK_S) { + lock_mode = LOCK_IS; + } else { + lock_mode = LOCK_IX; + } + err = lock_table(0, index->table, lock_mode, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + prebuilt->sql_stat_start = FALSE; + } + +rec_loop: + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + + rec = btr_pcur_get_rec(pcur); + ut_ad(!!page_rec_is_comp(rec) == comp); +#ifdef UNIV_SEARCH_DEBUG + /* + fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt, + page_get_page_no(page_align(rec))); + rec_print(rec); + */ +#endif /* UNIV_SEARCH_DEBUG */ + + if (page_rec_is_infimum(rec)) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. */ + + goto next_rec; + } + + if (page_rec_is_supremum(rec)) { + + if (set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a lock on the index record */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using a READ COMMITTED isolation + level we do not lock gaps. Supremum record is really + a gap and therefore we do not set locks there. 
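+ + At the higher isolation levels, on the other hand, the + LOCK_ORDINARY (next-key) lock taken below on the supremum + record locks the gap before it, i.e., the gap at the end of + the page, so that no phantom rows can be inserted there.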
*/ + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + /* A page supremum record cannot be in the result set: skip + it now that we have placed a possible lock on it */ + + goto next_rec; + } + + /*-------------------------------------------------------------*/ + /* Do sanity checks in case our cursor has bumped into page + corruption */ + + if (comp) { + next_offs = rec_get_next_offs(rec, TRUE); + if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) { + + goto wrong_offs; + } + } else { + next_offs = rec_get_next_offs(rec, FALSE); + if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) { + + goto wrong_offs; + } + } + + if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) { + +wrong_offs: + if (srv_force_recovery == 0 || moves_up == FALSE) { + ut_print_timestamp(stderr); + buf_page_print(page_align(rec), 0); + fprintf(stderr, + "\nInnoDB: rec address %p," + " buf block fix count %lu\n", + (void*) rec, (ulong) + btr_cur_get_block(btr_pcur_get_btr_cur(pcur)) + ->page.buf_fix_count); + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". Run CHECK TABLE. You may need to\n" + "InnoDB: restore from a backup, or" + " dump + drop + reimport the table.\n", + stderr); + + err = DB_CORRUPTION; + + goto lock_wait_or_error; + } else { + /* The user may be dumping a corrupt table. Jump + over the corruption to recover as much as possible. */ + + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the rest of the page.\n", + stderr); + + btr_pcur_move_to_last_on_page(pcur, &mtr); + + goto next_rec; + } + } + /*-------------------------------------------------------------*/ + + /* Calculate the 'offsets' associated with 'rec' */ + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + if (UNIV_UNLIKELY(srv_force_recovery > 0)) { + if (!rec_validate(rec, offsets) + || !btr_index_rec_validate(rec, index, FALSE)) { + fprintf(stderr, + "InnoDB: Index corruption: rec offs %lu" + " next offs %lu, page no %lu,\n" + "InnoDB: ", + (ulong) page_offset(rec), + (ulong) next_offs, + (ulong) page_get_page_no(page_align(rec))); + dict_index_name_print(stderr, trx, index); + fputs(". We try to skip the record.\n", + stderr); + + goto next_rec; + } + } + + /* Note that we cannot trust the up_match value in the cursor at this + place because we can arrive here after moving the cursor! Thus + we have to recompare rec and search_tuple to determine if they + match enough. 
*/ + + if (match_mode == ROW_SEL_EXACT) { + /* Test if the index record completely matches search_tuple + in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + + /* fputs("Comparing rec and search tuple\n", stderr); */ + + if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) { + + if (set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level + == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set or this session is not + using a READ COMMITTED isolation level. */ + + err = sel_set_rec_lock( + btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + err = DB_RECORD_NOT_FOUND; + /* ut_print_name(stderr, index->name); + fputs(" record not found 3\n", stderr); */ + + goto normal_return; + } + + } else if (match_mode == ROW_SEL_EXACT_PREFIX) { + + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) { + + if (set_also_gap_locks + && !(srv_locks_unsafe_for_binlog + || trx->isolation_level + == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* Try to place a gap lock on the index + record only if innodb_locks_unsafe_for_binlog + option is not set or this session is not + using a READ COMMITTED isolation level. */ + + err = sel_set_rec_lock( + btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, LOCK_GAP, + thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + btr_pcur_store_position(pcur, &mtr); + + err = DB_RECORD_NOT_FOUND; + /* ut_print_name(stderr, index->name); + fputs(" record not found 4\n", stderr); */ + + goto normal_return; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; note that delete + marked records are a special case in a unique search. If there + is a non-delete marked record, then it is enough to lock its + existence with LOCK_REC_NOT_GAP. */ + + /* If innodb_locks_unsafe_for_binlog option is used + or this session is using a READ COMMITTED isolation + level, we lock only the record, i.e., next-key locking is + not used. */ + + ulint lock_type; + + if (!set_also_gap_locks + || srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED + || (unique_search + && !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) { + + goto no_gap_lock; + } else { + lock_type = LOCK_ORDINARY; + } + + /* If we are doing a 'greater or equal than a primary key + value' search from a clustered index, and we find a record + that has that exact primary key value, then there is no need + to lock the gap before the record, because no insert in the + gap can be in our search range. That is, no phantom row can + appear that way. + + An example: if col1 is the primary key, the search is WHERE + col1 >= 100, and we find a record where col1 = 100, then there + is no need to lock the gap before that record.
*/ + + if (index == clust_index + && mode == PAGE_CUR_GE + && direction == 0 + && dtuple_get_n_fields_cmp(search_tuple) + == dict_index_get_n_unique(index) + && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) { +no_gap_lock: + lock_type = LOCK_REC_NOT_GAP; + } + + err = sel_set_rec_lock(btr_pcur_get_block(pcur), + rec, index, offsets, + prebuilt->select_lock_type, + lock_type, thr); + + switch (err) { + const rec_t* old_vers; + case DB_SUCCESS: + break; + case DB_LOCK_WAIT: + if (UNIV_LIKELY(prebuilt->row_read_type + != ROW_READ_TRY_SEMI_CONSISTENT) + || index != clust_index) { + + goto lock_wait_or_error; + } + + /* The following call returns 'offsets' + associated with 'old_vers' */ + err = row_sel_build_committed_vers_for_mysql( + clust_index, prebuilt, rec, + &offsets, &heap, &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + mutex_enter(&kernel_mutex); + if (trx->was_chosen_as_deadlock_victim) { + mutex_exit(&kernel_mutex); + err = DB_DEADLOCK; + + goto lock_wait_or_error; + } + if (UNIV_LIKELY(trx->wait_lock != NULL)) { + lock_cancel_waiting_and_release( + trx->wait_lock); + trx_reset_new_rec_lock_info(trx); + } else { + mutex_exit(&kernel_mutex); + + /* The lock was granted while we were + searching for the last committed version. + Do a normal locking read. */ + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, + &heap); + err = DB_SUCCESS; + break; + } + mutex_exit(&kernel_mutex); + + if (old_vers == NULL) { + /* The row was not yet committed */ + + goto next_rec; + } + + did_semi_consistent_read = TRUE; + rec = old_vers; + break; + default: + + goto lock_wait_or_error; + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { + + /* Fetch a previous version of the row if the current + one is not visible in the snapshot; if we have a very + high force recovery level set, we try to avoid crashes + by skipping this lookup */ + + if (UNIV_LIKELY(srv_force_recovery < 5) + && !lock_clust_rec_cons_read_sees( + rec, index, offsets, trx->read_view)) { + + rec_t* old_vers; + /* The following call returns 'offsets' + associated with 'old_vers' */ + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, rec, &offsets, &heap, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, trx->read_view)) { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. */ + + ut_ad(index != clust_index); + + goto requires_clust_rec; + } + } + + /* NOTE that at this point rec can be an old version of a clustered + index record built for a consistent read. We cannot assume after this + point that rec is on a buffer pool page. Functions like + page_rec_is_comp() cannot be used! 
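+ An old version is built from the undo log into the memory + heap pointed to by 'heap', so it does not reside in a buffer + pool page at all.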
*/ + + if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) { + + /* The record is delete-marked: we can skip it */ + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE + && !did_semi_consistent_read) { + + /* No need to keep a lock on a delete-marked record + if we do not want to use next-key locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + /* This is an optimization to skip setting the next key lock + on the record that follows this delete-marked record. This + optimization works because of the unique search criteria + which precludes the presence of a range lock between this + delete marked record and the record following it. + + For now this is applicable only to clustered indexes while + doing a unique search. There is scope for further optimization + applicable to unique secondary indexes. Current behaviour is + to widen the scope of a lock on an already delete marked record + if the same record is deleted twice by the same transaction */ + if (index == clust_index && unique_search) { + err = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + + goto next_rec; + } + + /* Get the clustered index record if needed, if we did not do the + search using the clustered index. */ + + if (index != clust_index && prebuilt->need_to_access_clustered) { + +requires_clust_rec: + /* We use a 'goto' to the preceding label if a consistent + read of a secondary index record requires us to look up old + versions of the associated clustered index record. */ + + ut_ad(rec_offs_validate(rec, index, offsets)); + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_has_extra_clust_latch = TRUE; + + /* The following call returns 'offsets' associated with + 'clust_rec'. Note that 'clust_rec' can be an old version + built for a consistent read. */ + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, + &offsets, &heap, &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE); + + goto next_rec; + } + + if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) { + + /* The record is delete marked: we can skip it */ + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + /* No need to keep a lock on a delete-marked + record if we do not want to use next-key + locking. */ + + row_unlock_for_mysql(prebuilt, TRUE); + } + + goto next_rec; + } + + if (prebuilt->need_to_access_clustered) { + + result_rec = clust_rec; + + ut_ad(rec_offs_validate(result_rec, clust_index, + offsets)); + } else { + /* We used 'offsets' for the clust rec, recalculate + them for 'rec' */ + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + result_rec = rec; + } + } else { + result_rec = rec; + } + + /* We found a qualifying record 'result_rec'. At this point, + 'offsets' are associated with 'result_rec'. */ + + ut_ad(rec_offs_validate(result_rec, + result_rec != rec ? clust_index : index, + offsets)); + + /* At this point, the clustered index record is protected + by a page latch that was acquired when pcur was positioned. + The latch will not be released until mtr_commit(&mtr). 
*/ + + if ((match_mode == ROW_SEL_EXACT + || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD) + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->templ_contains_blob + && !prebuilt->clust_index_was_generated + && !prebuilt->used_in_HANDLER + && prebuilt->template_type + != ROW_MYSQL_DUMMY_TEMPLATE) { + + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update; that is why we require ...lock_type == LOCK_NONE. + Since we keep space in prebuilt only for the BLOBs of + a single row, we cannot cache rows when there + are BLOBs in the fields to be fetched. In HANDLER we do + not cache rows because the cursor there is a scrollable + cursor. */ + + row_sel_push_cache_row_for_mysql(prebuilt, result_rec, + offsets); + if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { + + goto got_row; + } + + goto next_rec; + } else { + if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) { + memcpy(buf + 4, result_rec + - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + mach_write_to_4(buf, + rec_offs_extra_size(offsets) + 4); + } else { + if (!row_sel_store_mysql_rec(buf, prebuilt, + result_rec, offsets)) { + err = DB_TOO_BIG_RECORD; + + goto lock_wait_or_error; + } + } + + if (prebuilt->clust_index_was_generated) { + if (result_rec != rec) { + offsets = rec_get_offsets( + rec, index, offsets, ULINT_UNDEFINED, + &heap); + } + row_sel_store_row_id_to_prebuilt(prebuilt, rec, + index, offsets); + } + } + + /* From this point on, 'offsets' are invalid. */ + +got_row: + /* We have an optimization to save CPU time: if this is a consistent + read on a unique condition on the clustered index, then we do not + store the pcur position, because any fetch next or prev will anyway + return 'end of file'. Exceptions are locking reads and the MySQL + HANDLER command where the user can move the cursor with PREV or NEXT + even after a unique search. */ + + if (!unique_search_from_clust_index + || prebuilt->select_lock_type != LOCK_NONE + || prebuilt->used_in_HANDLER) { + + /* Inside an update always store the cursor position */ + + btr_pcur_store_position(pcur, &mtr); + } + + err = DB_SUCCESS; + + goto normal_return; + +next_rec: + /* Reset the old and new "did semi-consistent read" flags. */ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = FALSE; + + if (UNIV_UNLIKELY(srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && prebuilt->select_lock_type != LOCK_NONE) { + + trx_reset_new_rec_lock_info(trx); + } + + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + + if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) { + /* We must commit mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we accessed a different clustered + index page right away without releasing the previous one.
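+ Therefore we store the cursor position below, commit the + mtr, and restore the position inside a freshly started mtr.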
*/ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + mtr_start(&mtr); + if (sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, + pcur, moves_up, &mtr)) { +#ifdef UNIV_SEARCH_DEBUG + cnt++; +#endif /* UNIV_SEARCH_DEBUG */ + + goto rec_loop; + } + } + + if (moves_up) { + if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) { +not_moved: + btr_pcur_store_position(pcur, &mtr); + + if (match_mode != 0) { + err = DB_RECORD_NOT_FOUND; + } else { + err = DB_END_OF_INDEX; + } + + goto normal_return; + } + } else { + if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) { + goto not_moved; + } + } + +#ifdef UNIV_SEARCH_DEBUG + cnt++; +#endif /* UNIV_SEARCH_DEBUG */ + + goto rec_loop; + +lock_wait_or_error: + /* Reset the old and new "did semi-consistent read" flags. */ + if (UNIV_UNLIKELY(prebuilt->row_read_type + == ROW_READ_DID_SEMI_CONSISTENT)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + did_semi_consistent_read = FALSE; + + /*-------------------------------------------------------------*/ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + trx->error_state = err; + + /* The following is a patch for MySQL */ + + que_thr_stop_for_mysql(thr); + + thr->lock_state = QUE_THR_LOCK_ROW; + + if (row_mysql_handle_errors(&err, trx, thr, NULL)) { + /* It was a lock wait, and it ended */ + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + mtr_start(&mtr); + + sel_restore_position_for_mysql(&same_user_rec, + BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + + if ((srv_locks_unsafe_for_binlog + || trx->isolation_level == TRX_ISO_READ_COMMITTED) + && !same_user_rec) { + + /* Since we were not able to restore the cursor + on the same user record, we cannot use + row_unlock_for_mysql() to unlock any records, and + we must thus reset the new rec lock info. Since + in lock0lock.c we have blocked the inheriting of gap + X-locks, we actually do not have any new record locks + set in this case. + + Note that if we were able to restore on the 'same' + user record, it is still possible that we were actually + waiting on a delete-marked record, and meanwhile + it was removed by purge and inserted again by some + other user. But that is no problem, because in + rec_loop we will again try to set a lock, and + new_rec_lock_info in trx will be right at the end. */ + + trx_reset_new_rec_lock_info(trx); + } + + mode = pcur->search_mode; + + goto rec_loop; + } + + thr->lock_state = QUE_THR_LOCK_NOLOCK; + +#ifdef UNIV_SEARCH_DEBUG + /* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ +#endif /* UNIV_SEARCH_DEBUG */ + goto func_exit; + +normal_return: + /*-------------------------------------------------------------*/ + que_thr_stop_for_mysql_no_error(thr, trx); + + mtr_commit(&mtr); + + if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + err = DB_SUCCESS; + } + +#ifdef UNIV_SEARCH_DEBUG + /* fputs("Using ", stderr); + dict_index_name_print(stderr, index); + fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ +#endif /* UNIV_SEARCH_DEBUG */ + if (err == DB_SUCCESS) { + srv_n_rows_read++; + } + +func_exit: + trx->op_info = ""; + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + /* Set or reset the "did semi-consistent read" flag on return. 
+ The flag did_semi_consistent_read is set if and only if + the record being returned was fetched with a semi-consistent read. */ + ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS + || !did_semi_consistent_read); + + if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) { + if (UNIV_UNLIKELY(did_semi_consistent_read)) { + prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } + } + return(err); +} + +/*********************************************************************** +Checks whether MySQL is at the moment allowed to retrieve a consistent +read result for this table, or to store one in the query cache. */ +UNIV_INTERN +ibool +row_search_check_if_query_cache_permitted( +/*======================================*/ + /* out: TRUE if storing or retrieving + from the query cache is permitted */ + trx_t* trx, /* in: transaction object */ + const char* norm_name) /* in: concatenation of database name, + '/' char, table name */ +{ + dict_table_t* table; + ibool ret = FALSE; + + table = dict_table_get(norm_name, FALSE); + + if (table == NULL) { + + return(FALSE); + } + + mutex_enter(&kernel_mutex); + + /* Start the transaction if it is not started yet */ + + trx_start_if_not_started_low(trx); + + /* If there are locks on the table or some trx has invalidated the + cache up to our trx id, then ret = FALSE. + We do not check what type of locks there are on the table, though only + IX type locks would actually require ret = FALSE. */ + + if (UT_LIST_GET_LEN(table->locks) == 0 + && ut_dulint_cmp(trx->id, + table->query_cache_inv_trx_id) >= 0) { + + ret = TRUE; + + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !trx->read_view) { + + trx->read_view = read_view_open_now( + trx->id, trx->global_read_view_heap); + trx->global_read_view = trx->read_view; + } + } + + mutex_exit(&kernel_mutex); + + return(ret); +} + +/*********************************************************************** +Read the AUTOINC column from the current row. If the value is less than +0 and the type is not unsigned, then we reset the value to 0. */ +static +ib_uint64_t +row_search_autoinc_read_column( +/*===========================*/ + /* out: value read from the column */ + dict_index_t* index, /* in: index to read from */ + const rec_t* rec, /* in: current rec */ + ulint col_no, /* in: column number */ + ibool unsigned_type) /* in: signed or unsigned flag */ +{ + ulint len; + const byte* data; + ib_uint64_t value; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); + + data = rec_get_nth_field(rec, offsets, col_no, &len); + + ut_a(len != UNIV_SQL_NULL); + ut_a(len <= sizeof value); + + /* we assume the AUTOINC value cannot be negative */ + value = mach_read_int_type(data, len, unsigned_type); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (!unsigned_type && (ib_int64_t) value < 0) { + value = 0; + } + + return(value); +} + +/*********************************************************************** +Get the last row.
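+ In this context that means: starting from the cursor position, which +the caller has opened at the high end of the index, move backwards until +a user record (not the page infimum or supremum) is found.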
*/ +static +const rec_t* +row_search_autoinc_get_rec( +/*=======================*/ + /* out: current rec or NULL */ + btr_pcur_t* pcur, /* in: the current cursor */ + mtr_t* mtr) /* in: mini transaction */ +{ + do { + const rec_t* rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_user_rec(rec)) { + return(rec); + } + } while (btr_pcur_move_to_prev(pcur, mtr)); + + return(NULL); +} + +/*********************************************************************** +Read the max AUTOINC value from an index. */ +UNIV_INTERN +ulint +row_search_max_autoinc( +/*===================*/ + /* out: DB_SUCCESS if all OK else + error code, DB_RECORD_NOT_FOUND if + column name can't be found in index */ + dict_index_t* index, /* in: index to search */ + const char* col_name, /* in: name of autoinc column */ + ib_uint64_t* value) /* out: AUTOINC value read */ +{ + ulint i; + ulint n_cols; + dict_field_t* dfield = NULL; + ulint error = DB_SUCCESS; + + n_cols = dict_index_get_n_ordering_defined_by_user(index); + + /* Search the index for the AUTOINC column name */ + for (i = 0; i < n_cols; ++i) { + dfield = dict_index_get_nth_field(index, i); + + if (strcmp(col_name, dfield->name) == 0) { + break; + } + } + + *value = 0; + + /* Must find the AUTOINC column name */ + if (i < n_cols && dfield) { + mtr_t mtr; + btr_pcur_t pcur; + + mtr_start(&mtr); + + /* Open at the high/right end (FALSE), and INIT + cursor (TRUE) */ + btr_pcur_open_at_index_side( + FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + + if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) { + const rec_t* rec; + + rec = row_search_autoinc_get_rec(&pcur, &mtr); + + if (rec != NULL) { + ibool unsigned_type = ( + dfield->col->prtype & DATA_UNSIGNED); + + *value = row_search_autoinc_read_column( + index, rec, i, unsigned_type); + } + } + + btr_pcur_close(&pcur); + + mtr_commit(&mtr); + } else { + error = DB_RECORD_NOT_FOUND; + } + + return(error); +} diff --git a/storage/xtradb/row/row0uins.c b/storage/xtradb/row/row0uins.c new file mode 100644 index 00000000000..69d6b2e6c2a --- /dev/null +++ b/storage/xtradb/row/row0uins.c @@ -0,0 +1,350 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#include "row0uins.h" + +#ifdef UNIV_NONINL +#include "row0uins.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "ibuf0ibuf.h" +#include "log0log.h" + +/******************************************************************* +Removes a clustered index record. The pcur in node was positioned on the +record; now it is detached. */ +static +ulint +row_undo_ins_remove_clust_rec( +/*==========================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node) /* in: undo node */ +{ + btr_cur_t* btr_cur; + ibool success; + ulint err; + ulint n_tries = 0; + mtr_t mtr; + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur), + &mtr); + ut_a(success); + + if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Drop the index tree associated with the row in + SYS_INDEXES table: */ + + dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr); + + mtr_commit(&mtr); + + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, + &(node->pcur), &mtr); + ut_a(success); + } + + btr_cur = btr_pcur_get_btr_cur(&(node->pcur)); + + success = btr_cur_optimistic_delete(btr_cur, &mtr); + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (success) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } +retry: + /* If that did not succeed, try a pessimistic descent down the tree */ + mtr_start(&mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_TREE, + &(node->pcur), &mtr); + ut_a(success); + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + trx_is_recv(node->trx) + ? RB_RECOVERY + : RB_NORMAL, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err == DB_OUT_OF_FILE_SPACE + && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + trx_undo_rec_release(node->trx, node->undo_no); + + return(err); +} + +/******************************************************************* +Removes a secondary index entry if found.
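+The latching mode is chosen by the caller: BTR_MODIFY_LEAF for an +optimistic delete within a leaf page, or BTR_MODIFY_TREE for a +pessimistic delete that may reorganize the tree.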
*/ +static +ulint +row_undo_ins_remove_sec_low( +/*========================*/ + /* out: DB_SUCCESS, DB_FAIL, or + DB_OUT_OF_FILE_SPACE */ + ulint mode, /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry to remove */ +{ + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool found; + ibool success; + ulint err; + mtr_t mtr; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (!found) { + /* Not found */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(DB_SUCCESS); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* No need to distinguish RB_RECOVERY here, because we + are deleting a secondary index record: the distinction + between RB_NORMAL and RB_RECOVERY only matters when + deleting a record that contains externally stored + columns. */ + ut_ad(!dict_index_is_clust(index)); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + RB_NORMAL, &mtr); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/******************************************************************* +Removes a secondary index entry from the index if found. Tries first an +optimistic, then a pessimistic descent down the tree. */ +static +ulint +row_undo_ins_remove_sec( +/*====================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry to remove */ +{ + ulint err; + ulint n_tries = 0; + + /* Try first an optimistic descent down the B-tree */ + + err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry); + + if (err == DB_SUCCESS) { + + return(err); + } + + /* Then try a pessimistic descent down the B-tree */ +retry: + err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + + if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { + + n_tries++; + + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + + goto retry; + } + + return(err); +} + +/*************************************************************** +Parses the row reference and other info in a fresh insert undo record. */ +static +void +row_undo_ins_parse_undo_rec( +/*========================*/ + undo_node_t* node) /* in/out: row undo node */ +{ + dict_index_t* clust_index; + byte* ptr; + dulint undo_no; + dulint table_id; + ulint type; + ulint dummy; + ibool dummy_extern; + + ut_ad(node); + + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy, + &dummy_extern, &undo_no, &table_id); + ut_ad(type == TRX_UNDO_INSERT_REC); + node->rec_type = type; + + node->update = NULL; + node->table = dict_table_get_on_id(table_id, node->trx); + + /* Skip the UNDO if we can't find the table or the .ibd file.
*/ + if (UNIV_UNLIKELY(node->table == NULL)) { + } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) { + node->table = NULL; + } else { + clust_index = dict_table_get_first_index(node->table); + + if (clust_index != NULL) { + ptr = trx_undo_rec_get_row_ref( + ptr, clust_index, &node->ref, node->heap); + } else { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: table "); + ut_print_name(stderr, node->trx, TRUE, + node->table->name); + fprintf(stderr, " has no indexes, " + "ignoring the table\n"); + + node->table = NULL; + } + } +} + +/*************************************************************** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. */ +UNIV_INTERN +ulint +row_undo_ins( +/*=========*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node) /* in: row undo node */ +{ + ut_ad(node); + ut_ad(node->state == UNDO_NODE_INSERT); + + row_undo_ins_parse_undo_rec(node); + + if (!node->table || !row_undo_search_clust_to_pcur(node)) { + trx_undo_rec_release(node->trx, node->undo_no); + + return(DB_SUCCESS); + } + + /* Iterate over all the indexes and undo the insert.*/ + + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + while (node->index != NULL) { + dtuple_t* entry; + ulint err; + + entry = row_build_index_entry(node->row, node->ext, + node->index, node->heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record. Because secondary index entries + are inserted after the clustered index record, + we may assume that the secondary index record + does not exist. However, this situation may + only occur during the rollback of incomplete + transactions. */ + ut_a(trx_is_recv(node->trx)); + } else { + err = row_undo_ins_remove_sec(node->index, entry); + + if (err != DB_SUCCESS) { + + return(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } + + return(row_undo_ins_remove_clust_rec(node)); +} diff --git a/storage/xtradb/row/row0umod.c b/storage/xtradb/row/row0umod.c new file mode 100644 index 00000000000..835f357fc8d --- /dev/null +++ b/storage/xtradb/row/row0umod.c @@ -0,0 +1,820 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#include "row0umod.h" + +#ifdef UNIV_NONINL +#include "row0umod.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0roll.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "row0undo.h" +#include "row0vers.h" +#include "trx0trx.h" +#include "trx0rec.h" +#include "row0row.h" +#include "row0upd.h" +#include "que0que.h" +#include "log0log.h" + +/* Considerations on undoing a modify operation. +(1) Undoing a delete marking: all index records should be found. Some of +them may have delete mark already FALSE, if the delete mark operation was +stopped underway, or if the undo operation ended prematurely because of a +system crash. +(2) Undoing an update of a delete unmarked record: the newer version of +an updated secondary index entry should be removed if no prior version +of the clustered index record requires its existence. Otherwise, it should +be delete marked. +(3) Undoing an update of a delete marked record. In this kind of update a +delete marked clustered index record was delete unmarked and possibly also +some of its fields were changed. Now, it is possible that the delete marked +version has become obsolete at the time the undo is started. */ + +/*************************************************************** +Checks if also the previous version of the clustered index record was +modified or inserted by the same transaction, and its undo number is such +that it should be undone in the same rollback. */ +UNIV_INLINE +ibool +row_undo_mod_undo_also_prev_vers( +/*=============================*/ + /* out: TRUE if also previous modify or + insert of this row should be undone */ + undo_node_t* node, /* in: row undo node */ + dulint* undo_no)/* out: the undo number */ +{ + trx_undo_rec_t* undo_rec; + trx_t* trx; + + trx = node->trx; + + if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) { + + *undo_no = ut_dulint_zero; + return(FALSE); + } + + undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap); + + *undo_no = trx_undo_rec_get_undo_no(undo_rec); + + return(ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0); +} + +/*************************************************************** +Undoes a modify in a clustered index record. 
*/ +static +ulint +row_undo_mod_clust_low( +/*===================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code: + we may run out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr, /* in: mtr; must be committed before + latching any further pages */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + ut_ad(success); + + if (mode == BTR_MODIFY_LEAF) { + + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + mem_heap_t* heap = NULL; + big_rec_t* dummy_big_rec; + + ut_ad(mode == BTR_MODIFY_TREE); + + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, &heap, &dummy_big_rec, node->update, + node->cmpl_info, thr, mtr); + + ut_a(!dummy_big_rec); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } + + return(err); +} + +/*************************************************************** +Removes a clustered index record after undo if possible. */ +static +ulint +row_undo_mod_remove_clust_low( +/*==========================*/ + /* out: DB_SUCCESS, DB_FAIL, or error code: + we may run out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr __attribute__((unused)), /* in: query thread */ + mtr_t* mtr, /* in: mtr */ + ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + ibool success; + + pcur = &(node->pcur); + btr_cur = btr_pcur_get_btr_cur(pcur); + + success = btr_pcur_restore_position(mode, pcur, mtr); + + if (!success) { + + return(DB_SUCCESS); + } + + /* Find out if we can remove the whole clustered index record */ + + if (node->rec_type == TRX_UNDO_UPD_DEL_REC + && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) { + + /* Ok, we can remove */ + } else { + return(DB_SUCCESS); + } + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, mtr); + + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* Note that since this operation is analogous to purge, + we can also free inherited externally stored fields: + hence the RB_NONE in the call below */ + + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, RB_NONE, mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + + return(err); +} + +/*************************************************************** +Undoes a modify in a clustered index record. Also sets the node state for the +next round of undo.
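+An optimistic pass with BTR_MODIFY_LEAF is tried first; only if it fails +is the mini-transaction restarted with BTR_MODIFY_TREE. The same two-step +pattern is repeated when removing a TRX_UNDO_UPD_DEL_REC record.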
*/ +static +ulint +row_undo_mod_clust( +/*===============*/ + /* out: DB_SUCCESS or error code: we may run + out of file space */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + btr_pcur_t* pcur; + mtr_t mtr; + ulint err; + ibool success; + ibool more_vers; + dulint new_undo_no; + + ut_ad(node && thr); + + /* Check if also the previous version of the clustered index record + should be undone in this same rollback operation */ + + more_vers = row_undo_mod_undo_also_prev_vers(node, &new_undo_no); + + pcur = &(node->pcur); + + mtr_start(&mtr); + + /* Try optimistic processing of the record, keeping changes within + the index page */ + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF); + + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a pessimistic + descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + + if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) { + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_LEAF); + if (err != DB_SUCCESS) { + btr_pcur_commit_specify_mtr(pcur, &mtr); + + /* We may have to modify tree structure: do a + pessimistic descent down the index tree */ + + mtr_start(&mtr); + + err = row_undo_mod_remove_clust_low(node, thr, &mtr, + BTR_MODIFY_TREE); + } + + btr_pcur_commit_specify_mtr(pcur, &mtr); + } + + node->state = UNDO_NODE_FETCH_NEXT; + + trx_undo_rec_release(node->trx, node->undo_no); + + if (more_vers && err == DB_SUCCESS) { + + /* Reserve the undo log record for the prior version after + committing &mtr: this is necessary to comply with the latching + order, as &mtr may contain the fsp latch which is lower in + the latch hierarchy than trx->undo_mutex. */ + + success = trx_undo_rec_reserve(node->trx, new_undo_no); + + if (success) { + node->state = UNDO_NODE_PREV_VERS; + } + } + + return(err); +} + +/*************************************************************** +Delete marks or removes a secondary index entry if found. */ +static +ulint +row_undo_mod_del_mark_or_remove_sec_low( +/*====================================*/ + /* out: DB_SUCCESS, DB_FAIL, or + DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry, /* in: index entry */ + ulint mode) /* in: latch mode BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ +{ + ibool found; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + ibool success; + ibool old_has; + ulint err; + mtr_t mtr; + mtr_t mtr_vers; + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, mode, &pcur, &mtr); + + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (!found) { + /* In crash recovery, the secondary index record may + be missing if the UPDATE did not have time to insert + the secondary index records before the crash. When we + are undoing that UPDATE in crash recovery, the record + may be missing. + + In normal processing, if an update ends in a deadlock + before it has inserted all updated secondary index + records, then the undo will not find those records. */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(DB_SUCCESS); + } + + /* We should remove the index record if no prior version of the row, + which cannot be purged yet, requires its existence. If some version + does, we should delete-mark the record.
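+ + For example: if the transaction being rolled back had updated a + secondary index column from 'a' to 'b', the entry for 'b' can be + removed outright only when no earlier, not-yet-purged version of + the row still needs it; otherwise the entry is merely delete-marked + and left for the purge to remove later.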
*/ + + mtr_start(&mtr_vers); + + success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur), + &mtr_vers); + ut_a(success); + + old_has = row_vers_old_has_index_entry(FALSE, + btr_pcur_get_rec(&(node->pcur)), + &mtr_vers, index, entry); + if (old_has) { + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, &mtr); + ut_ad(err == DB_SUCCESS); + } else { + /* Remove the index record */ + + if (mode == BTR_MODIFY_LEAF) { + success = btr_cur_optimistic_delete(btr_cur, &mtr); + if (success) { + err = DB_SUCCESS; + } else { + err = DB_FAIL; + } + } else { + ut_ad(mode == BTR_MODIFY_TREE); + + /* No need to distinguish RB_RECOVERY here, because we + are deleting a secondary index record: the distinction + between RB_NORMAL and RB_RECOVERY only matters when + deleting a record that contains externally stored + columns. */ + ut_ad(!dict_index_is_clust(index)); + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + RB_NORMAL, &mtr); + + /* The delete operation may fail if we have little + file space left: TODO: easiest to crash the database + and restart with more file space */ + } + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/*************************************************************** +Delete marks or removes a secondary index entry if found. +NOTE that if we updated the fields of a delete-marked secondary index record +so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot +return to the original values because we do not know them. But this should +not cause problems because in row0sel.c, in queries we always retrieve the +clustered index record or an earlier version of it, if the secondary index +record through which we do the search is delete-marked. */ +static +ulint +row_undo_mod_del_mark_or_remove_sec( +/*================================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + ulint err; + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_LEAF); + if (err == DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, + entry, BTR_MODIFY_TREE); + return(err); +} + +/*************************************************************** +Delete unmarks a secondary index entry which must be found. It might not be +delete-marked at the moment, but it does not harm to unmark it anyway. We also +need to update the fields of the secondary index record if we updated its +fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. */ +static +ulint +row_undo_mod_del_unmark_sec_and_undo_update( +/*========================================*/ + /* out: DB_FAIL or DB_SUCCESS or + DB_OUT_OF_FILE_SPACE */ + ulint mode, /* in: search mode: BTR_MODIFY_LEAF or + BTR_MODIFY_TREE */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: index */ + dtuple_t* entry) /* in: index entry */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + upd_t* update; + ulint err = DB_SUCCESS; + big_rec_t* dummy_big_rec; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + + /* Ignore indexes that are being created. 
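+ Such index names begin with the TEMP_INDEX_PREFIX byte, which is + exactly what the test below checks.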
*/ + if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) { + + return(DB_SUCCESS); + } + + log_free_check(); + mtr_start(&mtr); + + if (UNIV_UNLIKELY(!row_search_index_entry(index, entry, + mode, &pcur, &mtr))) { + fputs("InnoDB: error in sec index entry del undo in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); + putc('\n', stderr); + trx_print(stderr, trx, 0); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + } else { + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); + + err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, + btr_cur, FALSE, thr, &mtr); + ut_a(err == DB_SUCCESS); + heap = mem_heap_create(100); + + update = row_upd_build_sec_rec_difference_binary( + index, entry, btr_cur_get_rec(btr_cur), trx, heap); + if (upd_get_n_fields(update) == 0) { + + /* Do nothing */ + + } else if (mode == BTR_MODIFY_LEAF) { + /* Try an optimistic updating of the record, keeping + changes within the page */ + + err = btr_cur_optimistic_update( + BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, + btr_cur, update, 0, thr, &mtr); + switch (err) { + case DB_OVERFLOW: + case DB_UNDERFLOW: + case DB_ZIP_OVERFLOW: + err = DB_FAIL; + } + } else { + ut_a(mode == BTR_MODIFY_TREE); + err = btr_cur_pessimistic_update( + BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, + btr_cur, &heap, &dummy_big_rec, + update, 0, thr, &mtr); + ut_a(!dummy_big_rec); + } + + mem_heap_free(heap); + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + return(err); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is UPD_DEL. */ +static +ulint +row_undo_mod_upd_del_sec( +/*=====================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err = DB_SUCCESS; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, node->ext, + index, heap); + if (UNIV_UNLIKELY(!entry)) { + /* The database must have crashed after + inserting a clustered index record but before + writing all the externally stored columns of + that record. Because secondary index entries + are inserted after the clustered index record, + we may assume that the secondary index record + does not exist. However, this situation may + only occur during the rollback of incomplete + transactions. */ + ut_a(trx_is_recv(thr_get_trx(thr))); + } else { + err = row_undo_mod_del_mark_or_remove_sec( + node, thr, index, entry); + + if (err != DB_SUCCESS) { + + break; + } + } + + mem_heap_empty(heap); + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(err); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is DEL_MARK. 
*/ +static +ulint +row_undo_mod_del_mark_sec( +/*======================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + entry = row_build_index_entry(node->row, node->ext, + index, heap); + ut_a(entry); + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err != DB_SUCCESS) { + + mem_heap_free(heap); + + return(err); + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Undoes a modify in secondary indexes when undo record type is UPD_EXIST. */ +static +ulint +row_undo_mod_upd_exist_sec( +/*=======================*/ + /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + mem_heap_t* heap; + dtuple_t* entry; + dict_index_t* index; + ulint err; + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + /* No change in secondary indexes */ + + return(DB_SUCCESS); + } + + heap = mem_heap_create(1024); + + while (node->index != NULL) { + index = node->index; + + if (row_upd_changes_ord_field_binary(node->row, node->index, + node->update)) { + + /* Build the newest version of the index entry */ + entry = row_build_index_entry(node->row, node->ext, + index, heap); + ut_a(entry); + /* NOTE that if we updated the fields of a + delete-marked secondary index record so that + alphabetically they stayed the same, e.g., + 'abc' -> 'aBc', we cannot return to the original + values because we do not know them. But this should + not cause problems because in row0sel.c, in queries + we always retrieve the clustered index record or an + earlier version of it, if the secondary index record + through which we do the search is delete-marked. */ + + err = row_undo_mod_del_mark_or_remove_sec(node, thr, + index, + entry); + if (err != DB_SUCCESS) { + mem_heap_free(heap); + + return(err); + } + + /* We may have to update the delete mark in the + secondary index record of the previous version of + the row. We also need to update the fields of + the secondary index record if we updated its fields + but alphabetically they stayed the same, e.g., + 'abc' -> 'aBc'. */ + mem_heap_empty(heap); + entry = row_build_index_entry(node->undo_row, + node->undo_ext, + index, heap); + ut_a(entry); + + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_LEAF, thr, index, entry); + if (err == DB_FAIL) { + err = row_undo_mod_del_unmark_sec_and_undo_update( + BTR_MODIFY_TREE, thr, index, entry); + } + + if (err != DB_SUCCESS) { + mem_heap_free(heap); + + return(err); + } + } + + node->index = dict_table_get_next_index(node->index); + } + + mem_heap_free(heap); + + return(DB_SUCCESS); +} + +/*************************************************************** +Parses the row reference and other info in a modify undo log record. 
*/ +static +void +row_undo_mod_parse_undo_rec( +/*========================*/ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* clust_index; + byte* ptr; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + ulint info_bits; + ulint type; + ulint cmpl_info; + ibool dummy_extern; + trx_t* trx; + + ut_ad(node && thr); + trx = thr_get_trx(thr); + ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + node->rec_type = type; + + node->table = dict_table_get_on_id(table_id, trx); + + /* TODO: other fixes associated with DROP TABLE + rollback in the + same table by another user */ + + if (node->table == NULL) { + /* Table was dropped */ + return; + } + + if (node->table->ibd_file_missing) { + /* We skip undo operations to missing .ibd files */ + node->table = NULL; + + return; + } + + clust_index = dict_table_get_first_index(node->table); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), + node->heap); + + trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, + roll_ptr, info_bits, trx, + node->heap, &(node->update)); + node->new_roll_ptr = roll_ptr; + node->new_trx_id = trx_id; + node->cmpl_info = cmpl_info; +} + +/*************************************************************** +Undoes a modify operation on a row of a table. */ +UNIV_INTERN +ulint +row_undo_mod( +/*=========*/ + /* out: DB_SUCCESS or error code */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + ut_ad(node && thr); + ut_ad(node->state == UNDO_NODE_MODIFY); + + row_undo_mod_parse_undo_rec(node, thr); + + if (!node->table || !row_undo_search_clust_to_pcur(node)) { + /* It is already undone, or will be undone by another query + thread, or table was dropped */ + + trx_undo_rec_release(node->trx, node->undo_no); + node->state = UNDO_NODE_FETCH_NEXT; + + return(DB_SUCCESS); + } + + node->index = dict_table_get_next_index( + dict_table_get_first_index(node->table)); + + if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { + + err = row_undo_mod_upd_exist_sec(node, thr); + + } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) { + + err = row_undo_mod_del_mark_sec(node, thr); + } else { + ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); + err = row_undo_mod_upd_del_sec(node, thr); + } + + if (err != DB_SUCCESS) { + + return(err); + } + + err = row_undo_mod_clust(node, thr); + + return(err); +} diff --git a/storage/xtradb/row/row0undo.c b/storage/xtradb/row/row0undo.c new file mode 100644 index 00000000000..d372f88e207 --- /dev/null +++ b/storage/xtradb/row/row0undo.c @@ -0,0 +1,378 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to the undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that the row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to a situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither the secondary index nor the clustered index
+ord field: then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates a secondary index ord field but not a clustered one: then in
+the secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index +is unique. Then the record in secondary index is just updated at the +clustered ord field. +(4) + +Problem with duplicate records: +Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a +bigger trx id has inserted and delete marked a similar row, our trx inserts +again a similar row, and a trx with an even bigger id delete marks it. Then +the position of the row should change in the index if the trx id affects +the alphabetical ordering. + +Fix 2: If an insert encounters a similar row marked deleted, we turn the +insert into an 'update' of the row marked deleted. Then we must write undo +info on the update. A problem: what if a purge operation tries to remove +the delete marked row? + +We can think of the database row versions as a linked list which starts +from the record in the clustered index, and is linked by roll ptrs +through undo logs. The secondary index records are references which tell +what kinds of records can be found in this linked list for a record +in the clustered index. + +How to do the purge? A record can be removed from the clustered index +if its linked list becomes empty, i.e., the row has been marked deleted +and its roll ptr points to the record in the undo log we are going through, +doing the purge. Similarly, during a rollback, a record can be removed +if the stored roll ptr in the undo log points to a trx already (being) purged, +or if the roll ptr is NULL, i.e., it was a fresh insert. */ + +/************************************************************************ +Creates a row undo node to a query graph. */ +UNIV_INTERN +undo_node_t* +row_undo_node_create( +/*=================*/ + /* out, own: undo node */ + trx_t* trx, /* in: transaction */ + que_thr_t* parent, /* in: parent node, i.e., a thr node */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + undo_node_t* undo; + + ut_ad(trx && parent && heap); + + undo = mem_heap_alloc(heap, sizeof(undo_node_t)); + + undo->common.type = QUE_NODE_UNDO; + undo->common.parent = parent; + + undo->state = UNDO_NODE_FETCH_NEXT; + undo->trx = trx; + + btr_pcur_init(&(undo->pcur)); + + undo->heap = mem_heap_create(256); + + return(undo); +} + +/*************************************************************** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. 
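A usage sketch for row_undo_node_create() above (the caller here is hypothetical; trx, thr and heap are placeholders for the values the rollback machinery passes in): the node starts in state UNDO_NODE_FETCH_NEXT and is then driven step by step by the query graph executor.

	mem_heap_t*	heap = mem_heap_create(512);
	undo_node_t*	undo = row_undo_node_create(trx, thr, heap);

	ut_a(undo->state == UNDO_NODE_FETCH_NEXT);
	ut_a(undo->trx == trx);

	/* The executor now calls row_undo_step() repeatedly; each
	call pops one undo log record and reverses its effect. */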
*/ +UNIV_INTERN +ibool +row_undo_search_clust_to_pcur( +/*==========================*/ + /* out: TRUE if found; NOTE the node->pcur + must be closed by the caller, regardless of + the return value */ + undo_node_t* node) /* in: row undo node */ +{ + dict_index_t* clust_index; + ibool found; + mtr_t mtr; + ibool ret; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + mtr_start(&mtr); + + clust_index = dict_table_get_first_index(node->table); + + found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF, + node->table, node->ref, &mtr); + + rec = btr_pcur_get_rec(&(node->pcur)); + + offsets = rec_get_offsets(rec, clust_index, offsets, + ULINT_UNDEFINED, &heap); + + if (!found || 0 != ut_dulint_cmp(node->roll_ptr, + row_get_rec_roll_ptr(rec, clust_index, + offsets))) { + + /* We must remove the reservation on the undo log record + BEFORE releasing the latch on the clustered index page: this + is to make sure that some thread will eventually undo the + modification corresponding to node->roll_ptr. */ + + /* fputs("--------------------undoing a previous version\n", + stderr); */ + + ret = FALSE; + } else { + node->row = row_build(ROW_COPY_DATA, clust_index, rec, + offsets, NULL, &node->ext, node->heap); + if (node->update) { + node->undo_row = dtuple_copy(node->row, node->heap); + row_upd_replace(node->undo_row, &node->undo_ext, + clust_index, node->update, node->heap); + } else { + node->undo_row = NULL; + node->undo_ext = NULL; + } + + btr_pcur_store_position(&(node->pcur), &mtr); + + ret = TRUE; + } + + btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(ret); +} + +/*************************************************************** +Fetches an undo log record and does the undo for the recorded operation. +If none left, or a partial rollback completed, returns control to the +parent node, which is always a query thread node. */ +static +ulint +row_undo( +/*=====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + undo_node_t* node, /* in: row undo node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + trx_t* trx; + dulint roll_ptr; + ibool locked_data_dict; + + ut_ad(node && thr); + + trx = node->trx; + + if (node->state == UNDO_NODE_FETCH_NEXT) { + + node->undo_rec = trx_roll_pop_top_rec_of_trx(trx, + trx->roll_limit, + &roll_ptr, + node->heap); + if (!node->undo_rec) { + /* Rollback completed for this query thread */ + + thr->run_node = que_node_get_parent(node); + + return(DB_SUCCESS); + } + + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + + } else if (node->state == UNDO_NODE_PREV_VERS) { + + /* Undo should be done to the same clustered index record + again in this same rollback, restoring the previous version */ + + roll_ptr = node->new_roll_ptr; + + node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr, + node->heap); + node->roll_ptr = roll_ptr; + node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + node->state = UNDO_NODE_INSERT; + } else { + node->state = UNDO_NODE_MODIFY; + } + } + + /* Prevent DROP TABLE etc. while we are rolling back this row. 
+ If we are doing a TABLE CREATE or some other dictionary operation, + then we already have dict_operation_lock locked in x-mode. Do not + try to lock again, because that would cause a hang. */ + + locked_data_dict = (trx->dict_operation_lock_mode == 0); + + if (locked_data_dict) { + + row_mysql_lock_data_dictionary(trx); + } + + if (node->state == UNDO_NODE_INSERT) { + + err = row_undo_ins(node); + + node->state = UNDO_NODE_FETCH_NEXT; + } else { + ut_ad(node->state == UNDO_NODE_MODIFY); + err = row_undo_mod(node, thr); + } + + if (locked_data_dict) { + + row_mysql_unlock_data_dictionary(trx); + } + + /* Do some cleanup */ + btr_pcur_close(&(node->pcur)); + + mem_heap_empty(node->heap); + + thr->run_node = node; + + return(err); +} + +/*************************************************************** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. */ +UNIV_INTERN +que_thr_t* +row_undo_step( +/*==========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + undo_node_t* node; + trx_t* trx; + + ut_ad(thr); + + srv_activity_count++; + + trx = thr_get_trx(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); + + err = row_undo(node, thr); + + trx->error_state = err; + + if (err != DB_SUCCESS) { + /* SQL error detected */ + + fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", + (ulong) err); + + if (err == DB_OUT_OF_FILE_SPACE) { + fprintf(stderr, + "InnoDB: Error 13 means out of tablespace.\n" + "InnoDB: Consider increasing" + " your tablespace.\n"); + + exit(1); + } + + ut_error; + + return(NULL); + } + + return(thr); +} diff --git a/storage/xtradb/row/row0upd.c b/storage/xtradb/row/row0upd.c new file mode 100644 index 00000000000..740f1ee593d --- /dev/null +++ b/storage/xtradb/row/row0upd.c @@ -0,0 +1,2168 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+   -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+	Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for the previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged will the index records be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on a page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way of performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
+/***************************************************************
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
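A condensed sketch of the searched-delete description above (hedged: the real execution path spans several files; the call shown matches the one used later in this file by row_upd_clust_rec_by_insert(), and btr_cur, thr and mtr are placeholders):

	/* A searched delete leaves the tree structure untouched: it
	only sets the delete-mark bit and replaces the system columns
	(trx id, roll ptr) of the clustered index record. */
	err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
					     btr_cur, TRUE, thr, mtr);

	/* Physical removal happens only at purge time, once no
	consistent read can still need the old version. */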
*/ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + /* out: TRUE if changes */ + dtuple_t* entry, /* in: old value of index entry */ + dict_index_t* index, /* in: index of entry */ + const upd_t* update, /* in: update vector for the row */ + ulint n); /* in: how many first fields to check */ + + +/************************************************************************* +Checks if index currently is mentioned as a referenced index in a foreign +key constraint. */ +static +ibool +row_upd_index_is_referenced( +/*========================*/ + /* out: TRUE if referenced; NOTE that since + we do not hold dict_operation_lock + when leaving the function, it may be that + the referencing table has been dropped when + we leave this function: this function is only + for heuristic use! */ + dict_index_t* index, /* in: index */ + trx_t* trx) /* in: transaction */ +{ + dict_table_t* table = index->table; + dict_foreign_t* foreign; + ibool froze_data_dict = FALSE; + ibool is_referenced = FALSE; + + if (!UT_LIST_GET_FIRST(table->referenced_list)) { + + return(FALSE); + } + + if (trx->dict_operation_lock_mode == 0) { + row_mysql_freeze_data_dictionary(trx); + froze_data_dict = TRUE; + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign) { + if (foreign->referenced_index == index) { + + is_referenced = TRUE; + goto func_exit; + } + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + +func_exit: + if (froze_data_dict) { + row_mysql_unfreeze_data_dictionary(trx); + } + + return(is_referenced); +} + +/************************************************************************* +Checks if possible foreign key constraints hold after a delete of the record +under pcur. NOTE that this function will temporarily commit mtr and lose the +pcur position! */ +static +ulint +row_upd_check_references_constraints( +/*=================================*/ + /* out: DB_SUCCESS or an error code */ + upd_node_t* node, /* in: row update node */ + btr_pcur_t* pcur, /* in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /* in: table in question */ + dict_index_t* index, /* in: index of the cursor */ + ulint* offsets,/* in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + trx_t* trx; + const rec_t* rec; + ulint n_ext; + ulint err; + ibool got_s_lock = FALSE; + + if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) { + + return(DB_SUCCESS); + } + + trx = thr_get_trx(thr); + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, + &n_ext, heap); + + mtr_commit(mtr); + + mtr_start(mtr); + + if (trx->dict_operation_lock_mode == 0) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign) { + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. 
*/ + + if (foreign->referenced_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + + if (foreign->foreign_table == NULL) { + dict_table_get(foreign->foreign_table_name, + FALSE); + } + + if (foreign->foreign_table) { + mutex_enter(&(dict_sys->mutex)); + + (foreign->foreign_table + ->n_foreign_key_checks_running)++; + + mutex_exit(&(dict_sys->mutex)); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects 'foreign' from + being dropped while the check is running. */ + + err = row_ins_check_foreign_constraint( + FALSE, foreign, table, entry, thr); + + if (foreign->foreign_table) { + mutex_enter(&(dict_sys->mutex)); + + ut_a(foreign->foreign_table + ->n_foreign_key_checks_running > 0); + + (foreign->foreign_table + ->n_foreign_key_checks_running)--; + + mutex_exit(&(dict_sys->mutex)); + } + + if (err != DB_SUCCESS) { + + goto func_exit; + } + } + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + err = DB_SUCCESS; + +func_exit: + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + mem_heap_free(heap); + + return(err); +} + +/************************************************************************* +Creates an update node for a query graph. */ +UNIV_INTERN +upd_node_t* +upd_node_create( +/*============*/ + /* out, own: update node */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + upd_node_t* node; + + node = mem_heap_alloc(heap, sizeof(upd_node_t)); + node->common.type = QUE_NODE_UPDATE; + + node->state = UPD_NODE_UPDATE_CLUSTERED; + node->in_mysql_interface = FALSE; + + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + node->index = NULL; + node->update = NULL; + + node->foreign = NULL; + node->cascade_heap = NULL; + node->cascade_node = NULL; + + node->select = NULL; + + node->heap = mem_heap_create(128); + node->magic_n = UPD_NODE_MAGIC_N; + + node->cmpl_info = 0; + + return(node); +} + +/************************************************************************* +Updates the trx id and roll ptr field in a clustered index record in database +recovery. */ +UNIV_INTERN +void +row_upd_rec_sys_fields_in_recovery( +/*===============================*/ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + ulint pos, /* in: TRX_ID position in rec */ + dulint trx_id, /* in: transaction id */ + dulint roll_ptr)/* in: roll ptr of the undo log record */ +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_trx_id_and_roll_ptr( + page_zip, rec, offsets, pos, trx_id, roll_ptr); + } else { + byte* field; + ulint len; + + field = rec_get_nth_field(rec, offsets, pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); +#if DATA_TRX_ID + 1 != DATA_ROLL_PTR +# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR" +#endif + trx_write_trx_id(field, trx_id); + trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr); + } +} + +/************************************************************************* +Sets the trx id or roll ptr field of a clustered index entry. 
*/ +UNIV_INTERN +void +row_upd_index_entry_sys_field( +/*==========================*/ + const dtuple_t* entry, /* in: index entry, where the memory buffers + for sys fields are already allocated: + the function just copies the new values to + them */ + dict_index_t* index, /* in: clustered index */ + ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ + dulint val) /* in: value to write */ +{ + dfield_t* dfield; + byte* field; + ulint pos; + + ut_ad(dict_index_is_clust(index)); + + pos = dict_index_get_sys_col_pos(index, type); + + dfield = dtuple_get_nth_field(entry, pos); + field = dfield_get_data(dfield); + + if (type == DATA_TRX_ID) { + trx_write_trx_id(field, val); + } else { + ut_ad(type == DATA_ROLL_PTR); + trx_write_roll_ptr(field, val); + } +} + +/*************************************************************** +Returns TRUE if row update changes size of some field in index or if some +field to be updated is stored externally in rec or update. */ +UNIV_INTERN +ibool +row_upd_changes_field_size_or_external( +/*===================================*/ + /* out: TRUE if the update changes the size of + some field in index or the field is external + in rec or update */ + dict_index_t* index, /* in: index */ + const ulint* offsets,/* in: rec_get_offsets(rec, index) */ + const upd_t* update) /* in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint old_len; + ulint new_len; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + new_len = dfield_get_len(new_val); + + if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) { + /* A bug fixed on Dec 31st, 2004: we looked at the + SQL NULL size from the wrong field! We may backport + this fix also to 4.0. The merge to 5.0 will be made + manually immediately after we commit this to 4.1. */ + + new_len = dict_col_get_sql_null_size( + dict_index_get_nth_col(index, + upd_field->field_no)); + } + + old_len = rec_offs_nth_size(offsets, upd_field->field_no); + + if (rec_offs_comp(offsets) + && rec_offs_nth_sql_null(offsets, + upd_field->field_no)) { + /* Note that in the compact table format, for a + variable length field, an SQL NULL will use zero + bytes in the offset array at the start of the physical + record, but a zero-length value (empty string) will + use one byte! Thus, we cannot use update-in-place + if we update an SQL NULL varchar to an empty string! */ + + old_len = UNIV_SQL_NULL; + } + + if (dfield_is_ext(new_val) || old_len != new_len + || rec_offs_nth_extern(offsets, upd_field->field_no)) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the record +given. No field size changes are allowed. 
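row_upd_changes_field_size_or_external() above is what decides whether an update may be done in place. A hedged sketch of the decision as a caller might make it (err, btr_cur, cmpl_info, thr and mtr are placeholders; compare row_upd_clust_rec() later in this file, which relies on the precomputed UPD_NODE_NO_SIZE_CHANGE flag instead of calling the check directly):

	if (!row_upd_changes_field_size_or_external(index, offsets,
						    update)) {
		/* Every field keeps its size and none is stored
		externally: the record bytes can be overwritten */
		err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur,
					      update, cmpl_info, thr, mtr);
	} else {
		/* Otherwise the record must be deleted and
		reinserted, within the page if possible */
		err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
						update, cmpl_info, thr, mtr);
	}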
*/ +UNIV_INTERN +void +row_upd_rec_in_place( +/*=================*/ + rec_t* rec, /* in/out: record where replaced */ + dict_index_t* index, /* in: the index the record belongs to */ + const ulint* offsets,/* in: array returned by rec_get_offsets() */ + const upd_t* update, /* in: update vector */ + page_zip_des_t* page_zip)/* in: compressed page with enough space + available, or NULL */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (rec_offs_comp(offsets)) { + rec_set_info_bits_new(rec, update->info_bits); + } else { + rec_set_info_bits_old(rec, update->info_bits); + } + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + ut_ad(!dfield_is_ext(new_val) == + !rec_offs_nth_extern(offsets, upd_field->field_no)); + + rec_set_nth_field(rec, offsets, upd_field->field_no, + dfield_get_data(new_val), + dfield_get_len(new_val)); + } + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_rec(page_zip, rec, index, offsets, 0); + } +} + +/************************************************************************* +Writes into the redo log the values of trx id and roll ptr and enough info +to determine their positions within a clustered index record. */ +UNIV_INTERN +byte* +row_upd_write_sys_vals_to_log( +/*==========================*/ + /* out: new pointer to mlog */ + dict_index_t* index, /* in: clustered index */ + trx_t* trx, /* in: transaction */ + dulint roll_ptr,/* in: roll ptr of the undo log record */ + byte* log_ptr,/* pointer to a buffer of size > 20 opened + in mlog */ + mtr_t* mtr __attribute__((unused))) /* in: mtr */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(mtr); + + log_ptr += mach_write_compressed(log_ptr, + dict_index_get_sys_col_pos( + index, DATA_TRX_ID)); + + trx_write_roll_ptr(log_ptr, roll_ptr); + log_ptr += DATA_ROLL_PTR_LEN; + + log_ptr += mach_dulint_write_compressed(log_ptr, trx->id); + + return(log_ptr); +} + +/************************************************************************* +Parses the log data of system field values. */ +UNIV_INTERN +byte* +row_upd_parse_sys_vals( +/*===================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + ulint* pos, /* out: TRX_ID position in record */ + dulint* trx_id, /* out: trx id */ + dulint* roll_ptr)/* out: roll ptr */ +{ + ptr = mach_parse_compressed(ptr, end_ptr, pos); + + if (ptr == NULL) { + + return(NULL); + } + + if (end_ptr < ptr + DATA_ROLL_PTR_LEN) { + + return(NULL); + } + + *roll_ptr = trx_read_roll_ptr(ptr); + ptr += DATA_ROLL_PTR_LEN; + + ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id); + + return(ptr); +} + +/*************************************************************** +Writes to the redo log the new values of the fields occurring in the index. 
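row_upd_write_sys_vals_to_log() and row_upd_parse_sys_vals() above form a matched writer/parser pair and must agree on the byte layout: the compressed TRX_ID position, then the roll ptr, then the compressed trx id. A round-trip sketch (buf and old_roll_ptr are hypothetical; index, trx and mtr are placeholders):

	byte	buf[32];	/* the writer requires > 20 bytes */
	ulint	pos;
	dulint	trx_id;
	dulint	roll_ptr;
	byte*	end;

	end = row_upd_write_sys_vals_to_log(index, trx, old_roll_ptr,
					    buf, mtr);

	/* The parser consumes exactly the bytes that were written */
	ut_a(row_upd_parse_sys_vals(buf, end, &pos, &trx_id, &roll_ptr)
	     == end);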
*/ +UNIV_INTERN +void +row_upd_index_write_log( +/*====================*/ + const upd_t* update, /* in: update vector */ + byte* log_ptr,/* in: pointer to mlog buffer: must + contain at least MLOG_BUF_MARGIN bytes + of free space; the buffer is closed + within this function */ + mtr_t* mtr) /* in: mtr into whose log to write */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint len; + ulint n_fields; + byte* buf_end; + ulint i; + + n_fields = upd_get_n_fields(update); + + buf_end = log_ptr + MLOG_BUF_MARGIN; + + mach_write_to_1(log_ptr, update->info_bits); + log_ptr++; + log_ptr += mach_write_compressed(log_ptr, n_fields); + + for (i = 0; i < n_fields; i++) { + +#if MLOG_BUF_MARGIN <= 30 +# error "MLOG_BUF_MARGIN <= 30" +#endif + + if (log_ptr + 30 > buf_end) { + mlog_close(mtr, log_ptr); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + + upd_field = upd_get_nth_field(update, i); + + new_val = &(upd_field->new_val); + + len = dfield_get_len(new_val); + + log_ptr += mach_write_compressed(log_ptr, upd_field->field_no); + log_ptr += mach_write_compressed(log_ptr, len); + + if (len != UNIV_SQL_NULL) { + if (log_ptr + len < buf_end) { + memcpy(log_ptr, dfield_get_data(new_val), len); + + log_ptr += len; + } else { + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, + dfield_get_data(new_val), + len); + + log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN); + buf_end = log_ptr + MLOG_BUF_MARGIN; + } + } + } + + mlog_close(mtr, log_ptr); +} + +/************************************************************************* +Parses the log data written by row_upd_index_write_log. */ +UNIV_INTERN +byte* +row_upd_index_parse( +/*================*/ + /* out: log data end or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + mem_heap_t* heap, /* in: memory heap where update vector is + built */ + upd_t** update_out)/* out: update vector */ +{ + upd_t* update; + upd_field_t* upd_field; + dfield_t* new_val; + ulint len; + ulint n_fields; + ulint info_bits; + ulint i; + + if (end_ptr < ptr + 1) { + + return(NULL); + } + + info_bits = mach_read_from_1(ptr); + ptr++; + ptr = mach_parse_compressed(ptr, end_ptr, &n_fields); + + if (ptr == NULL) { + + return(NULL); + } + + update = upd_create(n_fields, heap); + update->info_bits = info_bits; + + for (i = 0; i < n_fields; i++) { + ulint field_no; + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + + ptr = mach_parse_compressed(ptr, end_ptr, &field_no); + + if (ptr == NULL) { + + return(NULL); + } + + upd_field->field_no = field_no; + + ptr = mach_parse_compressed(ptr, end_ptr, &len); + + if (ptr == NULL) { + + return(NULL); + } + + if (len != UNIV_SQL_NULL) { + + if (end_ptr < ptr + len) { + + return(NULL); + } + + dfield_set_data(new_val, + mem_heap_dup(heap, ptr, len), len); + ptr += len; + } else { + dfield_set_null(new_val); + } + } + + *update_out = update; + + return(ptr); +} + +/******************************************************************* +Builds an update vector from those fields which in a secondary index entry +differ from a record that has the equal ordering fields. NOTE: we compare +the fields as binary strings! 
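A usage sketch for the difference builder described above; this mirrors the way the undo code earlier in this patch (row0umod.c) uses it:

	update = row_upd_build_sec_rec_difference_binary(
		index, entry, btr_cur_get_rec(btr_cur), trx, heap);

	if (upd_get_n_fields(update) == 0) {
		/* The existing secondary index record already matches
		the entry byte for byte: nothing needs to change */
	}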
*/ +UNIV_INTERN +upd_t* +row_upd_build_sec_rec_difference_binary( +/*====================================*/ + /* out, own: update vector of differing + fields */ + dict_index_t* index, /* in: index */ + const dtuple_t* entry, /* in: entry to insert */ + const rec_t* rec, /* in: secondary index record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap) /* in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint i; + ulint offsets_[REC_OFFS_SMALL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + /* This function is used only for a secondary index */ + ut_a(!dict_index_is_clust(index)); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE that it may be that len != dfield_get_len(dfield) if we + are updating in a character set and collation where strings of + different length can be equal in an alphabetical comparison, + and also in the case where we have a column prefix index + and the last characters in the index field are spaces; the + latter case probably caused the assertion failures reported at + row0upd.c line 713 in versions 4.0.14 - 4.0.16. */ + + /* NOTE: we compare the fields as binary strings! + (No collation) */ + + if (!dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, trx); + + n_diff++; + } + } + + update->n_fields = n_diff; + + return(update); +} + +/******************************************************************* +Builds an update vector from those fields, excluding the roll ptr and +trx id fields, which in an index entry differ from a record that has +the equal ordering fields. NOTE: we compare the fields as binary strings! */ +UNIV_INTERN +upd_t* +row_upd_build_difference_binary( +/*============================*/ + /* out, own: update vector of differing + fields, excluding roll ptr and trx id */ + dict_index_t* index, /* in: clustered index */ + const dtuple_t* entry, /* in: entry to insert */ + const rec_t* rec, /* in: clustered index record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap) /* in: memory heap from which allocated */ +{ + upd_field_t* upd_field; + const dfield_t* dfield; + const byte* data; + ulint len; + upd_t* update; + ulint n_diff; + ulint roll_ptr_pos; + ulint trx_id_pos; + ulint i; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + rec_offs_init(offsets_); + + /* This function is used only for a clustered index */ + ut_a(dict_index_is_clust(index)); + + update = upd_create(dtuple_get_n_fields(entry), heap); + + n_diff = 0; + + roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); + trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + for (i = 0; i < dtuple_get_n_fields(entry); i++) { + + data = rec_get_nth_field(rec, offsets, i, &len); + + dfield = dtuple_get_nth_field(entry, i); + + /* NOTE: we compare the fields as binary strings! 
+ (No collation) */ + + if (i == trx_id_pos || i == roll_ptr_pos) { + + goto skip_compare; + } + + if (UNIV_UNLIKELY(!dfield_is_ext(dfield) + != !rec_offs_nth_extern(offsets, i)) + || !dfield_data_is_binary_equal(dfield, len, data)) { + + upd_field = upd_get_nth_field(update, n_diff); + + dfield_copy(&(upd_field->new_val), dfield); + + upd_field_set_field_no(upd_field, i, index, trx); + + n_diff++; + } +skip_compare: + ; + } + + update->n_fields = n_diff; + + return(update); +} + +/*************************************************************** +Fetch a prefix of an externally stored column. This is similar +to row_ext_lookup(), but the row_ext_t holds the old values +of the column and must not be poisoned with the new values. */ +static +byte* +row_upd_ext_fetch( +/*==============*/ + /* out: BLOB prefix */ + const byte* data, /* in: 'internally' stored part of the + field containing also the reference to + the external part */ + ulint local_len, /* in: length of data, in bytes */ + ulint zip_size, /* in: nonzero=compressed BLOB + page size, zero for uncompressed + BLOBs */ + ulint* len, /* in: length of prefix to fetch; + out: fetched length of the prefix */ + mem_heap_t* heap) /* in: heap where to allocate */ +{ + byte* buf = mem_heap_alloc(heap, *len); + + *len = btr_copy_externally_stored_field_prefix(buf, *len, + zip_size, + data, local_len); + /* We should never update records containing a half-deleted BLOB. */ + ut_a(*len); + + return(buf); +} + +/*************************************************************** +Replaces the new column value stored in the update vector in +the given index entry field. */ +static +void +row_upd_index_replace_new_col_val( +/*==============================*/ + dfield_t* dfield, /* in/out: data field + of the index entry */ + const dict_field_t* field, /* in: index field */ + const dict_col_t* col, /* in: field->col */ + const upd_field_t* uf, /* in: update field */ + mem_heap_t* heap, /* in: memory heap for allocating + and copying the new value */ + ulint zip_size)/* in: compressed page + size of the table, or 0 */ +{ + ulint len; + const byte* data; + + dfield_copy_data(dfield, &uf->new_val); + + if (dfield_is_null(dfield)) { + return; + } + + len = dfield_get_len(dfield); + data = dfield_get_data(dfield); + + if (field->prefix_len > 0) { + ibool fetch_ext = dfield_is_ext(dfield) + && len < (ulint) field->prefix_len + + BTR_EXTERN_FIELD_REF_SIZE; + + if (fetch_ext) { + ulint l = len; + + len = field->prefix_len; + + data = row_upd_ext_fetch(data, l, zip_size, + &len, heap); + } + + len = dtype_get_at_most_n_mbchars(col->prtype, + col->mbminlen, col->mbmaxlen, + field->prefix_len, len, + (const char*) data); + + dfield_set_data(dfield, data, len); + + if (!fetch_ext) { + dfield_dup(dfield, heap); + } + + return; + } + + switch (uf->orig_len) { + byte* buf; + case BTR_EXTERN_FIELD_REF_SIZE: + /* Restore the original locally stored + part of the column. In the undo log, + InnoDB writes a longer prefix of externally + stored columns, so that column prefixes + in secondary indexes can be reconstructed. */ + dfield_set_data(dfield, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + dfield_set_ext(dfield); + /* fall through */ + case 0: + dfield_dup(dfield, heap); + break; + default: + /* Reconstruct the original locally + stored part of the column. The data + will have to be copied. */ + ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE); + buf = mem_heap_alloc(heap, uf->orig_len); + /* Copy the locally stored prefix. 
*/ + memcpy(buf, data, + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE); + /* Copy the BLOB pointer. */ + memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE, + data + len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + + dfield_set_data(dfield, buf, uf->orig_len); + dfield_set_ext(dfield); + break; + } +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals_index_pos( +/*=========================================*/ + dtuple_t* entry, /* in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /* in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /* in: an update vector built for the index so + that the field number in an upd_field is the + index position */ + ibool order_only, + /* in: if TRUE, limit the replacement to + ordering fields of index; note that this + does not work for non-clustered indexes. */ + mem_heap_t* heap) /* in: memory heap for allocating and + copying the new values */ +{ + ulint i; + ulint n_fields; + const ulint zip_size = dict_table_zip_size(index->table); + + ut_ad(index); + + dtuple_set_info_bits(entry, update->info_bits); + + if (order_only) { + n_fields = dict_index_get_n_unique(index); + } else { + n_fields = dict_index_get_n_fields(index); + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + uf = upd_get_field_by_field_no(update, i); + + if (uf) { + row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size); + } + } +} + +/*************************************************************** +Replaces the new column values stored in the update vector to the index entry +given. */ +UNIV_INTERN +void +row_upd_index_replace_new_col_vals( +/*===============================*/ + dtuple_t* entry, /* in/out: index entry where replaced; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + dict_index_t* index, /* in: index; NOTE that this may also be a + non-clustered index */ + const upd_t* update, /* in: an update vector built for the + CLUSTERED index so that the field number in + an upd_field is the clustered index position */ + mem_heap_t* heap) /* in: memory heap for allocating and + copying the new values */ +{ + ulint i; + const dict_index_t* clust_index + = dict_table_get_first_index(index->table); + const ulint zip_size + = dict_table_zip_size(index->table); + + dtuple_set_info_bits(entry, update->info_bits); + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field; + const dict_col_t* col; + const upd_field_t* uf; + + field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(field); + uf = upd_get_field_by_field_no( + update, dict_col_get_clust_pos(col, clust_index)); + + if (uf) { + row_upd_index_replace_new_col_val( + dtuple_get_nth_field(entry, i), + field, col, uf, heap, zip_size); + } + } +} + +/*************************************************************** +Replaces the new column values stored in the update vector. 
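The two entry-replacement variants above differ only in how upd_field->field_no is interpreted: as a position in this index (the _index_pos variant) or as a position in the clustered index. A hedged usage sketch (sec_entry, sec_index, entry, index, update and heap are placeholders):

	/* update vector built for the clustered index: columns of a
	secondary index entry are mapped through
	dict_col_get_clust_pos() internally */
	row_upd_index_replace_new_col_vals(sec_entry, sec_index,
					   update, heap);

	/* update vector built for this index itself: the field
	numbers are already index positions */
	row_upd_index_replace_new_col_vals_index_pos(entry, index,
						     update, FALSE, heap);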
*/ +UNIV_INTERN +void +row_upd_replace( +/*============*/ + dtuple_t* row, /* in/out: row where replaced, + indexed by col_no; + the clustered index record must be + covered by a lock or a page latch to + prevent deletion (rollback or purge) */ + row_ext_t** ext, /* out, own: NULL, or externally + stored column prefixes */ + const dict_index_t* index, /* in: clustered index */ + const upd_t* update, /* in: an update vector built for the + clustered index */ + mem_heap_t* heap) /* in: memory heap */ +{ + ulint col_no; + ulint i; + ulint n_cols; + ulint n_ext_cols; + ulint* ext_cols; + const dict_table_t* table; + + ut_ad(row); + ut_ad(ext); + ut_ad(index); + ut_ad(dict_index_is_clust(index)); + ut_ad(update); + ut_ad(heap); + + n_cols = dtuple_get_n_fields(row); + table = index->table; + ut_ad(n_cols == dict_table_get_n_cols(table)); + + ext_cols = mem_heap_alloc(heap, n_cols * sizeof *ext_cols); + n_ext_cols = 0; + + dtuple_set_info_bits(row, update->info_bits); + + for (col_no = 0; col_no < n_cols; col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + const ulint clust_pos + = dict_col_get_clust_pos(col, index); + dfield_t* dfield; + + if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) { + + continue; + } + + dfield = dtuple_get_nth_field(row, col_no); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + const upd_field_t* upd_field + = upd_get_nth_field(update, i); + + if (upd_field->field_no != clust_pos) { + + continue; + } + + dfield_copy_data(dfield, &upd_field->new_val); + break; + } + + if (dfield_is_ext(dfield) && col->ord_part) { + ext_cols[n_ext_cols++] = col_no; + } + } + + if (n_ext_cols) { + *ext = row_ext_create(n_ext_cols, ext_cols, row, + dict_table_zip_size(table), heap); + } else { + *ext = NULL; + } +} + +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! */ +UNIV_INTERN +ibool +row_upd_changes_ord_field_binary( +/*=============================*/ + /* out: TRUE if update vector changes + an ordering field in the index record; + NOTE: the fields are compared as binary + strings */ + const dtuple_t* row, /* in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + dict_index_t* index, /* in: index of the record */ + const upd_t* update) /* in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! 
*/ +{ + ulint n_unique; + ulint n_upd_fields; + ulint i, j; + dict_index_t* clust_index; + + ut_ad(update && index); + + n_unique = dict_index_get_n_unique(index); + n_upd_fields = upd_get_n_fields(update); + + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n_unique; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_pos; + ulint col_no; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col, clust_index); + col_no = dict_col_get_no(col); + + for (j = 0; j < n_upd_fields; j++) { + + const upd_field_t* upd_field + = upd_get_nth_field(update, j); + + /* Note that if the index field is a column prefix + then it may be that row does not contain an externally + stored part of the column value, and we cannot compare + the datas */ + + if (col_pos == upd_field->field_no + && (row == NULL + || ind_field->prefix_len > 0 + || !dfield_datas_are_binary_equal( + dtuple_get_nth_field(row, col_no), + &(upd_field->new_val)))) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/*************************************************************** +Checks if an update vector changes an ordering field of an index record. +NOTE: we compare the fields as binary strings! */ +UNIV_INTERN +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + /* out: TRUE if update vector + may change an ordering field + in an index record */ + const dict_table_t* table, /* in: table */ + const upd_t* update) /* in: update vector for the row */ +{ + upd_field_t* upd_field; + dict_index_t* index; + ulint i; + + index = dict_table_get_first_index(table); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + upd_field = upd_get_nth_field(update, i); + + if (dict_field_get_col(dict_index_get_nth_field( + index, upd_field->field_no)) + ->ord_part) { + + return(TRUE); + } + } + + return(FALSE); +} + +/*************************************************************** +Checks if an update vector changes some of the first ordering fields of an +index record. This is only used in foreign key checks and we can assume +that index does not contain column prefixes. */ +static +ibool +row_upd_changes_first_fields_binary( +/*================================*/ + /* out: TRUE if changes */ + dtuple_t* entry, /* in: index entry */ + dict_index_t* index, /* in: index of entry */ + const upd_t* update, /* in: update vector for the row */ + ulint n) /* in: how many first fields to check */ +{ + ulint n_upd_fields; + ulint i, j; + dict_index_t* clust_index; + + ut_ad(update && index); + ut_ad(n <= dict_index_get_n_fields(index)); + + n_upd_fields = upd_get_n_fields(update); + clust_index = dict_table_get_first_index(index->table); + + for (i = 0; i < n; i++) { + + const dict_field_t* ind_field; + const dict_col_t* col; + ulint col_pos; + + ind_field = dict_index_get_nth_field(index, i); + col = dict_field_get_col(ind_field); + col_pos = dict_col_get_clust_pos(col, clust_index); + + ut_a(ind_field->prefix_len == 0); + + for (j = 0; j < n_upd_fields; j++) { + + upd_field_t* upd_field + = upd_get_nth_field(update, j); + + if (col_pos == upd_field->field_no + && !dfield_datas_are_binary_equal( + dtuple_get_nth_field(entry, i), + &(upd_field->new_val))) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/************************************************************************* +Copies the column values from a record. 
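A usage sketch of the ordering-field check above; this is essentially what row_upd_sec_step() later in this file does to decide whether a secondary index must be touched at all:

	if (node->state == UPD_NODE_UPDATE_ALL_SEC
	    || row_upd_changes_ord_field_binary(node->row, node->index,
						node->update)) {
		/* The entry moves within the index: delete-mark the
		old secondary index record and insert a new one */
		err = row_upd_sec_index_entry(node, thr);
	}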
*/
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+	rec_t*		rec,	/* in: record in a clustered index */
+	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
+	sym_node_t*	column)	/* in: first column in a column list, or
+				NULL */
+{
+	byte*	data;
+	ulint	len;
+
+	while (column) {
+		data = rec_get_nth_field(rec, offsets,
+					 column->field_nos[SYM_CLUST_FIELD_NO],
+					 &len);
+		eval_node_copy_and_alloc_val(column, data, len);
+
+		column = UT_LIST_GET_NEXT(col_var_list, column);
+	}
+}
+
+/*************************************************************************
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+	upd_t*	update)	/* in/out: update vector */
+{
+	que_node_t*	exp;
+	upd_field_t*	upd_field;
+	ulint		n_fields;
+	ulint		i;
+
+	n_fields = upd_get_n_fields(update);
+
+	for (i = 0; i < n_fields; i++) {
+		upd_field = upd_get_nth_field(update, i);
+
+		exp = upd_field->exp;
+
+		eval_exp(exp);
+
+		dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+	}
+}
+
+/***************************************************************
+Stores to the heap the row on which the node->pcur is positioned. */
+static
+void
+row_upd_store_row(
+/*==============*/
+	upd_node_t*	node)	/* in: row update node */
+{
+	dict_index_t*	clust_index;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	rec_offs_init(offsets_);
+
+	ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+	if (node->row != NULL) {
+		mem_heap_empty(node->heap);
+	}
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	rec = btr_pcur_get_rec(node->pcur);
+
+	offsets = rec_get_offsets(rec, clust_index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+	node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+			      NULL, &node->ext, node->heap);
+	if (node->is_delete) {
+		node->upd_row = NULL;
+		node->upd_ext = NULL;
+	} else {
+		node->upd_row = dtuple_copy(node->row, node->heap);
+		row_upd_replace(node->upd_row, &node->upd_ext,
+				clust_index, node->update, node->heap);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/***************************************************************
+Updates a secondary index entry of a row.
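row_upd_copy_columns() and row_upd_eval_new_vals() above must run in that order: the old column values are pulled out of the clustered index record first, so that the new-value expressions can refer to them. A condensed sketch of the interpreter flow (hedged; rec, offsets and the node fields stand in for the values at the real call site in the clustered-index update step):

	/* Fetch the old column values, then evaluate the new-value
	expressions against them */
	row_upd_copy_columns(rec, offsets,
			     UT_LIST_GET_FIRST(node->columns));
	row_upd_eval_new_vals(node->update);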
*/ +static +ulint +row_upd_sec_index_entry( +/*====================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + ibool check_ref; + ibool found; + dict_index_t* index; + dtuple_t* entry; + btr_pcur_t pcur; + btr_cur_t* btr_cur; + mem_heap_t* heap; + rec_t* rec; + ulint err = DB_SUCCESS; + mtr_t mtr; + trx_t* trx = thr_get_trx(thr); + + index = node->index; + + check_ref = row_upd_index_is_referenced(index, trx); + + heap = mem_heap_create(1024); + + /* Build old index entry */ + entry = row_build_index_entry(node->row, node->ext, index, heap); + ut_a(entry); + + log_free_check(); + mtr_start(&mtr); + + found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur, + &mtr); + btr_cur = btr_pcur_get_btr_cur(&pcur); + + rec = btr_cur_get_rec(btr_cur); + + if (UNIV_UNLIKELY(!found)) { + fputs("InnoDB: error in sec index entry update in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, rec, index); + putc('\n', stderr); + + trx_print(stderr, trx, 0); + + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + } else { + /* Delete mark the old index record; it can already be + delete marked if we return after a lock wait in + row_ins_index_entry below */ + + if (!rec_get_deleted_flag(rec, + dict_table_is_comp(index->table))) { + err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE, + thr, &mtr); + if (err == DB_SUCCESS && check_ref) { + + ulint* offsets = rec_get_offsets( + rec, index, NULL, + ULINT_UNDEFINED, &heap); + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + } + } + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + if (node->is_delete || err != DB_SUCCESS) { + + goto func_exit; + } + + /* Build a new index entry */ + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + /* Insert new index entry */ + err = row_ins_index_entry(index, entry, 0, TRUE, thr); + +func_exit: + mem_heap_free(heap); + + return(err); +} + +/*************************************************************** +Updates the secondary index record if it is changed in the row update or +deletes it if this is a delete. */ +UNIV_INLINE +ulint +row_upd_sec_step( +/*=============*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC) + || (node->state == UPD_NODE_UPDATE_SOME_SEC)); + ut_ad(!dict_index_is_clust(node->index)); + + if (node->state == UPD_NODE_UPDATE_ALL_SEC + || row_upd_changes_ord_field_binary(node->row, node->index, + node->update)) { + return(row_upd_sec_index_entry(node, thr)); + } + + return(DB_SUCCESS); +} + +/*************************************************************** +Marks the clustered index record deleted and inserts the updated version +of the record to the index. This function should be used when the ordering +fields of the clustered index record change. This should be quite rare in +database applications. 
*/ +static +ulint +row_upd_clust_rec_by_insert( +/*========================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index of the record */ + que_thr_t* thr, /* in: query thread */ + ibool check_ref,/* in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + mem_heap_t* heap = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + trx_t* trx; + dict_table_t* table; + dtuple_t* entry; + ulint err; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + + trx = thr_get_trx(thr); + table = node->table; + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + if (node->state != UPD_NODE_INSERT_CLUSTERED) { + rec_t* rec; + dict_index_t* index; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + rec_offs_init(offsets_); + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + return(err); + } + + /* Mark as not-owned the externally stored fields which the new + row inherits from the delete marked record: purge should not + free those externally stored fields even if the delete marked + record is removed from the index tree, or updated. */ + + rec = btr_cur_get_rec(btr_cur); + index = dict_table_get_first_index(table); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + btr_cur_mark_extern_inherited_fields( + btr_cur_get_page_zip(btr_cur), + rec, index, offsets, node->update, mtr); + if (check_ref) { + /* NOTE that the following call loses + the position of pcur ! */ + err = row_upd_check_references_constraints( + node, pcur, table, index, offsets, thr, mtr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); + } + } + } + + mtr_commit(mtr); + + if (!heap) { + heap = mem_heap_create(500); + } + node->state = UPD_NODE_INSERT_CLUSTERED; + + entry = row_build_index_entry(node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + + row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id); + + if (node->upd_ext) { + /* If we return from a lock wait, for example, we may have + extern fields marked as not-owned in entry (marked in the + if-branch above). We must unmark them. */ + + btr_cur_unmark_dtuple_extern_fields(entry); + + /* We must mark non-updated extern fields in entry as + inherited, so that a possible rollback will not free them. */ + + btr_cur_mark_dtuple_inherited_extern(entry, node->update); + } + + err = row_ins_index_entry(index, entry, + node->upd_ext ? node->upd_ext->n_ext : 0, + TRUE, thr); + mem_heap_free(heap); + + return(err); +} + +/*************************************************************** +Updates a clustered index record of a row when the ordering fields do +not change. 
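+The update is first attempted within the page, either in place or as an
+optimistic update; only if that fails do we restore the cursor with
+BTR_MODIFY_TREE and perform a pessimistic update, which may have to
+modify the tree structure.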
*/ +static +ulint +row_upd_clust_rec( +/*==============*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index */ + que_thr_t* thr, /* in: query thread */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + mem_heap_t* heap = NULL; + big_rec_t* big_rec = NULL; + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table))); + + /* Try optimistic updating of the record, keeping changes within + the page; we do not check locks because we assume the x-lock on the + record to update */ + + if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { + err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } else { + err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, + btr_cur, node->update, + node->cmpl_info, thr, mtr); + } + + mtr_commit(mtr); + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + + return(DB_SUCCESS); + } + + if (buf_LRU_buf_pool_running_out()) { + + return(DB_LOCK_TABLE_FULL); + } + /* We may have to modify the tree structure: do a pessimistic descent + down the index tree */ + + mtr_start(mtr); + + /* NOTE: this transaction has an s-lock or x-lock on the record and + therefore other transactions cannot modify the record when we have no + latch on the page. In addition, we assume that other query threads of + the same transaction do not modify the record in the meantime. + Therefore we can assert that the restoration of the cursor succeeds. */ + + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + + ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + dict_table_is_comp(index->table))); + + err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, + &heap, &big_rec, node->update, + node->cmpl_info, thr, mtr); + mtr_commit(mtr); + + if (err == DB_SUCCESS && big_rec) { + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + rec_t* rec; + rec_offs_init(offsets_); + + mtr_start(mtr); + + ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); + rec = btr_cur_get_rec(btr_cur); + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(btr_cur), rec, + rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap), + big_rec, mtr); + mtr_commit(mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (big_rec) { + dtuple_big_rec_free(big_rec); + } + + return(err); +} + +/*************************************************************** +Delete marks a clustered index record. 
*/ +static +ulint +row_upd_del_mark_clust_rec( +/*=======================*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code */ + upd_node_t* node, /* in: row update node */ + dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in/out: rec_get_offsets() for the + record under the cursor */ + que_thr_t* thr, /* in: query thread */ + ibool check_ref,/* in: TRUE if index may be referenced in + a foreign key constraint */ + mtr_t* mtr) /* in: mtr; gets committed here */ +{ + btr_pcur_t* pcur; + btr_cur_t* btr_cur; + ulint err; + + ut_ad(node); + ut_ad(dict_index_is_clust(index)); + ut_ad(node->is_delete); + + pcur = node->pcur; + btr_cur = btr_pcur_get_btr_cur(pcur); + + /* Store row because we have to build also the secondary index + entries */ + + row_upd_store_row(node); + + /* Mark the clustered index record deleted; we do not have to check + locks, because we assume that we have an x-lock on the record */ + + err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, + btr_cur, TRUE, thr, mtr); + if (err == DB_SUCCESS && check_ref) { + /* NOTE that the following call loses the position of pcur ! */ + + err = row_upd_check_references_constraints(node, + pcur, index->table, + index, offsets, + thr, mtr); + } + + mtr_commit(mtr); + + return(err); +} + +/*************************************************************** +Updates the clustered index record. */ +static +ulint +row_upd_clust_step( +/*===============*/ + /* out: DB_SUCCESS if operation successfully + completed, DB_LOCK_WAIT in case of a lock wait, + else error code */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* index; + btr_pcur_t* pcur; + ibool success; + ibool check_ref; + ulint err; + mtr_t* mtr; + mtr_t mtr_buf; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + rec_offs_init(offsets_); + + index = dict_table_get_first_index(node->table); + + check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr)); + + pcur = node->pcur; + + /* We have to restore the cursor to its position */ + mtr = &mtr_buf; + + mtr_start(mtr); + + /* If the restoration does not succeed, then the same + transaction has deleted the record on which the cursor was, + and that is an SQL error. If the restoration succeeds, it may + still be that the same transaction has successively deleted + and inserted a record with the same ordering fields, but in + that case we know that the transaction has at least an + implicit x-lock on the record. 
*/ + + ut_a(pcur->rel_pos == BTR_PCUR_ON); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + + if (!success) { + err = DB_RECORD_NOT_FOUND; + + mtr_commit(mtr); + + return(err); + } + + /* If this is a row in SYS_INDEXES table of the data dictionary, + then we have to free the file segments of the index tree associated + with the index */ + + if (node->is_delete + && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + + dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, + mtr); + if (!success) { + err = DB_ERROR; + + mtr_commit(mtr); + + return(err); + } + } + + rec = btr_pcur_get_rec(pcur); + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + + if (!node->has_clust_rec_x_lock) { + err = lock_clust_rec_modify_check_and_lock( + 0, btr_pcur_get_block(pcur), + rec, index, offsets, thr); + if (err != DB_SUCCESS) { + mtr_commit(mtr); + goto exit_func; + } + } + + /* NOTE: the following function calls will also commit mtr */ + + if (node->is_delete) { + err = row_upd_del_mark_clust_rec(node, index, offsets, + thr, check_ref, mtr); + if (err == DB_SUCCESS) { + node->state = UPD_NODE_UPDATE_ALL_SEC; + node->index = dict_table_get_next_index(index); + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(err); + } + + /* If the update is made for MySQL, we already have the update vector + ready, else we have to do some evaluation: */ + + if (UNIV_UNLIKELY(!node->in_mysql_interface)) { + /* Copy the necessary columns from clust_rec and calculate the + new values to set */ + row_upd_copy_columns(rec, offsets, + UT_LIST_GET_FIRST(node->columns)); + row_upd_eval_new_vals(node->update); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { + + err = row_upd_clust_rec(node, index, thr, mtr); + return(err); + } + + row_upd_store_row(node); + + if (row_upd_changes_ord_field_binary(node->row, index, node->update)) { + + /* Update causes an ordering field (ordering fields within + the B-tree) of the clustered index record to change: perform + the update by delete marking and inserting. + + TODO! What to do to the 'Halloween problem', where an update + moves the record forward in index so that it is again + updated when the cursor arrives there? Solution: the + read operation must check the undo record undo number when + choosing records to update. MySQL solves now the problem + externally! */ + + err = row_upd_clust_rec_by_insert(node, index, thr, check_ref, + mtr); + if (err != DB_SUCCESS) { + + return(err); + } + + node->state = UPD_NODE_UPDATE_ALL_SEC; + } else { + err = row_upd_clust_rec(node, index, thr, mtr); + + if (err != DB_SUCCESS) { + + return(err); + } + + node->state = UPD_NODE_UPDATE_SOME_SEC; + } + + node->index = dict_table_get_next_index(index); + + return(err); +} + +/*************************************************************** +Updates the affected index records of a row. When the control is transferred +to this node, we assume that we have a persistent cursor which was on a +record, and the position of the cursor is stored in the cursor. 
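+The node->state field drives the execution: row_upd_clust_step() first
+processes the clustered index record and sets the state to
+UPD_NODE_UPDATE_ALL_SEC or UPD_NODE_UPDATE_SOME_SEC, after which
+row_upd_sec_step() is applied to each remaining secondary index in turn.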
*/ +static +ulint +row_upd( +/*====*/ + /* out: DB_SUCCESS if operation successfully + completed, else error code or DB_LOCK_WAIT */ + upd_node_t* node, /* in: row update node */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err = DB_SUCCESS; + + ut_ad(node && thr); + + if (UNIV_LIKELY(node->in_mysql_interface)) { + + /* We do not get the cmpl_info value from the MySQL + interpreter: we must calculate it on the fly: */ + + if (node->is_delete + || row_upd_changes_some_index_ord_field_binary( + node->table, node->update)) { + node->cmpl_info = 0; + } else { + node->cmpl_info = UPD_NODE_NO_ORD_CHANGE; + } + } + + if (node->state == UPD_NODE_UPDATE_CLUSTERED + || node->state == UPD_NODE_INSERT_CLUSTERED) { + + err = row_upd_clust_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + } + + if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + + goto function_exit; + } + + while (node->index != NULL) { + err = row_upd_sec_step(node, thr); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->index = dict_table_get_next_index(node->index); + } + +function_exit: + if (err == DB_SUCCESS) { + /* Do some cleanup */ + + if (node->row != NULL) { + node->row = NULL; + node->ext = NULL; + node->upd_row = NULL; + node->upd_ext = NULL; + mem_heap_empty(node->heap); + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + } + + return(err); +} + +/*************************************************************** +Updates a row in a table. This is a high-level function used in SQL execution +graphs. */ +UNIV_INTERN +que_thr_t* +row_upd_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + upd_node_t* node; + sel_node_t* sel_node; + que_node_t* parent; + ulint err = DB_SUCCESS; + trx_t* trx; + + ut_ad(thr); + + trx = thr_get_trx(thr); + + trx_start_if_not_started(trx); + + node = thr->run_node; + + sel_node = node->select; + + parent = que_node_get_parent(node); + + ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE); + + if (thr->prev_node == parent) { + node->state = UPD_NODE_SET_IX_LOCK; + } + + if (node->state == UPD_NODE_SET_IX_LOCK) { + + if (!node->has_clust_rec_x_lock) { + /* It may be that the current session has not yet + started its transaction, or it has been committed: */ + + err = lock_table(0, node->table, LOCK_IX, thr); + + if (err != DB_SUCCESS) { + + goto error_handling; + } + } + + node->state = UPD_NODE_UPDATE_CLUSTERED; + + if (node->searched_update) { + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch a row to update */ + + thr->run_node = sel_node; + + return(thr); + } + } + + /* sel_node is NULL if we are in the MySQL interface */ + + if (sel_node && (sel_node->state != SEL_NODE_FETCH)) { + + if (!node->searched_update) { + /* An explicit cursor should be positioned on a row + to update */ + + ut_error; + + err = DB_ERROR; + + goto error_handling; + } + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to update, or the select node performed the + updates directly in-place */ + + thr->run_node = parent; + + return(thr); + } + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = row_upd(node, thr); + +error_handling: + trx->error_state = err; + + if (err != DB_SUCCESS) { + return(NULL); + } + + /* DO THE TRIGGER ACTIONS HERE */ + + if (node->searched_update) { + /* Fetch next row to update */ + + thr->run_node = sel_node; + } else { + /* It was an explicit cursor update */ + + thr->run_node = parent; + } + + node->state = 
UPD_NODE_UPDATE_CLUSTERED; + + return(thr); +} diff --git a/storage/xtradb/row/row0vers.c b/storage/xtradb/row/row0vers.c new file mode 100644 index 00000000000..3abba6d6fb8 --- /dev/null +++ b/storage/xtradb/row/row0vers.c @@ -0,0 +1,741 @@ +/***************************************************************************** + +Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Row versions + +Created 2/6/1997 Heikki Tuuri +*******************************************************/ + +#include "row0vers.h" + +#ifdef UNIV_NONINL +#include "row0vers.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "btr0btr.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "trx0rec.h" +#include "que0que.h" +#include "row0row.h" +#include "row0upd.h" +#include "rem0cmp.h" +#include "read0read.h" +#include "lock0lock.h" + +/********************************************************************* +Finds out if an active transaction has inserted or modified a secondary +index record. NOTE: the kernel mutex is temporarily released in this +function! */ +UNIV_INTERN +trx_t* +row_vers_impl_x_locked_off_kernel( +/*==============================*/ + /* out: NULL if committed, else the active + transaction; NOTE that the kernel mutex is + temporarily released! */ + const rec_t* rec, /* in: record in a secondary index */ + dict_index_t* index, /* in: the secondary index */ + const ulint* offsets)/* in: rec_get_offsets(rec, index) */ +{ + dict_index_t* clust_index; + rec_t* clust_rec; + ulint* clust_offsets; + rec_t* version; + dulint trx_id; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + dtuple_t* entry = NULL; /* assignment to eliminate compiler + warning */ + trx_t* trx; + ulint rec_del; + ulint err; + mtr_t mtr; + ulint comp; + + ut_ad(mutex_own(&kernel_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + /* Search for the clustered index record: this is a time-consuming + operation: therefore we release the kernel mutex; also, the release + is required by the latching order convention. The latch on the + clustered index locks the top of the stack of versions. We also + reserve purge_latch to lock the bottom of the version stack. 
*/ + + clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index, + &clust_index, &mtr); + if (!clust_rec) { + /* In a rare case it is possible that no clust rec is found + for a secondary index record: if in row0umod.c + row_undo_mod_remove_clust_low() we have already removed the + clust rec, while purge is still cleaning and removing + secondary index records associated with earlier versions of + the clustered index record. In that case there cannot be + any implicit lock on the secondary index record, because + an active transaction which has modified the secondary index + record has also modified the clustered index record. And in + a rollback we always undo the modifications to secondary index + records before the clustered index record. */ + + mutex_enter(&kernel_mutex); + mtr_commit(&mtr); + + return(NULL); + } + + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); + trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets); + + mtr_s_lock(&(purge_sys->latch), &mtr); + + mutex_enter(&kernel_mutex); + + trx = NULL; + if (!trx_is_active(trx_id)) { + /* The transaction that modified or inserted clust_rec is no + longer active: no implicit lock on rec */ + goto exit_func; + } + + if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index, + clust_offsets, TRUE)) { + /* Corruption noticed: try to avoid a crash by returning */ + goto exit_func; + } + + comp = page_rec_is_comp(rec); + ut_ad(index->table == clust_index->table); + ut_ad(!!comp == dict_table_is_comp(index->table)); + ut_ad(!comp == !page_rec_is_comp(clust_rec)); + + /* We look up if some earlier version, which was modified by the trx_id + transaction, of the clustered index record would require rec to be in + a different state (delete marked or unmarked, or have different field + values, or not existing). If there is such a version, then rec was + modified by the trx_id transaction, and it has an implicit x-lock on + rec. Note that if clust_rec itself would require rec to be in a + different state, then the trx_id transaction has not yet had time to + modify rec, and does not necessarily have an implicit x-lock on rec. */ + + rec_del = rec_get_deleted_flag(rec, comp); + trx = NULL; + + version = clust_rec; + + for (;;) { + rec_t* prev_version; + ulint vers_del; + row_ext_t* ext; + dulint prev_trx_id; + + mutex_exit(&kernel_mutex); + + /* While we retrieve an earlier version of clust_rec, we + release the kernel mutex, because it may take time to access + the disk. After the release, we have to check if the trx_id + transaction is still active. We keep the semaphore in mtr on + the clust_rec page, so that no other transaction can update + it and get an implicit x-lock on rec. */ + + heap2 = heap; + heap = mem_heap_create(1024); + err = trx_undo_prev_version_build(clust_rec, &mtr, version, + clust_index, clust_offsets, + heap, &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (prev_version == NULL) { + mutex_enter(&kernel_mutex); + + if (!trx_is_active(trx_id)) { + /* Transaction no longer active: no + implicit x-lock */ + + break; + } + + /* If the transaction is still active, + clust_rec must be a fresh insert, because no + previous version was found. 
*/ + ut_ad(err == DB_SUCCESS); + + /* It was a freshly inserted version: there is an + implicit x-lock on rec */ + + trx = trx_get_on_id(trx_id); + + break; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, ULINT_UNDEFINED, &heap); + + vers_del = rec_get_deleted_flag(prev_version, comp); + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); + + /* If the trx_id and prev_trx_id are different and if + the prev_version is marked deleted then the + prev_trx_id must have already committed for the trx_id + to be able to modify the row. Therefore, prev_trx_id + cannot hold any implicit lock. */ + if (vers_del && 0 != ut_dulint_cmp(trx_id, prev_trx_id)) { + + mutex_enter(&kernel_mutex); + break; + } + + /* The stack of versions is locked by mtr. Thus, it + is safe to fetch the prefixes for externally stored + columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, + clust_offsets, NULL, &ext, heap); + entry = row_build_index_entry(row, ext, index, heap); + /* entry may be NULL if a record was inserted in place + of a deleted record, and the BLOB pointers of the new + record were not initialized yet. But in that case, + prev_version should be NULL. */ + ut_a(entry); + + mutex_enter(&kernel_mutex); + + if (!trx_is_active(trx_id)) { + /* Transaction no longer active: no implicit x-lock */ + + break; + } + + /* If we get here, we know that the trx_id transaction is + still active and it has modified prev_version. Let us check + if prev_version would require rec to be in a different + state. */ + + /* The previous version of clust_rec must be + accessible, because the transaction is still active + and clust_rec was not a fresh insert. */ + ut_ad(err == DB_SUCCESS); + + /* We check if entry and rec are identified in the alphabetical + ordering */ + if (0 == cmp_dtuple_rec(entry, rec, offsets)) { + /* The delete marks of rec and prev_version should be + equal for rec to be in the state required by + prev_version */ + + if (rec_del != vers_del) { + trx = trx_get_on_id(trx_id); + + break; + } + + /* It is possible that the row was updated so that the + secondary index record remained the same in + alphabetical ordering, but the field values changed + still. For example, 'abc' -> 'ABC'. Check also that. */ + + dtuple_set_types_binary(entry, + dtuple_get_n_fields(entry)); + if (0 != cmp_dtuple_rec(entry, rec, offsets)) { + + trx = trx_get_on_id(trx_id); + + break; + } + } else if (!rec_del) { + /* The delete mark should be set in rec for it to be + in the state required by prev_version */ + + trx = trx_get_on_id(trx_id); + + break; + } + + if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) { + /* The versions modified by the trx_id transaction end + to prev_version: no implicit x-lock */ + + break; + } + + version = prev_version; + }/* for (;;) */ + +exit_func: + mtr_commit(&mtr); + mem_heap_free(heap); + + return(trx); +} + +/********************************************************************* +Finds out if we must preserve a delete marked earlier version of a clustered +index record, because it is >= the purge view. 
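+In effect this asks whether an update undo log record may still exist
+for the version, which is checked below with
+trx_purge_update_undo_must_exist().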
*/ +UNIV_INTERN +ibool +row_vers_must_preserve_del_marked( +/*==============================*/ + /* out: TRUE if earlier version should be preserved */ + dulint trx_id, /* in: transaction id in the version */ + mtr_t* mtr) /* in: mtr holding the latch on the clustered index + record; it will also hold the latch on purge_view */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + mtr_s_lock(&(purge_sys->latch), mtr); + + if (trx_purge_update_undo_must_exist(trx_id)) { + + /* A purge operation is not yet allowed to remove this + delete marked record */ + + return(TRUE); + } + + return(FALSE); +} + +/********************************************************************* +Finds out if a version of the record, where the version >= the current +purge view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry and ientry are identified in +the alphabetical ordering; exactly in this case we return TRUE. */ +UNIV_INTERN +ibool +row_vers_old_has_index_entry( +/*=========================*/ + /* out: TRUE if earlier version should have */ + ibool also_curr,/* in: TRUE if also rec is included in the + versions to search; otherwise only versions + prior to it are searched */ + const rec_t* rec, /* in: record in the clustered index; the + caller must have a latch on the page */ + mtr_t* mtr, /* in: mtr holding the latch on rec; it will + also hold the latch on purge_view */ + dict_index_t* index, /* in: the secondary index */ + const dtuple_t* ientry) /* in: the secondary index entry */ +{ + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + ulint* clust_offsets; + mem_heap_t* heap; + mem_heap_t* heap2; + const dtuple_t* row; + const dtuple_t* entry; + ulint err; + ulint comp; + + ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) + || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + mtr_s_lock(&(purge_sys->latch), mtr); + + clust_index = dict_table_get_first_index(index->table); + + comp = page_rec_is_comp(rec); + ut_ad(!dict_table_is_comp(index->table) == !comp); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &heap); + + if (also_curr && !rec_get_deleted_flag(rec, comp)) { + row_ext_t* ext; + + /* The stack of versions is locked by mtr. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, NULL, &ext, heap); + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset BLOB + pointers. This must be a freshly inserted record. If + this is called from + row_purge_remove_sec_if_poss_low(), the thread will + hold latches on the clustered index and the secondary + index. Because the insert works in three steps: + + (1) insert the record to clustered index + (2) store the BLOBs and update BLOB pointers + (3) insert records to secondary indexes + + the purge thread can safely ignore freshly inserted + records and delete the secondary index record. The + thread that inserted the new record will be inserting + the secondary index records. 
*/ + + /* NOTE that we cannot do the comparison as binary + fields because the row is maybe being modified so that + the clustered index record has already been updated to + a different binary value in a char field, but the + collation identifies the old and new value anyway! */ + if (entry && !dtuple_coll_cmp(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + } + + version = rec; + + for (;;) { + heap2 = heap; + heap = mem_heap_create(1024); + err = trx_undo_prev_version_build(rec, mtr, version, + clust_index, clust_offsets, + heap, &prev_version); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (err != DB_SUCCESS || !prev_version) { + /* Versions end here */ + + mem_heap_free(heap); + + return(FALSE); + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, ULINT_UNDEFINED, &heap); + + if (!rec_get_deleted_flag(prev_version, comp)) { + row_ext_t* ext; + + /* The stack of versions is locked by mtr. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, + NULL, &ext, heap); + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset + BLOB pointers. This must be a freshly + inserted record that we can safely ignore. + For the justification, see the comments after + the previous row_build_index_entry() call. */ + + /* NOTE that we cannot do the comparison as binary + fields because maybe the secondary index record has + already been updated to a different binary value in + a char field, but the collation identifies the old + and new value anyway! */ + + if (entry && !dtuple_coll_cmp(ientry, entry)) { + + mem_heap_free(heap); + + return(TRUE); + } + } + + version = prev_version; + } +} + +/********************************************************************* +Constructs the version of a clustered index record which a consistent +read should see. We assume that the trx id stored in rec is such that +the consistent read should not see rec in its present version. 
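+The function walks the version list backwards with
+trx_undo_prev_version_build() until it reaches a version whose trx id
+the read view sees, or until the list ends, in which case the record
+was inserted after the view was created and *old_vers is set to NULL.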
 */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+				/* out: DB_SUCCESS or DB_MISSING_HISTORY */
+	const rec_t*	rec,	/* in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/* in: mtr holding the latch on rec */
+	dict_index_t*	index,	/* in: the clustered index */
+	ulint**		offsets,/* in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	read_view_t*	view,	/* in: the consistent read view */
+	mem_heap_t**	offset_heap,/* in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/* in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers)/* out, own: old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+{
+	const rec_t*	version;
+	rec_t*		prev_version;
+	dulint		trx_id;
+	mem_heap_t*	heap		= NULL;
+	byte*		buf;
+	ulint		err;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(rec_offs_validate(rec, index, *offsets));
+
+	trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+	ut_ad(!read_view_sees_trx_id(view, trx_id));
+
+	rw_lock_s_lock(&(purge_sys->latch));
+	version = rec;
+
+	for (;;) {
+		mem_heap_t*	heap2	= heap;
+		trx_undo_rec_t*	undo_rec;
+		dulint		roll_ptr;
+		dulint		undo_no;
+		heap = mem_heap_create(1024);
+
+		/* If we have a high-granularity consistent read view and
+		the creating transaction of the view is the same as trx_id
+		in the record, we see this record only in the case when the
+		undo_no of the record is < the undo_no in the view.
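+		For example, if the view was created by this same
+		transaction when its undo number was 5, then the versions
+		it wrote at undo numbers 0..4 are visible in the view,
+		while its later modifications are not.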
+		*/
+
+		if (view->type == VIEW_HIGH_GRANULARITY
+		    && ut_dulint_cmp(view->creator_trx_id, trx_id) == 0) {
+
+			roll_ptr = row_get_rec_roll_ptr(version, index,
+							*offsets);
+			undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+			undo_no = trx_undo_rec_get_undo_no(undo_rec);
+			mem_heap_empty(heap);
+
+			if (ut_dulint_cmp(view->undo_no, undo_no) > 0) {
+				/* The view already sees this version: we can
+				copy it to in_heap and return */
+
+				buf = mem_heap_alloc(in_heap,
+						     rec_offs_size(*offsets));
+				*old_vers = rec_copy(buf, version, *offsets);
+				rec_offs_make_valid(*old_vers, index,
+						    *offsets);
+				err = DB_SUCCESS;
+
+				break;
+			}
+		}
+
+		err = trx_undo_prev_version_build(rec, mtr, version, index,
+						  *offsets, heap,
+						  &prev_version);
+		if (heap2) {
+			mem_heap_free(heap2); /* free version */
+		}
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+
+		if (prev_version == NULL) {
+			/* It was a freshly inserted version */
+			*old_vers = NULL;
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		*offsets = rec_get_offsets(prev_version, index, *offsets,
+					   ULINT_UNDEFINED, offset_heap);
+
+		trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+		if (read_view_sees_trx_id(view, trx_id)) {
+
+			/* The view already sees this version: we can copy
+			it to in_heap and return */
+
+			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			*old_vers = rec_copy(buf, prev_version, *offsets);
+			rec_offs_make_valid(*old_vers, index, *offsets);
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		version = prev_version;
+	}/* for (;;) */
+
+	mem_heap_free(heap);
+	rw_lock_s_unlock(&(purge_sys->latch));
+
+	return(err);
+}
+
+/*********************************************************************
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+				/* out: DB_SUCCESS or DB_MISSING_HISTORY */
+	const rec_t*	rec,	/* in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/* in: mtr holding the latch on rec */
+	dict_index_t*	index,	/* in: the clustered index */
+	ulint**		offsets,/* in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,/* in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/* in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	const rec_t**	old_vers)/* out: rec, old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+{
+	const rec_t*	version;
+	mem_heap_t*	heap		= NULL;
+	byte*		buf;
+	ulint		err;
+	dulint		rec_trx_id	= ut_dulint_zero;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(rec_offs_validate(rec, index, *offsets));
+
+	rw_lock_s_lock(&(purge_sys->latch));
+	/* The S-latch on purge_sys prevents the purge view from
+	changing. Thus, if we have an uncommitted transaction at
+	this point, then purge cannot remove its undo log even if
+	the transaction could commit now.
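+	It is therefore safe to walk the version list here: we return the
+	newest version that belongs to a committed transaction.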
*/ + + version = rec; + + for (;;) { + trx_t* version_trx; + mem_heap_t* heap2; + rec_t* prev_version; + dulint version_trx_id; + + version_trx_id = row_get_rec_trx_id(version, index, *offsets); + if (rec == version) { + rec_trx_id = version_trx_id; + } + + mutex_enter(&kernel_mutex); + version_trx = trx_get_on_id(version_trx_id); + mutex_exit(&kernel_mutex); + + if (!version_trx + || version_trx->conc_state == TRX_NOT_STARTED + || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) { + + /* We found a version that belongs to a + committed transaction: return it. */ + + if (rec == version) { + *old_vers = rec; + err = DB_SUCCESS; + break; + } + + /* We assume that a rolled-back transaction stays in + TRX_ACTIVE state until all the changes have been + rolled back and the transaction is removed from + the global list of transactions. */ + + if (!ut_dulint_cmp(rec_trx_id, version_trx_id)) { + /* The transaction was committed while + we searched for earlier versions. + Return the current version as a + semi-consistent read. */ + + version = rec; + *offsets = rec_get_offsets(version, + index, *offsets, + ULINT_UNDEFINED, + offset_heap); + } + + buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); + *old_vers = rec_copy(buf, version, *offsets); + rec_offs_make_valid(*old_vers, index, *offsets); + err = DB_SUCCESS; + + break; + } + + heap2 = heap; + heap = mem_heap_create(1024); + + err = trx_undo_prev_version_build(rec, mtr, version, index, + *offsets, heap, + &prev_version); + if (heap2) { + mem_heap_free(heap2); /* free version */ + } + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + break; + } + + if (prev_version == NULL) { + /* It was a freshly inserted version */ + *old_vers = NULL; + err = DB_SUCCESS; + + break; + } + + version = prev_version; + *offsets = rec_get_offsets(version, index, *offsets, + ULINT_UNDEFINED, offset_heap); + }/* for (;;) */ + + if (heap) { + mem_heap_free(heap); + } + rw_lock_s_unlock(&(purge_sys->latch)); + + return(err); +} diff --git a/storage/xtradb/scripts/install_innodb_plugins.sql b/storage/xtradb/scripts/install_innodb_plugins.sql new file mode 100644 index 00000000000..b1a6f8e2ba0 --- /dev/null +++ b/storage/xtradb/scripts/install_innodb_plugins.sql @@ -0,0 +1,14 @@ +-- execute these to install InnoDB if it is built as a dynamic plugin +INSTALL PLUGIN innodb SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_trx SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_locks SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_lock_waits SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_cmp SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_cmp_reset SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_cmpmem SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_cmpmem_reset SONAME 'ha_innodb.so'; +INSTALL PLUGIN XTRADB_ENHANCEMENTS SONAME 'ha_innodb.so'; +INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES SONAME 'ha_innodb.so'; +INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_BLOB SONAME 'ha_innodb.so'; +INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_INDEX SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_rseg SONAME 'ha_innodb.so'; diff --git a/storage/xtradb/scripts/install_innodb_plugins_win.sql b/storage/xtradb/scripts/install_innodb_plugins_win.sql new file mode 100644 index 00000000000..8c94b4e240d --- /dev/null +++ b/storage/xtradb/scripts/install_innodb_plugins_win.sql @@ -0,0 +1,9 @@ +-- execute these to install InnoDB if it is built as a dynamic plugin +INSTALL PLUGIN innodb SONAME 'ha_innodb.dll'; +INSTALL PLUGIN innodb_trx SONAME 'ha_innodb.dll'; +INSTALL PLUGIN innodb_locks SONAME 'ha_innodb.dll'; +INSTALL PLUGIN 
innodb_lock_waits SONAME 'ha_innodb.dll'; +INSTALL PLUGIN innodb_cmp SONAME 'ha_innodb.dll'; +INSTALL PLUGIN innodb_cmp_reset SONAME 'ha_innodb.dll'; +INSTALL PLUGIN innodb_cmpmem SONAME 'ha_innodb.dll'; +INSTALL PLUGIN innodb_cmpmem_reset SONAME 'ha_innodb.dll'; diff --git a/storage/xtradb/srv/srv0que.c b/storage/xtradb/srv/srv0que.c new file mode 100644 index 00000000000..344aaed8775 --- /dev/null +++ b/storage/xtradb/srv/srv0que.c @@ -0,0 +1,126 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Server query execution + +Created 6/5/1996 Heikki Tuuri +*******************************************************/ + +#include "srv0que.h" + +#include "srv0srv.h" +#include "sync0sync.h" +#include "os0thread.h" +#include "usr0sess.h" +#include "que0que.h" + +/************************************************************************** +Checks if there is work to do in the server task queue. If there is, the +thread starts processing a task. Before leaving, it again checks the task +queue and picks a new task if any exists. This is called by a SRV_WORKER +thread. */ +UNIV_INTERN +void +srv_que_task_queue_check(void) +/*==========================*/ +{ + que_thr_t* thr; + + for (;;) { + mutex_enter(&kernel_mutex); + + thr = UT_LIST_GET_FIRST(srv_sys->tasks); + + if (thr == NULL) { + mutex_exit(&kernel_mutex); + + return; + } + + UT_LIST_REMOVE(queue, srv_sys->tasks, thr); + + mutex_exit(&kernel_mutex); + + que_run_threads(thr); + } +} + +/************************************************************************** +Performs round-robin on the server tasks. This is called by a SRV_WORKER +thread every second or so. */ +UNIV_INTERN +que_thr_t* +srv_que_round_robin( +/*================*/ + /* out: the new (may be == thr) query thread + to run */ + que_thr_t* thr) /* in: query thread */ +{ + que_thr_t* new_thr; + + ut_ad(thr); + ut_ad(thr->state == QUE_THR_RUNNING); + + mutex_enter(&kernel_mutex); + + UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr); + + new_thr = UT_LIST_GET_FIRST(srv_sys->tasks); + + mutex_exit(&kernel_mutex); + + return(new_thr); +} + +/************************************************************************** +Enqueues a task to server task queue and releases a worker thread, if there +is a suspended one. 
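+The caller must hold kernel_mutex; srv_que_task_enqueue() below is a
+wrapper which acquires and releases the mutex itself.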
 */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ut_ad(thr);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);
+
+	srv_release_threads(SRV_WORKER, 1);
+}
+
+/**************************************************************************
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue(
+/*=================*/
+	que_thr_t*	thr)	/* in: query thread */
+{
+	ut_ad(thr);
+
+	ut_a(0);	/* Under MySQL this is never called */
+
+	mutex_enter(&kernel_mutex);
+
+	srv_que_task_enqueue_low(thr);
+
+	mutex_exit(&kernel_mutex);
+}
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
new file mode 100644
index 00000000000..f448afb0dad
--- /dev/null
+++ b/storage/xtradb/srv/srv0srv.c
@@ -0,0 +1,2814 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+The database server main program
+
+NOTE: SQL Server 7 uses something which the documentation
+calls user mode scheduled threads (UMS threads). One such
+thread is usually allocated per processor. Win32
+documentation does not mention any UMS threads, which suggests
+that the concept is internal to SQL Server 7. It may mean that
+SQL Server 7 does all the scheduling of threads itself, even
+in i/o waits. We should maybe modify InnoDB to use the same
+technique, because thread switches within NT may be too slow.
+
+SQL Server 7 also mentions fibers, which are cooperatively
+scheduled threads. They can boost performance by 5 %,
+according to Delaney and Soukup's book.
+
+Windows 2000 will have something called thread pooling
+(see msdn website), which we could possibly use.
+
+Another possibility could be to use some very fast user space
+thread library. This might confuse NT though.
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+
+/* Dummy comment */
+#include "srv0srv.h"
+
+#include "ut0mem.h"
+#include "ut0ut.h"
+#include "os0proc.h"
+#include "mem0mem.h"
+#include "mem0pool.h"
+#include "sync0sync.h"
+#include "thr0loc.h"
+#include "que0que.h"
+#include "srv0que.h"
+#include "log0recv.h"
+#include "pars0pars.h"
+#include "usr0sess.h"
+#include "lock0lock.h"
+#include "trx0purge.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "srv0start.h"
+#include "row0mysql.h"
+#include "ha_prototypes.h"
+#include "trx0i_s.h"
+
+/* This is set to TRUE if the MySQL user has set it in MySQL; currently
+affects only FOREIGN KEY definition parsing */
+UNIV_INTERN ibool	srv_lower_case_table_names	= FALSE;
+
+/* The following counter is incremented whenever there is some user activity
+in the server */
+UNIV_INTERN ulint	srv_activity_count	= 0;
+
+/* The following is the maximum allowed duration of a lock wait. */
+UNIV_INTERN ulint	srv_fatal_semaphore_wait_threshold = 600;
+
+/* How long data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lagging of the purge thread. */
+UNIV_INTERN ulint	srv_dml_needed_delay = 0;
+
+UNIV_INTERN ibool	srv_lock_timeout_and_monitor_active = FALSE;
+UNIV_INTERN ibool	srv_error_monitor_active = FALSE;
+
+UNIV_INTERN const char*	srv_main_thread_op_info = "";
+
+/* Prefix used by MySQL to indicate pre-5.1 table name encoding */
+UNIV_INTERN const char	srv_mysql50_table_name_prefix[9] = "#mysql50#";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are dir paths which are catenated before file
+names, where the file name itself may also contain a path */
+
+UNIV_INTERN char*	srv_data_home	= NULL;
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN char*	srv_arch_dir	= NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* store each table created by a user to its own file; data
+dictionary tables are in the system tablespace 0 */
+UNIV_INTERN my_bool	srv_file_per_table;
+/* The file format to use on new *.ibd files. */
+UNIV_INTERN ulint	srv_file_format = 0;
+/* Whether to check the file format during startup; a value of
+DICT_TF_FORMAT_MAX + 1 means no checking, i.e. FALSE. The default is to
+set it to the highest format we support. */
+UNIV_INTERN ulint	srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
+
+#if DICT_TF_FORMAT_51
+# error "DICT_TF_FORMAT_51 must be 0!"
+#endif
+/* Place locks on records only, i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;
+
+UNIV_INTERN ulint	srv_n_data_files = 0;
+UNIV_INTERN char**	srv_data_file_names = NULL;
+/* size in database pages */
+UNIV_INTERN ulint*	srv_data_file_sizes = NULL;
+
+UNIV_INTERN ibool	srv_extra_undoslots = FALSE;
+
+/* if TRUE, then we auto-extend the last data file */
+UNIV_INTERN ibool	srv_auto_extend_last_data_file	= FALSE;
+/* if != 0, this tells the max size auto-extending may increase the
+last data file size */
+UNIV_INTERN ulint	srv_last_file_size_max	= 0;
+/* If the last data file is auto-extended, we add this
+many pages to it at a time */
+UNIV_INTERN ulong	srv_auto_extend_increment = 8;
+UNIV_INTERN ulint*	srv_data_file_is_raw_partition = NULL;
+
+/* If the following is TRUE we do not allow inserts etc.
This protects
+the user from forgetting the 'newraw' keyword to my.cnf */
+
+UNIV_INTERN ibool	srv_created_new_raw	= FALSE;
+
+UNIV_INTERN char**	srv_log_group_home_dirs = NULL;
+
+UNIV_INTERN ulint	srv_n_log_groups	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_log_files		= ULINT_MAX;
+/* size in database pages */
+UNIV_INTERN ulint	srv_log_file_size	= ULINT_MAX;
+/* size in database pages */
+UNIV_INTERN ulint	srv_log_buffer_size	= ULINT_MAX;
+UNIV_INTERN ulong	srv_flush_log_at_trx_commit = 1;
+
+UNIV_INTERN ulint	srv_show_locks_held	= 10;
+UNIV_INTERN ulint	srv_show_verbose_locks	= 0;
+
+
+/* The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+UNIV_INTERN const byte*	srv_latin1_ordering;
+
+/* use os/external memory allocator */
+UNIV_INTERN my_bool	srv_use_sys_malloc	= TRUE;
+/* requested size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_size	= ULINT_MAX;
+/* previously requested size */
+UNIV_INTERN ulint	srv_buf_pool_old_size;
+/* current size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_curr_size	= 0;
+/* size in bytes */
+UNIV_INTERN ulint	srv_mem_pool_size	= ULINT_MAX;
+UNIV_INTERN ulint	srv_lock_table_size	= ULINT_MAX;
+
+UNIV_INTERN ulint	srv_n_file_io_threads	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_read_io_threads	= 1;
+UNIV_INTERN ulint	srv_n_write_io_threads	= 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN ibool	srv_log_archive_on	= FALSE;
+UNIV_INTERN ibool	srv_archive_recovery	= 0;
+UNIV_INTERN ib_uint64_t	srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* This parameter is used to throttle the number of insert buffers that are
+merged in a batch. By increasing this parameter on a faster disk you can
+possibly reduce the number of I/O operations performed to complete the
+merge operation. The value of this parameter is used as is by the
+background loop when the system is idle (low load); on a busy system
+the parameter is scaled down by a factor of 4, to avoid putting a
+heavier load on the I/O subsystem. */
+
+UNIV_INTERN ulong	srv_insert_buffer_batch_size = 20;
+
+UNIV_INTERN char*	srv_file_flush_method_str = NULL;
+UNIV_INTERN ulint	srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+UNIV_INTERN ulint	srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+
+UNIV_INTERN ulint	srv_max_n_open_files	= 300;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity.
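+For example, with the default value of 90 below, the master thread aims
+to keep at most about 90 % of the pages in the buffer pool modified at
+any one time.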
 */
+
+UNIV_INTERN ulong	srv_max_buf_pool_modified_pct	= 90;
+
+/* this variable counts the total amount of data read (in bytes) */
+UNIV_INTERN ulint	srv_data_read = 0;
+
+/* here we count the total amount of data written (in bytes) */
+UNIV_INTERN ulint	srv_data_written = 0;
+
+/* the number of log write requests done */
+UNIV_INTERN ulint	srv_log_write_requests = 0;
+
+/* the number of physical writes to the log performed */
+UNIV_INTERN ulint	srv_log_writes = 0;
+
+/* amount of data written to the log files in bytes */
+UNIV_INTERN ulint	srv_os_log_written = 0;
+
+/* number of writes currently being done to the log files */
+UNIV_INTERN ulint	srv_os_log_pending_writes = 0;
+
+/* we increase this counter when we don't have enough space in the
+log buffer and have to flush it */
+UNIV_INTERN ulint	srv_log_waits = 0;
+
+/* this variable counts the number of times the doublewrite buffer
+was flushed */
+UNIV_INTERN ulint	srv_dblwr_writes = 0;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+UNIV_INTERN ulint	srv_dblwr_pages_written = 0;
+
+/* in this variable we store the number of write requests issued */
+UNIV_INTERN ulint	srv_buf_pool_write_requests = 0;
+
+/* here we store the number of times when we had to wait for a free page
+in the buffer pool. It happens when the buffer pool is full and we need
+to make a flush, in order to be able to read or create a page. */
+UNIV_INTERN ulint	srv_buf_pool_wait_free = 0;
+
+/* variable to count the number of pages that were written from the
+buffer pool to disk */
+UNIV_INTERN ulint	srv_buf_pool_flushed = 0;
+
+/* variable to count the number of buffer pool reads that led to the
+reading of a disk page */
+UNIV_INTERN ulint	srv_buf_pool_reads = 0;
+
+/* variable to count the number of sequential read-aheads */
+UNIV_INTERN ulint	srv_read_ahead_seq = 0;
+
+/* variable to count the number of random read-aheads */
+UNIV_INTERN ulint	srv_read_ahead_rnd = 0;
+
+/* structure to pass status variables to MySQL */
+UNIV_INTERN export_struc export_vars;
+
+/* If the following is != 0 we do not allow inserts etc. This protects
+the user from forgetting the innodb_force_recovery keyword to my.cnf */
+
+UNIV_INTERN ulint	srv_force_recovery	= 0;
+/*-----------------------*/
+/* We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+UNIV_INTERN ulint	srv_max_n_threads	= 0;
+
+/* The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. MySQL creates a thread for each user session, and
+semaphore contention and convoy problems can occur without this restriction.
+Value 10 should be good if there are fewer than 4 processors + 4 disks in the
+computer. Bigger computers need bigger values. Value 0 will disable the
+concurrency check. */
+
+UNIV_INTERN ulong	srv_thread_concurrency	= 0;
+UNIV_INTERN ulong	srv_commit_concurrency	= 0;
+
+/* this mutex protects srv_conc data structures */
+UNIV_INTERN os_fast_mutex_t	srv_conc_mutex;
+/* number of transactions that have declared_to_be_inside_innodb set.
+It used to be a non-error for this value to drop below zero temporarily.
+This is no longer true. We'll, however, keep the lint datatype to add
+assertions to catch any corner cases that we may have missed.
 */
+UNIV_INTERN lint	srv_conc_n_threads	= 0;
+/* number of OS threads waiting in the FIFO for permission to enter
+InnoDB */
+UNIV_INTERN ulint	srv_conc_n_waiting_threads = 0;
+
+typedef struct srv_conc_slot_struct	srv_conc_slot_t;
+struct srv_conc_slot_struct{
+	os_event_t	event;		/* event to wait */
+	ibool		reserved;	/* TRUE if slot
+					reserved */
+	ibool		wait_ended;	/* TRUE when another
+					thread has already set
+					the event and the
+					thread in this slot is
+					free to proceed; but
+					reserved may still be
+					TRUE at that point */
+	UT_LIST_NODE_T(srv_conc_slot_t)	srv_conc_queue;	/* queue node */
+};
+
+/* queue of threads waiting to get in */
+UNIV_INTERN UT_LIST_BASE_NODE_T(srv_conc_slot_t)	srv_conc_queue;
+/* array of wait slots */
+UNIV_INTERN srv_conc_slot_t*	srv_conc_slots;
+
+/* Number of times a thread is allowed to enter InnoDB within the same
+SQL query after it has once got the ticket at srv_conc_enter_innodb */
+#define SRV_FREE_TICKETS_TO_ENTER	srv_n_free_tickets_to_enter
+#define SRV_THREAD_SLEEP_DELAY		srv_thread_sleep_delay
+/*-----------------------*/
+/* If the following is set to 1 then we do not run purge and insert buffer
+merge to completion before shutdown. If it is set to 2, do not even flush the
+buffer pool to data files at the shutdown: we effectively 'crash'
+InnoDB (but lose no committed transactions). */
+UNIV_INTERN ulint	srv_fast_shutdown	= 0;
+
+/* Generate a innodb_status.<pid> file */
+UNIV_INTERN ibool	srv_innodb_status	= FALSE;
+
+/* When estimating number of different key values in an index, sample
+this many index pages */
+UNIV_INTERN unsigned long long	srv_stats_sample_pages = 8;
+
+UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
+UNIV_INTERN ibool	srv_use_checksums = TRUE;
+
+UNIV_INTERN ibool	srv_set_thread_priorities = TRUE;
+UNIV_INTERN int		srv_query_thread_priority = 0;
+
+UNIV_INTERN ulong	srv_replication_delay	= 0;
+
+UNIV_INTERN ulint	srv_io_capacity = 100;
+
+/* Returns the number of IO operations that is X percent of the capacity.
+PCT_IO(5) -> returns the number of IO operations that is 5% of the max
+where max is srv_io_capacity.
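+For example, with the default srv_io_capacity of 100, PCT_IO(5)
+evaluates to (ulint) (100 * (5 / 100.0)) = 5 i/o operations.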
*/ +#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0))) + +UNIV_INTERN long long srv_ibuf_max_size = 0; +UNIV_INTERN ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */ +UNIV_INTERN ulint srv_ibuf_accel_rate = 100; +#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0))) + +UNIV_INTERN ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */ + +UNIV_INTERN ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */ +UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ +UNIV_INTERN ulint srv_adaptive_checkpoint = 0; /* 0:disable 1:enable */ + +UNIV_INTERN ulint srv_extra_rsegments = 0; /* extra rseg for users */ +/*-------------------------------------------*/ +UNIV_INTERN ulong srv_n_spin_wait_rounds = 20; +UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500; +UNIV_INTERN ulong srv_thread_sleep_delay = 10000; +UNIV_INTERN ulint srv_spin_wait_delay = 5; +UNIV_INTERN ibool srv_priority_boost = TRUE; + +#ifdef UNIV_DEBUG +UNIV_INTERN ibool srv_print_thread_releases = FALSE; +UNIV_INTERN ibool srv_print_lock_waits = FALSE; +UNIV_INTERN ibool srv_print_buf_io = FALSE; +UNIV_INTERN ibool srv_print_log_io = FALSE; +UNIV_INTERN ibool srv_print_latch_waits = FALSE; +#endif /* UNIV_DEBUG */ + +UNIV_INTERN ulint srv_n_rows_inserted = 0; +UNIV_INTERN ulint srv_n_rows_updated = 0; +UNIV_INTERN ulint srv_n_rows_deleted = 0; +UNIV_INTERN ulint srv_n_rows_read = 0; +#ifndef UNIV_HOTBACKUP +static ulint srv_n_rows_inserted_old = 0; +static ulint srv_n_rows_updated_old = 0; +static ulint srv_n_rows_deleted_old = 0; +static ulint srv_n_rows_read_old = 0; +#endif /* !UNIV_HOTBACKUP */ + +UNIV_INTERN ulint srv_n_lock_wait_count = 0; +UNIV_INTERN ulint srv_n_lock_wait_current_count = 0; +UNIV_INTERN ib_int64_t srv_n_lock_wait_time = 0; +UNIV_INTERN ulint srv_n_lock_max_wait_time = 0; + + +/* + Set the following to 0 if you want InnoDB to write messages on + stderr on startup/shutdown +*/ +UNIV_INTERN ibool srv_print_verbose_log = TRUE; +UNIV_INTERN ibool srv_print_innodb_monitor = FALSE; +UNIV_INTERN ibool srv_print_innodb_lock_monitor = FALSE; +UNIV_INTERN ibool srv_print_innodb_tablespace_monitor = FALSE; +UNIV_INTERN ibool srv_print_innodb_table_monitor = FALSE; + +/* Array of English strings describing the current state of an +i/o handler thread */ + +UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS]; +UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; + +UNIV_INTERN time_t srv_last_monitor_time; + +UNIV_INTERN mutex_t srv_innodb_monitor_mutex; + +/* Mutex for locking srv_monitor_file */ +UNIV_INTERN mutex_t srv_monitor_file_mutex; +/* Temporary file for innodb monitor output */ +UNIV_INTERN FILE* srv_monitor_file; +/* Mutex for locking srv_dict_tmpfile. +This mutex has a very high rank; threads reserving it should not +be holding any InnoDB latches. */ +UNIV_INTERN mutex_t srv_dict_tmpfile_mutex; +/* Temporary file for output from the data dictionary */ +UNIV_INTERN FILE* srv_dict_tmpfile; +/* Mutex for locking srv_misc_tmpfile. +This mutex has a very low rank; threads reserving it should not +acquire any further latches or sleep before releasing this one. 
*/
+UNIV_INTERN mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+UNIV_INTERN FILE* srv_misc_tmpfile;
+
+UNIV_INTERN ulint srv_main_thread_process_no = 0;
+UNIV_INTERN ulint srv_main_thread_id = 0;
+
+/*
+ IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+ =========================================
+
+There is the following analogy between this database
+server and an operating system kernel:
+
+DB concept equivalent OS concept
+---------- ---------------------
+transaction -- process;
+
+query thread -- thread;
+
+lock -- semaphore;
+
+transaction set to
+the rollback state -- kill signal delivered to a process;
+
+kernel -- kernel;
+
+query thread execution:
+(a) without kernel mutex
+reserved -- process executing in user mode;
+(b) with kernel mutex reserved
+ -- process executing in kernel mode;
+
+The server is controlled by a master thread which runs at
+a priority higher than normal, that is, higher than user threads.
+It sleeps most of the time, and wakes up, say, every 300 milliseconds,
+to check whether there is anything happening in the server which
+requires intervention of the master thread. Such situations may be,
+for example, when flushing of dirty blocks is needed in the buffer
+pool or old versions of database rows have to be cleaned away.
+
+The threads which we call user threads serve the queries of
+the clients and input from the console of the server.
+They run at normal priority. The server may have several
+communications endpoints. A dedicated set of user threads waits
+at each of these endpoints ready to receive a client request.
+Each request is taken by a single user thread, which then starts
+processing and, when the result is ready, sends it to the client
+and returns to wait at the same endpoint the thread started from.
+
+So, we do not have dedicated communication threads listening at
+the endpoints and dealing out the jobs to dedicated worker threads.
+Our architecture saves one thread switch per request, compared
+to the solution with dedicated communication threads,
+which amounts to 15 microseconds on a 100 MHz Pentium
+running NT. If the client
+is communicating over a network, this saving is negligible, but
+if the client resides on the same machine, maybe in an SMP machine
+on a different processor from the server thread, the saving
+can be important as the threads can communicate over shared
+memory with an overhead of a few microseconds.
+
+We may later implement a dedicated communication thread solution
+for those endpoints which communicate over a network.
+
+Our solution with user threads has two problems: for each endpoint
+there has to be a number of listening threads. If there are many
+communication endpoints, it may be difficult to set the right number
+of concurrent threads in the system, as many of the threads
+may always be waiting at less busy endpoints. Another problem
+is queuing of the messages, as the server internally does not
+offer any queue for jobs.
+
+Another group of user threads is intended for splitting the
+queries and processing them in parallel. Let us call these
+parallel communication threads. These threads are waiting for
+parallelized tasks, suspended on event semaphores.
+
+A single user thread waits for input from the console,
+such as a command to shut down the database.
+
+Utility threads are a different group of threads which take
+care of the buffer pool flushing and other, mainly background,
+operations in the server.
+
+Some of these utility threads always run at a lower than normal
+priority, so that they are always in the background. Some of them
+may dynamically boost their priority by the pri_adjust function,
+even to higher than normal priority, if their task becomes urgent.
+The running of utilities is controlled by high- and low-water marks
+of urgency. The urgency may be measured by the number of dirty blocks
+in the buffer pool, in the case of the flush thread, for example.
+When the high-water mark is exceeded, a utility starts running, until
+the urgency drops under the low-water mark. Then the utility thread
+suspends itself to wait for an event. The master thread is
+responsible for signaling this event when the utility thread is
+again needed.
+
+For each individual type of utility, some threads always remain
+at lower than normal priority. This is because pri_adjust is implemented
+so that the threads at normal or higher priority control their
+share of running time by calling sleep. Thus, if the load of the
+system suddenly drops, these threads cannot necessarily utilize
+the system fully. The background priority threads make up for this,
+starting to run when the load drops.
+
+When there is no activity in the system, the master thread also
+suspends itself to wait for an event, making
+the server totally silent. The responsibility to signal this
+event is on the user thread which again receives a message
+from a client.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., a mutex) needed by a user
+thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely long for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to
+keep a record of which thread owns which resource and
+in the above case boost the priority of the background thread
+so that it will be scheduled and it can release the resource.
+This solution is called priority inheritance in real-time programming.
+A drawback of this solution is that the overhead of acquiring a mutex
+increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
+the thread has to call os_thread_get_curr_id.
+This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
+pair. Note that the thread
+cannot store the information in the resource, say mutex, itself,
+because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it is stored afterwards,
+the information is outdated for the time of one machine instruction,
+at least. (To be precise, the information could be stored to
+lock_word in mutex if the machine supports atomic swap.)
+
+The above solution with priority inheritance may become relevant in the
+future, but at the moment we plan to implement a more coarse solution,
+which could be called a global priority inheritance. If a thread
+has to wait for a long time, say 300 milliseconds, for a resource,
+we just guess that it may be waiting for a resource owned by a background
+thread, and boost the priority of all runnable background threads
+to the normal level. The background threads then themselves adjust
+their fixed priority back to background after releasing all resources
+they had (or, at some fixed points in their program code).
+
+What is the performance of the global priority inheritance solution?
+We may weigh the length of the wait time, 300 milliseconds, during
+which the system processes some other thread,
+against the cost of boosting the priority of each runnable background
+thread, rescheduling it, and lowering the priority again.
+On a 100 MHz Pentium running NT this overhead may be on the order of 100
+microseconds per thread. So, if the number of runnable background
+threads is not very big, say < 100, the cost is tolerable.
+Utility threads will probably access resources used by
+user threads only rarely, so collisions of user threads
+with preempted utility threads should not happen very often.
+
+The thread table contains
+information on the current status of each thread existing in the system,
+and also the event semaphores used in suspending the master thread
+and utility and parallel communication threads when they have nothing to do.
+The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation.
+
+The thread table is also used in the global priority inheritance
+scheme. This brings in one additional complication: threads accessing
+the thread table must have at least normal fixed priority,
+because the priority inheritance solution does not work if a background
+thread is preempted while possessing the mutex protecting the thread table.
+So, if a thread accesses the thread table, its priority has to be
+boosted at least to normal. This priority requirement can be seen as similar
+to the privileged mode used when processing kernel calls in traditional
+Unix.*/
+
+/* Thread slot in the thread table */
+struct srv_slot_struct{
+ os_thread_id_t id; /* thread id */
+ os_thread_t handle; /* thread handle */
+ unsigned type:3; /* thread type: user, utility etc. */
+ unsigned in_use:1; /* TRUE if this slot is in use */
+ unsigned suspended:1; /* TRUE if the thread is waiting
+ for the event of this slot */
+ ib_time_t suspend_time; /* time when the thread was
+ suspended */
+ os_event_t event; /* event used in suspending the
+ thread when it has nothing to do */
+ que_thr_t* thr; /* suspended query thread (only
+ used for MySQL threads) */
+};
+
+/* Table for MySQL threads where they will be suspended to wait for locks */
+UNIV_INTERN srv_slot_t* srv_mysql_table = NULL;
+
+UNIV_INTERN os_event_t srv_lock_timeout_thread_event;
+
+UNIV_INTERN srv_sys_t* srv_sys = NULL;
+
+/* padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte srv_pad1[64];
+/* mutex protecting the server, trx structs, query threads, and lock table */
+UNIV_INTERN mutex_t* kernel_mutex_temp;
+/* padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte srv_pad2[64];
+
+#if 0
+/* The following three values measure the urgency of the jobs of
+buffer, version, and insert threads. They may vary from 0 - 1000.
+The server mutex protects all these variables. The low-water values
+tell that the server can quiesce the utility when the value
+drops below this low-water mark. */
+
+static ulint srv_meter[SRV_MASTER + 1];
+static ulint srv_meter_low_water[SRV_MASTER + 1];
+static ulint srv_meter_high_water[SRV_MASTER + 1];
+static ulint srv_meter_high_water2[SRV_MASTER + 1];
+static ulint srv_meter_foreground[SRV_MASTER + 1];
+#endif
+
+/* The following values give info about the activity going on in
+the database. They are protected by the server mutex. The arrays
+are indexed by the type of the thread.
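+For example, srv_n_threads_active[SRV_MASTER] drops to zero when the master
+thread suspends itself; srv_active_wake_master_thread() below checks exactly
+this before waking it up again.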
*/ + +UNIV_INTERN ulint srv_n_threads_active[SRV_MASTER + 1]; +UNIV_INTERN ulint srv_n_threads[SRV_MASTER + 1]; + +/************************************************************************* +Sets the info describing an i/o thread current state. */ +UNIV_INTERN +void +srv_set_io_thread_op_info( +/*======================*/ + ulint i, /* in: the 'segment' of the i/o thread */ + const char* str) /* in: constant char string describing the + state */ +{ + ut_a(i < SRV_MAX_N_IO_THREADS); + + srv_io_thread_op_info[i] = str; +} + +/************************************************************************* +Accessor function to get pointer to n'th slot in the server thread +table. */ +static +srv_slot_t* +srv_table_get_nth_slot( +/*===================*/ + /* out: pointer to the slot */ + ulint index) /* in: index of the slot */ +{ + ut_a(index < OS_THREAD_MAX_N); + + return(srv_sys->threads + index); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************* +Gets the number of threads in the system. */ +UNIV_INTERN +ulint +srv_get_n_threads(void) +/*===================*/ +{ + ulint i; + ulint n_threads = 0; + + mutex_enter(&kernel_mutex); + + for (i = SRV_COM; i < SRV_MASTER + 1; i++) { + + n_threads += srv_n_threads[i]; + } + + mutex_exit(&kernel_mutex); + + return(n_threads); +} + +/************************************************************************* +Reserves a slot in the thread table for the current thread. Also creates the +thread local storage struct for the current thread. NOTE! The server mutex +has to be reserved by the caller! */ +static +ulint +srv_table_reserve_slot( +/*===================*/ + /* out: reserved slot index */ + enum srv_thread_type type) /* in: type of the thread */ +{ + srv_slot_t* slot; + ulint i; + + ut_a(type > 0); + ut_a(type <= SRV_MASTER); + + i = 0; + slot = srv_table_get_nth_slot(i); + + while (slot->in_use) { + i++; + slot = srv_table_get_nth_slot(i); + } + + ut_a(slot->in_use == FALSE); + + slot->in_use = TRUE; + slot->suspended = FALSE; + slot->type = type; + slot->id = os_thread_get_curr_id(); + slot->handle = os_thread_get_curr(); + + thr_local_create(); + + thr_local_set_slot_no(os_thread_get_curr_id(), i); + + return(i); +} + +/************************************************************************* +Suspends the calling thread to wait for the event in its thread slot. +NOTE! The server mutex has to be reserved by the caller! */ +static +os_event_t +srv_suspend_thread(void) +/*====================*/ + /* out: event for the calling thread to wait */ +{ + srv_slot_t* slot; + os_event_t event; + ulint slot_no; + enum srv_thread_type type; + + ut_ad(mutex_own(&kernel_mutex)); + + slot_no = thr_local_get_slot_no(os_thread_get_curr_id()); + + if (srv_print_thread_releases) { + fprintf(stderr, + "Suspending thread %lu to slot %lu\n", + (ulong) os_thread_get_curr_id(), (ulong) slot_no); + } + + slot = srv_table_get_nth_slot(slot_no); + + type = slot->type; + + ut_ad(type >= SRV_WORKER); + ut_ad(type <= SRV_MASTER); + + event = slot->event; + + slot->suspended = TRUE; + + ut_ad(srv_n_threads_active[type] > 0); + + srv_n_threads_active[type]--; + + os_event_reset(event); + + return(event); +} +#endif /* !UNIV_HOTBACKUP */ + +/************************************************************************* +Releases threads of the type given from suspension in the thread table. +NOTE! The server mutex has to be reserved by the caller! 
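+This is the counterpart of srv_suspend_thread() above: for each matching
+suspended slot it clears the suspended flag, increments
+srv_n_threads_active[] again, and sets the event that the waiting thread
+blocks on.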
*/ +UNIV_INTERN +ulint +srv_release_threads( +/*================*/ + /* out: number of threads + released: this may be < n if + not enough threads were + suspended at the moment */ + enum srv_thread_type type, /* in: thread type */ + ulint n) /* in: number of threads to release */ +{ + srv_slot_t* slot; + ulint i; + ulint count = 0; + + ut_ad(type >= SRV_WORKER); + ut_ad(type <= SRV_MASTER); + ut_ad(n > 0); + ut_ad(mutex_own(&kernel_mutex)); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + slot = srv_table_get_nth_slot(i); + + if (slot->in_use && slot->type == type && slot->suspended) { + + slot->suspended = FALSE; + + srv_n_threads_active[type]++; + + os_event_set(slot->event); + + if (srv_print_thread_releases) { + fprintf(stderr, + "Releasing thread %lu type %lu" + " from slot %lu\n", + (ulong) slot->id, (ulong) type, + (ulong) i); + } + + count++; + + if (count == n) { + break; + } + } + } + + return(count); +} + +/************************************************************************* +Returns the calling thread type. */ +UNIV_INTERN +enum srv_thread_type +srv_get_thread_type(void) +/*=====================*/ + /* out: SRV_COM, ... */ +{ + ulint slot_no; + srv_slot_t* slot; + enum srv_thread_type type; + + mutex_enter(&kernel_mutex); + + slot_no = thr_local_get_slot_no(os_thread_get_curr_id()); + + slot = srv_table_get_nth_slot(slot_no); + + type = slot->type; + + ut_ad(type >= SRV_WORKER); + ut_ad(type <= SRV_MASTER); + + mutex_exit(&kernel_mutex); + + return(type); +} + +/************************************************************************* +Initializes the server. */ +UNIV_INTERN +void +srv_init(void) +/*==========*/ +{ + srv_conc_slot_t* conc_slot; + srv_slot_t* slot; + dict_table_t* table; + ulint i; + + srv_sys = mem_alloc(sizeof(srv_sys_t)); + + kernel_mutex_temp = mem_alloc(sizeof(mutex_t)); + mutex_create(&kernel_mutex, SYNC_KERNEL); + + mutex_create(&srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK); + + srv_sys->threads = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t)); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + slot = srv_table_get_nth_slot(i); + slot->in_use = FALSE; + slot->type=0; /* Avoid purify errors */ + slot->event = os_event_create(NULL); + ut_a(slot->event); + } + + srv_mysql_table = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t)); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + slot = srv_mysql_table + i; + slot->in_use = FALSE; + slot->type = 0; + slot->event = os_event_create(NULL); + ut_a(slot->event); + } + + srv_lock_timeout_thread_event = os_event_create(NULL); + + for (i = 0; i < SRV_MASTER + 1; i++) { + srv_n_threads_active[i] = 0; + srv_n_threads[i] = 0; +#if 0 + srv_meter[i] = 30; + srv_meter_low_water[i] = 50; + srv_meter_high_water[i] = 100; + srv_meter_high_water2[i] = 200; + srv_meter_foreground[i] = 250; +#endif + } + + UT_LIST_INIT(srv_sys->tasks); + + /* create dummy table and index for old-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY1", + DICT_HDR_SPACE, 1, 0); + dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 8); + + srv_sys->dummy_ind1 = dict_mem_index_create( + "SYS_DUMMY1", "SYS_DUMMY1", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind1, table, + dict_table_get_nth_col(table, 0), 0); + srv_sys->dummy_ind1->table = table; + /* create dummy table and index for new-style infimum and supremum */ + table = dict_mem_table_create("SYS_DUMMY2", + DICT_HDR_SPACE, 1, DICT_TF_COMPACT); + dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR, + DATA_ENGLISH | DATA_NOT_NULL, 
8); + srv_sys->dummy_ind2 = dict_mem_index_create( + "SYS_DUMMY2", "SYS_DUMMY2", DICT_HDR_SPACE, 0, 1); + dict_index_add_col(srv_sys->dummy_ind2, table, + dict_table_get_nth_col(table, 0), 0); + srv_sys->dummy_ind2->table = table; + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + srv_sys->dummy_ind1->cached = srv_sys->dummy_ind2->cached = TRUE; + + /* Init the server concurrency restriction data structures */ + + os_fast_mutex_init(&srv_conc_mutex); + + UT_LIST_INIT(srv_conc_queue); + + srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t)); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + conc_slot = srv_conc_slots + i; + conc_slot->reserved = FALSE; + conc_slot->event = os_event_create(NULL); + ut_a(conc_slot->event); + } + + /* Initialize some INFORMATION SCHEMA internal structures */ + trx_i_s_cache_init(trx_i_s_cache); +} + +/************************************************************************* +Frees the OS fast mutex created in srv_init(). */ +UNIV_INTERN +void +srv_free(void) +/*==========*/ +{ + os_fast_mutex_free(&srv_conc_mutex); +} + +/************************************************************************* +Initializes the synchronization primitives, memory system, and the thread +local storage. */ +UNIV_INTERN +void +srv_general_init(void) +/*==================*/ +{ + ut_mem_init(); + os_sync_init(); + sync_init(); + mem_init(srv_mem_pool_size); + thr_local_init(); +} + +/*======================= InnoDB Server FIFO queue =======================*/ + +/* Maximum allowable purge history length. <=0 means 'infinite'. */ +UNIV_INTERN ulong srv_max_purge_lag = 0; + +/************************************************************************* +Puts an OS thread to wait if there are too many concurrent threads +(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. 
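+A transaction first consumes one of its SRV_FREE_TICKETS_TO_ENTER tickets,
+if any are left. Once the tickets are used up, it competes for entry: it may
+sleep once for SRV_THREAD_SLEEP_DELAY microseconds, and if InnoDB is still
+full after that, it reserves a slot in srv_conc_slots and waits on the slot
+event until a leaving thread releases it.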
*/ +UNIV_INTERN +void +srv_conc_enter_innodb( +/*==================*/ + trx_t* trx) /* in: transaction object associated with the + thread */ +{ + ibool has_slept = FALSE; + srv_conc_slot_t* slot = NULL; + ulint i; + + if (trx->mysql_thd != NULL + && thd_is_replication_slave_thread(trx->mysql_thd)) { + + UT_WAIT_FOR(srv_conc_n_threads + < (lint)srv_thread_concurrency, + srv_replication_delay * 1000); + + return; + } + + /* If trx has 'free tickets' to enter the engine left, then use one + such ticket */ + + if (trx->n_tickets_to_enter_innodb > 0) { + trx->n_tickets_to_enter_innodb--; + + return; + } + + os_fast_mutex_lock(&srv_conc_mutex); +retry: + if (trx->declared_to_be_inside_innodb) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: trying to declare trx" + " to enter InnoDB, but\n" + "InnoDB: it already is declared.\n", stderr); + trx_print(stderr, trx, 0); + putc('\n', stderr); + os_fast_mutex_unlock(&srv_conc_mutex); + + return; + } + + ut_ad(srv_conc_n_threads >= 0); + + if (srv_conc_n_threads < (lint)srv_thread_concurrency) { + + srv_conc_n_threads++; + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER; + + os_fast_mutex_unlock(&srv_conc_mutex); + + return; + } + + /* If the transaction is not holding resources, let it sleep + for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */ + + if (!has_slept && !trx->has_search_latch + && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) { + + has_slept = TRUE; /* We let it sleep only once to avoid + starvation */ + + srv_conc_n_waiting_threads++; + + os_fast_mutex_unlock(&srv_conc_mutex); + + trx->op_info = "sleeping before joining InnoDB queue"; + + /* Peter Zaitsev suggested that we take the sleep away + altogether. But the sleep may be good in pathological + situations of lots of thread switches. Simply put some + threads aside for a while to reduce the number of thread + switches. 
*/ + if (SRV_THREAD_SLEEP_DELAY > 0) { + os_thread_sleep(SRV_THREAD_SLEEP_DELAY); + } + + trx->op_info = ""; + + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc_n_waiting_threads--; + + goto retry; + } + + /* Too many threads inside: put the current thread to a queue */ + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + slot = srv_conc_slots + i; + + if (!slot->reserved) { + + break; + } + } + + if (i == OS_THREAD_MAX_N) { + /* Could not find a free wait slot, we must let the + thread enter */ + + srv_conc_n_threads++; + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = 0; + + os_fast_mutex_unlock(&srv_conc_mutex); + + return; + } + + /* Release possible search system latch this thread has */ + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + /* Add to the queue */ + slot->reserved = TRUE; + slot->wait_ended = FALSE; + + UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot); + + os_event_reset(slot->event); + + srv_conc_n_waiting_threads++; + + os_fast_mutex_unlock(&srv_conc_mutex); + + /* Go to wait for the event; when a thread leaves InnoDB it will + release this thread */ + + trx->op_info = "waiting in InnoDB queue"; + + os_event_wait(slot->event); + + trx->op_info = ""; + + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc_n_waiting_threads--; + + /* NOTE that the thread which released this thread already + incremented the thread counter on behalf of this thread */ + + slot->reserved = FALSE; + + UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot); + + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER; + + os_fast_mutex_unlock(&srv_conc_mutex); +} + +/************************************************************************* +This lets a thread enter InnoDB regardless of the number of threads inside +InnoDB. This must be called when a thread ends a lock wait. */ +UNIV_INTERN +void +srv_conc_force_enter_innodb( +/*========================*/ + trx_t* trx) /* in: transaction object associated with the + thread */ +{ + if (UNIV_LIKELY(!srv_thread_concurrency)) { + + return; + } + + ut_ad(srv_conc_n_threads >= 0); + + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc_n_threads++; + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = 1; + + os_fast_mutex_unlock(&srv_conc_mutex); +} + +/************************************************************************* +This must be called when a thread exits InnoDB in a lock wait or at the +end of an SQL statement. 
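+It also wakes the longest-waiting thread in the FIFO queue, if there is one,
+and increments srv_conc_n_threads on behalf of the thread it releases.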
*/ +UNIV_INTERN +void +srv_conc_force_exit_innodb( +/*=======================*/ + trx_t* trx) /* in: transaction object associated with the + thread */ +{ + srv_conc_slot_t* slot = NULL; + + if (trx->mysql_thd != NULL + && thd_is_replication_slave_thread(trx->mysql_thd)) { + + return; + } + + if (trx->declared_to_be_inside_innodb == FALSE) { + + return; + } + + os_fast_mutex_lock(&srv_conc_mutex); + + ut_ad(srv_conc_n_threads > 0); + srv_conc_n_threads--; + trx->declared_to_be_inside_innodb = FALSE; + trx->n_tickets_to_enter_innodb = 0; + + if (srv_conc_n_threads < (lint)srv_thread_concurrency) { + /* Look for a slot where a thread is waiting and no other + thread has yet released the thread */ + + slot = UT_LIST_GET_FIRST(srv_conc_queue); + + while (slot && slot->wait_ended == TRUE) { + slot = UT_LIST_GET_NEXT(srv_conc_queue, slot); + } + + if (slot != NULL) { + slot->wait_ended = TRUE; + + /* We increment the count on behalf of the released + thread */ + + srv_conc_n_threads++; + } + } + + os_fast_mutex_unlock(&srv_conc_mutex); + + if (slot != NULL) { + os_event_set(slot->event); + } +} + +/************************************************************************* +This must be called when a thread exits InnoDB. */ +UNIV_INTERN +void +srv_conc_exit_innodb( +/*=================*/ + trx_t* trx) /* in: transaction object associated with the + thread */ +{ + if (trx->n_tickets_to_enter_innodb > 0) { + /* We will pretend the thread is still inside InnoDB though it + now leaves the InnoDB engine. In this way we save + a lot of semaphore operations. srv_conc_force_exit_innodb is + used to declare the thread definitely outside InnoDB. It + should be called when there is a lock wait or an SQL statement + ends. */ + + return; + } + + srv_conc_force_exit_innodb(trx); +} + +/*========================================================================*/ + +/************************************************************************* +Normalizes init parameter values to use units we use inside InnoDB. */ +static +ulint +srv_normalize_init_values(void) +/*===========================*/ + /* out: DB_SUCCESS or error code */ +{ + ulint n; + ulint i; + + n = srv_n_data_files; + + for (i = 0; i < n; i++) { + srv_data_file_sizes[i] = srv_data_file_sizes[i] + * ((1024 * 1024) / UNIV_PAGE_SIZE); + } + + srv_last_file_size_max = srv_last_file_size_max + * ((1024 * 1024) / UNIV_PAGE_SIZE); + + srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE; + + srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE; + + srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE); + + return(DB_SUCCESS); +} + +/************************************************************************* +Boots the InnoDB server. */ +UNIV_INTERN +ulint +srv_boot(void) +/*==========*/ + /* out: DB_SUCCESS or error code */ +{ + ulint err; + + /* Transform the init parameter values given by MySQL to + use units we use inside InnoDB: */ + + err = srv_normalize_init_values(); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Initialize synchronization primitives, memory management, and thread + local storage */ + + srv_general_init(); + + /* Initialize this module */ + + srv_init(); + + return(DB_SUCCESS); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************* +Reserves a slot in the thread table for the current MySQL OS thread. +NOTE! The kernel mutex has to be reserved by the caller! 
*/ +static +srv_slot_t* +srv_table_reserve_slot_for_mysql(void) +/*==================================*/ + /* out: reserved slot */ +{ + srv_slot_t* slot; + ulint i; + + ut_ad(mutex_own(&kernel_mutex)); + + i = 0; + slot = srv_mysql_table + i; + + while (slot->in_use) { + i++; + + if (i >= OS_THREAD_MAX_N) { + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: There appear to be %lu MySQL" + " threads currently waiting\n" + "InnoDB: inside InnoDB, which is the" + " upper limit. Cannot continue operation.\n" + "InnoDB: We intentionally generate" + " a seg fault to print a stack trace\n" + "InnoDB: on Linux. But first we print" + " a list of waiting threads.\n", (ulong) i); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + slot = srv_mysql_table + i; + + fprintf(stderr, + "Slot %lu: thread id %lu, type %lu," + " in use %lu, susp %lu, time %lu\n", + (ulong) i, + (ulong) os_thread_pf(slot->id), + (ulong) slot->type, + (ulong) slot->in_use, + (ulong) slot->suspended, + (ulong) difftime(ut_time(), + slot->suspend_time)); + } + + ut_error; + } + + slot = srv_mysql_table + i; + } + + ut_a(slot->in_use == FALSE); + + slot->in_use = TRUE; + slot->id = os_thread_get_curr_id(); + slot->handle = os_thread_get_curr(); + + return(slot); +} +#endif /* !UNIV_HOTBACKUP */ + +/******************************************************************* +Puts a MySQL OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. */ +UNIV_INTERN +void +srv_suspend_mysql_thread( +/*=====================*/ + que_thr_t* thr) /* in: query thread associated with the MySQL + OS thread */ +{ +#ifndef UNIV_HOTBACKUP + srv_slot_t* slot; + os_event_t event; + double wait_time; + trx_t* trx; + ulint had_dict_lock; + ibool was_declared_inside_innodb = FALSE; + ib_int64_t start_time = 0; + ib_int64_t finish_time; + ulint diff_time; + ulint sec; + ulint ms; + ulong lock_wait_timeout; + + ut_ad(!mutex_own(&kernel_mutex)); + + trx = thr_get_trx(thr); + + os_event_set(srv_lock_timeout_thread_event); + + mutex_enter(&kernel_mutex); + + trx->error_state = DB_SUCCESS; + + if (thr->state == QUE_THR_RUNNING) { + + ut_ad(thr->is_active == TRUE); + + /* The lock has already been released or this transaction + was chosen as a deadlock victim: no need to suspend */ + + if (trx->was_chosen_as_deadlock_victim) { + + trx->error_state = DB_DEADLOCK; + trx->was_chosen_as_deadlock_victim = FALSE; + } + + mutex_exit(&kernel_mutex); + + return; + } + + ut_ad(thr->is_active == FALSE); + + slot = srv_table_reserve_slot_for_mysql(); + + event = slot->event; + + slot->thr = thr; + + os_event_reset(event); + + slot->suspend_time = ut_time(); + + if (thr->lock_state == QUE_THR_LOCK_ROW) { + srv_n_lock_wait_count++; + srv_n_lock_wait_current_count++; + + if (ut_usectime(&sec, &ms) == -1) { + start_time = -1; + } else { + start_time = (ib_int64_t) sec * 1000000 + ms; + } + } + /* Wake the lock timeout monitor thread, if it is suspended */ + + os_event_set(srv_lock_timeout_thread_event); + + mutex_exit(&kernel_mutex); + + if (trx->declared_to_be_inside_innodb) { + + was_declared_inside_innodb = TRUE; + + /* We must declare this OS thread to exit InnoDB, since a + possible other thread holding a lock which this thread waits + for must be allowed to enter, sooner or later */ + + srv_conc_force_exit_innodb(trx); + } + 
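+
+ /* Record and release any data dictionary latch this transaction
+ holds; it is reacquired below, once the lock wait ends. */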
+ had_dict_lock = trx->dict_operation_lock_mode; + + switch (had_dict_lock) { + case RW_S_LATCH: + /* Release foreign key check latch */ + row_mysql_unfreeze_data_dictionary(trx); + break; + case RW_X_LATCH: + /* Release fast index creation latch */ + row_mysql_unlock_data_dictionary(trx); + break; + } + + ut_a(trx->dict_operation_lock_mode == 0); + + /* Suspend this thread and wait for the event. */ + + os_event_wait(event); + + /* After resuming, reacquire the data dictionary latch if + necessary. */ + + switch (had_dict_lock) { + case RW_S_LATCH: + row_mysql_freeze_data_dictionary(trx); + break; + case RW_X_LATCH: + row_mysql_lock_data_dictionary(trx); + break; + } + + if (was_declared_inside_innodb) { + + /* Return back inside InnoDB */ + + srv_conc_force_enter_innodb(trx); + } + + mutex_enter(&kernel_mutex); + + /* Release the slot for others to use */ + + slot->in_use = FALSE; + + wait_time = ut_difftime(ut_time(), slot->suspend_time); + + if (thr->lock_state == QUE_THR_LOCK_ROW) { + if (ut_usectime(&sec, &ms) == -1) { + finish_time = -1; + } else { + finish_time = (ib_int64_t) sec * 1000000 + ms; + } + + diff_time = (ulint) (finish_time - start_time); + + srv_n_lock_wait_current_count--; + srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time; + if (diff_time > srv_n_lock_max_wait_time && + /* only update the variable if we successfully + retrieved the start and finish times. See Bug#36819. */ + start_time != -1 && finish_time != -1) { + srv_n_lock_max_wait_time = diff_time; + } + } + + if (trx->was_chosen_as_deadlock_victim) { + + trx->error_state = DB_DEADLOCK; + trx->was_chosen_as_deadlock_victim = FALSE; + } + + mutex_exit(&kernel_mutex); + + /* InnoDB system transactions (such as the purge, and + incomplete transactions that are being rolled back after crash + recovery) will use the global value of + innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */ + lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd); + + if (lock_wait_timeout < 100000000 + && wait_time > (double) lock_wait_timeout) { + + trx->error_state = DB_LOCK_WAIT_TIMEOUT; + } +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ +} + +/************************************************************************ +Releases a MySQL OS thread waiting for a lock to be released, if the +thread is already suspended. */ +UNIV_INTERN +void +srv_release_mysql_thread_if_suspended( +/*==================================*/ + que_thr_t* thr) /* in: query thread associated with the + MySQL OS thread */ +{ +#ifndef UNIV_HOTBACKUP + srv_slot_t* slot; + ulint i; + + ut_ad(mutex_own(&kernel_mutex)); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + slot = srv_mysql_table + i; + + if (slot->in_use && slot->thr == thr) { + /* Found */ + + os_event_set(slot->event); + + return; + } + } + + /* not found */ +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; +#endif /* UNIV_HOTBACKUP */ +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************** +Refreshes the values used to calculate per-second averages. 
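+It snapshots the running counters (adaptive hash searches, i/o, log and
+buffer pool statistics, and the row operation counts) into the *_old
+variables that srv_printf_innodb_monitor() subtracts from when computing
+per-second rates.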
*/ +static +void +srv_refresh_innodb_monitor_stats(void) +/*==================================*/ +{ + mutex_enter(&srv_innodb_monitor_mutex); + + srv_last_monitor_time = time(NULL); + + os_aio_refresh_stats(); + + btr_cur_n_sea_old = btr_cur_n_sea; + btr_cur_n_non_sea_old = btr_cur_n_non_sea; + + log_refresh_stats(); + + buf_refresh_io_stats(); + + srv_n_rows_inserted_old = srv_n_rows_inserted; + srv_n_rows_updated_old = srv_n_rows_updated; + srv_n_rows_deleted_old = srv_n_rows_deleted; + srv_n_rows_read_old = srv_n_rows_read; + + mutex_exit(&srv_innodb_monitor_mutex); +} + +/********************************************************************** +Outputs to a file the output of the InnoDB Monitor. */ +UNIV_INTERN +void +srv_printf_innodb_monitor( +/*======================*/ + FILE* file, /* in: output stream */ + ulint* trx_start, /* out: file position of the start of + the list of active transactions */ + ulint* trx_end) /* out: file position of the end of + the list of active transactions */ +{ + double time_elapsed; + time_t current_time; + ulint n_reserved; + + ulint btr_search_sys_subtotal; + ulint lock_sys_subtotal; + ulint recv_sys_subtotal; + ulint io_counter_subtotal; + + ulint i; + trx_t* trx; + + mutex_enter(&srv_innodb_monitor_mutex); + + current_time = time(NULL); + + /* We add 0.001 seconds to time_elapsed to prevent division + by zero if two users happen to call SHOW INNODB STATUS at the same + time */ + + time_elapsed = difftime(current_time, srv_last_monitor_time) + + 0.001; + + srv_last_monitor_time = time(NULL); + + fputs("\n=====================================\n", file); + + ut_print_timestamp(file); + fprintf(file, + " INNODB MONITOR OUTPUT\n" + "=====================================\n" + "Per second averages calculated from the last %lu seconds\n", + (ulong)time_elapsed); + + fputs("----------\n" + "SEMAPHORES\n" + "----------\n", file); + sync_print(file); + + /* Conceptually, srv_innodb_monitor_mutex has a very high latching + order level in sync0sync.h, while dict_foreign_err_mutex has a very + low level 135. Therefore we can reserve the latter mutex here without + a danger of a deadlock of threads. 
*/
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ if (ftell(dict_foreign_err_file) != 0L) {
+ fputs("------------------------\n"
+ "LATEST FOREIGN KEY ERROR\n"
+ "------------------------\n", file);
+ ut_copy_file(file, dict_foreign_err_file);
+ }
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+ os_aio_print(file);
+
+ fputs("-------------------------------------\n"
+ "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+ "-------------------------------------\n", file);
+ ibuf_print(file);
+
+ ha_print_info(file, btr_search_sys->hash_index);
+
+ fprintf(file,
+ "%.2f hash searches/s, %.2f non-hash searches/s\n",
+ (btr_cur_n_sea - btr_cur_n_sea_old)
+ / time_elapsed,
+ (btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+ / time_elapsed);
+ btr_cur_n_sea_old = btr_cur_n_sea;
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ fputs("---\n"
+ "LOG\n"
+ "---\n", file);
+ log_print(file);
+
+ fputs("----------------------\n"
+ "BUFFER POOL AND MEMORY\n"
+ "----------------------\n", file);
+ fprintf(file,
+ "Total memory allocated " ULINTPF
+ "; in additional pool allocated " ULINTPF "\n",
+ ut_total_allocated_memory,
+ mem_pool_get_reserved(mem_comm_pool));
+ /* Calculate reserved memory */
+ if (btr_search_sys && btr_search_sys->hash_index->heap) {
+ btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap);
+ } else {
+ btr_search_sys_subtotal = 0;
+ for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) {
+ btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]);
+ }
+ }
+
+ lock_sys_subtotal = 0;
+ if (trx_sys) {
+ mutex_enter(&kernel_mutex);
+ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+ while (trx) {
+ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0);
+ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+ }
+ mutex_exit(&kernel_mutex);
+ }
+
+ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
+ ? mem_heap_get_size(recv_sys->heap) : 0);
+
+ fprintf(file,
+ "Internal hash tables (constant factor + variable factor)\n"
+ " Adaptive hash index %lu \t(%lu + %lu)\n"
+ " Page hash %lu\n"
+ " Dictionary cache %lu \t(%lu + %lu)\n"
+ " File system %lu \t(%lu + %lu)\n"
+ " Lock system %lu \t(%lu + %lu)\n"
+ " Recovery system %lu \t(%lu + %lu)\n"
+ " Threads %lu \t(%lu + %lu)\n",
+
+ (ulong) (btr_search_sys
+ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0)
+ + btr_search_sys_subtotal,
+ (ulong) (btr_search_sys
+ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0),
+ (ulong) btr_search_sys_subtotal,
+
+ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)),
+
+ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+ + dict_sys->table_id_hash->n_cells
+ ) * sizeof(hash_cell_t)
+ + dict_sys->size) : 0),
+ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+ + dict_sys->table_id_hash->n_cells
+ ) * sizeof(hash_cell_t)) : 0),
+ (ulong) (dict_sys ? (dict_sys->size) : 0),
+
+ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
+ + fil_system_hash_nodes()),
+ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
+ (ulong) fil_system_hash_nodes(),
+
+ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
+ + lock_sys_subtotal),
+ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
+ (ulong) lock_sys_subtotal,
+
+ (ulong) (((recv_sys && recv_sys->addr_hash)
+ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
+ + recv_sys_subtotal),
+ (ulong) ((recv_sys && recv_sys->addr_hash)
+ ?
(recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0), + (ulong) recv_sys_subtotal, + + (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t) + + thr_local_hash_nodes()), + (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)), + (ulong) thr_local_hash_nodes()); + + fprintf(file, "Dictionary memory allocated " ULINTPF "\n", + dict_sys->size); + + buf_print_io(file); + + fputs("--------------\n" + "ROW OPERATIONS\n" + "--------------\n", file); + fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n", + (long) srv_conc_n_threads, + (ulong) srv_conc_n_waiting_threads); + + fprintf(file, "%lu read views open inside InnoDB\n", + UT_LIST_GET_LEN(trx_sys->view_list)); + + n_reserved = fil_space_get_n_reserved_extents(0); + if (n_reserved > 0) { + fprintf(file, + "%lu tablespace extents now reserved for" + " B-tree split operations\n", + (ulong) n_reserved); + } + +#ifdef UNIV_LINUX + fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n", + (ulong) srv_main_thread_process_no, + (ulong) srv_main_thread_id, + srv_main_thread_op_info); +#else + fprintf(file, "Main thread id %lu, state: %s\n", + (ulong) srv_main_thread_id, + srv_main_thread_op_info); +#endif + fprintf(file, + "Number of rows inserted " ULINTPF + ", updated " ULINTPF ", deleted " ULINTPF + ", read " ULINTPF "\n", + srv_n_rows_inserted, + srv_n_rows_updated, + srv_n_rows_deleted, + srv_n_rows_read); + fprintf(file, + "%.2f inserts/s, %.2f updates/s," + " %.2f deletes/s, %.2f reads/s\n", + (srv_n_rows_inserted - srv_n_rows_inserted_old) + / time_elapsed, + (srv_n_rows_updated - srv_n_rows_updated_old) + / time_elapsed, + (srv_n_rows_deleted - srv_n_rows_deleted_old) + / time_elapsed, + (srv_n_rows_read - srv_n_rows_read_old) + / time_elapsed); + + srv_n_rows_inserted_old = srv_n_rows_inserted; + srv_n_rows_updated_old = srv_n_rows_updated; + srv_n_rows_deleted_old = srv_n_rows_deleted; + srv_n_rows_read_old = srv_n_rows_read; + + lock_print_info_summary(file); + if (trx_start) { + long t = ftell(file); + if (t < 0) { + *trx_start = ULINT_UNDEFINED; + } else { + *trx_start = (ulint) t; + } + } + lock_print_info_all_transactions(file); + if (trx_end) { + long t = ftell(file); + if (t < 0) { + *trx_end = ULINT_UNDEFINED; + } else { + *trx_end = (ulint) t; + } + } + + fputs("----------------------------\n" + "END OF INNODB MONITOR OUTPUT\n" + "============================\n", file); + mutex_exit(&srv_innodb_monitor_mutex); + fflush(file); +} + +/********************************************************************** +Function to pass InnoDB status variables to MySQL */ +UNIV_INTERN +void +srv_export_innodb_status(void) +{ + mutex_enter(&srv_innodb_monitor_mutex); + + export_vars.innodb_data_pending_reads + = os_n_pending_reads; + export_vars.innodb_data_pending_writes + = os_n_pending_writes; + export_vars.innodb_data_pending_fsyncs + = fil_n_pending_log_flushes + + fil_n_pending_tablespace_flushes; + export_vars.innodb_data_fsyncs = os_n_fsyncs; + export_vars.innodb_data_read = srv_data_read; + export_vars.innodb_data_reads = os_n_file_reads; + export_vars.innodb_data_writes = os_n_file_writes; + export_vars.innodb_data_written = srv_data_written; + export_vars.innodb_buffer_pool_read_requests = buf_pool->n_page_gets; + export_vars.innodb_buffer_pool_write_requests + = srv_buf_pool_write_requests; + export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free; + export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed; + export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads; + 
export_vars.innodb_buffer_pool_read_ahead_rnd = srv_read_ahead_rnd; + export_vars.innodb_buffer_pool_read_ahead_seq = srv_read_ahead_seq; + export_vars.innodb_buffer_pool_pages_data + = UT_LIST_GET_LEN(buf_pool->LRU); + export_vars.innodb_buffer_pool_pages_dirty + = UT_LIST_GET_LEN(buf_pool->flush_list); + export_vars.innodb_buffer_pool_pages_free + = UT_LIST_GET_LEN(buf_pool->free); +#ifdef UNIV_DEBUG + export_vars.innodb_buffer_pool_pages_latched + = buf_get_latched_pages_number(); +#endif /* UNIV_DEBUG */ + export_vars.innodb_buffer_pool_pages_total = buf_pool->curr_size; + + export_vars.innodb_buffer_pool_pages_misc = buf_pool->curr_size + - UT_LIST_GET_LEN(buf_pool->LRU) + - UT_LIST_GET_LEN(buf_pool->free); +#ifdef HAVE_GCC_ATOMIC_BUILTINS + export_vars.innodb_have_atomic_builtins = 1; +#else + export_vars.innodb_have_atomic_builtins = 0; +#endif + export_vars.innodb_page_size = UNIV_PAGE_SIZE; + export_vars.innodb_log_waits = srv_log_waits; + export_vars.innodb_os_log_written = srv_os_log_written; + export_vars.innodb_os_log_fsyncs = fil_n_log_flushes; + export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes; + export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes; + export_vars.innodb_log_write_requests = srv_log_write_requests; + export_vars.innodb_log_writes = srv_log_writes; + export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written; + export_vars.innodb_dblwr_writes = srv_dblwr_writes; + export_vars.innodb_pages_created = buf_pool->n_pages_created; + export_vars.innodb_pages_read = buf_pool->n_pages_read; + export_vars.innodb_pages_written = buf_pool->n_pages_written; + export_vars.innodb_row_lock_waits = srv_n_lock_wait_count; + export_vars.innodb_row_lock_current_waits + = srv_n_lock_wait_current_count; + export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000; + if (srv_n_lock_wait_count > 0) { + export_vars.innodb_row_lock_time_avg = (ulint) + (srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count); + } else { + export_vars.innodb_row_lock_time_avg = 0; + } + export_vars.innodb_row_lock_time_max + = srv_n_lock_max_wait_time / 1000; + export_vars.innodb_rows_read = srv_n_rows_read; + export_vars.innodb_rows_inserted = srv_n_rows_inserted; + export_vars.innodb_rows_updated = srv_n_rows_updated; + export_vars.innodb_rows_deleted = srv_n_rows_deleted; + + mutex_exit(&srv_innodb_monitor_mutex); +} + +/************************************************************************* +A thread which wakes up threads whose lock wait may have lasted too long. +This also prints the info output by various InnoDB monitors. 
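+The thread sleeps one second at a time. Roughly every 15 seconds it prints
+the monitor output, if enabled, and on every round it cancels the lock
+request of any waiter that has exceeded its innodb_lock_wait_timeout.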
*/
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_and_monitor_thread(
+/*================================*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ srv_slot_t* slot;
+ double time_elapsed;
+ time_t current_time;
+ time_t last_table_monitor_time;
+ time_t last_tablespace_monitor_time;
+ time_t last_monitor_time;
+ ibool some_waits;
+ double wait_time;
+ ulint i;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Lock timeout thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ UT_NOT_USED(arg);
+ srv_last_monitor_time = time(NULL);
+ last_table_monitor_time = time(NULL);
+ last_tablespace_monitor_time = time(NULL);
+ last_monitor_time = time(NULL);
+loop:
+ srv_lock_timeout_and_monitor_active = TRUE;
+
+ /* When someone is waiting for a lock, we wake up every second
+ and check if a timeout has passed for a lock wait */
+
+ os_thread_sleep(1000000);
+
+ current_time = time(NULL);
+
+ time_elapsed = difftime(current_time, last_monitor_time);
+
+ if (time_elapsed > 15) {
+ last_monitor_time = time(NULL);
+
+ if (srv_print_innodb_monitor) {
+ srv_printf_innodb_monitor(stderr, NULL, NULL);
+ }
+
+ if (srv_innodb_status) {
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+ srv_printf_innodb_monitor(srv_monitor_file, NULL,
+ NULL);
+ os_file_set_eof(srv_monitor_file);
+ mutex_exit(&srv_monitor_file_mutex);
+ }
+
+ if (srv_print_innodb_tablespace_monitor
+ && difftime(current_time,
+ last_tablespace_monitor_time) > 60) {
+ last_tablespace_monitor_time = time(NULL);
+
+ fputs("========================"
+ "========================\n",
+ stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+ "========================"
+ "========================\n",
+ stderr);
+
+ fsp_print(0);
+ fputs("Validating tablespace\n", stderr);
+ fsp_validate(0);
+ fputs("Validation ok\n"
+ "---------------------------------------\n"
+ "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+ "=======================================\n",
+ stderr);
+ }
+
+ if (srv_print_innodb_table_monitor
+ && difftime(current_time, last_table_monitor_time) > 60) {
+
+ last_table_monitor_time = time(NULL);
+
+ fputs("===========================================\n",
+ stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLE MONITOR OUTPUT\n"
+ "===========================================\n",
+ stderr);
+ dict_print();
+
+ fputs("-----------------------------------\n"
+ "END OF INNODB TABLE MONITOR OUTPUT\n"
+ "==================================\n",
+ stderr);
+ }
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ some_waits = FALSE;
+
+ /* Check all slots to see whether a thread is waiting there, and
+ whether it has exceeded the time limit */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_mysql_table + i;
+
+ if (slot->in_use) {
+ trx_t* trx;
+ ulong lock_wait_timeout;
+
+ some_waits = TRUE;
+
+ wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+ trx = thr_get_trx(slot->thr);
+ lock_wait_timeout = thd_lock_wait_timeout(
+ trx->mysql_thd);
+
+ if (lock_wait_timeout < 100000000
+ && (wait_time > (double) lock_wait_timeout
+ || wait_time < 0)) {
+
+ /* Timeout exceeded or a wrap-around in system
+ time counter: cancel the lock request queued
+ by the transaction and release possible
+ other transactions waiting behind; it is
+ possible that the lock has already been
+ granted: in that case do nothing */
+
+ if (trx->wait_lock) {
+ lock_cancel_waiting_and_release(
trx->wait_lock);
+ }
+ }
+ }
+ }
+
+ os_event_reset(srv_lock_timeout_thread_event);
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ if (some_waits || srv_print_innodb_monitor
+ || srv_print_innodb_lock_monitor
+ || srv_print_innodb_tablespace_monitor
+ || srv_print_innodb_table_monitor) {
+ goto loop;
+ }
+
+ /* No one was waiting for a lock and no monitor was active:
+ suspend this thread */
+
+ srv_lock_timeout_and_monitor_active = FALSE;
+
+#if 0
+ /* The following synchronisation is disabled, since
+ the InnoDB monitor output is to be updated every 15 seconds. */
+ os_event_wait(srv_lock_timeout_thread_event);
+#endif
+ goto loop;
+
+exit_func:
+ srv_lock_timeout_and_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*************************************************************************
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs. */
+UNIV_INTERN
+os_thread_ret_t
+srv_error_monitor_thread(
+/*=====================*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ /* number of successive fatal timeouts observed */
+ ulint fatal_cnt = 0;
+ ib_uint64_t old_lsn;
+ ib_uint64_t new_lsn;
+
+ old_lsn = srv_start_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Error monitor thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+loop:
+ srv_error_monitor_active = TRUE;
+
+ /* Try to track a strange bug reported by Harald Fuchs and others,
+ where the lsn seems to decrease at times */
+
+ new_lsn = log_get_lsn();
+
+ if (new_lsn < old_lsn) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: old log sequence number %llu"
+ " was greater\n"
+ "InnoDB: than the new log sequence number %llu!\n"
+ "InnoDB: Please submit a bug report"
+ " to http://bugs.mysql.com\n",
+ old_lsn, new_lsn);
+ }
+
+ old_lsn = new_lsn;
+
+ if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+ /* We refresh InnoDB Monitor values so that the averages are
+ printed from at most the last 60 seconds */
+
+ srv_refresh_innodb_monitor_stats();
+ }
+
+ /* Update the statistics collected for deciding LRU
+ eviction policy. */
+ buf_LRU_stat_update();
+
+ /* In case mutex_exit is not a memory barrier, it is
+ theoretically possible some threads are left waiting though
+ the semaphore is already released. Wake up those threads: */
+
+ sync_arr_wake_threads_if_sema_free();
+
+ if (sync_array_print_long_waits()) {
+ fatal_cnt++;
+ if (fatal_cnt > 10) {
+
+ fprintf(stderr,
+ "InnoDB: Error: semaphore wait has lasted"
+ " > %lu seconds\n"
+ "InnoDB: We intentionally crash the server,"
+ " because it appears to be hung.\n",
+ (ulong) srv_fatal_semaphore_wait_threshold);
+
+ ut_error;
+ }
+ } else {
+ fatal_cnt = 0;
+ }
+
+ /* Flush stderr so that a database user gets the output
+ into a possible MySQL error file */
+
+ fflush(stderr);
+
+ os_thread_sleep(1000000);
+
+ if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
+
+ goto loop;
+ }
+
+ srv_error_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit.
*/
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/***********************************************************************
+Tells the InnoDB server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+ srv_activity_count++;
+
+ if (srv_n_threads_active[SRV_MASTER] == 0) {
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_MASTER, 1);
+
+ mutex_exit(&kernel_mutex);
+ }
+}
+
+/***********************************************************************
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+ srv_activity_count++;
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_MASTER, 1);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*************************************************************************
+The master thread controlling the server. */
+UNIV_INTERN
+os_thread_ret_t
+srv_master_thread(
+/*==============*/
+ /* out: a dummy parameter */
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ os_event_t event;
+ time_t last_flush_time;
+ time_t current_time;
+ ulint old_activity_count;
+ ulint n_pages_purged = 0;
+ ulint n_bytes_merged;
+ ulint n_pages_flushed;
+ ulint n_bytes_archived;
+ ulint n_tables_to_drop;
+ ulint n_ios;
+ ulint n_ios_old;
+ ulint n_ios_very_old;
+ ulint n_pend_ios;
+ ibool skip_sleep = FALSE;
+ ulint i;
+
+ ib_uint64_t oldest_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ srv_main_thread_process_no = os_proc_get_number();
+ srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+ srv_table_reserve_slot(SRV_MASTER);
+
+ mutex_enter(&kernel_mutex);
+
+ srv_n_threads_active[SRV_MASTER]++;
+
+ mutex_exit(&kernel_mutex);
+
+loop:
+ /*****************************************************************/
+ /* ---- When there is database activity by users, we cycle in this
+ loop */
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ n_ios_very_old = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+ mutex_enter(&kernel_mutex);
+
+ /* Store the user activity counter at the start of this loop */
+ old_activity_count = srv_activity_count;
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+
+ goto suspend_thread;
+ }
+
+ /* ---- We run the following loop approximately once per second
+ when there is database activity */
+
+ skip_sleep = FALSE;
+
+ for (i = 0; i < 10; i++) {
+ n_ios_old = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+ srv_main_thread_op_info = "sleeping";
+
+ if (!skip_sleep) {
+
+ os_thread_sleep(1000000);
+ }
+
+ skip_sleep = FALSE;
+
+ /* ALTER TABLE in MySQL requires on Unix that the table handler
+ can drop tables lazily after there are no longer SELECT
+ queries to them.
+		*/
+
+		srv_main_thread_op_info = "doing background drop tables";
+
+		row_drop_tables_for_mysql_in_background();
+
+		srv_main_thread_op_info = "";
+
+		if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+			goto background_loop;
+		}
+
+		/* We flush the log once in a second even if no commit is
+		issued, or if we have specified in my.cnf no flush at
+		transaction commit */
+
+		srv_main_thread_op_info = "flushing log";
+		log_buffer_flush_to_disk();
+
+		srv_main_thread_op_info = "making checkpoint";
+		log_free_check();
+
+		/* If there were fewer than PCT_IO(5) i/os during the
+		one second sleep, we assume that there is free
+		disk i/o capacity available, and it makes sense to
+		do an insert buffer merge. */
+
+		n_pend_ios = buf_get_n_pending_ios()
+			+ log_sys->n_pending_writes;
+		n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+			+ buf_pool->n_pages_written;
+		if (n_pend_ios < 3 && (n_ios - n_ios_old < PCT_IO(5))) {
+			srv_main_thread_op_info = "doing insert buffer merge";
+			ibuf_contract_for_n_pages(
+				TRUE, PCT_IBUF_IO(
+					(srv_insert_buffer_batch_size / 4)));
+
+			srv_main_thread_op_info = "flushing log";
+
+			log_buffer_flush_to_disk();
+		}
+
+		if (UNIV_UNLIKELY(buf_get_modified_ratio_pct()
+				  > srv_max_buf_pool_modified_pct)) {
+
+			/* Try to keep the number of modified pages in the
+			buffer pool under the limit wished by the user */
+
+			n_pages_flushed = buf_flush_batch(
+				BUF_FLUSH_LIST, PCT_IO(100),
+				IB_ULONGLONG_MAX);
+
+			/* If we had to do the flush, it may have taken
+			even more than 1 second, and also, there may be more
+			to flush. Do not sleep 1 second during the next
+			iteration of this loop. */
+
+			skip_sleep = TRUE;
+		} else if (srv_adaptive_checkpoint) {
+
+			/* Try to keep the modified age below the
+			max_checkpoint_age * 7/8 line */
+
+			mutex_enter(&(log_sys->mutex));
+
+			oldest_lsn = buf_pool_get_oldest_modification();
+			if (oldest_lsn == 0) {
+
+				mutex_exit(&(log_sys->mutex));
+
+			} else {
+				if ((log_sys->lsn - oldest_lsn)
+				    > (log_sys->max_checkpoint_age)
+				    - ((log_sys->max_checkpoint_age) / 8)) {
+					/* LOG_POOL_PREFLUSH_RATIO_ASYNC is
+					exceeded. */
+					/* We should not flush from here. */
+					mutex_exit(&(log_sys->mutex));
+				} else if ((log_sys->lsn - oldest_lsn)
+					   > (log_sys->max_checkpoint_age)
+					   - ((log_sys->max_checkpoint_age)
+					      / 4)) {
+
+					/* 2nd defence line
+					(max_checkpoint_age * 3/4) */
+
+					mutex_exit(&(log_sys->mutex));
+
+					n_pages_flushed = buf_flush_batch(
+						BUF_FLUSH_LIST, PCT_IO(100),
+						IB_ULONGLONG_MAX);
+					skip_sleep = TRUE;
+				} else if ((log_sys->lsn - oldest_lsn)
+					   > (log_sys->max_checkpoint_age)
+					   / 2) {
+
+					/* 1st defence line
+					(max_checkpoint_age * 1/2) */
+
+					mutex_exit(&(log_sys->mutex));
+
+					n_pages_flushed = buf_flush_batch(
+						BUF_FLUSH_LIST, PCT_IO(10),
+						IB_ULONGLONG_MAX);
+					skip_sleep = TRUE;
+				} else {
+					mutex_exit(&(log_sys->mutex));
+				}
+			}
+
+		}
+
+		if (srv_activity_count == old_activity_count) {
+
+			/* There is no user activity at the moment, go to
+			the background loop */
+
+			goto background_loop;
+		}
+	}
+
+	/* ---- We perform the following code approximately once per
+	10 seconds when there is database activity */
+
+#ifdef MEM_PERIODIC_CHECK
+	/* Check magic numbers of every allocated mem block once in 10
+	seconds */
+	mem_validate_all_blocks();
+#endif
+	/* If there were fewer than PCT_IO(200) i/os during the 10 second
+	period, we assume that there is free disk i/o capacity available,
+	and it makes sense to flush PCT_IO(100) pages.
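+	Assuming PCT_IO(p) evaluates to p per cent of srv_io_capacity
+	(the innodb_io_capacity setting, default 200), these thresholds
+	work out, for example, to:
+
+		PCT_IO(5)   =  10 i/os   per-second ibuf merge threshold
+		PCT_IO(100) = 200 pages  a full flush batch
+		PCT_IO(200) = 400 i/os   the ten second threshold here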
*/ + + n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes; + n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + + buf_pool->n_pages_written; + if (n_pend_ios < 3 && (n_ios - n_ios_very_old < PCT_IO(200))) { + + srv_main_thread_op_info = "flushing buffer pool pages"; + buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), IB_ULONGLONG_MAX); + + srv_main_thread_op_info = "flushing log"; + log_buffer_flush_to_disk(); + } + + /* We run a batch of insert buffer merge every 10 seconds, + even if the server were active */ + + srv_main_thread_op_info = "doing insert buffer merge"; + ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO((srv_insert_buffer_batch_size / 4))); + + srv_main_thread_op_info = "flushing log"; + log_buffer_flush_to_disk(); + + /* We run a full purge every 10 seconds, even if the server + were active */ + + last_flush_time = time(NULL); + + do { + + if (srv_fast_shutdown && srv_shutdown_state > 0) { + + goto background_loop; + } + + srv_main_thread_op_info = "purging"; + n_pages_purged = trx_purge(); + + current_time = time(NULL); + + if (difftime(current_time, last_flush_time) > 1) { + srv_main_thread_op_info = "flushing log"; + + log_buffer_flush_to_disk(); + last_flush_time = current_time; + } + } while (n_pages_purged); + + srv_main_thread_op_info = "flushing buffer pool pages"; + + /* Flush a few oldest pages to make a new checkpoint younger */ + + if (buf_get_modified_ratio_pct() > 70) { + + /* If there are lots of modified pages in the buffer pool + (> 70 %), we assume we can afford reserving the disk(s) for + the time it requires to flush 100 pages */ + + n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), + IB_ULONGLONG_MAX); + } else { + /* Otherwise, we only flush a small number of pages so that + we do not unnecessarily use much disk i/o capacity from + other work */ + + n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10), + IB_ULONGLONG_MAX); + } + + srv_main_thread_op_info = "making checkpoint"; + + /* Make a new checkpoint about once in 10 seconds */ + + log_checkpoint(TRUE, FALSE); + + srv_main_thread_op_info = "reserving kernel mutex"; + + mutex_enter(&kernel_mutex); + + /* ---- When there is database activity, we jump from here back to + the start of loop */ + + if (srv_activity_count != old_activity_count) { + mutex_exit(&kernel_mutex); + goto loop; + } + + mutex_exit(&kernel_mutex); + + /* If the database is quiet, we enter the background loop */ + + /*****************************************************************/ +background_loop: + /* ---- In this loop we run background operations when the server + is quiet from user activity. Also in the case of a shutdown, we + loop here, flushing the buffer pool to the data files. */ + + /* The server has been quiet for a while: start running background + operations */ + + srv_main_thread_op_info = "doing background drop tables"; + + n_tables_to_drop = row_drop_tables_for_mysql_in_background(); + + if (n_tables_to_drop > 0) { + /* Do not monopolize the CPU even if there are tables waiting + in the background drop queue. (It is essentially a bug if + MySQL tries to drop a table while there are still open handles + to it and we had to put it to the background drop queue.) 
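+	Note that os_thread_sleep() takes its argument in microseconds,
+	so the os_thread_sleep(100000) below pauses for 0.1 seconds
+	between retries.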
*/ + + os_thread_sleep(100000); + } + + srv_main_thread_op_info = "purging"; + + /* Run a full purge */ + + last_flush_time = time(NULL); + + do { + if (srv_fast_shutdown && srv_shutdown_state > 0) { + + break; + } + + srv_main_thread_op_info = "purging"; + n_pages_purged = trx_purge(); + + current_time = time(NULL); + + if (difftime(current_time, last_flush_time) > 1) { + srv_main_thread_op_info = "flushing log"; + + log_buffer_flush_to_disk(); + last_flush_time = current_time; + } + } while (n_pages_purged); + + srv_main_thread_op_info = "reserving kernel mutex"; + + mutex_enter(&kernel_mutex); + if (srv_activity_count != old_activity_count) { + mutex_exit(&kernel_mutex); + goto loop; + } + mutex_exit(&kernel_mutex); + + srv_main_thread_op_info = "doing insert buffer merge"; + + if (srv_fast_shutdown && srv_shutdown_state > 0) { + n_bytes_merged = 0; + } else { + n_bytes_merged = ibuf_contract_for_n_pages( + TRUE, PCT_IBUF_IO((srv_insert_buffer_batch_size * 5))); + } + + srv_main_thread_op_info = "reserving kernel mutex"; + + mutex_enter(&kernel_mutex); + if (srv_activity_count != old_activity_count) { + mutex_exit(&kernel_mutex); + goto loop; + } + mutex_exit(&kernel_mutex); + +flush_loop: + srv_main_thread_op_info = "flushing buffer pool pages"; + + if (srv_fast_shutdown < 2) { + n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), + IB_ULONGLONG_MAX); + } else { + /* In the fastest shutdown we do not flush the buffer pool + to data files: we set n_pages_flushed to 0 artificially. */ + + n_pages_flushed = 0; + } + + srv_main_thread_op_info = "reserving kernel mutex"; + + mutex_enter(&kernel_mutex); + if (srv_activity_count != old_activity_count) { + mutex_exit(&kernel_mutex); + goto loop; + } + mutex_exit(&kernel_mutex); + + srv_main_thread_op_info = "waiting for buffer pool flush to end"; + buf_flush_wait_batch_end(BUF_FLUSH_LIST); + + srv_main_thread_op_info = "flushing log"; + + log_buffer_flush_to_disk(); + + srv_main_thread_op_info = "making checkpoint"; + + log_checkpoint(TRUE, FALSE); + + if (buf_get_modified_ratio_pct() > srv_max_buf_pool_modified_pct) { + + /* Try to keep the number of modified pages in the + buffer pool under the limit wished by the user */ + + goto flush_loop; + } + + srv_main_thread_op_info = "reserving kernel mutex"; + + mutex_enter(&kernel_mutex); + if (srv_activity_count != old_activity_count) { + mutex_exit(&kernel_mutex); + goto loop; + } + mutex_exit(&kernel_mutex); + /* + srv_main_thread_op_info = "archiving log (if log archive is on)"; + + log_archive_do(FALSE, &n_bytes_archived); + */ + n_bytes_archived = 0; + + /* Keep looping in the background loop if still work to do */ + + if (srv_fast_shutdown && srv_shutdown_state > 0) { + if (n_tables_to_drop + n_pages_flushed + + n_bytes_archived != 0) { + + /* If we are doing a fast shutdown (= the default) + we do not do purge or insert buffer merge. But we + flush the buffer pool completely to disk. + In a 'very fast' shutdown we do not flush the buffer + pool to data files: we have set n_pages_flushed to + 0 artificially. 
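+	As a summary of the srv_fast_shutdown (innodb_fast_shutdown)
+	settings used here and below:
+
+		0  'slow'       run purge and insert buffer merge to
+		                completion before shutting down
+		1  'fast'       the default: skip purge and ibuf merge,
+		                but flush the buffer pool to disk
+		2  'very fast'  only flush the log; the next startup
+		                performs a crash recovery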
*/ + + goto background_loop; + } + } else if (n_tables_to_drop + + n_pages_purged + n_bytes_merged + n_pages_flushed + + n_bytes_archived != 0) { + /* In a 'slow' shutdown we run purge and the insert buffer + merge to completion */ + + goto background_loop; + } + + /* There is no work for background operations either: suspend + master thread to wait for more server activity */ + +suspend_thread: + srv_main_thread_op_info = "suspending"; + + mutex_enter(&kernel_mutex); + + if (row_get_background_drop_list_len_low() > 0) { + mutex_exit(&kernel_mutex); + + goto loop; + } + + event = srv_suspend_thread(); + + mutex_exit(&kernel_mutex); + + /* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql() + waits for database activity to die down when converting < 4.1.x + databases, and relies on this string being exactly as it is. InnoDB + manual also mentions this string in several places. */ + srv_main_thread_op_info = "waiting for server activity"; + + os_event_wait(event); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + /* This is only extra safety, the thread should exit + already when the event wait ends */ + + os_thread_exit(NULL); + } + + /* When there is user activity, InnoDB will set the event and the + main thread goes back to loop. */ + + goto loop; + + OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */ +} +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c new file mode 100644 index 00000000000..2933e020b62 --- /dev/null +++ b/storage/xtradb/srv/srv0start.c @@ -0,0 +1,2052 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +Starts the InnoDB database server + +Created 2/16/1996 Heikki Tuuri +*************************************************************************/ + +#include "os0proc.h" +#include "sync0sync.h" +#include "ut0mem.h" +#include "mem0mem.h" +#include "data0data.h" +#include "data0type.h" +#include "dict0dict.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "os0file.h" +#include "os0thread.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "mtr0mtr.h" +#include "log0log.h" +#include "log0recv.h" +#include "page0page.h" +#include "page0cur.h" +#include "trx0trx.h" +#include "dict0boot.h" +#include "dict0load.h" +#include "trx0sys.h" +#include "dict0crea.h" +#include "btr0btr.h" +#include "btr0pcur.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "rem0rec.h" +#include "srv0srv.h" +#include "que0que.h" +#include "usr0sess.h" +#include "lock0lock.h" +#include "trx0roll.h" +#include "trx0purge.h" +#include "row0ins.h" +#include "row0sel.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0mysql.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "pars0pars.h" +#include "btr0sea.h" +#include "srv0start.h" +#include "que0que.h" + +/* Log sequence number immediately after startup */ +UNIV_INTERN ib_uint64_t srv_start_lsn; +/* Log sequence number at shutdown */ +UNIV_INTERN ib_uint64_t srv_shutdown_lsn; + +#ifdef HAVE_DARWIN_THREADS +# include +UNIV_INTERN ibool srv_have_fullfsync = FALSE; +#endif + +UNIV_INTERN ibool srv_start_raw_disk_in_use = FALSE; + +UNIV_INTERN ibool srv_startup_is_before_trx_rollback_phase = FALSE; +UNIV_INTERN ibool srv_is_being_started = FALSE; +UNIV_INTERN ibool srv_was_started = FALSE; +#ifndef UNIV_HOTBACKUP +static ibool srv_start_has_been_called = FALSE; +#endif /* !UNIV_HOTBACKUP */ + +/* At a shutdown the value first climbs to SRV_SHUTDOWN_CLEANUP +and then to SRV_SHUTDOWN_LAST_PHASE */ +UNIV_INTERN ulint srv_shutdown_state = 0; + +#ifndef UNIV_HOTBACKUP +static os_file_t files[1000]; + +static mutex_t ios_mutex; +static ulint ios; + +static ulint n[SRV_MAX_N_IO_THREADS + 5]; +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 5]; + +/* We use this mutex to test the return value of pthread_mutex_trylock + on successful locking. HP-UX does NOT return 0, though Linux et al do. 
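+POSIX specifies a return value of 0 for a successful trylock, and
+innobase_start_or_create_for_mysql() verifies this at startup,
+essentially as follows (a sketch of the check further below in this
+file):
+
+	os_fast_mutex_init(&srv_os_test_mutex);
+	if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
+		... print an error and refuse to start ...
+	}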
*/ +static os_fast_mutex_t srv_os_test_mutex; + +/* Name of srv_monitor_file */ +static char* srv_monitor_file_name; +#endif /* !UNIV_HOTBACKUP */ + +#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD +#define SRV_MAX_N_PENDING_SYNC_IOS 100 + + +/* Avoid warnings when using purify */ + +#ifdef HAVE_valgrind +static int inno_bcmp(register const char *s1, register const char *s2, + register uint len) +{ + while ((len-- != 0) && (*s1++ == *s2++)) + ; + + return(len + 1); +} +#define memcmp(A,B,C) inno_bcmp((A),(B),(C)) +#endif + +static +char* +srv_parse_megabytes( +/*================*/ + /* out: next character in string */ + char* str, /* in: string containing a quantity in bytes */ + ulint* megs) /* out: the number in megabytes */ +{ + char* endp; + ulint size; + + size = strtoul(str, &endp, 10); + + str = endp; + + switch (*str) { + case 'G': case 'g': + size *= 1024; + /* fall through */ + case 'M': case 'm': + str++; + break; + default: + size /= 1024 * 1024; + break; + } + + *megs = size; + return(str); +} + +/************************************************************************* +Reads the data files and their sizes from a character string given in +the .cnf file. */ +UNIV_INTERN +ibool +srv_parse_data_file_paths_and_sizes( +/*================================*/ + /* out: TRUE if ok, FALSE on parse error */ + char* str) /* in/out: the data file path string */ +{ + char* input_str; + char* path; + ulint size; + ulint i = 0; + + srv_auto_extend_last_data_file = FALSE; + srv_last_file_size_max = 0; + srv_data_file_names = NULL; + srv_data_file_sizes = NULL; + srv_data_file_is_raw_partition = NULL; + + input_str = str; + + /* First calculate the number of data files and check syntax: + path:size[M | G];path:size[M | G]... . Note that a Windows path may + contain a drive name and a ':'. 
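+	For example (hypothetical my.cnf values), the setting
+
+		innodb_data_file_path = ibdata1:1G;ibdata2:1G:autoextend:max:2G
+
+	defines two data files: ibdata1 fixed at 1 GB, and ibdata2
+	starting at 1 GB and auto-extending up to 2 GB. A Windows raw
+	partition may be given as \\.\C::1Gnewraw or
+	\\.\PHYSICALDRIVE2:1Gnewraw.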
*/ + + while (*str != '\0') { + path = str; + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == '\0') { + return(FALSE); + } + + str++; + + str = srv_parse_megabytes(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = srv_parse_megabytes(str, &size); + } + + if (*str != '\0') { + + return(FALSE); + } + } + + if (strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + str += 3; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + str += 3; + } + + if (size == 0) { + return(FALSE); + } + + i++; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + + return(FALSE); + } + } + + if (i == 0) { + /* If innodb_data_file_path was defined it must contain + at least one data file definition */ + + return(FALSE); + } + + srv_data_file_names = malloc(i * sizeof *srv_data_file_names); + srv_data_file_sizes = malloc(i * sizeof *srv_data_file_sizes); + srv_data_file_is_raw_partition = malloc( + i * sizeof *srv_data_file_is_raw_partition); + + srv_n_data_files = i; + + /* Then store the actual values to our arrays */ + + str = input_str; + i = 0; + + while (*str != '\0') { + path = str; + + /* Note that we must step over the ':' in a Windows path; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == ':') { + /* Make path a null-terminated string */ + *str = '\0'; + str++; + } + + str = srv_parse_megabytes(str, &size); + + srv_data_file_names[i] = path; + srv_data_file_sizes[i] = size; + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + srv_auto_extend_last_data_file = TRUE; + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = srv_parse_megabytes( + str, &srv_last_file_size_max); + } + + if (*str != '\0') { + + return(FALSE); + } + } + + (srv_data_file_is_raw_partition)[i] = 0; + + if (strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + str += 3; + (srv_data_file_is_raw_partition)[i] = SRV_NEW_RAW; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + str += 3; + + if ((srv_data_file_is_raw_partition)[i] == 0) { + (srv_data_file_is_raw_partition)[i] = SRV_OLD_RAW; + } + } + + i++; + + if (*str == ';') { + str++; + } + } + + return(TRUE); +} + +/************************************************************************* +Reads log group home directories from a character string given in +the .cnf file. */ +UNIV_INTERN +ibool +srv_parse_log_group_home_dirs( +/*==========================*/ + /* out: TRUE if ok, FALSE on parse error */ + char* str) /* in/out: character string */ +{ + char* input_str; + char* path; + ulint i = 0; + + srv_log_group_home_dirs = NULL; + + input_str = str; + + /* First calculate the number of directories and check syntax: + path;path;... 
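+	For example, innodb_log_group_home_dir = /var/lib/mysql (a
+	hypothetical value) places ib_logfile0, ib_logfile1, ... in that
+	directory. As checked below, exactly one directory may be given
+	under the current MySQL.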
*/ + + while (*str != '\0') { + path = str; + + while (*str != ';' && *str != '\0') { + str++; + } + + i++; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + + return(FALSE); + } + } + + if (i != 1) { + /* If innodb_log_group_home_dir was defined it must + contain exactly one path definition under current MySQL */ + + return(FALSE); + } + + srv_log_group_home_dirs = malloc(i * sizeof *srv_log_group_home_dirs); + + /* Then store the actual values to our array */ + + str = input_str; + i = 0; + + while (*str != '\0') { + path = str; + + while (*str != ';' && *str != '\0') { + str++; + } + + if (*str == ';') { + *str = '\0'; + str++; + } + + srv_log_group_home_dirs[i] = path; + + i++; + } + + return(TRUE); +} + +/************************************************************************* +Frees the memory allocated by srv_parse_data_file_paths_and_sizes() +and srv_parse_log_group_home_dirs(). */ +UNIV_INTERN +void +srv_free_paths_and_sizes(void) +/*==========================*/ +{ + free(srv_data_file_names); + srv_data_file_names = NULL; + free(srv_data_file_sizes); + srv_data_file_sizes = NULL; + free(srv_data_file_is_raw_partition); + srv_data_file_is_raw_partition = NULL; + free(srv_log_group_home_dirs); + srv_log_group_home_dirs = NULL; +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************ +I/o-handler thread function. */ +static + +os_thread_ret_t +io_handler_thread( +/*==============*/ + void* arg) +{ + ulint segment; + ulint i; + + segment = *((ulint*)arg); + +#ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment, + os_thread_pf(os_thread_get_curr_id())); +#endif + for (i = 0;; i++) { + fil_aio_wait(segment); + + mutex_enter(&ios_mutex); + ios++; + mutex_exit(&ios_mutex); + } + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. + The thread actually never comes here because it is exited in an + os_event_wait(). */ + + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} +#endif /* !UNIV_HOTBACKUP */ + +#ifdef __WIN__ +#define SRV_PATH_SEPARATOR '\\' +#else +#define SRV_PATH_SEPARATOR '/' +#endif + +/************************************************************************* +Normalizes a directory path for Windows: converts slashes to backslashes. */ +UNIV_INTERN +void +srv_normalize_path_for_win( +/*=======================*/ + char* str __attribute__((unused))) /* in/out: null-terminated + character string */ +{ +#ifdef __WIN__ + for (; *str; str++) { + + if (*str == '/') { + *str = '\\'; + } + } +#endif +} + +/************************************************************************* +Adds a slash or a backslash to the end of a string if it is missing +and the string is not empty. */ +UNIV_INTERN +char* +srv_add_path_separator_if_needed( +/*=============================*/ + /* out: string which has the separator if the + string is not empty */ + char* str) /* in: null-terminated character string */ +{ + char* out_str; + ulint len = ut_strlen(str); + + if (len == 0 || str[len - 1] == SRV_PATH_SEPARATOR) { + + return(str); + } + + out_str = ut_malloc(len + 2); + memcpy(out_str, str, len); + out_str[len] = SRV_PATH_SEPARATOR; + out_str[len + 1] = 0; + + return(out_str); +} + +#ifndef UNIV_HOTBACKUP +/************************************************************************* +Calculates the low 32 bits when a file size which is given as a number +database pages is converted to the number of bytes. 
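+For example, with the usual 16 kB page size (UNIV_PAGE_SIZE_SHIFT = 14),
+a file of 524288 pages is 8589934592 bytes = 2^33, so srv_calc_low32()
+returns 0 and srv_calc_high32() returns 2.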
*/ +static +ulint +srv_calc_low32( +/*===========*/ + /* out: low 32 bytes of file size when + expressed in bytes */ + ulint file_size) /* in: file size in database pages */ +{ + return(0xFFFFFFFFUL & (file_size << UNIV_PAGE_SIZE_SHIFT)); +} + +/************************************************************************* +Calculates the high 32 bits when a file size which is given as a number +database pages is converted to the number of bytes. */ +static +ulint +srv_calc_high32( +/*============*/ + /* out: high 32 bytes of file size when + expressed in bytes */ + ulint file_size) /* in: file size in database pages */ +{ + return(file_size >> (32 - UNIV_PAGE_SIZE_SHIFT)); +} + +/************************************************************************* +Creates or opens the log files and closes them. */ +static +ulint +open_or_create_log_file( +/*====================*/ + /* out: DB_SUCCESS or error code */ + ibool create_new_db, /* in: TRUE if we should create a + new database */ + ibool* log_file_created, /* out: TRUE if new log file + created */ + ibool log_file_has_been_opened,/* in: TRUE if a log file has been + opened before: then it is an error + to try to create another log file */ + ulint k, /* in: log group number */ + ulint i) /* in: log file number in group */ +{ + ibool ret; + ulint size; + ulint size_high; + char name[10000]; + + UT_NOT_USED(create_new_db); + + *log_file_created = FALSE; + + srv_normalize_path_for_win(srv_log_group_home_dirs[k]); + srv_log_group_home_dirs[k] = srv_add_path_separator_if_needed( + srv_log_group_home_dirs[k]); + + ut_a(strlen(srv_log_group_home_dirs[k]) + < (sizeof name) - 10 - sizeof "ib_logfile"); + sprintf(name, "%s%s%lu", srv_log_group_home_dirs[k], + "ib_logfile", (ulong) i); + + files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, + OS_LOG_FILE, &ret); + if (ret == FALSE) { + if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS +#ifdef UNIV_AIX + /* AIX 5.1 after security patch ML7 may have errno set + to 0 here, which causes our function to return 100; + work around that AIX problem */ + && os_file_get_last_error(FALSE) != 100 +#endif + ) { + fprintf(stderr, + "InnoDB: Error in creating" + " or opening %s\n", name); + + return(DB_ERROR); + } + + files[i] = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO, + OS_LOG_FILE, &ret); + if (!ret) { + fprintf(stderr, + "InnoDB: Error in opening %s\n", name); + + return(DB_ERROR); + } + + ret = os_file_get_size(files[i], &size, &size_high); + ut_a(ret); + + if (size != srv_calc_low32(srv_log_file_size) + || size_high != srv_calc_high32(srv_log_file_size)) { + + fprintf(stderr, + "InnoDB: Error: log file %s is" + " of different size %lu %lu bytes\n" + "InnoDB: than specified in the .cnf" + " file %lu %lu bytes!\n", + name, (ulong) size_high, (ulong) size, + (ulong) srv_calc_high32(srv_log_file_size), + (ulong) srv_calc_low32(srv_log_file_size)); + + return(DB_ERROR); + } + } else { + *log_file_created = TRUE; + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Log file %s did not exist:" + " new to be created\n", + name); + if (log_file_has_been_opened) { + + return(DB_ERROR); + } + + fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n", + name, (ulong) srv_log_file_size + >> (20 - UNIV_PAGE_SIZE_SHIFT)); + + fprintf(stderr, + "InnoDB: Database physically writes the file" + " full: wait...\n"); + + ret = os_file_set_size(name, files[i], + srv_calc_low32(srv_log_file_size), + srv_calc_high32(srv_log_file_size)); + if (!ret) { + fprintf(stderr, + "InnoDB: Error in 
creating %s:" + " probably out of disk space\n", + name); + + return(DB_ERROR); + } + } + + ret = os_file_close(files[i]); + ut_a(ret); + + if (i == 0) { + /* Create in memory the file space object + which is for this log group */ + + fil_space_create(name, + 2 * k + SRV_LOG_SPACE_FIRST_ID, 0, FIL_LOG); + } + + ut_a(fil_validate()); + + fil_node_create(name, srv_log_file_size, + 2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE); +#ifdef UNIV_LOG_ARCHIVE + /* If this is the first log group, create the file space object + for archived logs. + Under MySQL, no archiving ever done. */ + + if (k == 0 && i == 0) { + arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID; + + fil_space_create("arch_log_space", arch_space_id, 0, FIL_LOG); + } else { + arch_space_id = ULINT_UNDEFINED; + } +#endif /* UNIV_LOG_ARCHIVE */ + if (i == 0) { + log_group_init(k, srv_n_log_files, + srv_log_file_size * UNIV_PAGE_SIZE, + 2 * k + SRV_LOG_SPACE_FIRST_ID, + SRV_LOG_SPACE_FIRST_ID + 1); /* dummy arch + space id */ + } + + return(DB_SUCCESS); +} + +/************************************************************************* +Creates or opens database data files and closes them. */ +static +ulint +open_or_create_data_files( +/*======================*/ + /* out: DB_SUCCESS or error code */ + ibool* create_new_db, /* out: TRUE if new database should be + created */ +#ifdef UNIV_LOG_ARCHIVE + ulint* min_arch_log_no,/* out: min of archived log + numbers in data files */ + ulint* max_arch_log_no,/* out: max of archived log + numbers in data files */ +#endif /* UNIV_LOG_ARCHIVE */ + ib_uint64_t* min_flushed_lsn,/* out: min of flushed lsn + values in data files */ + ib_uint64_t* max_flushed_lsn,/* out: max of flushed lsn + values in data files */ + ulint* sum_of_new_sizes)/* out: sum of sizes of the + new files added */ +{ + ibool ret; + ulint i; + ibool one_opened = FALSE; + ibool one_created = FALSE; + ulint size; + ulint size_high; + ulint rounded_size_pages; + char name[10000]; + + if (srv_n_data_files >= 1000) { + fprintf(stderr, "InnoDB: can only have < 1000 data files\n" + "InnoDB: you have defined %lu\n", + (ulong) srv_n_data_files); + return(DB_ERROR); + } + + *sum_of_new_sizes = 0; + + *create_new_db = FALSE; + + srv_normalize_path_for_win(srv_data_home); + srv_data_home = srv_add_path_separator_if_needed(srv_data_home); + + for (i = 0; i < srv_n_data_files; i++) { + srv_normalize_path_for_win(srv_data_file_names[i]); + + ut_a(strlen(srv_data_home) + strlen(srv_data_file_names[i]) + < (sizeof name) - 1); + sprintf(name, "%s%s", srv_data_home, srv_data_file_names[i]); + + if (srv_data_file_is_raw_partition[i] == 0) { + + /* First we try to create the file: if it already + exists, ret will get value FALSE */ + + files[i] = os_file_create(name, OS_FILE_CREATE, + OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + + if (ret == FALSE && os_file_get_last_error(FALSE) + != OS_FILE_ALREADY_EXISTS +#ifdef UNIV_AIX + /* AIX 5.1 after security patch ML7 may have + errno set to 0 here, which causes our function + to return 100; work around that AIX problem */ + && os_file_get_last_error(FALSE) != 100 +#endif + ) { + fprintf(stderr, + "InnoDB: Error in creating" + " or opening %s\n", + name); + + return(DB_ERROR); + } + } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { + /* The partition is opened, not created; then it is + written over */ + + srv_start_raw_disk_in_use = TRUE; + srv_created_new_raw = TRUE; + + files[i] = os_file_create(name, OS_FILE_OPEN_RAW, + OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + if (!ret) { + fprintf(stderr, + 
"InnoDB: Error in opening %s\n", name); + + return(DB_ERROR); + } + } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + srv_start_raw_disk_in_use = TRUE; + + ret = FALSE; + } else { + ut_a(0); + } + + if (ret == FALSE) { + /* We open the data file */ + + if (one_created) { + fprintf(stderr, + "InnoDB: Error: data files can only" + " be added at the end\n"); + fprintf(stderr, + "InnoDB: of a tablespace, but" + " data file %s existed beforehand.\n", + name); + return(DB_ERROR); + } + + if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + files[i] = os_file_create( + name, OS_FILE_OPEN_RAW, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + } else if (i == 0) { + files[i] = os_file_create( + name, OS_FILE_OPEN_RETRY, + OS_FILE_NORMAL, OS_DATA_FILE, &ret); + } else { + files[i] = os_file_create( + name, OS_FILE_OPEN, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + } + + if (!ret) { + fprintf(stderr, + "InnoDB: Error in opening %s\n", name); + os_file_get_last_error(TRUE); + + return(DB_ERROR); + } + + if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + + goto skip_size_check; + } + + ret = os_file_get_size(files[i], &size, &size_high); + ut_a(ret); + /* Round size downward to megabytes */ + + rounded_size_pages + = (size / (1024 * 1024) + 4096 * size_high) + << (20 - UNIV_PAGE_SIZE_SHIFT); + + if (i == srv_n_data_files - 1 + && srv_auto_extend_last_data_file) { + + if (srv_data_file_sizes[i] > rounded_size_pages + || (srv_last_file_size_max > 0 + && srv_last_file_size_max + < rounded_size_pages)) { + + fprintf(stderr, + "InnoDB: Error: auto-extending" + " data file %s is" + " of a different size\n" + "InnoDB: %lu pages (rounded" + " down to MB) than specified" + " in the .cnf file:\n" + "InnoDB: initial %lu pages," + " max %lu (relevant if" + " non-zero) pages!\n", + name, + (ulong) rounded_size_pages, + (ulong) srv_data_file_sizes[i], + (ulong) + srv_last_file_size_max); + + return(DB_ERROR); + } + + srv_data_file_sizes[i] = rounded_size_pages; + } + + if (rounded_size_pages != srv_data_file_sizes[i]) { + + fprintf(stderr, + "InnoDB: Error: data file %s" + " is of a different size\n" + "InnoDB: %lu pages" + " (rounded down to MB)\n" + "InnoDB: than specified" + " in the .cnf file %lu pages!\n", + name, + (ulong) rounded_size_pages, + (ulong) srv_data_file_sizes[i]); + + return(DB_ERROR); + } +skip_size_check: + fil_read_flushed_lsn_and_arch_log_no( + files[i], one_opened, +#ifdef UNIV_LOG_ARCHIVE + min_arch_log_no, max_arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + min_flushed_lsn, max_flushed_lsn); + one_opened = TRUE; + } else { + /* We created the data file and now write it full of + zeros */ + + one_created = TRUE; + + if (i > 0) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Data file %s did not" + " exist: new to be created\n", + name); + } else { + fprintf(stderr, + "InnoDB: The first specified" + " data file %s did not exist:\n" + "InnoDB: a new database" + " to be created!\n", name); + *create_new_db = TRUE; + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Setting file %s size to %lu MB\n", + name, + (ulong) (srv_data_file_sizes[i] + >> (20 - UNIV_PAGE_SIZE_SHIFT))); + + fprintf(stderr, + "InnoDB: Database physically writes the" + " file full: wait...\n"); + + ret = os_file_set_size( + name, files[i], + srv_calc_low32(srv_data_file_sizes[i]), + srv_calc_high32(srv_data_file_sizes[i])); + + if (!ret) { + fprintf(stderr, + "InnoDB: Error in creating %s:" + " probably out of disk space\n", name); + + return(DB_ERROR); + } + + *sum_of_new_sizes = 
*sum_of_new_sizes + + srv_data_file_sizes[i]; + } + + ret = os_file_close(files[i]); + ut_a(ret); + + if (i == 0) { + fil_space_create(name, 0, 0, FIL_TABLESPACE); + } + + ut_a(fil_validate()); + + fil_node_create(name, srv_data_file_sizes[i], 0, + srv_data_file_is_raw_partition[i] != 0); + } + + ios = 0; + + mutex_create(&ios_mutex, SYNC_NO_ORDER_CHECK); + + return(DB_SUCCESS); +} + +/******************************************************************** +Starts InnoDB and creates a new database if database files +are not found and the user wants. */ +UNIV_INTERN +int +innobase_start_or_create_for_mysql(void) +/*====================================*/ + /* out: DB_SUCCESS or error code */ +{ + buf_pool_t* ret; + ibool create_new_db; + ibool log_file_created; + ibool log_created = FALSE; + ibool log_opened = FALSE; + ib_uint64_t min_flushed_lsn; + ib_uint64_t max_flushed_lsn; +#ifdef UNIV_LOG_ARCHIVE + ulint min_arch_log_no; + ulint max_arch_log_no; +#endif /* UNIV_LOG_ARCHIVE */ + ulint sum_of_new_sizes; + ulint sum_of_data_file_sizes; + ulint tablespace_size_in_header; + ulint err; + ulint i; + my_bool srv_file_per_table_original_value + = srv_file_per_table; + mtr_t mtr; +#ifdef HAVE_DARWIN_THREADS +# ifdef F_FULLFSYNC + /* This executable has been compiled on Mac OS X 10.3 or later. + Assume that F_FULLFSYNC is available at run-time. */ + srv_have_fullfsync = TRUE; +# else /* F_FULLFSYNC */ + /* This executable has been compiled on Mac OS X 10.2 + or earlier. Determine if the executable is running + on Mac OS X 10.3 or later. */ + struct utsname utsname; + if (uname(&utsname)) { + fputs("InnoDB: cannot determine Mac OS X version!\n", stderr); + } else { + srv_have_fullfsync = strcmp(utsname.release, "7.") >= 0; + } + if (!srv_have_fullfsync) { + fputs("InnoDB: On Mac OS X, fsync() may be" + " broken on internal drives,\n" + "InnoDB: making transactions unsafe!\n", stderr); + } +# endif /* F_FULLFSYNC */ +#endif /* HAVE_DARWIN_THREADS */ + + if (sizeof(ulint) != sizeof(void*)) { + fprintf(stderr, + "InnoDB: Error: size of InnoDB's ulint is %lu," + " but size of void* is %lu.\n" + "InnoDB: The sizes should be the same" + " so that on a 64-bit platform you can\n" + "InnoDB: allocate more than 4 GB of memory.", + (ulong)sizeof(ulint), (ulong)sizeof(void*)); + } + + /* System tables are created in tablespace 0. Thus, we must + temporarily clear srv_file_per_table. This is ok, because the + server will not accept connections (which could modify + innodb_file_per_table) until this function has returned. */ + srv_file_per_table = FALSE; +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n"); +#endif + +#ifdef UNIV_IBUF_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n" + "InnoDB: Crash recovery will fail with UNIV_IBUF_DEBUG\n"); +#endif + +#ifdef UNIV_SYNC_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n"); +#endif + +#ifdef UNIV_SEARCH_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n"); +#endif + +#ifdef UNIV_MEM_DEBUG + fprintf(stderr, + "InnoDB: !!!!!!!! 
UNIV_MEM_DEBUG switched on !!!!!!!!!\n"); +#endif + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + fprintf(stderr, + "InnoDB: The InnoDB memory heap is disabled\n"); + } + +#ifdef HAVE_GCC_ATOMIC_BUILTINS +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + fprintf(stderr, + "InnoDB: Mutexes and rw_locks use GCC atomic builtins.\n"); +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ + fprintf(stderr, + "InnoDB: Mutexes use GCC atomic builtins, rw_locks do not.\n"); +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ +#else /* HAVE_GCC_ATOMIC_BUILTINS */ + fprintf(stderr, + "InnoDB: Neither mutexes nor rw_locks use GCC atomic builtins.\n"); +#endif /* HAVE_GCC_ATOMIC_BUILTINS */ + + /* Since InnoDB does not currently clean up all its internal data + structures in MySQL Embedded Server Library server_end(), we + print an error message if someone tries to start up InnoDB a + second time during the process lifetime. */ + + if (srv_start_has_been_called) { + fprintf(stderr, + "InnoDB: Error:startup called second time" + " during the process lifetime.\n" + "InnoDB: In the MySQL Embedded Server Library" + " you cannot call server_init()\n" + "InnoDB: more than once during" + " the process lifetime.\n"); + } + + srv_start_has_been_called = TRUE; + +#ifdef UNIV_DEBUG + log_do_write = TRUE; +#endif /* UNIV_DEBUG */ + /* yydebug = TRUE; */ + + srv_is_being_started = TRUE; + srv_startup_is_before_trx_rollback_phase = TRUE; + os_aio_use_native_aio = FALSE; + +#ifdef __WIN__ + if (os_get_os_version() == OS_WIN95 + || os_get_os_version() == OS_WIN31 + || os_get_os_version() == OS_WINNT) { + + /* On Win 95, 98, ME, Win32 subsystem for Windows 3.1, + and NT use simulated aio. In NT Windows provides async i/o, + but when run in conjunction with InnoDB Hot Backup, it seemed + to corrupt the data files. */ + + os_aio_use_native_aio = FALSE; + } else { + /* On Win 2000 and XP use async i/o */ + os_aio_use_native_aio = TRUE; + } +#endif + if (srv_file_flush_method_str == NULL) { + /* These are the default options */ + + srv_unix_file_flush_method = SRV_UNIX_FSYNC; + + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#ifndef __WIN__ + } else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) { + srv_unix_file_flush_method = SRV_UNIX_FSYNC; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) { + srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) { + srv_unix_file_flush_method = SRV_UNIX_O_DIRECT; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { + srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) { + srv_unix_file_flush_method = SRV_UNIX_NOSYNC; +#else + } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) { + srv_win_file_flush_method = SRV_WIN_IO_NORMAL; + os_aio_use_native_aio = FALSE; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; + os_aio_use_native_aio = FALSE; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, + "async_unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#endif + } else { + fprintf(stderr, + "InnoDB: Unrecognized value %s for" + " innodb_flush_method\n", + srv_file_flush_method_str); + return(DB_ERROR); + } + + /* Note that the call srv_boot() also changes the values of + some variables to the units used by InnoDB internally */ + + /* Set the maximum number of threads which can wait for a semaphore + inside InnoDB: this is the 'sync 
+	wait array' size, as well as the
+	maximum number of threads that can wait in the 'srv_conc array' for
+	their time to enter InnoDB. */
+
+#if defined(__NETWARE__)
+
+	/* Create fewer event semaphores because Win 98/ME had
+	difficulty creating 40000 event semaphores. Comment from
+	Novell, Inc.: also, these just take a lot of memory on
+	NetWare. */
+	srv_max_n_threads = 1000;
+#else
+	if (srv_buf_pool_size >= 1000 * 1024 * 1024) {
+		/* If the buffer pool is at least 1000 MB, we can
+		afford many waiting threads; with a smaller pool we
+		assume fewer threads below. */
+		srv_max_n_threads = 50000;
+
+	} else if (srv_buf_pool_size >= 8 * 1024 * 1024) {
+
+		srv_max_n_threads = 10000;
+	} else {
+		srv_max_n_threads = 1000;	/* saves several MB of memory,
+						especially in 64-bit
+						computers */
+	}
+#endif
+	err = srv_boot();
+
+	if (err != DB_SUCCESS) {
+
+		return((int) err);
+	}
+
+	mutex_create(&srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
+
+	if (srv_innodb_status) {
+		srv_monitor_file_name = mem_alloc(
+			strlen(fil_path_to_mysql_datadir)
+			+ 20 + sizeof "/innodb_status.");
+		sprintf(srv_monitor_file_name, "%s/innodb_status.%lu",
+			fil_path_to_mysql_datadir, os_proc_get_number());
+		srv_monitor_file = fopen(srv_monitor_file_name, "w+");
+		if (!srv_monitor_file) {
+			fprintf(stderr, "InnoDB: unable to create %s: %s\n",
+				srv_monitor_file_name, strerror(errno));
+			return(DB_ERROR);
+		}
+	} else {
+		srv_monitor_file_name = NULL;
+		srv_monitor_file = os_file_create_tmpfile();
+		if (!srv_monitor_file) {
+			return(DB_ERROR);
+		}
+	}
+
+	mutex_create(&srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
+
+	srv_dict_tmpfile = os_file_create_tmpfile();
+	if (!srv_dict_tmpfile) {
+		return(DB_ERROR);
+	}
+
+	mutex_create(&srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
+
+	srv_misc_tmpfile = os_file_create_tmpfile();
+	if (!srv_misc_tmpfile) {
+		return(DB_ERROR);
+	}
+
+	/* Overwrite innodb_file_io_threads: two service threads plus
+	the configured numbers of read and write i/o threads */
+	srv_n_file_io_threads = 2 + srv_n_read_io_threads
+		+ srv_n_write_io_threads;
+
+	/* Restrict the maximum number of file i/o threads */
+	if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
+
+		srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
+		srv_n_read_io_threads = srv_n_write_io_threads
+			= (SRV_MAX_N_IO_THREADS - 2) / 2;
+	}
+
+	if (!os_aio_use_native_aio) {
+		/* In simulated aio we currently have use for only
+		4 threads */
+		/*srv_n_file_io_threads = 4;*/
+
+		os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
+			    * srv_n_file_io_threads,
+			    srv_n_read_io_threads, srv_n_write_io_threads,
+			    SRV_MAX_N_PENDING_SYNC_IOS * 8);
+	} else {
+		os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
+			    * srv_n_file_io_threads,
+			    srv_n_read_io_threads, srv_n_write_io_threads,
+			    SRV_MAX_N_PENDING_SYNC_IOS);
+	}
+
+	fil_init(srv_max_n_open_files);
+
+	ret = buf_pool_init();
+
+	if (ret == NULL) {
+		fprintf(stderr,
+			"InnoDB: Fatal error: cannot allocate the memory"
+			" for the buffer pool\n");
+
+		return(DB_ERROR);
+	}
+
+#ifdef UNIV_DEBUG
+	/* We have observed deadlocks with a 5MB buffer pool but
+	the actual lower limit could very well be a little higher.
*/ + + if (srv_buf_pool_size <= 5 * 1024 * 1024) { + + fprintf(stderr, "InnoDB: Warning: Small buffer pool size " + "(%luM), the flst_validate() debug function " + "can cause a deadlock if the buffer pool fills up.\n", + srv_buf_pool_size / 1024 / 1024); + } +#endif + + fsp_init(); + log_init(); + + lock_sys_create(srv_lock_table_size); + + /* Create i/o-handler threads: */ + + for (i = 0; i < srv_n_file_io_threads; i++) { + n[i] = i; + + os_thread_create(io_handler_thread, n + i, thread_ids + i); + } + +#ifdef UNIV_LOG_ARCHIVE + if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) { + fprintf(stderr, + "InnoDB: Error: you must set the log group" + " home dir in my.cnf the\n" + "InnoDB: same as log arch dir.\n"); + + return(DB_ERROR); + } +#endif /* UNIV_LOG_ARCHIVE */ + + if (srv_n_log_files * srv_log_file_size >= 262144) { + fprintf(stderr, + "InnoDB: Error: combined size of log files" + " must be < 4 GB\n"); + + return(DB_ERROR); + } + + sum_of_new_sizes = 0; + + for (i = 0; i < srv_n_data_files; i++) { +#ifndef __WIN__ + if (sizeof(off_t) < 5 && srv_data_file_sizes[i] >= 262144) { + fprintf(stderr, + "InnoDB: Error: file size must be < 4 GB" + " with this MySQL binary\n" + "InnoDB: and operating system combination," + " in some OS's < 2 GB\n"); + + return(DB_ERROR); + } +#endif + sum_of_new_sizes += srv_data_file_sizes[i]; + } + + if (sum_of_new_sizes < 640) { + fprintf(stderr, + "InnoDB: Error: tablespace size must be" + " at least 10 MB\n"); + + return(DB_ERROR); + } + + err = open_or_create_data_files(&create_new_db, +#ifdef UNIV_LOG_ARCHIVE + &min_arch_log_no, &max_arch_log_no, +#endif /* UNIV_LOG_ARCHIVE */ + &min_flushed_lsn, &max_flushed_lsn, + &sum_of_new_sizes); + if (err != DB_SUCCESS) { + fprintf(stderr, + "InnoDB: Could not open or create data files.\n" + "InnoDB: If you tried to add new data files," + " and it failed here,\n" + "InnoDB: you should now edit innodb_data_file_path" + " in my.cnf back\n" + "InnoDB: to what it was, and remove the" + " new ibdata files InnoDB created\n" + "InnoDB: in this failed attempt. InnoDB only wrote" + " those files full of\n" + "InnoDB: zeros, but did not yet use them in any way." + " But be careful: do not\n" + "InnoDB: remove old data files" + " which contain your precious data!\n"); + + return((int) err); + } + +#ifdef UNIV_LOG_ARCHIVE + srv_normalize_path_for_win(srv_arch_dir); + srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir); +#endif /* UNIV_LOG_ARCHIVE */ + + for (i = 0; i < srv_n_log_files; i++) { + err = open_or_create_log_file(create_new_db, &log_file_created, + log_opened, 0, i); + if (err != DB_SUCCESS) { + + return((int) err); + } + + if (log_file_created) { + log_created = TRUE; + } else { + log_opened = TRUE; + } + if ((log_opened && create_new_db) + || (log_opened && log_created)) { + fprintf(stderr, + "InnoDB: Error: all log files must be" + " created at the same time.\n" + "InnoDB: All log files must be" + " created also in database creation.\n" + "InnoDB: If you want bigger or smaller" + " log files, shut down the\n" + "InnoDB: database and make sure there" + " were no errors in shutdown.\n" + "InnoDB: Then delete the existing log files." 
+ " Edit the .cnf file\n" + "InnoDB: and start the database again.\n"); + + return(DB_ERROR); + } + } + + /* Open all log files and data files in the system tablespace: we + keep them open until database shutdown */ + + fil_open_log_and_system_tablespace_files(); + + if (log_created && !create_new_db +#ifdef UNIV_LOG_ARCHIVE + && !srv_archive_recovery +#endif /* UNIV_LOG_ARCHIVE */ + ) { + if (max_flushed_lsn != min_flushed_lsn +#ifdef UNIV_LOG_ARCHIVE + || max_arch_log_no != min_arch_log_no +#endif /* UNIV_LOG_ARCHIVE */ + ) { + fprintf(stderr, + "InnoDB: Cannot initialize created" + " log files because\n" + "InnoDB: data files were not in sync" + " with each other\n" + "InnoDB: or the data files are corrupt.\n"); + + return(DB_ERROR); + } + + if (max_flushed_lsn < (ib_uint64_t) 1000) { + fprintf(stderr, + "InnoDB: Cannot initialize created" + " log files because\n" + "InnoDB: data files are corrupt," + " or new data files were\n" + "InnoDB: created when the database" + " was started previous\n" + "InnoDB: time but the database" + " was not shut down\n" + "InnoDB: normally after that.\n"); + + return(DB_ERROR); + } + + mutex_enter(&(log_sys->mutex)); + +#ifdef UNIV_LOG_ARCHIVE + /* Do not + 1 arch_log_no because we do not use log + archiving */ + recv_reset_logs(max_flushed_lsn, max_arch_log_no, TRUE); +#else + recv_reset_logs(max_flushed_lsn, TRUE); +#endif /* UNIV_LOG_ARCHIVE */ + + mutex_exit(&(log_sys->mutex)); + } + + trx_sys_file_format_init(); + + if (create_new_db) { + mtr_start(&mtr); + fsp_header_init(0, sum_of_new_sizes, &mtr); + + mtr_commit(&mtr); + + trx_sys_create(); + dict_create(); + srv_startup_is_before_trx_rollback_phase = FALSE; + + if (srv_extra_rsegments) + trx_sys_create_extra_rseg(srv_extra_rsegments); +#ifdef UNIV_LOG_ARCHIVE + } else if (srv_archive_recovery) { + fprintf(stderr, + "InnoDB: Starting archive" + " recovery from a backup...\n"); + err = recv_recovery_from_archive_start( + min_flushed_lsn, srv_archive_recovery_limit_lsn, + min_arch_log_no); + if (err != DB_SUCCESS) { + + return(DB_ERROR); + } + /* Since ibuf init is in dict_boot, and ibuf is needed + in any disk i/o, first call dict_boot */ + + dict_boot(); + trx_sys_init_at_db_start(); + srv_startup_is_before_trx_rollback_phase = FALSE; + + /* Initialize the fsp free limit global variable in the log + system */ + fsp_header_get_free_limit(); + + recv_recovery_from_archive_finish(); +#endif /* UNIV_LOG_ARCHIVE */ + } else { + + /* Check if we support the max format that is stamped + on the system tablespace. + Note: We are NOT allowed to make any modifications to + the TRX_SYS_PAGE_NO page before recovery because this + page also contains the max_trx_id etc. important system + variables that are required for recovery. We need to + ensure that we return the system to a state where normal + recovery is guaranteed to work. We do this by + invalidating the buffer cache, this will force the + reread of the page and restoration to its last known + consistent state, this is REQUIRED for the recovery + process to work. */ + err = trx_sys_file_format_max_check( + srv_check_file_format_at_startup); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Invalidate the buffer pool to ensure that we reread + the page that we read above, during recovery. + Note that this is not as heavy weight as it seems. At + this point there will be only ONE page in the buf_LRU + and there must be no page in the buf_flush list. 
*/ + buf_pool_invalidate(); + + /* We always try to do a recovery, even if the database had + been shut down normally: this is the normal startup path */ + + err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT, + IB_ULONGLONG_MAX, + min_flushed_lsn, + max_flushed_lsn); + if (err != DB_SUCCESS) { + + return(DB_ERROR); + } + + /* Since the insert buffer init is in dict_boot, and the + insert buffer is needed in any disk i/o, first we call + dict_boot(). Note that trx_sys_init_at_db_start() only needs + to access space 0, and the insert buffer at this stage already + works for space 0. */ + + dict_boot(); + trx_sys_init_at_db_start(); + + if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) { + /* The following call is necessary for the insert + buffer to work with multiple tablespaces. We must + know the mapping between space id's and .ibd file + names. + + In a crash recovery, we check that the info in data + dictionary is consistent with what we already know + about space id's from the call of + fil_load_single_table_tablespaces(). + + In a normal startup, we create the space objects for + every table in the InnoDB data dictionary that has + an .ibd file. + + We also determine the maximum tablespace id used. + + TODO: We may have incomplete transactions in the + data dictionary tables. Does that harm the scanning of + the data dictionary below? */ + + dict_check_tablespaces_and_store_max_id( + recv_needed_recovery); + } + + srv_startup_is_before_trx_rollback_phase = FALSE; + + /* Initialize the fsp free limit global variable in the log + system */ + fsp_header_get_free_limit(); + + /* recv_recovery_from_checkpoint_finish needs trx lists which + are initialized in trx_sys_init_at_db_start(). */ + + recv_recovery_from_checkpoint_finish(); + + /* It is possible that file_format tag has never + been set. In this case we initialize it to minimum + value. Important to note that we can do it ONLY after + we have finished the recovery process so that the + image of TRX_SYS_PAGE_NO is not stale. 
*/ + trx_sys_file_format_tag_init(); + } + + if (!create_new_db && sum_of_new_sizes > 0) { + /* New data file(s) were added */ + mtr_start(&mtr); + + fsp_header_inc_size(0, sum_of_new_sizes, &mtr); + + mtr_commit(&mtr); + + /* Immediately write the log record about increased tablespace + size to disk, so that it is durable even if mysqld would crash + quickly */ + + log_buffer_flush_to_disk(); + } + +#ifdef UNIV_LOG_ARCHIVE + /* Archiving is always off under MySQL */ + if (!srv_log_archive_on) { + ut_a(DB_SUCCESS == log_archive_noarchivelog()); + } else { + mutex_enter(&(log_sys->mutex)); + + start_archive = FALSE; + + if (log_sys->archiving_state == LOG_ARCH_OFF) { + start_archive = TRUE; + } + + mutex_exit(&(log_sys->mutex)); + + if (start_archive) { + ut_a(DB_SUCCESS == log_archive_archivelog()); + } + } +#endif /* UNIV_LOG_ARCHIVE */ + + /* fprintf(stderr, "Max allowed record size %lu\n", + page_get_free_space_of_empty() / 2); */ + + /* Create the thread which watches the timeouts for lock waits + and prints InnoDB monitor info */ + + os_thread_create(&srv_lock_timeout_and_monitor_thread, NULL, + thread_ids + 2 + SRV_MAX_N_IO_THREADS); + + /* Create the thread which warns of long semaphore waits */ + os_thread_create(&srv_error_monitor_thread, NULL, + thread_ids + 3 + SRV_MAX_N_IO_THREADS); + srv_is_being_started = FALSE; + + if (trx_doublewrite == NULL) { + /* Create the doublewrite buffer to a new tablespace */ + + trx_sys_create_doublewrite_buf(); + } + + err = dict_create_or_check_foreign_constraint_tables(); + + if (err != DB_SUCCESS) { + return((int)DB_ERROR); + } + + /* Create the master thread which does purge and other utility + operations */ + + os_thread_create(&srv_master_thread, NULL, thread_ids + + (1 + SRV_MAX_N_IO_THREADS)); +#ifdef UNIV_DEBUG + /* buf_debug_prints = TRUE; */ +#endif /* UNIV_DEBUG */ + sum_of_data_file_sizes = 0; + + for (i = 0; i < srv_n_data_files; i++) { + sum_of_data_file_sizes += srv_data_file_sizes[i]; + } + + tablespace_size_in_header = fsp_header_get_tablespace_size(); + + if (!srv_auto_extend_last_data_file + && sum_of_data_file_sizes != tablespace_size_in_header) { + + fprintf(stderr, + "InnoDB: Error: tablespace size" + " stored in header is %lu pages, but\n" + "InnoDB: the sum of data file sizes is %lu pages\n", + (ulong) tablespace_size_in_header, + (ulong) sum_of_data_file_sizes); + + if (srv_force_recovery == 0 + && sum_of_data_file_sizes < tablespace_size_in_header) { + /* This is a fatal error, the tail of a tablespace is + missing */ + + fprintf(stderr, + "InnoDB: Cannot start InnoDB." + " The tail of the system tablespace is\n" + "InnoDB: missing. Have you edited" + " innodb_data_file_path in my.cnf in an\n" + "InnoDB: inappropriate way, removing" + " ibdata files from there?\n" + "InnoDB: You can set innodb_force_recovery=1" + " in my.cnf to force\n" + "InnoDB: a startup if you are trying" + " to recover a badly corrupt database.\n"); + + return(DB_ERROR); + } + } + + if (srv_auto_extend_last_data_file + && sum_of_data_file_sizes < tablespace_size_in_header) { + + fprintf(stderr, + "InnoDB: Error: tablespace size stored in header" + " is %lu pages, but\n" + "InnoDB: the sum of data file sizes" + " is only %lu pages\n", + (ulong) tablespace_size_in_header, + (ulong) sum_of_data_file_sizes); + + if (srv_force_recovery == 0) { + + fprintf(stderr, + "InnoDB: Cannot start InnoDB. The tail of" + " the system tablespace is\n" + "InnoDB: missing. 
Have you edited" + " innodb_data_file_path in my.cnf in an\n" + "InnoDB: inappropriate way, removing" + " ibdata files from there?\n" + "InnoDB: You can set innodb_force_recovery=1" + " in my.cnf to force\n" + "InnoDB: a startup if you are trying to" + " recover a badly corrupt database.\n"); + + return(DB_ERROR); + } + } + + /* Check that os_fast_mutexes work as expected */ + os_fast_mutex_init(&srv_os_test_mutex); + + if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) { + fprintf(stderr, + "InnoDB: Error: pthread_mutex_trylock returns" + " an unexpected value on\n" + "InnoDB: success! Cannot continue.\n"); + exit(1); + } + + os_fast_mutex_unlock(&srv_os_test_mutex); + + os_fast_mutex_lock(&srv_os_test_mutex); + + os_fast_mutex_unlock(&srv_os_test_mutex); + + os_fast_mutex_free(&srv_os_test_mutex); + + if (srv_print_verbose_log) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB Plugin %s started; " + "log sequence number %llu\n", + INNODB_VERSION_STR, srv_start_lsn); + } + + if (srv_force_recovery > 0) { + fprintf(stderr, + "InnoDB: !!! innodb_force_recovery" + " is set to %lu !!!\n", + (ulong) srv_force_recovery); + } + + fflush(stderr); + + if (trx_doublewrite_must_reset_space_ids) { + /* Actually, we did not change the undo log format between + 4.0 and 4.1.1, and we would not need to run purge to + completion. Note also that the purge algorithm in 4.1.1 + can process the the history list again even after a full + purge, because our algorithm does not cut the end of the + history list in all cases so that it would become empty + after a full purge. That mean that we may purge 4.0 type + undo log even after this phase. + + The insert buffer record format changed between 4.0 and + 4.1.1. It is essential that the insert buffer is emptied + here! */ + + fprintf(stderr, + "InnoDB: You are upgrading to an" + " InnoDB version which allows multiple\n" + "InnoDB: tablespaces. Wait that purge" + " and insert buffer merge run to\n" + "InnoDB: completion...\n"); + for (;;) { + os_thread_sleep(1000000); + + if (0 == strcmp(srv_main_thread_op_info, + "waiting for server activity")) { + + ut_a(ibuf_is_empty()); + + break; + } + } + fprintf(stderr, + "InnoDB: Full purge and insert buffer merge" + " completed.\n"); + + trx_sys_mark_upgraded_to_multiple_tablespaces(); + + fprintf(stderr, + "InnoDB: You have now successfully upgraded" + " to the multiple tablespaces\n" + "InnoDB: format. You should NOT DOWNGRADE" + " to an earlier version of\n" + "InnoDB: InnoDB! But if you absolutely need to" + " downgrade, see\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "multiple-tablespaces.html\n" + "InnoDB: for instructions.\n"); + } + + if (srv_force_recovery == 0) { + /* In the insert buffer we may have even bigger tablespace + id's, because we may have dropped those tablespaces, but + insert buffer merge has not had time to clean the records from + the ibuf tree. */ + + ibuf_update_max_tablespace_id(); + } + + srv_file_per_table = srv_file_per_table_original_value; + + srv_was_started = TRUE; + + return((int) DB_SUCCESS); +} + +/******************************************************************** +Shuts down the InnoDB database. 
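+The shutdown proceeds in the numbered steps marked in the function
+body below: 1. flush the buffer pool to disk and write the current
+lsn to the tablespace header(s); 2. signal all threads created by
+InnoDB to exit and wait for them; 3.-5. free the mutexes, events and
+remaining allocated memory.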
*/ +UNIV_INTERN +int +innobase_shutdown_for_mysql(void) +/*=============================*/ + /* out: DB_SUCCESS or error code */ +{ + ulint i; +#ifdef __NETWARE__ + extern ibool panic_shutdown; +#endif + if (!srv_was_started) { + if (srv_is_being_started) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: shutting down" + " a not properly started\n" + "InnoDB: or created database!\n"); + } + + return(DB_SUCCESS); + } + + /* 1. Flush the buffer pool to disk, write the current lsn to + the tablespace header(s), and copy all log data to archive. + Step 1 is the real InnoDB shutdown. The remaining steps 2 - ... + just free data structures after the shutdown. */ + + + if (srv_fast_shutdown == 2) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: MySQL has requested a very fast shutdown" + " without flushing " + "the InnoDB buffer pool to data files." + " At the next mysqld startup " + "InnoDB will do a crash recovery!\n"); + } + +#ifdef __NETWARE__ + if(!panic_shutdown) +#endif + logs_empty_and_mark_files_at_shutdown(); + + if (srv_conc_n_threads != 0) { + fprintf(stderr, + "InnoDB: Warning: query counter shows %ld queries" + " still\n" + "InnoDB: inside InnoDB at shutdown\n", + srv_conc_n_threads); + } + + /* 2. Make all threads created by InnoDB exit */ + + srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS; + + /* In a 'very fast' shutdown, we do not need to wait for these threads + to die; all that counts is that we flushed the log; a 'very fast' + shutdown is essentially a crash. */ + + if (srv_fast_shutdown == 2) { + return(DB_SUCCESS); + } + + /* All threads end up waiting for certain events. Put those events + to the signaled state. Then the threads will exit themselves in + os_thread_event_wait(). */ + + for (i = 0; i < 1000; i++) { + /* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM + HERE OR EARLIER */ + + /* a. Let the lock timeout thread exit */ + os_event_set(srv_lock_timeout_thread_event); + + /* b. srv error monitor thread exits automatically, no need + to do anything here */ + + /* c. We wake the master thread so that it exits */ + srv_wake_master_thread(); + + /* d. Exit the i/o threads */ + + os_aio_wake_all_threads_at_shutdown(); + + os_mutex_enter(os_sync_mutex); + + if (os_thread_count == 0) { + /* All the threads have exited or are just exiting; + NOTE that the threads may not have completed their + exit yet. Should we use pthread_join() to make sure + they have exited? Now we just sleep 0.1 seconds and + hope that is enough! */ + + os_mutex_exit(os_sync_mutex); + + os_thread_sleep(100000); + + break; + } + + os_mutex_exit(os_sync_mutex); + + os_thread_sleep(100000); + } + + if (i == 1000) { + fprintf(stderr, + "InnoDB: Warning: %lu threads created by InnoDB" + " had not exited at shutdown!\n", + (ulong) os_thread_count); + } + + if (srv_monitor_file) { + fclose(srv_monitor_file); + srv_monitor_file = 0; + if (srv_monitor_file_name) { + unlink(srv_monitor_file_name); + mem_free(srv_monitor_file_name); + } + } + if (srv_dict_tmpfile) { + fclose(srv_dict_tmpfile); + srv_dict_tmpfile = 0; + } + + if (srv_misc_tmpfile) { + fclose(srv_misc_tmpfile); + srv_misc_tmpfile = 0; + } + + trx_sys_file_format_close(); + + mutex_free(&srv_monitor_file_mutex); + mutex_free(&srv_dict_tmpfile_mutex); + mutex_free(&srv_misc_tmpfile_mutex); + + /* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside + them */ + sync_close(); + + /* 4.
Free the os_conc_mutex and all os_events and os_mutexes */ + + srv_free(); + os_sync_free(); + + /* Check that all read views are closed except read view owned + by a purge. */ + + if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) { + fprintf(stderr, + "InnoDB: Error: all read views were not closed" + " before shutdown:\n" + "InnoDB: %lu read views open \n", + UT_LIST_GET_LEN(trx_sys->view_list) - 1); + } + + /* 5. Free all allocated memory and the os_fast_mutex created in + ut0mem.c */ + + buf_pool_free(); + ut_free_all_mem(); + + if (os_thread_count != 0 + || os_event_count != 0 + || os_mutex_count != 0 + || os_fast_mutex_count != 0) { + fprintf(stderr, + "InnoDB: Warning: some resources were not" + " cleaned up in shutdown:\n" + "InnoDB: threads %lu, events %lu," + " os_mutexes %lu, os_fast_mutexes %lu\n", + (ulong) os_thread_count, (ulong) os_event_count, + (ulong) os_mutex_count, (ulong) os_fast_mutex_count); + } + + if (dict_foreign_err_file) { + fclose(dict_foreign_err_file); + } + if (lock_latest_err_file) { + fclose(lock_latest_err_file); + } + + if (srv_print_verbose_log) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Shutdown completed;" + " log sequence number %llu\n", + srv_shutdown_lsn); + } + + srv_was_started = FALSE; + + return((int) DB_SUCCESS); +} + +#ifdef __NETWARE__ +void set_panic_flag_for_netware() +{ + extern ibool panic_shutdown; + panic_shutdown = TRUE; +} +#endif /* __NETWARE__ */ +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/sync/sync0arr.c b/storage/xtradb/sync/sync0arr.c new file mode 100644 index 00000000000..62165eefd46 --- /dev/null +++ b/storage/xtradb/sync/sync0arr.c @@ -0,0 +1,1029 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The wait array used in synchronization primitives + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0arr.h" +#ifdef UNIV_NONINL +#include "sync0arr.ic" +#endif + +#include "sync0sync.h" +#include "sync0rw.h" +#include "os0sync.h" +#include "os0file.h" +#include "srv0srv.h" + +/* + WAIT ARRAY + ========== + +The wait array consists of cells each of which has an +operating system event object created for it.
The threads +waiting for a mutex, for example, can reserve a cell +in the array and suspend themselves to wait for the event +to become signaled. When using the wait array, remember to make +sure that some thread holding the synchronization object +will eventually know that there is a waiter in the array and +signal the object, to prevent infinite wait. +Why did we choose to implement a wait array? First, to make +mutexes fast, we had to code our own implementation of them, +which resorts to using slow operating system primitives +only in uncommon cases. Then we had the choice of +assigning a unique OS event for each mutex, which would +be simpler, or using a global wait array. In some operating systems, +the global wait array solution is more efficient and flexible, +because we can do with a very small number of OS events, +say 200. In NT 3.51, allocating events seems to be a quadratic +algorithm, because 10 000 events are created fast, but +100 000 events take a couple of minutes to create. + +As of 5.0.30 the above-mentioned design has changed. Since the +OS can now handle millions of wait events efficiently, we no longer +have this concept of each cell of wait array having one event. +Instead, now the event that a thread wants to wait on is embedded +in the wait object (mutex or rw_lock). We still keep the global +wait array for the sake of diagnostics and also to avoid infinite +wait. The error_monitor thread scans the global wait array to signal +any waiting threads that have missed the signal. */ + +/* A cell where an individual thread may wait suspended +until a resource is released. The suspending is implemented +using an operating system event semaphore. */ +struct sync_cell_struct { + void* wait_object; /* pointer to the object the + thread is waiting for; if NULL + the cell is free for use */ + mutex_t* old_wait_mutex; /* the latest wait mutex in cell */ + rw_lock_t* old_wait_rw_lock;/* the latest wait rw-lock in cell */ + ulint request_type; /* lock type requested on the + object */ + const char* file; /* in debug version file where + requested */ + ulint line; /* in debug version line where + requested */ + os_thread_id_t thread; /* thread id of this waiting + thread */ + ibool waiting; /* TRUE if the thread has already + called sync_array_event_wait + on this cell */ + ib_int64_t signal_count; /* We capture the signal_count + of the wait_object when we + reset the event. This value is + then passed on to os_event_wait + and we wait only if the event + has not been signalled in the + period between the reset and + wait call. */ + time_t reservation_time;/* time when the thread reserved + the wait cell */ +}; + +/* NOTE: It is allowed for a thread to wait +for an event allocated for the array without owning the +protecting mutex (depending on the case: OS or database mutex), but +all changes (set or reset) to the state of the event must be made +while owning the mutex. */ +struct sync_array_struct { + ulint n_reserved; /* number of currently reserved + cells in the wait array */ + ulint n_cells; /* number of cells in the + wait array */ + sync_cell_t* array; /* pointer to wait array */ + ulint protection; /* this flag tells which + mutex protects the data */ + mutex_t mutex; /* possible database mutex + protecting this data structure */ + os_mutex_t os_mutex; /* Possible operating system mutex + protecting the data structure. + As this data structure is used in + constructing the database mutex, + to prevent infinite recursion + in implementation, we fall back to + an OS mutex.
*/ + ulint sg_count; /* count of how many times an + object has been signalled */ + ulint res_count; /* count of cell reservations + since creation of the array */ +}; + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +This function is called only in the debug version. Detects a deadlock +of one or more threads because of waits of semaphores. */ +static +ibool +sync_array_detect_deadlock( +/*=======================*/ + /* out: TRUE if deadlock detected */ + sync_array_t* arr, /* in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /* in: cell where recursive search started */ + sync_cell_t* cell, /* in: cell to search */ + ulint depth); /* in: recursion depth */ +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************* +Gets the nth cell in array. */ +static +sync_cell_t* +sync_array_get_nth_cell( +/*====================*/ + /* out: cell */ + sync_array_t* arr, /* in: sync array */ + ulint n) /* in: index */ +{ + ut_a(arr); + ut_a(n < arr->n_cells); + + return(arr->array + n); +} + +/********************************************************************** +Reserves the mutex semaphore protecting a sync array. */ +static +void +sync_array_enter( +/*=============*/ + sync_array_t* arr) /* in: sync wait array */ +{ + ulint protection; + + protection = arr->protection; + + if (protection == SYNC_ARRAY_OS_MUTEX) { + os_mutex_enter(arr->os_mutex); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_enter(&(arr->mutex)); + } else { + ut_error; + } +} + +/********************************************************************** +Releases the mutex semaphore protecting a sync array. */ +static +void +sync_array_exit( +/*============*/ + sync_array_t* arr) /* in: sync wait array */ +{ + ulint protection; + + protection = arr->protection; + + if (protection == SYNC_ARRAY_OS_MUTEX) { + os_mutex_exit(arr->os_mutex); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_exit(&(arr->mutex)); + } else { + ut_error; + } +} + +/*********************************************************************** +Creates a synchronization wait array. It is protected by a mutex +which is automatically reserved when the functions operating on it +are called. 
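Before the constructor below, a minimal sketch of the array's life cycle may help; the wrapper function is invented for illustration, and the cell count is arbitrary (the two creation-time choices are the number of cells and the protection mode):

static
void
sync_array_lifecycle_sketch(void)
/*=============================*/
{
	sync_array_t*	arr;

	/* OS-mutex protection is what the primary wait array itself
	needs: the wait array underpins the database mutex, so
	protecting it with a database mutex would recurse. */
	arr = sync_array_create(1000, SYNC_ARRAY_OS_MUTEX);

	sync_array_validate(arr);	/* no cells reserved yet */

	sync_array_free(arr);		/* asserts n_reserved == 0 */
}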
*/ +UNIV_INTERN +sync_array_t* +sync_array_create( +/*==============*/ + /* out, own: created wait array */ + ulint n_cells, /* in: number of cells in the array + to create */ + ulint protection) /* in: either SYNC_ARRAY_OS_MUTEX or + SYNC_ARRAY_MUTEX: determines the type + of mutex protecting the data structure */ +{ + sync_array_t* arr; + sync_cell_t* cell_array; + sync_cell_t* cell; + ulint i; + + ut_a(n_cells > 0); + + /* Allocate memory for the data structures */ + arr = ut_malloc(sizeof(sync_array_t)); + + cell_array = ut_malloc(sizeof(sync_cell_t) * n_cells); + + arr->n_cells = n_cells; + arr->n_reserved = 0; + arr->array = cell_array; + arr->protection = protection; + arr->sg_count = 0; + arr->res_count = 0; + + /* Then create the mutex to protect the wait array complex */ + if (protection == SYNC_ARRAY_OS_MUTEX) { + arr->os_mutex = os_mutex_create(NULL); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_create(&arr->mutex, SYNC_NO_ORDER_CHECK); + } else { + ut_error; + } + + for (i = 0; i < n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + cell->wait_object = NULL; + cell->waiting = FALSE; + cell->signal_count = 0; + } + + return(arr); +} + +/********************************************************************** +Frees the resources in a wait array. */ +UNIV_INTERN +void +sync_array_free( +/*============*/ + sync_array_t* arr) /* in, own: sync wait array */ +{ + ulint protection; + + ut_a(arr->n_reserved == 0); + + sync_array_validate(arr); + + protection = arr->protection; + + /* Release the mutex protecting the wait array complex */ + + if (protection == SYNC_ARRAY_OS_MUTEX) { + os_mutex_free(arr->os_mutex); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_free(&(arr->mutex)); + } else { + ut_error; + } + + ut_free(arr->array); + ut_free(arr); +} + +/************************************************************************ +Validates the integrity of the wait array. Checks +that the number of reserved cells equals the count variable. */ +UNIV_INTERN +void +sync_array_validate( +/*================*/ + sync_array_t* arr) /* in: sync wait array */ +{ + ulint i; + sync_cell_t* cell; + ulint count = 0; + + sync_array_enter(arr); + + for (i = 0; i < arr->n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + if (cell->wait_object != NULL) { + count++; + } + } + + ut_a(count == arr->n_reserved); + + sync_array_exit(arr); +} + +/*********************************************************************** +Returns the event that the thread owning the cell waits for. */ +static +os_event_t +sync_cell_get_event( +/*================*/ + sync_cell_t* cell) /* in: non-empty sync array cell */ +{ + ulint type = cell->request_type; + + if (type == SYNC_MUTEX) { + return(((mutex_t *) cell->wait_object)->event); + } else if (type == RW_LOCK_WAIT_EX) { + return(((rw_lock_t *) cell->wait_object)->wait_ex_event); + } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */ + return(((rw_lock_t *) cell->wait_object)->event); + } +} + +/********************************************************************** +Reserves a wait array cell for waiting for an object. +The event of the cell is reset to nonsignalled state. 
*/ +UNIV_INTERN +void +sync_array_reserve_cell( +/*====================*/ + sync_array_t* arr, /* in: wait array */ + void* object, /* in: pointer to the object to wait for */ + ulint type, /* in: lock request type */ + const char* file, /* in: file where requested */ + ulint line, /* in: line where requested */ + ulint* index) /* out: index of the reserved cell */ +{ + sync_cell_t* cell; + os_event_t event; + ulint i; + + ut_a(object); + ut_a(index); + + sync_array_enter(arr); + + arr->res_count++; + + /* Reserve a new cell. */ + for (i = 0; i < arr->n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object == NULL) { + + cell->waiting = FALSE; + cell->wait_object = object; + + if (type == SYNC_MUTEX) { + cell->old_wait_mutex = object; + } else { + cell->old_wait_rw_lock = object; + } + + cell->request_type = type; + + cell->file = file; + cell->line = line; + + arr->n_reserved++; + + *index = i; + + sync_array_exit(arr); + + /* Make sure the event is reset and also store + the value of signal_count at which the event + was reset. */ + event = sync_cell_get_event(cell); + cell->signal_count = os_event_reset(event); + + cell->reservation_time = time(NULL); + + cell->thread = os_thread_get_curr_id(); + + return; + } + } + + ut_error; /* No free cell found */ + + return; +} + +/********************************************************************** +This function should be called when a thread starts to wait on +a wait array cell. In the debug version this function checks +if the wait for a semaphore will result in a deadlock, in which +case prints info and asserts. */ +UNIV_INTERN +void +sync_array_wait_event( +/*==================*/ + sync_array_t* arr, /* in: wait array */ + ulint index) /* in: index of the reserved cell */ +{ + sync_cell_t* cell; + os_event_t event; + + ut_a(arr); + + sync_array_enter(arr); + + cell = sync_array_get_nth_cell(arr, index); + + ut_a(cell->wait_object); + ut_a(!cell->waiting); + ut_ad(os_thread_get_curr_id() == cell->thread); + + event = sync_cell_get_event(cell); + cell->waiting = TRUE; + +#ifdef UNIV_SYNC_DEBUG + + /* We use simple enter to the mutex below, because if + we cannot acquire it at once, mutex_enter would call + recursively sync_array routines, leading to trouble. + rw_lock_debug_mutex freezes the debug lists. */ + + rw_lock_debug_mutex_enter(); + + if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) { + + fputs("########################################\n", stderr); + ut_error; + } + + rw_lock_debug_mutex_exit(); +#endif + sync_array_exit(arr); + + os_event_wait_low(event, cell->signal_count); + + sync_array_free_cell(arr, index); +} + +/********************************************************************** +Reports info of a wait array cell. 
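Taken together, sync_array_reserve_cell() and sync_array_wait_event() above give a waiter a race-free way to sleep. A condensed sketch of the calling pattern for a mutex follows; the wrapper is hypothetical, and real callers (see the rw-lock code later in this patch) additionally set a waiters flag so that the unlocking thread knows a signal is needed:

static
void
mutex_wait_sketch(
/*==============*/
	mutex_t*	mutex)	/* in: mutex we failed to acquire */
{
	ulint	index;

	/* Reserving the cell resets the event embedded in the mutex
	and records its signal count in the cell. */
	sync_array_reserve_cell(sync_primary_wait_array, mutex,
				SYNC_MUTEX, __FILE__, __LINE__, &index);

	/* Re-check the condition AFTER the reset: a release in the
	window between our failed attempt and the reservation has
	already bumped the signal count, so the wait below would
	return at once instead of hanging. */
	if (mutex_get_lock_word(mutex) == 0) {

		sync_array_free_cell(sync_primary_wait_array, index);
	} else {
		/* Blocks in os_event_wait_low() and frees the cell
		on return. */
		sync_array_wait_event(sync_primary_wait_array, index);
	}
}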
*/ +static +void +sync_array_cell_print( +/*==================*/ + FILE* file, /* in: file where to print */ + sync_cell_t* cell) /* in: sync cell */ +{ + mutex_t* mutex; + rw_lock_t* rwlock; + ulint type; + ulint writer; + + type = cell->request_type; + + fprintf(file, + "--Thread %lu has waited at %s line %lu" + " for %#.5g seconds the semaphore:\n", + (ulong) os_thread_pf(cell->thread), cell->file, + (ulong) cell->line, + difftime(time(NULL), cell->reservation_time)); + + if (type == SYNC_MUTEX) { + /* We use old_wait_mutex in case the cell has already + been freed meanwhile */ + mutex = cell->old_wait_mutex; + + fprintf(file, + "Mutex at %p created file %s line %lu, lock var %lu\n" +#ifdef UNIV_SYNC_DEBUG + "Last time reserved in file %s line %lu, " +#endif /* UNIV_SYNC_DEBUG */ + "waiters flag %lu\n", + (void*) mutex, mutex->cfile_name, (ulong) mutex->cline, + (ulong) mutex->lock_word, +#ifdef UNIV_SYNC_DEBUG + mutex->file_name, (ulong) mutex->line, +#endif /* UNIV_SYNC_DEBUG */ + (ulong) mutex->waiters); + + } else if (type == RW_LOCK_EX + || type == RW_LOCK_WAIT_EX + || type == RW_LOCK_SHARED) { + + fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file); + + rwlock = cell->old_wait_rw_lock; + + fprintf(file, + " RW-latch at %p created in file %s line %lu\n", + (void*) rwlock, rwlock->cfile_name, + (ulong) rwlock->cline); + writer = rw_lock_get_writer(rwlock); + if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has" + " reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), + writer == RW_LOCK_EX + ? " exclusive\n" + : " wait exclusive\n"); + } + + fprintf(file, + "number of readers %lu, waiters flag %lu, " + "lock_word: %lx\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", + (ulong) rw_lock_get_reader_count(rwlock), + (ulong) rwlock->waiters, + rwlock->lock_word, + rwlock->last_s_file_name, + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, + (ulong) rwlock->last_x_line); + } else { + ut_error; + } + + if (!cell->waiting) { + fputs("wait has ended\n", file); + } +} + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Looks for a cell with the given thread id. */ +static +sync_cell_t* +sync_array_find_thread( +/*===================*/ + /* out: pointer to cell or NULL + if not found */ + sync_array_t* arr, /* in: wait array */ + os_thread_id_t thread) /* in: thread id */ +{ + ulint i; + sync_cell_t* cell; + + for (i = 0; i < arr->n_cells; i++) { + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL + && os_thread_eq(cell->thread, thread)) { + + return(cell); /* Found */ + } + } + + return(NULL); /* Not found */ +} + +/********************************************************************** +Recursion step for deadlock detection. */ +static +ibool +sync_array_deadlock_step( +/*=====================*/ + /* out: TRUE if deadlock detected */ + sync_array_t* arr, /* in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /* in: cell where recursive search + started */ + os_thread_id_t thread, /* in: thread to look at */ + ulint pass, /* in: pass value */ + ulint depth) /* in: recursion depth */ +{ + sync_cell_t* new; + ibool ret; + + depth++; + + if (pass != 0) { + /* If pass != 0, then we do not know which threads are + responsible for releasing the lock, and no deadlock can + be detected.
*/ + + return(FALSE); + } + + new = sync_array_find_thread(arr, thread); + + if (new == start) { + /* Stop the execution of other threads */ + + ut_dbg_stop_threads = TRUE; + + /* Deadlock */ + fputs("########################################\n" + "DEADLOCK of threads detected!\n", stderr); + + return(TRUE); + + } else if (new) { + ret = sync_array_detect_deadlock(arr, start, new, depth); + + if (ret) { + return(TRUE); + } + } + return(FALSE); +} + +/********************************************************************** +This function is called only in the debug version. Detects a deadlock +of one or more threads because of waits of semaphores. */ +static +ibool +sync_array_detect_deadlock( +/*=======================*/ + /* out: TRUE if deadlock detected */ + sync_array_t* arr, /* in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /* in: cell where recursive search started */ + sync_cell_t* cell, /* in: cell to search */ + ulint depth) /* in: recursion depth */ +{ + mutex_t* mutex; + rw_lock_t* lock; + os_thread_id_t thread; + ibool ret; + rw_lock_debug_t*debug; + + ut_a(arr); + ut_a(start); + ut_a(cell); + ut_ad(cell->wait_object); + ut_ad(os_thread_get_curr_id() == start->thread); + ut_ad(depth < 100); + + depth++; + + if (!cell->waiting) { + + return(FALSE); /* No deadlock here */ + } + + if (cell->request_type == SYNC_MUTEX) { + + mutex = cell->wait_object; + + if (mutex_get_lock_word(mutex) != 0) { + + thread = mutex->thread_id; + + /* Note that mutex->thread_id above may also be + OS_THREAD_ID_UNDEFINED, because the + thread which held the mutex may not yet have + updated the value, or it has already + released the mutex: in this case no deadlock + can occur, as the wait array cannot contain + a thread with ID_UNDEFINED value.
*/ + + ret = sync_array_deadlock_step(arr, start, thread, 0, + depth); + if (ret) { + fprintf(stderr, + "Mutex %p owned by thread %lu file %s line %lu\n", + mutex, (ulong) os_thread_pf(mutex->thread_id), + mutex->file_name, (ulong) mutex->line); + sync_array_cell_print(stderr, cell); + + return(TRUE); + } + } + + return(FALSE); /* No deadlock */ + + } else if (cell->request_type == RW_LOCK_EX + || cell->request_type == RW_LOCK_WAIT_EX) { + + lock = cell->wait_object; + + debug = UT_LIST_GET_FIRST(lock->debug_list); + + while (debug != NULL) { + + thread = debug->thread_id; + + if (((debug->lock_type == RW_LOCK_EX) + && !os_thread_eq(thread, cell->thread)) + || ((debug->lock_type == RW_LOCK_WAIT_EX) + && !os_thread_eq(thread, cell->thread)) + || (debug->lock_type == RW_LOCK_SHARED)) { + + /* The (wait) x-lock request can block + infinitely only if someone (possibly also + the cell thread) is holding an s-lock, or + someone (but not the cell thread) is holding + a (wait) x-lock, and that holder is blocked + by the start thread */ + + ret = sync_array_deadlock_step( + arr, start, thread, debug->pass, + depth); + if (ret) { +print: + fprintf(stderr, "rw-lock %p ", + (void*) lock); + sync_array_cell_print(stderr, cell); + rw_lock_debug_print(debug); + return(TRUE); + } + } + + debug = UT_LIST_GET_NEXT(list, debug); + } + + return(FALSE); + + } else if (cell->request_type == RW_LOCK_SHARED) { + + lock = cell->wait_object; + debug = UT_LIST_GET_FIRST(lock->debug_list); + + while (debug != NULL) { + + thread = debug->thread_id; + + if ((debug->lock_type == RW_LOCK_EX) + || (debug->lock_type == RW_LOCK_WAIT_EX)) { + + /* The s-lock request can block infinitely + only if someone (possibly also the cell + thread) is holding a (wait) x-lock, and that + holder is blocked by the start thread */ + + ret = sync_array_deadlock_step( + arr, start, thread, debug->pass, + depth); + if (ret) { + goto print; + } + } + + debug = UT_LIST_GET_NEXT(list, debug); + } + + return(FALSE); + + } else { + ut_error; + } + + return(TRUE); /* Execution never reaches this line: for compiler + fooling only */ +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Determines if we can wake up the thread waiting for a semaphore. */ +static +ibool +sync_arr_cell_can_wake_up( +/*======================*/ + sync_cell_t* cell) /* in: cell to search */ +{ + mutex_t* mutex; + rw_lock_t* lock; + + if (cell->request_type == SYNC_MUTEX) { + + mutex = cell->wait_object; + + if (mutex_get_lock_word(mutex) == 0) { + + return(TRUE); + } + + } else if (cell->request_type == RW_LOCK_EX) { + + lock = cell->wait_object; + + if (lock->lock_word > 0) { + /* Either unlocked or only read locked. */ + + return(TRUE); + } + + } else if (cell->request_type == RW_LOCK_WAIT_EX) { + + lock = cell->wait_object; + + /* lock_word == 0 means all readers have left */ + if (lock->lock_word == 0) { + + return(TRUE); + } + } else if (cell->request_type == RW_LOCK_SHARED) { + lock = cell->wait_object; + + /* lock_word > 0 means no writer or reserved writer */ + if (lock->lock_word > 0) { + + return(TRUE); + } + } + + return(FALSE); +} + +/********************************************************************** +Frees the cell. NOTE! sync_array_wait_event frees the cell +automatically!
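The deadlock search above is easiest to see on the classic two-mutex cycle; the interleaving below is hypothetical, with the mutex names M1 and M2 invented for illustration:

	/* Thread A			Thread B
	   mutex_enter(&M1);		mutex_enter(&M2);
	   mutex_enter(&M2);		mutex_enter(&M1);

	Both second calls block, and each thread reserves a wait array
	cell. Starting from A's cell: the waited-for object M2 is held
	by B; sync_array_find_thread() yields B's cell; B waits for M1,
	which is held by A, so the search arrives back at the start
	cell and sync_array_deadlock_step() reports the cycle. */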
*/ +UNIV_INTERN +void +sync_array_free_cell( +/*=================*/ + sync_array_t* arr, /* in: wait array */ + ulint index) /* in: index of the cell in array */ +{ + sync_cell_t* cell; + + sync_array_enter(arr); + + cell = sync_array_get_nth_cell(arr, index); + + ut_a(cell->wait_object != NULL); + + cell->waiting = FALSE; + cell->wait_object = NULL; + cell->signal_count = 0; + + ut_a(arr->n_reserved > 0); + arr->n_reserved--; + + sync_array_exit(arr); +} + +/************************************************************************** +Increments the signalled count. */ +UNIV_INTERN +void +sync_array_object_signalled( +/*========================*/ + sync_array_t* arr) /* in: wait array */ +{ +#ifdef HAVE_GCC_ATOMIC_BUILTINS + (void) os_atomic_increment(&arr->sg_count, 1); +#else + sync_array_enter(arr); + + arr->sg_count++; + + sync_array_exit(arr); +#endif +} + +/************************************************************************** +If the wakeup algorithm does not work perfectly at semaphore releases, +this function will do the waking (see the comment in mutex_exit). This +function should be called about once every second in the server. + +Note that there's a race condition between this thread and mutex_exit +changing the lock_word and calling signal_object, so sometimes this finds +threads to wake up even when nothing has gone wrong. */ +UNIV_INTERN +void +sync_arr_wake_threads_if_sema_free(void) +/*====================================*/ +{ + sync_array_t* arr = sync_primary_wait_array; + sync_cell_t* cell; + ulint count; + ulint i; + os_event_t event; + + sync_array_enter(arr); + + i = 0; + count = 0; + + while (count < arr->n_reserved) { + + cell = sync_array_get_nth_cell(arr, i); + i++; + + if (cell->wait_object == NULL) { + continue; + } + count++; + + if (sync_arr_cell_can_wake_up(cell)) { + + event = sync_cell_get_event(cell); + + os_event_set(event); + } + + } + + sync_array_exit(arr); +} + +/************************************************************************** +Prints warnings of long semaphore waits to stderr. */ +UNIV_INTERN +ibool +sync_array_print_long_waits(void) +/*=============================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ +{ + sync_cell_t* cell; + ibool old_val; + ibool noticed = FALSE; + ulint i; + ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; + ibool fatal = FALSE; + + for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); + + if (cell->wait_object != NULL && cell->waiting + && difftime(time(NULL), cell->reservation_time) > 240) { + fputs("InnoDB: Warning: a long semaphore wait:\n", + stderr); + sync_array_cell_print(stderr, cell); + noticed = TRUE; + } + + if (cell->wait_object != NULL && cell->waiting + && difftime(time(NULL), cell->reservation_time) + > fatal_timeout) { + fatal = TRUE; + } + } + + if (noticed) { + fprintf(stderr, + "InnoDB: ###### Starts InnoDB Monitor" + " for 30 secs to print diagnostic info:\n"); + old_val = srv_print_innodb_monitor; + + /* If some crucial semaphore is reserved, then also the InnoDB + Monitor can hang, and we do not get diagnostics. Since in + many cases an InnoDB hang is caused by a pwrite() or a pread() + call hanging inside the operating system, let us print right + now the values of pending calls of these.
*/ + + fprintf(stderr, + "InnoDB: Pending preads %lu, pwrites %lu\n", + (ulong)os_file_n_pending_preads, + (ulong)os_file_n_pending_pwrites); + + srv_print_innodb_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + + os_thread_sleep(30000000); + + srv_print_innodb_monitor = old_val; + fprintf(stderr, + "InnoDB: ###### Diagnostic info printed" + " to the standard error stream\n"); + } + + return(fatal); +} + +/************************************************************************** +Prints info of the wait array. */ +static +void +sync_array_output_info( +/*===================*/ + FILE* file, /* in: file where to print */ + sync_array_t* arr) /* in: wait array; NOTE! caller must own the + mutex */ +{ + sync_cell_t* cell; + ulint count; + ulint i; + + fprintf(file, + "OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n", + (long) arr->res_count, (long) arr->sg_count); + i = 0; + count = 0; + + while (count < arr->n_reserved) { + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL) { + count++; + sync_array_cell_print(file, cell); + } + + i++; + } +} + +/************************************************************************** +Prints info of the wait array. */ +UNIV_INTERN +void +sync_array_print_info( +/*==================*/ + FILE* file, /* in: file where to print */ + sync_array_t* arr) /* in: wait array */ +{ + sync_array_enter(arr); + + sync_array_output_info(file, arr); + + sync_array_exit(arr); +} diff --git a/storage/xtradb/sync/sync0rw.c b/storage/xtradb/sync/sync0rw.c new file mode 100644 index 00000000000..09c732eefc9 --- /dev/null +++ b/storage/xtradb/sync/sync0rw.c @@ -0,0 +1,1035 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The read-write lock (for thread synchronization) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0rw.h" +#ifdef UNIV_NONINL +#include "sync0rw.ic" +#endif + +#include "os0thread.h" +#include "mem0mem.h" +#include "srv0srv.h" + +/* + IMPLEMENTATION OF THE RW_LOCK + ============================= +The status of a rw_lock is held in lock_word. The initial value of lock_word is +X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR +for each x-lock. 
This describes the lock state for each value of lock_word: + +lock_word == X_LOCK_DECR: Unlocked. +0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers. + (X_LOCK_DECR - lock_word) is the + number of readers that hold the lock. +lock_word == 0: Write locked +-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer. + (-lock_word) is the number of readers + that hold the lock. +lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been + decremented by X_LOCK_DECR once for each lock, + so the number of locks is: + ((-lock_word) / X_LOCK_DECR) + 1 +When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0: +other values of lock_word are invalid. + +The lock_word is always read and updated atomically and consistently, so that +it always represents the state of the lock, and the state of the lock changes +with a single atomic operation. This lock_word holds all of the information +that a thread needs in order to determine if it is eligible to gain the lock +or if it must spin or sleep. The one exception to this is that writer_thread +must be verified before recursive write locks: to solve this scenario, we make +writer_thread readable by all threads, but only writeable by the x-lock holder. + +The other members of the lock obey the following rules to remain consistent: + +recursive: This and the writer_thread field together control the + behaviour of recursive x-locking. + lock->recursive must be FALSE in the following states: + 1) The writer_thread contains garbage i.e.: the + lock has just been initialized. + 2) The lock is not x-held and there is no + x-waiter waiting on WAIT_EX event. + 3) The lock is x-held or there is an x-waiter + waiting on WAIT_EX event but the 'pass' value + is non-zero. + lock->recursive is TRUE iff: + 1) The lock is x-held or there is an x-waiter + waiting on WAIT_EX event and the 'pass' value + is zero. + This flag must be set after the writer_thread field + has been updated with a memory ordering barrier. + It is unset before the lock_word has been incremented. +writer_thread: Is used only in recursive x-locking. Can only be safely + read iff lock->recursive flag is TRUE. + This field is uninitialized at lock creation time and + is updated atomically when x-lock is acquired or when + move_ownership is called. A thread is only allowed to + set the value of this field to its thread_id i.e.: a + thread cannot set writer_thread to some other thread's + id. +waiters: May be set to 1 anytime, but to avoid unnecessary wake-up + signals, it should only be set to 1 when there are threads + waiting on event. Must be 1 when a writer starts waiting to + ensure the current x-locking thread sends a wake-up signal + during unlock. May only be reset to 0 immediately before a + wake-up signal is sent to event. On most platforms, a + memory barrier is required after waiters is set, and before + verifying lock_word is still held, to ensure some unlocker + really does see the flag's new value. +event: Threads wait on event for read or writer lock when another + thread has an x-lock or an x-lock reservation (wait_ex). A + thread may only wait on event after performing the following + actions in order: + (1) Record the counter value of event (with os_event_reset). + (2) Set waiters to 1. + (3) Verify lock_word <= 0. + (1) must come before (2) to ensure signal is not missed. + (2) must come before (3) to ensure a signal is sent. + These restrictions force the above ordering.
+ Immediately before sending the wake-up signal, we should: + (1) Verify lock_word == X_LOCK_DECR (unlocked) + (2) Reset waiters to 0. +wait_ex_event: A thread may only wait on the wait_ex_event after it has + performed the following actions in order: + (1) Decrement lock_word by X_LOCK_DECR. + (2) Record counter value of wait_ex_event (os_event_reset, + called from sync_array_reserve_cell). + (3) Verify that lock_word < 0. + (1) must come first to ensure that no other thread becomes a + reader or the next writer, and to notify the unlocker that + a signal must be sent. + (2) must come before (3) to ensure the signal is not missed. + These restrictions force the above ordering. + Immediately before sending the wake-up signal, we should: + Verify lock_word == 0 (waiting thread holds x_lock) +*/ + + +/* number of spin waits on rw-latches, +incurred during shared (read) locks */ +UNIV_INTERN ib_int64_t rw_s_spin_wait_count = 0; +UNIV_INTERN ib_int64_t rw_s_spin_round_count = 0; + +/* number of OS waits on rw-latches, +incurred during shared (read) locks */ +UNIV_INTERN ib_int64_t rw_s_os_wait_count = 0; + +/* number of unlocks (that unlock shared locks), +set only when UNIV_SYNC_PERF_STAT is defined */ +UNIV_INTERN ib_int64_t rw_s_exit_count = 0; + +/* number of spin waits on rw-latches, +incurred during exclusive (write) locks */ +UNIV_INTERN ib_int64_t rw_x_spin_wait_count = 0; +UNIV_INTERN ib_int64_t rw_x_spin_round_count = 0; + +/* number of OS waits on rw-latches, +incurred during exclusive (write) locks */ +UNIV_INTERN ib_int64_t rw_x_os_wait_count = 0; + +/* number of unlocks (that unlock exclusive locks), +set only when UNIV_SYNC_PERF_STAT is defined */ +UNIV_INTERN ib_int64_t rw_x_exit_count = 0; + +/* The global list of rw-locks */ +UNIV_INTERN rw_lock_list_t rw_lock_list; +UNIV_INTERN mutex_t rw_lock_list_mutex; + +#ifdef UNIV_SYNC_DEBUG +/* The global mutex which protects debug info lists of all rw-locks. +To modify the debug info list of an rw-lock, this mutex has to be +acquired in addition to the mutex protecting the lock. */ + +UNIV_INTERN mutex_t rw_lock_debug_mutex; +/* If deadlock detection does not immediately get the mutex, +it may wait for this event */ +UNIV_INTERN os_event_t rw_lock_debug_event; +/* This is set to TRUE, if there may be waiters for the event */ +UNIV_INTERN ibool rw_lock_debug_waiters; + +/********************************************************************** +Creates a debug info struct. */ +static +rw_lock_debug_t* +rw_lock_debug_create(void); +/*======================*/ +/********************************************************************** +Frees a debug info struct. */ +static +void +rw_lock_debug_free( +/*===============*/ + rw_lock_debug_t* info); + +/********************************************************************** +Creates a debug info struct. */ +static +rw_lock_debug_t* +rw_lock_debug_create(void) +/*======================*/ +{ + return((rw_lock_debug_t*) mem_alloc(sizeof(rw_lock_debug_t))); +} + +/********************************************************************** +Frees a debug info struct. */ +static +void +rw_lock_debug_free( +/*===============*/ + rw_lock_debug_t* info) +{ + mem_free(info); +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state.
Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +rw_lock_create_func( +/*================*/ + rw_lock_t* lock, /* in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /* in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cmutex_name, /* in: mutex name */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /* in: file name where created */ + ulint cline) /* in: file line where created */ +{ + /* If this is the very first time a synchronization object is + created, then the following call initializes the sync system. */ + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_create(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK); + + lock->mutex.cfile_name = cfile_name; + lock->mutex.cline = cline; + +# if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; +# endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ +# ifdef UNIV_DEBUG + UT_NOT_USED(cmutex_name); +# endif +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + lock->lock_word = X_LOCK_DECR; + lock->waiters = 0; + + /* We set this value to signify that lock->writer_thread + contains garbage at initialization and cannot be used for + recursive x-locking. */ + lock->recursive = FALSE; + +#ifdef UNIV_SYNC_DEBUG + UT_LIST_INIT(lock->debug_list); + + lock->level = level; +#endif /* UNIV_SYNC_DEBUG */ + + lock->magic_n = RW_LOCK_MAGIC_N; + + lock->cfile_name = cfile_name; + lock->cline = (unsigned int) cline; + + lock->count_os_wait = 0; + lock->last_s_file_name = "not yet reserved"; + lock->last_x_file_name = "not yet reserved"; + lock->last_s_line = 0; + lock->last_x_line = 0; + lock->event = os_event_create(NULL); + lock->wait_ex_event = os_event_create(NULL); + + mutex_enter(&rw_lock_list_mutex); + + if (UT_LIST_GET_LEN(rw_lock_list) > 0) { + ut_a(UT_LIST_GET_FIRST(rw_lock_list)->magic_n + == RW_LOCK_MAGIC_N); + } + + UT_LIST_ADD_FIRST(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); +} + +/********************************************************************** +Calling this function is obligatory only if the memory buffer containing +the rw-lock is freed. Removes an rw-lock object from the global list. The +rw-lock is checked to be in the non-locked state. */ +UNIV_INTERN +void +rw_lock_free( +/*=========*/ + rw_lock_t* lock) /* in: rw-lock */ +{ + ut_ad(rw_lock_validate(lock)); + ut_a(lock->lock_word == X_LOCK_DECR); + + lock->magic_n = 0; + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_free(rw_lock_get_mutex(lock)); +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + mutex_enter(&rw_lock_list_mutex); + os_event_free(lock->event); + + os_event_free(lock->wait_ex_event); + + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + if (UT_LIST_GET_NEXT(list, lock)) { + ut_a(UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + + UT_LIST_REMOVE(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); +} + +#ifdef UNIV_DEBUG +/********************************************************************** +Checks that the rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. 
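The state table in the big comment above maps one-to-one onto tests of lock_word; a hypothetical helper (illustrative only, not from the patch) makes the encoding concrete:

static
const char*
rw_lock_state_name(
/*===============*/
			/* out: human-readable lock state */
	lint	lock_word)	/* in: snapshot of lock->lock_word */
{
	if (lock_word == X_LOCK_DECR) {
		return("unlocked");
	} else if (lock_word > 0) {
		/* X_LOCK_DECR - lock_word readers hold the lock */
		return("read locked, no waiting writers");
	} else if (lock_word == 0) {
		return("write locked");
	} else if (lock_word > -X_LOCK_DECR) {
		/* -lock_word readers still hold the lock */
		return("read locked, with a waiting writer");
	} else {
		/* ((-lock_word) / X_LOCK_DECR) + 1 x-locks held */
		return("recursively write locked");
	}
}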
*/ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + rw_lock_t* lock) +{ + ut_a(lock); + + ulint waiters = rw_lock_get_waiters(lock); + lint lock_word = lock->lock_word; + + ut_a(lock->magic_n == RW_LOCK_MAGIC_N); + ut_a(waiters == 0 || waiters == 1); + ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************** +Lock an rw-lock in shared mode for the current thread. If the rw-lock is +locked in exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock, before suspending the thread. */ +UNIV_INTERN +void +rw_lock_s_lock_spin( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock + will be passed to another thread to unlock */ + const char* file_name, /* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + ulint index; /* index of the reserved wait cell */ + ulint i = 0; /* spin round count */ + + ut_ad(rw_lock_validate(lock)); + + rw_s_spin_wait_count++; /* Count calls to this function */ +lock_loop: + + /* Spin waiting for the writer field to become free */ + while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + + i++; + } + + if (i == SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu spin wait rw-s-lock at %p" + " cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), + (void*) lock, + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + + /* We try once again to obtain the lock */ + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { + rw_s_spin_round_count += i; + + return; /* Success */ + } else { + + if (i < SYNC_SPIN_ROUNDS) { + goto lock_loop; + } + + rw_s_spin_round_count += i; + + sync_array_reserve_cell(sync_primary_wait_array, + lock, RW_LOCK_SHARED, + file_name, line, + &index); + + /* Set waiters before checking lock_word to ensure wake-up + signal is sent. This may lead to some unnecessary signals. */ + rw_lock_set_waiter_flag(lock); + + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { + sync_array_free_cell(sync_primary_wait_array, index); + return; /* Success */ + } + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu OS wait rw-s-lock at %p" + " cfile %s cline %lu\n", + os_thread_pf(os_thread_get_curr_id()), + (void*) lock, lock->cfile_name, + (ulong) lock->cline); + } + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_s_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + + i = 0; + goto lock_loop; + } +} + +/********************************************************************** +This function is used in the insert buffer to move the ownership of an +x-latch on a buffer frame to the current thread. The x-latch was set by +the buffer read operation and it protected the buffer frame while the +read was done. The ownership is moved because we want that the current +thread is able to acquire a second x-latch which is stored in an mtr. +This, in turn, is needed to pass the debug checks of index page +operations. 
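rw_lock_s_lock_spin() above keeps retrying rw_lock_s_lock_low(), which this hunk does not define (it is presumably inlined from sync0rw.ic). Under the lock_word encoding, a sketch of what the shared-lock fast path must do; the function name is invented:

UNIV_INLINE
ibool
rw_lock_s_lock_sketch(
/*==================*/
			/* out: TRUE if the s-lock was acquired */
	rw_lock_t*	lock)	/* in: pointer to rw-lock */
{
	/* One s-lock is a decrement of lock_word by 1, and it may
	only succeed while no writer holds or waits for the lock,
	i.e. while lock_word > 0. */
	if (rw_lock_lock_word_decr(lock, 1)) {

		return(TRUE);
	}

	return(FALSE);	/* caller must spin or enter the wait array */
}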
*/ +UNIV_INTERN +void +rw_lock_x_lock_move_ownership( +/*==========================*/ + rw_lock_t* lock) /* in: lock which was x-locked in the + buffer read */ +{ + ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX)); + + rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); +} + +/********************************************************************** +Function for the next writer to call. Waits for readers to exit. +The caller must have already decremented lock_word by X_LOCK_DECR.*/ +UNIV_INLINE +void +rw_lock_x_lock_wait( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ +#endif + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + ulint index; + ulint i = 0; + + ut_ad(lock->lock_word <= 0); + + while (lock->lock_word < 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + if(i < SYNC_SPIN_ROUNDS) { + i++; + continue; + } + + /* If there is still a reader, then go to sleep.*/ + rw_x_spin_round_count += i; + i = 0; + sync_array_reserve_cell(sync_primary_wait_array, + lock, + RW_LOCK_WAIT_EX, + file_name, line, + &index); + /* Check lock_word to ensure wake-up isn't missed.*/ + if(lock->lock_word < 0) { + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_x_os_wait_count++; + + /* Add debug info as it is needed to detect possible + deadlock. We must add info for WAIT_EX thread for + deadlock detection to work properly. */ +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, + file_name, line); +#endif + + sync_array_wait_event(sync_primary_wait_array, + index); +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, + RW_LOCK_WAIT_EX); +#endif + /* It is possible to wake when lock_word < 0. + We must pass the while-loop check to proceed.*/ + } else { + sync_array_free_cell(sync_primary_wait_array, + index); + } + } + rw_x_spin_round_count += i; +} + +/********************************************************************** +Low-level function for acquiring an exclusive lock. */ +UNIV_INLINE +ibool +rw_lock_x_lock_low( +/*===============*/ + /* out: RW_LOCK_NOT_LOCKED if did + not succeed, RW_LOCK_EX if success. */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + os_thread_id_t curr_thread = os_thread_get_curr_id(); + + if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) { + + /* lock->recursive also tells us if the writer_thread + field is stale or active. As we are going to write + our own thread id in that field it must be that the + current writer_thread value is not active. */ + ut_a(!lock->recursive); + + /* Decrement occurred: we are writer or next-writer. */ + rw_lock_set_writer_id_and_recursion_flag(lock, + pass ? 
FALSE : TRUE); + + rw_lock_x_lock_wait(lock, +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + file_name, line); + + } else { + /* Decrement failed: relock or failed lock */ + if (!pass && lock->recursive + && os_thread_eq(lock->writer_thread, curr_thread)) { + /* Relock */ + lock->lock_word -= X_LOCK_DECR; + } else { + /* Another thread locked before us */ + return(FALSE); + } + } +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, + file_name, line); +#endif + lock->last_x_file_name = file_name; + lock->last_x_line = (unsigned int) line; + + return(TRUE); +} + +/********************************************************************** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread. If the rw-lock is locked +in shared or exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock before suspending the thread. If the same thread has an x-lock +on the rw-lock, locking succeeds, with the following exception: if pass != 0, +only a single x-lock may be taken on the lock. NOTE: If the same thread has +an s-lock, locking does not succeed! */ +UNIV_INTERN +void +rw_lock_x_lock_func( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + ulint index; /* index of the reserved wait cell */ + ulint i; /* spin round count */ + ibool spinning = FALSE; + + ut_ad(rw_lock_validate(lock)); + + i = 0; + +lock_loop: + + if (rw_lock_x_lock_low(lock, pass, file_name, line)) { + rw_x_spin_round_count += i; + + return; /* Locking succeeded */ + + } else { + + if (!spinning) { + spinning = TRUE; + rw_x_spin_wait_count++; + } + + /* Spin waiting for the lock_word to become free */ + while (i < SYNC_SPIN_ROUNDS + && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); + } + + i++; + } + if (i == SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } else { + goto lock_loop; + } + } + + rw_x_spin_round_count += i; + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu spin wait rw-x-lock at %p" + " cfile %s cline %lu rnds %lu\n", + os_thread_pf(os_thread_get_curr_id()), (void*) lock, + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + + sync_array_reserve_cell(sync_primary_wait_array, + lock, + RW_LOCK_EX, + file_name, line, + &index); + + /* Waiters must be set before checking lock_word, to ensure signal + is sent. This could lead to a few unnecessary wake-up signals. */ + rw_lock_set_waiter_flag(lock); + + if (rw_lock_x_lock_low(lock, pass, file_name, line)) { + sync_array_free_cell(sync_primary_wait_array, index); + return; /* Locking succeeded */ + } + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu OS wait for rw-x-lock at %p" + " cfile %s cline %lu\n", + os_thread_pf(os_thread_get_curr_id()), (void*) lock, + lock->cfile_name, (ulong) lock->cline); + } + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_x_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + + i = 0; + goto lock_loop; +} + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Acquires the debug mutex.
We cannot use the mutex defined in sync0sync, +because the debug mutex is also acquired in sync0arr while holding the OS +mutex protecting the sync array, and the ordinary mutex_enter might +recursively call routines in sync0arr, leading to a deadlock on the OS +mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_enter(void) +/*==========================*/ +{ +loop: + if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { + return; + } + + os_event_reset(rw_lock_debug_event); + + rw_lock_debug_waiters = TRUE; + + if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { + return; + } + + os_event_wait(rw_lock_debug_event); + + goto loop; +} + +/********************************************************************** +Releases the debug mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_exit(void) +/*==========================*/ +{ + mutex_exit(&rw_lock_debug_mutex); + + if (rw_lock_debug_waiters) { + rw_lock_debug_waiters = FALSE; + os_event_set(rw_lock_debug_event); + } +} + +/********************************************************************** +Inserts the debug information for an rw-lock. */ +UNIV_INTERN +void +rw_lock_add_debug_info( +/*===================*/ + rw_lock_t* lock, /* in: rw-lock */ + ulint pass, /* in: pass value */ + ulint lock_type, /* in: lock type */ + const char* file_name, /* in: file where requested */ + ulint line) /* in: line where requested */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + ut_ad(file_name); + + info = rw_lock_debug_create(); + + rw_lock_debug_mutex_enter(); + + info->file_name = file_name; + info->line = line; + info->lock_type = lock_type; + info->thread_id = os_thread_get_curr_id(); + info->pass = pass; + + UT_LIST_ADD_FIRST(list, lock->debug_list, info); + + rw_lock_debug_mutex_exit(); + + if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { + sync_thread_add_level(lock, lock->level); + } +} + +/********************************************************************** +Removes a debug information struct for an rw-lock. */ +UNIV_INTERN +void +rw_lock_remove_debug_info( +/*======================*/ + rw_lock_t* lock, /* in: rw-lock */ + ulint pass, /* in: pass value */ + ulint lock_type) /* in: lock type */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + + if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { + sync_thread_reset_level(lock); + } + + rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + + while (info != NULL) { + if ((pass == info->pass) + && ((pass != 0) + || os_thread_eq(info->thread_id, + os_thread_get_curr_id())) + && (info->lock_type == lock_type)) { + + /* Found! */ + UT_LIST_REMOVE(list, lock->debug_list, info); + rw_lock_debug_mutex_exit(); + + rw_lock_debug_free(info); + + return; + } + + info = UT_LIST_GET_NEXT(list, info); + } + + ut_error; +} +#endif /* UNIV_SYNC_DEBUG */ + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Checks if the thread has locked the rw-lock in the specified mode, with +the pass value == 0. */ +UNIV_INTERN +ibool +rw_lock_own( +/*========*/ + /* out: TRUE if locked */ + rw_lock_t* lock, /* in: rw-lock */ + ulint lock_type) /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + + rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + + while (info != NULL) { + + if (os_thread_eq(info->thread_id, os_thread_get_curr_id()) + && (info->pass == 0) + && (info->lock_type == lock_type)) { + + rw_lock_debug_mutex_exit(); + /* Found! 
*/ + + return(TRUE); + } + + info = UT_LIST_GET_NEXT(list, info); + } + rw_lock_debug_mutex_exit(); + + return(FALSE); +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Checks if somebody has locked the rw-lock in the specified mode. */ +UNIV_INTERN +ibool +rw_lock_is_locked( +/*==============*/ + /* out: TRUE if locked */ + rw_lock_t* lock, /* in: rw-lock */ + ulint lock_type) /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +{ + ibool ret = FALSE; + + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + + if (lock_type == RW_LOCK_SHARED) { + if (rw_lock_get_reader_count(lock) > 0) { + ret = TRUE; + } + } else if (lock_type == RW_LOCK_EX) { + if (rw_lock_get_writer(lock) == RW_LOCK_EX) { + ret = TRUE; + } + } else { + ut_error; + } + + return(ret); +} + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************* +Prints debug info of currently locked rw-locks. */ +UNIV_INTERN +void +rw_lock_list_print_info( +/*====================*/ + FILE* file) /* in: file where to print */ +{ + rw_lock_t* lock; + ulint count = 0; + rw_lock_debug_t* info; + + mutex_enter(&rw_lock_list_mutex); + + fputs("-------------\n" + "RW-LATCH INFO\n" + "-------------\n", file); + + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { + + count++; + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_enter(&(lock->mutex)); +#endif + if (lock->lock_word != X_LOCK_DECR) { + + fprintf(file, "RW-LOCK: %p ", (void*) lock); + + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", file); + } else { + putc('\n', file); + } + + info = UT_LIST_GET_FIRST(lock->debug_list); + while (info != NULL) { + rw_lock_debug_print(info); + info = UT_LIST_GET_NEXT(list, info); + } + } +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_exit(&(lock->mutex)); +#endif + + lock = UT_LIST_GET_NEXT(list, lock); + } + + fprintf(file, "Total number of rw-locks %ld\n", count); + mutex_exit(&rw_lock_list_mutex); +} + +/******************************************************************* +Prints debug info of an rw-lock. */ +UNIV_INTERN +void +rw_lock_print( +/*==========*/ + rw_lock_t* lock) /* in: rw-lock */ +{ + rw_lock_debug_t* info; + + fprintf(stderr, + "-------------\n" + "RW-LATCH INFO\n" + "RW-LATCH: %p ", (void*) lock); + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_enter(&(lock->mutex)); +#endif + if (lock->lock_word != X_LOCK_DECR) { + + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", stderr); + } else { + putc('\n', stderr); + } + + info = UT_LIST_GET_FIRST(lock->debug_list); + while (info != NULL) { + rw_lock_debug_print(info); + info = UT_LIST_GET_NEXT(list, info); + } + } +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_exit(&(lock->mutex)); +#endif +} + +/************************************************************************* +Prints info of a debug struct. 
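For illustration, a line printed by this function looks like the following (the thread id, file name, and line number are hypothetical example values; the format comes from the fprintf calls below): Locked: thread 140243 file buf0buf.c line 2024 X-LOCK. A waiting writer prints WAIT X-LOCK instead, and a lock being passed between threads additionally prints its pass value.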
*/ +UNIV_INTERN +void +rw_lock_debug_print( +/*================*/ + rw_lock_debug_t* info) /* in: debug struct */ +{ + ulint rwt; + + rwt = info->lock_type; + + fprintf(stderr, "Locked: thread %ld file %s line %ld ", + (ulong) os_thread_pf(info->thread_id), info->file_name, + (ulong) info->line); + if (rwt == RW_LOCK_SHARED) { + fputs("S-LOCK", stderr); + } else if (rwt == RW_LOCK_EX) { + fputs("X-LOCK", stderr); + } else if (rwt == RW_LOCK_WAIT_EX) { + fputs("WAIT X-LOCK", stderr); + } else { + ut_error; + } + if (info->pass != 0) { + fprintf(stderr, " pass value %lu", (ulong) info->pass); + } + putc('\n', stderr); +} + +/******************************************************************* +Returns the number of currently locked rw-locks. Works only in the debug +version. */ +UNIV_INTERN +ulint +rw_lock_n_locked(void) +/*==================*/ +{ + rw_lock_t* lock; + ulint count = 0; + + mutex_enter(&rw_lock_list_mutex); + + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { + + if (lock->lock_word != X_LOCK_DECR) { + count++; + } + + lock = UT_LIST_GET_NEXT(list, lock); + } + + mutex_exit(&rw_lock_list_mutex); + + return(count); +} +#endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.c new file mode 100644 index 00000000000..3b2d033aae5 --- /dev/null +++ b/storage/xtradb/sync/sync0sync.c @@ -0,0 +1,1411 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Mutex, the basic synchronization primitive + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0sync.h" +#ifdef UNIV_NONINL +#include "sync0sync.ic" +#endif + +#include "sync0rw.h" +#include "buf0buf.h" +#include "srv0srv.h" +#include "buf0types.h" + +/* + REASONS FOR IMPLEMENTING THE SPIN LOCK MUTEX + ============================================ + +Semaphore operations in operating systems are slow: Solaris on a 1993 Sparc +takes 3 microseconds (us) for a lock-unlock pair and Windows NT on a 1995 +Pentium takes 20 microseconds for a lock-unlock pair. Therefore, we have to +implement our own efficient spin lock mutex. Future operating systems may +provide efficient spin locks, but we cannot count on that. 
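+ +A quick cost comparison makes the trade-off concrete (the 5 us critical section below is a hypothetical figure; the switch cost is the one quoted above): if a mutex is typically held for 5 us, a spinning waiter pays at most about 20 us of busy looping, whereas suspending immediately costs two thread switches, roughly 2 x 25 = 50 us, plus the semaphore operations themselves. Spinning therefore wins whenever the expected hold time is well below the thread switch cost.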
+ +Another reason for implementing a spin lock is that on multiprocessor systems +it can be more efficient for a processor to run a loop waiting for the +semaphore to be released than to switch to a different thread. A thread switch +takes 25 us on both platforms mentioned above. See Gray and Reuter's book +Transaction processing for background. + +How long should the spin loop last before suspending the thread? On a +uniprocessor, spinning does not help at all, because if the thread owning the +mutex is not executing, it cannot be released. Spinning actually wastes +resources. + +On a multiprocessor, we do not know if the thread owning the mutex is +executing or not. Thus it would make sense to spin as long as the operation +guarded by the mutex would typically last assuming that the thread is +executing. If the mutex is not released by that time, we may assume that the +thread owning the mutex is not executing and suspend the waiting thread. + +A typical operation (where no i/o is involved) guarded by a mutex or a read-write +lock may last 1 - 20 us on the current Pentium platform. The longest +operations are the binary searches on an index node. + +We conclude that the best choice is to set the spin time at 20 us. Then the +system should work well on a multiprocessor. On a uniprocessor we have to +make sure that thread switches due to mutex collisions are not frequent, +i.e., they do not happen every 100 us or so, because that wastes too many +resources. If the thread switches are not frequent, the 20 us wasted in the spin +loop is not too much. + +Empirical studies on the effect of spin time should be done for different +platforms. + + + IMPLEMENTATION OF THE MUTEX + =========================== + +For background, see Curt Schimmel's book on Unix implementation on modern +architectures. The key points in the implementation are atomicity and +serialization of memory accesses. The test-and-set instruction (XCHG in +Pentium) must be atomic. As new processors may have weak memory models, also +serialization of memory references may be necessary. The successor of Pentium, +P6, has at least one mode where the memory model is weak. As far as we know, +in Pentium all memory accesses are serialized in the program order and we do +not have to worry about the memory model. On other processors there are +special machine instructions called a fence, memory barrier, or storage +barrier (STBAR in Sparc), which can be used to serialize the memory accesses +to happen in program order relative to the fence instruction. + +Leslie Lamport has devised a "bakery algorithm" to implement a mutex without +the atomic test-and-set, but his algorithm should be modified for weak memory +models. We do not use Lamport's algorithm, because we guess it is slower than +the atomic test-and-set. + +Our mutex implementation works as follows: we first perform the atomic +test-and-set instruction on the memory word. If the test returns zero, we +know we got the lock first. If the test returns nonzero, some other thread +was quicker and got the lock: then we spin in a loop reading the memory word, +waiting for it to become zero. It is wise to just read the word in the loop, not +perform numerous test-and-set instructions, because they generate memory +traffic between the cache and the main memory. The read loop can just access +the cache, saving bus bandwidth. + +If we cannot acquire the mutex lock in the specified time, we reserve a cell +in the wait array, set the waiters byte in the mutex to 1.
To avoid a race +condition, after setting the waiters byte and before suspending the waiting +thread, we still have to check that the mutex is reserved, because it may +have happened that the thread which was holding the mutex has just released +it and did not see the waiters byte set to 1, a case which would lead the +other thread to an infinite wait. + +LEMMA 1: After a thread resets the event of a mutex (or rw_lock), some +======= +thread will eventually call os_event_set() on that particular event. +Thus no infinite wait is possible in this case. + +Proof: After making the reservation the thread sets the waiters field in the +mutex to 1. Then it checks that the mutex is still reserved by some thread, +or it reserves the mutex for itself. In any case, some thread (which may be +also some earlier thread, not necessarily the one currently holding the mutex) +will set the waiters field to 0 in mutex_exit, and then call +os_event_set() with the mutex as an argument. +Q.E.D. + +LEMMA 2: If an os_event_set() call is made after some thread has called +======= +the os_event_reset() and before it starts waiting on that event, the call +will not be lost to the second thread. This is true even if there is an +intervening call to os_event_reset() by another thread. +Thus no infinite wait is possible in this case. + +Proof (non-windows platforms): os_event_reset() returns a monotonically +increasing value of signal_count. This value is increased at every +call of os_event_set(). If thread A has called os_event_reset() followed +by thread B calling os_event_set() and then some other thread C calling +os_event_reset(), the is_set flag of the event will be set to FALSE; +but now if thread A calls os_event_wait_low() with the signal_count +value returned from the earlier call of os_event_reset(), it will +return immediately without waiting. +Q.E.D. + +Proof (windows): If there is a writer thread which is forced to wait for +the lock, it may be able to set the state of rw_lock to RW_LOCK_WAIT_EX. +The design of rw_lock ensures that there is one and only one thread +that is able to change the state to RW_LOCK_WAIT_EX and this thread is +guaranteed to acquire the lock after it is released by the current +holders and before any other waiter gets the lock. +On windows this thread waits on a separate event, i.e. the wait_ex_event. +Since only one thread can wait on this event there is no chance +of this event getting reset before the writer starts waiting on it. +Therefore, this thread is guaranteed to catch the os_event_set() +signalled unconditionally at the release of the lock. +Q.E.D.
*/ + +/* Number of spin waits on mutexes: for performance monitoring */ + +/* round=one iteration of a spin loop */ +UNIV_INTERN ib_int64_t mutex_spin_round_count = 0; +UNIV_INTERN ib_int64_t mutex_spin_wait_count = 0; +UNIV_INTERN ib_int64_t mutex_os_wait_count = 0; +UNIV_INTERN ib_int64_t mutex_exit_count = 0; + +/* The global array of wait cells for implementation of the database's own +mutexes and read-write locks */ +UNIV_INTERN sync_array_t* sync_primary_wait_array; + +/* This variable is set to TRUE when sync_init is called */ +UNIV_INTERN ibool sync_initialized = FALSE; + + +typedef struct sync_level_struct sync_level_t; +typedef struct sync_thread_struct sync_thread_t; + +#ifdef UNIV_SYNC_DEBUG +/* The latch levels currently owned by threads are stored in this data +structure; the size of this array is OS_THREAD_MAX_N */ + +UNIV_INTERN sync_thread_t* sync_thread_level_arrays; + +/* Mutex protecting sync_thread_level_arrays */ +UNIV_INTERN mutex_t sync_thread_mutex; +#endif /* UNIV_SYNC_DEBUG */ + +/* Global list of database mutexes (not OS mutexes) created. */ +UNIV_INTERN ut_list_base_node_t mutex_list; + +/* Mutex protecting the mutex_list variable */ +UNIV_INTERN mutex_t mutex_list_mutex; + +#ifdef UNIV_SYNC_DEBUG +/* Latching order checks start when this is set TRUE */ +UNIV_INTERN ibool sync_order_checks_on = FALSE; +#endif /* UNIV_SYNC_DEBUG */ + +struct sync_thread_struct{ + os_thread_id_t id; /* OS thread id */ + sync_level_t* levels; /* level array for this thread; if this is NULL + this slot is unused */ +}; + +/* Number of slots reserved for each OS thread in the sync level array */ +#define SYNC_THREAD_N_LEVELS 10000 + +struct sync_level_struct{ + void* latch; /* pointer to a mutex or an rw-lock; NULL means that + the slot is empty */ + ulint level; /* level of the latch in the latching order */ +}; + +/********************************************************************** +Creates, or rather, initializes a mutex object in a specified memory +location (which must be appropriately aligned). The mutex is initialized +in the reset state. Explicit freeing of the mutex with mutex_free is +necessary only if the memory block containing it is freed. 
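+ +For illustration only (this usage sketch is not part of the original file; the two-argument macro form matches the calls made elsewhere in this file, e.g. for mutex_list_mutex): + + mutex_t m; + mutex_create(&m, SYNC_NO_ORDER_CHECK); + mutex_enter(&m); + ... critical section ... + mutex_exit(&m); + mutex_free(&m); + +Here SYNC_NO_ORDER_CHECK opts the mutex out of the latching-order checks.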
*/ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + mutex_t* mutex, /* in: pointer to memory */ +#ifdef UNIV_DEBUG + const char* cmutex_name, /* in: mutex name */ +# ifdef UNIV_SYNC_DEBUG + ulint level, /* in: level */ +# endif /* UNIV_SYNC_DEBUG */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /* in: file name where created */ + ulint cline) /* in: file line where created */ +{ +#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) + mutex_reset_lock_word(mutex); +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + mutex_reset_lock_word(mutex); +#else + os_fast_mutex_init(&(mutex->os_fast_mutex)); + mutex->lock_word = 0; +#endif + mutex->event = os_event_create(NULL); + mutex_set_waiters(mutex, 0); +#ifdef UNIV_DEBUG + mutex->magic_n = MUTEX_MAGIC_N; +#endif /* UNIV_DEBUG */ +#ifdef UNIV_SYNC_DEBUG + mutex->line = 0; + mutex->file_name = "not yet reserved"; + mutex->level = level; +#endif /* UNIV_SYNC_DEBUG */ + mutex->cfile_name = cfile_name; + mutex->cline = cline; +#ifndef UNIV_HOTBACKUP + mutex->count_os_wait = 0; +# ifdef UNIV_DEBUG + mutex->cmutex_name= cmutex_name; + mutex->count_using= 0; + mutex->mutex_type= 0; + mutex->lspent_time= 0; + mutex->lmax_spent_time= 0; + mutex->count_spin_loop= 0; + mutex->count_spin_rounds= 0; + mutex->count_os_yield= 0; +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + + /* Check that lock_word is aligned; this is important on Intel */ + ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0); + + /* NOTE! The very first mutexes are not put to the mutex list */ + + if ((mutex == &mutex_list_mutex) +#ifdef UNIV_SYNC_DEBUG + || (mutex == &sync_thread_mutex) +#endif /* UNIV_SYNC_DEBUG */ + ) { + + return; + } + + mutex_enter(&mutex_list_mutex); + + ut_ad(UT_LIST_GET_LEN(mutex_list) == 0 + || UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N); + + UT_LIST_ADD_FIRST(list, mutex_list, mutex); + + mutex_exit(&mutex_list_mutex); +} + +/********************************************************************** +Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a mutex object from the mutex list. The mutex +is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free( +/*=======*/ + mutex_t* mutex) /* in: mutex */ +{ + ut_ad(mutex_validate(mutex)); + ut_a(mutex_get_lock_word(mutex) == 0); + ut_a(mutex_get_waiters(mutex) == 0); + + if (mutex != &mutex_list_mutex +#ifdef UNIV_SYNC_DEBUG + && mutex != &sync_thread_mutex +#endif /* UNIV_SYNC_DEBUG */ + ) { + + mutex_enter(&mutex_list_mutex); + + ut_ad(!UT_LIST_GET_PREV(list, mutex) + || UT_LIST_GET_PREV(list, mutex)->magic_n + == MUTEX_MAGIC_N); + ut_ad(!UT_LIST_GET_NEXT(list, mutex) + || UT_LIST_GET_NEXT(list, mutex)->magic_n + == MUTEX_MAGIC_N); + + UT_LIST_REMOVE(list, mutex_list, mutex); + + mutex_exit(&mutex_list_mutex); + } + + os_event_free(mutex->event); + +#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) +#else + os_fast_mutex_free(&(mutex->os_fast_mutex)); +#endif + /* If we free the mutex protecting the mutex list (freeing is + not necessary), we have to reset the magic number AFTER removing + it from the list. */ +#ifdef UNIV_DEBUG + mutex->magic_n = 0; +#endif /* UNIV_DEBUG */ +} + +/************************************************************************ +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. 
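+ +For illustration, rw_lock_debug_mutex_enter() in sync0rw.c above uses exactly this try-lock pattern: + + if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { + return; + } + +falling back to an event wait when the attempt fails.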
*/ +UNIV_INTERN +ulint +mutex_enter_nowait_func( +/*====================*/ + /* out: 0 if succeed, 1 if not */ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name __attribute__((unused)), + /* in: file name where mutex + requested */ + ulint line __attribute__((unused))) + /* in: line where requested */ +{ + ut_ad(mutex_validate(mutex)); + + if (!mutex_test_and_set(mutex)) { + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + + return(0); /* Succeeded! */ + } + + return(1); +} + +#ifdef UNIV_DEBUG +/********************************************************************** +Checks that the mutex has been initialized. */ +UNIV_INTERN +ibool +mutex_validate( +/*===========*/ + const mutex_t* mutex) +{ + ut_a(mutex); + ut_a(mutex->magic_n == MUTEX_MAGIC_N); + + return(TRUE); +} + +/********************************************************************** +Checks that the current thread owns the mutex. Works only in the debug +version. */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + /* out: TRUE if owns */ + const mutex_t* mutex) /* in: mutex */ +{ + ut_ad(mutex_validate(mutex)); + + return(mutex_get_lock_word(mutex) == 1 + && os_thread_eq(mutex->thread_id, os_thread_get_curr_id())); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************** +Sets the waiters field in a mutex. */ +UNIV_INTERN +void +mutex_set_waiters( +/*==============*/ + mutex_t* mutex, /* in: mutex */ + ulint n) /* in: value to set */ +{ + volatile ulint* ptr; /* declared volatile to ensure that + the value is stored to memory */ + ut_ad(mutex); + + ptr = &(mutex->waiters); + + *ptr = n; /* Here we assume that the write of a single + word in memory is atomic */ +} + +/********************************************************************** +Reserves a mutex for the current thread. If the mutex is reserved, the +function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the mutex before suspending the thread. */ +UNIV_INTERN +void +mutex_spin_wait( +/*============*/ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where mutex + requested */ + ulint line) /* in: line where requested */ +{ + ulint index; /* index of the reserved wait cell */ + ulint i; /* spin round count */ +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + ib_int64_t lstart_time = 0, lfinish_time; /* for timing os_wait */ + ulint ltime_diff; + ulint sec; + ulint ms; + uint timer_started = 0; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + ut_ad(mutex); + + /* This update is not thread safe, but we don't mind if the count + isn't exact. Moved out of ifdef that follows because we are willing + to sacrifice the cost of counting this as the data is valuable. + Count the number of calls to mutex_spin_wait. */ + mutex_spin_wait_count++; + +mutex_loop: + + i = 0; + + /* Spin waiting for the lock word to become zero. Note that we do + not have to assume that the read access to the lock word is atomic, + as the actual locking is always committed with atomic test-and-set. + In reality, however, all processors probably have an atomic read of + a memory word. 
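+ +In outline, the protocol implemented below is: (1) spin for up to SYNC_SPIN_ROUNDS rounds reading the lock word; (2) attempt the atomic test-and-set; (3) if that fails, reserve a cell in the wait array, set the waiters field to 1, retry the test-and-set a few more times, and only then block on the event. LEMMA 1 above guarantees that the wakeup cannot be lost.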
*/ + +spin_loop: +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + mutex->count_spin_loop++; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + + while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + + i++; + } + + if (i == SYNC_SPIN_ROUNDS) { +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + mutex->count_os_yield++; + if (timed_mutexes == 1 && timer_started==0) { + ut_usectime(&sec, &ms); + lstart_time= (ib_int64_t)sec * 1000000 + ms; + timer_started = 1; + } +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + os_thread_yield(); + } + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu spin wait mutex at %p" + " cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif + + mutex_spin_round_count += i; + +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + mutex->count_spin_rounds += i; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + + if (mutex_test_and_set(mutex) == 0) { + /* Succeeded! */ + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + + goto finish_timing; + } + + /* We may end up with a situation where lock_word is 0 but the OS + fast mutex is still reserved. On FreeBSD the OS does not seem to + schedule a thread which is constantly calling pthread_mutex_trylock + (in mutex_test_and_set implementation). Then we could end up + spinning here indefinitely. The following 'i++' stops this infinite + spin. */ + + i++; + + if (i < SYNC_SPIN_ROUNDS) { + goto spin_loop; + } + + sync_array_reserve_cell(sync_primary_wait_array, mutex, + SYNC_MUTEX, file_name, line, &index); + + /* The memory order of the array reservation and the change in the + waiters field is important: when we suspend a thread, we first + reserve the cell and then set waiters field to 1. When threads are + released in mutex_exit, the waiters field is first set to zero and + then the event is set to the signaled state. */ + + mutex_set_waiters(mutex, 1); + + /* Try to reserve still a few times */ + for (i = 0; i < 4; i++) { + if (mutex_test_and_set(mutex) == 0) { + /* Succeeded! Free the reserved wait cell */ + + sync_array_free_cell(sync_primary_wait_array, index); + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, "Thread %lu spin wait succeeds at 2:" + " mutex at %p\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), + (void*) mutex); +#endif + + goto finish_timing; + + /* Note that in this case we leave the waiters field + set to 1. We cannot reset it to zero, as we do not + know if there are other waiters. */ + } + } + + /* Now we know that there has been some thread holding the mutex + after the change in the wait array and the waiters field was made. + Now there is no risk of infinite wait on the event. */ + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif + + mutex_os_wait_count++; + +#ifndef UNIV_HOTBACKUP + mutex->count_os_wait++; +# ifdef UNIV_DEBUG + /* !!!!! 
Sometimes os_wait can be called without os_thread_yield */ + + if (timed_mutexes == 1 && timer_started==0) { + ut_usectime(&sec, &ms); + lstart_time= (ib_int64_t)sec * 1000000 + ms; + timer_started = 1; + } +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + + sync_array_wait_event(sync_primary_wait_array, index); + goto mutex_loop; + +finish_timing: +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + if (timed_mutexes == 1 && timer_started==1) { + ut_usectime(&sec, &ms); + lfinish_time= (ib_int64_t)sec * 1000000 + ms; + + ltime_diff= (ulint) (lfinish_time - lstart_time); + mutex->lspent_time += ltime_diff; + + if (mutex->lmax_spent_time < ltime_diff) { + mutex->lmax_spent_time= ltime_diff; + } + } +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + return; +} + +/********************************************************************** +Releases the threads waiting in the primary wait array for this mutex. */ +UNIV_INTERN +void +mutex_signal_object( +/*================*/ + mutex_t* mutex) /* in: mutex */ +{ + mutex_set_waiters(mutex, 0); + + /* The memory order of resetting the waiters field and + signaling the object is important. See LEMMA 1 above. */ + os_event_set(mutex->event); + sync_array_object_signalled(sync_primary_wait_array); +} + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Sets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_set_debug_info( +/*=================*/ + mutex_t* mutex, /* in: mutex */ + const char* file_name, /* in: file where requested */ + ulint line) /* in: line where requested */ +{ + ut_ad(mutex); + ut_ad(file_name); + + sync_thread_add_level(mutex, mutex->level); + + mutex->file_name = file_name; + mutex->line = line; +} + +/********************************************************************** +Gets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_get_debug_info( +/*=================*/ + mutex_t* mutex, /* in: mutex */ + const char** file_name, /* out: file where requested */ + ulint* line, /* out: line where requested */ + os_thread_id_t* thread_id) /* out: id of the thread which owns + the mutex */ +{ + ut_ad(mutex); + + *file_name = mutex->file_name; + *line = mutex->line; + *thread_id = mutex->thread_id; +} + +/********************************************************************** +Prints debug info of currently reserved mutexes. */ +static +void +mutex_list_print_info( +/*==================*/ + FILE* file) /* in: file where to print */ +{ + mutex_t* mutex; + const char* file_name; + ulint line; + os_thread_id_t thread_id; + ulint count = 0; + + fputs("----------\n" + "MUTEX INFO\n" + "----------\n", file); + + mutex_enter(&mutex_list_mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex != NULL) { + count++; + + if (mutex_get_lock_word(mutex) != 0) { + mutex_get_debug_info(mutex, &file_name, &line, + &thread_id); + fprintf(file, + "Locked mutex: addr %p thread %ld" + " file %s line %ld\n", + (void*) mutex, os_thread_pf(thread_id), + file_name, line); + } + + mutex = UT_LIST_GET_NEXT(list, mutex); + } + + fprintf(file, "Total number of mutexes %ld\n", count); + + mutex_exit(&mutex_list_mutex); +} + +/********************************************************************** +Counts currently reserved mutexes. Works only in the debug version. 
*/ +UNIV_INTERN +ulint +mutex_n_reserved(void) +/*==================*/ +{ + mutex_t* mutex; + ulint count = 0; + + mutex_enter(&mutex_list_mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex != NULL) { + if (mutex_get_lock_word(mutex) != 0) { + + count++; + } + + mutex = UT_LIST_GET_NEXT(list, mutex); + } + + mutex_exit(&mutex_list_mutex); + + ut_a(count >= 1); + + return(count - 1); /* Subtract one, because this function itself + was holding one mutex (mutex_list_mutex) */ +} + +/********************************************************************** +Returns TRUE if no mutex or rw-lock is currently locked. Works only in +the debug version. */ +UNIV_INTERN +ibool +sync_all_freed(void) +/*================*/ +{ + return(mutex_n_reserved() + rw_lock_n_locked() == 0); +} + +/********************************************************************** +Gets the value in the nth slot in the thread level arrays. */ +static +sync_thread_t* +sync_thread_level_arrays_get_nth( +/*=============================*/ + /* out: pointer to thread slot */ + ulint n) /* in: slot number */ +{ + ut_ad(n < OS_THREAD_MAX_N); + + return(sync_thread_level_arrays + n); +} + +/********************************************************************** +Looks for the thread slot for the calling thread. */ +static +sync_thread_t* +sync_thread_level_arrays_find_slot(void) +/*====================================*/ + /* out: pointer to thread slot, NULL if not found */ + +{ + sync_thread_t* slot; + os_thread_id_t id; + ulint i; + + id = os_thread_get_curr_id(); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + slot = sync_thread_level_arrays_get_nth(i); + + if (slot->levels && os_thread_eq(slot->id, id)) { + + return(slot); + } + } + + return(NULL); +} + +/********************************************************************** +Looks for an unused thread slot. */ +static +sync_thread_t* +sync_thread_level_arrays_find_free(void) +/*====================================*/ + /* out: pointer to thread slot */ + +{ + sync_thread_t* slot; + ulint i; + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + slot = sync_thread_level_arrays_get_nth(i); + + if (slot->levels == NULL) { + + return(slot); + } + } + + return(NULL); +} + +/********************************************************************** +Gets the value in the nth slot in the thread level array. */ +static +sync_level_t* +sync_thread_levels_get_nth( +/*=======================*/ + /* out: pointer to level slot */ + sync_level_t* arr, /* in: pointer to level array for an OS + thread */ + ulint n) /* in: slot number */ +{ + ut_ad(n < SYNC_THREAD_N_LEVELS); + + return(arr + n); +} + +/********************************************************************** +Checks if all the level values stored in the level array are greater than +the given limit. 
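For example, sync_thread_add_level() below calls sync_thread_levels_g(array, level) to assert that every latch the thread already holds sits strictly above the level it is about to acquire.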
*/ +static +ibool +sync_thread_levels_g( +/*=================*/ + /* out: TRUE if all greater */ + sync_level_t* arr, /* in: pointer to level array for an OS + thread */ + ulint limit) /* in: level limit */ +{ + sync_level_t* slot; + rw_lock_t* lock; + mutex_t* mutex; + ulint i; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(arr, i); + + if (slot->latch != NULL) { + if (slot->level <= limit) { + + lock = slot->latch; + mutex = slot->latch; + + fprintf(stderr, + "InnoDB: sync levels should be" + " > %lu but a level is %lu\n", + (ulong) limit, (ulong) slot->level); + + if (mutex->magic_n == MUTEX_MAGIC_N) { + fprintf(stderr, + "Mutex created at %s %lu\n", + mutex->cfile_name, + (ulong) mutex->cline); + + if (mutex_get_lock_word(mutex) != 0) { + const char* file_name; + ulint line; + os_thread_id_t thread_id; + + mutex_get_debug_info( + mutex, &file_name, + &line, &thread_id); + + fprintf(stderr, + "InnoDB: Locked mutex:" + " addr %p thread %ld" + " file %s line %ld\n", + (void*) mutex, + os_thread_pf( + thread_id), + file_name, + (ulong) line); + } else { + fputs("Not locked\n", stderr); + } + } else { + rw_lock_print(lock); + } + + return(FALSE); + } + } + } + + return(TRUE); +} + +/********************************************************************** +Checks if the level value is stored in the level array. */ +static +ibool +sync_thread_levels_contain( +/*=======================*/ + /* out: TRUE if stored */ + sync_level_t* arr, /* in: pointer to level array for an OS + thread */ + ulint level) /* in: level */ +{ + sync_level_t* slot; + ulint i; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(arr, i); + + if (slot->latch != NULL) { + if (slot->level == level) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/********************************************************************** +Checks that the level array for the current thread is empty. */ +UNIV_INTERN +ibool +sync_thread_levels_empty_gen( +/*=========================*/ + /* out: TRUE if empty except the + exceptions specified below */ + ibool dict_mutex_allowed) /* in: TRUE if dictionary mutex is + allowed to be owned by the thread, + also purge_is_running mutex is + allowed */ +{ + sync_level_t* arr; + sync_thread_t* thread_slot; + sync_level_t* slot; + ulint i; + + if (!sync_order_checks_on) { + + return(TRUE); + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + + arr = thread_slot->levels; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(arr, i); + + if (slot->latch != NULL + && (!dict_mutex_allowed + || (slot->level != SYNC_DICT + && slot->level != SYNC_DICT_OPERATION))) { + + mutex_exit(&sync_thread_mutex); + ut_error; + + return(FALSE); + } + } + + mutex_exit(&sync_thread_mutex); + + return(TRUE); +} + +/********************************************************************** +Checks that the level array for the current thread is empty. */ +UNIV_INTERN +ibool +sync_thread_levels_empty(void) +/*==========================*/ + /* out: TRUE if empty */ +{ + return(sync_thread_levels_empty_gen(FALSE)); +} + +/********************************************************************** +Adds a latch and its level in the thread level array. Allocates the memory +for the array if called first time for this OS thread. 
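The invariant enforced is that latches are acquired in descending level order: a new latch may normally be taken only when every latch already held has a strictly higher level. For instance, a thread holding the fsp latch (SYNC_FSP) may latch a file space page (SYNC_FSP_PAGE), but not the other way around; the case labels below encode the permitted exceptions.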
Makes the checks +against other latch levels stored in the array for this thread. */ +UNIV_INTERN +void +sync_thread_add_level( +/*==================*/ + void* latch, /* in: pointer to a mutex or an rw-lock */ + ulint level) /* in: level in the latching order; if + SYNC_LEVEL_VARYING, nothing is done */ +{ + sync_level_t* array; + sync_level_t* slot; + sync_thread_t* thread_slot; + ulint i; + + if (!sync_order_checks_on) { + + return; + } + + if ((latch == (void*)&sync_thread_mutex) + || (latch == (void*)&mutex_list_mutex) + || (latch == (void*)&rw_lock_debug_mutex) + || (latch == (void*)&rw_lock_list_mutex)) { + + return; + } + + if (level == SYNC_LEVEL_VARYING) { + + return; + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + /* We have to allocate the level array for a new thread */ + array = ut_malloc(sizeof(sync_level_t) * SYNC_THREAD_N_LEVELS); + + thread_slot = sync_thread_level_arrays_find_free(); + + thread_slot->id = os_thread_get_curr_id(); + thread_slot->levels = array; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(array, i); + + slot->latch = NULL; + } + } + + array = thread_slot->levels; + + /* NOTE that there is a problem with _NODE and _LEAF levels: if the + B-tree height changes, then a leaf can change to an internal node + or the other way around. We do not know at present if this can cause + unnecessary assertion failures below. */ + + switch (level) { + case SYNC_NO_ORDER_CHECK: + case SYNC_EXTERN_STORAGE: + case SYNC_TREE_NODE_FROM_HASH: + /* Do no order checking */ + break; + case SYNC_MEM_POOL: + case SYNC_MEM_HASH: + case SYNC_RECV: + case SYNC_WORK_QUEUE: + case SYNC_LOG: + case SYNC_THR_LOCAL: + case SYNC_ANY_LATCH: + case SYNC_TRX_SYS_HEADER: + case SYNC_FILE_FORMAT_TAG: + case SYNC_DOUBLEWRITE: + case SYNC_BUF_POOL: + case SYNC_SEARCH_SYS: + case SYNC_SEARCH_SYS_CONF: + case SYNC_TRX_LOCK_HEAP: + case SYNC_KERNEL: + case SYNC_IBUF_BITMAP_MUTEX: + case SYNC_RSEG: + case SYNC_TRX_UNDO: + case SYNC_PURGE_LATCH: + case SYNC_PURGE_SYS: + case SYNC_DICT_AUTOINC_MUTEX: + case SYNC_DICT_OPERATION: + case SYNC_DICT_HEADER: + case SYNC_TRX_I_S_RWLOCK: + case SYNC_TRX_I_S_LAST_READ: + if (!sync_thread_levels_g(array, level)) { + fprintf(stderr, + "InnoDB: sync_thread_levels_g(array, %lu)" + " does not hold!\n", level); + ut_error; + } + break; + case SYNC_BUF_BLOCK: + /* Either the thread must own the buffer pool mutex + (buf_pool_mutex), or it is allowed to latch only ONE + buffer block (block->mutex or buf_pool_zip_mutex). */ + ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL) + && sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1)) + || sync_thread_levels_g(array, SYNC_BUF_BLOCK)); + break; + case SYNC_REC_LOCK: + ut_a((sync_thread_levels_contain(array, SYNC_KERNEL) + && sync_thread_levels_g(array, SYNC_REC_LOCK - 1)) + || sync_thread_levels_g(array, SYNC_REC_LOCK)); + break; + case SYNC_IBUF_BITMAP: + /* Either the thread must own the master mutex to all + the bitmap pages, or it is allowed to latch only ONE + bitmap page. 
*/ + ut_a((sync_thread_levels_contain(array, SYNC_IBUF_BITMAP_MUTEX) + && sync_thread_levels_g(array, SYNC_IBUF_BITMAP - 1)) + || sync_thread_levels_g(array, SYNC_IBUF_BITMAP)); + break; + case SYNC_FSP_PAGE: + ut_a(sync_thread_levels_contain(array, SYNC_FSP)); + break; + case SYNC_FSP: + ut_a(sync_thread_levels_contain(array, SYNC_FSP) + || sync_thread_levels_g(array, SYNC_FSP)); + break; + case SYNC_TRX_UNDO_PAGE: + ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO) + || sync_thread_levels_contain(array, SYNC_RSEG) + || sync_thread_levels_contain(array, SYNC_PURGE_SYS) + || sync_thread_levels_g(array, SYNC_TRX_UNDO_PAGE)); + break; + case SYNC_RSEG_HEADER: + ut_a(sync_thread_levels_contain(array, SYNC_RSEG)); + break; + case SYNC_RSEG_HEADER_NEW: + ut_a(sync_thread_levels_contain(array, SYNC_KERNEL) + && sync_thread_levels_contain(array, SYNC_FSP_PAGE)); + break; + case SYNC_TREE_NODE: + ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE) + || sync_thread_levels_contain(array, SYNC_DICT_OPERATION) + || sync_thread_levels_g(array, SYNC_TREE_NODE - 1)); + break; + case SYNC_TREE_NODE_NEW: + ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE) + || sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + break; + case SYNC_INDEX_TREE: + ut_a((sync_thread_levels_contain(array, SYNC_IBUF_MUTEX) + && sync_thread_levels_contain(array, SYNC_FSP) + && sync_thread_levels_g(array, SYNC_FSP_PAGE - 1)) + || sync_thread_levels_g(array, SYNC_TREE_NODE - 1)); + break; + case SYNC_IBUF_MUTEX: + ut_a(sync_thread_levels_g(array, SYNC_FSP_PAGE - 1)); + break; + case SYNC_IBUF_PESS_INSERT_MUTEX: + ut_a(sync_thread_levels_g(array, SYNC_FSP - 1) + && !sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + break; + case SYNC_IBUF_HEADER: + ut_a(sync_thread_levels_g(array, SYNC_FSP - 1) + && !sync_thread_levels_contain(array, SYNC_IBUF_MUTEX) + && !sync_thread_levels_contain( + array, SYNC_IBUF_PESS_INSERT_MUTEX)); + break; + case SYNC_DICT: +#ifdef UNIV_DEBUG + ut_a(buf_debug_prints + || sync_thread_levels_g(array, SYNC_DICT)); +#else /* UNIV_DEBUG */ + ut_a(sync_thread_levels_g(array, SYNC_DICT)); +#endif /* UNIV_DEBUG */ + break; + default: + ut_error; + } + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(array, i); + + if (slot->latch == NULL) { + slot->latch = latch; + slot->level = level; + + break; + } + } + + ut_a(i < SYNC_THREAD_N_LEVELS); + + mutex_exit(&sync_thread_mutex); +} + +/********************************************************************** +Removes a latch from the thread level array if it is found there. 
*/ +UNIV_INTERN +ibool +sync_thread_reset_level( +/*====================*/ + /* out: TRUE if found from the array; it is an error + if the latch is not found */ + void* latch) /* in: pointer to a mutex or an rw-lock */ +{ + sync_level_t* array; + sync_level_t* slot; + sync_thread_t* thread_slot; + ulint i; + + if (!sync_order_checks_on) { + + return(FALSE); + } + + if ((latch == (void*)&sync_thread_mutex) + || (latch == (void*)&mutex_list_mutex) + || (latch == (void*)&rw_lock_debug_mutex) + || (latch == (void*)&rw_lock_list_mutex)) { + + return(FALSE); + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + ut_error; + + mutex_exit(&sync_thread_mutex); + return(FALSE); + } + + array = thread_slot->levels; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(array, i); + + if (slot->latch == latch) { + slot->latch = NULL; + + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + } + + if (((mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) { + rw_lock_t* rw_lock; + + rw_lock = (rw_lock_t*) latch; + + if (rw_lock->level == SYNC_LEVEL_VARYING) { + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + } + + ut_error; + + mutex_exit(&sync_thread_mutex); + + return(FALSE); +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Initializes the synchronization data structures. */ +UNIV_INTERN +void +sync_init(void) +/*===========*/ +{ +#ifdef UNIV_SYNC_DEBUG + sync_thread_t* thread_slot; + ulint i; +#endif /* UNIV_SYNC_DEBUG */ + + ut_a(sync_initialized == FALSE); + + sync_initialized = TRUE; + + /* Create the primary system wait array which is protected by an OS + mutex */ + + sync_primary_wait_array = sync_array_create(OS_THREAD_MAX_N, + SYNC_ARRAY_OS_MUTEX); +#ifdef UNIV_SYNC_DEBUG + /* Create the thread latch level array where the latch levels + are stored for each OS thread */ + + sync_thread_level_arrays = ut_malloc(OS_THREAD_MAX_N + * sizeof(sync_thread_t)); + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + thread_slot = sync_thread_level_arrays_get_nth(i); + thread_slot->levels = NULL; + } +#endif /* UNIV_SYNC_DEBUG */ + /* Init the mutex list and create the mutex to protect it. */ + + UT_LIST_INIT(mutex_list); + mutex_create(&mutex_list_mutex, SYNC_NO_ORDER_CHECK); +#ifdef UNIV_SYNC_DEBUG + mutex_create(&sync_thread_mutex, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + /* Init the rw-lock list and create the mutex to protect it. */ + + UT_LIST_INIT(rw_lock_list); + mutex_create(&rw_lock_list_mutex, SYNC_NO_ORDER_CHECK); + +#ifdef UNIV_SYNC_DEBUG + mutex_create(&rw_lock_debug_mutex, SYNC_NO_ORDER_CHECK); + + rw_lock_debug_event = os_event_create(NULL); + rw_lock_debug_waiters = FALSE; +#endif /* UNIV_SYNC_DEBUG */ +} + +/********************************************************************** +Frees the resources in InnoDB's own synchronization data structures. Use +os_sync_free() after calling this. */ +UNIV_INTERN +void +sync_close(void) +/*===========*/ +{ + mutex_t* mutex; + + sync_array_free(sync_primary_wait_array); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex) { + mutex_free(mutex); + mutex = UT_LIST_GET_FIRST(mutex_list); + } + + mutex_free(&mutex_list_mutex); +#ifdef UNIV_SYNC_DEBUG + mutex_free(&sync_thread_mutex); +#endif /* UNIV_SYNC_DEBUG */ +} + +/*********************************************************************** +Prints wait info of the sync system. 
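For illustration, the output (the numbers here are hypothetical example values) looks like: + + Mutex spin waits 8527, rounds 157459, OS waits 6085 + RW-shared spins 1461, OS waits 745; RW-excl spins 324, OS waits 240 + Spin rounds per wait: 18.47 mutex, 21.34 RW-shared, 19.74 RW-excl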
*/ +UNIV_INTERN +void +sync_print_wait_info( +/*=================*/ + FILE* file) /* in: file where to print */ +{ +#ifdef UNIV_SYNC_DEBUG + fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n", + mutex_exit_count, rw_s_exit_count, rw_x_exit_count); +#endif + + fprintf(file, + "Mutex spin waits %llu, rounds %llu, OS waits %llu\n" + "RW-shared spins %llu, OS waits %llu;" + " RW-excl spins %llu, OS waits %llu\n", + mutex_spin_wait_count, + mutex_spin_round_count, + mutex_os_wait_count, + rw_s_spin_wait_count, + rw_s_os_wait_count, + rw_x_spin_wait_count, + rw_x_os_wait_count); + + fprintf(file, + "Spin rounds per wait: %.2f mutex, %.2f RW-shared, " + "%.2f RW-excl\n", + (double) mutex_spin_round_count / + (mutex_spin_wait_count ? mutex_spin_wait_count : 1), + (double) rw_s_spin_round_count / + (rw_s_spin_wait_count ? rw_s_spin_wait_count : 1), + (double) rw_x_spin_round_count / + (rw_x_spin_wait_count ? rw_x_spin_wait_count : 1)); +} + +/*********************************************************************** +Prints info of the sync system. */ +UNIV_INTERN +void +sync_print( +/*=======*/ + FILE* file) /* in: file where to print */ +{ +#ifdef UNIV_SYNC_DEBUG + mutex_list_print_info(file); + + rw_lock_list_print_info(file); +#endif /* UNIV_SYNC_DEBUG */ + + sync_array_print_info(file, sync_primary_wait_array); + + sync_print_wait_info(file); +} diff --git a/storage/xtradb/thr/thr0loc.c b/storage/xtradb/thr/thr0loc.c new file mode 100644 index 00000000000..aea1992a921 --- /dev/null +++ b/storage/xtradb/thr/thr0loc.c @@ -0,0 +1,273 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The thread local storage + +Created 10/5/1995 Heikki Tuuri +*******************************************************/ + +#include "thr0loc.h" +#ifdef UNIV_NONINL +#include "thr0loc.ic" +#endif + +#include "sync0sync.h" +#include "hash0hash.h" +#include "mem0mem.h" +#include "srv0srv.h" + +/* + IMPLEMENTATION OF THREAD LOCAL STORAGE + ====================================== + +The threads sometimes need private data which depends on the thread id. +This is implemented as a hash table, where the hash value is calculated +from the thread id, to prepare for a large number of threads. The hash table +is protected by a mutex. If you need to modify the program and put new data into +the thread local storage, just add it to struct thr_local_struct in the +header file. */ + +/* Mutex protecting the local storage hash table */ +static mutex_t thr_local_mutex; + +/* The hash table. The module is not yet initialized when it is NULL.
*/ +static hash_table_t* thr_local_hash = NULL; +ulint thr_local_hash_n_nodes = 0; + +/* The private data for each thread should be put into +the structure below and the accessor functions written +for the field. */ +typedef struct thr_local_struct thr_local_t; + +struct thr_local_struct{ + os_thread_id_t id; /* id of the thread which owns this struct */ + os_thread_t handle; /* operating system handle to the thread */ + ulint slot_no;/* the index of the slot in the thread table + for this thread */ + ibool in_ibuf;/* TRUE if the thread is doing an ibuf + operation */ + hash_node_t hash; /* hash chain node */ + ulint magic_n; +}; + +#define THR_LOCAL_MAGIC_N 1231234 + +/*********************************************************************** +Returns the local storage struct for a thread. */ +static +thr_local_t* +thr_local_get( +/*==========*/ + /* out: local storage */ + os_thread_id_t id) /* in: thread id of the thread */ +{ + thr_local_t* local; + +try_again: + ut_ad(thr_local_hash); + ut_ad(mutex_own(&thr_local_mutex)); + + /* Look for the local struct in the hash table */ + + local = NULL; + + HASH_SEARCH(hash, thr_local_hash, os_thread_pf(id), + thr_local_t*, local,, os_thread_eq(local->id, id)); + if (local == NULL) { + mutex_exit(&thr_local_mutex); + + thr_local_create(); + + mutex_enter(&thr_local_mutex); + + goto try_again; + } + + ut_ad(local->magic_n == THR_LOCAL_MAGIC_N); + + return(local); +} + +/*********************************************************************** +Gets the slot number in the thread table of a thread. */ +UNIV_INTERN +ulint +thr_local_get_slot_no( +/*==================*/ + /* out: slot number */ + os_thread_id_t id) /* in: thread id of the thread */ +{ + ulint slot_no; + thr_local_t* local; + + mutex_enter(&thr_local_mutex); + + local = thr_local_get(id); + + slot_no = local->slot_no; + + mutex_exit(&thr_local_mutex); + + return(slot_no); +} + +/*********************************************************************** +Sets the slot number in the thread table of a thread. */ +UNIV_INTERN +void +thr_local_set_slot_no( +/*==================*/ + os_thread_id_t id, /* in: thread id of the thread */ + ulint slot_no)/* in: slot number */ +{ + thr_local_t* local; + + mutex_enter(&thr_local_mutex); + + local = thr_local_get(id); + + local->slot_no = slot_no; + + mutex_exit(&thr_local_mutex); +} + +/*********************************************************************** +Returns pointer to the 'in_ibuf' field within the current thread local +storage. */ +UNIV_INTERN +ibool* +thr_local_get_in_ibuf_field(void) +/*=============================*/ + /* out: pointer to the in_ibuf field */ +{ + thr_local_t* local; + + mutex_enter(&thr_local_mutex); + + local = thr_local_get(os_thread_get_curr_id()); + + mutex_exit(&thr_local_mutex); + + return(&(local->in_ibuf)); +} + +/*********************************************************************** +Creates a local storage struct for the calling new thread.
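Note that callers normally do not invoke this directly: thr_local_get() above creates the struct on demand when the lookup for the calling thread's id misses the hash table.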
*/ +UNIV_INTERN +void +thr_local_create(void) +/*==================*/ +{ + thr_local_t* local; + + if (thr_local_hash == NULL) { + thr_local_init(); + } + + local = mem_alloc(sizeof(thr_local_t)); + + local->id = os_thread_get_curr_id(); + local->handle = os_thread_get_curr(); + local->magic_n = THR_LOCAL_MAGIC_N; + + local->in_ibuf = FALSE; + + mutex_enter(&thr_local_mutex); + + HASH_INSERT(thr_local_t, hash, thr_local_hash, + os_thread_pf(os_thread_get_curr_id()), + local); + + thr_local_hash_n_nodes++; + mutex_exit(&thr_local_mutex); +} + +/*********************************************************************** +Frees the local storage struct for the specified thread. */ +UNIV_INTERN +void +thr_local_free( +/*===========*/ + os_thread_id_t id) /* in: thread id */ +{ + thr_local_t* local; + + mutex_enter(&thr_local_mutex); + + /* Look for the local struct in the hash table */ + + HASH_SEARCH(hash, thr_local_hash, os_thread_pf(id), + thr_local_t*, local,, os_thread_eq(local->id, id)); + if (local == NULL) { + mutex_exit(&thr_local_mutex); + + return; + } + + HASH_DELETE(thr_local_t, hash, thr_local_hash, + os_thread_pf(id), local); + thr_local_hash_n_nodes--; + + mutex_exit(&thr_local_mutex); + + ut_a(local->magic_n == THR_LOCAL_MAGIC_N); + + mem_free(local); +} + +/******************************************************************** +Initializes the thread local storage module. */ +UNIV_INTERN +void +thr_local_init(void) +/*================*/ +{ + + ut_a(thr_local_hash == NULL); + + thr_local_hash = hash_create(OS_THREAD_MAX_N + 100); + + mutex_create(&thr_local_mutex, SYNC_THR_LOCAL); +} + +/************************************************************************* +Returns local hash table information. */ + +ulint +thr_local_hash_cells(void) +/*======================*/ +{ + if (thr_local_hash) { + return (thr_local_hash->n_cells); + } else { + return 0; + } +} + +ulint +thr_local_hash_nodes(void) +/*======================*/ +{ + if (thr_local_hash) { + return (thr_local_hash_n_nodes + * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE)); + } else { + return 0; + } +} diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c new file mode 100644 index 00000000000..512e38cc17e --- /dev/null +++ b/storage/xtradb/trx/trx0i_s.c @@ -0,0 +1,1431 @@ +/***************************************************************************** + +Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +INFORMATION SCHEMA innodb_trx, innodb_locks and +innodb_lock_waits tables fetch code. + +The code below fetches information needed to fill those +3 dynamic tables and uploads it into a "transactions +table cache" for later retrieval.
+ +Created July 17, 2007 Vasil Dimov +*******************************************************/ + +#include <mysql/plugin.h> + +#include "mysql_addons.h" + +#include "univ.i" +#include "buf0buf.h" +#include "dict0dict.h" +#include "ha0storage.h" +#include "ha_prototypes.h" +#include "hash0hash.h" +#include "lock0iter.h" +#include "lock0lock.h" +#include "mem0mem.h" +#include "page0page.h" +#include "rem0rec.h" +#include "row0row.h" +#include "srv0srv.h" +#include "sync0rw.h" +#include "sync0sync.h" +#include "sync0types.h" +#include "trx0i_s.h" +#include "trx0sys.h" +#include "trx0trx.h" +#include "ut0mem.h" +#include "ut0ut.h" + +#define TABLE_CACHE_INITIAL_ROWSNUM 1024 + +/* Table cache's rows are stored in a set of chunks. When a new row is +added a new chunk is allocated if necessary. MEM_CHUNKS_IN_TABLE_CACHE +specifies the maximum number of chunks. +Assuming that the first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) +and each subsequent is N/2 where N is the number of rows we have +allocated till now, then the 39th chunk would hold 1677416425 rows +and all chunks together would hold 3354832851 rows. */ +#define MEM_CHUNKS_IN_TABLE_CACHE 39 + +/* The following are some testing auxiliary macros. Do not enable them +in a production environment. */ + +#if 0 +/* If this is enabled then lock folds will always be different +resulting in equal rows being put in different cells of the hash +table. Checking for duplicates will be flawed because a different +fold will be calculated when a row is searched in the hash table. */ +#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT +#endif + +#if 0 +/* This effectively kills the search-for-duplicate-before-adding-a-row +function, but searching in the hash is still performed. It will always +be assumed that the lock is not present and insertion will be performed in +the hash table. */ +#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T +#endif + +#if 0 +/* This aggressively repeats adding each row many times. Depending on +the above settings this may be a noop or may result in lots of rows being +added. */ +#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES +#endif + +#if 0 +/* Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash +table search is not performed at all. */ +#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS +#endif + +#if 0 +/* Do not insert each row into the hash table, duplicates may appear +if this is enabled, also if this is enabled searching the hash is +a noop because it will be empty. */ +#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE +#endif + +#define MAX_ALLOWED_FOR_STORAGE(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd) + +#define MAX_ALLOWED_FOR_ALLOC(cache) \ + (TRX_I_S_MEM_LIMIT \ + - (cache)->mem_allocd \ + - ha_storage_get_size((cache)->storage)) + +/* Memory for each table in the intermediate buffer is allocated in +separate chunks. These chunks are considered to be concatenated to +represent one flat array of rows. */ +typedef struct i_s_mem_chunk_struct { + ulint offset; /* offset, in number of rows */ + ulint rows_allocd; /* the size of this chunk, in number + of rows */ + void* base; /* start of the chunk */ +} i_s_mem_chunk_t; + +/* This represents one table's cache.
*/ +typedef struct i_s_table_cache_struct { + ulint rows_used; /* number of used rows */ + ulint rows_allocd; /* number of allocated rows */ + ulint row_size; /* size of a single row */ + i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /* array of + memory chunks that stores the + rows */ +} i_s_table_cache_t; + +/* This structure describes the intermediate buffer */ +struct trx_i_s_cache_struct { + rw_lock_t rw_lock; /* read-write lock protecting + the rest of this structure */ + ullint last_read; /* last time the cache was read; + measured in microseconds since + epoch */ + mutex_t last_read_mutex;/* mutex protecting the + last_read member - it is updated + inside a shared lock of the + rw_lock member */ + i_s_table_cache_t innodb_trx; /* innodb_trx table */ + i_s_table_cache_t innodb_locks; /* innodb_locks table */ + i_s_table_cache_t innodb_lock_waits;/* innodb_lock_waits table */ +/* the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */ +#define LOCKS_HASH_CELLS_NUM 10000 + hash_table_t* locks_hash; /* hash table used to eliminate + duplicate entries in the + innodb_locks table */ +#define CACHE_STORAGE_INITIAL_SIZE 1024 +#define CACHE_STORAGE_HASH_CELLS 2048 + ha_storage_t* storage; /* storage for external volatile + data that can possibly not be + available later, when we release + the kernel mutex */ + ulint mem_allocd; /* the amount of memory + allocated with mem_alloc*() */ + ibool is_truncated; /* this is TRUE if the memory + limit was hit and thus the data + in the cache is truncated */ +}; + +/* This is the intermediate buffer where data needed to fill the +INFORMATION SCHEMA tables is fetched and later retrieved by the C++ +code in handler/i_s.cc. */ +static trx_i_s_cache_t trx_i_s_cache_static; +UNIV_INTERN trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static; + +/*********************************************************************** +For a record lock that is in waiting state retrieves the only bit that +is set, for a table lock returns ULINT_UNDEFINED. */ +static +ulint +wait_lock_get_heap_no( +/*==================*/ + /* out: record number within the heap */ + const lock_t* lock) /* in: lock */ +{ + ulint ret; + + switch (lock_get_type(lock)) { + case LOCK_REC: + ret = lock_rec_find_set_bit(lock); + ut_a(ret != ULINT_UNDEFINED); + break; + case LOCK_TABLE: + ret = ULINT_UNDEFINED; + break; + default: + ut_error; + } + + return(ret); +} + +/*********************************************************************** +Initializes the members of a table cache. */ +static +void +table_cache_init( +/*=============*/ + i_s_table_cache_t* table_cache, /* out: table cache */ + size_t row_size) /* in: the size of a + row */ +{ + ulint i; + + table_cache->rows_used = 0; + table_cache->rows_allocd = 0; + table_cache->row_size = row_size; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + /* the memory is actually allocated in + table_cache_create_empty_row() */ + table_cache->chunks[i].base = NULL; + } +} + +/*********************************************************************** +Returns an empty row from a table cache. The row is allocated if no more +empty rows are available. The number of used rows is incremented. +If the memory limit is hit then NULL is returned and nothing is +allocated. 
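To make the growth policy concrete: with the first chunk at 1024 rows and each subsequent chunk sized at half the rows allocated so far, the chunk sizes run 1024, 512, 768, 1152, 1728, ... and the total capacity after each allocation is 1024, 1536, 2304, 3456, 5184, ..., i.e. it grows by a factor of 1.5 per chunk.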
*/ +static +void* +table_cache_create_empty_row( +/*=========================*/ + /* out: empty row, or + NULL if out of memory */ + i_s_table_cache_t* table_cache, /* in/out: table cache */ + trx_i_s_cache_t* cache) /* in/out: cache to record + how many bytes are + allocated */ +{ + ulint i; + void* row; + + ut_a(table_cache->rows_used <= table_cache->rows_allocd); + + if (table_cache->rows_used == table_cache->rows_allocd) { + + /* rows_used == rows_allocd means that new chunk needs + to be allocated: either no more empty rows in the + last allocated chunk or nothing has been allocated yet + (rows_num == rows_allocd == 0); */ + + i_s_mem_chunk_t* chunk; + ulint req_bytes; + ulint got_bytes; + ulint req_rows; + ulint got_rows; + + /* find the first not allocated chunk */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].base == NULL) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + have been allocated :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + /* allocate the chunk we just found */ + + if (i == 0) { + + /* first chunk, nothing is allocated yet */ + req_rows = TABLE_CACHE_INITIAL_ROWSNUM; + } else { + + /* Memory is increased by the formula + new = old + old / 2; We are trying not to be + aggressive here (= using the common new = old * 2) + because the allocated memory will not be freed + until InnoDB exit (it is reused). So it is better + to once allocate the memory in more steps, but + have less unused/wasted memory than to use less + steps in allocation (which is done once in a + lifetime) but end up with lots of unused/wasted + memory. */ + req_rows = table_cache->rows_allocd / 2; + } + req_bytes = req_rows * table_cache->row_size; + + if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) { + + return(NULL); + } + + chunk = &table_cache->chunks[i]; + + chunk->base = mem_alloc2(req_bytes, &got_bytes); + + got_rows = got_bytes / table_cache->row_size; + + cache->mem_allocd += got_bytes; + +#if 0 + printf("allocating chunk %d req bytes=%lu, got bytes=%lu, " + "row size=%lu, " + "req rows=%lu, got rows=%lu\n", + i, req_bytes, got_bytes, + table_cache->row_size, + req_rows, got_rows); +#endif + + chunk->rows_allocd = got_rows; + + table_cache->rows_allocd += got_rows; + + /* adjust the offset of the next chunk */ + if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) { + + table_cache->chunks[i + 1].offset + = chunk->offset + chunk->rows_allocd; + } + + /* return the first empty row in the newly allocated + chunk */ + row = chunk->base; + } else { + + char* chunk_start; + ulint offset; + + /* there is an empty row, no need to allocate new + chunks */ + + /* find the first chunk that contains allocated but + empty/unused rows */ + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd + > table_cache->rows_used) { + + break; + } + } + + /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks + are full, but + table_cache->rows_used != table_cache->rows_allocd means + exactly the opposite - there are allocated but + empty/unused rows :-X */ + ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE); + + chunk_start = (char*) table_cache->chunks[i].base; + offset = table_cache->rows_used + - table_cache->chunks[i].offset; + + row = chunk_start + offset * table_cache->row_size; + } + + table_cache->rows_used++; + + return(row); +} + +/*********************************************************************** +Fills i_s_trx_row_t object. +If memory can not be allocated then FALSE is returned. 
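+requested_lock_row must be consistent with trx->wait_lock: non-NULL if
+and only if the transaction is waiting for a lock. A caller sketch
+(modelled on fetch_data_into_cache() further down in this file;
+illustrative only):
+
+	add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row);
+
+	trx_row = table_cache_create_empty_row(&cache->innodb_trx, cache);
+
+	if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) {
+		cache->innodb_trx.rows_used--;	(give back the unused row)
+	}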
*/ +static +ibool +fill_trx_row( +/*=========*/ + /* out: FALSE if + allocation fails */ + i_s_trx_row_t* row, /* out: result object + that's filled */ + const trx_t* trx, /* in: transaction to + get data from */ + const i_s_locks_row_t* requested_lock_row,/* in: pointer to the + corresponding row in + innodb_locks if trx is + waiting or NULL if trx + is not waiting */ + trx_i_s_cache_t* cache) /* in/out: cache into + which to copy volatile + strings */ +{ + row->trx_id = trx_get_id(trx); + row->trx_started = (ib_time_t) trx->start_time; + row->trx_state = trx_get_que_state_str(trx); + + if (trx->wait_lock != NULL) { + + ut_a(requested_lock_row != NULL); + + row->requested_lock_row = requested_lock_row; + row->trx_wait_started = (ib_time_t) trx->wait_started; + } else { + + ut_a(requested_lock_row == NULL); + + row->requested_lock_row = NULL; + row->trx_wait_started = 0; + } + + row->trx_weight = (ullint) ut_conv_dulint_to_longlong(TRX_WEIGHT(trx)); + + if (trx->mysql_thd != NULL) { + row->trx_mysql_thread_id + = thd_get_thread_id(trx->mysql_thd); + } else { + /* For internal transactions e.g., purge and transactions + being recovered at startup there is no associated MySQL + thread data structure. */ + row->trx_mysql_thread_id = 0; + } + + if (trx->mysql_query_str != NULL && *trx->mysql_query_str != NULL) { + + if (strlen(*trx->mysql_query_str) + > TRX_I_S_TRX_QUERY_MAX_LEN) { + + char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1]; + + memcpy(query, *trx->mysql_query_str, + TRX_I_S_TRX_QUERY_MAX_LEN); + query[TRX_I_S_TRX_QUERY_MAX_LEN] = '\0'; + + row->trx_query = ha_storage_put_memlim( + cache->storage, query, + TRX_I_S_TRX_QUERY_MAX_LEN + 1, + MAX_ALLOWED_FOR_STORAGE(cache)); + } else { + + row->trx_query = ha_storage_put_str_memlim( + cache->storage, *trx->mysql_query_str, + MAX_ALLOWED_FOR_STORAGE(cache)); + } + + if (row->trx_query == NULL) { + + return(FALSE); + } + } else { + + row->trx_query = NULL; + } + + return(TRUE); +} + +/*********************************************************************** +Format the nth field of "rec" and put it in "buf". The result is always +'\0'-terminated. Returns the number of bytes that were written to "buf" +(including the terminating '\0'). */ +static +ulint +put_nth_field( +/*==========*/ + /* out: end of the result */ + char* buf, /* out: buffer */ + ulint buf_size,/* in: buffer size in bytes */ + ulint n, /* in: number of field */ + const dict_index_t* index, /* in: index */ + const rec_t* rec, /* in: record */ + const ulint* offsets)/* in: record offsets, returned + by rec_get_offsets() */ +{ + const byte* data; + ulint data_len; + dict_field_t* dict_field; + ulint ret; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + if (buf_size == 0) { + + return(0); + } + + ret = 0; + + if (n > 0) { + /* we must append ", " before the actual data */ + + if (buf_size < 3) { + + buf[0] = '\0'; + return(1); + } + + memcpy(buf, ", ", 3); + + buf += 2; + buf_size -= 2; + ret += 2; + } + + /* now buf_size >= 1 */ + + data = rec_get_nth_field(rec, offsets, n, &data_len); + + dict_field = dict_index_get_nth_field(index, n); + + ret += row_raw_format((const char*) data, data_len, + dict_field, buf, buf_size); + + return(ret); +} + +/*********************************************************************** +Fills the "lock_data" member of i_s_locks_row_t object. +If memory can not be allocated then FALSE is returned. 
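+The string is built from the first dict_index_get_n_unique(index)
+fields of the locked record, with ", " separators added by
+put_nth_field() above. For example (illustrative; the exact value
+rendering is done by row_raw_format()), a lock on a clustered index
+record whose two primary key columns hold 5 and abc could yield a
+lock_data string of the form
+
+	5, abc
+
+Locks on the infimum/supremum pseudo-records are described by fixed
+strings instead.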
*/ +static +ibool +fill_lock_data( +/*===========*/ + /* out: FALSE if allocation fails */ + const char** lock_data,/* out: "lock_data" to fill */ + const lock_t* lock, /* in: lock used to find the data */ + ulint heap_no,/* in: rec num used to find the data */ + trx_i_s_cache_t* cache) /* in/out: cache where to store + volatile data */ +{ + mtr_t mtr; + + const buf_block_t* block; + const page_t* page; + const rec_t* rec; + + ut_a(lock_get_type(lock) == LOCK_REC); + + mtr_start(&mtr); + + block = buf_page_try_get(lock_rec_get_space_id(lock), + lock_rec_get_page_no(lock), + &mtr); + + if (block == NULL) { + + *lock_data = NULL; + + mtr_commit(&mtr); + + return(TRUE); + } + + page = (const page_t*) buf_block_get_frame(block); + + rec = page_find_rec_with_heap_no(page, heap_no); + + if (page_rec_is_infimum(rec)) { + + *lock_data = ha_storage_put_str_memlim( + cache->storage, "infimum pseudo-record", + MAX_ALLOWED_FOR_STORAGE(cache)); + } else if (page_rec_is_supremum(rec)) { + + *lock_data = ha_storage_put_str_memlim( + cache->storage, "supremum pseudo-record", + MAX_ALLOWED_FOR_STORAGE(cache)); + } else { + + const dict_index_t* index; + ulint n_fields; + mem_heap_t* heap; + ulint offsets_onstack[REC_OFFS_NORMAL_SIZE]; + ulint* offsets; + char buf[TRX_I_S_LOCK_DATA_MAX_LEN]; + ulint buf_used; + ulint i; + + rec_offs_init(offsets_onstack); + offsets = offsets_onstack; + + index = lock_rec_get_index(lock); + + n_fields = dict_index_get_n_unique(index); + + ut_a(n_fields > 0); + + heap = NULL; + offsets = rec_get_offsets(rec, index, offsets, n_fields, + &heap); + + /* format and store the data */ + + buf_used = 0; + for (i = 0; i < n_fields; i++) { + + buf_used += put_nth_field( + buf + buf_used, sizeof(buf) - buf_used, + i, index, rec, offsets) - 1; + } + + *lock_data = (const char*) ha_storage_put_memlim( + cache->storage, buf, buf_used + 1, + MAX_ALLOWED_FOR_STORAGE(cache)); + + if (UNIV_UNLIKELY(heap != NULL)) { + + /* this means that rec_get_offsets() has created a new + heap and has stored offsets in it; check that this is + really the case and free the heap */ + ut_a(offsets != offsets_onstack); + mem_heap_free(heap); + } + } + + mtr_commit(&mtr); + + if (*lock_data == NULL) { + + return(FALSE); + } + + return(TRUE); +} + +/*********************************************************************** +Fills i_s_locks_row_t object. Returns its first argument. +If memory can not be allocated then FALSE is returned. 
*/ +static +ibool +fill_locks_row( +/*===========*/ + /* out: FALSE if allocation fails */ + i_s_locks_row_t* row, /* out: result object that's filled */ + const lock_t* lock, /* in: lock to get data from */ + ulint heap_no,/* in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ + trx_i_s_cache_t* cache) /* in/out: cache into which to copy + volatile strings */ +{ + row->lock_trx_id = lock_get_trx_id(lock); + row->lock_mode = lock_get_mode_str(lock); + row->lock_type = lock_get_type_str(lock); + + row->lock_table = ha_storage_put_str_memlim( + cache->storage, lock_get_table_name(lock), + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_table == NULL) { + + return(FALSE); + } + + switch (lock_get_type(lock)) { + case LOCK_REC: + row->lock_index = ha_storage_put_str_memlim( + cache->storage, lock_rec_get_index_name(lock), + MAX_ALLOWED_FOR_STORAGE(cache)); + + /* memory could not be allocated */ + if (row->lock_index == NULL) { + + return(FALSE); + } + + row->lock_space = lock_rec_get_space_id(lock); + row->lock_page = lock_rec_get_page_no(lock); + row->lock_rec = heap_no; + + if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) { + + /* memory could not be allocated */ + return(FALSE); + } + + break; + case LOCK_TABLE: + row->lock_index = NULL; + + row->lock_space = ULINT_UNDEFINED; + row->lock_page = ULINT_UNDEFINED; + row->lock_rec = ULINT_UNDEFINED; + + row->lock_data = NULL; + + break; + default: + ut_error; + } + + row->lock_table_id = lock_get_table_id(lock); + + row->hash_chain.value = row; + + return(TRUE); +} + +/*********************************************************************** +Fills i_s_lock_waits_row_t object. Returns its first argument. */ +static +i_s_lock_waits_row_t* +fill_lock_waits_row( +/*================*/ + /* out: result object + that's filled */ + i_s_lock_waits_row_t* row, /* out: result object + that's filled */ + const i_s_locks_row_t* requested_lock_row,/* in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/* in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + row->requested_lock_row = requested_lock_row; + row->blocking_lock_row = blocking_lock_row; + + return(row); +} + +/*********************************************************************** +Calculates a hash fold for a lock. For a record lock the fold is +calculated from 4 elements, which uniquely identify a lock at a given +point in time: transaction id, space id, page number, record number. +For a table lock the fold is table's id. */ +static +ulint +fold_lock( +/*======*/ + /* out: fold */ + const lock_t* lock, /* in: lock object to fold */ + ulint heap_no)/* in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ +#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT + static ulint fold = 0; + + return(fold++); +#else + ulint ret; + + switch (lock_get_type(lock)) { + case LOCK_REC: + ut_a(heap_no != ULINT_UNDEFINED); + + ret = ut_fold_ulint_pair((ulint) lock_get_trx_id(lock), + lock_rec_get_space_id(lock)); + + ret = ut_fold_ulint_pair(ret, + lock_rec_get_page_no(lock)); + + ret = ut_fold_ulint_pair(ret, heap_no); + + break; + case LOCK_TABLE: + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. 
*/ + ut_a(heap_no == ULINT_UNDEFINED); + + ret = (ulint) lock_get_table_id(lock); + + break; + default: + ut_error; + } + + return(ret); +#endif +} + +/*********************************************************************** +Checks whether i_s_locks_row_t object represents a lock_t object. */ +static +ibool +locks_row_eq_lock( +/*==============*/ + /* out: TRUE if they match */ + const i_s_locks_row_t* row, /* in: innodb_locks row */ + const lock_t* lock, /* in: lock object */ + ulint heap_no)/* in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ +#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T + return(0); +#else + switch (lock_get_type(lock)) { + case LOCK_REC: + ut_a(heap_no != ULINT_UNDEFINED); + + return(row->lock_trx_id == lock_get_trx_id(lock) + && row->lock_space == lock_rec_get_space_id(lock) + && row->lock_page == lock_rec_get_page_no(lock) + && row->lock_rec == heap_no); + + case LOCK_TABLE: + /* this check is actually not necessary for continuing + correct operation, but something must have gone wrong if + it fails. */ + ut_a(heap_no == ULINT_UNDEFINED); + + return(row->lock_trx_id == lock_get_trx_id(lock) + && row->lock_table_id == lock_get_table_id(lock)); + + default: + ut_error; + return(FALSE); + } +#endif +} + +/*********************************************************************** +Searches for a row in the innodb_locks cache that has a specified id. +This happens in O(1) time since a hash table is used. Returns pointer to +the row or NULL if none is found. */ +static +i_s_locks_row_t* +search_innodb_locks( +/*================*/ + /* out: row or NULL */ + trx_i_s_cache_t* cache, /* in: cache */ + const lock_t* lock, /* in: lock to search for */ + ulint heap_no)/* in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + i_s_hash_chain_t* hash_chain; + + HASH_SEARCH( + /* hash_chain->"next" */ + next, + /* the hash table */ + cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* the type of the next variable */ + i_s_hash_chain_t*, + /* auxiliary variable */ + hash_chain, + /* assertion on every traversed item */ + , + /* this determines if we have found the lock */ + locks_row_eq_lock(hash_chain->value, lock, heap_no)); + + if (hash_chain == NULL) { + + return(NULL); + } + /* else */ + + return(hash_chain->value); +} + +/*********************************************************************** +Adds new element to the locks cache, enlarging it if necessary. +Returns a pointer to the added row. If the row is already present then +no row is added and a pointer to the existing row is returned. +If row can not be allocated then NULL is returned. 
*/ +static +i_s_locks_row_t* +add_lock_to_cache( +/*==============*/ + /* out: row */ + trx_i_s_cache_t* cache, /* in/out: cache */ + const lock_t* lock, /* in: the element to add */ + ulint heap_no)/* in: lock's record number + or ULINT_UNDEFINED if the lock + is a table lock */ +{ + i_s_locks_row_t* dst_row; + +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + ulint i; + for (i = 0; i < 10000; i++) { +#endif +#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS + /* quit if this lock is already present */ + dst_row = search_innodb_locks(cache, lock, heap_no); + if (dst_row != NULL) { + + return(dst_row); + } +#endif + + dst_row = (i_s_locks_row_t*) + table_cache_create_empty_row(&cache->innodb_locks, cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(NULL); + } + + if (!fill_locks_row(dst_row, lock, heap_no, cache)) { + + /* memory could not be allocated */ + cache->innodb_locks.rows_used--; + return(NULL); + } + +#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE + HASH_INSERT( + /* the type used in the hash chain */ + i_s_hash_chain_t, + /* hash_chain->"next" */ + next, + /* the hash table */ + cache->locks_hash, + /* fold */ + fold_lock(lock, heap_no), + /* add this data to the hash */ + &dst_row->hash_chain); +#endif +#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES + } /* for()-loop */ +#endif + + return(dst_row); +} + +/*********************************************************************** +Adds new pair of locks to the lock waits cache. +If memory can not be allocated then FALSE is returned. */ +static +ibool +add_lock_wait_to_cache( +/*===================*/ + /* out: FALSE if + allocation fails */ + trx_i_s_cache_t* cache, /* in/out: cache */ + const i_s_locks_row_t* requested_lock_row,/* in: pointer to the + relevant requested lock + row in innodb_locks */ + const i_s_locks_row_t* blocking_lock_row)/* in: pointer to the + relevant blocking lock + row in innodb_locks */ +{ + i_s_lock_waits_row_t* dst_row; + + dst_row = (i_s_lock_waits_row_t*) + table_cache_create_empty_row(&cache->innodb_lock_waits, + cache); + + /* memory could not be allocated */ + if (dst_row == NULL) { + + return(FALSE); + } + + fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row); + + return(TRUE); +} + +/*********************************************************************** +Adds transaction's relevant (important) locks to cache. +If the transaction is waiting, then the wait lock is added to +innodb_locks and a pointer to the added row is returned in +requested_lock_row, otherwise requested_lock_row is set to NULL. +If rows can not be allocated then FALSE is returned and the value of +requested_lock_row is undefined. */ +static +ibool +add_trx_relevant_locks_to_cache( +/*============================*/ + /* out: FALSE if allocation fails */ + trx_i_s_cache_t* cache, /* in/out: cache */ + const trx_t* trx, /* in: transaction */ + i_s_locks_row_t** requested_lock_row)/* out: pointer to the + requested lock row, or NULL or + undefined */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + /* If transaction is waiting we add the wait lock and all locks + from another transactions that are blocking the wait lock. 
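+For example (illustrative only): if trx A waits for lock request REQ
+on a record and trx B holds a conflicting granted lock GRANT on the
+same record, then one row for REQ and one row for GRANT are added to
+innodb_locks, and one row pairing them (requested_lock_row pointing to
+REQ's row, blocking_lock_row pointing to GRANT's row) is added to
+innodb_lock_waits.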
*/ + if (trx->que_state == TRX_QUE_LOCK_WAIT) { + + const lock_t* curr_lock; + ulint wait_lock_heap_no; + i_s_locks_row_t* blocking_lock_row; + lock_queue_iterator_t iter; + + ut_a(trx->wait_lock != NULL); + + wait_lock_heap_no + = wait_lock_get_heap_no(trx->wait_lock); + + /* add the requested lock */ + *requested_lock_row + = add_lock_to_cache(cache, trx->wait_lock, + wait_lock_heap_no); + + /* memory could not be allocated */ + if (*requested_lock_row == NULL) { + + return(FALSE); + } + + /* then iterate over the locks before the wait lock and + add the ones that are blocking it */ + + lock_queue_iterator_reset(&iter, trx->wait_lock, + ULINT_UNDEFINED); + + curr_lock = lock_queue_iterator_get_prev(&iter); + while (curr_lock != NULL) { + + if (lock_has_to_wait(trx->wait_lock, + curr_lock)) { + + /* add the lock that is + blocking trx->wait_lock */ + blocking_lock_row + = add_lock_to_cache( + cache, curr_lock, + /* heap_no is the same + for the wait and waited + locks */ + wait_lock_heap_no); + + /* memory could not be allocated */ + if (blocking_lock_row == NULL) { + + return(FALSE); + } + + /* add the relation between both locks + to innodb_lock_waits */ + if (!add_lock_wait_to_cache( + cache, *requested_lock_row, + blocking_lock_row)) { + + /* memory could not be allocated */ + return(FALSE); + } + } + + curr_lock = lock_queue_iterator_get_prev(&iter); + } + } else { + + *requested_lock_row = NULL; + } + + return(TRUE); +} + +/*********************************************************************** +Checks if the cache can safely be updated. */ +static +ibool +can_cache_be_updated( +/*=================*/ + trx_i_s_cache_t* cache) /* in: cache */ +{ + ullint now; + +/* The minimum time that a cache must not be updated after it has been +read for the last time; measured in microseconds. We use this technique +to ensure that SELECTs which join several INFORMATION SCHEMA tables read +the same version of the cache. */ +#define CACHE_MIN_IDLE_TIME_US 100000 /* 0.1 sec */ + + /* Here we read cache->last_read without acquiring its mutex + because last_read is only updated when a shared rw lock on the + whole cache is being held (see trx_i_s_cache_end_read()) and + we are currently holding an exclusive rw lock on the cache. + So it is not possible for last_read to be updated while we are + reading it. */ + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + now = ut_time_us(NULL); + if (now - cache->last_read > CACHE_MIN_IDLE_TIME_US) { + + return(TRUE); + } + + return(FALSE); +} + +/*********************************************************************** +Declare a cache empty, preparing it to be filled up. Not all resources +are freed because they can be reused. */ +static +void +trx_i_s_cache_clear( +/*================*/ + trx_i_s_cache_t* cache) /* out: cache to clear */ +{ + cache->innodb_trx.rows_used = 0; + cache->innodb_locks.rows_used = 0; + cache->innodb_lock_waits.rows_used = 0; + + hash_table_clear(cache->locks_hash); + + ha_storage_empty(&cache->storage); +} + +/*********************************************************************** +Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the +table cache buffer. Cache must be locked for write. 
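+A writer sketch showing how an update is expected to be driven (the
+calls below are issued by the C++ code in handler/i_s.cc; illustrative
+only):
+
+	trx_i_s_cache_start_write(cache);
+	trx_i_s_possibly_fetch_data_into_cache(cache);
+	trx_i_s_cache_end_write(cache);
+
+	if (trx_i_s_cache_is_truncated(cache)) {
+		(warn that the output may be incomplete)
+	}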
*/ +static +void +fetch_data_into_cache( +/*==================*/ + trx_i_s_cache_t* cache) /* in/out: cache */ +{ + trx_t* trx; + i_s_trx_row_t* trx_row; + i_s_locks_row_t* requested_lock_row; + + ut_ad(mutex_own(&kernel_mutex)); + + trx_i_s_cache_clear(cache); + + /* We iterate over the list of all transactions and add each one + to innodb_trx's cache. We also add all locks that are relevant + to each transaction into innodb_locks' and innodb_lock_waits' + caches. */ + + for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + trx != NULL; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + + if (!add_trx_relevant_locks_to_cache(cache, trx, + &requested_lock_row)) { + + cache->is_truncated = TRUE; + return; + } + + trx_row = (i_s_trx_row_t*) + table_cache_create_empty_row(&cache->innodb_trx, + cache); + + /* memory could not be allocated */ + if (trx_row == NULL) { + + cache->is_truncated = TRUE; + return; + } + + if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) { + + /* memory could not be allocated */ + cache->innodb_trx.rows_used--; + cache->is_truncated = TRUE; + return; + } + } + + cache->is_truncated = FALSE; +} + +/*********************************************************************** +Update the transactions cache if it has not been read for some time. +Called from handler/i_s.cc. */ +UNIV_INTERN +int +trx_i_s_possibly_fetch_data_into_cache( +/*===================================*/ + /* out: 0 - fetched, 1 - not */ + trx_i_s_cache_t* cache) /* in/out: cache */ +{ + if (!can_cache_be_updated(cache)) { + + return(1); + } + + /* We are going to access trx->query in all transactions */ + innobase_mysql_prepare_print_arbitrary_thd(); + + /* We need to read trx_sys and record/table lock queues */ + mutex_enter(&kernel_mutex); + + fetch_data_into_cache(cache); + + mutex_exit(&kernel_mutex); + + innobase_mysql_end_print_arbitrary_thd(); + + return(0); +} + +/*********************************************************************** +Returns TRUE if the data in the cache is truncated due to the memory +limit posed by TRX_I_S_MEM_LIMIT. */ +UNIV_INTERN +ibool +trx_i_s_cache_is_truncated( +/*=======================*/ + /* out: TRUE if truncated */ + trx_i_s_cache_t* cache) /* in: cache */ +{ + return(cache->is_truncated); +} + +/*********************************************************************** +Initialize INFORMATION SCHEMA trx related cache. */ +UNIV_INTERN +void +trx_i_s_cache_init( +/*===============*/ + trx_i_s_cache_t* cache) /* out: cache to init */ +{ + /* The latching is done in the following order: + acquire trx_i_s_cache_t::rw_lock, X + acquire kernel_mutex + release kernel_mutex + release trx_i_s_cache_t::rw_lock + acquire trx_i_s_cache_t::rw_lock, S + acquire trx_i_s_cache_t::last_read_mutex + release trx_i_s_cache_t::last_read_mutex + release trx_i_s_cache_t::rw_lock */ + + rw_lock_create(&cache->rw_lock, SYNC_TRX_I_S_RWLOCK); + + cache->last_read = 0; + + mutex_create(&cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ); + + table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t)); + table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t)); + table_cache_init(&cache->innodb_lock_waits, + sizeof(i_s_lock_waits_row_t)); + + cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM); + + cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE, + CACHE_STORAGE_HASH_CELLS); + + cache->mem_allocd = 0; + + cache->is_truncated = FALSE; +} + +/*********************************************************************** +Issue a shared/read lock on the tables cache. 
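+A reader sketch (illustrative only):
+
+	trx_i_s_cache_start_read(cache);
+
+	rows_used = trx_i_s_cache_get_rows_used(cache, I_S_INNODB_TRX);
+
+	for (i = 0; i < rows_used; i++) {
+		row = trx_i_s_cache_get_nth_row(cache, I_S_INNODB_TRX, i);
+		(copy the row into the MySQL table)
+	}
+
+	trx_i_s_cache_end_read(cache);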
*/ +UNIV_INTERN +void +trx_i_s_cache_start_read( +/*=====================*/ + trx_i_s_cache_t* cache) /* in: cache */ +{ + rw_lock_s_lock(&cache->rw_lock); +} + +/*********************************************************************** +Release a shared/read lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_read( +/*===================*/ + trx_i_s_cache_t* cache) /* in: cache */ +{ + ullint now; + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED)); +#endif + + /* update cache last read time */ + now = ut_time_us(NULL); + mutex_enter(&cache->last_read_mutex); + cache->last_read = now; + mutex_exit(&cache->last_read_mutex); + + rw_lock_s_unlock(&cache->rw_lock); +} + +/*********************************************************************** +Issue an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_start_write( +/*======================*/ + trx_i_s_cache_t* cache) /* in: cache */ +{ + rw_lock_x_lock(&cache->rw_lock); +} + +/*********************************************************************** +Release an exclusive/write lock on the tables cache. */ +UNIV_INTERN +void +trx_i_s_cache_end_write( +/*====================*/ + trx_i_s_cache_t* cache) /* in: cache */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + rw_lock_x_unlock(&cache->rw_lock); +} + +/*********************************************************************** +Selects a INFORMATION SCHEMA table cache from the whole cache. */ +static +i_s_table_cache_t* +cache_select_table( +/*===============*/ + /* out: table cache */ + trx_i_s_cache_t* cache, /* in: whole cache */ + enum i_s_table table) /* in: which table */ +{ + i_s_table_cache_t* table_cache; + +#ifdef UNIV_SYNC_DEBUG + ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED) + || rw_lock_own(&cache->rw_lock, RW_LOCK_EX)); +#endif + + switch (table) { + case I_S_INNODB_TRX: + table_cache = &cache->innodb_trx; + break; + case I_S_INNODB_LOCKS: + table_cache = &cache->innodb_locks; + break; + case I_S_INNODB_LOCK_WAITS: + table_cache = &cache->innodb_lock_waits; + break; + default: + ut_error; + } + + return(table_cache); +} + +/*********************************************************************** +Retrieves the number of used rows in the cache for a given +INFORMATION SCHEMA table. */ +UNIV_INTERN +ulint +trx_i_s_cache_get_rows_used( +/*========================*/ + /* out: number of rows */ + trx_i_s_cache_t* cache, /* in: cache */ + enum i_s_table table) /* in: which table */ +{ + i_s_table_cache_t* table_cache; + + table_cache = cache_select_table(cache, table); + + return(table_cache->rows_used); +} + +/*********************************************************************** +Retrieves the nth row (zero-based) in the cache for a given +INFORMATION SCHEMA table. 
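+For example (illustrative only): with chunks[0] = {offset 0,
+rows_allocd 1024} and chunks[1] = {offset 1024, rows_allocd 512}, the
+row n = 1200 is found in chunks[1] at
+
+	(char*) chunks[1].base + (1200 - 1024) * row_size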
*/ +UNIV_INTERN +void* +trx_i_s_cache_get_nth_row( +/*======================*/ + /* out: row */ + trx_i_s_cache_t* cache, /* in: cache */ + enum i_s_table table, /* in: which table */ + ulint n) /* in: row number */ +{ + i_s_table_cache_t* table_cache; + ulint i; + void* row; + + table_cache = cache_select_table(cache, table); + + ut_a(n < table_cache->rows_used); + + row = NULL; + + for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) { + + if (table_cache->chunks[i].offset + + table_cache->chunks[i].rows_allocd > n) { + + row = (char*) table_cache->chunks[i].base + + (n - table_cache->chunks[i].offset) + * table_cache->row_size; + break; + } + } + + ut_a(row != NULL); + + return(row); +} + +/*********************************************************************** +Crafts a lock id string from a i_s_locks_row_t object. Returns its +second argument. This function aborts if there is not enough space in +lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you +want to be 100% sure that it will not abort. */ +UNIV_INTERN +char* +trx_i_s_create_lock_id( +/*===================*/ + /* out: resulting lock id */ + const i_s_locks_row_t* row, /* in: innodb_locks row */ + char* lock_id,/* out: resulting lock_id */ + ulint lock_id_size)/* in: size of the lock id + buffer */ +{ + int res_len; + + /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */ + + if (row->lock_space != ULINT_UNDEFINED) { + /* record lock */ + res_len = ut_snprintf(lock_id, lock_id_size, + TRX_ID_FMT ":%lu:%lu:%lu", + row->lock_trx_id, row->lock_space, + row->lock_page, row->lock_rec); + } else { + /* table lock */ + res_len = ut_snprintf(lock_id, lock_id_size, + TRX_ID_FMT ":%llu", + row->lock_trx_id, + row->lock_table_id); + } + + /* the typecast is safe because snprintf(3) never returns + negative result */ + ut_a(res_len >= 0); + ut_a((ulint) res_len < lock_id_size); + + return(lock_id); +} diff --git a/storage/xtradb/trx/trx0purge.c b/storage/xtradb/trx/trx0purge.c new file mode 100644 index 00000000000..7a2a27a94ff --- /dev/null +++ b/storage/xtradb/trx/trx0purge.c @@ -0,0 +1,1175 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Purge old versions + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0purge.h" + +#ifdef UNIV_NONINL +#include "trx0purge.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "read0read.h" +#include "fut0fut.h" +#include "que0que.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "srv0que.h" +#include "os0thread.h" + +/* The global data structure coordinating a purge */ +UNIV_INTERN trx_purge_t* purge_sys = NULL; + +/* A dummy undo record used as a return value when we have a whole undo log +which needs no purge */ +UNIV_INTERN trx_undo_rec_t trx_purge_dummy_rec; + +/********************************************************************* +Checks if trx_id is >= purge_view: then it is guaranteed that its update +undo log still exists in the system. */ +UNIV_INTERN +ibool +trx_purge_update_undo_must_exist( +/*=============================*/ + /* out: TRUE if is sure that it is preserved, also + if the function returns FALSE, it is possible that + the undo log still exists in the system */ + dulint trx_id) /* in: transaction id */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!read_view_sees_trx_id(purge_sys->view, trx_id)) { + + return(TRUE); + } + + return(FALSE); +} + +/*=================== PURGE RECORD ARRAY =============================*/ + +/*********************************************************************** +Stores info of an undo log record during a purge. */ +static +trx_undo_inf_t* +trx_purge_arr_store_info( +/*=====================*/ + /* out: pointer to the storage cell */ + dulint trx_no, /* in: transaction number */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_inf_t* cell; + trx_undo_arr_t* arr; + ulint i; + + arr = purge_sys->arr; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!(cell->in_use)) { + /* Not in use, we may store here */ + cell->undo_no = undo_no; + cell->trx_no = trx_no; + cell->in_use = TRUE; + + arr->n_used++; + + return(cell); + } + } +} + +/*********************************************************************** +Removes info of an undo log record during a purge. */ +UNIV_INLINE +void +trx_purge_arr_remove_info( +/*======================*/ + trx_undo_inf_t* cell) /* in: pointer to the storage cell */ +{ + trx_undo_arr_t* arr; + + arr = purge_sys->arr; + + cell->in_use = FALSE; + + ut_ad(arr->n_used > 0); + + arr->n_used--; +} + +/*********************************************************************** +Gets the biggest pair of a trx number and an undo number in a purge array. 
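+Pairs are ordered by trx number first and undo number second, i.e.
+lexicographically. For example (illustrative only): among the in-use
+cells (trx_no, undo_no) = (4, 9), (5, 2) and (5, 7), the biggest pair
+is (5, 7).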
*/ +static +void +trx_purge_arr_get_biggest( +/*======================*/ + trx_undo_arr_t* arr, /* in: purge array */ + dulint* trx_no, /* out: transaction number: ut_dulint_zero + if array is empty */ + dulint* undo_no)/* out: undo number */ +{ + trx_undo_inf_t* cell; + dulint pair_trx_no; + dulint pair_undo_no; + int trx_cmp; + ulint n_used; + ulint i; + ulint n; + + n = 0; + n_used = arr->n_used; + pair_trx_no = ut_dulint_zero; + pair_undo_no = ut_dulint_zero; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use) { + n++; + trx_cmp = ut_dulint_cmp(cell->trx_no, pair_trx_no); + + if ((trx_cmp > 0) + || ((trx_cmp == 0) + && (ut_dulint_cmp(cell->undo_no, + pair_undo_no) >= 0))) { + + pair_trx_no = cell->trx_no; + pair_undo_no = cell->undo_no; + } + } + + if (n == n_used) { + *trx_no = pair_trx_no; + *undo_no = pair_undo_no; + + return; + } + } +} + +/******************************************************************** +Builds a purge 'query' graph. The actual purge is performed by executing +this query graph. */ +static +que_t* +trx_purge_graph_build(void) +/*=======================*/ + /* out, own: the query graph */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + /* que_thr_t* thr2; */ + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap); + fork->trx = purge_sys->trx; + + thr = que_thr_create(fork, heap); + + thr->child = row_purge_node_create(thr, heap); + + /* thr2 = que_thr_create(fork, fork, heap); + + thr2->child = row_purge_node_create(fork, thr2, heap); */ + + return(fork); +} + +/************************************************************************ +Creates the global purge system control structure and inits the history +mutex. */ +UNIV_INTERN +void +trx_purge_sys_create(void) +/*======================*/ +{ + ut_ad(mutex_own(&kernel_mutex)); + + purge_sys = mem_alloc(sizeof(trx_purge_t)); + + purge_sys->state = TRX_STOP_PURGE; + + purge_sys->n_pages_handled = 0; + + purge_sys->purge_trx_no = ut_dulint_zero; + purge_sys->purge_undo_no = ut_dulint_zero; + purge_sys->next_stored = FALSE; + + rw_lock_create(&purge_sys->latch, SYNC_PURGE_LATCH); + + mutex_create(&purge_sys->mutex, SYNC_PURGE_SYS); + + purge_sys->heap = mem_heap_create(256); + + purge_sys->arr = trx_undo_arr_create(); + + purge_sys->sess = sess_open(); + + purge_sys->trx = purge_sys->sess->trx; + + purge_sys->trx->is_purge = 1; + + ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED)); + + purge_sys->query = trx_purge_graph_build(); + + purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero, + purge_sys->heap); +} + +/*================ UNDO LOG HISTORY LIST =============================*/ + +/************************************************************************ +Adds the update undo log as the first log in the history list. Removes the +update undo log segment from the rseg slot if it is too big for reuse. 
*/ +UNIV_INTERN +void +trx_purge_add_update_undo_to_history( +/*=================================*/ + trx_t* trx, /* in: transaction */ + page_t* undo_page, /* in: update undo log header page, + x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + trx_undo_t* undo; + trx_rseg_t* rseg; + trx_rsegf_t* rseg_header; + trx_usegf_t* seg_header; + trx_ulogf_t* undo_header; + trx_upagef_t* page_header; + ulint hist_size; + + undo = trx->update_undo; + + ut_ad(undo); + + rseg = undo->rseg; + + ut_ad(mutex_own(&(rseg->mutex))); + + rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, mtr); + + undo_header = undo_page + undo->hdr_offset; + seg_header = undo_page + TRX_UNDO_SEG_HDR; + page_header = undo_page + TRX_UNDO_PAGE_HDR; + + if (undo->state != TRX_UNDO_CACHED) { + /* The undo log segment will not be reused */ + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + ut_error; + } + + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr); + + hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr); + ut_ad(undo->size == flst_get_len( + seg_header + TRX_UNDO_PAGE_LIST, mtr)); + + mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size + undo->size, MLOG_4BYTES, mtr); + } + + /* Add the log as the first in the history list */ + flst_add_first(rseg_header + TRX_RSEG_HISTORY, + undo_header + TRX_UNDO_HISTORY_NODE, mtr); + mutex_enter(&kernel_mutex); + trx_sys->rseg_history_len++; + mutex_exit(&kernel_mutex); + + /* Write the trx number to the undo log header */ + mlog_write_dulint(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr); + /* Write information about delete markings to the undo log header */ + + if (!undo->del_marks) { + mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, mtr); + } + + if (rseg->last_page_no == FIL_NULL) { + + rseg->last_page_no = undo->hdr_page_no; + rseg->last_offset = undo->hdr_offset; + rseg->last_trx_no = trx->no; + rseg->last_del_marks = undo->del_marks; + } +} + +/************************************************************************** +Frees an undo log segment which is in the history list. Cuts the end of the +history list at the youngest undo log in this segment. */ +static +void +trx_purge_free_segment( +/*===================*/ + trx_rseg_t* rseg, /* in: rollback segment */ + fil_addr_t hdr_addr, /* in: the file address of log_hdr */ + ulint n_removed_logs) /* in: count of how many undo logs we + will cut off from the end of the + history list */ +{ + page_t* undo_page; + trx_rsegf_t* rseg_hdr; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + ibool freed; + ulint seg_size; + ulint hist_size; + ibool marked = FALSE; + mtr_t mtr; + + /* fputs("Freeing an update undo log segment\n", stderr); */ + + ut_ad(mutex_own(&(purge_sys->mutex))); +loop: + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + hdr_addr.page, &mtr); + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + log_hdr = undo_page + hdr_addr.boffset; + + /* Mark the last undo log totally purged, so that if the system + crashes, the tail of the undo log will not get accessed again. The + list of pages in the undo log tail gets inconsistent during the + freeing of the segment, and therefore purge should not try to access + them again. 
*/ + + if (!marked) { + mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE, + MLOG_2BYTES, &mtr); + marked = TRUE; + } + + freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER, + &mtr); + if (!freed) { + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + goto loop; + } + + /* The page list may now be inconsistent, but the length field + stored in the list base node tells us how big it was before we + started the freeing. */ + + seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr); + + /* We may free the undo log segment header page; it must be freed + within the same mtr as the undo log header is removed from the + history list: otherwise, in case of a database crash, the segment + could become inaccessible garbage in the file space. */ + + flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr); + + mutex_enter(&kernel_mutex); + ut_ad(trx_sys->rseg_history_len >= n_removed_logs); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&kernel_mutex); + + freed = FALSE; + + while (!freed) { + /* Here we assume that a file segment with just the header + page can be freed in a few steps, so that the buffer pool + is not flooded with bufferfixed pages: see the note in + fsp0fsp.c. */ + + freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, + &mtr); + } + + hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, &mtr); + ut_ad(hist_size >= seg_size); + + mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE, + hist_size - seg_size, MLOG_4BYTES, &mtr); + + ut_ad(rseg->curr_size >= seg_size); + + rseg->curr_size -= seg_size; + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); +} + +/************************************************************************ +Removes unnecessary history data from a rollback segment. 
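+For example (illustrative only), with limit_trx_no = 100 and
+limit_undo_no = 7:
+
+	- undo logs whose trx number is < 100 are removed from the
+	  history list, and a wholly purged segment is freed;
+	- in an undo log whose trx number is == 100, undo records with
+	  undo number < 7 are truncated from the start of the log;
+	- undo logs whose trx number is > 100 are left untouched.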
*/ +static +void +trx_purge_truncate_rseg_history( +/*============================*/ + trx_rseg_t* rseg, /* in: rollback segment */ + dulint limit_trx_no, /* in: remove update undo logs whose + trx number is < limit_trx_no */ + dulint limit_undo_no) /* in: if transaction number is equal + to limit_trx_no, truncate undo records + with undo number < limit_undo_no */ +{ + fil_addr_t hdr_addr; + fil_addr_t prev_hdr_addr; + trx_rsegf_t* rseg_hdr; + page_t* undo_page; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + int cmp; + ulint n_removed_logs = 0; + mtr_t mtr; + + ut_ad(mutex_own(&(purge_sys->mutex))); + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + hdr_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr)); +loop: + if (hdr_addr.page == FIL_NULL) { + + mutex_exit(&(rseg->mutex)); + + mtr_commit(&mtr); + + return; + } + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + hdr_addr.page, &mtr); + + log_hdr = undo_page + hdr_addr.boffset; + + cmp = ut_dulint_cmp(mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO), + limit_trx_no); + if (cmp == 0) { + trx_undo_truncate_start(rseg, rseg->space, hdr_addr.page, + hdr_addr.boffset, limit_undo_no); + } + + if (cmp >= 0) { + mutex_enter(&kernel_mutex); + ut_a(trx_sys->rseg_history_len >= n_removed_logs); + trx_sys->rseg_history_len -= n_removed_logs; + mutex_exit(&kernel_mutex); + + flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY, + log_hdr + TRX_UNDO_HISTORY_NODE, + n_removed_logs, &mtr); + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return; + } + + prev_hdr_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); + n_removed_logs++; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE) + && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) { + + /* We can free the whole log segment */ + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + trx_purge_free_segment(rseg, hdr_addr, n_removed_logs); + + n_removed_logs = 0; + } else { + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } + + mtr_start(&mtr); + mutex_enter(&(rseg->mutex)); + + rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + hdr_addr = prev_hdr_addr; + + goto loop; +} + +/************************************************************************ +Removes unnecessary history data from rollback segments. NOTE that when this +function is called, the caller must not have any latches on undo log pages! 
*/ +static +void +trx_purge_truncate_history(void) +/*============================*/ +{ + trx_rseg_t* rseg; + dulint limit_trx_no; + dulint limit_undo_no; + + ut_ad(mutex_own(&(purge_sys->mutex))); + + trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no, + &limit_undo_no); + + if (ut_dulint_is_zero(limit_trx_no)) { + + limit_trx_no = purge_sys->purge_trx_no; + limit_undo_no = purge_sys->purge_undo_no; + } + + /* We play safe and set the truncate limit at most to the purge view + low_limit number, though this is not necessary */ + + if (ut_dulint_cmp(limit_trx_no, purge_sys->view->low_limit_no) >= 0) { + limit_trx_no = purge_sys->view->low_limit_no; + limit_undo_no = ut_dulint_zero; + } + + ut_ad((ut_dulint_cmp(limit_trx_no, + purge_sys->view->low_limit_no) <= 0)); + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + while (rseg) { + trx_purge_truncate_rseg_history(rseg, limit_trx_no, + limit_undo_no); + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } +} + +/************************************************************************ +Does a truncate if the purge array is empty. NOTE that when this function is +called, the caller must not have any latches on undo log pages! */ +UNIV_INLINE +ibool +trx_purge_truncate_if_arr_empty(void) +/*=================================*/ + /* out: TRUE if array empty */ +{ + ut_ad(mutex_own(&(purge_sys->mutex))); + + if (purge_sys->arr->n_used == 0) { + + trx_purge_truncate_history(); + + return(TRUE); + } + + return(FALSE); +} + +/*************************************************************************** +Updates the last not yet purged history log info in rseg when we have purged +a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. */ +static +void +trx_purge_rseg_get_next_history_log( +/*================================*/ + trx_rseg_t* rseg) /* in: rollback segment */ +{ + page_t* undo_page; + trx_ulogf_t* log_hdr; + trx_usegf_t* seg_hdr; + fil_addr_t prev_log_addr; + dulint trx_no; + ibool del_marks; + mtr_t mtr; + + ut_ad(mutex_own(&(purge_sys->mutex))); + + mutex_enter(&(rseg->mutex)); + + ut_a(rseg->last_page_no != FIL_NULL); + + purge_sys->purge_trx_no = ut_dulint_add(rseg->last_trx_no, 1); + purge_sys->purge_undo_no = ut_dulint_zero; + purge_sys->next_stored = FALSE; + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, + rseg->last_page_no, &mtr); + log_hdr = undo_page + rseg->last_offset; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + /* Increase the purge page count by one for every handled log */ + + purge_sys->n_pages_handled++; + + prev_log_addr = trx_purge_get_log_from_hist( + flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr)); + if (prev_log_addr.page == FIL_NULL) { + /* No logs left in the history list */ + + rseg->last_page_no = FIL_NULL; + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + mutex_enter(&kernel_mutex); + + /* Add debug code to track history list corruption reported + on the MySQL mailing list on Nov 9, 2004. The fut0lst.c + file-based list was corrupt. The prev node pointer was + FIL_NULL, even though the list length was over 8 million nodes! + We assume that purge truncates the history list in moderate + size pieces, and if we here reach the head of the list, the + list cannot be longer than 20 000 undo logs now. */ + + if (trx_sys->rseg_history_len > 20000) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: purge reached the" + " head of the history list,\n" + "InnoDB: but its length is still" + " reported as %lu! 
Make a detailed bug\n" + "InnoDB: report, and submit it" + " to http://bugs.mysql.com\n", + (ulong) trx_sys->rseg_history_len); + } + + mutex_exit(&kernel_mutex); + + return; + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + /* Read the trx number and del marks from the previous log header */ + mtr_start(&mtr); + + log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, + prev_log_addr.page, &mtr) + + prev_log_addr.boffset; + + trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); + + del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS); + + mtr_commit(&mtr); + + mutex_enter(&(rseg->mutex)); + + rseg->last_page_no = prev_log_addr.page; + rseg->last_offset = prev_log_addr.boffset; + rseg->last_trx_no = trx_no; + rseg->last_del_marks = del_marks; + + mutex_exit(&(rseg->mutex)); +} + +/*************************************************************************** +Chooses the next undo log to purge and updates the info in purge_sys. This +function is used to initialize purge_sys when the next record to purge is +not known, and also to update the purge system info on the next record when +purge has handled the whole undo log for a transaction. */ +static +void +trx_purge_choose_next_log(void) +/*===========================*/ +{ + trx_undo_rec_t* rec; + trx_rseg_t* rseg; + trx_rseg_t* min_rseg; + dulint min_trx_no; + ulint space = 0; /* remove warning (??? bug ???) */ + ulint zip_size = 0; + ulint page_no = 0; /* remove warning (??? bug ???) */ + ulint offset = 0; /* remove warning (??? bug ???) */ + mtr_t mtr; + + ut_ad(mutex_own(&(purge_sys->mutex))); + ut_ad(purge_sys->next_stored == FALSE); + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + min_trx_no = ut_dulint_max; + + min_rseg = NULL; + + while (rseg) { + mutex_enter(&(rseg->mutex)); + + if (rseg->last_page_no != FIL_NULL) { + + if ((min_rseg == NULL) + || (ut_dulint_cmp(min_trx_no, + rseg->last_trx_no) > 0)) { + + min_rseg = rseg; + min_trx_no = rseg->last_trx_no; + space = rseg->space; + zip_size = rseg->zip_size; + ut_a(space == 0); /* We assume in purge of + externally stored fields + that space id == 0 */ + page_no = rseg->last_page_no; + offset = rseg->last_offset; + } + } + + mutex_exit(&(rseg->mutex)); + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } + + if (min_rseg == NULL) { + + return; + } + + mtr_start(&mtr); + + if (!min_rseg->last_del_marks) { + /* No need to purge this log */ + + rec = &trx_purge_dummy_rec; + } else { + rec = trx_undo_get_first_rec(space, zip_size, page_no, offset, + RW_S_LATCH, &mtr); + if (rec == NULL) { + /* Undo log empty */ + + rec = &trx_purge_dummy_rec; + } + } + + purge_sys->next_stored = TRUE; + purge_sys->rseg = min_rseg; + + purge_sys->hdr_page_no = page_no; + purge_sys->hdr_offset = offset; + + purge_sys->purge_trx_no = min_trx_no; + + if (rec == &trx_purge_dummy_rec) { + + purge_sys->purge_undo_no = ut_dulint_zero; + purge_sys->page_no = page_no; + purge_sys->offset = 0; + } else { + purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec); + + purge_sys->page_no = page_get_page_no(page_align(rec)); + purge_sys->offset = page_offset(rec); + } + + mtr_commit(&mtr); +} + +/*************************************************************************** +Gets the next record to purge and updates the info in the purge system. 
*/ +static +trx_undo_rec_t* +trx_purge_get_next_rec( +/*===================*/ + /* out: copy of an undo log record or + pointer to the dummy undo log record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_rec_t* rec; + trx_undo_rec_t* rec_copy; + trx_undo_rec_t* rec2; + trx_undo_rec_t* next_rec; + page_t* undo_page; + page_t* page; + ulint offset; + ulint page_no; + ulint space; + ulint zip_size; + ulint type; + ulint cmpl_info; + mtr_t mtr; + + ut_ad(mutex_own(&(purge_sys->mutex))); + ut_ad(purge_sys->next_stored); + + space = purge_sys->rseg->space; + zip_size = purge_sys->rseg->zip_size; + page_no = purge_sys->page_no; + offset = purge_sys->offset; + + if (offset == 0) { + /* It is the dummy undo log record, which means that there is + no need to purge this undo log */ + + trx_purge_rseg_get_next_history_log(purge_sys->rseg); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + return(&trx_purge_dummy_rec); + } + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(space, zip_size, + page_no, &mtr); + rec = undo_page + offset; + + rec2 = rec; + + for (;;) { + /* Try first to find the next record which requires a purge + operation from the same page of the same undo log */ + + next_rec = trx_undo_page_get_next_rec(rec2, + purge_sys->hdr_page_no, + purge_sys->hdr_offset); + if (next_rec == NULL) { + rec2 = trx_undo_get_next_rec( + rec2, purge_sys->hdr_page_no, + purge_sys->hdr_offset, &mtr); + break; + } + + rec2 = next_rec; + + type = trx_undo_rec_get_type(rec2); + + if (type == TRX_UNDO_DEL_MARK_REC) { + + break; + } + + cmpl_info = trx_undo_rec_get_cmpl_info(rec2); + + if (trx_undo_rec_get_extern_storage(rec2)) { + break; + } + + if ((type == TRX_UNDO_UPD_EXIST_REC) + && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + break; + } + } + + if (rec2 == NULL) { + mtr_commit(&mtr); + + trx_purge_rseg_get_next_history_log(purge_sys->rseg); + + /* Look for the next undo log and record to purge */ + + trx_purge_choose_next_log(); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(space, zip_size, + page_no, &mtr); + + rec = undo_page + offset; + } else { + page = page_align(rec2); + + purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2); + purge_sys->page_no = page_get_page_no(page); + purge_sys->offset = rec2 - page; + + if (undo_page != page) { + /* We advance to a new page of the undo log: */ + purge_sys->n_pages_handled++; + } + } + + rec_copy = trx_undo_rec_copy(rec, heap); + + mtr_commit(&mtr); + + return(rec_copy); +} + +/************************************************************************ +Fetches the next undo log record from the history list to purge. It must be +released with the corresponding release function. 
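+A caller sketch (simplified from the purge query graph code in
+row0purge.c; illustrative only):
+
+	undo_rec = trx_purge_fetch_next_rec(&roll_ptr, &cell, heap);
+
+	if (undo_rec != NULL) {
+		if (undo_rec != &trx_purge_dummy_rec) {
+			(apply the purge operation for undo_rec)
+		}
+
+		trx_purge_rec_release(cell);
+	}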
*/ +UNIV_INTERN +trx_undo_rec_t* +trx_purge_fetch_next_rec( +/*=====================*/ + /* out: copy of an undo log record or + pointer to the dummy undo log record + &trx_purge_dummy_rec, if the whole undo log + can skipped in purge; NULL if none left */ + dulint* roll_ptr,/* out: roll pointer to undo record */ + trx_undo_inf_t** cell, /* out: storage cell for the record in the + purge array */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_rec_t* undo_rec; + + mutex_enter(&(purge_sys->mutex)); + + if (purge_sys->state == TRX_STOP_PURGE) { + trx_purge_truncate_if_arr_empty(); + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + + if (!purge_sys->next_stored) { + trx_purge_choose_next_log(); + + if (!purge_sys->next_stored) { + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + if (srv_print_thread_releases) { + fprintf(stderr, + "Purge: No logs left in the" + " history list; pages handled %lu\n", + (ulong) purge_sys->n_pages_handled); + } + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + } + + if (purge_sys->n_pages_handled >= purge_sys->handle_limit) { + + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + + if (ut_dulint_cmp(purge_sys->purge_trx_no, + purge_sys->view->low_limit_no) >= 0) { + purge_sys->state = TRX_STOP_PURGE; + + trx_purge_truncate_if_arr_empty(); + + mutex_exit(&(purge_sys->mutex)); + + return(NULL); + } + + /* fprintf(stderr, "Thread %lu purging trx %lu undo record %lu\n", + os_thread_get_curr_id(), + ut_dulint_get_low(purge_sys->purge_trx_no), + ut_dulint_get_low(purge_sys->purge_undo_no)); */ + + *roll_ptr = trx_undo_build_roll_ptr(FALSE, (purge_sys->rseg)->id, + purge_sys->page_no, + purge_sys->offset); + + *cell = trx_purge_arr_store_info(purge_sys->purge_trx_no, + purge_sys->purge_undo_no); + + ut_ad(ut_dulint_cmp(purge_sys->purge_trx_no, + (purge_sys->view)->low_limit_no) < 0); + + /* The following call will advance the stored values of purge_trx_no + and purge_undo_no, therefore we had to store them first */ + + undo_rec = trx_purge_get_next_rec(heap); + + mutex_exit(&(purge_sys->mutex)); + + return(undo_rec); +} + +/*********************************************************************** +Releases a reserved purge undo record. */ +UNIV_INTERN +void +trx_purge_rec_release( +/*==================*/ + trx_undo_inf_t* cell) /* in: storage cell */ +{ + trx_undo_arr_t* arr; + + mutex_enter(&(purge_sys->mutex)); + + arr = purge_sys->arr; + + trx_purge_arr_remove_info(cell); + + mutex_exit(&(purge_sys->mutex)); +} + +/*********************************************************************** +This function runs a purge batch. */ +UNIV_INTERN +ulint +trx_purge(void) +/*===========*/ + /* out: number of undo log pages handled in + the batch */ +{ + que_thr_t* thr; + /* que_thr_t* thr2; */ + ulint old_pages_handled; + + mutex_enter(&(purge_sys->mutex)); + + if (purge_sys->trx->n_active_thrs > 0) { + + mutex_exit(&(purge_sys->mutex)); + + /* Should not happen */ + + ut_error; + + return(0); + } + + rw_lock_x_lock(&(purge_sys->latch)); + + mutex_enter(&kernel_mutex); + + /* Close and free the old purge view */ + + read_view_close(purge_sys->view); + purge_sys->view = NULL; + mem_heap_empty(purge_sys->heap); + + /* Determine how much data manipulation language (DML) statements + need to be delayed in order to reduce the lagging of the purge + thread. 
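+For example (illustrative only): with srv_max_purge_lag = 1000 and
+trx_sys->rseg_history_len = 2000, the ratio computed below is 2.0 and
+each row-modifying statement would be delayed by
+(2.0 - .5) * 10000 = 15000 microseconds.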
*/ + srv_dml_needed_delay = 0; /* in microseconds; default: no delay */ + + /* If we cannot advance the 'purge view' because of an old + 'consistent read view', then the DML statements cannot be delayed. + Also, srv_max_purge_lag <= 0 means 'infinity'. */ + if (srv_max_purge_lag > 0 + && !UT_LIST_GET_LAST(trx_sys->view_list)) { + float ratio = (float) trx_sys->rseg_history_len + / srv_max_purge_lag; + if (ratio > ULINT_MAX / 10000) { + /* Avoid overflow: maximum delay is 4295 seconds */ + srv_dml_needed_delay = ULINT_MAX; + } else if (ratio > 1) { + /* If the history list length exceeds the + innodb_max_purge_lag, the + data manipulation statements are delayed + by at least 5000 microseconds. */ + srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000); + } + } + + purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero, + purge_sys->heap); + mutex_exit(&kernel_mutex); + + rw_lock_x_unlock(&(purge_sys->latch)); + + purge_sys->state = TRX_PURGE_ON; + + /* Handle at most 20 undo log pages in one purge batch */ + + purge_sys->handle_limit = purge_sys->n_pages_handled + 20; + + old_pages_handled = purge_sys->n_pages_handled; + + mutex_exit(&(purge_sys->mutex)); + + mutex_enter(&kernel_mutex); + + thr = que_fork_start_command(purge_sys->query); + + ut_ad(thr); + + /* thr2 = que_fork_start_command(purge_sys->query); + + ut_ad(thr2); */ + + + mutex_exit(&kernel_mutex); + + /* srv_que_task_enqueue(thr2); */ + + if (srv_print_thread_releases) { + + fputs("Starting purge\n", stderr); + } + + que_run_threads(thr); + + if (srv_print_thread_releases) { + + fprintf(stderr, + "Purge ends; pages handled %lu\n", + (ulong) purge_sys->n_pages_handled); + } + + return(purge_sys->n_pages_handled - old_pages_handled); +} + +/********************************************************************** +Prints information of the purge system to stderr. */ +UNIV_INTERN +void +trx_purge_sys_print(void) +/*=====================*/ +{ + fprintf(stderr, "InnoDB: Purge system view:\n"); + read_view_print(purge_sys->view); + + fprintf(stderr, "InnoDB: Purge trx n:o " TRX_ID_FMT + ", undo n:o " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF(purge_sys->purge_trx_no), + TRX_ID_PREP_PRINTF(purge_sys->purge_undo_no)); + fprintf(stderr, + "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n" + "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n", + (ulong) purge_sys->next_stored, + (ulong) purge_sys->page_no, + (ulong) purge_sys->offset, + (ulong) purge_sys->hdr_page_no, + (ulong) purge_sys->hdr_offset); +} diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c new file mode 100644 index 00000000000..148f93cdbe7 --- /dev/null +++ b/storage/xtradb/trx/trx0rec.c @@ -0,0 +1,1607 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction undo log record + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rec.h" + +#ifdef UNIV_NONINL +#include "trx0rec.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "dict0dict.h" +#include "ut0mem.h" +#include "row0ext.h" +#include "row0upd.h" +#include "que0que.h" +#include "trx0purge.h" +#include "row0row.h" + +/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/ + +/************************************************************************** +Writes the mtr log entry of the inserted undo log record on the undo log +page. */ +UNIV_INLINE +void +trx_undof_page_add_undo_rec_log( +/*============================*/ + page_t* undo_page, /* in: undo log page */ + ulint old_free, /* in: start offset of the inserted entry */ + ulint new_free, /* in: end offset of the entry */ + mtr_t* mtr) /* in: mtr */ +{ + byte* log_ptr; + const byte* log_end; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN); + + if (log_ptr == NULL) { + + return; + } + + log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN]; + log_ptr = mlog_write_initial_log_record_fast( + undo_page, MLOG_UNDO_INSERT, log_ptr, mtr); + len = new_free - old_free - 4; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + + if (log_ptr + len <= log_end) { + memcpy(log_ptr, undo_page + old_free + 2, len); + mlog_close(mtr, log_ptr + len); + } else { + mlog_close(mtr, log_ptr); + mlog_catenate_string(mtr, undo_page + old_free + 2, len); + } +} + +/*************************************************************** +Parses a redo log record of adding an undo log record. */ +UNIV_INTERN +byte* +trx_undo_parse_add_undo_rec( +/*========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page) /* in: page or NULL */ +{ + ulint len; + byte* rec; + ulint first_free; + + if (end_ptr < ptr + 2) { + + return(NULL); + } + + len = mach_read_from_2(ptr); + ptr += 2; + + if (end_ptr < ptr + len) { + + return(NULL); + } + + if (page == NULL) { + + return(ptr + len); + } + + first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + rec = page + first_free; + + mach_write_to_2(rec, first_free + 4 + len); + mach_write_to_2(rec + 2 + len, first_free); + + mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + first_free + 4 + len); + ut_memcpy(rec + 2, ptr, len); + + return(ptr + len); +} + +/************************************************************************** +Calculates the free space left for extending an undo log record. 
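+For example, with the common values UNIV_PAGE_SIZE = 16384 and
+FIL_PAGE_DATA_END = 8 (assumed here only for illustration), a ptr at
+byte offset 16000 within the page leaves
+
+	16384 - 16000 - 10 - 8 = 366
+
+bytes, the 10 being the safety margin mentioned in the code.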
*/ +UNIV_INLINE +ulint +trx_undo_left( +/*==========*/ + /* out: bytes left */ + const page_t* page, /* in: undo log page */ + const byte* ptr) /* in: pointer to page */ +{ + /* The '- 10' is a safety margin, in case we have some small + calculation error below */ + + return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END); +} + +/************************************************************************** +Set the next and previous pointers in the undo page for the undo record +that was written to ptr. Update the first free value by the number of bytes +written for this undo record.*/ +static +ulint +trx_undo_page_set_next_prev_and_add( +/*================================*/ + /* out: offset of the inserted entry + on the page if succeeded, 0 if fail */ + page_t* undo_page, /* in/out: undo log page */ + byte* ptr, /* in: ptr up to where data has been + written on this undo page. */ + mtr_t* mtr) /* in: mtr */ +{ + ulint first_free; /* offset within undo_page */ + ulint end_of_rec; /* offset within undo_page */ + byte* ptr_to_first_free; + /* pointer within undo_page + that points to the next free + offset value within undo_page.*/ + + ut_ad(ptr > undo_page); + ut_ad(ptr < undo_page + UNIV_PAGE_SIZE); + + if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) { + + return(0); + } + + ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE; + + first_free = mach_read_from_2(ptr_to_first_free); + + /* Write offset of the previous undo log record */ + mach_write_to_2(ptr, first_free); + ptr += 2; + + end_of_rec = ptr - undo_page; + + /* Write offset of the next undo log record */ + mach_write_to_2(undo_page + first_free, end_of_rec); + + /* Update the offset to first free undo record */ + mach_write_to_2(ptr_to_first_free, end_of_rec); + + /* Write this log entry to the UNDO log */ + trx_undof_page_add_undo_rec_log(undo_page, first_free, + end_of_rec, mtr); + + return(first_free); +} + +/************************************************************************** +Reports in the undo log of an insert of a clustered index record. 
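+The resulting record layout, in sketch form (inferred from the code
+below; the widths of the compressed integers vary):
+
+	2 bytes		offset of the next undo log record
+	1 byte		TRX_UNDO_INSERT_REC
+	much-compressed	undo number of the transaction
+	much-compressed	table id
+	for each of the n_unique fields of the clustered index:
+		compressed	field length, or UNIV_SQL_NULL
+		flen bytes	field data, if not SQL NULL
+	2 bytes		back pointer written by
+			trx_undo_page_set_next_prev_and_add()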
*/ +static +ulint +trx_undo_page_report_insert( +/*========================*/ + /* out: offset of the inserted entry + on the page if succeed, 0 if fail */ + page_t* undo_page, /* in: undo log page */ + trx_t* trx, /* in: transaction */ + dict_index_t* index, /* in: clustered index */ + const dtuple_t* clust_entry, /* in: index entry which will be + inserted to the clustered index */ + mtr_t* mtr) /* in: mtr */ +{ + ulint first_free; + byte* ptr; + ulint i; + + ut_ad(dict_index_is_clust(index)); + ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT); + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + ptr = undo_page + first_free; + + ut_ad(first_free <= UNIV_PAGE_SIZE); + + if (trx_undo_left(undo_page, ptr) < 2 + 1 + 11 + 11) { + + /* Not enough space for writing the general parameters */ + + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + *ptr++ = TRX_UNDO_INSERT_REC; + ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no); + ptr += mach_dulint_write_much_compressed(ptr, index->table->id); + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the record + to be inserted in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + const dfield_t* field = dtuple_get_nth_field(clust_entry, i); + ulint flen = dfield_get_len(field); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, dfield_get_data(field), flen); + ptr += flen; + } + } + + return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr)); +} + +/************************************************************************** +Reads from an undo log record the general parameters. */ +UNIV_INTERN +byte* +trx_undo_rec_get_pars( +/*==================*/ + /* out: remaining part of undo log + record after reading these values */ + trx_undo_rec_t* undo_rec, /* in: undo log record */ + ulint* type, /* out: undo record type: + TRX_UNDO_INSERT_REC, ... */ + ulint* cmpl_info, /* out: compiler info, relevant only + for update type records */ + ibool* updated_extern, /* out: TRUE if we updated an + externally stored fild */ + dulint* undo_no, /* out: undo log record number */ + dulint* table_id) /* out: table id */ +{ + byte* ptr; + ulint type_cmpl; + + ptr = undo_rec + 2; + + type_cmpl = mach_read_from_1(ptr); + ptr++; + + if (type_cmpl & TRX_UNDO_UPD_EXTERN) { + *updated_extern = TRUE; + type_cmpl -= TRX_UNDO_UPD_EXTERN; + } else { + *updated_extern = FALSE; + } + + *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1); + *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT; + + *undo_no = mach_dulint_read_much_compressed(ptr); + ptr += mach_dulint_get_much_compressed_size(*undo_no); + + *table_id = mach_dulint_read_much_compressed(ptr); + ptr += mach_dulint_get_much_compressed_size(*table_id); + + return(ptr); +} + +/************************************************************************** +Reads from an undo log record a stored column value. 
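+The encoding read here is one of three cases (sketch):
+
+	len == UNIV_SQL_NULL		no data bytes follow
+	len == UNIV_EXTERN_STORAGE_FIELD (marker)
+		followed by compressed orig_len, compressed len, and
+		len bytes of BLOB prefix plus field reference
+	otherwise			data bytes follow; len may have
+					UNIV_EXTERN_STORAGE_FIELD added
+					to it to flag external storage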
*/ +static +byte* +trx_undo_rec_get_col_val( +/*=====================*/ + /* out: remaining part of undo log record after + reading these values */ + byte* ptr, /* in: pointer to remaining part of undo log record */ + byte** field, /* out: pointer to stored field */ + ulint* len, /* out: length of the field, or UNIV_SQL_NULL */ + ulint* orig_len)/* out: original length of the locally + stored part of an externally stored column, or 0 */ +{ + *len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*len); + + *orig_len = 0; + + switch (*len) { + case UNIV_SQL_NULL: + *field = NULL; + break; + case UNIV_EXTERN_STORAGE_FIELD: + *orig_len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*orig_len); + *len = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*len); + *field = ptr; + ptr += *len; + + ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE); + ut_ad(*len > *orig_len); + ut_ad(*len >= REC_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE); + + *len += UNIV_EXTERN_STORAGE_FIELD; + break; + default: + *field = ptr; + if (*len >= UNIV_EXTERN_STORAGE_FIELD) { + ptr += *len - UNIV_EXTERN_STORAGE_FIELD; + } else { + ptr += *len; + } + } + + return(ptr); +} + +/*********************************************************************** +Builds a row reference from an undo log record. */ +UNIV_INTERN +byte* +trx_undo_rec_get_row_ref( +/*=====================*/ + /* out: pointer to remaining part of undo + record */ + byte* ptr, /* in: remaining part of a copy of an undo log + record, at the start of the row reference; + NOTE that this copy of the undo log record must + be preserved as long as the row reference is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /* in: clustered index */ + dtuple_t** ref, /* out, own: row reference */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + ulint ref_len; + ulint i; + + ut_ad(index && ptr && ref && heap); + ut_a(dict_index_is_clust(index)); + + ref_len = dict_index_get_n_unique(index); + + *ref = dtuple_create(heap, ref_len); + + dict_index_copy_types(*ref, index, ref_len); + + for (i = 0; i < ref_len; i++) { + dfield_t* dfield; + byte* field; + ulint len; + ulint orig_len; + + dfield = dtuple_get_nth_field(*ref, i); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + dfield_set_data(dfield, field, len); + } + + return(ptr); +} + +/*********************************************************************** +Skips a row reference from an undo log record. */ +UNIV_INTERN +byte* +trx_undo_rec_skip_row_ref( +/*======================*/ + /* out: pointer to remaining part of undo + record */ + byte* ptr, /* in: remaining part in update undo log + record, at the start of the row reference */ + dict_index_t* index) /* in: clustered index */ +{ + ulint ref_len; + ulint i; + + ut_ad(index && ptr); + ut_a(dict_index_is_clust(index)); + + ref_len = dict_index_get_n_unique(index); + + for (i = 0; i < ref_len; i++) { + byte* field; + ulint len; + ulint orig_len; + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + } + + return(ptr); +} + +/************************************************************************** +Fetch a prefix of an externally stored column, for writing to the undo log +of an update or delete marking of a clustered index record. 
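+On return, ext_buf holds, in sketch form:
+
+	bytes 0 .. ext_len - 1		prefix of the column data
+	bytes ext_len onward		the BLOB pointer copied from
+					the end of the local field
+
+assuming the usual 20-byte BTR_EXTERN_FIELD_REF_SIZE, and *len is set
+to ext_len + BTR_EXTERN_FIELD_REF_SIZE.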
*/ +static +byte* +trx_undo_page_fetch_ext( +/*====================*/ + /* out: ext_buf */ + byte* ext_buf, /* in: a buffer of + REC_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE */ + ulint zip_size, /* compressed page size in bytes, + or 0 for uncompressed BLOB */ + const byte* field, /* in: an externally stored column */ + ulint* len) /* in: length of field; + out: used length of ext_buf */ +{ + /* Fetch the BLOB. */ + ulint ext_len = btr_copy_externally_stored_field_prefix( + ext_buf, REC_MAX_INDEX_COL_LEN, zip_size, field, *len); + /* BLOBs should always be nonempty. */ + ut_a(ext_len); + /* Append the BLOB pointer to the prefix. */ + memcpy(ext_buf + ext_len, + field + *len - BTR_EXTERN_FIELD_REF_SIZE, + BTR_EXTERN_FIELD_REF_SIZE); + *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE; + return(ext_buf); +} + +/************************************************************************** +Writes to the undo log a prefix of an externally stored column. */ +static +byte* +trx_undo_page_report_modify_ext( +/*============================*/ + /* out: undo log position */ + byte* ptr, /* in: undo log position, + at least 15 bytes must be available */ + byte* ext_buf, /* in: a buffer of + REC_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE, + or NULL when should not fetch + a longer prefix */ + ulint zip_size, /* compressed page size in bytes, + or 0 for uncompressed BLOB */ + const byte** field, /* in/out: the locally stored part of + the externally stored column */ + ulint* len) /* in/out: length of field, in bytes */ +{ + if (ext_buf) { + /* If an ordering column is externally stored, we will + have to store a longer prefix of the field. In this + case, write to the log a marker followed by the + original length and the real length of the field. */ + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD); + + ptr += mach_write_compressed(ptr, *len); + + *field = trx_undo_page_fetch_ext(ext_buf, zip_size, + *field, len); + + ptr += mach_write_compressed(ptr, *len); + } else { + ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD + + *len); + } + + return(ptr); +} + +/************************************************************************** +Reports in the undo log of an update or delete marking of a clustered index +record. 
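+The record written below extends the insert record format, in sketch
+form (inferred from the code; compressed field widths vary):
+
+	2 bytes		offset of the next undo log record
+	1 byte		type_cmpl: the record type combined with
+			cmpl_info * TRX_UNDO_CMPL_INFO_MULT and,
+			possibly, the TRX_UNDO_UPD_EXTERN flag
+	much-compressed	undo number, table id
+	1 byte		info bits of rec
+	compressed	DB_TRX_ID and DB_ROLL_PTR of the old version
+	...		the n_unique fields of the clustered index
+	compressed	number of updated fields; for each of them the
+			field number and the old value (len + data)
+	2 bytes + ...	if ordering columns may have changed: their
+			values, preceded by the byte length of the part
+	2 bytes		back pointer to the start of the record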
*/ +static +ulint +trx_undo_page_report_modify( +/*========================*/ + /* out: byte offset of the inserted + undo log entry on the page if succeed, + 0 if fail */ + page_t* undo_page, /* in: undo log page */ + trx_t* trx, /* in: transaction */ + dict_index_t* index, /* in: clustered index where update or + delete marking is done */ + const rec_t* rec, /* in: clustered index record which + has NOT yet been modified */ + const ulint* offsets, /* in: rec_get_offsets(rec, index) */ + const upd_t* update, /* in: update vector which tells the + columns to be updated; in the case of + a delete, this should be set to NULL */ + ulint cmpl_info, /* in: compiler info on secondary + index updates */ + mtr_t* mtr) /* in: mtr */ +{ + dict_table_t* table; + ulint first_free; + byte* ptr; + const byte* field; + ulint flen; + ulint col_no; + ulint type_cmpl; + byte* type_cmpl_ptr; + ulint i; + dulint trx_id; + ibool ignore_prefix = FALSE; + byte ext_buf[REC_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE]; + + ut_a(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); + table = index->table; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + ptr = undo_page + first_free; + + ut_ad(first_free <= UNIV_PAGE_SIZE); + + if (trx_undo_left(undo_page, ptr) < 50) { + + /* NOTE: the value 50 must be big enough so that the general + fields written below fit on the undo log page */ + + return(0); + } + + /* Reserve 2 bytes for the pointer to the next undo log record */ + ptr += 2; + + /* Store first some general parameters to the undo log */ + + if (!update) { + type_cmpl = TRX_UNDO_DEL_MARK_REC; + } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) { + type_cmpl = TRX_UNDO_UPD_DEL_REC; + /* We are about to update a delete marked record. + We don't typically need the prefix in this case unless + the delete marking is done by the same transaction + (which we check below). */ + ignore_prefix = TRUE; + } else { + type_cmpl = TRX_UNDO_UPD_EXIST_REC; + } + + type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT; + type_cmpl_ptr = ptr; + + *ptr++ = (byte) type_cmpl; + ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no); + + ptr += mach_dulint_write_much_compressed(ptr, table->id); + + /*----------------------------------------*/ + /* Store the state of the info bits */ + + *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table)); + + /* Store the values of the system columns */ + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos( + index, DATA_TRX_ID), &flen); + ut_ad(flen == DATA_TRX_ID_LEN); + + trx_id = trx_read_trx_id(field); + + /* If it is an update of a delete marked record, then we are + allowed to ignore blob prefixes if the delete marking was done + by some other trx as it must have committed by now for us to + allow an over-write. 
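+In code terms: below, ignore_prefix stays TRUE only when the trx id
+stamped on the record differs from trx->id, i.e. the delete marking
+was made by another, already committed, transaction.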
*/ + if (ignore_prefix) { + ignore_prefix = ut_dulint_cmp(trx_id, trx->id) != 0; + } + ptr += mach_dulint_write_compressed(ptr, trx_id); + + field = rec_get_nth_field(rec, offsets, + dict_index_get_sys_col_pos( + index, DATA_ROLL_PTR), &flen); + ut_ad(flen == DATA_ROLL_PTR_LEN); + + ptr += mach_dulint_write_compressed(ptr, trx_read_roll_ptr(field)); + + /*----------------------------------------*/ + /* Store then the fields required to uniquely determine the + record which will be modified in the clustered index */ + + for (i = 0; i < dict_index_get_n_unique(index); i++) { + + field = rec_get_nth_field(rec, offsets, i, &flen); + + /* The ordering columns must not be stored externally. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + ut_ad(dict_index_get_nth_col(index, i)->ord_part); + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, flen); + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + + /*----------------------------------------*/ + /* Save to the undo log the old values of the columns to be updated. */ + + if (update) { + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, upd_get_n_fields(update)); + + for (i = 0; i < upd_get_n_fields(update); i++) { + + ulint pos = upd_get_nth_field(update, i)->field_no; + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + ptr += mach_write_compressed(ptr, pos); + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, &flen); + + if (trx_undo_left(undo_page, ptr) < 15) { + + return(0); + } + + if (rec_offs_nth_extern(offsets, pos)) { + ptr = trx_undo_page_report_modify_ext( + ptr, + dict_index_get_nth_col(index, pos) + ->ord_part + && !ignore_prefix + && flen < REC_MAX_INDEX_COL_LEN + ? ext_buf : NULL, + dict_table_zip_size(table), + &field, &flen); + + /* Notify purge that it eventually has to + free the old externally stored field */ + + trx->update_undo->del_marks = TRUE; + + *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN; + } else { + ptr += mach_write_compressed(ptr, flen); + } + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + /*----------------------------------------*/ + /* In the case of a delete marking, and also in the case of an update + where any ordering field of any index changes, store the values of all + columns which occur as ordering fields in any index. This info is used + in the purge of old versions where we use it to build and search the + delete marked index records, to look if we can remove them from the + index tree. Note that starting from 4.0.14 also externally stored + fields can be ordering in some index. Starting from 5.2, we no longer + store REC_MAX_INDEX_COL_LEN first bytes to the undo log record, + but we can construct the column prefix fields in the index by + fetching the first page of the BLOB that is pointed to by the + clustered index. This works also in crash recovery, because all pages + (including BLOBs) are recovered before anything is rolled back. 
*/ + + if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + byte* old_ptr = ptr; + + trx->update_undo->del_marks = TRUE; + + if (trx_undo_left(undo_page, ptr) < 5) { + + return(0); + } + + /* Reserve 2 bytes to write the number of bytes the stored + fields take in this undo record */ + + ptr += 2; + + for (col_no = 0; col_no < dict_table_get_n_cols(table); + col_no++) { + + const dict_col_t* col + = dict_table_get_nth_col(table, col_no); + + if (col->ord_part) { + ulint pos; + + /* Write field number to undo log */ + if (trx_undo_left(undo_page, ptr) < 5 + 15) { + + return(0); + } + + pos = dict_index_get_nth_col_pos(index, + col_no); + ptr += mach_write_compressed(ptr, pos); + + /* Save the old value of field */ + field = rec_get_nth_field(rec, offsets, pos, + &flen); + + if (rec_offs_nth_extern(offsets, pos)) { + ptr = trx_undo_page_report_modify_ext( + ptr, + flen < REC_MAX_INDEX_COL_LEN + && !ignore_prefix + ? ext_buf : NULL, + dict_table_zip_size(table), + &field, &flen); + } else { + ptr += mach_write_compressed( + ptr, flen); + } + + if (flen != UNIV_SQL_NULL) { + if (trx_undo_left(undo_page, ptr) + < flen) { + + return(0); + } + + ut_memcpy(ptr, field, flen); + ptr += flen; + } + } + } + + mach_write_to_2(old_ptr, ptr - old_ptr); + } + + /*----------------------------------------*/ + /* Write pointers to the previous and the next undo log records */ + if (trx_undo_left(undo_page, ptr) < 2) { + + return(0); + } + + mach_write_to_2(ptr, first_free); + ptr += 2; + mach_write_to_2(undo_page + first_free, ptr - undo_page); + + mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, + ptr - undo_page); + + /* Write to the REDO log about this change in the UNDO log */ + + trx_undof_page_add_undo_rec_log(undo_page, first_free, + ptr - undo_page, mtr); + return(first_free); +} + +/************************************************************************** +Reads from an undo log update record the system field values of the old +version. */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_sys_cols( +/*=============================*/ + /* out: remaining part of undo log + record after reading these values */ + byte* ptr, /* in: remaining part of undo log + record after reading general + parameters */ + dulint* trx_id, /* out: trx id */ + dulint* roll_ptr, /* out: roll ptr */ + ulint* info_bits) /* out: info bits state */ +{ + /* Read the state of the info bits */ + *info_bits = mach_read_from_1(ptr); + ptr += 1; + + /* Read the values of the system columns */ + + *trx_id = mach_dulint_read_compressed(ptr); + ptr += mach_dulint_get_compressed_size(*trx_id); + + *roll_ptr = mach_dulint_read_compressed(ptr); + ptr += mach_dulint_get_compressed_size(*roll_ptr); + + return(ptr); +} + +/************************************************************************** +Reads from an update undo log record the number of updated fields. */ +UNIV_INLINE +byte* +trx_undo_update_rec_get_n_upd_fields( +/*=================================*/ + /* out: remaining part of undo log record after + reading this value */ + byte* ptr, /* in: pointer to remaining part of undo log record */ + ulint* n) /* out: number of fields */ +{ + *n = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*n); + + return(ptr); +} + +/************************************************************************** +Reads from an update undo log record a stored field number. 
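+When a whole update undo record is decoded, the helpers of this file
+are applied in sequence, as in trx_undo_prev_version_build() further
+below (an illustrative outline only):
+
+	ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+				    &updated_extern, &undo_no,
+				    &table_id);
+	ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+					       &info_bits);
+	ptr = trx_undo_rec_skip_row_ref(ptr, index);
+	ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+					     roll_ptr, info_bits, NULL,
+					     heap, &update);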
*/ +UNIV_INLINE +byte* +trx_undo_update_rec_get_field_no( +/*=============================*/ + /* out: remaining part of undo log record after + reading this value */ + byte* ptr, /* in: pointer to remaining part of undo log record */ + ulint* field_no)/* out: field number */ +{ + *field_no = mach_read_compressed(ptr); + ptr += mach_get_compressed_size(*field_no); + + return(ptr); +} + +/*********************************************************************** +Builds an update vector based on a remaining part of an undo log record. */ +UNIV_INTERN +byte* +trx_undo_update_rec_get_update( +/*===========================*/ + /* out: remaining part of the record, + NULL if an error detected, which means that + the record is corrupted */ + byte* ptr, /* in: remaining part in update undo log + record, after reading the row reference + NOTE that this copy of the undo log record must + be preserved as long as the update vector is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /* in: clustered index */ + ulint type, /* in: TRX_UNDO_UPD_EXIST_REC, + TRX_UNDO_UPD_DEL_REC, or + TRX_UNDO_DEL_MARK_REC; in the last case, + only trx id and roll ptr fields are added to + the update vector */ + dulint trx_id, /* in: transaction id from this undo record */ + dulint roll_ptr,/* in: roll pointer from this undo record */ + ulint info_bits,/* in: info bits from this undo record */ + trx_t* trx, /* in: transaction */ + mem_heap_t* heap, /* in: memory heap from which the memory + needed is allocated */ + upd_t** upd) /* out, own: update vector */ +{ + upd_field_t* upd_field; + upd_t* update; + ulint n_fields; + byte* buf; + ulint i; + + ut_a(dict_index_is_clust(index)); + + if (type != TRX_UNDO_DEL_MARK_REC) { + ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields); + } else { + n_fields = 0; + } + + update = upd_create(n_fields + 2, heap); + + update->info_bits = info_bits; + + /* Store first trx id and roll ptr to update vector */ + + upd_field = upd_get_nth_field(update, n_fields); + buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN); + trx_write_trx_id(buf, trx_id); + + upd_field_set_field_no(upd_field, + dict_index_get_sys_col_pos(index, DATA_TRX_ID), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN); + + upd_field = upd_get_nth_field(update, n_fields + 1); + buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); + trx_write_roll_ptr(buf, roll_ptr); + + upd_field_set_field_no( + upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR), + index, trx); + dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN); + + /* Store then the updated ordinary columns to the update vector */ + + for (i = 0; i < n_fields; i++) { + + byte* field; + ulint len; + ulint field_no; + ulint orig_len; + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + if (field_no >= dict_index_get_n_fields(index)) { + fprintf(stderr, + "InnoDB: Error: trying to access" + " update undo rec field %lu in ", + (ulong) field_no); + dict_index_name_print(stderr, trx, index); + fprintf(stderr, "\n" + "InnoDB: but index has only %lu fields\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n" + "InnoDB: Run also CHECK TABLE ", + (ulong) dict_index_get_n_fields(index)); + ut_print_name(stderr, trx, TRUE, index->table_name); + fprintf(stderr, "\n" + "InnoDB: n_fields = %lu, i = %lu, ptr %p\n", + (ulong) n_fields, (ulong) i, ptr); + return(NULL); + } + + upd_field = upd_get_nth_field(update, i); + + upd_field_set_field_no(upd_field, field_no, index, trx); + + ptr = 
trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + upd_field->orig_len = orig_len; + + if (len == UNIV_SQL_NULL) { + dfield_set_null(&upd_field->new_val); + } else if (len < UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_data(&upd_field->new_val, field, len); + } else { + len -= UNIV_EXTERN_STORAGE_FIELD; + + dfield_set_data(&upd_field->new_val, field, len); + dfield_set_ext(&upd_field->new_val); + } + } + + *upd = update; + + return(ptr); +} + +/*********************************************************************** +Builds a partial row from an update undo log record. It contains the +columns which occur as ordering in any index of the table. */ +UNIV_INTERN +byte* +trx_undo_rec_get_partial_row( +/*=========================*/ + /* out: pointer to remaining part of undo + record */ + byte* ptr, /* in: remaining part in update undo log + record of a suitable type, at the start of + the stored index columns; + NOTE that this copy of the undo log record must + be preserved as long as the partial row is + used, as we do NOT copy the data in the + record! */ + dict_index_t* index, /* in: clustered index */ + dtuple_t** row, /* out, own: partial row */ + ibool ignore_prefix, /* in: flag to indicate if we + expect blob prefixes in undo. Used + only in the assertion. */ + mem_heap_t* heap) /* in: memory heap from which the memory + needed is allocated */ +{ + const byte* end_ptr; + ulint row_len; + + ut_ad(index); + ut_ad(ptr); + ut_ad(row); + ut_ad(heap); + ut_ad(dict_index_is_clust(index)); + + row_len = dict_table_get_n_cols(index->table); + + *row = dtuple_create(heap, row_len); + + dict_table_copy_types(*row, index->table); + + end_ptr = ptr + mach_read_from_2(ptr); + ptr += 2; + + while (ptr != end_ptr) { + dfield_t* dfield; + byte* field; + ulint field_no; + const dict_col_t* col; + ulint col_no; + ulint len; + ulint orig_len; + + ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); + + col = dict_index_get_nth_col(index, field_no); + col_no = dict_col_get_no(col); + + ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); + + dfield = dtuple_get_nth_field(*row, col_no); + + dfield_set_data(dfield, field, len); + + if (len != UNIV_SQL_NULL + && len >= UNIV_EXTERN_STORAGE_FIELD) { + dfield_set_len(dfield, + len - UNIV_EXTERN_STORAGE_FIELD); + dfield_set_ext(dfield); + /* If the prefix of this column is indexed, + ensure that enough prefix is stored in the + undo log record. */ + ut_a(ignore_prefix + || !col->ord_part + || dfield_get_len(dfield) + >= REC_MAX_INDEX_COL_LEN + + BTR_EXTERN_FIELD_REF_SIZE); + } + } + + return(ptr); +} + +/*************************************************************************** +Erases the unused undo log page end. */ +static +void +trx_undo_erase_page_end( +/*====================*/ + page_t* undo_page, /* in: undo page whose end to erase */ + mtr_t* mtr) /* in: mtr */ +{ + ulint first_free; + + first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE); + memset(undo_page + first_free, 0xff, + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free); + + mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr); +} + +/*************************************************************** +Parses a redo log record of erasing of an undo page end. 
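+The log record carries no parameters: recovery re-reads the free
+offset from the page itself and repeats the erase, in sketch form
+
+	first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR
+				      + TRX_UNDO_PAGE_FREE);
+	memset(page + first_free, 0xff,
+	       UNIV_PAGE_SIZE - FIL_PAGE_DATA_END - first_free);
+
+which reproduces the original change deterministically.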
*/ +UNIV_INTERN +byte* +trx_undo_parse_erase_page_end( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(ptr && end_ptr); + + if (page == NULL) { + + return(ptr); + } + + trx_undo_erase_page_end(page, mtr); + + return(ptr); +} + +/*************************************************************************** +Writes information to an undo log about an insert, update, or a delete marking +of a clustered index record. This information is used in a rollback of the +transaction and in consistent reads that must look to the history of this +transaction. */ +UNIV_INTERN +ulint +trx_undo_report_row_operation( +/*==========================*/ + /* out: DB_SUCCESS or error code */ + ulint flags, /* in: if BTR_NO_UNDO_LOG_FLAG bit is + set, does nothing */ + ulint op_type, /* in: TRX_UNDO_INSERT_OP or + TRX_UNDO_MODIFY_OP */ + que_thr_t* thr, /* in: query thread */ + dict_index_t* index, /* in: clustered index */ + const dtuple_t* clust_entry, /* in: in the case of an insert, + index entry to insert into the + clustered index, otherwise NULL */ + const upd_t* update, /* in: in the case of an update, + the update vector, otherwise NULL */ + ulint cmpl_info, /* in: compiler info on secondary + index updates */ + const rec_t* rec, /* in: in case of an update or delete + marking, the record in the clustered + index, otherwise NULL */ + dulint* roll_ptr) /* out: rollback pointer to the + inserted undo log record, + ut_dulint_zero if BTR_NO_UNDO_LOG + flag was specified */ +{ + trx_t* trx; + trx_undo_t* undo; + ulint page_no; + trx_rseg_t* rseg; + mtr_t mtr; + ulint err = DB_SUCCESS; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + + ut_a(dict_index_is_clust(index)); + + if (flags & BTR_NO_UNDO_LOG_FLAG) { + + *roll_ptr = ut_dulint_zero; + + return(DB_SUCCESS); + } + + ut_ad(thr); + ut_ad((op_type != TRX_UNDO_INSERT_OP) + || (clust_entry && !update && !rec)); + + trx = thr_get_trx(thr); + rseg = trx->rseg; + + mutex_enter(&(trx->undo_mutex)); + + /* If the undo log is not assigned yet, assign one */ + + if (op_type == TRX_UNDO_INSERT_OP) { + + if (trx->insert_undo == NULL) { + + err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT); + } + + undo = trx->insert_undo; + + if (UNIV_UNLIKELY(!undo)) { + /* Did not succeed */ + mutex_exit(&(trx->undo_mutex)); + + return(err); + } + } else { + ut_ad(op_type == TRX_UNDO_MODIFY_OP); + + if (trx->update_undo == NULL) { + + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + } + + undo = trx->update_undo; + + if (UNIV_UNLIKELY(!undo)) { + /* Did not succeed */ + mutex_exit(&(trx->undo_mutex)); + return(err); + } + + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + } + + page_no = undo->last_page_no; + + mtr_start(&mtr); + + for (;;) { + buf_block_t* undo_block; + page_t* undo_page; + ulint offset; + + undo_block = buf_page_get_gen(undo->space, undo->zip_size, + page_no, RW_X_LATCH, + undo->guess_block, BUF_GET, + __FILE__, __LINE__, &mtr); + buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE); + + undo_page = buf_block_get_frame(undo_block); + + if (op_type == TRX_UNDO_INSERT_OP) { + offset = trx_undo_page_report_insert( + undo_page, trx, index, clust_entry, &mtr); + } else { + offset = trx_undo_page_report_modify( + undo_page, trx, index, rec, offsets, update, + 
cmpl_info, &mtr); + } + + if (UNIV_UNLIKELY(offset == 0)) { + /* The record did not fit on the page. We erase the + end segment of the undo log page and write a log + record of it: this is to ensure that in the debug + version the replicate page constructed using the log + records stays identical to the original page */ + + trx_undo_erase_page_end(undo_page, &mtr); + mtr_commit(&mtr); + } else { + /* Success */ + + mtr_commit(&mtr); + + undo->empty = FALSE; + undo->top_page_no = page_no; + undo->top_offset = offset; + undo->top_undo_no = trx->undo_no; + undo->guess_block = undo_block; + + UT_DULINT_INC(trx->undo_no); + + mutex_exit(&trx->undo_mutex); + + *roll_ptr = trx_undo_build_roll_ptr( + op_type == TRX_UNDO_INSERT_OP, + rseg->id, page_no, offset); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(DB_SUCCESS); + } + + ut_ad(page_no == undo->last_page_no); + + /* We have to extend the undo log by one page */ + + mtr_start(&mtr); + + /* When we add a page to an undo log, this is analogous to + a pessimistic insert in a B-tree, and we must reserve the + counterpart of the tree latch, which is the rseg mutex. */ + + mutex_enter(&(rseg->mutex)); + + page_no = trx_undo_add_page(trx, undo, &mtr); + + mutex_exit(&(rseg->mutex)); + + if (UNIV_UNLIKELY(page_no == FIL_NULL)) { + /* Did not succeed: out of space */ + + mutex_exit(&(trx->undo_mutex)); + mtr_commit(&mtr); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return(DB_OUT_OF_FILE_SPACE); + } + } +} + +/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ + +/********************************************************************** +Copies an undo record to heap. This function can be called if we know that +the undo log record exists. */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_undo_rec_low( +/*======================*/ + /* out, own: copy of the record */ + dulint roll_ptr, /* in: roll pointer to record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_rec_t* undo_rec; + ulint rseg_id; + ulint page_no; + ulint offset; + page_t* undo_page; + trx_rseg_t* rseg; + ibool is_insert; + mtr_t mtr; + + trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, + &offset); + rseg = trx_rseg_get_on_id(rseg_id); + + mtr_start(&mtr); + + undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size, + page_no, &mtr); + + undo_rec = trx_undo_rec_copy(undo_page + offset, heap); + + mtr_commit(&mtr); + + return(undo_rec); +} + +/********************************************************************** +Copies an undo record to heap. 
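+The roll pointer alone identifies the record: it decodes, via
+trx_undo_decode_roll_ptr(), into the tuple
+
+	(is_insert, rseg_id, page_no, offset)
+
+so the copy can be made without consulting any index.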
*/ +UNIV_INTERN +ulint +trx_undo_get_undo_rec( +/*==================*/ + /* out: DB_SUCCESS, or + DB_MISSING_HISTORY if the undo log + has been truncated and we cannot + fetch the old version; NOTE: the + caller must have latches on the + clustered index page and purge_view */ + dulint roll_ptr, /* in: roll pointer to record */ + dulint trx_id, /* in: id of the trx that generated + the roll pointer: it points to an + undo log of this transaction */ + trx_undo_rec_t** undo_rec, /* out, own: copy of the record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (!trx_purge_update_undo_must_exist(trx_id)) { + + /* It may be that the necessary undo log has already been + deleted */ + + return(DB_MISSING_HISTORY); + } + + *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Build a previous version of a clustered index record. This function checks +that the caller has a latch on the index page of the clustered index record +and an s-latch on the purge_view. This guarantees that the stack of versions +is locked all the way down to the purge_view. */ +UNIV_INTERN +ulint +trx_undo_prev_version_build( +/*========================*/ + /* out: DB_SUCCESS, or DB_MISSING_HISTORY if + the previous version is not >= purge_view, + which means that it may have been removed, + DB_ERROR if corrupted record */ + const rec_t* index_rec,/* in: clustered index record in the + index tree */ + mtr_t* index_mtr __attribute__((unused)), + /* in: mtr which contains the latch to + index_rec page and purge_view */ + const rec_t* rec, /* in: version of a clustered index record */ + dict_index_t* index, /* in: clustered index */ + ulint* offsets,/* in: rec_get_offsets(rec, index) */ + mem_heap_t* heap, /* in: memory heap from which the memory + needed is allocated */ + rec_t** old_vers)/* out, own: previous version, or NULL if + rec is the first inserted version, or if + history data has been deleted (an error), + or if the purge COULD have removed the version + though it has not yet done so */ +{ + trx_undo_rec_t* undo_rec = NULL; + dtuple_t* entry; + dulint rec_trx_id; + ulint type; + dulint undo_no; + dulint table_id; + dulint trx_id; + dulint roll_ptr; + dulint old_roll_ptr; + upd_t* update; + byte* ptr; + ulint info_bits; + ulint cmpl_info; + ibool dummy_extern; + byte* buf; + ulint err; +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX) + || mtr_memo_contains_page(index_mtr, index_rec, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + if (!dict_index_is_clust(index)) { + fprintf(stderr, "InnoDB: Error: trying to access" + " update undo rec for non-clustered index %s\n" + "InnoDB: Submit a detailed bug report to" + " http://bugs.mysql.com\n" + "InnoDB: index record ", index->name); + rec_print(stderr, index_rec, index); + fputs("\n" + "InnoDB: record version ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + return(DB_ERROR); + } + + roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); + old_roll_ptr = roll_ptr; + + *old_vers = NULL; + + if (trx_undo_roll_ptr_is_insert(roll_ptr)) { + + /* The record rec is the first inserted version */ + + return(DB_SUCCESS); + } + + rec_trx_id = 
row_get_rec_trx_id(rec, index, offsets); + + err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* The undo record may already have been purged. + This should never happen in InnoDB. */ + + return(err); + } + + ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, + &dummy_extern, &undo_no, &table_id); + + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, + &info_bits); + + /* (a) If a clustered index record version is such that the + trx id stamp in it is bigger than purge_sys->view, then the + BLOBs in that version are known to exist (the purge has not + progressed that far); + + (b) if the version is the first version such that trx id in it + is less than purge_sys->view, and it is not delete-marked, + then the BLOBs in that version are known to exist (the purge + cannot have purged the BLOBs referenced by that version + yet). + + This function does not fetch any BLOBs. The callers might, by + possibly invoking row_ext_create() via row_build(). However, + they should have all needed information in the *old_vers + returned by this function. This is because *old_vers is based + on the transaction undo log records. The function + trx_undo_page_fetch_ext() will write BLOB prefixes to the + transaction undo log that are at least as long as the longest + possible column prefix in a secondary index. Thus, secondary + index entries for *old_vers can be constructed without + dereferencing any BLOB pointers. */ + + ptr = trx_undo_rec_skip_row_ref(ptr, index); + + ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, + roll_ptr, info_bits, + NULL, heap, &update); + + if (ut_dulint_cmp(table_id, index->table->id) != 0) { + ptr = NULL; + + fprintf(stderr, + "InnoDB: Error: trying to access update undo rec" + " for table %s\n" + "InnoDB: but the table id in the" + " undo record is wrong\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n" + "InnoDB: Run also CHECK TABLE %s\n", + index->table_name, index->table_name); + } + + if (ptr == NULL) { + /* The record was corrupted, return an error; these printfs + should catch an elusive bug in row_vers_old_has_index_entry */ + + fprintf(stderr, + "InnoDB: table %s, index %s, n_uniq %lu\n" + "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n" + "InnoDB: undo rec table id %lu %lu," + " index table id %lu %lu\n" + "InnoDB: dump of 150 bytes in undo rec: ", + index->table_name, index->name, + (ulong) dict_index_get_n_unique(index), + undo_rec, (ulong) type, (ulong) cmpl_info, + (ulong) ut_dulint_get_high(table_id), + (ulong) ut_dulint_get_low(table_id), + (ulong) ut_dulint_get_high(index->table->id), + (ulong) ut_dulint_get_low(index->table->id)); + ut_print_buf(stderr, undo_rec, 150); + fputs("\n" + "InnoDB: index record ", stderr); + rec_print(stderr, index_rec, index); + fputs("\n" + "InnoDB: record version ", stderr); + rec_print_new(stderr, rec, offsets); + fprintf(stderr, "\n" + "InnoDB: Record trx id " TRX_ID_FMT + ", update rec trx id " TRX_ID_FMT "\n" + "InnoDB: Roll ptr in rec %lu %lu, in update rec" + " %lu %lu\n", + TRX_ID_PREP_PRINTF(rec_trx_id), + TRX_ID_PREP_PRINTF(trx_id), + (ulong) ut_dulint_get_high(old_roll_ptr), + (ulong) ut_dulint_get_low(old_roll_ptr), + (ulong) ut_dulint_get_high(roll_ptr), + (ulong) ut_dulint_get_low(roll_ptr)); + + trx_purge_sys_print(); + return(DB_ERROR); + } + + if (row_upd_changes_field_size_or_external(index, offsets, update)) { + ulint n_ext; + + /* We have to set the appropriate extern storage bits 
in the + old version of the record: the extern bits in rec for those + fields that update does NOT update, as well as the the bits for + those fields that update updates to become externally stored + fields. Store the info: */ + + entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, + offsets, &n_ext, heap); + n_ext += btr_push_update_extern_fields(entry, update, heap); + /* The page containing the clustered index record + corresponding to entry is latched in mtr. Thus the + following call is safe. */ + row_upd_index_replace_new_col_vals(entry, index, update, heap); + + buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry, + n_ext)); + + *old_vers = rec_convert_dtuple_to_rec(buf, index, + entry, n_ext); + } else { + buf = mem_heap_alloc(heap, rec_offs_size(offsets)); + *old_vers = rec_copy(buf, rec, offsets); + rec_offs_make_valid(*old_vers, index, offsets); + row_upd_rec_in_place(*old_vers, index, offsets, update, NULL); + } + + return(DB_SUCCESS); +} diff --git a/storage/xtradb/trx/trx0roll.c b/storage/xtradb/trx/trx0roll.c new file mode 100644 index 00000000000..5f3cb15a254 --- /dev/null +++ b/storage/xtradb/trx/trx0roll.c @@ -0,0 +1,1359 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0roll.h" + +#ifdef UNIV_NONINL +#include "trx0roll.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "trx0undo.h" +#include "trx0rec.h" +#include "que0que.h" +#include "usr0sess.h" +#include "srv0que.h" +#include "srv0start.h" +#include "row0undo.h" +#include "row0mysql.h" +#include "lock0lock.h" +#include "pars0pars.h" + +/* This many pages must be undone before a truncate is tried within rollback */ +#define TRX_ROLL_TRUNC_THRESHOLD 1 + +/* In crash recovery, the current trx to be rolled back */ +static trx_t* trx_roll_crash_recv_trx = NULL; + +/* In crash recovery we set this to the undo n:o of the current trx to be +rolled back. Then we can print how many % the rollback has progressed. */ +static ib_int64_t trx_roll_max_undo_no; + +/* Auxiliary variable which tells the previous progress % we printed */ +static ulint trx_roll_progress_printed_pct; + +/*********************************************************************** +Rollback a transaction used in MySQL. 
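+An illustrative partial rollback, using only functions declared in
+this file:
+
+	trx_savept_t	savept = trx_savept_take(trx);
+
+	... execute some statements ...
+
+	err = trx_general_rollback_for_mysql(trx, TRUE, &savept);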
*/ +UNIV_INTERN +int +trx_general_rollback_for_mysql( +/*===========================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + ibool partial,/* in: TRUE if partial rollback requested */ + trx_savept_t* savept) /* in: pointer to savepoint undo number, if + partial rollback requested */ +{ +#ifndef UNIV_HOTBACKUP + mem_heap_t* heap; + que_thr_t* thr; + roll_node_t* roll_node; + + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + trx_start_if_not_started(trx); + + heap = mem_heap_create(512); + + roll_node = roll_node_create(heap); + + roll_node->partial = partial; + + if (partial) { + roll_node->savept = *savept; + } + + trx->error_state = DB_SUCCESS; + + thr = pars_complete_graph_for_exec(roll_node, trx, heap); + + ut_a(thr == que_fork_start_command(que_node_get_parent(thr))); + que_run_threads(thr); + + mutex_enter(&kernel_mutex); + + while (trx->que_state != TRX_QUE_RUNNING) { + + mutex_exit(&kernel_mutex); + + os_thread_sleep(100000); + + mutex_enter(&kernel_mutex); + } + + mutex_exit(&kernel_mutex); + + mem_heap_free(heap); + + ut_a(trx->error_state == DB_SUCCESS); + + /* Tell Innobase server that there might be work for + utility threads: */ + + srv_active_wake_master_thread(); + + return((int) trx->error_state); +#else /* UNIV_HOTBACKUP */ + /* This function depends on MySQL code that is not included in + InnoDB Hot Backup builds. Besides, this function should never + be called in InnoDB Hot Backup. */ + ut_error; + return(DB_FAIL); +#endif /* UNIV_HOTBACKUP */ +} + +/*********************************************************************** +Rollback a transaction used in MySQL. */ +UNIV_INTERN +int +trx_rollback_for_mysql( +/*===================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx) /* in: transaction handle */ +{ + int err; + + if (trx->conc_state == TRX_NOT_STARTED) { + + return(DB_SUCCESS); + } + + trx->op_info = "rollback"; + + /* If we are doing the XA recovery of prepared transactions, then + the transaction object does not have an InnoDB session object, and we + set a dummy session that we use for all MySQL transactions. */ + + err = trx_general_rollback_for_mysql(trx, FALSE, NULL); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************** +Rollback the latest SQL statement for MySQL. */ +UNIV_INTERN +int +trx_rollback_last_sql_stat_for_mysql( +/*=================================*/ + /* out: error code or DB_SUCCESS */ + trx_t* trx) /* in: transaction handle */ +{ + int err; + + if (trx->conc_state == TRX_NOT_STARTED) { + + return(DB_SUCCESS); + } + + trx->op_info = "rollback of SQL statement"; + + err = trx_general_rollback_for_mysql(trx, TRUE, + &(trx->last_sql_stat_start)); + /* The following call should not be needed, but we play safe: */ + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************** +Frees a single savepoint struct. 
*/ +UNIV_INTERN +void +trx_roll_savepoint_free( +/*=====================*/ + trx_t* trx, /* in: transaction handle */ + trx_named_savept_t* savep) /* in: savepoint to free */ +{ + ut_a(savep != NULL); + ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0); + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + mem_free(savep->name); + mem_free(savep); +} + +/*********************************************************************** +Frees savepoint structs starting from savep, if savep == NULL then +free all savepoints. */ + +void +trx_roll_savepoints_free( +/*=====================*/ + trx_t* trx, /* in: transaction handle */ + trx_named_savept_t* savep) /* in: free all savepoints > this one; + if this is NULL, free all savepoints + of trx */ +{ + trx_named_savept_t* next_savep; + + if (savep == NULL) { + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + } else { + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + while (savep != NULL) { + next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + + trx_roll_savepoint_free(trx, savep); + + savep = next_savep; + } +} + +/*********************************************************************** +Rolls back a transaction back to a named savepoint. Modifications after the +savepoint are undone but InnoDB does NOT release the corresponding locks +which are stored in memory. If a lock is 'implicit', that is, a new inserted +row holds a lock where the lock information is carried by the trx id stored in +the row, these locks are naturally released in the rollback. Savepoints which +were set after this savepoint are deleted. */ +UNIV_INTERN +ulint +trx_rollback_to_savepoint_for_mysql( +/*================================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name, /* in: savepoint name */ + ib_int64_t* mysql_binlog_cache_pos) /* out: the MySQL binlog cache + position corresponding to this + savepoint; MySQL needs this + information to remove the + binlog entries of the queries + executed after the savepoint */ +{ + trx_named_savept_t* savep; + ulint err; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep == NULL) { + + return(DB_NO_SAVEPOINT); + } + + if (trx->conc_state == TRX_NOT_STARTED) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: transaction has a savepoint ", stderr); + ut_print_name(stderr, trx, FALSE, savep->name); + fputs(" though it is not started\n", stderr); + return(DB_ERROR); + } + + /* We can now free all savepoints strictly later than this one */ + + trx_roll_savepoints_free(trx, savep); + + *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos; + + trx->op_info = "rollback to a savepoint"; + + err = trx_general_rollback_for_mysql(trx, TRUE, &(savep->savept)); + + /* Store the current undo_no of the transaction so that we know where + to roll back if we have to roll back the next SQL statement: */ + + trx_mark_sql_stat_end(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************** +Creates a named savepoint. If the transaction is not yet started, starts it. +If there is already a savepoint of the same name, this call erases that old +savepoint and replaces it with a new. Savepoints are deleted in a transaction +commit or rollback. 
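+The savepoint calls pair up as in this illustrative sequence:
+
+	trx_savepoint_for_mysql(trx, "sp1", binlog_cache_pos);
+	... more statements ...
+	trx_rollback_to_savepoint_for_mysql(trx, "sp1",
+					    &mysql_binlog_cache_pos);
+
+where the rollback also frees every savepoint set after "sp1".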
*/ +UNIV_INTERN +ulint +trx_savepoint_for_mysql( +/*====================*/ + /* out: always DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name, /* in: savepoint name */ + ib_int64_t binlog_cache_pos) /* in: MySQL binlog cache + position corresponding to this + connection at the time of the + savepoint */ +{ + trx_named_savept_t* savep; + + ut_a(trx); + ut_a(savepoint_name); + + trx_start_if_not_started(trx); + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + /* Found */ + break; + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + if (savep) { + /* There is a savepoint with the same name: free that */ + + UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep); + + mem_free(savep->name); + mem_free(savep); + } + + /* Create a new savepoint and add it as the last in the list */ + + savep = mem_alloc(sizeof(trx_named_savept_t)); + + savep->name = mem_strdup(savepoint_name); + + savep->savept = trx_savept_take(trx); + + savep->mysql_binlog_cache_pos = binlog_cache_pos; + + UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep); + + return(DB_SUCCESS); +} + +/*********************************************************************** +Releases only the named savepoint. Savepoints which were set after this +savepoint are left as is. */ +UNIV_INTERN +ulint +trx_release_savepoint_for_mysql( +/*============================*/ + /* out: if no savepoint + of the name found then + DB_NO_SAVEPOINT, + otherwise DB_SUCCESS */ + trx_t* trx, /* in: transaction handle */ + const char* savepoint_name) /* in: savepoint name */ +{ + trx_named_savept_t* savep; + + savep = UT_LIST_GET_FIRST(trx->trx_savepoints); + + /* Search for the savepoint by name and free if found. */ + while (savep != NULL) { + if (0 == ut_strcmp(savep->name, savepoint_name)) { + trx_roll_savepoint_free(trx, savep); + return(DB_SUCCESS); + } + savep = UT_LIST_GET_NEXT(trx_savepoints, savep); + } + + return(DB_NO_SAVEPOINT); +} + +/*********************************************************************** +Determines if this transaction is rolling back an incomplete transaction +in crash recovery. */ +UNIV_INTERN +ibool +trx_is_recv( +/*========*/ + /* out: TRUE if trx is an incomplete + transaction that is being rolled back + in crash recovery */ + const trx_t* trx) /* in: transaction */ +{ + return(trx == trx_roll_crash_recv_trx); +} + +/*********************************************************************** +Returns a transaction savepoint taken at this point in time. */ +UNIV_INTERN +trx_savept_t +trx_savept_take( +/*============*/ + /* out: savepoint */ + trx_t* trx) /* in: transaction */ +{ + trx_savept_t savept; + + savept.least_undo_no = trx->undo_no; + + return(savept); +} + +/*********************************************************************** +Roll back an active transaction. 
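One small detail of the function that follows: the progress message scales the row count once it exceeds 10^9 and switches to a unit suffix. A self-contained rendering of that arithmetic:

// Models the rows_to_undo/unit reporting in trx_rollback_active:
// counts above 1e9 are printed in millions with an "M" suffix.
#include <stdio.h>

static void
print_rows_to_undo(long long rows_to_undo)
{
        const char* unit = "";

        if (rows_to_undo > 1000000000) {
                rows_to_undo = rows_to_undo / 1000000;
                unit = "M";
        }

        printf("%lld%s rows to undo\n", rows_to_undo, unit);
}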
*/ +static +void +trx_rollback_active( +/*================*/ + trx_t* trx) /* in/out: transaction */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + roll_node_t* roll_node; + dict_table_t* table; + ib_int64_t rows_to_undo; + const char* unit = ""; + ibool dictionary_locked = FALSE; + + heap = mem_heap_create(512); + + fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + + roll_node = roll_node_create(heap); + + thr->child = roll_node; + roll_node->common.parent = thr; + + mutex_enter(&kernel_mutex); + + trx->graph = fork; + + ut_a(thr == que_fork_start_command(fork)); + + trx_roll_crash_recv_trx = trx; + trx_roll_max_undo_no = ut_conv_dulint_to_longlong(trx->undo_no); + trx_roll_progress_printed_pct = 0; + rows_to_undo = trx_roll_max_undo_no; + + if (rows_to_undo > 1000000000) { + rows_to_undo = rows_to_undo / 1000000; + unit = "M"; + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s" + " rows to undo\n", + TRX_ID_PREP_PRINTF(trx->id), + (ulong) rows_to_undo, unit); + mutex_exit(&kernel_mutex); + + trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + row_mysql_lock_data_dictionary(trx); + dictionary_locked = TRUE; + } + + que_run_threads(thr); + + mutex_enter(&kernel_mutex); + + while (trx->que_state != TRX_QUE_RUNNING) { + + mutex_exit(&kernel_mutex); + + fprintf(stderr, + "InnoDB: Waiting for rollback of trx id %lu to end\n", + (ulong) ut_dulint_get_low(trx->id)); + os_thread_sleep(100000); + + mutex_enter(&kernel_mutex); + } + + mutex_exit(&kernel_mutex); + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE + && !ut_dulint_is_zero(trx->table_id)) { + + /* If the transaction was for a dictionary operation, we + drop the relevant table, if it still exists */ + + fprintf(stderr, + "InnoDB: Dropping table with id %lu %lu" + " in recovery if it exists\n", + (ulong) ut_dulint_get_high(trx->table_id), + (ulong) ut_dulint_get_low(trx->table_id)); + + table = dict_table_get_on_id_low(trx->table_id); + + if (table) { + ulint err; + + fputs("InnoDB: Table found: dropping table ", stderr); + ut_print_name(stderr, trx, TRUE, table->name); + fputs(" in recovery\n", stderr); + + err = row_drop_table_for_mysql(table->name, trx, TRUE); + trx_commit_for_mysql(trx); + + ut_a(err == (int) DB_SUCCESS); + } + } + + if (dictionary_locked) { + row_mysql_unlock_data_dictionary(trx); + } + + fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT + " completed\n", + TRX_ID_PREP_PRINTF(trx->id)); + mem_heap_free(heap); + + trx_roll_crash_recv_trx = NULL; +} + +/*********************************************************************** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. 
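A compact model of the dispatch loop described here, over a plain array instead of trx_sys->trx_list; the enum values and helper names are shortened stand-ins for the TRX_* constants and the real cleanup/rollback routines:

// Models trx_rollback_or_clean_all_recovered: skip non-recovered and
// prepared transactions, clean up committed ones, roll back active ones.
enum trx_state { NOT_STARTED, ACTIVE, PREPARED, COMMITTED_IN_MEMORY };

struct trx { int is_recovered; enum trx_state state; };

static void cleanup(struct trx* t)  { t->state = NOT_STARTED; } // stands in for trx_cleanup_at_db_startup
static void rollback(struct trx* t) { t->state = NOT_STARTED; } // stands in for trx_rollback_active

static void
rollback_or_clean_all(struct trx* list, int n)
{
        for (int i = 0; i < n; i++) {
                struct trx* t = &list[i];

                if (!t->is_recovered) {
                        continue;       // only recovered transactions
                }

                switch (t->state) {
                case NOT_STARTED:
                case PREPARED:          // prepared trxs are left for XA recovery
                        continue;
                case COMMITTED_IN_MEMORY:
                        cleanup(t);     // only a possible insert undo log remains
                        break;
                case ACTIVE:
                        rollback(t);    // undo all of its modifications
                        break;
                }
        }
}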
*/ +UNIV_INTERN +os_thread_ret_t +trx_rollback_or_clean_all_recovered( +/*================================*/ + /* out: a dummy parameter */ + void* arg __attribute__((unused))) + /* in: a dummy parameter required by + os_thread_create */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + if (UT_LIST_GET_FIRST(trx_sys->trx_list)) { + + fprintf(stderr, + "InnoDB: Starting in background the rollback" + " of uncommitted transactions\n"); + } else { + goto leave_function; + } + + mutex_exit(&kernel_mutex); + +loop: + mutex_enter(&kernel_mutex); + + for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx; + trx = UT_LIST_GET_NEXT(trx_list, trx)) { + if (!trx->is_recovered) { + continue; + } + + switch (trx->conc_state) { + case TRX_NOT_STARTED: + case TRX_PREPARED: + continue; + + case TRX_COMMITTED_IN_MEMORY: + mutex_exit(&kernel_mutex); + fprintf(stderr, + "InnoDB: Cleaning up trx with id " + TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF(trx->id)); + trx_cleanup_at_db_startup(trx); + goto loop; + + case TRX_ACTIVE: + mutex_exit(&kernel_mutex); + trx_rollback_active(trx); + goto loop; + } + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Rollback of non-prepared transactions completed\n"); + +leave_function: + mutex_exit(&kernel_mutex); + + /* We count the number of threads in os_thread_exit(). A created + thread should always use that to exit and not use return() to exit. */ + + os_thread_exit(NULL); + + OS_THREAD_DUMMY_RETURN; +} + +/*********************************************************************** +Creates an undo number array. */ +UNIV_INTERN +trx_undo_arr_t* +trx_undo_arr_create(void) +/*=====================*/ +{ + trx_undo_arr_t* arr; + mem_heap_t* heap; + ulint i; + + heap = mem_heap_create(1024); + + arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t)); + + arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t) + * UNIV_MAX_PARALLELISM); + arr->n_cells = UNIV_MAX_PARALLELISM; + arr->n_used = 0; + + arr->heap = heap; + + for (i = 0; i < UNIV_MAX_PARALLELISM; i++) { + + (trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE; + } + + return(arr); +} + +/*********************************************************************** +Frees an undo number array. */ +UNIV_INTERN +void +trx_undo_arr_free( +/*==============*/ + trx_undo_arr_t* arr) /* in: undo number array */ +{ + ut_ad(arr->n_used == 0); + + mem_heap_free(arr->heap); +} + +/*********************************************************************** +Stores info of an undo log record to the array if it is not stored yet. 
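The reservation array maintained by the next few routines can be modelled in portable C as below; 64-bit integers replace dulints and the cell count is a stand-in for UNIV_MAX_PARALLELISM. A sketch of the idea, not the actual implementation (the real store interleaves a tentative store with the duplicate scan):

// A store fails (returns 0) if another query thread already holds the
// same undo number; otherwise the number is parked in a free cell.
#include <stdint.h>

#define N_CELLS 32      // stand-in for UNIV_MAX_PARALLELISM

struct undo_arr {
        struct { int in_use; uint64_t undo_no; } cells[N_CELLS];
};

static int
arr_store(struct undo_arr* a, uint64_t undo_no)
{
        int free_slot = -1;

        for (int i = 0; i < N_CELLS; i++) {
                if (a->cells[i].in_use) {
                        if (a->cells[i].undo_no == undo_no) {
                                return 0;       // already being processed
                        }
                } else if (free_slot < 0) {
                        free_slot = i;          // remember the first free cell
                }
        }

        if (free_slot < 0) {
                return 0;       // array full; the real code cannot reach this
        }
        a->cells[free_slot].in_use = 1;
        a->cells[free_slot].undo_no = undo_no;
        return 1;
}

static void
arr_remove(struct undo_arr* a, uint64_t undo_no)
{
        for (int i = 0; i < N_CELLS; i++) {
                if (a->cells[i].in_use && a->cells[i].undo_no == undo_no) {
                        a->cells[i].in_use = 0;
                        return;
                }
        }
}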
*/ +static +ibool +trx_undo_arr_store_info( +/*====================*/ + /* out: FALSE if the record already existed in the + array */ + trx_t* trx, /* in: transaction */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_inf_t* cell; + trx_undo_inf_t* stored_here; + trx_undo_arr_t* arr; + ulint n_used; + ulint n; + ulint i; + + n = 0; + arr = trx->undo_no_arr; + n_used = arr->n_used; + stored_here = NULL; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (!cell->in_use) { + if (!stored_here) { + /* Not in use, we may store here */ + cell->undo_no = undo_no; + cell->in_use = TRUE; + + arr->n_used++; + + stored_here = cell; + } + } else { + n++; + + if (0 == ut_dulint_cmp(cell->undo_no, undo_no)) { + + if (stored_here) { + stored_here->in_use = FALSE; + ut_ad(arr->n_used > 0); + arr->n_used--; + } + + ut_ad(arr->n_used == n_used); + + return(FALSE); + } + } + + if (n == n_used && stored_here) { + + ut_ad(arr->n_used == 1 + n_used); + + return(TRUE); + } + } +} + +/*********************************************************************** +Removes an undo number from the array. */ +static +void +trx_undo_arr_remove_info( +/*=====================*/ + trx_undo_arr_t* arr, /* in: undo number array */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_inf_t* cell; + ulint n_used; + ulint n; + ulint i; + + n_used = arr->n_used; + n = 0; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use + && 0 == ut_dulint_cmp(cell->undo_no, undo_no)) { + + cell->in_use = FALSE; + + ut_ad(arr->n_used > 0); + + arr->n_used--; + + return; + } + } +} + +/*********************************************************************** +Gets the biggest undo number in an array. */ +static +dulint +trx_undo_arr_get_biggest( +/*=====================*/ + /* out: biggest value, ut_dulint_zero if + the array is empty */ + trx_undo_arr_t* arr) /* in: undo number array */ +{ + trx_undo_inf_t* cell; + ulint n_used; + dulint biggest; + ulint n; + ulint i; + + n = 0; + n_used = arr->n_used; + biggest = ut_dulint_zero; + + for (i = 0;; i++) { + cell = trx_undo_arr_get_nth_info(arr, i); + + if (cell->in_use) { + n++; + if (ut_dulint_cmp(cell->undo_no, biggest) > 0) { + + biggest = cell->undo_no; + } + } + + if (n == n_used) { + return(biggest); + } + } +} + +/*************************************************************************** +Tries to truncate the undo logs. */ +UNIV_INTERN +void +trx_roll_try_truncate( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + trx_undo_arr_t* arr; + dulint limit; + dulint biggest; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&((trx->rseg)->mutex))); + + trx->pages_undone = 0; + + arr = trx->undo_no_arr; + + limit = trx->undo_no; + + if (arr->n_used > 0) { + biggest = trx_undo_arr_get_biggest(arr); + + if (ut_dulint_cmp(biggest, limit) >= 0) { + + limit = ut_dulint_add(biggest, 1); + } + } + + if (trx->insert_undo) { + trx_undo_truncate_end(trx, trx->insert_undo, limit); + } + + if (trx->update_undo) { + trx_undo_truncate_end(trx, trx->update_undo, limit); + } +} + +/*************************************************************************** +Pops the topmost undo log record in a single undo log and updates the info +about the topmost record in the undo log memory struct.
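A toy model of the two routines that follow, using in-memory record chains in place of s-latched undo pages. pop_top_rec mirrors trx_roll_pop_top_rec: it returns the current top record and repositions the cursor at the previous one. pick_log mirrors the selection step of trx_roll_pop_top_rec_of_trx, which treats the insert and update undo logs as a single stack ordered by undo number. All names here are illustrative stand-ins:

#include <stddef.h>
#include <stdint.h>

struct undo_rec { uint64_t undo_no; struct undo_rec* prev; };

struct undo_log {
        int              empty;
        struct undo_rec* top;   // models top_page_no/top_offset/top_undo_no
};

static struct undo_rec*
pop_top_rec(struct undo_log* log)
{
        struct undo_rec* rec = log->top;

        if (rec->prev == NULL) {
                log->empty = 1;         // nothing underneath: log exhausted
        } else {
                log->top = rec->prev;   // the previous record becomes the top
        }
        return rec;
}

static struct undo_log*
pick_log(struct undo_log* ins, struct undo_log* upd)
{
        if (!ins || ins->empty) {
                return upd;             // may itself be NULL or empty
        }
        if (!upd || upd->empty) {
                return ins;
        }
        // Both non-empty: pop from the log whose top record was written
        // last, i.e. has the larger undo number.
        return (upd->top->undo_no > ins->top->undo_no) ? upd : ins;
}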
*/ +static +trx_undo_rec_t* +trx_roll_pop_top_rec( +/*=================*/ + /* out: undo log record, the page s-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* undo_page; + ulint offset; + trx_undo_rec_t* prev_rec; + page_t* prev_rec_page; + + ut_ad(mutex_own(&(trx->undo_mutex))); + + undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size, + undo->top_page_no, mtr); + offset = undo->top_offset; + + /* fprintf(stderr, "Thread %lu undoing trx %lu undo record %lu\n", + os_thread_get_curr_id(), ut_dulint_get_low(trx->id), + ut_dulint_get_low(undo->top_undo_no)); */ + + prev_rec = trx_undo_get_prev_rec(undo_page + offset, + undo->hdr_page_no, undo->hdr_offset, + mtr); + if (prev_rec == NULL) { + + undo->empty = TRUE; + } else { + prev_rec_page = page_align(prev_rec); + + if (prev_rec_page != undo_page) { + + trx->pages_undone++; + } + + undo->top_page_no = page_get_page_no(prev_rec_page); + undo->top_offset = prev_rec - prev_rec_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); + } + + return(undo_page + offset); +} + +/************************************************************************ +Pops the topmost record when the two undo logs of a transaction are seen +as a single stack of records ordered by their undo numbers. Inserts the +undo number of the popped undo record to the array of currently processed +undo numbers in the transaction. When the query thread finishes processing +of this undo record, it must be released with trx_undo_rec_release. */ +UNIV_INTERN +trx_undo_rec_t* +trx_roll_pop_top_rec_of_trx( +/*========================*/ + /* out: undo log record copied to heap, NULL + if none left, or if the undo number of the + top record would be less than the limit */ + trx_t* trx, /* in: transaction */ + dulint limit, /* in: least undo number we need */ + dulint* roll_ptr,/* out: roll pointer to undo record */ + mem_heap_t* heap) /* in: memory heap where copied */ +{ + trx_undo_t* undo; + trx_undo_t* ins_undo; + trx_undo_t* upd_undo; + trx_undo_rec_t* undo_rec; + trx_undo_rec_t* undo_rec_copy; + dulint undo_no; + ibool is_insert; + trx_rseg_t* rseg; + ulint progress_pct; + mtr_t mtr; + + rseg = trx->rseg; +try_again: + mutex_enter(&(trx->undo_mutex)); + + if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) { + mutex_enter(&(rseg->mutex)); + + trx_roll_try_truncate(trx); + + mutex_exit(&(rseg->mutex)); + } + + ins_undo = trx->insert_undo; + upd_undo = trx->update_undo; + + if (!ins_undo || ins_undo->empty) { + undo = upd_undo; + } else if (!upd_undo || upd_undo->empty) { + undo = ins_undo; + } else if (ut_dulint_cmp(upd_undo->top_undo_no, + ins_undo->top_undo_no) > 0) { + undo = upd_undo; + } else { + undo = ins_undo; + } + + if (!undo || undo->empty + || (ut_dulint_cmp(limit, undo->top_undo_no) > 0)) { + + if ((trx->undo_no_arr)->n_used == 0) { + /* Rollback is ending */ + + mutex_enter(&(rseg->mutex)); + + trx_roll_try_truncate(trx); + + mutex_exit(&(rseg->mutex)); + } + + mutex_exit(&(trx->undo_mutex)); + + return(NULL); + } + + if (undo == ins_undo) { + is_insert = TRUE; + } else { + is_insert = FALSE; + } + + *roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id, + undo->top_page_no, + undo->top_offset); + mtr_start(&mtr); + + undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr); + + undo_no = trx_undo_rec_get_undo_no(undo_rec); + + ut_ad(ut_dulint_cmp(ut_dulint_add(undo_no, 1), trx->undo_no) == 0); + + /* We print rollback progress info if we are in a crash recovery + 
and the transaction has at least 1000 row operations to undo. */ + + if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) { + + progress_pct = 100 - (ulint) + ((ut_conv_dulint_to_longlong(undo_no) * 100) + / trx_roll_max_undo_no); + if (progress_pct != trx_roll_progress_printed_pct) { + if (trx_roll_progress_printed_pct == 0) { + fprintf(stderr, + "\nInnoDB: Progress in percents:" + " %lu", (ulong) progress_pct); + } else { + fprintf(stderr, + " %lu", (ulong) progress_pct); + } + fflush(stderr); + trx_roll_progress_printed_pct = progress_pct; + } + } + + trx->undo_no = undo_no; + + if (!trx_undo_arr_store_info(trx, undo_no)) { + /* A query thread is already processing this undo log record */ + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + goto try_again; + } + + undo_rec_copy = trx_undo_rec_copy(undo_rec, heap); + + mutex_exit(&(trx->undo_mutex)); + + mtr_commit(&mtr); + + return(undo_rec_copy); +} + +/************************************************************************ +Reserves an undo log record for a query thread to undo. This should be +called if the query thread gets the undo log record not using the pop +function above. */ +UNIV_INTERN +ibool +trx_undo_rec_reserve( +/*=================*/ + /* out: TRUE if succeeded */ + trx_t* trx, /* in: transaction */ + dulint undo_no)/* in: undo number of the record */ +{ + ibool ret; + + mutex_enter(&(trx->undo_mutex)); + + ret = trx_undo_arr_store_info(trx, undo_no); + + mutex_exit(&(trx->undo_mutex)); + + return(ret); +} + +/*********************************************************************** +Releases a reserved undo record. */ +UNIV_INTERN +void +trx_undo_rec_release( +/*=================*/ + trx_t* trx, /* in: transaction */ + dulint undo_no)/* in: undo number */ +{ + trx_undo_arr_t* arr; + + mutex_enter(&(trx->undo_mutex)); + + arr = trx->undo_no_arr; + + trx_undo_arr_remove_info(arr, undo_no); + + mutex_exit(&(trx->undo_mutex)); +} + +/************************************************************************* +Starts a rollback operation. 
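How the rollback described below derives the undo number at which it stops depends only on the signal that started it; a minimal sketch, with shortened stand-ins for the TRX_SIG_* constants:

// Models the roll_limit selection at the top of trx_rollback.
#include <stdint.h>

enum sig { TOTAL_ROLLBACK, ROLLBACK_TO_SAVEPT, ERROR_OCCURRED };

static uint64_t
roll_limit_for(enum sig type, uint64_t savept_undo_no,
               uint64_t stmt_start_undo_no)
{
        switch (type) {
        case TOTAL_ROLLBACK:
                return 0;                       // undo everything
        case ROLLBACK_TO_SAVEPT:
                return savept_undo_no;          // undo back to the savepoint
        case ERROR_OCCURRED:
                return stmt_start_undo_no;      // undo the failing statement
        }
        return 0;
}

Records are then popped and undone only while their undo numbers stay at or above this limit.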
*/ +UNIV_INTERN +void +trx_rollback( +/*=========*/ + trx_t* trx, /* in: transaction */ + trx_sig_t* sig, /* in: signal starting the rollback */ + que_thr_t** next_thr)/* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the passed value is + NULL, the parameter is ignored */ +{ + que_t* roll_graph; + que_thr_t* thr; + /* que_thr_t* thr2; */ + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0)); + + /* Initialize the rollback field in the transaction */ + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + trx->roll_limit = ut_dulint_zero; + + } else if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) { + + trx->roll_limit = (sig->savept).least_undo_no; + + } else if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx->roll_limit = trx->last_sql_stat_start.least_undo_no; + } else { + ut_error; + } + + ut_a(ut_dulint_cmp(trx->roll_limit, trx->undo_no) <= 0); + + trx->pages_undone = 0; + + if (trx->undo_no_arr == NULL) { + trx->undo_no_arr = trx_undo_arr_create(); + } + + /* Build a 'query' graph which will perform the undo operations */ + + roll_graph = trx_roll_graph_build(trx); + + trx->graph = roll_graph; + trx->que_state = TRX_QUE_ROLLING_BACK; + + thr = que_fork_start_command(roll_graph); + + ut_ad(thr); + + /* thr2 = que_fork_start_command(roll_graph); + + ut_ad(thr2); */ + + if (next_thr && (*next_thr == NULL)) { + *next_thr = thr; + /* srv_que_task_enqueue_low(thr2); */ + } else { + srv_que_task_enqueue_low(thr); + /* srv_que_task_enqueue_low(thr2); */ + } +} + +/******************************************************************** +Builds an undo 'query' graph for a transaction. The actual rollback is +performed by executing this query graph like a query subprocedure call. +The reply about the completion of the rollback will be sent by this +graph. */ +UNIV_INTERN +que_t* +trx_roll_graph_build( +/*=================*/ + /* out, own: the query graph */ + trx_t* trx) /* in: trx handle */ +{ + mem_heap_t* heap; + que_fork_t* fork; + que_thr_t* thr; + /* que_thr_t* thr2; */ + + ut_ad(mutex_own(&kernel_mutex)); + + heap = mem_heap_create(512); + fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap); + fork->trx = trx; + + thr = que_thr_create(fork, heap); + /* thr2 = que_thr_create(fork, heap); */ + + thr->child = row_undo_node_create(trx, thr, heap); + /* thr2->child = row_undo_node_create(trx, thr2, heap); */ + + return(fork); +} + +/************************************************************************* +Finishes error processing after the necessary partial rollback has been +done. */ +static +void +trx_finish_error_processing( +/*========================*/ + trx_t* trx) /* in: transaction */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + + ut_ad(mutex_own(&kernel_mutex)); + + sig = UT_LIST_GET_FIRST(trx->signals); + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/************************************************************************* +Finishes a partial rollback operation. 
*/ +static +void +trx_finish_partial_rollback_off_kernel( +/*===================================*/ + trx_t* trx, /* in: transaction */ + que_thr_t** next_thr)/* in/out: next query thread to run; + if the value which is passed in is a pointer + to a NULL pointer, then the calling function + can start running a new query thread; if this + parameter is NULL, it is ignored */ +{ + trx_sig_t* sig; + + ut_ad(mutex_own(&kernel_mutex)); + + sig = UT_LIST_GET_FIRST(trx->signals); + + /* Remove the signal from the signal queue and send reply message + to it */ + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + + trx->que_state = TRX_QUE_RUNNING; +} + +/******************************************************************** +Finishes a transaction rollback. */ +UNIV_INTERN +void +trx_finish_rollback_off_kernel( +/*===========================*/ + que_t* graph, /* in: undo graph which can now be freed */ + trx_t* trx, /* in: transaction */ + que_thr_t** next_thr)/* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if this parameter is + NULL, it is ignored */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + + ut_ad(mutex_own(&kernel_mutex)); + + ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0); + + /* Free the memory reserved by the undo graph */ + que_graph_free(graph); + + sig = UT_LIST_GET_FIRST(trx->signals); + + if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) { + + trx_finish_partial_rollback_off_kernel(trx, next_thr); + + return; + + } else if (sig->type == TRX_SIG_ERROR_OCCURRED) { + + trx_finish_error_processing(trx); + + return; + } + +#ifdef UNIV_DEBUG + if (lock_print_waits) { + fprintf(stderr, "Trx %lu rollback finished\n", + (ulong) ut_dulint_get_low(trx->id)); + } +#endif /* UNIV_DEBUG */ + + trx_commit_off_kernel(trx); + + /* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and + send reply messages to them */ + + trx->que_state = TRX_QUE_RUNNING; + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + trx_sig_reply(sig, next_thr); + + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } +} + +/************************************************************************* +Creates a rollback command node struct. */ +UNIV_INTERN +roll_node_t* +roll_node_create( +/*=============*/ + /* out, own: rollback node struct */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + roll_node_t* node; + + node = mem_heap_alloc(heap, sizeof(roll_node_t)); + node->common.type = QUE_NODE_ROLLBACK; + node->state = ROLL_NODE_SEND; + + node->partial = FALSE; + + return(node); +} + +/*************************************************************** +Performs an execution step for a rollback command node in a query graph. 
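The execution step below is a two-phase protocol; reduced to a bare state machine it looks like the sketch that follows. In the real code the SEND phase posts a rollback signal and parks the query thread, and the WAIT phase resumes in the parent node once the signal is replied to; the names here are hypothetical:

enum roll_state { ROLL_SEND, ROLL_WAIT };

struct roll_node { enum roll_state state; };

// Returns 1 if the caller should suspend (signal sent, reply pending),
// 0 if the rollback has completed and control returns to the parent.
static int
rollback_step(struct roll_node* node)
{
        if (node->state == ROLL_SEND) {
                node->state = ROLL_WAIT;        // the rollback signal is sent here
                return 1;                       // thread waits for the reply
        }
        return 0;                               // ROLL_WAIT: continue in parent
}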
*/ +UNIV_INTERN +que_thr_t* +trx_rollback_step( +/*==============*/ + /* out: query thread to run next, or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + roll_node_t* node; + ulint sig_no; + trx_savept_t* savept; + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = ROLL_NODE_SEND; + } + + if (node->state == ROLL_NODE_SEND) { + mutex_enter(&kernel_mutex); + + node->state = ROLL_NODE_WAIT; + + if (node->partial) { + sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT; + savept = &(node->savept); + } else { + sig_no = TRX_SIG_TOTAL_ROLLBACK; + savept = NULL; + } + + /* Send a rollback signal to the transaction */ + + trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr, + savept, NULL); + + thr->state = QUE_THR_SIG_REPLY_WAIT; + + mutex_exit(&kernel_mutex); + + return(NULL); + } + + ut_ad(node->state == ROLL_NODE_WAIT); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.c new file mode 100644 index 00000000000..db5efd65eb3 --- /dev/null +++ b/storage/xtradb/trx/trx0rseg.c @@ -0,0 +1,282 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Rollback segment + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0rseg.h" + +#ifdef UNIV_NONINL +#include "trx0rseg.ic" +#endif + +#include "trx0undo.h" +#include "fut0lst.h" +#include "srv0srv.h" +#include "trx0purge.h" + +/********************************************************************** +Looks for a rollback segment, based on the rollback segment id. */ +UNIV_INTERN +trx_rseg_t* +trx_rseg_get_on_id( +/*===============*/ + /* out: rollback segment */ + ulint id) /* in: rollback segment id */ +{ + trx_rseg_t* rseg; + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + ut_ad(rseg); + + while (rseg->id != id) { + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + ut_ad(rseg); + } + + return(rseg); +} + +/******************************************************************** +Creates a rollback segment header. This function is called only when +a new rollback segment is created in the database. 
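trx_rseg_get_on_id above is a plain linear search over the rseg list, asserting that the id exists; the same idea over an array, returning NULL instead of tripping an assertion, as a sketch:

#include <stddef.h>

struct rseg { unsigned long id; };

static struct rseg*
rseg_get_on_id(struct rseg* rsegs, size_t n, unsigned long id)
{
        for (size_t i = 0; i < n; i++) {
                if (rsegs[i].id == id) {
                        return &rsegs[i];
                }
        }
        return NULL;    // the real function ut_ad-asserts this cannot happen
}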
*/ +UNIV_INTERN +ulint +trx_rseg_header_create( +/*===================*/ + /* out: page number of the created segment, + FIL_NULL if fail */ + ulint space, /* in: space id */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint max_size, /* in: max size in pages */ + ulint* slot_no, /* out: rseg id == slot number in trx sys */ + mtr_t* mtr) /* in: mtr */ +{ + ulint page_no; + trx_rsegf_t* rsegf; + trx_sysf_t* sys_header; + ulint i; + buf_block_t* block; + + ut_ad(mtr); + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL), + MTR_MEMO_X_LOCK)); + sys_header = trx_sysf_get(mtr); + + *slot_no = trx_sysf_rseg_find_free(mtr); + + if (*slot_no == ULINT_UNDEFINED) { + + return(FIL_NULL); + } + + /* Allocate a new file segment for the rollback segment */ + block = fseg_create(space, 0, + TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr); + + if (block == NULL) { + /* No space left */ + + return(FIL_NULL); + } + + buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW); + + page_no = buf_block_get_page_no(block); + + /* Get the rollback segment file page */ + rsegf = trx_rsegf_get_new(space, zip_size, page_no, mtr); + + /* Initialize max size field */ + mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size, + MLOG_4BYTES, mtr); + + /* Initialize the history list */ + + mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr); + flst_init(rsegf + TRX_RSEG_HISTORY, mtr); + + /* Reset the undo log slots */ + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + + trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr); + } + + /* Add the rollback segment info to the free slot in the trx system + header */ + + trx_sysf_rseg_set_space(sys_header, *slot_no, space, mtr); + trx_sysf_rseg_set_page_no(sys_header, *slot_no, page_no, mtr); + + return(page_no); +} + +/*************************************************************************** +Creates and initializes a rollback segment object. The values for the +fields are read from the header. The object is inserted to the rseg +list of the trx system object and a pointer is inserted in the rseg +array in the trx system object. 
*/ +static +trx_rseg_t* +trx_rseg_mem_create( +/*================*/ + /* out, own: rollback segment object */ + ulint id, /* in: rollback segment id */ + ulint space, /* in: space where the segment placed */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no, /* in: page number of the segment header */ + mtr_t* mtr) /* in: mtr */ +{ + trx_rsegf_t* rseg_header; + trx_rseg_t* rseg; + trx_ulogf_t* undo_log_hdr; + fil_addr_t node_addr; + ulint sum_of_undo_sizes; + ulint len; + + ut_ad(mutex_own(&kernel_mutex)); + + rseg = mem_alloc(sizeof(trx_rseg_t)); + + rseg->id = id; + rseg->space = space; + rseg->zip_size = zip_size; + rseg->page_no = page_no; + + mutex_create(&rseg->mutex, SYNC_RSEG); + + UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg); + + trx_sys_set_nth_rseg(trx_sys, id, rseg); + + rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr); + + rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE, + MLOG_4BYTES, mtr); + + /* Initialize the undo log lists according to the rseg header */ + + sum_of_undo_sizes = trx_undo_lists_init(rseg); + + rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr) + + 1 + sum_of_undo_sizes; + + len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr); + if (len > 0) { + trx_sys->rseg_history_len += len; + + node_addr = trx_purge_get_log_from_hist( + flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr)); + rseg->last_page_no = node_addr.page; + rseg->last_offset = node_addr.boffset; + + undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size, + node_addr.page, + mtr) + node_addr.boffset; + + rseg->last_trx_no = mtr_read_dulint( + undo_log_hdr + TRX_UNDO_TRX_NO, mtr); + rseg->last_del_marks = mtr_read_ulint( + undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr); + } else { + rseg->last_page_no = FIL_NULL; + } + + return(rseg); +} + +/************************************************************************* +Creates the memory copies for rollback segments and initializes the +rseg list and array in trx_sys at a database startup. */ +UNIV_INTERN +void +trx_rseg_list_and_array_init( +/*=========================*/ + trx_sysf_t* sys_header, /* in: trx system header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint i; + ulint page_no; + ulint space; + + UT_LIST_INIT(trx_sys->rseg_list); + + trx_sys->rseg_history_len = 0; + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + + trx_sys_set_nth_rseg(trx_sys, i, NULL); + } else { + ulint zip_size; + + space = trx_sysf_rseg_get_space(sys_header, i, mtr); + + zip_size = space ? fil_space_get_zip_size(space) : 0; + + trx_rseg_mem_create(i, space, zip_size, page_no, mtr); + } + } +} + +/******************************************************************** +Creates a new rollback segment to the database. 
*/ +UNIV_INTERN +trx_rseg_t* +trx_rseg_create( +/*============*/ + /* out: the created segment object, NULL if + fail */ + ulint space, /* in: space id */ + ulint max_size, /* in: max size in pages */ + ulint* id, /* out: rseg id */ + mtr_t* mtr) /* in: mtr */ +{ + ulint flags; + ulint zip_size; + ulint page_no; + trx_rseg_t* rseg; + + mtr_x_lock(fil_space_get_latch(space, &flags), mtr); + zip_size = dict_table_flags_to_zip_size(flags); + mutex_enter(&kernel_mutex); + + page_no = trx_rseg_header_create(space, zip_size, max_size, id, mtr); + + if (page_no == FIL_NULL) { + + mutex_exit(&kernel_mutex); + return(NULL); + } + + rseg = trx_rseg_mem_create(*id, space, zip_size, page_no, mtr); + + mutex_exit(&kernel_mutex); + + return(rseg); +} diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c new file mode 100644 index 00000000000..b80a50738c0 --- /dev/null +++ b/storage/xtradb/trx/trx0sys.c @@ -0,0 +1,1391 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction system + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0sys.h" + +#ifdef UNIV_NONINL +#include "trx0sys.ic" +#endif + +#include "fsp0fsp.h" +#include "mtr0mtr.h" +#include "trx0trx.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "srv0srv.h" +#include "trx0purge.h" +#include "log0log.h" +#include "os0file.h" + +/* The file format tag structure with id and name. */ +struct file_format_struct { + ulint id; /* id of the file format */ + const char* name; /* text representation of the + file format */ + mutex_t mutex; /* covers changes to the above + fields */ +}; + +typedef struct file_format_struct file_format_t; + +/* The transaction system */ +UNIV_INTERN trx_sys_t* trx_sys = NULL; +UNIV_INTERN trx_doublewrite_t* trx_doublewrite = NULL; + +/* The following is set to TRUE when we are upgrading from the old format data +files to the new >= 4.1.x format multiple tablespaces format data files */ + +UNIV_INTERN ibool trx_doublewrite_must_reset_space_ids = FALSE; + +/* The following is TRUE when we are using the database in the new format, +i.e., we have successfully upgraded, or have created a new database +installation */ + +UNIV_INTERN ibool trx_sys_multiple_tablespace_format = FALSE; + +/* In a MySQL replication slave, in crash recovery we store the master log +file name and position here. We have successfully got the updates to InnoDB +up to this position. If .._pos is -1, it means no crash recovery was needed, +or there was no master log position info inside InnoDB. 
*/ + +UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN]; +UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1; + +UNIV_INTERN char trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN]; +UNIV_INTERN ib_int64_t trx_sys_mysql_relay_log_pos = -1; + +/* If this MySQL server uses binary logging, after InnoDB has been initialized +and if it has done a crash recovery, we store the binlog file name and position +here. If .._pos is -1, it means there was no binlog position info inside +InnoDB. */ + +UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN]; +UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1; + +/* List of animal names representing file format. */ +static const char* file_format_name_map[] = { + "Antelope", + "Barracuda", + "Cheetah", + "Dragon", + "Elk", + "Fox", + "Gazelle", + "Hornet", + "Impala", + "Jaguar", + "Kangaroo", + "Leopard", + "Moose", + "Nautilus", + "Ocelot", + "Porpoise", + "Quail", + "Rabbit", + "Shark", + "Tiger", + "Urchin", + "Viper", + "Whale", + "Xenops", + "Yak", + "Zebra" +}; + +/* The number of elements in the file format name array. */ +static const ulint FILE_FORMAT_NAME_N + = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]); + +/* This is used to track the maximum file format id known to InnoDB. It's +updated via SET GLOBAL innodb_file_format_check = 'x' or when we open +or create a table. */ +static file_format_t file_format_max; + +/******************************************************************** +Determines if a page number is located inside the doublewrite buffer. */ +UNIV_INTERN +ibool +trx_doublewrite_page_inside( +/*========================*/ + /* out: TRUE if the location is inside + the two blocks of the doublewrite buffer */ + ulint page_no) /* in: page number */ +{ + if (trx_doublewrite == NULL) { + + return(FALSE); + } + + if (page_no >= trx_doublewrite->block1 + && page_no < trx_doublewrite->block1 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + if (page_no >= trx_doublewrite->block2 + && page_no < trx_doublewrite->block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************** +Creates or initializes the doublewrite buffer at a database start. */ +static +void +trx_doublewrite_init( +/*=================*/ + byte* doublewrite) /* in: pointer to the doublewrite buf + header on trx sys page */ +{ + trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); + + /* Since we now start to use the doublewrite buffer, no need to call + fsync() after every write to a data file */ +#ifdef UNIV_DO_FLUSH + os_do_not_call_flush_at_each_write = TRUE; +#endif /* UNIV_DO_FLUSH */ + + mutex_create(&trx_doublewrite->mutex, SYNC_DOUBLEWRITE); + + trx_doublewrite->first_free = 0; + + trx_doublewrite->block1 = mach_read_from_4( + doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1); + trx_doublewrite->block2 = mach_read_from_4( + doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2); + trx_doublewrite->write_buf_unaligned = ut_malloc( + (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE); + + trx_doublewrite->write_buf = ut_align( + trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE); + trx_doublewrite->buf_block_arr = mem_alloc( + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*)); +} + +/******************************************************************** +Marks the trx sys header when we have successfully upgraded to the >= 4.1.x +multiple tablespace format.
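trx_doublewrite_page_inside above is just two half-open range checks against the doublewrite blocks; stripped of the trx_doublewrite struct it reduces to the sketch below, where DW_BLOCK_SIZE is a hypothetical stand-in for TRX_SYS_DOUBLEWRITE_BLOCK_SIZE:

#define DW_BLOCK_SIZE 64        // stand-in constant, not the real value

static int
page_inside_doublewrite(unsigned long page_no,
                        unsigned long block1, unsigned long block2)
{
        return (page_no >= block1 && page_no < block1 + DW_BLOCK_SIZE)
            || (page_no >= block2 && page_no < block2 + DW_BLOCK_SIZE);
}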
*/ +UNIV_INTERN +void +trx_sys_mark_upgraded_to_multiple_tablespaces(void) +/*===============================================*/ +{ + buf_block_t* block; + byte* doublewrite; + mtr_t mtr; + + /* We upgraded to 4.1.x and reset the space id fields in the + doublewrite buffer. Let us mark to the trx_sys header that the upgrade + has been done. */ + + mtr_start(&mtr); + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + trx_sys_multiple_tablespace_format = TRUE; +} + +/******************************************************************** +Creates the doublewrite buffer to a new InnoDB installation. The header of the +doublewrite buffer is placed on the trx system header page. */ +UNIV_INTERN +void +trx_sys_create_doublewrite_buf(void) +/*================================*/ +{ + buf_block_t* block; + buf_block_t* block2; + buf_block_t* new_block; + byte* doublewrite; + byte* fseg_header; + ulint page_no; + ulint prev_page_no; + ulint i; + mtr_t mtr; + + if (trx_doublewrite) { + /* Already inited */ + + return; + } + +start_again: + mtr_start(&mtr); + + block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); + + doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has already been created: + just read in some numbers */ + + trx_doublewrite_init(doublewrite); + + mtr_commit(&mtr); + } else { + fprintf(stderr, + "InnoDB: Doublewrite buffer not found:" + " creating new\n"); + + if (buf_pool_get_curr_size() + < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2 + 100) + * UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer:" + " you must\n" + "InnoDB: increase your buffer pool size.\n" + "InnoDB: Cannot continue operation.\n"); + + exit(1); + } + + block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_FSEG, &mtr); + + /* fseg_create acquires a second latch on the page, + therefore we must declare it: */ + + buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK); + + if (block2 == NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite buffer:" + " you must\n" + "InnoDB: increase your tablespace size.\n" + "InnoDB: Cannot continue operation.\n"); + + /* We exit without committing the mtr to prevent + its modifications to the database getting to disk */ + + exit(1); + } + + fseg_header = buf_block_get_frame(block) + + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG; + prev_page_no = 0; + + for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + + FSP_EXTENT_SIZE / 2; i++) { + page_no = fseg_alloc_free_page(fseg_header, + prev_page_no + 1, + FSP_UP, &mtr); + if (page_no == FIL_NULL) { + fprintf(stderr, + "InnoDB: Cannot create doublewrite" + " buffer: you must\n" + "InnoDB: increase your" + " tablespace size.\n" + "InnoDB: Cannot continue operation.\n" + ); + + exit(1); + } + + /* We read the allocated pages to the buffer pool; + when they are written to disk in a flush, the space + id and 
page number fields are also written to the + pages. When we at database startup read pages + from the doublewrite buffer, we know that if the + space id and page number in them are the same as + the page position in the tablespace, then the page + has not been written to in doublewrite. */ + + new_block = buf_page_get(TRX_SYS_SPACE, 0, page_no, + RW_X_LATCH, &mtr); + buf_block_dbg_add_level(new_block, + SYNC_NO_ORDER_CHECK); + + /* Make a dummy change to the page to ensure it will + be written to disk in a flush */ + + mlog_write_ulint(buf_block_get_frame(new_block) + + FIL_PAGE_DATA, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + if (i == FSP_EXTENT_SIZE / 2) { + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1, + page_no, MLOG_4BYTES, &mtr); + } else if (i == FSP_EXTENT_SIZE / 2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2, + page_no, MLOG_4BYTES, &mtr); + } else if (i > FSP_EXTENT_SIZE / 2) { + ut_a(page_no == prev_page_no + 1); + } + + prev_page_no = page_no; + } + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT, + TRX_SYS_DOUBLEWRITE_MAGIC_N, + MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + fprintf(stderr, "InnoDB: Doublewrite buffer created\n"); + + trx_sys_multiple_tablespace_format = TRUE; + + goto start_again; + } +} + +/******************************************************************** +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function uses a possible doublewrite buffer to restore +half-written pages in the data files. 
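The per-page recovery decision made by the function below can be sketched over a toy in-memory tablespace; is_corrupt stands in for buf_page_is_corrupted and the byte-0 magic stands in for a real page checksum. Assumed names throughout, not the patch's API:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 16                    // toy page; InnoDB pages are 16KB

static unsigned char data_file[4][PAGE_SIZE];   // models the tablespace

// Toy integrity check: a "valid" page carries the magic 0x5a in byte 0.
static int is_corrupt(const unsigned char* page) { return page[0] != 0x5a; }

static void
restore_if_torn(unsigned long page_no, const unsigned char* dw_copy)
{
        unsigned char* page = data_file[page_no];

        if (!is_corrupt(page)) {
                return;                 // data file copy intact: nothing to do
        }
        if (is_corrupt(dw_copy)) {
                // Both copies bad: the real code dumps both pages and exits.
                fprintf(stderr, "page %lu unrecoverable\n", page_no);
                return;
        }
        // Overwrite the torn data file page with the good doublewrite copy.
        memcpy(page, dw_copy, PAGE_SIZE);
}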
*/ +UNIV_INTERN +void +trx_sys_doublewrite_init_or_restore_pages( +/*======================================*/ + ibool restore_corrupt_pages) +{ + byte* buf; + byte* read_buf; + byte* unaligned_read_buf; + ulint block1; + ulint block2; + ulint source_page_no; + byte* page; + byte* doublewrite; + ulint space_id; + ulint page_no; + ulint i; + + /* We do the file i/o past the buffer pool */ + + unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); + read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE); + + /* Read the trx sys header to check if we are using the doublewrite + buffer */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, + UNIV_PAGE_SIZE, read_buf, NULL); + doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) + == TRX_SYS_DOUBLEWRITE_MAGIC_N) { + /* The doublewrite buffer has been created */ + + trx_doublewrite_init(doublewrite); + + block1 = trx_doublewrite->block1; + block2 = trx_doublewrite->block2; + + buf = trx_doublewrite->write_buf; + } else { + goto leave_func; + } + + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + + /* We are upgrading from a version < 4.1.x to a version where + multiple tablespaces are supported. We must reset the space id + field in the pages in the doublewrite buffer because starting + from this version the space id is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */ + + trx_doublewrite_must_reset_space_ids = TRUE; + + fprintf(stderr, + "InnoDB: Resetting space id's in the" + " doublewrite buffer\n"); + } else { + trx_sys_multiple_tablespace_format = TRUE; + } + + /* Read the pages from the doublewrite buffer to memory */ + + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf, NULL); + fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + NULL); + /* Check if any of these pages is half-written in data files, in the + intended position */ + + page = buf; + + for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { + + page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); + + if (trx_doublewrite_must_reset_space_ids) { + + space_id = 0; + mach_write_to_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0); + /* We do not need to calculate new checksums for the + pages because the field .._SPACE_ID does not affect + them. Write the page back to where we read it from. 
*/ + + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + source_page_no = block1 + i; + } else { + source_page_no = block2 + + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + /* printf("Resetting space id in page %lu\n", + source_page_no); */ + } else { + space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + } + + if (!restore_corrupt_pages) { + /* The database was shut down gracefully: no need to + restore pages */ + + } else if (!fil_tablespace_exists_in_mem(space_id)) { + /* Maybe we have dropped the single-table tablespace + and this page once belonged to it: do nothing */ + + } else if (!fil_check_adress_in_tablespace(space_id, + page_no)) { + fprintf(stderr, + "InnoDB: Warning: a page in the" + " doublewrite buffer is not within space\n" + "InnoDB: bounds; space id %lu" + " page number %lu, page %lu in" + " doublewrite buf.\n", + (ulong) space_id, (ulong) page_no, (ulong) i); + + } else if (space_id == TRX_SYS_SPACE + && ((page_no >= block1 + && page_no + < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) + || (page_no >= block2 + && page_no + < (block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) { + + /* It is an unwritten doublewrite buffer page: + do nothing */ + } else { + ulint zip_size = fil_space_get_zip_size(space_id); + + /* Read in the actual page from the file */ + fil_io(OS_FILE_READ, TRUE, space_id, zip_size, + page_no, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + read_buf, NULL); + + /* Check if the page is corrupt */ + + if (UNIV_UNLIKELY + (buf_page_is_corrupted(read_buf, zip_size))) { + + fprintf(stderr, + "InnoDB: Warning: database page" + " corruption or a failed\n" + "InnoDB: file read of" + " space %lu page %lu.\n" + "InnoDB: Trying to recover it from" + " the doublewrite buffer.\n", + (ulong) space_id, (ulong) page_no); + + if (buf_page_is_corrupted(page, zip_size)) { + fprintf(stderr, + "InnoDB: Dump of the page:\n"); + buf_page_print(read_buf, zip_size); + fprintf(stderr, + "InnoDB: Dump of" + " corresponding page" + " in doublewrite buffer:\n"); + buf_page_print(page, zip_size); + + fprintf(stderr, + "InnoDB: Also the page in the" + " doublewrite buffer" + " is corrupt.\n" + "InnoDB: Cannot continue" + " operation.\n" + "InnoDB: You can try to" + " recover the database" + " with the my.cnf\n" + "InnoDB: option:\n" + "InnoDB: set-variable=" + "innodb_force_recovery=6\n"); + exit(1); + } + + /* Write the good page from the + doublewrite buffer to the intended + position */ + + fil_io(OS_FILE_WRITE, TRUE, space_id, + zip_size, page_no, 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + page, NULL); + fprintf(stderr, + "InnoDB: Recovered the page from" + " the doublewrite buffer.\n"); + } + } + + page += UNIV_PAGE_SIZE; + } + + fil_flush_file_spaces(FIL_TABLESPACE); + +leave_func: + ut_free(unaligned_read_buf); +} + +/******************************************************************** +Checks that trx is in the trx list. */ +UNIV_INTERN +ibool +trx_in_trx_list( +/*============*/ + /* out: TRUE if is in */ + trx_t* in_trx) /* in: trx */ +{ + trx_t* trx; + + ut_ad(mutex_own(&(kernel_mutex))); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx != NULL) { + + if (trx == in_trx) { + + return(TRUE); + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + return(FALSE); +} + +/********************************************************************* +Writes the value of max_trx_id to the file based trx system header. 
*/ +UNIV_INTERN +void +trx_sys_flush_max_trx_id(void) +/*==========================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + ut_ad(mutex_own(&kernel_mutex)); + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE, + trx_sys->max_trx_id, &mtr); + mtr_commit(&mtr); +} + +/********************************************************************* +Updates the offset information about the end of the MySQL binlog entry +which corresponds to the transaction just being committed. In a MySQL +replication slave updates the latest master binlog position up to which +replication has proceeded. */ +UNIV_INTERN +void +trx_sys_update_mysql_binlog_offset( +/*===============================*/ + const char* file_name_in,/* in: MySQL log file name */ + ib_int64_t offset, /* in: position in that log file */ + ulint field, /* in: offset of the MySQL log info field in + the trx sys header */ + mtr_t* mtr) /* in: mtr */ +{ + trx_sysf_t* sys_header; + const char* file_name; + + if (ut_strlen(file_name_in) >= TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN) { + + /* We cannot fit the name to the 512 bytes we have reserved */ + /* -> To store relay log file information, file_name must fit to the 480 bytes */ + + file_name = ""; + } + else { + file_name = file_name_in; + } + + sys_header = trx_sysf_get(mtr); + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD, + TRX_SYS_MYSQL_LOG_MAGIC_N, + MLOG_4BYTES, mtr); + } + + if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME), + file_name)) { + + mlog_write_string(sys_header + field + + TRX_SYS_MYSQL_LOG_NAME, + (byte*) file_name, 1 + ut_strlen(file_name), + mtr); + } + + if (mach_read_from_4(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0 + || (offset >> 32) > 0) { + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH, + (ulint)(offset >> 32), + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + field + + TRX_SYS_MYSQL_LOG_OFFSET_LOW, + (ulint)(offset & 0xFFFFFFFFUL), + MLOG_4BYTES, mtr); +} + +#ifdef UNIV_HOTBACKUP +/********************************************************************* +Prints to stderr the MySQL binlog info in the system header if the +magic number shows it valid. 
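The binlog offset handled by trx_sys_update_mysql_binlog_offset above is persisted as two 32-bit halves (the ..._OFFSET_HIGH and ..._OFFSET_LOW fields) and reassembled on read. A self-contained rendering of that split, matching the shifts and mask used in the code:

#include <stdint.h>
#include <assert.h>

static void
split_offset(int64_t offset, uint32_t* high, uint32_t* low)
{
        *high = (uint32_t) (offset >> 32);
        *low  = (uint32_t) (offset & 0xFFFFFFFFUL);
}

static int64_t
join_offset(uint32_t high, uint32_t low)
{
        return ((int64_t) high << 32) + (int64_t) low;
}

int main(void)
{
        uint32_t hi, lo;

        split_offset(107374182400LL, &hi, &lo); // 100 GiB into a binlog file
        assert(join_offset(hi, lo) == 107374182400LL);
        return 0;
}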
*/ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset_from_page( +/*========================================*/ + const byte* page) /* in: buffer containing the trx + system header page, i.e., page number + TRX_SYS_PAGE_NO in the tablespace */ +{ + const trx_sysf_t* sys_header; + + sys_header = page + TRX_SYS; + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + == TRX_SYS_MYSQL_LOG_MAGIC_N) { + + fprintf(stderr, + "ibbackup: Last MySQL binlog file position %lu %lu," + " file name %s\n", + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + } +} +#endif /* UNIV_HOTBACKUP */ + +/********************************************************************* +Stores the MySQL binlog offset info in the trx system header if +the magic number shows it valid, and print the info to stderr */ +UNIV_INTERN +void +trx_sys_print_mysql_binlog_offset(void) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint trx_sys_mysql_bin_log_pos_high; + ulint trx_sys_mysql_bin_log_pos_low; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + trx_sys_mysql_bin_log_pos_high = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH); + trx_sys_mysql_bin_log_pos_low = mach_read_from_4( + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW); + + trx_sys_mysql_bin_log_pos + = (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32) + + (ib_int64_t)trx_sys_mysql_bin_log_pos_low; + + ut_memcpy(trx_sys_mysql_bin_log_name, + sys_header + TRX_SYS_MYSQL_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN); + + fprintf(stderr, + "InnoDB: Last MySQL binlog file position %lu %lu," + " file name %s\n", + trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low, + trx_sys_mysql_bin_log_name); + + mtr_commit(&mtr); +} + +/********************************************************************* +Prints to stderr the MySQL master log offset info in the trx system header if +the magic number shows it valid. 
*/ +UNIV_INTERN +void +trx_sys_print_mysql_master_log_pos(void) +/*====================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) + != TRX_SYS_MYSQL_LOG_MAGIC_N) { + + mtr_commit(&mtr); + + return; + } + + fprintf(stderr, + "InnoDB: In a MySQL replication slave the last" + " master binlog file\n" + "InnoDB: position %lu %lu, file name %s\n", + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + + fprintf(stderr, + "InnoDB: and relay log file\n" + "InnoDB: position %lu %lu, file name %s\n", + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_RELAY_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH), + (ulong) mach_read_from_4(sys_header + + TRX_SYS_MYSQL_RELAY_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW), + sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME); + + /* Copy the master log position info to global variables we can + use in ha_innobase.cc to initialize glob_mi to right values */ + + ut_memcpy(trx_sys_mysql_master_log_name, + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, + TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN); + + trx_sys_mysql_master_log_pos + = (((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32) + + ((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW)); + + ut_memcpy(trx_sys_mysql_relay_log_name, + sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO + + TRX_SYS_MYSQL_LOG_NAME, + TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN); + + trx_sys_mysql_relay_log_pos + = (((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32) + + ((ib_int64_t) mach_read_from_4( + sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO + + TRX_SYS_MYSQL_LOG_OFFSET_LOW)); + mtr_commit(&mtr); +} + +/******************************************************************** +Looks for a free slot for a rollback segment in the trx system file copy. */ +UNIV_INTERN +ulint +trx_sysf_rseg_find_free( +/*====================*/ + /* out: slot index or ULINT_UNDEFINED if not found */ + mtr_t* mtr) /* in: mtr */ +{ + trx_sysf_t* sys_header; + ulint page_no; + ulint i; + + ut_ad(mutex_own(&(kernel_mutex))); + + sys_header = trx_sysf_get(mtr); + + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr); + + if (page_no == FIL_NULL) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/********************************************************************* +Creates the file page for the transaction system. This function is called only +at the database creation, before trx_sys_init. */ +static +void +trx_sysf_create( +/*============*/ + mtr_t* mtr) /* in: mtr */ +{ + trx_sysf_t* sys_header; + ulint slot_no; + buf_block_t* block; + page_t* page; + ulint page_no; + ulint i; + + ut_ad(mtr); + + /* Note that below we first reserve the file space x-latch, and + then enter the kernel: we must do it in this order to conform + to the latching order rules. 
*/ + + mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr); + mutex_enter(&kernel_mutex); + + /* Create the trx sys file block in a newly allocated file segment */ + block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER, + mtr); + buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); + + ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO); + + page = buf_block_get_frame(block); + + mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS, + MLOG_2BYTES, mtr); + + /* Reset the doublewrite buffer magic number to zero so that we + know that the doublewrite buffer has not yet been created (this + suppresses a Valgrind warning) */ + + mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE + + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr); + + sys_header = trx_sysf_get(mtr); + + /* Start counting transaction ids from number 1 up */ + mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE, + ut_dulint_create(0, 1), mtr); + + /* Reset the rollback segment slots */ + for (i = 0; i < TRX_SYS_N_RSEGS; i++) { + + trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr); + trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr); + } + + /* The remaining area (up to the page trailer) is uninitialized. + Silence Valgrind warnings about it. */ + UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE), + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + - (TRX_SYS_RSEGS + + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE + + TRX_SYS_RSEG_SPACE)) + + page - sys_header); + + /* Create the first rollback segment in the SYSTEM tablespace */ + page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, &slot_no, + mtr); + ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); + ut_a(page_no != FIL_NULL); + + mutex_exit(&kernel_mutex); +} + +/********************************************************************* +Creates and initializes the central memory structures for the transaction +system. This is called when the database is started. */ +UNIV_INTERN +void +trx_sys_init_at_db_start(void) +/*==========================*/ +{ + trx_sysf_t* sys_header; + ib_int64_t rows_to_undo = 0; + const char* unit = ""; + trx_t* trx; + mtr_t mtr; + + mtr_start(&mtr); + + ut_ad(trx_sys == NULL); + + mutex_enter(&kernel_mutex); + + trx_sys = mem_alloc(sizeof(trx_sys_t)); + + sys_header = trx_sysf_get(&mtr); + + trx_rseg_list_and_array_init(sys_header, &mtr); + + trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + /* VERY important: after the database is started, max_trx_id value is + divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in + trx_sys_get_new_trx_id will evaluate to TRUE when the function + is called for the first time, and the value for trx id will be written + to the disk-based header! Thus trx id values will not overlap when + the database is repeatedly started!
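+For example, if the stored counter value is S and the margin is M +(TRX_SYS_TRX_ID_WRITE_MARGIN), the in-memory counter below restarts at +align_up(S, M) + 2*M, which is guaranteed to exceed any id handed out +before the previous shutdown or crash.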
*/ + + trx_sys->max_trx_id = ut_dulint_add( + ut_dulint_align_up(mtr_read_dulint( + sys_header + + TRX_SYS_TRX_ID_STORE, &mtr), + TRX_SYS_TRX_ID_WRITE_MARGIN), + 2 * TRX_SYS_TRX_ID_WRITE_MARGIN); + + UT_LIST_INIT(trx_sys->mysql_trx_list); + trx_dummy_sess = sess_open(); + trx_lists_init_at_db_start(); + + if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) { + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + for (;;) { + + if (trx->conc_state != TRX_PREPARED) { + rows_to_undo += ut_conv_dulint_to_longlong( + trx->undo_no); + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + + if (!trx) { + break; + } + } + + if (rows_to_undo > 1000000000) { + unit = "M"; + rows_to_undo = rows_to_undo / 1000000; + } + + fprintf(stderr, + "InnoDB: %lu transaction(s) which must be" + " rolled back or cleaned up\n" + "InnoDB: in total %lu%s row operations to undo\n", + (ulong) UT_LIST_GET_LEN(trx_sys->trx_list), + (ulong) rows_to_undo, unit); + + fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n", + TRX_ID_PREP_PRINTF(trx_sys->max_trx_id)); + } + + UT_LIST_INIT(trx_sys->view_list); + + trx_purge_sys_create(); + + mutex_exit(&kernel_mutex); + + mtr_commit(&mtr); +} + +/********************************************************************* +Creates and initializes the transaction system at the database creation. */ +UNIV_INTERN +void +trx_sys_create(void) +/*================*/ +{ + mtr_t mtr; + + mtr_start(&mtr); + + trx_sysf_create(&mtr); + + mtr_commit(&mtr); + + trx_sys_init_at_db_start(); +} + +/********************************************************************* +Creates extra rollback segments when a new database is created +(create_new_db). */ +UNIV_INTERN +void +trx_sys_create_extra_rseg( +/*======================*/ + ulint num) /* in: number of extra user rollback segments */ +{ + mtr_t mtr; + ulint slot_no; + ulint i; + + /* Create the extra rollback segments */ + mtr_start(&mtr); + for (i = 1; i < num + 1; i++) { + if (!trx_rseg_create(TRX_SYS_SPACE, ULINT_MAX, &slot_no, &mtr)) { + fprintf(stderr, +"InnoDB: Warning: Failed to create extra rollback segments.\n"); + break; + } + ut_a(slot_no == i); + } + mtr_commit(&mtr); +} + +/********************************************************************* +Update the file format tag. */ +static +ibool +trx_sys_file_format_max_write( +/*==========================*/ + /* out: always TRUE */ + ulint format_id, /* in: file format id */ + const char** name) /* out: max file format name, can + be NULL */ +{ + mtr_t mtr; + byte* ptr; + buf_block_t* block; + ulint tag_value_low; + + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + tag_value_low = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW; + + if (name) { + *name = file_format_max.name; + } + + mlog_write_dulint( + ptr, + ut_dulint_create(TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH, + tag_value_low), + &mtr); + + mtr_commit(&mtr); + + return(TRUE); +} + +/********************************************************************* +Read the file format tag. */ +static +ulint +trx_sys_file_format_max_read(void) +/*==============================*/ + /* out: the file format or + ULINT_UNDEFINED if not set. */ +{ + mtr_t mtr; + const byte* ptr; + const buf_block_t* block; + ulint format_id; + dulint file_format_id; + + /* Since this is called during the startup phase it's safe to + read the value without a covering mutex.
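+The tag is an 8-byte value: the high word must equal +TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH and the low word holds the format id +offset by TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW; any other contents mean the +tablespace has never been tagged.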
*/ + mtr_start(&mtr); + + block = buf_page_get( + TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + + ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG; + file_format_id = mach_read_from_8(ptr); + + mtr_commit(&mtr); + + format_id = file_format_id.low - TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW; + + if (file_format_id.high != TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH + || format_id >= FILE_FORMAT_NAME_N) { + + /* Either it has never been tagged, or garbage in it. */ + return(ULINT_UNDEFINED); + } + + return(format_id); +} + +/********************************************************************* +Get the name representation of the file format from its id. */ +UNIV_INTERN +const char* +trx_sys_file_format_id_to_name( +/*===========================*/ + /* out: pointer to the name */ + const ulint id) /* in: id of the file format */ +{ + ut_a(id < FILE_FORMAT_NAME_N); + + return(file_format_name_map[id]); +} + +/********************************************************************* +Check for the max file format tag stored on disk. Note: If max_format_id +is == DICT_TF_FORMAT_MAX + 1 then we only print a warning. */ +UNIV_INTERN +ulint +trx_sys_file_format_max_check( +/*==========================*/ + /* out: DB_SUCCESS or error code */ + ulint max_format_id) /* in: max format id to check */ +{ + ulint format_id; + + /* Check the file format in the tablespace. Do not try to + recover if the file format is not supported by the engine + unless forced by the user. */ + format_id = trx_sys_file_format_max_read(); + if (format_id == ULINT_UNDEFINED) { + /* Format ID was not set. Set it to minimum possible + value. */ + format_id = DICT_TF_FORMAT_51; + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: highest supported file format is %s.\n", + trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX)); + + if (format_id > DICT_TF_FORMAT_MAX) { + + ut_a(format_id < FILE_FORMAT_NAME_N); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: %s: the system tablespace is in a file " + "format that this version doesn't support - %s\n", + ((max_format_id <= DICT_TF_FORMAT_MAX) + ? "Error" : "Warning"), + trx_sys_file_format_id_to_name(format_id)); + + if (max_format_id <= DICT_TF_FORMAT_MAX) { + return(DB_ERROR); + } + } + + format_id = (format_id > max_format_id) ? format_id : max_format_id; + + /* We don't need a mutex here, as this function should only + be called once at start up. */ + file_format_max.id = format_id; + file_format_max.name = trx_sys_file_format_id_to_name(format_id); + + return(DB_SUCCESS); +} + +/********************************************************************* +Set the file format id unconditionally except if it's already the +same value. */ +UNIV_INTERN +ibool +trx_sys_file_format_max_set( +/*========================*/ + /* out: TRUE if value updated */ + ulint format_id, /* in: file format id */ + const char** name) /* out: max file format name or + NULL if not needed. */ +{ + ibool ret = FALSE; + + ut_a(format_id <= DICT_TF_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + /* Only update if not already same value. */ + if (format_id != file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/************************************************************************ +Tags the system table space with minimum format id if it has not been +tagged yet. 
+WARNING: This function is only called during the startup and AFTER the +redo log application during recovery has finished. */ +UNIV_INTERN +void +trx_sys_file_format_tag_init(void) +/*==============================*/ +{ + ulint format_id; + + format_id = trx_sys_file_format_max_read(); + + /* If format_id is not set then set it to the minimum. */ + if (format_id == ULINT_UNDEFINED) { + trx_sys_file_format_max_set(DICT_TF_FORMAT_51, NULL); + } +} + +/************************************************************************ +Update the file format tag in the system tablespace only if the given +format id is greater than the known max id. */ +UNIV_INTERN +ibool +trx_sys_file_format_max_upgrade( +/*============================*/ + /* out: TRUE if format_id was + bigger than the known max id */ + const char** name, /* out: max file format name */ + ulint format_id) /* in: file format identifier */ +{ + ibool ret = FALSE; + + ut_a(name); + ut_a(file_format_max.name != NULL); + ut_a(format_id <= DICT_TF_FORMAT_MAX); + + mutex_enter(&file_format_max.mutex); + + if (format_id > file_format_max.id) { + + ret = trx_sys_file_format_max_write(format_id, name); + } + + mutex_exit(&file_format_max.mutex); + + return(ret); +} + +/********************************************************************* +Get the name representation of the file format from its id. */ +UNIV_INTERN +const char* +trx_sys_file_format_max_get(void) +/*=============================*/ + /* out: pointer to the max format name */ +{ + return(file_format_max.name); +} + +/********************************************************************* +Initializes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_init(void) +/*==========================*/ +{ + mutex_create(&file_format_max.mutex, SYNC_FILE_FORMAT_TAG); + + /* We don't need a mutex here, as this function should only + be called once at start up. */ + file_format_max.id = DICT_TF_FORMAT_51; + + file_format_max.name = trx_sys_file_format_id_to_name( + file_format_max.id); +} + +/********************************************************************* +Closes the tablespace tag system. */ +UNIV_INTERN +void +trx_sys_file_format_close(void) +/*===========================*/ +{ + /* Does nothing at the moment */ +} diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c new file mode 100644 index 00000000000..5fb234e3aa7 --- /dev/null +++ b/storage/xtradb/trx/trx0trx.c @@ -0,0 +1,2082 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0trx.h" + +#ifdef UNIV_NONINL +#include "trx0trx.ic" +#endif + +#include "trx0undo.h" +#include "trx0rseg.h" +#include "log0log.h" +#include "que0que.h" +#include "lock0lock.h" +#include "trx0roll.h" +#include "usr0sess.h" +#include "read0read.h" +#include "srv0srv.h" +#include "thr0loc.h" +#include "btr0sea.h" +#include "os0proc.h" +#include "trx0xa.h" +#include "ha_prototypes.h" + +/* Dummy session used currently in MySQL interface */ +UNIV_INTERN sess_t* trx_dummy_sess = NULL; + +/* Number of transactions currently allocated for MySQL: protected by +the kernel mutex */ +UNIV_INTERN ulint trx_n_mysql_transactions = 0; + +/***************************************************************** +Sets the detailed error message for the transaction. */ +UNIV_INTERN +void +trx_set_detailed_error( +/*===================*/ + trx_t* trx, /* in: transaction struct */ + const char* msg) /* in: detailed error message */ +{ + ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error)); +} + +/***************************************************************** +Sets the detailed error message for the transaction from a file. Note that the +file is rewound before reading from it. */ +UNIV_INTERN +void +trx_set_detailed_error_from_file( +/*=============================*/ + trx_t* trx, /* in: transaction struct */ + FILE* file) /* in: file to read message from */ +{ + os_file_read_string(file, trx->detailed_error, + sizeof(trx->detailed_error)); +} + +/******************************************************************** +Creates and initializes a transaction object.
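+The object is returned in the TRX_NOT_STARTED state and is not yet on any +transaction list; the caller must hold the kernel mutex.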
*/ +UNIV_INTERN +trx_t* +trx_create( +/*=======*/ + /* out, own: the transaction */ + sess_t* sess) /* in: session */ +{ + trx_t* trx; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(sess); + + trx = mem_alloc(sizeof(trx_t)); + + trx->magic_n = TRX_MAGIC_N; + + trx->op_info = ""; + + trx->is_purge = 0; + trx->is_recovered = 0; + trx->conc_state = TRX_NOT_STARTED; + trx->start_time = time(NULL); + + trx->isolation_level = TRX_ISO_REPEATABLE_READ; + + trx->id = ut_dulint_zero; + trx->no = ut_dulint_max; + + trx->support_xa = TRUE; + + trx->check_foreigns = TRUE; + trx->check_unique_secondary = TRUE; + + trx->flush_log_later = FALSE; + trx->must_flush_log_later = FALSE; + + trx->dict_operation = TRX_DICT_OP_NONE; + trx->table_id = ut_dulint_zero; + + trx->mysql_thd = NULL; + trx->mysql_query_str = NULL; + trx->active_trans = 0; + trx->duplicates = 0; + + trx->n_mysql_tables_in_use = 0; + trx->mysql_n_tables_locked = 0; + + trx->mysql_log_file_name = NULL; + trx->mysql_log_offset = 0; + trx->mysql_master_log_file_name = ""; + trx->mysql_master_log_pos = 0; + trx->mysql_relay_log_file_name = ""; + trx->mysql_relay_log_pos = 0; + + mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO); + + trx->rseg = NULL; + + trx->undo_no = ut_dulint_zero; + trx->last_sql_stat_start.least_undo_no = ut_dulint_zero; + trx->insert_undo = NULL; + trx->update_undo = NULL; + trx->undo_no_arr = NULL; + + trx->error_state = DB_SUCCESS; + trx->error_key_num = 0; + trx->detailed_error[0] = '\0'; + + trx->sess = sess; + trx->que_state = TRX_QUE_RUNNING; + trx->n_active_thrs = 0; + + trx->handling_signals = FALSE; + + UT_LIST_INIT(trx->signals); + UT_LIST_INIT(trx->reply_signals); + + trx->graph = NULL; + + trx->wait_lock = NULL; + trx->was_chosen_as_deadlock_victim = FALSE; + UT_LIST_INIT(trx->wait_thrs); + + trx->lock_heap = mem_heap_create_in_buffer(256); + UT_LIST_INIT(trx->trx_locks); + + UT_LIST_INIT(trx->trx_savepoints); + + trx->dict_operation_lock_mode = 0; + trx->has_search_latch = FALSE; + trx->search_latch_timeout = BTR_SEA_TIMEOUT; + + trx->declared_to_be_inside_innodb = FALSE; + trx->n_tickets_to_enter_innodb = 0; + + trx->global_read_view_heap = mem_heap_create(256); + trx->global_read_view = NULL; + trx->read_view = NULL; + + /* Set X/Open XA transaction identification to NULL */ + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; + + trx->n_autoinc_rows = 0; + + /* Remember to free the vector explicitly. */ + trx->autoinc_locks = ib_vector_create( + mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4); + + trx_reset_new_rec_lock_info(trx); + + return(trx); +} + +/************************************************************************ +Creates a transaction object for MySQL. */ +UNIV_INTERN +trx_t* +trx_allocate_for_mysql(void) +/*========================*/ + /* out, own: transaction object */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + trx = trx_create(trx_dummy_sess); + + trx_n_mysql_transactions++; + + UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + mutex_exit(&kernel_mutex); + + trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); + + return(trx); +} + +/************************************************************************ +Creates a transaction object for background operations by the master thread. 
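+Unlike trx_allocate_for_mysql(), the object is not added to the +mysql_trx_list and no MySQL thread id is recorded for it.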
*/ +UNIV_INTERN +trx_t* +trx_allocate_for_background(void) +/*=============================*/ + /* out, own: transaction object */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + + trx = trx_create(trx_dummy_sess); + + mutex_exit(&kernel_mutex); + + return(trx); +} + +/************************************************************************ +Releases the search latch if trx has reserved it. */ +UNIV_INTERN +void +trx_search_latch_release_if_reserved( +/*=================================*/ + trx_t* trx) /* in: transaction */ +{ + if (trx->has_search_latch) { + rw_lock_s_unlock(&btr_search_latch); + + trx->has_search_latch = FALSE; + } +} + +/************************************************************************ +Frees a transaction object. */ +UNIV_INTERN +void +trx_free( +/*=====*/ + trx_t* trx) /* in, own: trx object */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + if (trx->declared_to_be_inside_innodb) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: Freeing a trx which is declared" + " to be processing\n" + "InnoDB: inside InnoDB.\n", stderr); + trx_print(stderr, trx, 600); + putc('\n', stderr); + + /* This is an error but not a fatal error. We must keep + the counters like srv_conc_n_threads accurate. */ + srv_conc_force_exit_innodb(trx); + } + + if (trx->n_mysql_tables_in_use != 0 + || trx->mysql_n_tables_locked != 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: MySQL is freeing a thd\n" + "InnoDB: though trx->n_mysql_tables_in_use is %lu\n" + "InnoDB: and trx->mysql_n_tables_locked is %lu.\n", + (ulong)trx->n_mysql_tables_in_use, + (ulong)trx->mysql_n_tables_locked); + + trx_print(stderr, trx, 600); + + ut_print_buf(stderr, trx, sizeof(trx_t)); + putc('\n', stderr); + } + + ut_a(trx->magic_n == TRX_MAGIC_N); + + trx->magic_n = 11112222; + + ut_a(trx->conc_state == TRX_NOT_STARTED); + + mutex_free(&(trx->undo_mutex)); + + ut_a(trx->insert_undo == NULL); + ut_a(trx->update_undo == NULL); + + if (trx->undo_no_arr) { + trx_undo_arr_free(trx->undo_no_arr); + } + + ut_a(UT_LIST_GET_LEN(trx->signals) == 0); + ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0); + + ut_a(trx->wait_lock == NULL); + ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + ut_a(!trx->has_search_latch); + + ut_a(trx->dict_operation_lock_mode == 0); + + if (trx->lock_heap) { + mem_heap_free(trx->lock_heap); + } + + ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0); + + if (trx->global_read_view_heap) { + mem_heap_free(trx->global_read_view_heap); + } + + trx->global_read_view = NULL; + + ut_a(trx->read_view == NULL); + + ut_a(ib_vector_is_empty(trx->autoinc_locks)); + /* We allocated a dedicated heap for the vector. */ + ib_vector_free(trx->autoinc_locks); + + mem_free(trx); +} + +/************************************************************************ +Frees a transaction object for MySQL. */ +UNIV_INTERN +void +trx_free_for_mysql( +/*===============*/ + trx_t* trx) /* in, own: trx object */ +{ + mutex_enter(&kernel_mutex); + + UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx); + + trx_free(trx); + + ut_a(trx_n_mysql_transactions > 0); + + trx_n_mysql_transactions--; + + mutex_exit(&kernel_mutex); +} + +/************************************************************************ +Frees a transaction object of a background operation of the master thread. 
*/ +UNIV_INTERN +void +trx_free_for_background( +/*====================*/ + trx_t* trx) /* in, own: trx object */ +{ + mutex_enter(&kernel_mutex); + + trx_free(trx); + + mutex_exit(&kernel_mutex); +} + +/******************************************************************** +Inserts the trx handle in the trx system trx list in the right position. +The list is sorted on the trx id so that the biggest id is at the list +start. This function is used at the database startup to insert incomplete +transactions to the list. */ +static +void +trx_list_insert_ordered( +/*====================*/ + trx_t* trx) /* in: trx handle */ +{ + trx_t* trx2; + + ut_ad(mutex_own(&kernel_mutex)); + + trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx2 != NULL) { + if (ut_dulint_cmp(trx->id, trx2->id) >= 0) { + + ut_ad(ut_dulint_cmp(trx->id, trx2->id) == 1); + break; + } + trx2 = UT_LIST_GET_NEXT(trx_list, trx2); + } + + if (trx2 != NULL) { + trx2 = UT_LIST_GET_PREV(trx_list, trx2); + + if (trx2 == NULL) { + UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); + } else { + UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list, + trx2, trx); + } + } else { + UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx); + } +} + +/******************************************************************** +Creates trx objects for transactions and initializes the trx list of +trx_sys at database start. Rollback segment and undo log lists must +already exist when this function is called, because the lists of +transactions to be rolled back or cleaned up are built based on the +undo log lists. */ +UNIV_INTERN +void +trx_lists_init_at_db_start(void) +/*============================*/ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + trx_t* trx; + + UT_LIST_INIT(trx_sys->trx_list); + + /* Look from the rollback segments if there exist undo logs for + transactions */ + + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + + while (rseg != NULL) { + undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); + + while (undo != NULL) { + + trx = trx_create(trx_dummy_sess); + + trx->is_recovered = TRUE; + trx->id = undo->trx_id; + trx->xid = undo->xid; + trx->insert_undo = undo; + trx->rseg = rseg; + + if (undo->state != TRX_UNDO_ACTIVE) { + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + + fprintf(stderr, + "InnoDB: Transaction " + TRX_ID_FMT + " was in the" + " XA prepared state.\n", + TRX_ID_PREP_PRINTF(trx->id)); + + if (srv_force_recovery == 0) { + + trx->conc_state = TRX_PREPARED; + } else { + fprintf(stderr, + "InnoDB: Since" + " innodb_force_recovery" + " > 0, we will" + " rollback it" + " anyway.\n"); + + trx->conc_state = TRX_ACTIVE; + } + } else { + trx->conc_state + = TRX_COMMITTED_IN_MEMORY; + } + + /* We give a dummy value for the trx no; + this should have no relevance since purge + is not interested in committed transaction + numbers, unless they are in the history + list, in which case it looks the number + from the disk based undo log structure */ + + trx->no = trx->id; + } else { + trx->conc_state = TRX_ACTIVE; + + /* A running transaction always has the number + field inited to ut_dulint_max */ + + trx->no = ut_dulint_max; + } + + if (undo->dict_operation) { + trx_set_dict_operation( + trx, TRX_DICT_OP_TABLE); + trx->table_id = undo->table_id; + } + + if (!undo->empty) { + trx->undo_no = ut_dulint_add(undo->top_undo_no, + 1); + } + + trx_list_insert_ordered(trx); + + undo = UT_LIST_GET_NEXT(undo_list, undo); + } + + undo = 
UT_LIST_GET_FIRST(rseg->update_undo_list); + + while (undo != NULL) { + trx = trx_get_on_id(undo->trx_id); + + if (NULL == trx) { + trx = trx_create(trx_dummy_sess); + + trx->is_recovered = TRUE; + trx->id = undo->trx_id; + trx->xid = undo->xid; + + if (undo->state != TRX_UNDO_ACTIVE) { + + /* Prepared transactions are left in + the prepared state waiting for a + commit or abort decision from MySQL */ + + if (undo->state == TRX_UNDO_PREPARED) { + fprintf(stderr, + "InnoDB: Transaction " + TRX_ID_FMT " was in the" + " XA prepared state.\n", + TRX_ID_PREP_PRINTF( + trx->id)); + + if (srv_force_recovery == 0) { + + trx->conc_state + = TRX_PREPARED; + } else { + fprintf(stderr, + "InnoDB: Since" + " innodb_force_recovery" + " > 0, we will" + " rollback it" + " anyway.\n"); + + trx->conc_state + = TRX_ACTIVE; + } + } else { + trx->conc_state + = TRX_COMMITTED_IN_MEMORY; + } + + /* We give a dummy value for the trx + number */ + + trx->no = trx->id; + } else { + trx->conc_state = TRX_ACTIVE; + + /* A running transaction always has + the number field inited to + ut_dulint_max */ + + trx->no = ut_dulint_max; + } + + trx->rseg = rseg; + trx_list_insert_ordered(trx); + + if (undo->dict_operation) { + trx_set_dict_operation( + trx, TRX_DICT_OP_TABLE); + trx->table_id = undo->table_id; + } + } + + trx->update_undo = undo; + + if ((!undo->empty) + && (ut_dulint_cmp(undo->top_undo_no, + trx->undo_no) >= 0)) { + + trx->undo_no = ut_dulint_add(undo->top_undo_no, + 1); + } + + undo = UT_LIST_GET_NEXT(undo_list, undo); + } + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + } +} + +/********************************************************************** +Assigns a rollback segment to a transaction in a round-robin fashion. +Skips the SYSTEM rollback segment if another is available. */ +UNIV_INLINE +ulint +trx_assign_rseg(void) +/*=================*/ + /* out: assigned rollback segment id */ +{ + trx_rseg_t* rseg = trx_sys->latest_rseg; + + ut_ad(mutex_own(&kernel_mutex)); +loop: + /* Get next rseg in a round-robin fashion */ + + rseg = UT_LIST_GET_NEXT(rseg_list, rseg); + + if (rseg == NULL) { + rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); + } + + /* If it is the SYSTEM rollback segment, and there exist others, skip + it */ + + if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID) + && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) { + goto loop; + } + + trx_sys->latest_rseg = rseg; + + return(rseg->id); +} + +/******************************************************************** +Starts a new transaction. 
*/ +UNIV_INTERN +ibool +trx_start_low( +/*==========*/ + /* out: TRUE */ + trx_t* trx, /* in: transaction */ + ulint rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +{ + trx_rseg_t* rseg; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->rseg == NULL); + + if (trx->is_purge) { + trx->id = ut_dulint_zero; + trx->conc_state = TRX_ACTIVE; + trx->start_time = time(NULL); + + return(TRUE); + } + + ut_ad(trx->conc_state != TRX_ACTIVE); + + if (rseg_id == ULINT_UNDEFINED) { + + rseg_id = trx_assign_rseg(); + } + + rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id); + + trx->id = trx_sys_get_new_trx_id(); + + /* The initial value for trx->no: ut_dulint_max is used in + read_view_open_now: */ + + trx->no = ut_dulint_max; + + trx->rseg = rseg; + + trx->conc_state = TRX_ACTIVE; + trx->start_time = time(NULL); + + UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx); + + return(TRUE); +} + +/******************************************************************** +Starts a new transaction. */ +UNIV_INTERN +ibool +trx_start( +/*======*/ + /* out: TRUE */ + trx_t* trx, /* in: transaction */ + ulint rseg_id)/* in: rollback segment id; if ULINT_UNDEFINED + is passed, the system chooses the rollback segment + automatically in a round-robin fashion */ +{ + ibool ret; + + /* Update the info on whether we should skip XA steps that eat CPU + time. For the duration of the transaction trx->support_xa is not + reread from thd so any changes in the value take effect in the next + transaction. This is to avoid a scenario where some undo + generated by a transaction has XA stuff, and other undo, + generated by the same transaction, doesn't. */ + trx->support_xa = thd_supports_xa(trx->mysql_thd); + + mutex_enter(&kernel_mutex); + + ret = trx_start_low(trx, rseg_id); + + mutex_exit(&kernel_mutex); + + return(ret); +} + +/******************************************************************** +Commits a transaction. */ +UNIV_INTERN +void +trx_commit_off_kernel( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + page_t* update_hdr_page; + ib_uint64_t lsn = 0; + trx_rseg_t* rseg; + trx_undo_t* undo; + mtr_t mtr; + + ut_ad(mutex_own(&kernel_mutex)); + + trx->must_flush_log_later = FALSE; + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to some other state: these modifications to the file data + structure define the transaction as committed in the + file-based world, at the serialization point of the log + sequence number lsn obtained below. */ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + trx_undo_set_state_at_finish( + rseg, trx, trx->insert_undo, &mtr); + } + + undo = trx->update_undo; + + if (undo) { + mutex_enter(&kernel_mutex); + trx->no = trx_sys_get_new_trx_no(); + + mutex_exit(&kernel_mutex); + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction commit for this transaction. */ + + update_hdr_page = trx_undo_set_state_at_finish( + rseg, trx, undo, &mtr); + + /* We have to do the cleanup for the update log while + holding the rseg mutex because update log headers + have to be put to the history list in the order of + the trx number.
*/ + + trx_undo_update_cleanup(trx, update_hdr_page, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + /* Update the latest MySQL binlog name and offset info + in trx sys header if MySQL binlogging is on or the database + server is a MySQL replication slave */ + + if (trx->mysql_log_file_name + && trx->mysql_log_file_name[0] != '\0') { + trx_sys_update_mysql_binlog_offset( + trx->mysql_log_file_name, + trx->mysql_log_offset, + TRX_SYS_MYSQL_LOG_INFO, &mtr); + trx->mysql_log_file_name = NULL; + } + + if (trx->mysql_master_log_file_name[0] != '\0') { + /* This database server is a MySQL replication slave */ + trx_sys_update_mysql_binlog_offset( + trx->mysql_relay_log_file_name, + trx->mysql_relay_log_pos, + TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr); + trx_sys_update_mysql_binlog_offset( + trx->mysql_master_log_file_name, + trx->mysql_master_log_pos, + TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr); + trx->mysql_master_log_file_name = ""; + } + + /* The following call commits the mini-transaction, making the + whole transaction committed in the file-based world, at this + log sequence number. The transaction becomes 'durable' when + we write the log to disk, but in the logical sense the commit + in the file-based data structures (undo logs etc.) happens + here. + + NOTE that transaction numbers, which are assigned only to + transactions with an update undo log, do not necessarily come + in exactly the same order as commit lsn's, if the transactions + have different rollback segments. To get exactly the same + order we should hold the kernel mutex up to this point, + adding to the contention of the kernel mutex. However, if + a transaction T2 is able to see modifications made by + a transaction T1, T2 will always get a bigger transaction + number and a bigger commit lsn than T1. */ + + /*--------------*/ + mtr_commit(&mtr); + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + + ut_ad(trx->conc_state == TRX_ACTIVE + || trx->conc_state == TRX_PREPARED); + ut_ad(mutex_own(&kernel_mutex)); + + /* The following assignment makes the transaction committed in memory + and makes its changes to data visible to other transactions. + NOTE that there is a small discrepancy from the strict formal + visibility rules here: a human user of the database can see + modifications made by another transaction T even before the necessary + log segment has been flushed to the disk. If the database happens to + crash before the flush, the user has seen modifications from T which + will never be a committed transaction. However, any transaction T2 + which sees the modifications of the committing transaction T, and + which also itself makes modifications to the database, will get an lsn + larger than the committing transaction T. In the case where the log + flush fails, and T never gets committed, also T2 will never get + committed. */ + + /*--------------------------------------*/ + trx->conc_state = TRX_COMMITTED_IN_MEMORY; + /*--------------------------------------*/ + + /* If we release the kernel_mutex below while we are still doing + recovery, i.e. the background rollback thread is still active, + then there is a chance that the rollback thread may see + this trx as COMMITTED_IN_MEMORY and go ahead to clean it + up by calling trx_cleanup_at_db_startup(). This can happen + when we are committing a trx here that was left in the + PREPARED state during the crash.
Note that commit of the + rollback of a PREPARED trx happens in the recovery thread + while the rollback of other transactions happens in the + background thread. To avoid this race we unconditionally + unset the is_recovered flag from the trx. */ + + trx->is_recovered = FALSE; + + lock_release_off_kernel(trx); + + if (trx->global_read_view) { + read_view_close(trx->global_read_view); + mem_heap_empty(trx->global_read_view_heap); + trx->global_read_view = NULL; + } + + trx->read_view = NULL; + + if (lsn) { + + mutex_exit(&kernel_mutex); + + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + /* NOTE that we could possibly make a group commit more + efficient here: call os_thread_yield here to allow other + trxs to come to commit too! */ + + /*-------------------------------------*/ + + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the transaction durable if + the OS does not crash. We may also flush the log files to + disk, making the transaction durable also at an OS crash or a + power outage. + + The idea in InnoDB's group commit is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which commits the whole + group. Note that this group commit will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + If we are calling trx_commit() under MySQL's binlog mutex, we + will delay possible log write and flush to a separate function + trx_commit_complete_for_mysql(), which is only called when the + thread has released the binlog mutex. This is to make the + group commit algorithm work. Otherwise, the MySQL binlog + mutex would serialize all commits and prevent a group of + transactions from gathering. */ + + if (trx->flush_log_later) { + /* Do nothing yet */ + trx->must_flush_log_later = TRUE; + } else if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + trx->commit_lsn = lsn; + + /*-------------------------------------*/ + + mutex_enter(&kernel_mutex); + } + + /* Free all savepoints */ + trx_roll_free_all_savepoints(trx); + + trx->conc_state = TRX_NOT_STARTED; + trx->rseg = NULL; + trx->undo_no = ut_dulint_zero; + trx->last_sql_stat_start.least_undo_no = ut_dulint_zero; + trx->mysql_query_str = NULL; + + ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0); + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); +} + +/******************************************************************** +Cleans up a transaction at database startup. The cleanup is needed if +the transaction already got to the middle of a commit when the database +crashed, and we cannot roll it back.
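+Only a possible insert undo log is cleaned up here; the transaction is then +marked not started and removed from the trx list.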
*/ +UNIV_INTERN +void +trx_cleanup_at_db_startup( +/*======================*/ + trx_t* trx) /* in: transaction */ +{ + if (trx->insert_undo != NULL) { + + trx_undo_insert_cleanup(trx); + } + + trx->conc_state = TRX_NOT_STARTED; + trx->rseg = NULL; + trx->undo_no = ut_dulint_zero; + trx->last_sql_stat_start.least_undo_no = ut_dulint_zero; + + UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx); +} + +/************************************************************************ +Assigns a read view for a consistent read query. All the consistent reads +within the same transaction will get the same read view, which is created +when this function is first called for a new started transaction. */ +UNIV_INTERN +read_view_t* +trx_assign_read_view( +/*=================*/ + /* out: consistent read view */ + trx_t* trx) /* in: active transaction */ +{ + ut_ad(trx->conc_state == TRX_ACTIVE); + + if (trx->read_view) { + return(trx->read_view); + } + + mutex_enter(&kernel_mutex); + + if (!trx->read_view) { + trx->read_view = read_view_open_now( + trx->id, trx->global_read_view_heap); + trx->global_read_view = trx->read_view; + } + + mutex_exit(&kernel_mutex); + + return(trx->read_view); +} + +/******************************************************************** +Commits a transaction. NOTE that the kernel mutex is temporarily released. */ +static +void +trx_handle_commit_sig_off_kernel( +/*=============================*/ + trx_t* trx, /* in: transaction */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + trx_sig_t* sig; + trx_sig_t* next_sig; + + ut_ad(mutex_own(&kernel_mutex)); + + trx->que_state = TRX_QUE_COMMITTING; + + trx_commit_off_kernel(trx); + + ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0); + + /* Remove all TRX_SIG_COMMIT signals from the signal queue and send + reply messages to them */ + + sig = UT_LIST_GET_FIRST(trx->signals); + + while (sig != NULL) { + next_sig = UT_LIST_GET_NEXT(signals, sig); + + if (sig->type == TRX_SIG_COMMIT) { + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + } + + sig = next_sig; + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/*************************************************************** +The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to +the TRX_QUE_RUNNING state and releases query threads which were +waiting for a lock in the wait_thrs list. */ +UNIV_INTERN +void +trx_end_lock_wait( +/*==============*/ + trx_t* trx) /* in: transaction */ +{ + que_thr_t* thr; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + + while (thr != NULL) { + que_thr_end_wait_no_next_thr(thr); + + UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/*************************************************************** +Moves the query threads in the lock wait list to the SUSPENDED state and puts +the transaction to the TRX_QUE_RUNNING state. 
*/ +static +void +trx_lock_wait_to_suspended( +/*=======================*/ + trx_t* trx) /* in: transaction in the TRX_QUE_LOCK_WAIT state */ +{ + que_thr_t* thr; + + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + + while (thr != NULL) { + thr->state = QUE_THR_SUSPENDED; + + UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr); + + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + + trx->que_state = TRX_QUE_RUNNING; +} + +/*************************************************************** +Moves the query threads in the sig reply wait list of trx to the SUSPENDED +state. */ +static +void +trx_sig_reply_wait_to_suspended( +/*============================*/ + trx_t* trx) /* in: transaction */ +{ + trx_sig_t* sig; + que_thr_t* thr; + + ut_ad(mutex_own(&kernel_mutex)); + + sig = UT_LIST_GET_FIRST(trx->reply_signals); + + while (sig != NULL) { + thr = sig->receiver; + + ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT); + + thr->state = QUE_THR_SUSPENDED; + + sig->receiver = NULL; + + UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig); + + sig = UT_LIST_GET_FIRST(trx->reply_signals); + } +} + +/********************************************************************* +Checks the compatibility of a new signal with the other signals in the +queue. */ +static +ibool +trx_sig_is_compatible( +/*==================*/ + /* out: TRUE if the signal can be queued */ + trx_t* trx, /* in: trx handle */ + ulint type, /* in: signal type */ + ulint sender) /* in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */ +{ + trx_sig_t* sig; + + ut_ad(mutex_own(&kernel_mutex)); + + if (UT_LIST_GET_LEN(trx->signals) == 0) { + + return(TRUE); + } + + if (sender == TRX_SIG_SELF) { + if (type == TRX_SIG_ERROR_OCCURRED) { + + return(TRUE); + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + return(TRUE); + } else { + return(FALSE); + } + } + + ut_ad(sender == TRX_SIG_OTHER_SESS); + + sig = UT_LIST_GET_FIRST(trx->signals); + + if (type == TRX_SIG_COMMIT) { + while (sig != NULL) { + + if (sig->type == TRX_SIG_TOTAL_ROLLBACK) { + + return(FALSE); + } + + sig = UT_LIST_GET_NEXT(signals, sig); + } + + return(TRUE); + + } else if (type == TRX_SIG_TOTAL_ROLLBACK) { + while (sig != NULL) { + + if (sig->type == TRX_SIG_COMMIT) { + + return(FALSE); + } + + sig = UT_LIST_GET_NEXT(signals, sig); + } + + return(TRUE); + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + return(TRUE); + } else { + ut_error; + + return(FALSE); + } +} + +/******************************************************************** +Sends a signal to a trx object. 
*/ +UNIV_INTERN +void +trx_sig_send( +/*=========*/ + trx_t* trx, /* in: trx handle */ + ulint type, /* in: signal type */ + ulint sender, /* in: TRX_SIG_SELF or + TRX_SIG_OTHER_SESS */ + que_thr_t* receiver_thr, /* in: query thread which wants the + reply, or NULL; if type is + TRX_SIG_END_WAIT, this must be NULL */ + trx_savept_t* savept, /* in: possible rollback savepoint, or + NULL */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the parameter + is NULL, it is ignored */ +{ + trx_sig_t* sig; + trx_t* receiver_trx; + + ut_ad(trx); + ut_ad(mutex_own(&kernel_mutex)); + + if (!trx_sig_is_compatible(trx, type, sender)) { + /* The signal is not compatible with the other signals in + the queue: die */ + + ut_error; + } + + /* Queue the signal object */ + + if (UT_LIST_GET_LEN(trx->signals) == 0) { + + /* The signal list is empty: the 'sig' slot must be unused + (we improve performance a bit by avoiding mem_alloc) */ + sig = &(trx->sig); + } else { + /* It might be that the 'sig' slot is unused also in this + case, but we choose the easy way of using mem_alloc */ + + sig = mem_alloc(sizeof(trx_sig_t)); + } + + UT_LIST_ADD_LAST(signals, trx->signals, sig); + + sig->type = type; + sig->sender = sender; + sig->receiver = receiver_thr; + + if (savept) { + sig->savept = *savept; + } + + if (receiver_thr) { + receiver_trx = thr_get_trx(receiver_thr); + + UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals, + sig); + } + + if (trx->sess->state == SESS_ERROR) { + + trx_sig_reply_wait_to_suspended(trx); + } + + if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) { + ut_error; + } + + /* If there were no other signals ahead in the queue, try to start + handling of the signal */ + + if (UT_LIST_GET_FIRST(trx->signals) == sig) { + + trx_sig_start_handle(trx, next_thr); + } +} + +/******************************************************************** +Ends signal handling. If the session is in the error state, and +trx->graph_before_signal_handling != NULL, then returns control to the error +handling routine of the graph (currently just returns the control to the +graph root which then will send an error message to the client). */ +UNIV_INTERN +void +trx_end_signal_handling( +/*====================*/ + trx_t* trx) /* in: trx */ +{ + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(trx->handling_signals == TRUE); + + trx->handling_signals = FALSE; + + trx->graph = trx->graph_before_signal_handling; + + if (trx->graph && (trx->sess->state == SESS_ERROR)) { + + que_fork_error_handle(trx, trx->graph); + } +} + +/******************************************************************** +Starts handling of a trx signal. 
*/ +UNIV_INTERN +void +trx_sig_start_handle( +/*=================*/ + trx_t* trx, /* in: trx handle */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread; if the parameter + is NULL, it is ignored */ +{ + trx_sig_t* sig; + ulint type; +loop: + /* We loop in this function body as long as there are queued signals + we can process immediately */ + + ut_ad(trx); + ut_ad(mutex_own(&kernel_mutex)); + + if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) { + + trx_end_signal_handling(trx); + + return; + } + + if (trx->conc_state == TRX_NOT_STARTED) { + + trx_start_low(trx, ULINT_UNDEFINED); + } + + /* If the trx is in a lock wait state, moves the waiting query threads + to the suspended state */ + + if (trx->que_state == TRX_QUE_LOCK_WAIT) { + + trx_lock_wait_to_suspended(trx); + } + + /* If the session is in the error state and this trx has threads + waiting for reply from signals, moves these threads to the suspended + state, canceling wait reservations; note that if the transaction has + sent a commit or rollback signal to itself, and its session is not in + the error state, then nothing is done here. */ + + if (trx->sess->state == SESS_ERROR) { + trx_sig_reply_wait_to_suspended(trx); + } + + /* If there are no running query threads, we can start processing of a + signal, otherwise we have to wait until all query threads of this + transaction are aware of the arrival of the signal. */ + + if (trx->n_active_thrs > 0) { + + return; + } + + if (trx->handling_signals == FALSE) { + trx->graph_before_signal_handling = trx->graph; + + trx->handling_signals = TRUE; + } + + sig = UT_LIST_GET_FIRST(trx->signals); + type = sig->type; + + if (type == TRX_SIG_COMMIT) { + + trx_handle_commit_sig_off_kernel(trx, next_thr); + + } else if ((type == TRX_SIG_TOTAL_ROLLBACK) + || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) { + + trx_rollback(trx, sig, next_thr); + + /* No further signals can be handled until the rollback + completes, therefore we return */ + + return; + + } else if (type == TRX_SIG_ERROR_OCCURRED) { + + trx_rollback(trx, sig, next_thr); + + /* No further signals can be handled until the rollback + completes, therefore we return */ + + return; + + } else if (type == TRX_SIG_BREAK_EXECUTION) { + + trx_sig_reply(sig, next_thr); + trx_sig_remove(trx, sig); + } else { + ut_error; + } + + goto loop; +} + +/******************************************************************** +Send the reply message when a signal in the queue of the trx has been +handled. */ +UNIV_INTERN +void +trx_sig_reply( +/*==========*/ + trx_sig_t* sig, /* in: signal */ + que_thr_t** next_thr) /* in/out: next query thread to run; + if the value which is passed in is + a pointer to a NULL pointer, then the + calling function can start running + a new query thread */ +{ + trx_t* receiver_trx; + + ut_ad(sig); + ut_ad(mutex_own(&kernel_mutex)); + + if (sig->receiver != NULL) { + ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT); + + receiver_trx = thr_get_trx(sig->receiver); + + UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals, + sig); + ut_ad(receiver_trx->sess->state != SESS_ERROR); + + que_thr_end_wait(sig->receiver, next_thr); + + sig->receiver = NULL; + + } +} + +/******************************************************************** +Removes a signal object from the trx signal queue. 
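+If the signal is the statically allocated trx->sig slot, it is only unlinked +from the queue, not freed with mem_free().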
*/ +UNIV_INTERN +void +trx_sig_remove( +/*===========*/ + trx_t* trx, /* in: trx handle */ + trx_sig_t* sig) /* in, own: signal */ +{ + ut_ad(trx && sig); + ut_ad(mutex_own(&kernel_mutex)); + + ut_ad(sig->receiver == NULL); + + UT_LIST_REMOVE(signals, trx->signals, sig); + sig->type = 0; /* reset the field to catch possible bugs */ + + if (sig != &(trx->sig)) { + mem_free(sig); + } +} + +/************************************************************************* +Creates a commit command node struct. */ +UNIV_INTERN +commit_node_t* +commit_node_create( +/*===============*/ + /* out, own: commit node struct */ + mem_heap_t* heap) /* in: mem heap where created */ +{ + commit_node_t* node; + + node = mem_heap_alloc(heap, sizeof(commit_node_t)); + node->common.type = QUE_NODE_COMMIT; + node->state = COMMIT_NODE_SEND; + + return(node); +} + +/*************************************************************** +Performs an execution step for a commit type node in a query graph. */ +UNIV_INTERN +que_thr_t* +trx_commit_step( +/*============*/ + /* out: query thread to run next, or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + commit_node_t* node; + que_thr_t* next_thr; + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = COMMIT_NODE_SEND; + } + + if (node->state == COMMIT_NODE_SEND) { + mutex_enter(&kernel_mutex); + + node->state = COMMIT_NODE_WAIT; + + next_thr = NULL; + + thr->state = QUE_THR_SIG_REPLY_WAIT; + + /* Send the commit signal to the transaction */ + + trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF, + thr, NULL, &next_thr); + + mutex_exit(&kernel_mutex); + + return(next_thr); + } + + ut_ad(node->state == COMMIT_NODE_WAIT); + + node->state = COMMIT_NODE_SEND; + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +/************************************************************************** +Does the transaction commit for MySQL. */ +UNIV_INTERN +ulint +trx_commit_for_mysql( +/*=================*/ + /* out: DB_SUCCESS or error number */ + trx_t* trx) /* in: trx handle */ +{ + /* Because we do not do the commit by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx_start_if_not_started(trx); + + trx->op_info = "committing"; + + mutex_enter(&kernel_mutex); + + trx_commit_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(DB_SUCCESS); +} + +/************************************************************************** +If required, flushes the log to disk if we called trx_commit_for_mysql() +with trx->flush_log_later == TRUE. 
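+MySQL calls this only after releasing its binlog mutex, so that the delayed +log write and flush can be grouped with those of other committing +transactions.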
*/ +UNIV_INTERN +ulint +trx_commit_complete_for_mysql( +/*==========================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + ib_uint64_t lsn = trx->commit_lsn; + + ut_a(trx); + + trx->op_info = "flushing log"; + + if (!trx->must_flush_log_later) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + /* Write the log to the log files AND flush them to + disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + trx->must_flush_log_later = FALSE; + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +Marks the latest SQL statement ended. */ +UNIV_INTERN +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx) /* in: trx handle */ +{ + ut_a(trx); + + if (trx->conc_state == TRX_NOT_STARTED) { + trx->undo_no = ut_dulint_zero; + } + + trx->last_sql_stat_start.least_undo_no = trx->undo_no; +} + +/************************************************************************** +Prints info about a transaction to the given file. The caller must own the +kernel mutex and must have called +innobase_mysql_prepare_print_arbitrary_thd(), unless he knows that MySQL +or InnoDB cannot meanwhile change the info printed here. */ +UNIV_INTERN +void +trx_print( +/*======*/ + FILE* f, /* in: output stream */ + trx_t* trx, /* in: transaction */ + ulint max_query_len) /* in: max query length to print, or 0 to + use the default max length */ +{ + ibool newline; + + fprintf(f, "TRANSACTION " TRX_ID_FMT, TRX_ID_PREP_PRINTF(trx->id)); + + switch (trx->conc_state) { + case TRX_NOT_STARTED: + fputs(", not started", f); + break; + case TRX_ACTIVE: + fprintf(f, ", ACTIVE %lu sec", + (ulong)difftime(time(NULL), trx->start_time)); + break; + case TRX_PREPARED: + fprintf(f, ", ACTIVE (PREPARED) %lu sec", + (ulong)difftime(time(NULL), trx->start_time)); + break; + case TRX_COMMITTED_IN_MEMORY: + fputs(", COMMITTED IN MEMORY", f); + break; + default: + fprintf(f, " state %lu", (ulong) trx->conc_state); + } + +#ifdef UNIV_LINUX + fprintf(f, ", process no %lu", trx->mysql_process_no); +#endif + fprintf(f, ", OS thread id %lu", + (ulong) os_thread_pf(trx->mysql_thread_id)); + + if (*trx->op_info) { + putc(' ', f); + fputs(trx->op_info, f); + } + + if (trx->is_recovered) { + fputs(" recovered trx", f); + } + + if (trx->is_purge) { + fputs(" purge trx", f); + } + + if (trx->declared_to_be_inside_innodb) { + fprintf(f, ", thread declared inside InnoDB %lu", + (ulong) trx->n_tickets_to_enter_innodb); + } + + putc('\n', f); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + fprintf(f, "mysql tables in use %lu, locked %lu\n", + (ulong) trx->n_mysql_tables_in_use, + (ulong) trx->mysql_n_tables_locked); + } + + newline = TRUE; + + switch (trx->que_state) { + case TRX_QUE_RUNNING: + newline = FALSE; break; + case TRX_QUE_LOCK_WAIT: + fputs("LOCK WAIT ", f); break; + case TRX_QUE_ROLLING_BACK: + fputs("ROLLING BACK ", f); break; + case TRX_QUE_COMMITTING: + fputs("COMMITTING ", f); break; + default: + fprintf(f, "que state %lu ", (ulong) trx->que_state); + } + + if (0 < 
UT_LIST_GET_LEN(trx->trx_locks) + || mem_heap_get_size(trx->lock_heap) > 400) { + newline = TRUE; + + fprintf(f, "%lu lock struct(s), heap size %lu," + " %lu row lock(s)", + (ulong) UT_LIST_GET_LEN(trx->trx_locks), + (ulong) mem_heap_get_size(trx->lock_heap), + (ulong) lock_number_of_rows_locked(trx)); + } + + if (trx->has_search_latch) { + newline = TRUE; + fputs(", holds adaptive hash latch", f); + } + + if (!ut_dulint_is_zero(trx->undo_no)) { + newline = TRUE; + fprintf(f, ", undo log entries %lu", + (ulong) ut_dulint_get_low(trx->undo_no)); + } + + if (newline) { + putc('\n', f); + } + + if (trx->mysql_thd != NULL) { + innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len); + } +} + +/*********************************************************************** +Compares the "weight" (or size) of two transactions. Transactions that +have edited non-transactional tables are considered heavier than ones +that have not. */ +UNIV_INTERN +int +trx_weight_cmp( +/*===========*/ + /* out: <0, 0 or >0; similar to strcmp(3) */ + const trx_t* a, /* in: the first transaction to be compared */ + const trx_t* b) /* in: the second transaction to be compared */ +{ + ibool a_notrans_edit; + ibool b_notrans_edit; + + /* If mysql_thd is NULL for a transaction we assume that it has + not edited non-transactional tables. */ + + a_notrans_edit = a->mysql_thd != NULL + && thd_has_edited_nontrans_tables(a->mysql_thd); + + b_notrans_edit = b->mysql_thd != NULL + && thd_has_edited_nontrans_tables(b->mysql_thd); + + if (a_notrans_edit && !b_notrans_edit) { + + return(1); + } + + if (!a_notrans_edit && b_notrans_edit) { + + return(-1); + } + + /* Either both had edited non-transactional tables or both had + not, we fall back to comparing the number of altered/locked + rows. */ + +#if 0 + fprintf(stderr, + "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n", + __func__, + ut_conv_dulint_to_longlong(a->undo_no), + UT_LIST_GET_LEN(a->trx_locks), + ut_conv_dulint_to_longlong(b->undo_no), + UT_LIST_GET_LEN(b->trx_locks)); +#endif + + return(ut_dulint_cmp(TRX_WEIGHT(a), TRX_WEIGHT(b))); +} + +/******************************************************************** +Prepares a transaction. */ +UNIV_INTERN +void +trx_prepare_off_kernel( +/*===================*/ + trx_t* trx) /* in: transaction */ +{ + page_t* update_hdr_page; + trx_rseg_t* rseg; + ib_uint64_t lsn = 0; + mtr_t mtr; + + ut_ad(mutex_own(&kernel_mutex)); + + rseg = trx->rseg; + + if (trx->insert_undo != NULL || trx->update_undo != NULL) { + + mutex_exit(&kernel_mutex); + + mtr_start(&mtr); + + /* Change the undo log segment states from TRX_UNDO_ACTIVE + to TRX_UNDO_PREPARED: these modifications to the file data + structure define the transaction as prepared in the + file-based world, at the serialization point of lsn. */ + + mutex_enter(&(rseg->mutex)); + + if (trx->insert_undo != NULL) { + + /* It is not necessary to obtain trx->undo_mutex here + because only a single OS thread is allowed to do the + transaction prepare for this transaction. 
*/ + + trx_undo_set_state_at_prepare(trx, trx->insert_undo, + &mtr); + } + + if (trx->update_undo) { + update_hdr_page = trx_undo_set_state_at_prepare( + trx, trx->update_undo, &mtr); + } + + mutex_exit(&(rseg->mutex)); + + /*--------------*/ + mtr_commit(&mtr); /* This mtr commit makes the + transaction prepared in the file-based + world */ + /*--------------*/ + lsn = mtr.end_lsn; + + mutex_enter(&kernel_mutex); + } + + ut_ad(mutex_own(&kernel_mutex)); + + /*--------------------------------------*/ + trx->conc_state = TRX_PREPARED; + /*--------------------------------------*/ + + if (lsn) { + /* Depending on the my.cnf options, we may now write the log + buffer to the log files, making the prepared state of the + transaction durable if the OS does not crash. We may also + flush the log files to disk, making the prepared state of the + transaction durable also at an OS crash or a power outage. + + The idea in InnoDB's group prepare is that a group of + transactions gather behind a trx doing a physical disk write + to log files, and when that physical write has been completed, + one of those transactions does a write which prepares the whole + group. Note that this group prepare will only bring benefit if + there are > 2 users in the database. Then at least 2 users can + gather behind one doing the physical log write to disk. + + TODO: find out if MySQL holds some mutex when calling this. + That would spoil our group prepare algorithm. */ + + mutex_exit(&kernel_mutex); + + if (srv_flush_log_at_trx_commit == 0) { + /* Do nothing */ + } else if (srv_flush_log_at_trx_commit == 1) { + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, + FALSE); + } else { + /* Write the log to the log files AND flush + them to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + + log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); + } else { + ut_error; + } + + mutex_enter(&kernel_mutex); + } +} + +/************************************************************************** +Does the transaction prepare for MySQL. */ +UNIV_INTERN +ulint +trx_prepare_for_mysql( +/*==================*/ + /* out: 0 or error number */ + trx_t* trx) /* in: trx handle */ +{ + /* Because we do not do the prepare by sending an Innobase + sig to the transaction, we must here make sure that trx has been + started. */ + + ut_a(trx); + + trx->op_info = "preparing"; + + trx_start_if_not_started(trx); + + mutex_enter(&kernel_mutex); + + trx_prepare_off_kernel(trx); + + mutex_exit(&kernel_mutex); + + trx->op_info = ""; + + return(0); +} + +/************************************************************************** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. 
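+ +A sketch of the caller's side (the array size and the variable names +here are illustrative, not part of this file): the server allocates an +XID array, asks InnoDB to fill it, and then resolves each entry: + + XID xid_list[100]; + int count = trx_recover_for_mysql(xid_list, 100); + +after which each of xid_list[0..count-1] can be passed by address to +trx_get_trx_by_xid() to find the trx to commit or roll back.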
*/ +UNIV_INTERN +int +trx_recover_for_mysql( +/*==================*/ + /* out: number of prepared transactions + stored in xid_list */ + XID* xid_list, /* in/out: prepared transactions */ + ulint len) /* in: number of slots in xid_list */ +{ + trx_t* trx; + ulint count = 0; + + ut_ad(xid_list); + ut_ad(len); + + /* We put those transactions that are in the prepared state + into xid_list */ + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + if (trx->conc_state == TRX_PREPARED) { + xid_list[count] = trx->xid; + + if (count == 0) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Starting recovery for" + " XA transactions...\n"); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction " TRX_ID_FMT " in" + " prepared state after recovery\n", + TRX_ID_PREP_PRINTF(trx->id)); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Transaction contains changes" + " to %lu rows\n", + (ulong) ut_conv_dulint_to_longlong( + trx->undo_no)); + + count++; + + if (count == len) { + break; + } + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (count > 0) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: %lu transactions in prepared state" + " after recovery\n", + (ulong) count); + } + + return((int) count); +} + +/*********************************************************************** +This function is used to find one X/Open XA distributed transaction +which is in the prepared state. */ +UNIV_INTERN +trx_t* +trx_get_trx_by_xid( +/*===============*/ + /* out: trx or NULL */ + XID* xid) /* in: X/Open XA transaction identification */ +{ + trx_t* trx; + + if (xid == NULL) { + + return(NULL); + } + + mutex_enter(&kernel_mutex); + + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); + + while (trx) { + /* Compare two X/Open XA transaction ids: their + lengths should be the same, and a binary comparison + of gtrid_length + bqual_length bytes of the data + should match */ + + if (xid->gtrid_length == trx->xid.gtrid_length + && xid->bqual_length == trx->xid.bqual_length + && memcmp(xid->data, trx->xid.data, + xid->gtrid_length + xid->bqual_length) == 0) { + break; + } + + trx = UT_LIST_GET_NEXT(trx_list, trx); + } + + mutex_exit(&kernel_mutex); + + if (trx) { + if (trx->conc_state != TRX_PREPARED) { + + return(NULL); + } + + return(trx); + } else { + return(NULL); + } +} diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.c new file mode 100644 index 00000000000..bb5710aeba9 --- /dev/null +++ b/storage/xtradb/trx/trx0undo.c @@ -0,0 +1,2021 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "trx0undo.h" + +#ifdef UNIV_NONINL +#include "trx0undo.ic" +#endif + +#include "fsp0fsp.h" +#include "mach0data.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "trx0rec.h" +#include "trx0purge.h" +#include "trx0xa.h" + +/* How should the old versions in the history list be managed? + ---------------------------------------------------------- +If each transaction is given a whole page for its update undo log, file +space consumption can be 10 times higher than necessary. Therefore, +partly filled update undo log pages should be reusable. But then there +is no way individual pages can be ordered so that the ordering agrees +with the serialization numbers of the transactions on the pages. Thus, +the history list must be formed of undo logs, not their header pages as +it was in the old implementation. + However, on a single header page the transactions are placed in +the order of their serialization numbers. As old versions are purged, we +may free the page when the last transaction on the page has been purged. + A problem is that the purge has to go through the transactions +in the serialization order. This means that we have to look through all +rollback segments for the one that has the smallest transaction number +in its history list. + When should we do a purge? A purge is necessary when space is +running out in any of the rollback segments. Then we may also have to +purge old versions that might still be needed by some consistent read. +How do we trigger the start of a purge? When a transaction writes to an +undo log, it may notice that the space is running out. When a read view +is closed, it may make some history superfluous. The server can have a +utility that periodically checks if it can purge some history. + In a parallelized purge we have the problem that a query thread +can remove a delete marked clustered index record before another query +thread has processed an earlier version of the record; that processing +then becomes impossible, because the row cannot be constructed from the +clustered index record. To avoid this problem, we also store in update +and delete-mark undo records the columns necessary to construct the +secondary index entries that are modified. + We can latch the stack of versions of a single clustered index record +by taking a latch on the clustered index page. As long as the latch is held, +no new versions can be added and no versions removed by undo. But, a purge +can still remove old versions from the bottom of the stack. */ + +/* How to protect rollback segments, undo logs, and history lists with + ------------------------------------------------------------------- +latches? +------- +Contention on the kernel mutex should be minimized. When a transaction +does its first insert or modification in an index, an undo log is assigned +for it. Then we must have an x-latch on the rollback segment header. + When the transaction does more modifications or rolls back, the undo log is +protected by the undo_mutex in the transaction. 
+ When the transaction commits, its insert undo log is either reset and +cached for a fast reuse, or freed. In these cases we must have an x-latch on +the rollback segment page. The update undo log is put to the history list. If +it is not suitable for reuse, its slot in the rollback segment is reset. In +both cases, an x-latch must be acquired on the rollback segment. + The purge operation steps through the history list without modifying +it until a truncate operation occurs, which can remove undo logs from the end +of the list and release undo log segments. In stepping through the list, +s-latches on the undo log pages are enough, but in a truncate, x-latches must +be obtained on the rollback segment and individual pages. */ + +/************************************************************************ +Initializes the fields in an undo log segment page. */ +static +void +trx_undo_page_init( +/*===============*/ + page_t* undo_page, /* in: undo log segment page */ + ulint type, /* in: undo log segment type */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************ +Creates and initializes an undo log memory object. */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + /* out, own: the undo log memory object */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint id, /* in: slot index within rseg */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + const XID* xid, /* in: X/Open XA transaction identification*/ + ulint page_no,/* in: undo log header page number */ + ulint offset);/* in: undo log header byte offset on page */ +/******************************************************************* +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! */ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + /* out: undo log header byte offset on page */ + page_t* undo_page, /* in: insert undo log segment header page, + x-latched */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr); /* in: mtr */ +/************************************************************************** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. */ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /* in: header page of an undo log of size 1 */ + mtr_t* mtr); /* in: mtr */ + + +/*************************************************************************** +Gets the previous record in an undo log from the previous page. 
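+The undo pages of a segment are kept in a doubly linked list through +their TRX_UNDO_PAGE_NODE fields, so the previous page is found with +flst_get_prev_addr() and then searched from its end.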
*/ +static +trx_undo_rec_t* +trx_undo_get_prev_rec_from_prev_page( +/*=================================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr) /* in: mtr */ +{ + ulint space; + ulint zip_size; + ulint prev_page_no; + page_t* prev_page; + page_t* undo_page; + + undo_page = page_align(rec); + + prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + + if (prev_page_no == FIL_NULL) { + + return(NULL); + } + + space = page_get_space_id(undo_page); + zip_size = fil_space_get_zip_size(space); + + prev_page = trx_undo_page_get_s_latched(space, zip_size, + prev_page_no, mtr); + + return(trx_undo_page_get_last_rec(prev_page, page_no, offset)); +} + +/*************************************************************************** +Gets the previous record in an undo log. */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_prev_rec( +/*==================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr) /* in: mtr */ +{ + trx_undo_rec_t* prev_rec; + + prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset); + + if (prev_rec) { + + return(prev_rec); + } + + /* We have to go to the previous undo log page to look for the + previous record */ + + return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset, + mtr)); +} + +/*************************************************************************** +Gets the next record in an undo log from the next page. */ +static +trx_undo_rec_t* +trx_undo_get_next_rec_from_next_page( +/*=================================*/ + /* out: undo log record, the page latched, NULL if + none */ + ulint space, /* in: undo log header space */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* undo_page, /* in: undo log page */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + ulint mode, /* in: latch mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /* in: mtr */ +{ + trx_ulogf_t* log_hdr; + ulint next_page_no; + page_t* next_page; + ulint next; + + if (page_no == page_get_page_no(undo_page)) { + + log_hdr = undo_page + offset; + next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG); + + if (next != 0) { + + return(NULL); + } + } + + next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_NODE, mtr) + .page; + if (next_page_no == FIL_NULL) { + + return(NULL); + } + + if (mode == RW_S_LATCH) { + next_page = trx_undo_page_get_s_latched(space, zip_size, + next_page_no, mtr); + } else { + ut_ad(mode == RW_X_LATCH); + next_page = trx_undo_page_get(space, zip_size, + next_page_no, mtr); + } + + return(trx_undo_page_get_first_rec(next_page, page_no, offset)); +} + +/*************************************************************************** +Gets the next record in an undo log. 
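+Looks first on the page of the given record; if that record was the +last one there, the search continues from the first record of the next +page in the segment's page list.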
*/ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_next_rec( +/*==================*/ + /* out: undo log record, the page s-latched, + NULL if none */ + trx_undo_rec_t* rec, /* in: undo record */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + mtr_t* mtr) /* in: mtr */ +{ + ulint space; + ulint zip_size; + trx_undo_rec_t* next_rec; + + next_rec = trx_undo_page_get_next_rec(rec, page_no, offset); + + if (next_rec) { + return(next_rec); + } + + space = page_get_space_id(page_align(rec)); + zip_size = fil_space_get_zip_size(space); + + return(trx_undo_get_next_rec_from_next_page(space, zip_size, + page_align(rec), + page_no, offset, + RW_S_LATCH, mtr)); +} + +/*************************************************************************** +Gets the first record in an undo log. */ +UNIV_INTERN +trx_undo_rec_t* +trx_undo_get_first_rec( +/*===================*/ + /* out: undo log record, the page latched, NULL if + none */ + ulint space, /* in: undo log header space */ + ulint zip_size,/* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint page_no,/* in: undo log header page number */ + ulint offset, /* in: undo log header offset on page */ + ulint mode, /* in: latching mode: RW_S_LATCH or RW_X_LATCH */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + + if (mode == RW_S_LATCH) { + undo_page = trx_undo_page_get_s_latched(space, zip_size, + page_no, mtr); + } else { + undo_page = trx_undo_page_get(space, zip_size, page_no, mtr); + } + + rec = trx_undo_page_get_first_rec(undo_page, page_no, offset); + + if (rec) { + return(rec); + } + + return(trx_undo_get_next_rec_from_next_page(space, zip_size, + undo_page, page_no, offset, + mode, mtr)); +} + +/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/ + +/************************************************************************** +Writes the mtr log entry of an undo log page initialization. */ +UNIV_INLINE +void +trx_undo_page_init_log( +/*===================*/ + page_t* undo_page, /* in: undo log page */ + ulint type, /* in: undo log type */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr); + + mlog_catenate_ulint_compressed(mtr, type); +} + +/*************************************************************** +Parses the redo log entry of an undo log page initialization. */ +UNIV_INTERN +byte* +trx_undo_parse_page_init( +/*=====================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ulint type; + + ptr = mach_parse_compressed(ptr, end_ptr, &type); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + trx_undo_page_init(page, type, mtr); + } + + return(ptr); +} + +/************************************************************************ +Initializes the fields in an undo log segment page. 
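+Sets the page type to FIL_PAGE_UNDO_LOG, points both TRX_UNDO_PAGE_START +and TRX_UNDO_PAGE_FREE just past the undo page header, and writes the +corresponding MLOG_UNDO_INIT redo record.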
*/ +static +void +trx_undo_page_init( +/*===============*/ + page_t* undo_page, /* in: undo log segment page */ + ulint type, /* in: undo log segment type */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + + fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG); + + trx_undo_page_init_log(undo_page, type, mtr); +} + +/******************************************************************* +Creates a new undo log segment in file. */ +static +ulint +trx_undo_seg_create( +/*================*/ + /* out: DB_SUCCESS if page creation OK + possible error codes are: + DB_TOO_MANY_CONCURRENT_TRXS + DB_OUT_OF_FILE_SPACE */ + trx_rseg_t* rseg __attribute__((unused)),/* in: rollback segment */ + trx_rsegf_t* rseg_hdr,/* in: rollback segment header, page + x-latched */ + ulint type, /* in: type of the segment: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + ulint* id, /* out: slot index within rseg header */ + page_t** undo_page, + /* out: segment header page x-latched, NULL + if there was an error */ + mtr_t* mtr) /* in: mtr */ +{ + ulint slot_no; + ulint space; + buf_block_t* block; + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + ulint n_reserved; + ibool success; + ulint err = DB_SUCCESS; + + ut_ad(mtr && id && rseg_hdr); + ut_ad(mutex_own(&(rseg->mutex))); + + /* fputs(type == TRX_UNDO_INSERT + ? "Creating insert undo log segment\n" + : "Creating update undo log segment\n", stderr); */ + slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr); + + if (slot_no == ULINT_UNDEFINED) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: cannot find a free slot for" + " an undo log. Do you have too\n" + "InnoDB: many active transactions" + " running concurrently?\n"); + + return(DB_TOO_MANY_CONCURRENT_TRXS); + } + + space = page_get_space_id(page_align(rseg_hdr)); + + success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, + mtr); + if (!success) { + + return(DB_OUT_OF_FILE_SPACE); + } + + /* Allocate a new file segment for the undo log */ + block = fseg_create_general(space, 0, + TRX_UNDO_SEG_HDR + + TRX_UNDO_FSEG_HEADER, TRUE, mtr); + + fil_space_release_free_extents(space, n_reserved); + + if (block == NULL) { + /* No space left */ + + return(DB_OUT_OF_FILE_SPACE); + } + + buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); + + *undo_page = buf_block_get_frame(block); + + page_hdr = *undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = *undo_page + TRX_UNDO_SEG_HDR; + + trx_undo_page_init(*undo_page, type, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, + TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE, + MLOG_2BYTES, mtr); + + mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr); + + flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr); + + flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST, + page_hdr + TRX_UNDO_PAGE_NODE, mtr); + + trx_rsegf_set_nth_undo(rseg_hdr, slot_no, + page_get_page_no(*undo_page), mtr); + *id = slot_no; + + return(err); +} + +/************************************************************************** +Writes the mtr log entry of an undo log header initialization. 
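+Only the transaction id is logged: on replay, trx_undo_parse_page_header() +calls trx_undo_header_create(), which reconstructs the rest of the +header deterministically.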
*/ +UNIV_INLINE +void +trx_undo_header_create_log( +/*=======================*/ + page_t* undo_page, /* in: undo log header page */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr); + + mlog_catenate_dulint_compressed(mtr, trx_id); +} + +/******************************************************************* +Creates a new undo log header in file. NOTE that this function has its own +log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of +this function! */ +static +ulint +trx_undo_header_create( +/*===================*/ + /* out: header byte offset on page */ + page_t* undo_page, /* in: undo log segment header page, + x-latched; it is assumed that there are + TRX_UNDO_LOG_XA_HDR_SIZE bytes free space + on it */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint prev_log; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + + if (prev_log != 0) { + prev_log_hdr = undo_page + prev_log; + + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free); + } + + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free); + + log_hdr = undo_page + free; + + mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE); + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0); + mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log); + + /* Write the log record about the header creation */ + trx_undo_header_create_log(undo_page, trx_id, mtr); + + return(free); +} + +/************************************************************************ +Write X/Open XA Transaction Identification (XID) to undo log header */ +static +void +trx_undo_write_xid( +/*===============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + const XID* xid, /* in: X/Open XA Transaction Identification */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT, + (ulint)xid->formatID, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN, + (ulint)xid->gtrid_length, MLOG_4BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN, + (ulint)xid->bqual_length, MLOG_4BYTES, mtr); + + mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data, + XIDDATASIZE, mtr); +} + +/************************************************************************ +Read X/Open XA Transaction Identification (XID) from undo log header */ +static +void +trx_undo_read_xid( +/*==============*/ + trx_ulogf_t* log_hdr,/* in: undo log header */ + XID* xid) /* out: X/Open XA Transaction Identification */ +{ + xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT); + + xid->gtrid_length + = (long) 
mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN); + xid->bqual_length + = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN); + + memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE); +} + +/******************************************************************* +Adds space for the XA XID after an undo log old-style header. */ +static +void +trx_undo_header_add_space_for_xid( +/*==============================*/ + page_t* undo_page,/* in: undo log segment header page */ + trx_ulogf_t* log_hdr,/* in: undo log header */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + ulint free; + ulint new_free; + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE); + + /* free is now the end offset of the old style undo log header */ + + ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE); + + new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE + - TRX_UNDO_LOG_OLD_HDR_SIZE); + + /* Add space for a XID after the header, update the free offset + fields on the undo log page and in the undo log header */ + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free, + MLOG_2BYTES, mtr); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free, + MLOG_2BYTES, mtr); +} + +/************************************************************************** +Writes the mtr log entry of an undo log header reuse. */ +UNIV_INLINE +void +trx_undo_insert_header_reuse_log( +/*=============================*/ + page_t* undo_page, /* in: undo log header page */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr); + + mlog_catenate_dulint_compressed(mtr, trx_id); +} + +/*************************************************************** +Parses the redo log entry of an undo log page header create or reuse. */ +UNIV_INTERN +byte* +trx_undo_parse_page_header( +/*=======================*/ + /* out: end of log record or NULL */ + ulint type, /* in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */ + byte* ptr, /* in: buffer */ + byte* end_ptr,/* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + dulint trx_id; + + ptr = mach_dulint_parse_compressed(ptr, end_ptr, &trx_id); + + if (ptr == NULL) { + + return(NULL); + } + + if (page) { + if (type == MLOG_UNDO_HDR_CREATE) { + trx_undo_header_create(page, trx_id, mtr); + } else { + ut_ad(type == MLOG_UNDO_HDR_REUSE); + trx_undo_insert_header_reuse(page, trx_id, mtr); + } + } + + return(ptr); +} + +/******************************************************************* +Initializes a cached insert undo log header page for new use. NOTE that this +function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change +the operation of this function! 
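+Crash recovery replays MLOG_UNDO_HDR_REUSE records by calling this +function from trx_undo_parse_page_header(), so any change to its effect +would make previously written redo logs unreplayable.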
*/ +static +ulint +trx_undo_insert_header_reuse( +/*=========================*/ + /* out: undo log header byte offset on page */ + page_t* undo_page, /* in: insert undo log segment header page, + x-latched */ + dulint trx_id, /* in: transaction id */ + mtr_t* mtr) /* in: mtr */ +{ + trx_upagef_t* page_hdr; + trx_usegf_t* seg_hdr; + trx_ulogf_t* log_hdr; + ulint free; + ulint new_free; + + ut_ad(mtr && undo_page); + + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + + free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE; + + ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100); + + log_hdr = undo_page + free; + + new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE; + + /* Insert undo data is not needed after commit: we may free all + the space on the page */ + + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_INSERT); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free); + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE); + + log_hdr = undo_page + free; + + mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id); + mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free); + + mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE); + mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE); + + /* Write the log record MLOG_UNDO_HDR_REUSE */ + trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr); + + return(free); +} + +/************************************************************************** +Writes the redo log entry of an update undo log header discard. */ +UNIV_INLINE +void +trx_undo_discard_latest_log( +/*========================*/ + page_t* undo_page, /* in: undo log header page */ + mtr_t* mtr) /* in: mtr */ +{ + mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr); +} + +/*************************************************************** +Parses the redo log entry of an undo log page header discard. */ +UNIV_INTERN +byte* +trx_undo_parse_discard_latest( +/*==========================*/ + /* out: end of log record or NULL */ + byte* ptr, /* in: buffer */ + byte* end_ptr __attribute__((unused)), /* in: buffer end */ + page_t* page, /* in: page or NULL */ + mtr_t* mtr) /* in: mtr or NULL */ +{ + ut_ad(end_ptr); + + if (page) { + trx_undo_discard_latest_update_undo(page, mtr); + } + + return(ptr); +} + +/************************************************************************** +If an update undo log can be discarded immediately, this function frees the +space, resetting the page to the proper state for caching. 
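+Only the latest undo log on the page can be discarded this way: its +header is unlinked from the previous one, and the segment state is set +back to TRX_UNDO_CACHED.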
*/ +static +void +trx_undo_discard_latest_update_undo( +/*================================*/ + page_t* undo_page, /* in: header page of an undo log of size 1 */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* log_hdr; + trx_ulogf_t* prev_log_hdr; + ulint free; + ulint prev_hdr_offset; + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + log_hdr = undo_page + free; + + prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG); + + if (prev_hdr_offset != 0) { + prev_log_hdr = undo_page + prev_hdr_offset; + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, + mach_read_from_2(prev_log_hdr + + TRX_UNDO_LOG_START)); + mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0); + } + + mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free); + + mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED); + mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset); + + trx_undo_discard_latest_log(undo_page, mtr); +} + +/************************************************************************ +Tries to add a page to the undo log segment where the undo log is placed. */ +UNIV_INTERN +ulint +trx_undo_add_page( +/*==============*/ + /* out: page number if success, else + FIL_NULL */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory object */ + mtr_t* mtr) /* in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + page_t* new_page; + trx_rseg_t* rseg; + ulint page_no; + ulint n_reserved; + ibool success; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(!mutex_own(&kernel_mutex)); + ut_ad(mutex_own(&(trx->rseg->mutex))); + + rseg = trx->rseg; + + if (rseg->curr_size == rseg->max_size) { + + return(FIL_NULL); + } + + header_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + success = fsp_reserve_free_extents(&n_reserved, undo->space, 1, + FSP_UNDO, mtr); + if (!success) { + + return(FIL_NULL); + } + + page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR + + TRX_UNDO_FSEG_HEADER, + undo->top_page_no + 1, FSP_UP, + TRUE, mtr); + + fil_space_release_free_extents(undo->space, n_reserved); + + if (page_no == FIL_NULL) { + + /* No space left */ + + return(FIL_NULL); + } + + undo->last_page_no = page_no; + + new_page = trx_undo_page_get(undo->space, undo->zip_size, + page_no, mtr); + + trx_undo_page_init(new_page, undo->type, mtr); + + flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + undo->size++; + rseg->curr_size++; + + return(page_no); +} + +/************************************************************************ +Frees an undo log page that is not the header page. 
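+The page is removed from the segment's page list and returned to the +file segment; if the log is in the history list, the rollback segment's +TRX_RSEG_HISTORY_SIZE counter is decremented as well.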
*/ +static +ulint +trx_undo_free_page( +/*===============*/ + /* out: last page number in remaining log */ + trx_rseg_t* rseg, /* in: rollback segment */ + ibool in_history, /* in: TRUE if the undo log is in the history + list */ + ulint space, /* in: space */ + ulint hdr_page_no, /* in: header page number */ + ulint page_no, /* in: page number to free: must not be the + header page */ + mtr_t* mtr) /* in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + page_t* header_page; + page_t* undo_page; + fil_addr_t last_addr; + trx_rsegf_t* rseg_header; + ulint hist_size; + ulint zip_size; + + ut_a(hdr_page_no != page_no); + ut_ad(!mutex_own(&kernel_mutex)); + ut_ad(mutex_own(&(rseg->mutex))); + + zip_size = rseg->zip_size; + + undo_page = trx_undo_page_get(space, zip_size, page_no, mtr); + + header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr); + + flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST, + undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr); + + fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, + space, page_no, mtr); + + last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR + + TRX_UNDO_PAGE_LIST, mtr); + rseg->curr_size--; + + if (in_history) { + rseg_header = trx_rsegf_get(space, zip_size, + rseg->page_no, mtr); + + hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + MLOG_4BYTES, mtr); + ut_ad(hist_size > 0); + mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE, + hist_size - 1, MLOG_4BYTES, mtr); + } + + return(last_addr.page); +} + +/************************************************************************ +Frees an undo log page when there is also the memory object for the undo +log. */ +static +void +trx_undo_free_page_in_rollback( +/*===========================*/ + trx_t* trx __attribute__((unused)), /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + ulint page_no,/* in: page number to free: must not be the + header page */ + mtr_t* mtr) /* in: mtr which does not have a latch to any + undo log page; the caller must have reserved + the rollback segment mutex */ +{ + ulint last_page_no; + + ut_ad(undo->hdr_page_no != page_no); + ut_ad(mutex_own(&(trx->undo_mutex))); + + last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space, + undo->hdr_page_no, page_no, mtr); + + undo->last_page_no = last_page_no; + undo->size--; +} + +/************************************************************************ +Empties an undo log header page of undo records for that undo log. Other +undo logs may still have records on that page, if it is an update undo log. */ +static +void +trx_undo_empty_header_page( +/*=======================*/ + ulint space, /* in: space */ + ulint zip_size, /* in: compressed page size in bytes + or 0 for uncompressed pages */ + ulint hdr_page_no, /* in: header page number */ + ulint hdr_offset, /* in: header offset */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* header_page; + trx_ulogf_t* log_hdr; + ulint end; + + header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr); + + log_hdr = header_page + hdr_offset; + + end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset); + + mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr); +} + +/*************************************************************************** +Truncates an undo log from the end. This function is used during a rollback +to free space from an undo log. 
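+Records with undo numbers >= limit are cut off from the end: whole +pages are freed as they become empty, and on the last remaining page +only the TRX_UNDO_PAGE_FREE offset is moved back.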
*/ +UNIV_INTERN +void +trx_undo_truncate_end( +/*==================*/ + trx_t* trx, /* in: transaction whose undo log it is */ + trx_undo_t* undo, /* in: undo log */ + dulint limit) /* in: all undo records with undo number + >= this value should be truncated */ +{ + page_t* undo_page; + ulint last_page_no; + trx_undo_rec_t* rec; + trx_undo_rec_t* trunc_here; + trx_rseg_t* rseg; + mtr_t mtr; + + ut_ad(mutex_own(&(trx->undo_mutex))); + ut_ad(mutex_own(&(trx->rseg->mutex))); + + rseg = trx->rseg; + + for (;;) { + mtr_start(&mtr); + + trunc_here = NULL; + + last_page_no = undo->last_page_no; + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + last_page_no, &mtr); + + rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no, + undo->hdr_offset); + for (;;) { + if (rec == NULL) { + if (last_page_no == undo->hdr_page_no) { + + goto function_exit; + } + + trx_undo_free_page_in_rollback( + trx, undo, last_page_no, &mtr); + break; + } + + if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit) + >= 0) { + /* Truncate at least this record off, maybe + more */ + trunc_here = rec; + } else { + goto function_exit; + } + + rec = trx_undo_page_get_prev_rec(rec, + undo->hdr_page_no, + undo->hdr_offset); + } + + mtr_commit(&mtr); + } + +function_exit: + if (trunc_here) { + mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_FREE, + trunc_here - undo_page, MLOG_2BYTES, &mtr); + } + + mtr_commit(&mtr); +} + +/*************************************************************************** +Truncates an undo log from the start. This function is used during a purge +operation. */ +UNIV_INTERN +void +trx_undo_truncate_start( +/*====================*/ + trx_rseg_t* rseg, /* in: rollback segment */ + ulint space, /* in: space id of the log */ + ulint hdr_page_no, /* in: header page number */ + ulint hdr_offset, /* in: header offset on the page */ + dulint limit) /* in: all undo pages with undo numbers < + this value should be truncated; NOTE that + the function only frees whole pages; the + header page is not freed, but emptied, if + all the records there are < limit */ +{ + page_t* undo_page; + trx_undo_rec_t* rec; + trx_undo_rec_t* last_rec; + ulint page_no; + mtr_t mtr; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (ut_dulint_is_zero(limit)) { + + return; + } +loop: + mtr_start(&mtr); + + rec = trx_undo_get_first_rec(space, rseg->zip_size, + hdr_page_no, hdr_offset, + RW_X_LATCH, &mtr); + if (rec == NULL) { + /* Already empty */ + + mtr_commit(&mtr); + + return; + } + + undo_page = page_align(rec); + + last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no, + hdr_offset); + if (ut_dulint_cmp(trx_undo_rec_get_undo_no(last_rec), limit) >= 0) { + + mtr_commit(&mtr); + + return; + } + + page_no = page_get_page_no(undo_page); + + if (page_no == hdr_page_no) { + trx_undo_empty_header_page(space, rseg->zip_size, + hdr_page_no, hdr_offset, + &mtr); + } else { + trx_undo_free_page(rseg, TRUE, space, hdr_page_no, + page_no, &mtr); + } + + mtr_commit(&mtr); + + goto loop; +} + +/************************************************************************** +Frees an undo log segment which is not in the history list. 
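+The file segment is released with repeated fseg_free_step() calls, each +in its own mini-transaction, so that the rollback segment mutex is +never held across a long file operation.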
*/ +static +void +trx_undo_seg_free( +/*==============*/ + trx_undo_t* undo) /* in: undo log */ +{ + trx_rseg_t* rseg; + fseg_header_t* file_seg; + trx_rsegf_t* rseg_header; + trx_usegf_t* seg_header; + ibool finished; + mtr_t mtr; + + rseg = undo->rseg; + + do { + + mtr_start(&mtr); + + ut_ad(!mutex_own(&kernel_mutex)); + + mutex_enter(&(rseg->mutex)); + + seg_header = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, + &mtr) + TRX_UNDO_SEG_HDR; + + file_seg = seg_header + TRX_UNDO_FSEG_HEADER; + + finished = fseg_free_step(file_seg, &mtr); + + if (finished) { + /* Update the rseg header */ + rseg_header = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, + &mtr); + trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, + &mtr); + } + + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + } while (!finished); +} + +/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/ + +/************************************************************************ +Creates and initializes an undo log memory object according to the values +in the header in file, when the database is started. The memory object is +inserted in the appropriate list of rseg. */ +static +trx_undo_t* +trx_undo_mem_create_at_db_start( +/*============================*/ + /* out, own: the undo log memory object */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint id, /* in: slot index within rseg */ + ulint page_no,/* in: undo log segment page number */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* undo_page; + trx_upagef_t* page_header; + trx_usegf_t* seg_header; + trx_ulogf_t* undo_header; + trx_undo_t* undo; + ulint type; + ulint state; + dulint trx_id; + ulint offset; + fil_addr_t last_addr; + page_t* last_page; + trx_undo_rec_t* rec; + XID xid; + ibool xid_exists = FALSE; + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo_page = trx_undo_page_get(rseg->space, rseg->zip_size, + page_no, mtr); + + page_header = undo_page + TRX_UNDO_PAGE_HDR; + + type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES, + mtr); + seg_header = undo_page + TRX_UNDO_SEG_HDR; + + state = mach_read_from_2(seg_header + TRX_UNDO_STATE); + + offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG); + + undo_header = undo_page + offset; + + trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr); + + xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS, + MLOG_1BYTE, mtr); + + /* Read X/Open XA transaction identification if it exists, or + set it to NULL. */ + + memset(&xid, 0, sizeof(xid)); + xid.formatID = -1; + + if (xid_exists == TRUE) { + trx_undo_read_xid(undo_header, &xid); + } + + mutex_enter(&(rseg->mutex)); + + undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid, + page_no, offset); + mutex_exit(&(rseg->mutex)); + + undo->dict_operation = mtr_read_ulint( + undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr); + + undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr); + undo->state = state; + undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + /* If the log segment is being freed, the page list is inconsistent! 
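+We must not read its last page in that case; skip straight to adding +the memory object to the appropriate rseg list.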
*/ + if (state == TRX_UNDO_TO_FREE) { + + goto add_to_list; + } + + last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr); + + undo->last_page_no = last_addr.page; + undo->top_page_no = last_addr.page; + + last_page = trx_undo_page_get(rseg->space, rseg->zip_size, + undo->last_page_no, mtr); + + rec = trx_undo_page_get_last_rec(last_page, page_no, offset); + + if (rec == NULL) { + undo->empty = TRUE; + } else { + undo->empty = FALSE; + undo->top_offset = rec - last_page; + undo->top_undo_no = trx_undo_rec_get_undo_no(rec); + } +add_to_list: + if (type == TRX_UNDO_INSERT) { + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached, + undo); + } + } else { + ut_ad(type == TRX_UNDO_UPDATE); + if (state != TRX_UNDO_CACHED) { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list, + undo); + } else { + UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached, + undo); + } + } + + return(undo); +} + +/************************************************************************ +Initializes the undo log lists for a rollback segment memory copy. This +function is only called when the database is started or a new rollback +segment is created. */ +UNIV_INTERN +ulint +trx_undo_lists_init( +/*================*/ + /* out: the combined size of undo log segments + in pages */ + trx_rseg_t* rseg) /* in: rollback segment memory object */ +{ + ulint page_no; + trx_undo_t* undo; + ulint size = 0; + trx_rsegf_t* rseg_header; + ulint i; + mtr_t mtr; + + UT_LIST_INIT(rseg->update_undo_list); + UT_LIST_INIT(rseg->update_undo_cached); + UT_LIST_INIT(rseg->insert_undo_list); + UT_LIST_INIT(rseg->insert_undo_cached); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size, + rseg->page_no, &mtr); + + if (!srv_extra_undoslots) { + /* read the slot directly to avoid the assertion failure + in trx_rsegf_get_nth_undo() */ + //page_no = trx_rsegf_get_nth_undo(rseg_header, TRX_RSEG_N_EXTRA_SLOTS - 1, &mtr); + page_no = mtr_read_ulint(rseg_header + TRX_RSEG_UNDO_SLOTS + + (TRX_RSEG_N_EXTRA_SLOTS - 1) * TRX_RSEG_SLOT_SIZE, + MLOG_4BYTES, &mtr); + if (page_no != 0) { + /* check that the extended slots are not in use */ + for (i = TRX_RSEG_N_SLOTS; i < TRX_RSEG_N_EXTRA_SLOTS; i++) { + /* read the slot directly to avoid the assertion failure */ + page_no = mtr_read_ulint(rseg_header + TRX_RSEG_UNDO_SLOTS + + i * TRX_RSEG_SLOT_SIZE, + MLOG_4BYTES, &mtr); + if (page_no != FIL_NULL) { + srv_extra_undoslots = TRUE; + fprintf(stderr, +"InnoDB: Error: the innodb_extra_undoslots option is disabled, but it was enabled before.\n" +"InnoDB: This datafile cannot be used by a mysqld with innodb_extra_undoslots disabled.\n" +"InnoDB: Enable innodb_extra_undoslots if it was enabled before, and\n" +"InnoDB: ### do not use this datafile with another mysqld or with ibbackup! ###\n" +"InnoDB: Cannot continue operation safely. 
Calling exit(1).\n"); + exit(1); + } + } + fprintf(stderr, +"InnoDB: Warning: the innodb_extra_undoslots option is disabled, but it was enabled before.\n" +"InnoDB: The extended undo slots appear to be unused, so operation will continue.\n"); + } + } + + for (i = 0; i < TRX_RSEG_N_SLOTS; i++) { + page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr); + + /* If the slot was not initialized when the datafile was + created, page_no will be 0 for the extended slots added + after that */ + + if (page_no == 0) { + page_no = FIL_NULL; + trx_rsegf_set_nth_undo(rseg_header, i, page_no, &mtr); + } + + /* In forced recovery: try to avoid operations which look + at database pages; undo logs are rapidly changing data, and + the probability that they are in an inconsistent state is + high */ + + if (page_no != FIL_NULL + && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) { + + undo = trx_undo_mem_create_at_db_start(rseg, i, + page_no, &mtr); + size += undo->size; + + mtr_commit(&mtr); + + mtr_start(&mtr); + + rseg_header = trx_rsegf_get( + rseg->space, rseg->zip_size, rseg->page_no, + &mtr); + } + } + + mtr_commit(&mtr); + + return(size); +} + +/************************************************************************ +Creates and initializes an undo log memory object. */ +static +trx_undo_t* +trx_undo_mem_create( +/*================*/ + /* out, own: the undo log memory object */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint id, /* in: slot index within rseg */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + const XID* xid, /* in: X/Open transaction identification */ + ulint page_no,/* in: undo log header page number */ + ulint offset) /* in: undo log header byte offset on page */ +{ + trx_undo_t* undo; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) id); + ut_error; + } + + undo = mem_alloc(sizeof(trx_undo_t)); + + if (undo == NULL) { + + return(NULL); + } + + undo->id = id; + undo->type = type; + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->rseg = rseg; + + undo->space = rseg->space; + undo->zip_size = rseg->zip_size; + undo->hdr_page_no = page_no; + undo->hdr_offset = offset; + undo->last_page_no = page_no; + undo->size = 1; + + undo->empty = TRUE; + undo->top_page_no = page_no; + undo->guess_block = NULL; + + return(undo); +} + +/************************************************************************ +Initializes a cached undo log object for new use. */ +static +void +trx_undo_mem_init_for_reuse( +/*========================*/ + trx_undo_t* undo, /* in: undo log to init */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + const XID* xid, /* in: X/Open XA transaction identification*/ + ulint offset) /* in: undo log header byte offset on page */ +{ + ut_ad(mutex_own(&((undo->rseg)->mutex))); + + if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + + mem_analyze_corruption(undo); + ut_error; + } + + undo->state = TRX_UNDO_ACTIVE; + undo->del_marks = FALSE; + undo->trx_id = trx_id; + undo->xid = *xid; + + undo->dict_operation = FALSE; + + undo->hdr_offset = offset; + undo->empty = TRUE; +} + +/************************************************************************ +Frees an undo log memory copy. 
*/ +static +void +trx_undo_mem_free( +/*==============*/ + trx_undo_t* undo) /* in: the undo object to be freed */ +{ + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, + "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id); + ut_error; + } + + mem_free(undo); +} + +/************************************************************************** +Creates a new undo log. */ +static +ulint +trx_undo_create( +/*============*/ + /* out: DB_SUCCESS if successful in creating + the new undo log object, possible error + codes are: + DB_TOO_MANY_CONCURRENT_TRXS + DB_OUT_OF_FILE_SPACE + DB_OUT_OF_MEMORY */ + trx_t* trx, /* in: transaction */ + trx_rseg_t* rseg, /* in: rollback segment memory copy */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is created */ + const XID* xid, /* in: X/Open transaction identification*/ + trx_undo_t** undo, /* out: the new undo log object, undefined + if it did not succeed */ + mtr_t* mtr) /* in: mtr */ +{ + trx_rsegf_t* rseg_header; + ulint page_no; + ulint offset; + ulint id; + page_t* undo_page; + ulint err; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (rseg->curr_size == rseg->max_size) { + + return(DB_OUT_OF_FILE_SPACE); + } + + rseg->curr_size++; + + rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, rseg->page_no, + mtr); + + err = trx_undo_seg_create(rseg, rseg_header, type, &id, + &undo_page, mtr); + + if (err != DB_SUCCESS) { + /* Did not succeed */ + + rseg->curr_size--; + + return(err); + } + + page_no = page_get_page_no(undo_page); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid(undo_page, + undo_page + offset, mtr); + } + + *undo = trx_undo_mem_create(rseg, id, type, trx_id, xid, + page_no, offset); + if (*undo == NULL) { + + err = DB_OUT_OF_MEMORY; + } + + return(err); +} + +/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/ + +/************************************************************************ +Reuses a cached undo log. 
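+Takes the first log from the rseg's cached list of the requested type, +reinitializes its header page in place (reusing the insert header or +creating a new update header), and resets the memory object for the new +transaction.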
*/ +static +trx_undo_t* +trx_undo_reuse_cached( +/*==================*/ + /* out: the undo log memory object, NULL if + none cached */ + trx_t* trx, /* in: transaction */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + ulint type, /* in: type of the log: TRX_UNDO_INSERT or + TRX_UNDO_UPDATE */ + dulint trx_id, /* in: id of the trx for which the undo log + is used */ + const XID* xid, /* in: X/Open XA transaction identification */ + mtr_t* mtr) /* in: mtr */ +{ + trx_undo_t* undo; + page_t* undo_page; + ulint offset; + + ut_ad(mutex_own(&(rseg->mutex))); + + if (type == TRX_UNDO_INSERT) { + + undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo); + } else { + ut_ad(type == TRX_UNDO_UPDATE); + + undo = UT_LIST_GET_FIRST(rseg->update_undo_cached); + if (undo == NULL) { + + return(NULL); + } + + UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo); + } + + ut_ad(undo->size == 1); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + if (type == TRX_UNDO_INSERT) { + offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid( + undo_page, undo_page + offset, mtr); + } + } else { + ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + + TRX_UNDO_PAGE_TYPE) + == TRX_UNDO_UPDATE); + + offset = trx_undo_header_create(undo_page, trx_id, mtr); + + if (trx->support_xa) { + trx_undo_header_add_space_for_xid( + undo_page, undo_page + offset, mtr); + } + } + + trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset); + + return(undo); +} + +/************************************************************************** +Marks an undo log header as a header of a data dictionary operation +transaction. */ +static +void +trx_undo_mark_as_dict_operation( +/*============================*/ + trx_t* trx, /* in: dict op transaction */ + trx_undo_t* undo, /* in: assigned undo log */ + mtr_t* mtr) /* in: mtr */ +{ + page_t* hdr_page; + + hdr_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + ut_error; + case TRX_DICT_OP_INDEX: + /* Do not discard the table on recovery. */ + undo->table_id = ut_dulint_zero; + break; + case TRX_DICT_OP_TABLE: + undo->table_id = trx->table_id; + break; + } + + mlog_write_ulint(hdr_page + undo->hdr_offset + + TRX_UNDO_DICT_TRANS, + TRUE, MLOG_1BYTE, mtr); + + mlog_write_dulint(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID, + undo->table_id, mtr); + + undo->dict_operation = TRUE; +} + +/************************************************************************** +Assigns an undo log for a transaction. A new undo log is created or a cached +undo log reused. 
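+ +A sketch of a typical call site (the surrounding code here is +illustrative, not part of this file): before writing the first undo +record, a caller that holds trx->undo_mutex does something like + + if (trx->insert_undo == NULL) { + err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT); + } + +and then checks err for DB_TOO_MANY_CONCURRENT_TRXS or +DB_OUT_OF_FILE_SPACE before writing the record.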
*/ +UNIV_INTERN +ulint +trx_undo_assign_undo( +/*=================*/ + /* out: DB_SUCCESS if undo log assign + successful, possible error codes are: + DB_TOO_MANY_CONCURRENT_TRXS + DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY*/ + trx_t* trx, /* in: transaction */ + ulint type) /* in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + mtr_t mtr; + ulint err = DB_SUCCESS; + + ut_ad(trx); + ut_ad(trx->rseg); + + rseg = trx->rseg; + + ut_ad(mutex_own(&(trx->undo_mutex))); + + mtr_start(&mtr); + + ut_ad(!mutex_own(&kernel_mutex)); + + mutex_enter(&(rseg->mutex)); + + undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid, + &mtr); + if (undo == NULL) { + err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid, + &undo, &mtr); + if (err != DB_SUCCESS) { + + goto func_exit; + } + } + + if (type == TRX_UNDO_INSERT) { + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo); + ut_ad(trx->insert_undo == NULL); + trx->insert_undo = undo; + } else { + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo); + ut_ad(trx->update_undo == NULL); + trx->update_undo = undo; + } + + if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) { + trx_undo_mark_as_dict_operation(trx, undo, &mtr); + } + +func_exit: + mutex_exit(&(rseg->mutex)); + mtr_commit(&mtr); + + return err; +} + +/********************************************************************** +Sets the state of the undo log segment at a transaction finish. */ +UNIV_INTERN +page_t* +trx_undo_set_state_at_finish( +/*=========================*/ + /* out: undo log segment header page, + x-latched */ + trx_rseg_t* rseg, /* in: rollback segment memory object */ + trx_t* trx __attribute__((unused)), /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + page_t* undo_page; + ulint state; + + ut_ad(trx); + ut_ad(undo); + ut_ad(mtr); + ut_ad(mutex_own(&rseg->mutex)); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + if (undo->size == 1 + && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE) + < TRX_UNDO_PAGE_REUSE_LIMIT) { + + /* This is a heuristic to avoid the problem of all UNDO + slots ending up in one of the UNDO lists. Previously if + the server crashed with all the slots in one of the lists, + transactions that required the slots of a different type + would fail for lack of slots. */ + + if (UT_LIST_GET_LEN(rseg->update_undo_list) < 500 + && UT_LIST_GET_LEN(rseg->insert_undo_list) < 500) { + + state = TRX_UNDO_CACHED; + } else { + state = TRX_UNDO_TO_FREE; + } + + } else if (undo->type == TRX_UNDO_INSERT) { + + state = TRX_UNDO_TO_FREE; + } else { + state = TRX_UNDO_TO_PURGE; + } + + undo->state = state; + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr); + + return(undo_page); +} + +/********************************************************************** +Sets the state of the undo log segment at a transaction prepare. 
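+The segment state becomes TRX_UNDO_PREPARED and the transaction's XID +is written to the log header, so that trx_recover_for_mysql() can find +the transaction again after a crash.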
*/ +UNIV_INTERN +page_t* +trx_undo_set_state_at_prepare( +/*==========================*/ + /* out: undo log segment header page, + x-latched */ + trx_t* trx, /* in: transaction */ + trx_undo_t* undo, /* in: undo log memory copy */ + mtr_t* mtr) /* in: mtr */ +{ + trx_usegf_t* seg_hdr; + trx_upagef_t* page_hdr; + trx_ulogf_t* undo_header; + page_t* undo_page; + ulint offset; + + ut_ad(trx && undo && mtr); + + if (undo->id >= TRX_RSEG_N_SLOTS) { + fprintf(stderr, "InnoDB: Error: undo->id is %lu\n", + (ulong) undo->id); + mem_analyze_corruption(undo); + ut_error; + } + + undo_page = trx_undo_page_get(undo->space, undo->zip_size, + undo->hdr_page_no, mtr); + + seg_hdr = undo_page + TRX_UNDO_SEG_HDR; + page_hdr = undo_page + TRX_UNDO_PAGE_HDR; + + /*------------------------------*/ + undo->state = TRX_UNDO_PREPARED; + undo->xid = trx->xid; + /*------------------------------*/ + + mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state, + MLOG_2BYTES, mtr); + + offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG); + undo_header = undo_page + offset; + + mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS, + TRUE, MLOG_1BYTE, mtr); + + trx_undo_write_xid(undo_header, &undo->xid, mtr); + + return(undo_page); +} + +/************************************************************************** +Adds the update undo log header as the first in the history list, and +frees the memory object, or puts it to the list of cached update undo log +segments. */ +UNIV_INTERN +void +trx_undo_update_cleanup( +/*====================*/ + trx_t* trx, /* in: trx owning the update undo log */ + page_t* undo_page, /* in: update undo log header page, + x-latched */ + mtr_t* mtr) /* in: mtr */ +{ + trx_rseg_t* rseg; + trx_undo_t* undo; + + undo = trx->update_undo; + rseg = trx->rseg; + + ut_ad(mutex_own(&(rseg->mutex))); + + trx_purge_add_update_undo_to_history(trx, undo_page, mtr); + + UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo); + + trx->update_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo); + } else { + ut_ad(undo->state == TRX_UNDO_TO_PURGE); + + trx_undo_mem_free(undo); + } +} + +/********************************************************************** +Frees or caches an insert undo log after a transaction commit or rollback. +Knowledge of inserts is not needed after a commit or rollback, therefore +the data can be discarded. */ +UNIV_INTERN +void +trx_undo_insert_cleanup( +/*====================*/ + trx_t* trx) /* in: transaction handle */ +{ + trx_undo_t* undo; + trx_rseg_t* rseg; + + undo = trx->insert_undo; + ut_ad(undo); + + rseg = trx->rseg; + + mutex_enter(&(rseg->mutex)); + + UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo); + trx->insert_undo = NULL; + + if (undo->state == TRX_UNDO_CACHED) { + + UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo); + } else { + ut_ad(undo->state == TRX_UNDO_TO_FREE); + + /* Delete first the undo log segment in the file */ + + mutex_exit(&(rseg->mutex)); + + trx_undo_seg_free(undo); + + mutex_enter(&(rseg->mutex)); + + ut_ad(rseg->curr_size > undo->size); + + rseg->curr_size -= undo->size; + + trx_undo_mem_free(undo); + } + + mutex_exit(&(rseg->mutex)); +} diff --git a/storage/xtradb/usr/usr0sess.c b/storage/xtradb/usr/usr0sess.c new file mode 100644 index 00000000000..f45c43869ea --- /dev/null +++ b/storage/xtradb/usr/usr0sess.c @@ -0,0 +1,97 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Innobase Oy. 
All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +Sessions + +Created 6/25/1996 Heikki Tuuri +*******************************************************/ + +#include "usr0sess.h" + +#ifdef UNIV_NONINL +#include "usr0sess.ic" +#endif + +#include "trx0trx.h" + +/************************************************************************* +Closes a session, freeing the memory occupied by it. */ +static +void +sess_close( +/*=======*/ + sess_t* sess); /* in, own: session object */ + +/************************************************************************* +Opens a session. */ +UNIV_INTERN +sess_t* +sess_open(void) +/*===========*/ + /* out, own: session object */ +{ + sess_t* sess; + + ut_ad(mutex_own(&kernel_mutex)); + + sess = mem_alloc(sizeof(sess_t)); + + sess->state = SESS_ACTIVE; + + sess->trx = trx_create(sess); + + UT_LIST_INIT(sess->graphs); + + return(sess); +} + +/************************************************************************* +Closes a session, freeing the memory occupied by it. */ +static +void +sess_close( +/*=======*/ + sess_t* sess) /* in, own: session object */ +{ + ut_ad(mutex_own(&kernel_mutex)); + ut_ad(sess->trx == NULL); + + mem_free(sess); +} + +/************************************************************************* +Closes a session, freeing the memory occupied by it, if it is in a state +where it should be closed. */ +UNIV_INTERN +ibool +sess_try_close( +/*===========*/ + /* out: TRUE if closed */ + sess_t* sess) /* in, own: session object */ +{ + ut_ad(mutex_own(&kernel_mutex)); + + if (UT_LIST_GET_LEN(sess->graphs) == 0) { + sess_close(sess); + + return(TRUE); + } + + return(FALSE); +} diff --git a/storage/xtradb/ut/ut0auxconf.c b/storage/xtradb/ut/ut0auxconf.c new file mode 100644 index 00000000000..fd9433d16f6 --- /dev/null +++ b/storage/xtradb/ut/ut0auxconf.c @@ -0,0 +1,13 @@ +#include <pthread.h> + +int +main(int argc, char** argv) +{ + pthread_t x1; + pthread_t x2; + pthread_t x3; + + __sync_bool_compare_and_swap(&x1, x2, x3); + + return(0); +} diff --git a/storage/xtradb/ut/ut0byte.c b/storage/xtradb/ut/ut0byte.c new file mode 100644 index 00000000000..5e11e37d0b6 --- /dev/null +++ b/storage/xtradb/ut/ut0byte.c @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************* +Byte utilities + +Created 5/11/1994 Heikki Tuuri +********************************************************************/ + +#include "ut0byte.h" + +#ifdef UNIV_NONINL +#include "ut0byte.ic" +#endif + +/* Zero value for a dulint */ +UNIV_INTERN const dulint ut_dulint_zero = {0, 0}; + +/* Maximum value for a dulint */ +UNIV_INTERN const dulint ut_dulint_max = {0xFFFFFFFFUL, 0xFFFFFFFFUL}; + +#ifdef notdefined /* unused code */ +#include "ut0sort.h" + +/**************************************************************** +Sort function for dulint arrays. */ +UNIV_INTERN +void +ut_dulint_sort(dulint* arr, dulint* aux_arr, ulint low, ulint high) +/*===============================================================*/ +{ + UT_SORT_FUNCTION_BODY(ut_dulint_sort, arr, aux_arr, low, high, + ut_dulint_cmp); +} +#endif /* notdefined */ diff --git a/storage/xtradb/ut/ut0dbg.c b/storage/xtradb/ut/ut0dbg.c new file mode 100644 index 00000000000..55dd457a177 --- /dev/null +++ b/storage/xtradb/ut/ut0dbg.c @@ -0,0 +1,180 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/********************************************************************* +Debug utilities for Innobase. + +Created 1/30/1994 Heikki Tuuri +**********************************************************************/ + +#include "univ.i" +#include "ut0dbg.h" + +#if defined(__GNUC__) && (__GNUC__ > 2) +#else +/* This is used to eliminate compiler warnings */ +UNIV_INTERN ulint ut_dbg_zero = 0; +#endif + +#if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT) +/* If this is set to TRUE all threads will stop into the next assertion +and assert */ +UNIV_INTERN ibool ut_dbg_stop_threads = FALSE; +#endif +#ifdef __NETWARE__ +/* This is set to TRUE when on NetWare there happens an InnoDB +assertion failure or other fatal error condition that requires an +immediate shutdown. */ +UNIV_INTERN ibool panic_shutdown = FALSE; +#elif !defined(UT_DBG_USE_ABORT) +/* Null pointer used to generate memory trap */ +UNIV_INTERN ulint* ut_dbg_null_ptr = NULL; +#endif + +/***************************************************************** +Report a failed assertion. 
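+ +This function is normally reached through the assertion macros in +ut0dbg.h rather than being called directly; a sketch of the usual usage: + + ut_a(block->magic_n == UT_MEM_MAGIC_N); -- checked in all builds + ut_ad(mutex_own(&rseg->mutex)); -- debug builds only + +On failure the macro prints the failing expression, file and line through +this function and then traps.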
*/ +UNIV_INTERN +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /* in: the failed assertion (optional) */ + const char* file, /* in: source file containing the assertion */ + ulint line) /* in: line number of the assertion */ +{ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Assertion failure in thread %lu" + " in file %s line %lu\n", + os_thread_pf(os_thread_get_curr_id()), file, line); + if (expr) { + fprintf(stderr, + "InnoDB: Failing assertion: %s\n", expr); + } + + fputs("InnoDB: We intentionally generate a memory trap.\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com.\n" + "InnoDB: If you get repeated assertion failures" + " or crashes, even\n" + "InnoDB: immediately after the mysqld startup, there may be\n" + "InnoDB: corruption in the InnoDB tablespace. Please refer to\n" + "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/" + "forcing-recovery.html\n" + "InnoDB: about forcing recovery.\n", stderr); +#if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT) + ut_dbg_stop_threads = TRUE; +#endif +} + +#ifdef __NETWARE__ +/***************************************************************** +Shut down MySQL/InnoDB after assertion failure. */ +UNIV_INTERN +void +ut_dbg_panic(void) +/*==============*/ +{ + if (!panic_shutdown) { + panic_shutdown = TRUE; + innobase_shutdown_for_mysql(); + } + exit(1); +} +#else /* __NETWARE__ */ +# if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT) +/***************************************************************** +Stop a thread after assertion failure. */ +UNIV_INTERN +void +ut_dbg_stop_thread( +/*===============*/ + const char* file, + ulint line) +{ + fprintf(stderr, "InnoDB: Thread %lu stopped in file %s line %lu\n", + os_thread_pf(os_thread_get_curr_id()), file, line); + os_thread_sleep(1000000000); +} +# endif +#endif /* __NETWARE__ */ + +#ifdef UNIV_COMPILE_TEST_FUNCS + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +#include <unistd.h> + +#ifndef timersub +#define timersub(a, b, r) \ + do { \ + (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((r)->tv_usec < 0) { \ + (r)->tv_sec--; \ + (r)->tv_usec += 1000000; \ + } \ + } while (0) +#endif /* timersub */ + +/*********************************************************************** +Resets a speedo (records the current time in it). */ +UNIV_INTERN +void +speedo_reset( +/*=========*/ + speedo_t* speedo) /* out: speedo */ +{ + gettimeofday(&speedo->tv, NULL); + + getrusage(RUSAGE_SELF, &speedo->ru); +} + +/*********************************************************************** +Shows the time elapsed and usage statistics since the last reset of a +speedo.
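+ +Typical usage of the speedo pair, as a sketch (this code is only built +under UNIV_COMPILE_TEST_FUNCS; the workload function is hypothetical): + + speedo_t speedo; + + speedo_reset(&speedo); + run_the_code_being_measured(); + speedo_show(&speedo); + +This prints the real, user and system time spent in between.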
*/ +UNIV_INTERN +void +speedo_show( +/*========*/ + const speedo_t* speedo) /* in: speedo */ +{ + struct rusage ru_now; + struct timeval tv_now; + struct timeval tv_diff; + + getrusage(RUSAGE_SELF, &ru_now); + + gettimeofday(&tv_now, NULL); + +#define PRINT_TIMEVAL(prefix, tvp) \ + fprintf(stderr, "%s% 5ld.%06ld sec\n", \ + prefix, (tvp)->tv_sec, (tvp)->tv_usec) + + timersub(&tv_now, &speedo->tv, &tv_diff); + PRINT_TIMEVAL("real", &tv_diff); + + timersub(&ru_now.ru_utime, &speedo->ru.ru_utime, &tv_diff); + PRINT_TIMEVAL("user", &tv_diff); + + timersub(&ru_now.ru_stime, &speedo->ru.ru_stime, &tv_diff); + PRINT_TIMEVAL("sys ", &tv_diff); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/ut/ut0list.c b/storage/xtradb/ut/ut0list.c new file mode 100644 index 00000000000..c6250edb6cd --- /dev/null +++ b/storage/xtradb/ut/ut0list.c @@ -0,0 +1,187 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +#include "ut0list.h" +#ifdef UNIV_NONINL +#include "ut0list.ic" +#endif + +/******************************************************************** +Create a new list. */ +UNIV_INTERN +ib_list_t* +ib_list_create(void) +/*=================*/ + /* out: list */ +{ + ib_list_t* list = mem_alloc(sizeof(ib_list_t)); + + list->first = NULL; + list->last = NULL; + list->is_heap_list = FALSE; + + return(list); +} + +/******************************************************************** +Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for +lists created with this function. */ +UNIV_INTERN +ib_list_t* +ib_list_create_heap( +/*================*/ + /* out: list */ + mem_heap_t* heap) /* in: memory heap to use */ +{ + ib_list_t* list = mem_heap_alloc(heap, sizeof(ib_list_t)); + + list->first = NULL; + list->last = NULL; + list->is_heap_list = TRUE; + + return(list); +} + +/******************************************************************** +Free a list. */ +UNIV_INTERN +void +ib_list_free( +/*=========*/ + ib_list_t* list) /* in: list */ +{ + ut_a(!list->is_heap_list); + + /* We don't check that the list is empty because it's entirely valid + to e.g. have all the nodes allocated from a single heap that is then + freed after the list itself is freed. */ + + mem_free(list); +} + +/******************************************************************** +Add the data to the start of the list. 
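+ +List nodes are allocated from the caller-supplied heap and are never +freed by the list itself. A minimal sketch (the item and heap size are +illustrative): + + ib_list_t* list = ib_list_create(); + mem_heap_t* heap = mem_heap_create(128); + + ib_list_add_first(list, item, heap);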
*/ +UNIV_INTERN +ib_list_node_t* +ib_list_add_first( +/*==============*/ + /* out: new list node*/ + ib_list_t* list, /* in: list */ + void* data, /* in: data */ + mem_heap_t* heap) /* in: memory heap to use */ +{ + return(ib_list_add_after(list, ib_list_get_first(list), data, heap)); +} + +/******************************************************************** +Add the data to the end of the list. */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_last( +/*=============*/ + /* out: new list node*/ + ib_list_t* list, /* in: list */ + void* data, /* in: data */ + mem_heap_t* heap) /* in: memory heap to use */ +{ + return(ib_list_add_after(list, ib_list_get_last(list), data, heap)); +} + +/******************************************************************** +Add the data after the indicated node. */ +UNIV_INTERN +ib_list_node_t* +ib_list_add_after( +/*==============*/ + /* out: new list node*/ + ib_list_t* list, /* in: list */ + ib_list_node_t* prev_node, /* in: node preceding new node (can + be NULL) */ + void* data, /* in: data */ + mem_heap_t* heap) /* in: memory heap to use */ +{ + ib_list_node_t* node = mem_heap_alloc(heap, sizeof(ib_list_node_t)); + + node->data = data; + + if (!list->first) { + /* Empty list. */ + + ut_a(!prev_node); + + node->prev = NULL; + node->next = NULL; + + list->first = node; + list->last = node; + } else if (!prev_node) { + /* Start of list. */ + + node->prev = NULL; + node->next = list->first; + + list->first->prev = node; + + list->first = node; + } else { + /* Middle or end of list. */ + + node->prev = prev_node; + node->next = prev_node->next; + + prev_node->next = node; + + if (node->next) { + node->next->prev = node; + } else { + list->last = node; + } + } + + return(node); +} + +/******************************************************************** +Remove the node from the list. */ +UNIV_INTERN +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /* in: list */ + ib_list_node_t* node) /* in: node to remove */ +{ + if (node->prev) { + node->prev->next = node->next; + } else { + /* First item in list. */ + + ut_ad(list->first == node); + + list->first = node->next; + } + + if (node->next) { + node->next->prev = node->prev; + } else { + /* Last item in list. */ + + ut_ad(list->last == node); + + list->last = node->prev; + } +} diff --git a/storage/xtradb/ut/ut0mem.c b/storage/xtradb/ut/ut0mem.c new file mode 100644 index 00000000000..c87a6a4b57e --- /dev/null +++ b/storage/xtradb/ut/ut0mem.c @@ -0,0 +1,673 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/************************************************************************ +Memory primitives + +Created 5/11/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0mem.h" + +#ifdef UNIV_NONINL +#include "ut0mem.ic" +#endif + +#include "mem0mem.h" +#include "os0thread.h" +#include "srv0srv.h" + +#include <stdlib.h> + +/* This struct is placed first in every allocated memory block */ +typedef struct ut_mem_block_struct ut_mem_block_t; + +/* The total amount of memory currently allocated from the operating +system with os_mem_alloc_large() or malloc(). Does not count malloc() +if srv_use_sys_malloc is set. Protected by ut_list_mutex. */ +UNIV_INTERN ulint ut_total_allocated_memory = 0; + +/* Mutex protecting ut_total_allocated_memory and ut_mem_block_list */ +UNIV_INTERN os_fast_mutex_t ut_list_mutex; + +struct ut_mem_block_struct{ + UT_LIST_NODE_T(ut_mem_block_t) mem_block_list; + /* mem block list node */ + ulint size; /* size of allocated memory */ + ulint magic_n; +}; + +#define UT_MEM_MAGIC_N 1601650166 + +/* List of all memory blocks allocated from the operating system +with malloc. Protected by ut_list_mutex. */ +static UT_LIST_BASE_NODE_T(ut_mem_block_t) ut_mem_block_list; + +static ibool ut_mem_block_list_inited = FALSE; + +static ulint* ut_mem_null_ptr = NULL; + +/************************************************************************** +Initializes the mem block list at database startup. */ +UNIV_INTERN +void +ut_mem_init(void) +/*=============*/ +{ + ut_a(!ut_mem_block_list_inited); + os_fast_mutex_init(&ut_list_mutex); + UT_LIST_INIT(ut_mem_block_list); + ut_mem_block_list_inited = TRUE; +} + +/************************************************************************** +Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is +defined and set_to_zero is TRUE. */ +UNIV_INTERN +void* +ut_malloc_low( +/*==========*/ + /* out, own: allocated memory */ + ulint n, /* in: number of bytes to allocate */ + ibool set_to_zero, /* in: TRUE if allocated memory should be + set to zero if UNIV_SET_MEM_TO_ZERO is + defined */ + ibool assert_on_error)/* in: if TRUE, we crash mysqld if the + memory cannot be allocated */ +{ + ulint retry_count; + void* ret; + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + ret = malloc(n); + ut_a(ret || !assert_on_error); + +#ifdef UNIV_SET_MEM_TO_ZERO + if (set_to_zero) { + memset(ret, '\0', n); + UNIV_MEM_ALLOC(ret, n); + } +#endif + return(ret); + } + + ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */ + ut_a(ut_mem_block_list_inited); + + retry_count = 0; +retry: + os_fast_mutex_lock(&ut_list_mutex); + + ret = malloc(n + sizeof(ut_mem_block_t)); + + if (ret == NULL && retry_count < 60) { + if (retry_count == 0) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: cannot allocate" + " %lu bytes of\n" + "InnoDB: memory with malloc!" + " Total allocated memory\n" + "InnoDB: by InnoDB %lu bytes."
+ " Operating system errno: %lu\n" + "InnoDB: Check if you should" + " increase the swap file or\n" + "InnoDB: ulimits of your operating system.\n" + "InnoDB: On FreeBSD check you" + " have compiled the OS with\n" + "InnoDB: a big enough maximum process size.\n" + "InnoDB: Note that in most 32-bit" + " computers the process\n" + "InnoDB: memory space is limited" + " to 2 GB or 4 GB.\n" + "InnoDB: We keep retrying" + " the allocation for 60 seconds...\n", + (ulong) n, (ulong) ut_total_allocated_memory, +#ifdef __WIN__ + (ulong) GetLastError() +#else + (ulong) errno +#endif + ); + } + + os_fast_mutex_unlock(&ut_list_mutex); + + /* Sleep for a second and retry the allocation; maybe this is + just a temporary shortage of memory */ + + os_thread_sleep(1000000); + + retry_count++; + + goto retry; + } + + if (ret == NULL) { + /* Flush stderr to make more probable that the error + message gets in the error file before we generate a seg + fault */ + + fflush(stderr); + + os_fast_mutex_unlock(&ut_list_mutex); + + /* Make an intentional seg fault so that we get a stack + trace */ + /* Intentional segfault on NetWare causes an abend. Avoid this + by graceful exit handling in ut_a(). */ +#if (!defined __NETWARE__) + if (assert_on_error) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: We now intentionally" + " generate a seg fault so that\n" + "InnoDB: on Linux we get a stack trace.\n"); + + if (*ut_mem_null_ptr) ut_mem_null_ptr = 0; + } else { + return(NULL); + } +#else + ut_a(0); +#endif + } + + if (set_to_zero) { +#ifdef UNIV_SET_MEM_TO_ZERO + memset(ret, '\0', n + sizeof(ut_mem_block_t)); +#endif + } + + UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t)); + + ((ut_mem_block_t*)ret)->size = n + sizeof(ut_mem_block_t); + ((ut_mem_block_t*)ret)->magic_n = UT_MEM_MAGIC_N; + + ut_total_allocated_memory += n + sizeof(ut_mem_block_t); + + UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list, + ((ut_mem_block_t*)ret)); + os_fast_mutex_unlock(&ut_list_mutex); + + return((void*)((byte*)ret + sizeof(ut_mem_block_t))); +} + +/************************************************************************** +Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is +defined. */ +UNIV_INTERN +void* +ut_malloc( +/*======*/ + /* out, own: allocated memory */ + ulint n) /* in: number of bytes to allocate */ +{ + return(ut_malloc_low(n, TRUE, TRUE)); +} + +/************************************************************************** +Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs +out. It cannot be used if we want to return an error message. Prints to +stderr a message if fails. */ +UNIV_INTERN +ibool +ut_test_malloc( +/*===========*/ + /* out: TRUE if succeeded */ + ulint n) /* in: try to allocate this many bytes */ +{ + void* ret; + + ret = malloc(n); + + if (ret == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: cannot allocate" + " %lu bytes of memory for\n" + "InnoDB: a BLOB with malloc! Total allocated memory\n" + "InnoDB: by InnoDB %lu bytes." 
+ " Operating system errno: %d\n" + "InnoDB: Check if you should increase" + " the swap file or\n" + "InnoDB: ulimits of your operating system.\n" + "InnoDB: On FreeBSD check you have" + " compiled the OS with\n" + "InnoDB: a big enough maximum process size.\n", + (ulong) n, + (ulong) ut_total_allocated_memory, + (int) errno); + return(FALSE); + } + + free(ret); + + return(TRUE); +} + +/************************************************************************** +Frees a memory block allocated with ut_malloc. */ +UNIV_INTERN +void +ut_free( +/*====*/ + void* ptr) /* in, own: memory block */ +{ + ut_mem_block_t* block; + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + free(ptr); + return; + } + + block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t)); + + os_fast_mutex_lock(&ut_list_mutex); + + ut_a(block->magic_n == UT_MEM_MAGIC_N); + ut_a(ut_total_allocated_memory >= block->size); + + ut_total_allocated_memory -= block->size; + + UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block); + free(block); + + os_fast_mutex_unlock(&ut_list_mutex); +} + +/************************************************************************** +Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not +use this function because the allocation functions in mem0mem.h are the +recommended ones in InnoDB. + +man realloc in Linux, 2004: + + realloc() changes the size of the memory block pointed to + by ptr to size bytes. The contents will be unchanged to + the minimum of the old and new sizes; newly allocated mem­ + ory will be uninitialized. If ptr is NULL, the call is + equivalent to malloc(size); if size is equal to zero, the + call is equivalent to free(ptr). Unless ptr is NULL, it + must have been returned by an earlier call to malloc(), + calloc() or realloc(). + +RETURN VALUE + realloc() returns a pointer to the newly allocated memory, + which is suitably aligned for any kind of variable and may + be different from ptr, or NULL if the request fails. If + size was equal to 0, either NULL or a pointer suitable to + be passed to free() is returned. If realloc() fails the + original block is left untouched - it is not freed or + moved. */ +UNIV_INTERN +void* +ut_realloc( +/*=======*/ + /* out, own: pointer to new mem block or NULL */ + void* ptr, /* in: pointer to old block or NULL */ + ulint size) /* in: desired size */ +{ + ut_mem_block_t* block; + ulint old_size; + ulint min_size; + void* new_ptr; + + if (UNIV_LIKELY(srv_use_sys_malloc)) { + return(realloc(ptr, size)); + } + + if (ptr == NULL) { + + return(ut_malloc(size)); + } + + if (size == 0) { + ut_free(ptr); + + return(NULL); + } + + block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t)); + + ut_a(block->magic_n == UT_MEM_MAGIC_N); + + old_size = block->size - sizeof(ut_mem_block_t); + + if (size < old_size) { + min_size = size; + } else { + min_size = old_size; + } + + new_ptr = ut_malloc(size); + + if (new_ptr == NULL) { + + return(NULL); + } + + /* Copy the old data from ptr */ + ut_memcpy(new_ptr, ptr, min_size); + + ut_free(ptr); + + return(new_ptr); +} + +/************************************************************************** +Frees in shutdown all allocated memory not freed yet. 
*/ +UNIV_INTERN +void +ut_free_all_mem(void) +/*=================*/ +{ + ut_mem_block_t* block; + + ut_a(ut_mem_block_list_inited); + ut_mem_block_list_inited = FALSE; + os_fast_mutex_free(&ut_list_mutex); + + while ((block = UT_LIST_GET_FIRST(ut_mem_block_list))) { + + ut_a(block->magic_n == UT_MEM_MAGIC_N); + ut_a(ut_total_allocated_memory >= block->size); + + ut_total_allocated_memory -= block->size; + + UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block); + free(block); + } + + if (ut_total_allocated_memory != 0) { + fprintf(stderr, + "InnoDB: Warning: after shutdown" + " total allocated memory is %lu\n", + (ulong) ut_total_allocated_memory); + } +} + +/************************************************************************** +Copies up to size - 1 characters from the NUL-terminated string src to +dst, NUL-terminating the result. Returns strlen(src), so truncation +occurred if the return value >= size. */ +UNIV_INTERN +ulint +ut_strlcpy( +/*=======*/ + /* out: strlen(src) */ + char* dst, /* in: destination buffer */ + const char* src, /* in: source buffer */ + ulint size) /* in: size of destination buffer */ +{ + ulint src_size = strlen(src); + + if (size != 0) { + ulint n = ut_min(src_size, size - 1); + + memcpy(dst, src, n); + dst[n] = '\0'; + } + + return(src_size); +} + +/************************************************************************** +Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last +(size - 1) bytes of src, not the first. */ +UNIV_INTERN +ulint +ut_strlcpy_rev( +/*===========*/ + /* out: strlen(src) */ + char* dst, /* in: destination buffer */ + const char* src, /* in: source buffer */ + ulint size) /* in: size of destination buffer */ +{ + ulint src_size = strlen(src); + + if (size != 0) { + ulint n = ut_min(src_size, size - 1); + + memcpy(dst, src + src_size - n, n + 1); + } + + return(src_size); +} + +/************************************************************************** +Make a quoted copy of a NUL-terminated string. Leading and trailing +quotes will not be included; only embedded quotes will be escaped. +See also ut_strlenq() and ut_memcpyq(). */ +UNIV_INTERN +char* +ut_strcpyq( +/*=======*/ + /* out: pointer to end of dest */ + char* dest, /* in: output buffer */ + char q, /* in: the quote character */ + const char* src) /* in: null-terminated string */ +{ + while (*src) { + if ((*dest++ = *src++) == q) { + *dest++ = q; + } + } + + return(dest); +} + +/************************************************************************** +Make a quoted copy of a fixed-length string. Leading and trailing +quotes will not be included; only embedded quotes will be escaped. +See also ut_strlenq() and ut_strcpyq(). */ +UNIV_INTERN +char* +ut_memcpyq( +/*=======*/ + /* out: pointer to end of dest */ + char* dest, /* in: output buffer */ + char q, /* in: the quote character */ + const char* src, /* in: string to be quoted */ + ulint len) /* in: length of src */ +{ + const char* srcend = src + len; + + while (src < srcend) { + if ((*dest++ = *src++) == q) { + *dest++ = q; + } + } + + return(dest); +} + +/************************************************************************** +Return the number of times s2 occurs in s1. Overlapping instances of s2 +are only counted once. 
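+ +For example, ut_strcount("ababa", "aba") is 1, not 2: after a match the +search resumes past the whole matched substring, so the overlapping +second "aba" is never seen. An empty s2 yields 0 by definition.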
*/ +UNIV_INTERN +ulint +ut_strcount( +/*========*/ + /* out: the number of times s2 occurs in s1 */ + const char* s1, /* in: string to search in */ + const char* s2) /* in: string to search for */ +{ + ulint count = 0; + ulint len = strlen(s2); + + if (len == 0) { + + return(0); + } + + for (;;) { + s1 = strstr(s1, s2); + + if (!s1) { + + break; + } + + count++; + s1 += len; + } + + return(count); +} + +/************************************************************************** +Replace every occurrence of s1 in str with s2. Overlapping instances of s1 +are only replaced once. */ +UNIV_INTERN +char* +ut_strreplace( +/*==========*/ + /* out, own: modified string, must be + freed with mem_free() */ + const char* str, /* in: string to operate on */ + const char* s1, /* in: string to replace */ + const char* s2) /* in: string to replace s1 with */ +{ + char* new_str; + char* ptr; + const char* str_end; + ulint str_len = strlen(str); + ulint s1_len = strlen(s1); + ulint s2_len = strlen(s2); + ulint count = 0; + int len_delta = (int)s2_len - (int)s1_len; + + str_end = str + str_len; + + if (len_delta <= 0) { + len_delta = 0; + } else { + count = ut_strcount(str, s1); + } + + new_str = mem_alloc(str_len + count * len_delta + 1); + ptr = new_str; + + while (str) { + const char* next = strstr(str, s1); + + if (!next) { + next = str_end; + } + + memcpy(ptr, str, next - str); + ptr += next - str; + + if (next == str_end) { + + break; + } + + memcpy(ptr, s2, s2_len); + ptr += s2_len; + + str = next + s1_len; + } + + *ptr = '\0'; + + return(new_str); +} + +#ifdef UNIV_COMPILE_TEST_FUNCS + +void +test_ut_str_sql_format() +{ + char buf[128]; + ulint ret; + +#define CALL_AND_TEST(str, str_len, buf, buf_size, ret_expected, buf_expected)\ + do {\ + ibool ok = TRUE;\ + memset(buf, 'x', 10);\ + buf[10] = '\0';\ + fprintf(stderr, "TESTING \"%s\", %lu, %lu\n",\ + str, (ulint) str_len, (ulint) buf_size);\ + ret = ut_str_sql_format(str, str_len, buf, buf_size);\ + if (ret != ret_expected) {\ + fprintf(stderr, "expected ret %lu, got %lu\n",\ + (ulint) ret_expected, ret);\ + ok = FALSE;\ + }\ + if (strcmp((char*) buf, buf_expected) != 0) {\ + fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\ + buf_expected, buf);\ + ok = FALSE;\ + }\ + if (ok) {\ + fprintf(stderr, "OK: %lu, \"%s\"\n\n",\ + (ulint) ret, buf);\ + } else {\ + return;\ + }\ + } while (0) + + CALL_AND_TEST("abcd", 4, buf, 0, 0, "xxxxxxxxxx"); + + CALL_AND_TEST("abcd", 4, buf, 1, 1, ""); + + CALL_AND_TEST("abcd", 4, buf, 2, 1, ""); + + CALL_AND_TEST("abcd", 0, buf, 3, 3, "''"); + CALL_AND_TEST("abcd", 1, buf, 3, 1, ""); + CALL_AND_TEST("abcd", 2, buf, 3, 1, ""); + CALL_AND_TEST("abcd", 3, buf, 3, 1, ""); + CALL_AND_TEST("abcd", 4, buf, 3, 1, ""); + + CALL_AND_TEST("abcd", 0, buf, 4, 3, "''"); + CALL_AND_TEST("abcd", 1, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcd", 2, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcd", 3, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcd", 4, buf, 4, 4, "'a'"); + CALL_AND_TEST("abcde", 5, buf, 4, 4, "'a'"); + CALL_AND_TEST("'", 1, buf, 4, 3, "''"); + CALL_AND_TEST("''", 2, buf, 4, 3, "''"); + CALL_AND_TEST("a'", 2, buf, 4, 4, "'a'"); + CALL_AND_TEST("'a", 2, buf, 4, 3, "''"); + CALL_AND_TEST("ab", 2, buf, 4, 4, "'a'"); + + CALL_AND_TEST("abcdef", 0, buf, 5, 3, "''"); + CALL_AND_TEST("abcdef", 1, buf, 5, 4, "'a'"); + CALL_AND_TEST("abcdef", 2, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 3, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 4, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 5, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abcdef", 6, 
buf, 5, 5, "'ab'"); + CALL_AND_TEST("'", 1, buf, 5, 5, "''''"); + CALL_AND_TEST("''", 2, buf, 5, 5, "''''"); + CALL_AND_TEST("a'", 2, buf, 5, 4, "'a'"); + CALL_AND_TEST("'a", 2, buf, 5, 5, "''''"); + CALL_AND_TEST("ab", 2, buf, 5, 5, "'ab'"); + CALL_AND_TEST("abc", 3, buf, 5, 5, "'ab'"); + + CALL_AND_TEST("ab", 2, buf, 6, 5, "'ab'"); + + CALL_AND_TEST("a'b'c", 5, buf, 32, 10, "'a''b''c'"); + CALL_AND_TEST("a'b'c'", 6, buf, 32, 12, "'a''b''c'''"); +} + +#endif /* UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/xtradb/ut/ut0rnd.c b/storage/xtradb/ut/ut0rnd.c new file mode 100644 index 00000000000..f5d6cb08b0f --- /dev/null +++ b/storage/xtradb/ut/ut0rnd.c @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************* +Random numbers and hashing + +Created 5/11/1994 Heikki Tuuri +********************************************************************/ + +#include "ut0rnd.h" + +#ifdef UNIV_NONINL +#include "ut0rnd.ic" +#endif + +/* These random numbers are used in ut_find_prime */ +#define UT_RANDOM_1 1.0412321 +#define UT_RANDOM_2 1.1131347 +#define UT_RANDOM_3 1.0132677 + + +UNIV_INTERN ulint ut_rnd_ulint_counter = 65654363; + +/*************************************************************** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. */ +UNIV_INTERN +ulint +ut_find_prime( +/*==========*/ + /* out: prime */ + ulint n) /* in: positive number > 100 */ +{ + ulint pow2; + ulint i; + + n += 100; + + pow2 = 1; + while (pow2 * 2 < n) { + pow2 = 2 * pow2; + } + + if ((double)n < 1.05 * (double)pow2) { + n = (ulint) ((double)n * UT_RANDOM_1); + } + + pow2 = 2 * pow2; + + if ((double)n > 0.95 * (double)pow2) { + n = (ulint) ((double)n * UT_RANDOM_2); + } + + if (n > pow2 - 20) { + n += 30; + } + + /* Now we have n far enough from powers of 2. To make + n more random (especially, if it was not near + a power of 2), we then multiply it by a random number. */ + + n = (ulint) ((double)n * UT_RANDOM_3); + + for (;; n++) { + i = 2; + while (i * i <= n) { + if (n % i == 0) { + goto next_n; + } + i++; + } + + /* Found a prime */ + break; +next_n: ; + } + + return(n); +} diff --git a/storage/xtradb/ut/ut0ut.c b/storage/xtradb/ut/ut0ut.c new file mode 100644 index 00000000000..12500988ec6 --- /dev/null +++ b/storage/xtradb/ut/ut0ut.c @@ -0,0 +1,594 @@ +/***************************************************************************** + +Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/******************************************************************* +Various utilities for Innobase. + +Created 5/11/1994 Heikki Tuuri +********************************************************************/ + +#include "ut0ut.h" + +#ifdef UNIV_NONINL +#include "ut0ut.ic" +#endif + +#include <stdarg.h> +#include <string.h> +#include <ctype.h> + +#include "trx0trx.h" +#include "ha_prototypes.h" +#ifndef UNIV_HOTBACKUP +# include "mysql_com.h" /* NAME_LEN */ +#endif /* UNIV_HOTBACKUP */ + +UNIV_INTERN ibool ut_always_false = FALSE; + +#ifdef __WIN__ +/********************************************************************* +NOTE: The Windows epoch starts from 1601/01/01 whereas the Unix +epoch starts from 1970/1/1. For selection of constant see: +http://support.microsoft.com/kb/167296/ */ +#define WIN_TO_UNIX_DELTA_USEC ((ib_int64_t) 11644473600000000ULL) + + +/********************************************************************* +This is the Windows version of gettimeofday(2).*/ +static +int +ut_gettimeofday( +/*============*/ + /* out: 0 if all OK else -1 */ + struct timeval* tv, /* out: Values are relative to Unix epoch */ + void* tz) /* in: not used */ +{ + FILETIME ft; + ib_int64_t tm; + + if (!tv) { + errno = EINVAL; + return(-1); + } + + GetSystemTimeAsFileTime(&ft); + + tm = (ib_int64_t) ft.dwHighDateTime << 32; + tm |= ft.dwLowDateTime; + + ut_a(tm >= 0); /* If tm wraps over to negative, the quotient / 10 + does not work */ + + tm /= 10; /* Convert from 100 nsec periods to usec */ + + /* If we don't convert to the Unix epoch the value for + struct timeval::tv_sec will overflow.*/ + tm -= WIN_TO_UNIX_DELTA_USEC; + + tv->tv_sec = (long) (tm / 1000000L); + tv->tv_usec = (long) (tm % 1000000L); + + return(0); +} +#else +#define ut_gettimeofday gettimeofday +#endif + +/************************************************************ +Gets the high 32 bits in a ulint. That is, makes a shift >> 32, +but since there seem to be compiler bugs in both gcc and Visual C++, +we do this by a special conversion. */ +UNIV_INTERN +ulint +ut_get_high32( +/*==========*/ + /* out: a >> 32 */ + ulint a) /* in: ulint */ +{ + ib_int64_t i; + + i = (ib_int64_t)a; + + i = i >> 32; + + return((ulint)i); +} + +/************************************************************** +Returns system time. We do not specify the format of the time returned: +the only way to manipulate it is to use the function ut_difftime. */ +UNIV_INTERN +ib_time_t +ut_time(void) +/*=========*/ +{ + return(time(NULL)); +} + +/************************************************************** +Returns system time. +Upon successful completion, the value 0 is returned; otherwise the +value -1 is returned and the global variable errno is set to indicate the +error.
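+ +Note that despite its name, the ms output parameter carries microseconds, +as the assignment from tv.tv_usec below shows. The call is retried up to +ten times, sleeping 0.1 s between attempts, before the error is +propagated. Usage sketch: + + ulint sec; + ulint us; + + if (ut_usectime(&sec, &us) == 0) { + -- sec and us now hold the wall clock time + }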
*/ +UNIV_INTERN +int +ut_usectime( +/*========*/ + /* out: 0 on success, -1 otherwise */ + ulint* sec, /* out: seconds since the Epoch */ + ulint* ms) /* out: microseconds since the Epoch+*sec */ +{ + struct timeval tv; + int ret; + int errno_gettimeofday; + int i; + + for (i = 0; i < 10; i++) { + + ret = ut_gettimeofday(&tv, NULL); + + if (ret == -1) { + errno_gettimeofday = errno; + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: gettimeofday(): %s\n", + strerror(errno_gettimeofday)); + os_thread_sleep(100000); /* 0.1 sec */ + errno = errno_gettimeofday; + } else { + break; + } + } + + if (ret != -1) { + *sec = (ulint) tv.tv_sec; + *ms = (ulint) tv.tv_usec; + } + + return(ret); +} + +/************************************************************** +Returns the number of microseconds since epoch. Similar to +time(3), the return value is also stored in *tloc, provided +that tloc is non-NULL. */ +UNIV_INTERN +ullint +ut_time_us( +/*=======*/ + /* out: us since epoch */ + ullint* tloc) /* out: us since epoch, if non-NULL */ +{ + struct timeval tv; + ullint us; + + ut_gettimeofday(&tv, NULL); + + us = (ullint) tv.tv_sec * 1000000 + tv.tv_usec; + + if (tloc != NULL) { + *tloc = us; + } + + return(us); +} + +/************************************************************** +Returns the difference of two times in seconds. */ +UNIV_INTERN +double +ut_difftime( +/*========*/ + /* out: time2 - time1 expressed in seconds */ + ib_time_t time2, /* in: time */ + ib_time_t time1) /* in: time */ +{ + return(difftime(time2, time1)); +} + +/************************************************************** +Prints a timestamp to a file. */ +UNIV_INTERN +void +ut_print_timestamp( +/*===============*/ + FILE* file) /* in: file where to print */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + fprintf(file,"%02d%02d%02d %2d:%02d:%02d", + (int)cal_tm.wYear % 100, + (int)cal_tm.wMonth, + (int)cal_tm.wDay, + (int)cal_tm.wHour, + (int)cal_tm.wMinute, + (int)cal_tm.wSecond); +#else + struct tm cal_tm; + struct tm* cal_tm_ptr; + time_t tm; + + time(&tm); + +#ifdef HAVE_LOCALTIME_R + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + cal_tm_ptr = localtime(&tm); +#endif + fprintf(file,"%02d%02d%02d %2d:%02d:%02d", + cal_tm_ptr->tm_year % 100, + cal_tm_ptr->tm_mon + 1, + cal_tm_ptr->tm_mday, + cal_tm_ptr->tm_hour, + cal_tm_ptr->tm_min, + cal_tm_ptr->tm_sec); +#endif +} + +/************************************************************** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */ +UNIV_INTERN +void +ut_sprintf_timestamp( +/*=================*/ + char* buf) /* in: buffer where to sprintf */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + sprintf(buf, "%02d%02d%02d %2d:%02d:%02d", + (int)cal_tm.wYear % 100, + (int)cal_tm.wMonth, + (int)cal_tm.wDay, + (int)cal_tm.wHour, + (int)cal_tm.wMinute, + (int)cal_tm.wSecond); +#else + struct tm cal_tm; + struct tm* cal_tm_ptr; + time_t tm; + + time(&tm); + +#ifdef HAVE_LOCALTIME_R + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + cal_tm_ptr = localtime(&tm); +#endif + sprintf(buf, "%02d%02d%02d %2d:%02d:%02d", + cal_tm_ptr->tm_year % 100, + cal_tm_ptr->tm_mon + 1, + cal_tm_ptr->tm_mday, + cal_tm_ptr->tm_hour, + cal_tm_ptr->tm_min, + cal_tm_ptr->tm_sec); +#endif +} + +#ifdef UNIV_HOTBACKUP +/************************************************************** +Sprintfs a timestamp to a buffer with no spaces and with ':' characters +replaced by '_'. 
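+ +For example, where ut_sprintf_timestamp() would produce something like +"091231 23:59:59", this variant produces "091231_23_59_59", which can be +embedded safely in a file name.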
*/ +UNIV_INTERN +void +ut_sprintf_timestamp_without_extra_chars( +/*=====================================*/ + char* buf) /* in: buffer where to sprintf */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d", + (int)cal_tm.wYear % 100, + (int)cal_tm.wMonth, + (int)cal_tm.wDay, + (int)cal_tm.wHour, + (int)cal_tm.wMinute, + (int)cal_tm.wSecond); +#else + struct tm cal_tm; + struct tm* cal_tm_ptr; + time_t tm; + + time(&tm); + +#ifdef HAVE_LOCALTIME_R + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + cal_tm_ptr = localtime(&tm); +#endif + sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d", + cal_tm_ptr->tm_year % 100, + cal_tm_ptr->tm_mon + 1, + cal_tm_ptr->tm_mday, + cal_tm_ptr->tm_hour, + cal_tm_ptr->tm_min, + cal_tm_ptr->tm_sec); +#endif +} + +/************************************************************** +Returns current year, month, day. */ +UNIV_INTERN +void +ut_get_year_month_day( +/*==================*/ + ulint* year, /* out: current year */ + ulint* month, /* out: month */ + ulint* day) /* out: day */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + *year = (ulint)cal_tm.wYear; + *month = (ulint)cal_tm.wMonth; + *day = (ulint)cal_tm.wDay; +#else + struct tm cal_tm; + struct tm* cal_tm_ptr; + time_t tm; + + time(&tm); + +#ifdef HAVE_LOCALTIME_R + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; +#else + cal_tm_ptr = localtime(&tm); +#endif + *year = (ulint)cal_tm_ptr->tm_year + 1900; + *month = (ulint)cal_tm_ptr->tm_mon + 1; + *day = (ulint)cal_tm_ptr->tm_mday; +#endif +} +#endif /* UNIV_HOTBACKUP */ + +/***************************************************************** +Runs an idle loop on CPU. The argument gives the desired delay +in microseconds on 100 MHz Pentium + Visual C++. */ +UNIV_INTERN +ulint +ut_delay( +/*=====*/ + /* out: dummy value */ + ulint delay) /* in: delay in microseconds on 100 MHz Pentium */ +{ + ulint i, j; + + j = 0; + + for (i = 0; i < delay * 50; i++) { + j += i; + } + + if (ut_always_false) { + ut_always_false = (ibool) j; + } + + return(j); +} + +/***************************************************************** +Prints the contents of a memory buffer in hex and ascii. */ +UNIV_INTERN +void +ut_print_buf( +/*=========*/ + FILE* file, /* in: file where to print */ + const void* buf, /* in: memory buffer */ + ulint len) /* in: length of the buffer */ +{ + const byte* data; + ulint i; + + UNIV_MEM_ASSERT_RW(buf, len); + + fprintf(file, " len %lu; hex ", len); + + for (data = (const byte*)buf, i = 0; i < len; i++) { + fprintf(file, "%02lx", (ulong)*data++); + } + + fputs("; asc ", file); + + data = (const byte*)buf; + + for (i = 0; i < len; i++) { + int c = (int) *data++; + putc(isprint(c) ? c : ' ', file); + } + + putc(';', file); +} + +/***************************************************************** +Calculates fast the number rounded up to the nearest power of 2. */ +UNIV_INTERN +ulint +ut_2_power_up( +/*==========*/ + /* out: first power of 2 which is >= n */ + ulint n) /* in: number != 0 */ +{ + ulint res; + + res = 1; + + ut_ad(n > 0); + + while (res < n) { + res = res * 2; + } + + return(res); +} + +/************************************************************************** +Outputs a NUL-terminated file name, quoted with apostrophes. 
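+ +Embedded apostrophes are doubled, as in SQL string literals; for example, +the name a'b is printed as 'a''b'.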
*/ +UNIV_INTERN +void +ut_print_filename( +/*==============*/ + FILE* f, /* in: output stream */ + const char* name) /* in: name to print */ +{ + putc('\'', f); + for (;;) { + int c = *name++; + switch (c) { + case 0: + goto done; + case '\'': + putc(c, f); + /* fall through */ + default: + putc(c, f); + } + } +done: + putc('\'', f); +} + +/************************************************************************** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_name( +/*==========*/ + FILE* f, /* in: output stream */ + trx_t* trx, /* in: transaction */ + ibool table_id,/* in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name) /* in: name to print */ +{ + ut_print_namel(f, trx, table_id, name, strlen(name)); +} + +/************************************************************************** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. */ +UNIV_INTERN +void +ut_print_namel( +/*===========*/ + FILE* f, /* in: output stream */ + trx_t* trx, /* in: transaction (NULL=no quotes) */ + ibool table_id,/* in: TRUE=print a table name, + FALSE=print other identifier */ + const char* name, /* in: name to print */ + ulint namelen)/* in: length of name */ +{ +#ifdef UNIV_HOTBACKUP + fwrite(name, 1, namelen, f); +#else + /* 2 * NAME_LEN for database and table name, + and some slack for the #mysql50# prefix and quotes */ + char buf[3 * NAME_LEN]; + const char* bufend; + + bufend = innobase_convert_name(buf, sizeof buf, + name, namelen, + trx ? trx->mysql_thd : NULL, + table_id); + + fwrite(buf, 1, bufend - buf, f); +#endif +} + +/************************************************************************** +Catenate files. */ +UNIV_INTERN +void +ut_copy_file( +/*=========*/ + FILE* dest, /* in: output file */ + FILE* src) /* in: input file to be appended to output */ +{ + long len = ftell(src); + char buf[4096]; + + rewind(src); + do { + size_t maxs = len < (long) sizeof buf + ? (size_t) len + : sizeof buf; + size_t size = fread(buf, 1, maxs, src); + fwrite(buf, 1, size, dest); + len -= (long) size; + if (size < maxs) { + break; + } + } while (len > 0); +} + +/************************************************************************** +snprintf(). */ + +#ifdef __WIN__ +#include <stdarg.h> +int +ut_snprintf( + /* out: number of characters that would + have been printed if the size were + unlimited, not including the terminating + '\0'. */ + char* str, /* out: string */ + size_t size, /* in: str size */ + const char* fmt, /* in: format */ + ...) /* in: format values */ +{ + int res; + va_list ap1; + va_list ap2; + + va_start(ap1, fmt); + va_start(ap2, fmt); + + res = _vscprintf(fmt, ap1); + ut_a(res != -1); + + if (size > 0) { + _vsnprintf(str, size, fmt, ap2); + + if ((size_t) res >= size) { + str[size - 1] = '\0'; + } + } + + va_end(ap1); + va_end(ap2); + + return(res); +} +#endif /* __WIN__ */ diff --git a/storage/xtradb/ut/ut0vec.c b/storage/xtradb/ut/ut0vec.c new file mode 100644 index 00000000000..69b7bec701a --- /dev/null +++ b/storage/xtradb/ut/ut0vec.c @@ -0,0 +1,72 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +#include "ut0vec.h" +#ifdef UNIV_NONINL +#include "ut0vec.ic" +#endif +#include <string.h> + +/******************************************************************** +Create a new vector with the given initial size. */ +UNIV_INTERN +ib_vector_t* +ib_vector_create( +/*=============*/ + /* out: vector */ + mem_heap_t* heap, /* in: heap */ + ulint size) /* in: initial size */ +{ + ib_vector_t* vec; + + ut_a(size > 0); + + vec = mem_heap_alloc(heap, sizeof(*vec)); + + vec->heap = heap; + vec->data = mem_heap_alloc(heap, sizeof(void*) * size); + vec->used = 0; + vec->total = size; + + return(vec); +} + +/******************************************************************** +Push a new element to the vector, increasing its size if necessary. */ +UNIV_INTERN +void +ib_vector_push( +/*===========*/ + ib_vector_t* vec, /* in: vector */ + void* elem) /* in: data element */ +{ + if (vec->used >= vec->total) { + void** new_data; + ulint new_total = vec->total * 2; + + new_data = mem_heap_alloc(vec->heap, + sizeof(void*) * new_total); + memcpy(new_data, vec->data, sizeof(void*) * vec->total); + + vec->data = new_data; + vec->total = new_total; + } + + vec->data[vec->used] = elem; + vec->used++; +} diff --git a/storage/xtradb/ut/ut0wqueue.c b/storage/xtradb/ut/ut0wqueue.c new file mode 100644 index 00000000000..a5c14ac8130 --- /dev/null +++ b/storage/xtradb/ut/ut0wqueue.c @@ -0,0 +1,110 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +#include "ut0wqueue.h" + +/******************************************************************** +Create a new work queue. */ +UNIV_INTERN +ib_wqueue_t* +ib_wqueue_create(void) +/*===================*/ + /* out: work queue */ +{ + ib_wqueue_t* wq = mem_alloc(sizeof(ib_wqueue_t)); + + mutex_create(&wq->mutex, SYNC_WORK_QUEUE); + + wq->items = ib_list_create(); + wq->event = os_event_create(NULL); + + return(wq); +} + +/******************************************************************** +Free a work queue.
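+ +The queue must already be empty (see the ut_a() check below). The +intended lifecycle, sketched with illustrative names (producer and +consumer normally run in different threads): + + ib_wqueue_t* wq = ib_wqueue_create(); + + ib_wqueue_add(wq, item, heap); -- producer side + item = ib_wqueue_wait(wq); -- consumer side, blocks if empty + + ib_wqueue_free(wq);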
*/ +UNIV_INTERN +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq) /* in: work queue */ +{ + ut_a(!ib_list_get_first(wq->items)); + + mutex_free(&wq->mutex); + ib_list_free(wq->items); + os_event_free(wq->event); + + mem_free(wq); +} + +/******************************************************************** +Add a work item to the queue. */ +UNIV_INTERN +void +ib_wqueue_add( +/*==========*/ + ib_wqueue_t* wq, /* in: work queue */ + void* item, /* in: work item */ + mem_heap_t* heap) /* in: memory heap to use for allocating the + list node */ +{ + mutex_enter(&wq->mutex); + + ib_list_add_last(wq->items, item, heap); + os_event_set(wq->event); + + mutex_exit(&wq->mutex); +} + +/******************************************************************** +Wait for a work item to appear in the queue. */ +UNIV_INTERN +void* +ib_wqueue_wait( + /* out: work item */ + ib_wqueue_t* wq) /* in: work queue */ +{ + ib_list_node_t* node; + + for (;;) { + os_event_wait(wq->event); + + mutex_enter(&wq->mutex); + + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + if (!ib_list_get_first(wq->items)) { + /* We must reset the event when the list + gets emptied. */ + os_event_reset(wq->event); + } + + break; + } + + mutex_exit(&wq->mutex); + } + + mutex_exit(&wq->mutex); + + return(node->data); +} diff --git a/storage/xtradb/win-plugin/README b/storage/xtradb/win-plugin/README new file mode 100644 index 00000000000..9182f2c555c --- /dev/null +++ b/storage/xtradb/win-plugin/README @@ -0,0 +1,25 @@ +This directory contains patches that need to be applied to the MySQL +source tree in order to build the dynamic plugin on Windows -- +HA_INNODB.DLL. Please note the following when adding the patches: + +* The patch must be applied from the mysql top-level source directory. + patch -p0 < win-plugin.diff +* The patch filenames end in ".diff". +* All patches here are expected to apply cleanly to the latest MySQL 5.1 + tree when storage/innobase is replaced with this InnoDB branch.
diff --git a/storage/xtradb/win-plugin/README b/storage/xtradb/win-plugin/README
new file mode 100644
index 00000000000..9182f2c555c
--- /dev/null
+++ b/storage/xtradb/win-plugin/README
@@ -0,0 +1,25 @@
+This directory contains patches that need to be applied to the MySQL
+source tree in order to build the dynamic plugin on Windows --
+HA_INNODB.DLL. Please note the following when adding patches:
+
+* The patch must be applied from the mysql top-level source directory:
+    patch -p0 < win-plugin.diff
+* The patch filenames end in ".diff".
+* All patches here are expected to apply cleanly to the latest MySQL 5.1
+  tree when storage/innobase is replaced with this InnoDB branch.
+
+When applying the patch, the following files will be modified:
+
+ * CMakeLists.txt
+ * sql/CMakeLists.txt
+ * win/configure.js
+ * win/build-vs71.bat
+ * win/build-vs8.bat
+ * win/build-vs8_x64.bat
+
+Also, two new files will be added:
+
+ * sql/mysqld.def
+ * sql/mysqld_x64.def
+
+You can get the "patch" utility for Windows from http://unxutils.sourceforge.net/
diff --git a/storage/xtradb/win-plugin/win-plugin.diff b/storage/xtradb/win-plugin/win-plugin.diff
new file mode 100644
index 00000000000..46d2e5b2d2d
--- /dev/null
+++ b/storage/xtradb/win-plugin/win-plugin.diff
@@ -0,0 +1,297 @@
+diff -Nur CMakeLists.txt.orig CMakeLists.txt
+--- CMakeLists.txt.orig	2008-10-03 12:25:41 -05:00
++++ CMakeLists.txt	2008-09-26 17:32:51 -05:00
+@@ -244,9 +244,9 @@
+ IF(WITH_FEDERATED_STORAGE_ENGINE)
+   ADD_SUBDIRECTORY(storage/federated)
+ ENDIF(WITH_FEDERATED_STORAGE_ENGINE)
+-IF(WITH_INNOBASE_STORAGE_ENGINE)
++IF(WITH_INNOBASE_STORAGE_ENGINE OR INNODB_DYNAMIC_PLUGIN)
+ ADD_SUBDIRECTORY(storage/innobase)
+-ENDIF(WITH_INNOBASE_STORAGE_ENGINE)
++ENDIF(WITH_INNOBASE_STORAGE_ENGINE OR INNODB_DYNAMIC_PLUGIN)
+ ADD_SUBDIRECTORY(sql)
+ ADD_SUBDIRECTORY(server-tools/instance-manager)
+ ADD_SUBDIRECTORY(libmysql)
+
+diff -Nur sql/CMakeLists.txt.orig sql/CMakeLists.txt
+--- sql/CMakeLists.txt.orig	2008-10-03 12:25:41 -05:00
++++ sql/CMakeLists.txt	2008-09-24 03:58:19 -05:00
+@@ -100,6 +100,15 @@
+ LINK_FLAGS "/PDB:${CMAKE_CFG_INTDIR}/mysqld${MYSQLD_EXE_SUFFIX}.pdb")
+ ENDIF(cmake_version EQUAL 20406)
+ 
++# Checks for 64-bit version
++IF(CMAKE_SIZEOF_VOID_P MATCHES 8)
++SET_TARGET_PROPERTIES(mysqld PROPERTIES
++	LINK_FLAGS "/def:\"${PROJECT_SOURCE_DIR}/sql/mysqld_x64.def\"")
++ELSE(CMAKE_SIZEOF_VOID_P MATCHES 8)
++SET_TARGET_PROPERTIES(mysqld PROPERTIES
++	LINK_FLAGS "/def:\"${PROJECT_SOURCE_DIR}/sql/mysqld.def\"")
++ENDIF(CMAKE_SIZEOF_VOID_P MATCHES 8)
++
+ IF(EMBED_MANIFESTS)
+   MYSQL_EMBED_MANIFEST("mysqld" "asInvoker")
+ ENDIF(EMBED_MANIFESTS)
+
+diff -Nur sql/mysqld.def.orig sql/mysqld.def
+--- sql/mysqld.def.orig	1969-12-31 18:00:00 -06:00
++++ sql/mysqld.def	2008-10-31 02:20:32 -05:00
+@@ -0,0 +1,98 @@
++EXPORTS
++	?use_hidden_primary_key@handler@@UAEXXZ
++	?get_dynamic_partition_info@handler@@UAEXPAUPARTITION_INFO@@I@Z
++	?read_first_row@handler@@UAEHPAEI@Z
++	?read_range_next@handler@@UAEHXZ
++	?read_range_first@handler@@UAEHPBUst_key_range@@0_N1@Z
++	?read_multi_range_first@handler@@UAEHPAPAUst_key_multi_range@@PAU2@I_NPAUst_handler_buffer@@@Z
++	?read_multi_range_next@handler@@UAEHPAPAUst_key_multi_range@@@Z
++	?index_read_idx_map@handler@@UAEHPAEIPBEKW4ha_rkey_function@@@Z
++	?print_error@handler@@UAEXHH@Z
++	?clone@handler@@UAEPAV1@PAUst_mem_root@@@Z
++	?get_auto_increment@handler@@UAEX_K00PA_K1@Z
++	?index_next_same@handler@@UAEHPAEPBEI@Z
++	?get_error_message@handler@@UAE_NHPAVString@@@Z
++	?ha_thd@handler@@IBEPAVTHD@@XZ
++	?update_auto_increment@handler@@QAEHXZ
++	?ha_statistic_increment@handler@@IBEXPQsystem_status_var@@K@Z
++	?trans_register_ha@@YAXPAVTHD@@_NPAUhandlerton@@@Z
++	?cmp@Field_blob@@QAEHPBEI0I@Z
++	?set_time@Field_timestamp@@QAEXXZ
++	?sql_print_error@@YAXPBDZZ
++	?sql_print_warning@@YAXPBDZZ
++	?check_global_access@@YA_NPAVTHD@@K@Z
++	?schema_table_store_record@@YA_NPAVTHD@@PAUst_table@@@Z
++	?get_quote_char_for_identifier@@YAHPAVTHD@@PBDI@Z
++	?copy@String@@QAE_NXZ
++	?copy@String@@QAE_NABV1@@Z
++	?copy@String@@QAE_NPBDIPAUcharset_info_st@@@Z
++	?copy_and_convert@@YAIPADIPAUcharset_info_st@@PBDI1PAI@Z
++	?filename_to_tablename@@YAIPBDPADI@Z
++	
?strconvert@@YAIPAUcharset_info_st@@PBD0PADIPAI@Z ++ ?calculate_key_len@@YAIPAUst_table@@IPBEK@Z ++ ?sql_alloc@@YAPAXI@Z ++ ?localtime_to_TIME@@YAXPAUst_mysql_time@@PAUtm@@@Z ++ ?push_warning@@YAPAVMYSQL_ERROR@@PAVTHD@@W4enum_warning_level@1@IPBD@Z ++ ?push_warning_printf@@YAXPAVTHD@@W4enum_warning_level@MYSQL_ERROR@@IPBDZZ ++ ?drop_table@handler@@EAEXPBD@Z ++ ?column_bitmaps_signal@handler@@UAEXXZ ++ ?delete_table@handler@@MAEHPBD@Z ++ ?rename_table@handler@@MAEHPBD0@Z ++ ?key_map_empty@@3V?$Bitmap@$0EA@@@B ++ ?THR_THD@@3PAVTHD@@A ++ ?end_of_list@@3Ulist_node@@A ++ ?mysql_tmpdir_list@@3Ust_my_tmpdir@@A ++ mysql_query_cache_invalidate4 ++ thd_query ++ thd_sql_command ++ thd_get_thread_id ++ thd_get_xid ++ thd_slave_thread ++ thd_non_transactional_update ++ thd_mark_transaction_to_rollback ++ thd_security_context ++ thd_charset ++ thd_test_options ++ thd_ha_data ++ thd_killed ++ thd_tx_isolation ++ thd_tablespace_op ++ thd_sql_command ++ thd_memdup ++ thd_make_lex_string ++ thd_in_lock_tables ++ thd_binlog_format ++ _my_hash_init ++ my_hash_free ++ my_tmpdir ++ check_if_legal_filename ++ my_filename ++ my_sync_dir_by_file ++ alloc_root ++ thr_lock_data_init ++ thr_lock_init ++ thr_lock_delete ++ my_multi_malloc ++ get_charset ++ unpack_filename ++ my_hash_insert ++ my_hash_search ++ my_hash_delete ++ mysql_bin_log_file_pos ++ mysql_bin_log_file_name ++ mysqld_embedded ++ my_thread_name ++ my_malloc ++ my_no_flags_free ++ _sanity ++ _mymalloc ++ _myfree ++ _my_strdup ++ _my_thread_var ++ my_error ++ pthread_cond_init ++ pthread_cond_signal ++ pthread_cond_wait ++ pthread_cond_destroy ++ localtime_r ++ my_strdup + +diff -Nur sql/mysqld_x64.def.orig sql/mysqld_x64.def +--- sql/mysqld_x64.def.orig 1969-12-31 18:00:00 -06:00 ++++ sql/mysqld_x64.def 2008-10-31 02:22:04 -05:00 +@@ -0,0 +1,98 @@ ++EXPORTS ++ ?use_hidden_primary_key@handler@@UEAAXXZ ++ ?get_dynamic_partition_info@handler@@UEAAXPEAUPARTITION_INFO@@I@Z ++ ?read_first_row@handler@@UEAAHPEAEI@Z ++ ?read_range_next@handler@@UEAAHXZ ++ ?read_range_first@handler@@UEAAHPEBUst_key_range@@0_N1@Z ++ ?read_multi_range_first@handler@@UEAAHPEAPEAUst_key_multi_range@@PEAU2@I_NPEAUst_handler_buffer@@@Z ++ ?read_multi_range_next@handler@@UEAAHPEAPEAUst_key_multi_range@@@Z ++ ?index_read_idx_map@handler@@UEAAHPEAEIPEBEKW4ha_rkey_function@@@Z ++ ?print_error@handler@@UEAAXHH@Z ++ ?clone@handler@@UEAAPEAV1@PEAUst_mem_root@@@Z ++ ?get_auto_increment@handler@@UEAAX_K00PEA_K1@Z ++ ?index_next_same@handler@@UEAAHPEAEPEBEI@Z ++ ?get_error_message@handler@@UEAA_NHPEAVString@@@Z ++ ?ha_thd@handler@@IEBAPEAVTHD@@XZ ++ ?update_auto_increment@handler@@QEAAHXZ ++ ?ha_statistic_increment@handler@@IEBAXPEQsystem_status_var@@K@Z ++ ?trans_register_ha@@YAXPEAVTHD@@_NPEAUhandlerton@@@Z ++ ?cmp@Field_blob@@QEAAHPEBEI0I@Z ++ ?set_time@Field_timestamp@@QEAAXXZ ++ ?sql_print_error@@YAXPEBDZZ ++ ?sql_print_warning@@YAXPEBDZZ ++ ?check_global_access@@YA_NPEAVTHD@@K@Z ++ ?schema_table_store_record@@YA_NPEAVTHD@@PEAUst_table@@@Z ++ ?get_quote_char_for_identifier@@YAHPEAVTHD@@PEBDI@Z ++ ?copy@String@@QEAA_NXZ ++ ?copy@String@@QEAA_NAEBV1@@Z ++ ?copy@String@@QEAA_NPEBDIPEAUcharset_info_st@@@Z ++ ?copy_and_convert@@YAIPEADIPEAUcharset_info_st@@PEBDI1PEAI@Z ++ ?filename_to_tablename@@YAIPEBDPEADI@Z ++ ?strconvert@@YAIPEAUcharset_info_st@@PEBD0PEADIPEAI@Z ++ ?calculate_key_len@@YAIPEAUst_table@@IPEBEK@Z ++ ?sql_alloc@@YAPEAX_K@Z ++ ?localtime_to_TIME@@YAXPEAUst_mysql_time@@PEAUtm@@@Z ++ ?push_warning@@YAPEAVMYSQL_ERROR@@PEAVTHD@@W4enum_warning_level@1@IPEBD@Z ++ 
?push_warning_printf@@YAXPEAVTHD@@W4enum_warning_level@MYSQL_ERROR@@IPEBDZZ ++ ?drop_table@handler@@EEAAXPEBD@Z ++ ?column_bitmaps_signal@handler@@UEAAXXZ ++ ?delete_table@handler@@MEAAHPEBD@Z ++ ?rename_table@handler@@MEAAHPEBD0@Z ++ ?key_map_empty@@3V?$Bitmap@$0EA@@@B ++ ?THR_THD@@3PEAVTHD@@EA ++ ?end_of_list@@3Ulist_node@@A ++ ?mysql_tmpdir_list@@3Ust_my_tmpdir@@A ++ mysql_query_cache_invalidate4 ++ thd_query ++ thd_sql_command ++ thd_get_thread_id ++ thd_get_xid ++ thd_slave_thread ++ thd_non_transactional_update ++ thd_mark_transaction_to_rollback ++ thd_security_context ++ thd_charset ++ thd_test_options ++ thd_ha_data ++ thd_killed ++ thd_tx_isolation ++ thd_tablespace_op ++ thd_sql_command ++ thd_memdup ++ thd_make_lex_string ++ thd_in_lock_tables ++ thd_binlog_format ++ _my_hash_init ++ my_hash_free ++ my_tmpdir ++ check_if_legal_filename ++ my_filename ++ my_sync_dir_by_file ++ alloc_root ++ thr_lock_data_init ++ thr_lock_init ++ thr_lock_delete ++ my_multi_malloc ++ get_charset ++ unpack_filename ++ my_hash_insert ++ my_hash_search ++ my_hash_delete ++ mysql_bin_log_file_pos ++ mysql_bin_log_file_name ++ mysqld_embedded ++ my_thread_name ++ my_malloc ++ my_no_flags_free ++ _sanity ++ _mymalloc ++ _myfree ++ _my_strdup ++ _my_thread_var ++ my_error ++ pthread_cond_init ++ pthread_cond_signal ++ pthread_cond_wait ++ pthread_cond_destroy ++ localtime_r ++ my_strdup + +diff -Nur win/configure.js.orig win/configure.js +--- win/configure.js.orig 2008-09-26 21:18:37 -05:00 ++++ win/configure.js 2008-10-01 11:21:27 -05:00 +@@ -49,6 +49,7 @@ + case "CYBOZU": + case "EMBED_MANIFESTS": + case "WITH_EMBEDDED_SERVER": ++ case "INNODB_DYNAMIC_PLUGIN": + configfile.WriteLine("SET (" + args.Item(i) + " TRUE)"); + break; + case "MYSQL_SERVER_SUFFIX": + +diff -Nur win/build-vs71.bat.orig win/build-vs71.bat +--- win/build-vs71.bat.orig 2008-08-20 10:21:59 -05:00 ++++ win/build-vs71.bat 2008-10-27 10:52:38 -05:00 +@@ -15,8 +15,10 @@ + REM along with this program; if not, write to the Free Software + REM Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + ++REM CMAKE_BUILD_TYPE can be specified as Release or Debug ++ + if exist cmakecache.txt del cmakecache.txt + copy win\vs71cache.txt cmakecache.txt +-cmake -G "Visual Studio 7 .NET 2003" ++cmake -G "Visual Studio 7 .NET 2003" -DCMAKE_BUILD_TYPE=%1 + copy cmakecache.txt win\vs71cache.txt + +diff -Nur win/build-vs8.bat.orig win/build-vs8.bat +--- win/build-vs8.bat.orig 2008-08-20 10:21:59 -05:00 ++++ win/build-vs8.bat 2008-10-27 10:52:31 -05:00 +@@ -15,7 +15,9 @@ + REM along with this program; if not, write to the Free Software + REM Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + ++REM CMAKE_BUILD_TYPE can be specified as Release or Debug ++ + if exist cmakecache.txt del cmakecache.txt + copy win\vs8cache.txt cmakecache.txt +-cmake -G "Visual Studio 8 2005" ++cmake -G "Visual Studio 8 2005" -DCMAKE_BUILD_TYPE=%1 + copy cmakecache.txt win\vs8cache.txt +diff -Nur win/build-vs8_x64.bat.orig win/build-vs8_x64.bat +--- win/build-vs8_x64.bat.orig 2008-08-20 10:21:59 -05:00 ++++ win/build-vs8_x64.bat 2008-10-27 10:53:11 -05:00 +@@ -15,7 +15,9 @@ + REM along with this program; if not, write to the Free Software + REM Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + ++REM CMAKE_BUILD_TYPE can be specified as Release or Debug ++ + if exist cmakecache.txt del cmakecache.txt + copy win\vs8cache.txt cmakecache.txt +-cmake -G "Visual Studio 8 2005 Win64" ++cmake -G "Visual Studio 8 2005 
Win64" -DCMAKE_BUILD_TYPE=%1 + copy cmakecache.txt win\vs8cache.txt