mirror of https://github.com/MariaDB/server.git synced 2025-07-29 05:21:33 +03:00

Many files:

Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
mysqld.cc:
  Change MySQL default isolation level to REPEATABLE READ; note that InnoDB has always had that default, and BDB and MyISAM always run at SERIALIZABLE level anyway


sql/mysqld.cc:
  Change MySQL default isolation level to REPEATABLE READ; note that InnoDB has always had that default, and BDB and MyISAM always run at SERIALIZABLE level anyway
sql/ha_innodb.cc:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
sql/ha_innodb.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/buf0buf.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/dict0dict.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/fil0fil.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/lock0lock.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0file.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0proc.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0thread.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/page0cur.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/page0page.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/read0read.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/rem0rec.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/srv0srv.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/sync0rw.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/sync0sync.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/trx0purge.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/trx0trx.h:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/rem0rec.ic:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0btr.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0cur.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0pcur.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/buf/buf0buf.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/buf/buf0flu.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/dict/dict0dict.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/fil/fil0fil.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/fsp/fsp0fsp.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/ibuf/ibuf0ibuf.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/lock/lock0lock.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/mem/mem0dbg.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/os/os0file.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/os/os0proc.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/page/page0cur.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/page/page0page.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/pars/lexyy.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/pars/pars0grm.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/read/read0read.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0ins.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0mysql.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0purge.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0sel.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0uins.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0undo.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0upd.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/srv/srv0srv.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/srv/srv0start.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/sync/sync0rw.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/sync/sync0sync.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/trx/trx0purge.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/trx/trx0trx.c:
  Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
This commit is contained in:
unknown
2002-10-29 23:16:46 +02:00
parent 2d9a473bb6
commit 3cb98f0d66
51 changed files with 1577 additions and 580 deletions

View File

@ -274,6 +274,7 @@ btr_page_create(
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
page_create(page, mtr);
buf_block_align(page)->check_index_page_at_flush = TRUE;
btr_page_set_index_id(page, tree->id, mtr);
}
@ -713,6 +714,7 @@ btr_create(
/* Create a new index page on the allocated segment page */
page = page_create(frame, mtr);
buf_block_align(page)->check_index_page_at_flush = TRUE;
/* Set the index id of the page */
btr_page_set_index_id(page, index_id, mtr);
@ -847,6 +849,7 @@ btr_page_reorganize_low(
segment headers, next page-field, etc.) is preserved intact */
page_create(page, mtr);
buf_block_align(page)->check_index_page_at_flush = TRUE;
/* Copy the records from the temporary space to the recreated page;
do not copy the lock bits yet */
@ -919,6 +922,7 @@ btr_page_empty(
segment headers, next page-field, etc.) is preserved intact */
page_create(page, mtr);
buf_block_align(page)->check_index_page_at_flush = TRUE;
}
/*****************************************************************

View File

@ -121,16 +121,19 @@ btr_cur_latch_leaves(
{
ulint left_page_no;
ulint right_page_no;
page_t* get_page;
ut_ad(tree && page && mtr);
if (latch_mode == BTR_SEARCH_LEAF) {
btr_page_get(space, page_no, RW_S_LATCH, mtr);
get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr);
buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else if (latch_mode == BTR_MODIFY_LEAF) {
btr_page_get(space, page_no, RW_X_LATCH, mtr);
get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else if (latch_mode == BTR_MODIFY_TREE) {
@ -138,15 +141,22 @@ btr_cur_latch_leaves(
left_page_no = btr_page_get_prev(page, mtr);
if (left_page_no != FIL_NULL) {
btr_page_get(space, left_page_no, RW_X_LATCH, mtr);
get_page = btr_page_get(space, left_page_no,
RW_X_LATCH, mtr);
buf_block_align(get_page)->check_index_page_at_flush =
TRUE;
}
btr_page_get(space, page_no, RW_X_LATCH, mtr);
get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
buf_block_align(get_page)->check_index_page_at_flush = TRUE;
right_page_no = btr_page_get_next(page, mtr);
if (right_page_no != FIL_NULL) {
btr_page_get(space, right_page_no, RW_X_LATCH, mtr);
get_page = btr_page_get(space, right_page_no,
RW_X_LATCH, mtr);
buf_block_align(get_page)->check_index_page_at_flush =
TRUE;
}
} else if (latch_mode == BTR_SEARCH_PREV) {
@ -157,9 +167,12 @@ btr_cur_latch_leaves(
if (left_page_no != FIL_NULL) {
cursor->left_page = btr_page_get(space, left_page_no,
RW_S_LATCH, mtr);
buf_block_align(
cursor->left_page)->check_index_page_at_flush = TRUE;
}
btr_page_get(space, page_no, RW_S_LATCH, mtr);
get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr);
buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else if (latch_mode == BTR_MODIFY_PREV) {
@ -169,9 +182,12 @@ btr_cur_latch_leaves(
if (left_page_no != FIL_NULL) {
cursor->left_page = btr_page_get(space, left_page_no,
RW_X_LATCH, mtr);
buf_block_align(
cursor->left_page)->check_index_page_at_flush = TRUE;
}
btr_page_get(space, page_no, RW_X_LATCH, mtr);
get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
buf_block_align(get_page)->check_index_page_at_flush = TRUE;
} else {
ut_error;
}
@ -274,6 +290,7 @@ btr_cur_search_to_nth_level(
if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
&& !estimate
&& mode != PAGE_CUR_LE_OR_EXTENDS
&& btr_search_guess_on_hash(index, info, tuple, mode,
latch_mode, cursor,
has_search_latch, mtr)) {
@ -334,12 +351,18 @@ btr_cur_search_to_nth_level(
rw_latch = RW_NO_LATCH;
buf_mode = BUF_GET;
/* We use these modified search modes on non-leaf levels of the
B-tree. These let us end up in the right B-tree leaf. In that leaf
we use the original search mode. */
if (mode == PAGE_CUR_GE) {
page_mode = PAGE_CUR_L;
} else if (mode == PAGE_CUR_G) {
page_mode = PAGE_CUR_LE;
} else if (mode == PAGE_CUR_LE) {
page_mode = PAGE_CUR_LE;
} else if (mode == PAGE_CUR_LE_OR_EXTENDS) {
page_mode = PAGE_CUR_LE_OR_EXTENDS;
} else {
ut_ad(mode == PAGE_CUR_L);
page_mode = PAGE_CUR_L;
@ -391,6 +414,8 @@ retry_page_get:
goto retry_page_get;
}
buf_block_align(page)->check_index_page_at_flush = TRUE;
#ifdef UNIV_SYNC_DEBUG
if (rw_latch != RW_NO_LATCH) {
buf_page_dbg_add_level(page, SYNC_TREE_NODE);
@ -543,6 +568,8 @@ btr_cur_open_at_index_side(
ut_ad(0 == ut_dulint_cmp(tree->id,
btr_page_get_index_id(page)));
buf_block_align(page)->check_index_page_at_flush = TRUE;
if (height == ULINT_UNDEFINED) {
/* We are in the root node */

View File

@ -354,6 +354,7 @@ btr_pcur_move_to_next_page(
ut_ad(next_page_no != FIL_NULL);
next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr);
buf_block_align(next_page)->check_index_page_at_flush = TRUE;
btr_leaf_page_release(page, cursor->latch_mode, mtr);

View File

@ -331,6 +331,11 @@ buf_page_print(
index->table_name,
index->name);
}
} else if (fil_page_get_type(read_buf) == FIL_PAGE_INODE) {
fprintf(stderr, "InnoDB: Page may be an 'inode' page\n");
} else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) {
fprintf(stderr,
"InnoDB: Page may be an insert buffer free list page\n");
}
}
@ -351,6 +356,8 @@ buf_block_init(
block->file_page_was_freed = FALSE;
block->check_index_page_at_flush = FALSE;
rw_lock_create(&(block->lock));
ut_ad(rw_lock_validate(&(block->lock)));
@ -616,6 +623,29 @@ buf_page_peek_block(
return(block);
}
/************************************************************************
Resets the check_index_page_at_flush field of a page if found in the buffer
pool. */
void
buf_reset_check_index_page_at_flush(
/*================================*/
ulint space, /* in: space id */
ulint offset) /* in: page number */
{
buf_block_t* block;
mutex_enter_fast(&(buf_pool->mutex));
block = buf_page_hash_get(space, offset);
if (block) {
block->check_index_page_at_flush = FALSE;
}
mutex_exit(&(buf_pool->mutex));
}
/************************************************************************
Returns the current state of is_hashed of a page. FALSE if the page is
not in the pool. NOTE that this operation does not fix the page in the
@ -1185,6 +1215,8 @@ buf_page_init(
block->space = space;
block->offset = offset;
block->check_index_page_at_flush = FALSE;
block->lock_hash_val = lock_rec_hash(space, offset);
block->lock_mutex = NULL;

View File

@ -15,6 +15,7 @@ Created 11/11/1995 Heikki Tuuri
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
@ -225,6 +226,24 @@ buf_flush_buffered_writes(void)
return;
}
for (i = 0; i < trx_doublewrite->first_free; i++) {
block = trx_doublewrite->buf_block_arr[i];
if (block->check_index_page_at_flush
&& !page_simple_validate(block->frame)) {
buf_page_print(block->frame);
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Apparent corruption of an index page\n"
"InnoDB: to be written to data file. We intentionally crash server\n"
"InnoDB: to prevent corrupt data from ending up in data\n"
"InnoDB: files.\n");
ut_a(0);
}
}
if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
} else {

View File

@ -29,7 +29,14 @@ Created 1/8/1996 Heikki Tuuri
dict_sys_t* dict_sys = NULL; /* the dictionary system */
rw_lock_t dict_foreign_key_check_lock;
rw_lock_t dict_operation_lock; /* table create, drop, etc. reserve
this in X-mode, implicit or background
operations purge, rollback, foreign
key checks reserve this in S-mode; we
cannot trust that MySQL protects
implicit or background operations
from dropping a table: this is our
mechanism */
#define DICT_HEAP_SIZE 100 /* initial memory heap size when
creating a table or index object */
@ -509,9 +516,8 @@ dict_init(void)
UT_LIST_INIT(dict_sys->table_LRU);
rw_lock_create(&dict_foreign_key_check_lock);
rw_lock_set_level(&dict_foreign_key_check_lock,
SYNC_FOREIGN_KEY_CHECK);
rw_lock_create(&dict_operation_lock);
rw_lock_set_level(&dict_operation_lock, SYNC_DICT_OPERATION);
}
/**************************************************************************
@ -1851,7 +1857,7 @@ loop:
/*************************************************************************
Accepts a specified string. Comparisons are case-insensitive. */
static
char*
dict_accept(
/*========*/

View File

@ -967,6 +967,7 @@ fil_extend_last_data_file(
fil_node_t* node;
fil_space_t* space;
fil_system_t* system = fil_system;
byte* buf2;
byte* buf;
ibool success;
ulint i;
@ -981,19 +982,23 @@ fil_extend_last_data_file(
fil_node_prepare_for_io(node, system, space);
buf = mem_alloc(1024 * 1024);
buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE);
buf = ut_align(buf2, UNIV_PAGE_SIZE);
memset(buf, '\0', 1024 * 1024);
for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) {
success = os_file_write(node->name, node->handle, buf,
/* If we use native Windows aio, then also this write is
done using it */
success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
node->name, node->handle, buf,
(node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF,
node->size >> (32 - UNIV_PAGE_SIZE_SHIFT),
1024 * 1024);
1024 * 1024, NULL, NULL);
if (!success) {
break;
}
@ -1003,7 +1008,7 @@ fil_extend_last_data_file(
os_has_said_disk_full = FALSE;
}
mem_free(buf);
mem_free(buf2);
fil_node_complete_io(node, system, OS_FILE_WRITE);
@ -1528,7 +1533,6 @@ fil_page_set_type(
ulint type) /* in: type */
{
ut_ad(page);
ut_ad((type == FIL_PAGE_INDEX) || (type == FIL_PAGE_UNDO_LOG));
mach_write_to_2(page + FIL_PAGE_TYPE, type);
}

View File

@ -769,6 +769,8 @@ fsp_init_file_page_low(
#endif
page = buf_frame_align(ptr);
buf_block_align(page)->check_index_page_at_flush = FALSE;
#ifdef UNIV_BASIC_LOG_DEBUG
/* printf("In log debug version: Erase the contents of the file page\n");
*/
@ -1097,7 +1099,7 @@ fsp_fill_free_list(
/* Initialize the ibuf page in a separate
mini-transaction because it is low in the latching
order, and we must be able to release the its latch
order, and we must be able to release its latch
before returning from the fsp routine */
mtr_start(&ibuf_mtr);
@ -1264,7 +1266,12 @@ fsp_alloc_free_page(
free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
hint % FSP_EXTENT_SIZE, mtr);
ut_a(free != ULINT_UNDEFINED);
if (free == ULINT_UNDEFINED) {
ut_print_buf(((byte*)descr) - 500, 1000);
ut_a(0);
}
xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
@ -1412,7 +1419,12 @@ fsp_free_extent(
descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr);
ut_a(xdes_get_state(descr, mtr) != XDES_FREE);
if (xdes_get_state(descr, mtr) == XDES_FREE) {
ut_print_buf(((byte*)descr) - 500, 1000);
ut_a(0);
}
xdes_init(descr, mtr);
@ -1523,6 +1535,10 @@ fsp_alloc_seg_inode_page(
page = buf_page_get(space, page_no, RW_X_LATCH, mtr);
buf_block_align(page)->check_index_page_at_flush = FALSE;
fil_page_set_type(page, FIL_PAGE_INODE);
buf_page_dbg_add_level(page, SYNC_FSP_PAGE);
for (i = 0; i < FSP_SEG_INODES_PER_PAGE; i++) {
@ -2298,6 +2314,8 @@ fseg_alloc_free_page_low(
fseg_mark_page_used(seg_inode, space, ret_page, mtr);
}
buf_reset_check_index_page_at_flush(space, ret_page);
return(ret_page);
}

View File

@ -1295,6 +1295,8 @@ ibuf_add_free_page(
flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
fil_page_set_type(page, FIL_PAGE_IBUF_FREE_LIST);
ibuf_data->seg_size++;
ibuf_data->free_list_len++;
@ -1305,6 +1307,7 @@ ibuf_add_free_page(
ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_IBUF,
TRUE, &mtr);
mtr_commit(&mtr);
mutex_exit(&ibuf_mutex);

View File

@ -274,6 +274,15 @@ buf_page_peek_block(
ulint space, /* in: space id */
ulint offset);/* in: page number */
/************************************************************************
Resets the check_index_page_at_flush field of a page if found in the buffer
pool. */
void
buf_reset_check_index_page_at_flush(
/*================================*/
ulint space, /* in: space id */
ulint offset);/* in: page number */
/************************************************************************
Sets file_page_was_freed TRUE if the page is found in the buffer pool.
This function should be called when we free a file page and want the
debug version to check that it is not accessed any more unless
@ -648,6 +657,14 @@ struct buf_block_struct{
then it can wait for this rw-lock */
buf_block_t* hash; /* node used in chaining to the page
hash table */
ibool check_index_page_at_flush;
/* TRUE if we know that this is
an index page, and want the database
to check its consistency before flush;
note that there may be pages in the
buffer pool which are index pages,
but this flag is not set because
we do not keep track of all pages */
/* 2. Page flushing fields */
UT_LIST_NODE_T(buf_block_t) flush_list;

View File

@ -26,6 +26,18 @@ Created 1/8/1996 Heikki Tuuri
#include "ut0byte.h"
#include "trx0types.h"
/*************************************************************************
Accepts a specified string. Comparisons are case-insensitive. */
char*
dict_accept(
/*========*/
/* out: if string was accepted, the pointer
is moved after that, else ptr is returned */
char* ptr, /* in: scan from this */
const char* string,/* in: accept only this string as the next
non-whitespace string */
ibool* success);/* out: TRUE if accepted */
/************************************************************************
Decrements the count of open MySQL handles to a table. */
@ -798,7 +810,7 @@ dict_mutex_exit_for_mysql(void);
extern dict_sys_t* dict_sys; /* the dictionary system */
extern rw_lock_t dict_foreign_key_check_lock;
extern rw_lock_t dict_operation_lock;
/* Dictionary system struct */
struct dict_sys_struct{

View File

@ -73,6 +73,8 @@ extern fil_addr_t fil_addr_null;
/* File page types */
#define FIL_PAGE_INDEX 17855
#define FIL_PAGE_UNDO_LOG 2
#define FIL_PAGE_INODE 3
#define FIL_PAGE_IBUF_FREE_LIST 4
/* Space types */
#define FIL_TABLESPACE 501

View File

@ -292,6 +292,27 @@ lock_sec_rec_modify_check_and_lock(
dict_index_t* index, /* in: secondary index */
que_thr_t* thr); /* in: query thread */
/*************************************************************************
Like the counterpart for a clustered index below, but now we read a
secondary index record. */
ulint
lock_sec_rec_read_check_and_lock(
/*=============================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT,
DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
does nothing */
rec_t* rec, /* in: user record or page supremum record
which should be read or passed over by a read
cursor */
dict_index_t* index, /* in: secondary index */
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
LOCK_REC_NOT_GAP */
que_thr_t* thr); /* in: query thread */
/*************************************************************************
Checks if locks of other transactions prevent an immediate read, or passing
over by a read cursor, of a clustered index record. If they do, first tests
if the query thread should anyway be suspended for some reason; if not, then
@ -313,25 +334,8 @@ lock_clust_rec_read_check_and_lock(
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
que_thr_t* thr); /* in: query thread */
/*************************************************************************
Like the counterpart for a clustered index above, but now we read a
secondary index record. */
ulint
lock_sec_rec_read_check_and_lock(
/*=============================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT,
DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set,
does nothing */
rec_t* rec, /* in: user record or page supremum record
which should be read or passed over by a read
cursor */
dict_index_t* index, /* in: secondary index */
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
LOCK_REC_NOT_GAP */
que_thr_t* thr); /* in: query thread */
/*************************************************************************
Checks that a record is seen in a consistent read. */
@ -509,6 +513,7 @@ lock_validate(void);
extern lock_sys_t* lock_sys;
/* Lock modes and types */
/* Basic modes */
#define LOCK_NONE 0 /* this flag is used elsewhere to note
consistent read */
#define LOCK_IS 2 /* intention shared */
@ -519,15 +524,20 @@ extern lock_sys_t* lock_sys;
in an exclusive mode */
#define LOCK_MODE_MASK 0xF /* mask used to extract mode from the
type_mode field in a lock */
/* Lock types */
#define LOCK_TABLE 16 /* these type values should be so high that */
#define LOCK_REC 32 /* they can be ORed to the lock mode */
#define LOCK_TYPE_MASK 0xF0 /* mask used to extract lock type from the
type_mode field in a lock */
/* Waiting lock flag */
#define LOCK_WAIT 256 /* this wait bit should be so high that
it can be ORed to the lock mode and type;
when this bit is set, it means that the
lock has not yet been granted, it is just
waiting for its turn in the wait queue */
/* Precise modes */
#define LOCK_ORDINARY 0 /* this flag denotes an ordinary next-key lock
in contrast to LOCK_GAP or LOCK_REC_NOT_GAP */
#define LOCK_GAP 512 /* this gap bit should be so high that
it can be ORed to the other flags;
when this bit is set, it means that the
@ -537,7 +547,15 @@ extern lock_sys_t* lock_sys;
the bit is set; locks of this type are created
when records are removed from the index chain
of records */
#define LOCK_INSERT_INTENTION 1024 /* this bit is set when we place a waiting
#define LOCK_REC_NOT_GAP 1024 /* this bit means that the lock is only on
the index record and does NOT block inserts
to the gap before the index record; this is
used in the case when we retrieve a record
with a unique key, and is also used in
locking plain SELECTs (not part of UPDATE
or DELETE) when the user has set the READ
COMMITTED isolation level */
#define LOCK_INSERT_INTENTION 2048 /* this bit is set when we place a waiting
gap type record lock request in order to let
an insert of an index record to wait until
there are no conflicting locks by other

View File

@ -111,6 +111,7 @@ log. */
#define OS_WIN31 1
#define OS_WIN95 2
#define OS_WINNT 3
#define OS_WIN2000 4
extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
@ -122,7 +123,7 @@ Gets the operating system version. Currently works only on Windows. */
ulint
os_get_os_version(void);
/*===================*/
/* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */
/* out: OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
/********************************************************************
Creates the seek mutexes used in positioned reads and writes. */

View File

@ -15,6 +15,15 @@ Created 9/30/1995 Heikki Tuuri
typedef void* os_process_t;
typedef unsigned long int os_process_id_t;
/********************************************************************
Converts the current process id to a number. It is not guaranteed that the
number is unique. In Linux returns the 'process number' of the current
thread. That number is the same as one sees in 'top', for example. In Linux
the thread id is not the same as one sees in 'top'. */
ulint
os_proc_get_number(void);
/*====================*/
/********************************************************************
Allocates non-cacheable memory. */

View File

@ -16,11 +16,8 @@ Created 9/8/1995 Heikki Tuuri
this is also the size of the wait slot array for MySQL threads which
can wait inside InnoDB */
#ifdef __WIN__
/* Windows 95/98/ME seemed to have difficulties creating the all
the event semaphores for the wait array slots. If the computer had
<= 64 MB memory, InnoDB startup could take minutes or even crash.
That is why we set this to only 1000 in Windows. */
/* Create fewer event semaphores because Win 98/ME had difficulty creating
40000 event semaphores */
#define OS_THREAD_MAX_N 1000
#else
#define OS_THREAD_MAX_N 10000

View File

@ -26,7 +26,12 @@ Created 10/4/1994 Heikki Tuuri
#define PAGE_CUR_GE 2
#define PAGE_CUR_L 3
#define PAGE_CUR_LE 4
#define PAGE_CUR_DBG 5
#define PAGE_CUR_LE_OR_EXTENDS 5 /* This is a search mode used in
"column LIKE 'abc%' ORDER BY column DESC";
we have to find strings which are <= 'abc' or
which extend it */
#define PAGE_CUR_DBG 6
extern ulint page_cur_short_succ;

View File

@ -666,6 +666,16 @@ page_rec_validate(
/* out: TRUE if ok */
rec_t* rec); /* in: record on the page */
/*******************************************************************
This function checks the consistency of an index page when we do not
know the index. This is also resilient so that this should never crash
even if the page is total garbage. */
ibool
page_simple_validate(
/*=================*/
/* out: TRUE if ok */
page_t* page); /* in: index page */
/*******************************************************************
This function checks the consistency of an index page. */
ibool

View File

@ -45,6 +45,14 @@ read_view_close(
/*============*/
read_view_t* view); /* in: read view */
/*************************************************************************
Closes a consistent read view for MySQL. This function is called at an SQL
statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
void
read_view_close_for_mysql(
/*======================*/
trx_t* trx); /* in: trx which has a read view */
/*************************************************************************
Checks if a read view sees the specified transaction. */
UNIV_INLINE
ibool

View File

@ -148,12 +148,22 @@ data field in the record. */
byte*
rec_get_nth_field(
/*==============*/
/* out: pointer to the field, NULL if SQL null */
/* out: pointer to the field */
rec_t* rec, /* in: record */
ulint n, /* in: index of the field */
ulint* len); /* out: length of the field; UNIV_SQL_NULL
if SQL null */
/****************************************************************
Return field length or UNIV_SQL_NULL. */
UNIV_INLINE
ulint
rec_get_nth_field_len(
/*==================*/
/* out: length of the field; UNIV_SQL_NULL if SQL
null */
rec_t* rec, /* in: record */
ulint n); /* in: index of the field */
/****************************************************************
Gets the physical size of a field. Also an SQL null may have a field of
size > 0, if the data type is of a fixed size. */
UNIV_INLINE

View File

@ -65,6 +65,24 @@ a field stored to another page: */
#define REC_2BYTE_EXTERN_MASK 0x4000
/****************************************************************
Return field length or UNIV_SQL_NULL. */
UNIV_INLINE
ulint
rec_get_nth_field_len(
/*==================*/
/* out: length of the field; UNIV_SQL_NULL if SQL
null */
rec_t* rec, /* in: record */
ulint n) /* in: index of the field */
{
ulint len;
rec_get_nth_field(rec, n, &len);
return(len);
}
/***************************************************************
Sets the value of the ith field SQL null bit. */

View File

@ -57,8 +57,6 @@ extern ulint srv_flush_log_at_trx_commit;
extern byte srv_latin1_ordering[256];/* The sort order table of the latin1
character set */
extern ibool srv_use_native_aio;
extern ulint srv_pool_size;
extern ulint srv_mem_pool_size;
extern ulint srv_lock_table_size;
@ -70,8 +68,9 @@ extern dulint srv_archive_recovery_limit_lsn;
extern ulint srv_lock_wait_timeout;
extern char* srv_unix_file_flush_method_str;
extern char* srv_file_flush_method_str;
extern ulint srv_unix_file_flush_method;
extern ulint srv_win_file_flush_method;
extern ulint srv_force_recovery;
extern ulint srv_thread_concurrency;
@ -154,13 +153,19 @@ typedef struct srv_sys_struct srv_sys_t;
/* The server system */
extern srv_sys_t* srv_sys;
/* Alternatives for the field flush option in Unix; see the InnoDB manual about
/* Alternatives for the file flush option in Unix; see the InnoDB manual about
what these mean */
#define SRV_UNIX_FDATASYNC 1
#define SRV_UNIX_FDATASYNC 1 /* This is the default; it is currently mapped
to a call of fsync() because fdatasync()
seemed to corrupt files in Linux and Solaris */
#define SRV_UNIX_O_DSYNC 2
#define SRV_UNIX_LITTLESYNC 3
#define SRV_UNIX_NOSYNC 4
/* Alternatives for file i/o in Windows */
#define SRV_WIN_IO_NORMAL 1
#define SRV_WIN_IO_UNBUFFERED 2 /* This is the default */
/* Alternatives for srv_force_recovery. Non-zero values are intended
to help the user get a damaged database up so that he can dump intact
tables and rows with SELECT INTO OUTFILE. The database must not otherwise
@ -311,15 +316,17 @@ srv_conc_exit_innodb(
trx_t* trx); /* in: transaction object associated with the
thread */
/*******************************************************************
Puts a MySQL OS thread to wait for a lock to be released. */
Puts a MySQL OS thread to wait for a lock to be released. If an error
occurs during the wait trx->error_state associated with thr is
!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
are possible errors. DB_DEADLOCK is returned if selective deadlock
resolution chose this transaction as a victim. */
ibool
void
srv_suspend_mysql_thread(
/*=====================*/
/* out: TRUE if the lock wait timeout was
exceeded */
que_thr_t* thr); /* in: query thread associated with
the MySQL OS thread */
que_thr_t* thr); /* in: query thread associated with the MySQL
OS thread */
/************************************************************************
Releases a MySQL OS thread waiting for a lock to be released, if the
thread is already suspended. */
@ -407,3 +414,4 @@ struct srv_sys_struct{
extern ulint srv_n_threads_active[];
#endif

View File

@ -335,7 +335,8 @@ ibool
rw_lock_own(
/*========*/
rw_lock_t* lock, /* in: rw-lock */
ulint lock_type); /* in: lock type */
ulint lock_type); /* in: lock type: RW_LOCK_SHARED,
RW_LOCK_EX */
/**********************************************************************
Checks if somebody has locked the rw-lock in the specified mode. */

View File

@ -371,10 +371,12 @@ or row lock! */
#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress
latching order checking */
#define SYNC_LEVEL_NONE 2000 /* default: level not defined */
#define SYNC_FOREIGN_KEY_CHECK 1001
#define SYNC_DICT_OPERATION 1001 /* table create, drop, etc. reserve
this in X-mode, implicit or background
operations purge, rollback, foreign
key checks reserve this in S-mode */
#define SYNC_DICT 1000
#define SYNC_DICT_AUTOINC_MUTEX 999
#define SYNC_PURGE_IS_RUNNING 997
#define SYNC_DICT_HEADER 995
#define SYNC_IBUF_HEADER 914
#define SYNC_IBUF_PESS_INSERT_MUTEX 912

View File

@ -111,9 +111,6 @@ struct trx_purge_struct{
of the trx system and it never ends */
que_t* query; /* The query graph which will do the
parallelized purge operation */
rw_lock_t purge_is_running;/* Purge operation set an x-latch here
while it is accessing a table: this
prevents dropping of the table */
rw_lock_t latch; /* The latch protecting the purge view.
A purge operation must acquire an
x-latch here for the instant at which

View File

@ -327,6 +327,7 @@ struct trx_struct{
time_t start_time; /* time the trx object was created
or the state last time became
TRX_ACTIVE */
ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
ibool check_foreigns; /* normally TRUE, but if the user
wants to suppress foreign key checks,
(in table imports, for example) we
@ -350,6 +351,9 @@ struct trx_struct{
/*------------------------------*/
void* mysql_thd; /* MySQL thread handle corresponding
to this trx, or NULL */
char** mysql_query_str;/* pointer to the field in mysql_thd
which contains the pointer to the
current SQL query string */
char* mysql_log_file_name;
/* if MySQL binlog is used, this field
contains a pointer to the latest file
@ -371,6 +375,9 @@ struct trx_struct{
replication has processed */
os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated
with this transaction object */
ulint mysql_process_no;/* since in Linux, 'top' reports
process id's and not thread id's, we
store the process number too */
/*------------------------------*/
ulint n_mysql_tables_in_use; /* number of Innobase tables
used in the processing of the current
@ -379,9 +386,9 @@ struct trx_struct{
/* how many tables the current SQL
statement uses, except those
in consistent read */
ibool has_dict_foreign_key_check_lock;
ibool has_dict_operation_lock;
/* TRUE if the trx currently holds
an s-lock on dict_foreign_... */
an s-lock on dict_operation_lock */
ibool has_search_latch;
/* TRUE if this trx has latched the
search system latch in S-mode */
@ -523,6 +530,41 @@ struct trx_struct{
#define TRX_QUE_ROLLING_BACK 3 /* transaction is rolling back */
#define TRX_QUE_COMMITTING 4 /* transaction is committing */
/* Transaction isolation levels */
#define TRX_ISO_READ_UNCOMMITTED 1 /* dirty read: non-locking
SELECTs are performed so that
we do not look at a possible
earlier version of a record;
thus they are not 'consistent'
reads under this isolation
level; otherwise like level
2 */
#define TRX_ISO_READ_COMMITTED 2 /* somewhat Oracle-like
isolation, except that in
range UPDATE and DELETE we
must block phantom rows
with next-key locks;
SELECT ... FOR UPDATE and ...
LOCK IN SHARE MODE only lock
the index records, NOT the
gaps before them, and thus
allow free inserting;
each consistent read reads its
own snapshot */
#define TRX_ISO_REPEATABLE_READ 3 /* this is the default;
all consistent reads in the
same trx read the same
snapshot;
full next-key locking used
in locking reads to block
insertions into gaps */
#define TRX_ISO_SERIALIZABLE 4 /* all plain SELECTs are
converted to LOCK IN SHARE
MODE reads */
/* Types of a trx signal */
#define TRX_SIG_NO_SIGNAL 100
#define TRX_SIG_TOTAL_ROLLBACK 1

View File

@ -70,6 +70,11 @@ A waiting record lock can also be of the gap type. A waiting lock request
can be granted when there is no conflicting mode lock request by another
transaction ahead of it in the explicit lock queue.
In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
It only locks the record it is placed on, not the gap before the record.
This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
level.
-------------------------------------------------------------------------
RULE 1: If there is an implicit x-lock on a record, and there are non-gap
-------
@ -294,7 +299,9 @@ struct lock_struct{
UT_LIST_NODE_T(lock_t)
trx_locks; /* list of the locks of the
transaction */
ulint type_mode; /* lock type, mode, gap flag, and
ulint type_mode; /* lock type, mode, LOCK_GAP or
LOCK_REC_NOT_GAP,
LOCK_INSERT_INTENTION,
wait flag, ORed */
hash_node_t hash; /* hash chain node for a record lock */
dict_index_t* index; /* index for a record lock */
@ -309,6 +316,10 @@ Monitor will then fetch it and print */
ibool lock_deadlock_found = FALSE;
char* lock_latest_err_buf; /* We allocate 5000 bytes for this */
/* Flags for recursive deadlock search */
#define LOCK_VICTIM_IS_START 1
#define LOCK_VICTIM_IS_OTHER 2
/************************************************************************
Checks if a lock request results in a deadlock. */
static
@ -700,23 +711,23 @@ lock_rec_get_gap(
}
/*************************************************************************
Sets the gap flag of a record lock. */
Gets the LOCK_REC_NOT_GAP flag of a record lock. */
UNIV_INLINE
void
lock_rec_set_gap(
/*=============*/
lock_t* lock, /* in: record lock */
ibool val) /* in: value to set: TRUE or FALSE */
ibool
lock_rec_get_rec_not_gap(
/*=====================*/
/* out: TRUE if LOCK_REC_NOT_GAP flag set */
lock_t* lock) /* in: record lock */
{
ut_ad(lock);
ut_ad((val == TRUE) || (val == FALSE));
ut_ad(lock_get_type(lock) == LOCK_REC);
if (val) {
lock->type_mode = lock->type_mode | LOCK_GAP;
} else {
lock->type_mode = lock->type_mode & ~LOCK_GAP;
if (lock->type_mode & LOCK_REC_NOT_GAP) {
return(TRUE);
}
return(FALSE);
}
/*************************************************************************
@ -739,26 +750,6 @@ lock_rec_get_insert_intention(
return(FALSE);
}
/*************************************************************************
Sets the waiting insert flag of a record lock. */
UNIV_INLINE
void
lock_rec_set_insert_intention(
/*==========================*/
lock_t* lock, /* in: record lock */
ibool val) /* in: value to set: TRUE or FALSE */
{
ut_ad(lock);
ut_ad((val == TRUE) || (val == FALSE));
ut_ad(lock_get_type(lock) == LOCK_REC);
if (val) {
lock->type_mode = lock->type_mode | LOCK_INSERT_INTENTION;
} else {
lock->type_mode = lock->type_mode & ~LOCK_INSERT_INTENTION;
}
}
/*************************************************************************
Calculates if lock mode 1 is stronger or equal to lock mode 2. */
UNIV_INLINE
@ -848,47 +839,52 @@ lock_rec_has_to_wait(
/* out: TRUE if new lock has to wait for lock2 to be
removed */
trx_t* trx, /* in: trx of new lock */
ulint mode, /* in: LOCK_S or LOCK_X */
ulint gap, /* in: LOCK_GAP or 0 */
ulint insert_intention,
/* in: LOCK_INSERT_INTENTION or 0 */
ulint type_mode,/* in: precise mode of the new lock to set:
LOCK_S or LOCK_X, possibly ORed to
LOCK_GAP or LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION */
lock_t* lock2) /* in: another record lock; NOTE that it is assumed
that this has a lock bit set on the same record as
in lock1 */
in the new lock we are setting */
{
ut_ad(trx && lock2);
ut_ad(lock_get_type(lock2) == LOCK_REC);
ut_ad(mode == LOCK_S || mode == LOCK_X);
ut_ad(gap == LOCK_GAP || gap == 0);
ut_ad(insert_intention == LOCK_INSERT_INTENTION
|| insert_intention == 0);
if (trx != lock2->trx && !lock_mode_compatible(mode,
if (trx != lock2->trx
&& !lock_mode_compatible(LOCK_MODE_MASK & type_mode,
lock_get_mode(lock2))) {
/* We have somewhat complex rules when gap type
record locks cause waits */
/* We have somewhat complex rules when gap type record locks
cause waits */
if (!gap && lock_rec_get_insert_intention(lock2)) {
/* Request of a full next-key record does not
need to wait for an insert intention lock to be
removed. This is ok since our rules allow conflicting
locks on gaps. This eliminates a spurious deadlock
caused by a next-key lock waiting for an insert
intention lock; when the insert intention lock was
granted, the insert deadlocked on the waiting
next-key lock. */
if ((type_mode & LOCK_REC_NOT_GAP)
&& lock_rec_get_gap(lock2)) {
/* Lock on just the record does not need to wait for
a gap type lock */
return(FALSE);
}
if (insert_intention && lock_rec_get_insert_intention(lock2)) {
if ((type_mode & LOCK_GAP)
&& lock_rec_get_rec_not_gap(lock2)) {
/* An insert intention is not disturbed by another
insert intention; this removes a spurious deadlock
caused by inserts which had to wait for a next-key
lock to be removed */
/* Lock on gap does not need to wait for
a LOCK_REC_NOT_GAP type lock */
return(FALSE);
}
if (lock_rec_get_insert_intention(lock2)) {
/* No lock request needs to wait for an insert
intention lock to be removed. This is ok since our
rules allow conflicting locks on gaps. This eliminates
a spurious deadlock caused by a next-key lock waiting
for an insert intention lock; when the insert
intention lock was granted, the insert deadlocked on
the waiting next-key lock.
Also, insert intention locks do not disturb each
other. */
return(FALSE);
}
@ -921,10 +917,7 @@ lock_has_to_wait(
ut_ad(lock_get_type(lock2) == LOCK_REC);
return(lock_rec_has_to_wait(lock1->trx,
lock_get_mode(lock1),
lock_rec_get_gap(lock1),
lock_rec_get_insert_intention(lock1),
lock2));
lock1->type_mode, lock2));
}
return(TRUE);
@ -1386,32 +1379,41 @@ lock_table_has(
/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
/*************************************************************************
Checks if a transaction has a GRANTED explicit lock on rec, where the gap
flag or the insert intention flag is not set, stronger or equal to mode.
Note that locks on the supremum of a page are a special case here, since
they are always gap type locks, even if the gap flag is not set in them. */
Checks if a transaction has a GRANTED explicit lock on rec stronger or equal
to precise_mode. */
UNIV_INLINE
lock_t*
lock_rec_has_expl(
/*==============*/
/* out: lock or NULL */
ulint mode, /* in: lock mode */
ulint precise_mode,/* in: LOCK_S or LOCK_X possibly ORed to
LOCK_GAP or LOCK_REC_NOT_GAP,
for a supremum record we regard this always a gap
type request */
rec_t* rec, /* in: record */
trx_t* trx) /* in: transaction */
{
lock_t* lock;
ut_ad(mutex_own(&kernel_mutex));
ut_ad((mode == LOCK_X) || (mode == LOCK_S));
ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
|| (precise_mode & LOCK_MODE_MASK) == LOCK_X);
ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
lock = lock_rec_get_first(rec);
while (lock) {
if (lock->trx == trx
&& lock_mode_stronger_or_eq(lock_get_mode(lock), mode)
&& lock_mode_stronger_or_eq(lock_get_mode(lock),
precise_mode & LOCK_MODE_MASK)
&& !lock_get_wait(lock)
&& !lock_rec_get_insert_intention(lock)
&& !lock_rec_get_gap(lock)) {
&& (!lock_rec_get_rec_not_gap(lock)
|| (precise_mode & LOCK_REC_NOT_GAP)
|| page_rec_is_supremum(rec))
&& (!lock_rec_get_gap(lock)
|| (precise_mode & LOCK_GAP)
|| page_rec_is_supremum(rec))
&& (!lock_rec_get_insert_intention(lock))) {
return(lock);
}
@ -1429,7 +1431,7 @@ lock_t*
lock_rec_other_has_expl_req(
/*========================*/
/* out: lock or NULL */
ulint mode, /* in: lock mode */
ulint mode, /* in: LOCK_S or LOCK_X */
ulint gap, /* in: LOCK_GAP if also gap locks are taken
into account, or 0 if not */
ulint wait, /* in: LOCK_WAIT if also waiting locks are
@ -1471,27 +1473,21 @@ lock_t*
lock_rec_other_has_conflicting(
/*===========================*/
/* out: lock or NULL */
ulint mode, /* in: lock mode of the lock we are going to reserve */
ulint gap, /* in: LOCK_GAP if we are going to reserve a gap type
lock, else 0 */
ulint insert_intention,
/* in: LOCK_INSERT_INTENTION if we are going to
reserve an insert intention lock */
ulint mode, /* in: LOCK_S or LOCK_X,
possibly ORed to LOCK_GAP or LOCK_REC_NOT_GAP,
LOCK_INSERT_INTENTION */
rec_t* rec, /* in: record to look at */
trx_t* trx) /* in: our transaction */
{
lock_t* lock;
ut_ad(mutex_own(&kernel_mutex));
ut_ad(mode == LOCK_X || mode == LOCK_S);
ut_ad(gap == 0 || gap == LOCK_GAP);
ut_ad(insert_intention == LOCK_INSERT_INTENTION
|| insert_intention == 0);
lock = lock_rec_get_first(rec);
while (lock) {
if (lock_rec_has_to_wait(trx, mode, gap, insert_intention,
lock)) {
if (lock_rec_has_to_wait(trx, mode, lock)) {
return(lock);
}
@ -1607,14 +1603,14 @@ lock_rec_create(
page_no = buf_frame_get_page_no(page);
heap_no = rec_get_heap_no(rec);
/* If rec is the supremum record, then we reset the gap bit, as
all locks on the supremum are automatically of the gap type, and
we try to avoid unnecessary memory consumption of a new record lock
struct for a gap type lock */
/* If rec is the supremum record, then we reset the gap and
LOCK_REC_NOT_GAP bits, as all locks on the supremum are
automatically of the gap type */
if (rec == page_get_supremum_rec(page)) {
ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
type_mode = type_mode & ~LOCK_GAP;
type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
}
/* Make lock bitmap bigger by a safety margin */
@ -1666,10 +1662,14 @@ ulint
lock_rec_enqueue_waiting(
/*=====================*/
/* out: DB_LOCK_WAIT, DB_DEADLOCK, or
DB_QUE_THR_SUSPENDED */
DB_QUE_THR_SUSPENDED, or DB_SUCCESS;
DB_SUCCESS means that there was a deadlock,
but another transaction was chosen as a
victim, and we got the lock immediately:
no need to wait then */
ulint type_mode,/* in: lock mode this transaction is
requesting: LOCK_S or LOCK_X, ORed with
LOCK_GAP if a gap lock is requested, ORed
requesting: LOCK_S or LOCK_X, possibly ORed
with LOCK_GAP or LOCK_REC_NOT_GAP, ORed
with LOCK_INSERT_INTENTION if this waiting
lock request is set when performing an
insert of an index record */
@ -1718,6 +1718,14 @@ index->table_name);
return(DB_DEADLOCK);
}
/* If there was a deadlock but we chose another transaction as a
victim, it is possible that we already have the lock now granted! */
if (trx->wait_lock == NULL) {
return(DB_SUCCESS);
}
trx->que_state = TRX_QUE_LOCK_WAIT;
trx->wait_started = time(NULL);
@ -1744,8 +1752,8 @@ lock_rec_add_to_queue(
/*==================*/
/* out: lock where the bit was set, NULL if out
of memory */
ulint type_mode,/* in: lock mode, wait, and gap flags; type
is ignored and replaced by LOCK_REC */
ulint type_mode,/* in: lock mode, wait, gap etc. flags;
type is ignored and replaced by LOCK_REC */
rec_t* rec, /* in: record on page */
dict_index_t* index, /* in: index of record */
trx_t* trx) /* in: transaction */
@ -1759,12 +1767,11 @@ lock_rec_add_to_queue(
ut_ad(mutex_own(&kernel_mutex));
ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP))
|| ((type_mode & LOCK_MODE_MASK) != LOCK_S)
|| !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT,
rec, trx));
|| !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, rec, trx));
ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP))
|| ((type_mode & LOCK_MODE_MASK) != LOCK_X)
|| !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
rec, trx));
|| !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, trx));
type_mode = type_mode | LOCK_REC;
page = buf_frame_align(rec);
@ -1775,12 +1782,15 @@ lock_rec_add_to_queue(
struct for a gap type lock */
if (rec == page_get_supremum_rec(page)) {
ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
type_mode = type_mode & ~LOCK_GAP;
/* There should never be LOCK_REC_NOT_GAP on a supremum
record, but let us play safe */
type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
}
/* Look for a waiting lock request on the same record, or for a
similar record lock on the same page */
/* Look for a waiting lock request on the same record or on a gap */
heap_no = rec_get_heap_no(rec);
lock = lock_rec_get_first_on_page(rec);
@ -1795,6 +1805,9 @@ lock_rec_add_to_queue(
lock = lock_rec_get_next_on_page(lock);
}
/* Look for a similar record lock on the same page: if one is found
and there are no waiting lock requests, we can just set the bit */
similar_lock = lock_rec_find_similar_on_page(type_mode, rec, trx);
if (similar_lock && !somebody_waits && !(type_mode & LOCK_WAIT)) {
@ -1822,7 +1835,8 @@ lock_rec_lock_fast(
ibool impl, /* in: if TRUE, no lock is set if no wait
is necessary: we assume that the caller will
set an implicit lock */
ulint mode, /* in: lock mode */
ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly
ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index of record */
que_thr_t* thr) /* in: query thread */
@ -1831,7 +1845,15 @@ lock_rec_lock_fast(
ulint heap_no;
ut_ad(mutex_own(&kernel_mutex));
ut_ad((mode == LOCK_X) || (mode == LOCK_S));
ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
|| lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
|| lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
|| (LOCK_MODE_MASK & mode) == LOCK_X);
ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
|| mode - (LOCK_MODE_MASK & mode) == 0
|| mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
heap_no = rec_get_heap_no(rec);
@ -1877,7 +1899,8 @@ lock_rec_lock_slow(
ibool impl, /* in: if TRUE, no lock is set if no wait is
necessary: we assume that the caller will set
an implicit lock */
ulint mode, /* in: lock mode */
ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly
ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index of record */
que_thr_t* thr) /* in: query thread */
@ -1886,20 +1909,24 @@ lock_rec_lock_slow(
ulint err;
ut_ad(mutex_own(&kernel_mutex));
ut_ad((mode == LOCK_X) || (mode == LOCK_S));
ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
|| lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
|| lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
|| (LOCK_MODE_MASK & mode) == LOCK_X);
ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
|| mode - (LOCK_MODE_MASK & mode) == 0
|| mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
trx = thr_get_trx(thr);
ut_ad((mode != LOCK_S) || lock_table_has(trx, index->table,
LOCK_IS));
ut_ad((mode != LOCK_X) || lock_table_has(trx, index->table,
LOCK_IX));
if (lock_rec_has_expl(mode, rec, trx)) {
/* The trx already has a strong enough lock on rec: do
nothing */
err = DB_SUCCESS;
} else if (lock_rec_other_has_conflicting(mode, 0, 0, rec, trx)) {
} else if (lock_rec_other_has_conflicting(mode, rec, trx)) {
/* If another transaction has a non-gap conflicting request in
the queue, as this transaction does not have a lock strong
@ -1935,7 +1962,8 @@ lock_rec_lock(
ibool impl, /* in: if TRUE, no lock is set if no wait is
necessary: we assume that the caller will set
an implicit lock */
ulint mode, /* in: lock mode */
ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly
ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index of record */
que_thr_t* thr) /* in: query thread */
@ -1943,10 +1971,15 @@ lock_rec_lock(
ulint err;
ut_ad(mutex_own(&kernel_mutex));
ut_ad((mode != LOCK_S) || lock_table_has(thr_get_trx(thr),
index->table, LOCK_IS));
ut_ad((mode != LOCK_X) || lock_table_has(thr_get_trx(thr),
index->table, LOCK_IX));
ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
|| lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
|| lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
|| (LOCK_MODE_MASK & mode) == LOCK_X);
ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
|| mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP
|| mode - (LOCK_MODE_MASK & mode) == 0);
if (lock_rec_lock_fast(impl, mode, rec, index, thr)) {
@ -2030,8 +2063,15 @@ lock_grant(
ut_dulint_get_low(lock->trx->id));
}
/* If we are resolving a deadlock by choosing another transaction
as a victim, then our original transaction may not be in the
TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait
for it */
if (lock->trx->que_state == TRX_QUE_LOCK_WAIT) {
trx_end_lock_wait(lock->trx);
}
}
/*****************************************************************
Cancels a waiting record lock request and releases the waiting transaction
@ -2199,9 +2239,10 @@ lock_rec_reset_and_release_wait(
}
/*****************************************************************
Makes a record to inherit the locks of another record as gap type locks, but
does not reset the lock bits of the other record. Also waiting lock requests
on rec are inherited as GRANTED gap locks. */
Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type)
of another record as gap type locks, but does not reset the lock bits of
the other record. Also waiting lock requests on rec are inherited as
GRANTED gap locks. */
void
lock_rec_inherit_to_gap(
@ -2217,9 +2258,45 @@ lock_rec_inherit_to_gap(
lock = lock_rec_get_first(rec);
while (lock != NULL) {
lock_rec_add_to_queue(((lock->type_mode | LOCK_GAP)
& ~LOCK_WAIT),
if (!lock_rec_get_insert_intention(lock)) {
lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock)
| LOCK_GAP,
heir, lock->index, lock->trx);
}
lock = lock_rec_get_next(rec, lock);
}
}
/*****************************************************************
Makes a record to inherit the gap locks (except LOCK_INSERT_INTENTION type)
of another record as gap type locks, but does not reset the lock bits of the
other record. Also waiting lock requests are inherited as GRANTED gap locks. */
void
lock_rec_inherit_to_gap_if_gap_lock(
/*================================*/
rec_t* heir, /* in: record which inherits */
rec_t* rec) /* in: record from which inherited; does NOT reset
the locks on this record */
{
lock_t* lock;
ut_ad(mutex_own(&kernel_mutex));
lock = lock_rec_get_first(rec);
while (lock != NULL) {
/* Insert intention locks are never inherited. A lock is
only carried over if it locks the gap: either its
LOCK_REC_NOT_GAP flag is not set, or rec is the supremum
record, on which all locks are of the gap type anyway. */
if (!lock_rec_get_insert_intention(lock)
&& (page_rec_is_supremum(rec)
|| !lock_rec_get_rec_not_gap(lock))) {
/* Inherited locks are added as GRANTED (no LOCK_WAIT
flag), regardless of whether the source lock was
waiting. */
lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock)
| LOCK_GAP,
heir, lock->index, lock->trx);
}
lock = lock_rec_get_next(rec, lock);
}
}
@ -2778,9 +2855,10 @@ lock_update_insert(
{
lock_mutex_enter_kernel();
/* Inherit the locks for rec, in gap mode, from the next record */
/* Inherit the gap-locking locks for rec, in gap mode, from the next
record */
lock_rec_inherit_to_gap(rec, page_rec_get_next(rec));
lock_rec_inherit_to_gap_if_gap_lock(rec, page_rec_get_next(rec));
lock_mutex_exit_kernel();
}
@ -2859,20 +2937,23 @@ static
ibool
lock_deadlock_occurs(
/*=================*/
/* out: TRUE if a deadlock was detected */
/* out: TRUE if a deadlock was detected and we
chose trx as a victim; FALSE if no deadlock, or
there was a deadlock, but we chose other
transaction(s) as victim(s) */
lock_t* lock, /* in: lock the transaction is requesting */
trx_t* trx) /* in: transaction */
{
dict_table_t* table;
dict_index_t* index;
trx_t* mark_trx;
ibool ret;
ulint ret;
ulint cost = 0;
char* err_buf;
ut_ad(trx && lock);
ut_ad(mutex_own(&kernel_mutex));
retry:
/* We check that adding this trx to the waits-for graph
does not produce a cycle. First mark all active transactions
with 0: */
@ -2886,7 +2967,14 @@ lock_deadlock_occurs(
ret = lock_deadlock_recursive(trx, trx, lock, &cost);
if (ret) {
if (ret == LOCK_VICTIM_IS_OTHER) {
/* We chose some other trx as a victim: retry if there still
is a deadlock */
goto retry;
}
if (ret == LOCK_VICTIM_IS_START) {
if (lock_get_type(lock) == LOCK_TABLE) {
table = lock->un_member.tab_lock.table;
index = NULL;
@ -2899,19 +2987,6 @@ lock_deadlock_occurs(
err_buf = lock_latest_err_buf + strlen(lock_latest_err_buf);
err_buf += sprintf(err_buf,
"*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
ut_a(err_buf <= lock_latest_err_buf + 4000);
if (lock_get_type(lock) == LOCK_REC) {
lock_rec_print(err_buf, lock);
err_buf += strlen(err_buf);
} else {
lock_table_print(err_buf, lock);
err_buf += strlen(err_buf);
}
ut_a(err_buf <= lock_latest_err_buf + 4000);
err_buf += sprintf(err_buf,
@ -2923,30 +2998,39 @@ lock_deadlock_occurs(
sess_raise_error_low(trx, DB_DEADLOCK, lock->type_mode, table,
index, NULL, NULL, NULL);
*/
return(TRUE);
}
return(ret);
return(FALSE);
}
/************************************************************************
Looks recursively for a deadlock. */
static
ibool
ulint
lock_deadlock_recursive(
/*====================*/
/* out: TRUE if a deadlock was detected
or the calculation took too long */
/* out: 0 if no deadlock found,
LOCK_VICTIM_IS_START if there was a deadlock
and we chose 'start' as the victim,
LOCK_VICTIM_IS_OTHER if a deadlock
was found and we chose some other trx as a
victim: we must do the search again in this
last case because there may be another
deadlock! */
trx_t* start, /* in: recursion starting point */
trx_t* trx, /* in: a transaction waiting for a lock */
lock_t* wait_lock, /* in: the lock trx is waiting to be granted */
ulint* cost) /* in/out: number of calculation steps thus
far: if this exceeds LOCK_MAX_N_STEPS_...
we return TRUE */
we return LOCK_VICTIM_IS_START */
{
lock_t* lock;
ulint bit_no;
trx_t* lock_trx;
char* err_buf;
ulint ret;
ut_a(trx && start && wait_lock);
ut_ad(mutex_own(&kernel_mutex));
@ -2955,14 +3039,14 @@ lock_deadlock_recursive(
/* We have already exhaustively searched the subtree starting
from this trx */
return(FALSE);
return(0);
}
*cost = *cost + 1;
if (*cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK) {
return(TRUE);
return(LOCK_VICTIM_IS_START);
}
lock = wait_lock;
@ -2998,6 +3082,9 @@ lock_deadlock_recursive(
lock_trx = lock->trx;
if (lock_trx == start) {
/* We came back to the recursion starting
point: a deadlock detected */
err_buf = lock_latest_err_buf;
ut_sprintf_timestamp(err_buf);
@ -3045,11 +3132,59 @@ lock_deadlock_recursive(
ut_a(err_buf <= lock_latest_err_buf + 4000);
err_buf += sprintf(err_buf,
"*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
ut_a(err_buf <= lock_latest_err_buf + 4000);
if (lock_get_type(start->wait_lock)
== LOCK_REC) {
lock_rec_print(err_buf,
start->wait_lock);
err_buf += strlen(err_buf);
} else {
lock_table_print(err_buf,
start->wait_lock);
err_buf += strlen(err_buf);
}
if (lock_print_waits) {
printf("Deadlock detected\n");
}
return(TRUE);
if (ut_dulint_cmp(wait_lock->trx->undo_no,
start->undo_no) >= 0) {
/* Our recursion starting point
transaction is 'smaller', let us
choose 'start' as the victim and roll
back it */
return(LOCK_VICTIM_IS_START);
}
lock_deadlock_found = TRUE;
ut_a(err_buf <= lock_latest_err_buf + 4000);
/* Let us choose the transaction of wait_lock
as a victim to try to avoid deadlocking our
recursion starting point transaction */
err_buf += sprintf(err_buf,
"*** WE ROLL BACK TRANSACTION (1)\n");
wait_lock->trx->error_state = DB_DEADLOCK;
lock_cancel_waiting_and_release(wait_lock);
/* Since trx and wait_lock are no longer
in the waits-for graph, we can return FALSE;
note that our selective algorithm can choose
several transactions as victims, but still
we may end up rolling back also the recursion
starting point transaction! */
return(LOCK_VICTIM_IS_OTHER);
}
if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
@ -3058,10 +3193,11 @@ lock_deadlock_recursive(
incompatible mode, and is itself waiting for
a lock */
if (lock_deadlock_recursive(start, lock_trx,
lock_trx->wait_lock, cost)) {
ret = lock_deadlock_recursive(start, lock_trx,
lock_trx->wait_lock, cost);
if (ret != 0) {
return(TRUE);
return(ret);
}
}
}
@ -3153,12 +3289,16 @@ lock_table_remove_low(
/*************************************************************************
Enqueues a waiting request for a table lock which cannot be granted
immediately. Checks for deadlocks. */
static
ulint
lock_table_enqueue_waiting(
/*=======================*/
/* out: DB_LOCK_WAIT, DB_DEADLOCK, or
DB_QUE_THR_SUSPENDED */
DB_QUE_THR_SUSPENDED, or DB_SUCCESS;
DB_SUCCESS means that there was a deadlock,
but another transaction was chosen as a
victim, and we got the lock immediately:
no need to wait then */
ulint mode, /* in: lock mode this transaction is
requesting */
dict_table_t* table, /* in: table */
@ -3205,6 +3345,13 @@ table->name);
return(DB_DEADLOCK);
}
if (trx->wait_lock == NULL) {
/* Deadlock resolution chose another transaction as a victim,
and we accidentally got our lock granted! */
return(DB_SUCCESS);
}
trx->que_state = TRX_QUE_LOCK_WAIT;
trx->wait_started = time(NULL);
@ -3292,7 +3439,7 @@ lock_table(
if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) {
/* Another trx has a request on the table in an incompatible
mode: this trx must wait */
mode: this trx may have to wait */
err = lock_table_enqueue_waiting(mode, table, thr);
@ -3659,7 +3806,11 @@ lock_rec_print(
}
if (lock_rec_get_gap(lock)) {
buf += sprintf(buf, " gap type lock");
buf += sprintf(buf, " locks gap before rec");
}
if (lock_rec_get_rec_not_gap(lock)) {
buf += sprintf(buf, " locks rec but not gap");
}
if (lock_rec_get_insert_intention(lock)) {
@ -4080,7 +4231,8 @@ lock_rec_queue_validate(
if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0,
LOCK_WAIT, rec, impl_trx)) {
ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx));
ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec,
impl_trx));
}
}
@ -4095,7 +4247,8 @@ lock_rec_queue_validate(
if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0,
LOCK_WAIT, rec, impl_trx)) {
ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx));
ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec,
impl_trx));
}
}
@ -4359,8 +4512,8 @@ lock_rec_insert_check_and_lock(
*inherit = TRUE;
/* If another transaction has an explicit lock request, gap or not,
waiting or granted, on the successor, the insert has to wait.
/* If another transaction has an explicit lock request which locks
the gap, waiting or granted, on the successor, the insert has to wait.
An exception is the case where the lock by the another transaction
is a gap type lock which it placed to wait for its turn to insert. We
@ -4369,8 +4522,10 @@ lock_rec_insert_check_and_lock(
had to wait for their insert. Both had waiting gap type lock requests
on the successor, which produced an unnecessary deadlock. */
if (lock_rec_other_has_conflicting(LOCK_X, LOCK_GAP,
LOCK_INSERT_INTENTION, next_rec, trx)) {
if (lock_rec_other_has_conflicting(LOCK_X | LOCK_GAP
| LOCK_INSERT_INTENTION, next_rec, trx)) {
/* Note that we may get DB_SUCCESS also here! */
err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP
| LOCK_INSERT_INTENTION,
next_rec, index, thr);
@ -4418,9 +4573,11 @@ lock_rec_convert_impl_to_expl(
/* If the transaction has no explicit x-lock set on the
record, set one for it */
if (!lock_rec_has_expl(LOCK_X, rec, impl_trx)) {
if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec,
impl_trx)) {
lock_rec_add_to_queue(LOCK_REC | LOCK_X, rec, index,
lock_rec_add_to_queue(LOCK_REC | LOCK_X
| LOCK_REC_NOT_GAP, rec, index,
impl_trx);
}
}
@ -4466,7 +4623,7 @@ lock_clust_rec_modify_check_and_lock(
lock_rec_convert_impl_to_expl(rec, index);
err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr);
err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr);
lock_mutex_exit_kernel();
@ -4511,7 +4668,7 @@ lock_sec_rec_modify_check_and_lock(
ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr);
err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr);
lock_mutex_exit_kernel();
@ -4545,6 +4702,8 @@ lock_sec_rec_read_check_and_lock(
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
LOCK_REC_NOT_GAP */
que_thr_t* thr) /* in: query thread */
{
ulint err;
@ -4576,7 +4735,7 @@ lock_sec_rec_read_check_and_lock(
lock_rec_convert_impl_to_expl(rec, index);
}
err = lock_rec_lock(FALSE, mode, rec, index, thr);
err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr);
lock_mutex_exit_kernel();
@ -4607,13 +4766,16 @@ lock_clust_rec_read_check_and_lock(
ulint mode, /* in: mode of the lock which the read cursor
should set on records: LOCK_S or LOCK_X; the
latter is possible in SELECT FOR UPDATE */
ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or
LOCK_REC_NOT_GAP */
que_thr_t* thr) /* in: query thread */
{
ulint err;
ut_ad(index->type & DICT_CLUSTERED);
ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
|| gap_mode == LOCK_REC_NOT_GAP);
if (flags & BTR_NO_LOCKING_FLAG) {
return(DB_SUCCESS);
@ -4631,7 +4793,7 @@ lock_clust_rec_read_check_and_lock(
lock_rec_convert_impl_to_expl(rec, index);
}
err = lock_rec_lock(FALSE, mode, rec, index, thr);
err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr);
lock_mutex_exit_kernel();

View File

@ -350,6 +350,16 @@ mem_hash_remove(
node->nth_heap);
printf("in %s line %lu and tried to free in %s line %lu.\n",
node->file_name, node->line, file_name, line);
printf(
"Hex dump of 400 bytes around memory heap first block start:\n");
ut_print_buf((byte*)(node->heap) - 200, 400);
printf("\nDump of the mem heap:\n");
mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, &size,
NULL, NULL);
ut_error;
}

View File

@ -148,7 +148,7 @@ Gets the operating system version. Currently works only on Windows. */
ulint
os_get_os_version(void)
/*===================*/
/* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */
/* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
{
#ifdef __WIN__
OSVERSIONINFO os_info;
@ -162,7 +162,11 @@ os_get_os_version(void)
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
return(OS_WIN95);
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
if (os_info.dwMajorVersion <= 4) {
return(OS_WINNT);
} else {
return(OS_WIN2000);
}
} else {
ut_error;
return(0);
@ -268,9 +272,7 @@ os_file_get_last_error(void)
}
/********************************************************************
Does error handling when a file operation fails. If we have run out
of disk space, then the user can clean the disk. If we do not find
a specified file, then the user can copy it to disk. */
Does error handling when a file operation fails. */
static
ibool
os_file_handle_error(
@ -503,7 +505,11 @@ try_again:
value 2 denotes that we do not flush the log at every
commit, but only once per second */
} else {
attributes = attributes | FILE_FLAG_NO_BUFFERING;
if (srv_win_file_flush_method ==
SRV_WIN_IO_UNBUFFERED) {
attributes = attributes
| FILE_FLAG_NO_BUFFERING;
}
}
#endif
} else if (purpose == OS_FILE_NORMAL) {
@ -514,7 +520,11 @@ try_again:
value 2 denotes that we do not flush the log at every
commit, but only once per second */
} else {
attributes = attributes | FILE_FLAG_NO_BUFFERING;
if (srv_win_file_flush_method ==
SRV_WIN_IO_UNBUFFERED) {
attributes = attributes
| FILE_FLAG_NO_BUFFERING;
}
}
#endif
} else {
@ -1752,6 +1762,7 @@ os_aio(
os_aio_array_t* array;
os_aio_slot_t* slot;
#ifdef WIN_ASYNC_IO
ibool retval;
BOOL ret = TRUE;
DWORD len = n;
void* dummy_mess1;
@ -1824,6 +1835,8 @@ try_again:
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
os_n_file_reads++;
os_bytes_read_since_printout += len;
ret = ReadFile(file, buf, (DWORD)n, &len,
&(slot->control));
#elif defined(POSIX_ASYNC_IO)
@ -1870,10 +1883,12 @@ try_again:
where we also use async i/o: in Windows we must
use the same wait mechanism as for async i/o */
return(os_aio_windows_handle(ULINT_UNDEFINED,
retval = os_aio_windows_handle(ULINT_UNDEFINED,
slot->pos,
&dummy_mess1, &dummy_mess2,
&dummy_type));
&dummy_type);
return(retval);
}
return(TRUE);
@ -1897,8 +1912,6 @@ try_again:
goto try_again;
}
ut_error;
return(FALSE);
}
@ -1958,14 +1971,14 @@ os_aio_windows_handle(
n = array->n_slots / array->n_segments;
if (array == os_aio_sync_array) {
srv_io_thread_op_info[orig_seg] = "wait windows aio for 1 page";
srv_io_thread_op_info[orig_seg] = "wait Windows aio for 1 page";
ut_ad(pos < array->n_slots);
os_event_wait(array->events[pos]);
i = pos;
} else {
srv_io_thread_op_info[orig_seg] =
"wait windows aio for n pages";
"wait Windows aio";
i = os_event_wait_multiple(n, (array->events) + segment * n);
}
@ -1991,9 +2004,7 @@ os_aio_windows_handle(
ut_a(TRUE == os_file_flush(slot->file));
}
} else {
os_file_get_last_error();
ut_error;
os_file_handle_error(slot->file, slot->name);
ret_val = FALSE;
}

View File

@ -18,6 +18,23 @@ Created 9/30/1995 Heikki Tuuri
#include "ut0mem.h"
/********************************************************************
Converts the current process id to a number. It is not guaranteed that the
number is unique. In Linux returns the 'process number' of the current
thread. That number is the same as one sees in 'top', for example. In Linux
the thread id is not the same as one sees in 'top'. */
ulint
os_proc_get_number(void)
/*====================*/
{
#ifdef __WIN__
return((ulint)GetCurrentProcessId());
#else
return((ulint)getpid());
#endif
}
/********************************************************************
Allocates non-cacheable memory. */

View File

@ -169,7 +169,7 @@ page_cur_search_with_match(
ut_ad(dtuple_check_typed(tuple));
ut_ad((mode == PAGE_CUR_L) || (mode == PAGE_CUR_LE)
|| (mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)
|| (mode == PAGE_CUR_DBG));
|| (mode == PAGE_CUR_LE_OR_EXTENDS) || (mode == PAGE_CUR_DBG));
#ifdef PAGE_CUR_ADAPT
if ((page_header_get_field(page, PAGE_LEVEL) == 0)
@ -232,9 +232,26 @@ page_cur_search_with_match(
low_matched_bytes = cur_matched_bytes;
} else if (cmp == -1) {
if (mode == PAGE_CUR_LE_OR_EXTENDS
&& dfield_get_len(dtuple_get_nth_field(tuple,
cur_matched_fields))
== cur_matched_bytes
&& rec_get_nth_field_len(mid_rec,
cur_matched_fields)
!= UNIV_SQL_NULL) {
/* This means current dfield is not SQL
NULL, and the current rec field extends it */
low = mid;
low_matched_fields = cur_matched_fields;
low_matched_bytes = cur_matched_bytes;
} else {
up = mid;
up_matched_fields = cur_matched_fields;
up_matched_bytes = cur_matched_bytes;
}
} else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) {
low = mid;
@ -252,8 +269,8 @@ page_cur_search_with_match(
slot = page_dir_get_nth_slot(page, up);
up_rec = page_dir_slot_get_rec(slot);
/* Perform linear search until the upper and lower records
come to distance 1 of each other. */
/* Perform linear search until the upper and lower records come to
distance 1 of each other. */
while (page_rec_get_next(low_rec) != up_rec) {
@ -272,10 +289,25 @@ page_cur_search_with_match(
low_matched_bytes = cur_matched_bytes;
} else if (cmp == -1) {
if (mode == PAGE_CUR_LE_OR_EXTENDS
&& dfield_get_len(dtuple_get_nth_field(tuple,
cur_matched_fields))
== cur_matched_bytes
&& rec_get_nth_field_len(mid_rec,
cur_matched_fields)
!= UNIV_SQL_NULL) {
/* This means current dfield is not SQL
NULL, and the current rec field extends it */
low = mid;
low_matched_fields = cur_matched_fields;
low_matched_bytes = cur_matched_bytes;
} else {
up_rec = mid_rec;
up_matched_fields = cur_matched_fields;
up_matched_bytes = cur_matched_bytes;
}
} else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) {
low_rec = mid_rec;
low_matched_fields = cur_matched_fields;

View File

@ -1312,6 +1312,194 @@ page_rec_validate(
return(TRUE);
}
/*******************************************************************
This function checks the consistency of an index page when we do not
know the index. This is also resilient so that this should never crash
even if the page is total garbage. */
ibool
page_simple_validate(
/*=================*/
/* out: TRUE if ok */
page_t* page) /* in: index page */
{
page_cur_t cur;
page_dir_slot_t* slot;
ulint slot_no;
ulint n_slots;
rec_t* rec;
byte* rec_heap_top;
ulint count;
ulint own_count;
ibool ret = FALSE;
/* Check first that the record heap and the directory do not
overlap. */
n_slots = page_dir_get_n_slots(page);
if (n_slots > UNIV_PAGE_SIZE / 4) {
fprintf(stderr,
"Nonsensical number %lu of page dir slots\n", n_slots);
goto func_exit;
}
rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
if (rec_heap_top > page_dir_get_nth_slot(page, n_slots - 1)) {
fprintf(stderr,
"Record heap and dir overlap on a page, heap top %lu, dir %lu\n",
(ulint)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page),
(ulint)(page_dir_get_nth_slot(page, n_slots - 1) - page));
goto func_exit;
}
/* Validate the record list in a loop checking also that it is
consistent with the page record directory. */
count = 0;
own_count = 1;
slot_no = 0;
slot = page_dir_get_nth_slot(page, slot_no);
page_cur_set_before_first(page, &cur);
for (;;) {
rec = (&cur)->rec;
if (rec > rec_heap_top) {
fprintf(stderr,
"Record %lu is above rec heap top %lu\n",
(ulint)(rec - page), (ulint)(rec_heap_top - page));
goto func_exit;
}
if (rec_get_n_owned(rec) != 0) {
/* This is a record pointed to by a dir slot */
if (rec_get_n_owned(rec) != own_count) {
fprintf(stderr,
"Wrong owned count %lu, %lu, rec %lu\n",
rec_get_n_owned(rec), own_count,
(ulint)(rec - page));
goto func_exit;
}
if (page_dir_slot_get_rec(slot) != rec) {
fprintf(stderr,
"Dir slot does not point to right rec %lu\n",
(ulint)(rec - page));
goto func_exit;
}
own_count = 0;
if (!page_cur_is_after_last(&cur)) {
slot_no++;
slot = page_dir_get_nth_slot(page, slot_no);
}
}
if (page_cur_is_after_last(&cur)) {
break;
}
if (rec_get_next_offs(rec) < FIL_PAGE_DATA
|| rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) {
fprintf(stderr,
"Next record offset nonsensical %lu for rec %lu\n",
rec_get_next_offs(rec),
(ulint)(rec - page));
goto func_exit;
}
count++;
if (count > UNIV_PAGE_SIZE) {
fprintf(stderr,
"Page record list appears to be circular %lu\n",
count);
goto func_exit;
}
page_cur_move_to_next(&cur);
own_count++;
}
if (rec_get_n_owned(rec) == 0) {
fprintf(stderr, "n owned is zero in a supremum rec\n");
goto func_exit;
}
if (slot_no != n_slots - 1) {
fprintf(stderr, "n slots wrong %lu, %lu\n",
slot_no, n_slots - 1);
goto func_exit;
}
if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) {
fprintf(stderr, "n recs wrong %lu %lu\n",
page_header_get_field(page, PAGE_N_RECS) + 2, count + 1);
goto func_exit;
}
/* Check then the free list */
rec = page_header_get_ptr(page, PAGE_FREE);
while (rec != NULL) {
if (rec < page + FIL_PAGE_DATA
|| rec >= page + UNIV_PAGE_SIZE) {
fprintf(stderr,
"Free list record has a nonsensical offset %lu\n",
(ulint)(rec - page));
goto func_exit;
}
if (rec > rec_heap_top) {
fprintf(stderr,
"Free list record %lu is above rec heap top %lu\n",
(ulint)(rec - page), (ulint)(rec_heap_top - page));
goto func_exit;
}
count++;
if (count > UNIV_PAGE_SIZE) {
fprintf(stderr,
"Page free list appears to be circular %lu\n",
count);
goto func_exit;
}
rec = page_rec_get_next(rec);
}
if (page_header_get_field(page, PAGE_N_HEAP) != count + 1) {
fprintf(stderr, "N heap is wrong %lu, %lu\n",
page_header_get_field(page, PAGE_N_HEAP), count + 1);
goto func_exit;
}
ret = TRUE;
func_exit:
return(ret);
}
/*******************************************************************
This function checks the consistency of an index page. */
@ -1339,6 +1527,14 @@ page_validate(
ulint i;
char err_buf[1000];
if (!page_simple_validate(page)) {
buf_page_print(page);
fprintf(stderr, "Apparent corruption in a page in index %s\n",
index->name);
return(FALSE);
}
heap = mem_heap_create(UNIV_PAGE_SIZE);
/* The following buffer is used to check that the

View File

@ -4,8 +4,6 @@
* $Header: /home/daffy/u0/vern/flex/RCS/flex.skl,v 2.91 96/09/10 16:58:48 vern Exp $
*/
#include "univ.i"
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
@ -609,18 +607,13 @@ How to make the InnoDB parser and lexer C files:
6. Remove the #include of unistd.h from about line 2500 of lexyy.c
7. Move #include <math.h> in pars0grm.c after #include "univ.i" to remove
a large file compilation error on AIX.
8. Move #include "univ.i" in lexyy.c to the file start to remove a large
file compilation error on AIX.
These instructions seem to work at least with bison-1.28 and flex-2.5.4 on
Linux.
*******************************************************/
#line 36 "pars0lex.l"
#define YYSTYPE que_node_t*
#include "univ.i"
#include "pars0pars.h"
#include "pars0grm.h"
#include "pars0sym.h"

View File

@ -102,8 +102,6 @@ que_node_t */
#include "que0que.h"
#include "row0sel.h"
#include <math.h>
#define YYSTYPE que_node_t*
/* #define __STDC__ */

View File

@ -200,6 +200,28 @@ read_view_close(
UT_LIST_REMOVE(view_list, trx_sys->view_list, view);
}
/*************************************************************************
Closes a consistent read view for MySQL. This function is called at an SQL
statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
void
read_view_close_for_mysql(
/*======================*/
trx_t* trx) /* in: trx which has a read view */
{
ut_a(trx->read_view);
mutex_enter(&kernel_mutex);
read_view_close(trx->read_view);
mem_heap_empty(trx->read_view_heap);
trx->read_view = NULL;
mutex_exit(&kernel_mutex);
}
/*************************************************************************
Prints a read view to stderr. */

View File

@ -321,59 +321,6 @@ row_ins_clust_index_entry_by_modify(
return(err);
}
/*******************************************************************
Checks if a unique key violation to rec would occur at the index entry
insert. */
static
ibool
row_ins_dupl_error_with_rec(
/*========================*/
/* out: TRUE if error */
rec_t* rec, /* in: user record; NOTE that we assume
that the caller already has a record lock on
the record! */
dtuple_t* entry, /* in: entry to insert */
dict_index_t* index) /* in: index */
{
ulint matched_fields;
ulint matched_bytes;
ulint n_unique;
ulint i;
n_unique = dict_index_get_n_unique(index);
matched_fields = 0;
matched_bytes = 0;
cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes);
if (matched_fields < n_unique) {
return(FALSE);
}
/* In a unique secondary index we allow equal key values if they
contain SQL NULLs */
if (!(index->type & DICT_CLUSTERED)) {
for (i = 0; i < n_unique; i++) {
if (UNIV_SQL_NULL == dfield_get_len(
dtuple_get_nth_field(entry, i))) {
return(FALSE);
}
}
}
if (!rec_get_deleted_flag(rec)) {
return(TRUE);
}
return(FALSE);
}
/*************************************************************************
Either deletes or sets the referencing columns SQL NULL in a child row.
Used in ON DELETE ... clause for foreign keys when a parent row is
@ -533,8 +480,12 @@ row_ins_foreign_delete_or_set_null(
err = lock_table(0, table, LOCK_IX, thr);
if (err == DB_SUCCESS) {
/* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
we already have a normal shared lock on the appropriate
gap if the search criterion was not unique */
err = lock_clust_rec_read_check_and_lock(0, clust_rec,
clust_index, LOCK_X, thr);
clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr);
}
if (err != DB_SUCCESS) {
@ -630,12 +581,14 @@ nonstandard_exit_func:
/*************************************************************************
Sets a shared lock on a record. Used in locking possible duplicate key
records. */
records and also in checking foreign key constraints. */
static
ulint
row_ins_set_shared_rec_lock(
/*========================*/
/* out: DB_SUCCESS or error code */
ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or
LOCK_REC_NOT_GAP type lock */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index */
que_thr_t* thr) /* in: query thread */
@ -644,10 +597,10 @@ row_ins_set_shared_rec_lock(
if (index->type & DICT_CLUSTERED) {
err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_S,
thr);
type, thr);
} else {
err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_S,
thr);
type, thr);
}
return(err);
@ -656,7 +609,7 @@ row_ins_set_shared_rec_lock(
/*******************************************************************
Checks if foreign key constraint fails for an index entry. Sets shared locks
which lock either the success or the failure of the constraint. NOTE that
the caller must have a shared latch on dict_foreign_key_check_lock. */
the caller must have a shared latch on dict_operation_lock. */
ulint
row_ins_check_foreign_constraint(
@ -679,7 +632,7 @@ row_ins_check_foreign_constraint(
dict_table_t* check_table;
dict_index_t* check_index;
ulint n_fields_cmp;
ibool timeout_expired;
ibool unique_search;
rec_t* rec;
btr_pcur_t pcur;
ibool moved;
@ -689,7 +642,9 @@ row_ins_check_foreign_constraint(
mtr_t mtr;
run_again:
ut_ad(rw_lock_own(&dict_foreign_key_check_lock, RW_LOCK_SHARED));
ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
err = DB_SUCCESS;
if (thr_get_trx(thr)->check_foreigns == FALSE) {
/* The user has suppressed foreign key checks currently for
@ -748,6 +703,14 @@ run_again:
dtuple_set_n_fields_cmp(entry, foreign->n_fields);
if (dict_index_get_n_unique(check_index) <= foreign->n_fields) {
/* We can just set a LOCK_REC_NOT_GAP type lock */
unique_search = TRUE;
} else {
unique_search = FALSE;
}
btr_pcur_open(check_index, entry, PAGE_CUR_GE,
BTR_SEARCH_LEAF, &pcur, &mtr);
@ -761,26 +724,46 @@ run_again:
goto next_rec;
}
/* Try to place a lock on the index record */
err = row_ins_set_shared_rec_lock(rec, check_index, thr);
if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec,
check_index, thr);
if (err != DB_SUCCESS) {
break;
}
if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
goto next_rec;
}
cmp = cmp_dtuple_rec(entry, rec);
if (cmp == 0) {
if (!rec_get_deleted_flag(rec)) {
if (rec_get_deleted_flag(rec)) {
err = row_ins_set_shared_rec_lock(LOCK_ORDINARY,
rec, check_index, thr);
if (err != DB_SUCCESS) {
break;
}
} else {
/* Found a matching record */
if (unique_search) {
err = row_ins_set_shared_rec_lock(
LOCK_REC_NOT_GAP,
rec, check_index, thr);
} else {
err = row_ins_set_shared_rec_lock(
LOCK_ORDINARY,
rec, check_index, thr);
}
if (err != DB_SUCCESS) {
break;
}
/* printf(
"FOREIGN: Found matching record from %s %s\n",
check_index->table_name, check_index->name);
@ -807,6 +790,13 @@ run_again:
}
if (cmp < 0) {
err = row_ins_set_shared_rec_lock(LOCK_GAP,
rec, check_index, thr);
if (err != DB_SUCCESS) {
break;
}
if (check_ref) {
err = DB_NO_REFERENCED_ROW;
} else {
@ -844,14 +834,14 @@ do_possible_lock_wait:
que_thr_stop_for_mysql(thr);
timeout_expired = srv_suspend_mysql_thread(thr);
srv_suspend_mysql_thread(thr);
if (!timeout_expired) {
if (thr_get_trx(thr)->error_state == DB_SUCCESS) {
goto run_again;
}
err = DB_LOCK_WAIT_TIMEOUT;
err = thr_get_trx(thr)->error_state;
}
return(err);
@ -890,21 +880,21 @@ row_ins_check_foreign_constraints(
trx);
}
if (!trx->has_dict_foreign_key_check_lock) {
if (!trx->has_dict_operation_lock) {
got_s_lock = TRUE;
rw_lock_s_lock(&dict_foreign_key_check_lock);
rw_lock_s_lock(&dict_operation_lock);
trx->has_dict_foreign_key_check_lock = TRUE;
trx->has_dict_operation_lock = TRUE;
}
err = row_ins_check_foreign_constraint(TRUE, foreign,
table, index, entry, thr);
if (got_s_lock) {
rw_lock_s_unlock(&dict_foreign_key_check_lock);
rw_lock_s_unlock(&dict_operation_lock);
trx->has_dict_foreign_key_check_lock = FALSE;
trx->has_dict_operation_lock = FALSE;
}
if (err != DB_SUCCESS) {
@ -918,6 +908,59 @@ row_ins_check_foreign_constraints(
return(DB_SUCCESS);
}
/*******************************************************************
Checks if a unique key violation to rec would occur at the index entry
insert. */
static
ibool
row_ins_dupl_error_with_rec(
/*========================*/
/* out: TRUE if error */
rec_t* rec, /* in: user record; NOTE that we assume
that the caller already has a record lock on
the record! */
dtuple_t* entry, /* in: entry to insert */
dict_index_t* index) /* in: index */
{
ulint matched_fields;
ulint matched_bytes;
ulint n_unique;
ulint i;
n_unique = dict_index_get_n_unique(index);
matched_fields = 0;
matched_bytes = 0;
cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes);
if (matched_fields < n_unique) {
return(FALSE);
}
/* In a unique secondary index we allow equal key values if they
contain SQL NULLs */
if (!(index->type & DICT_CLUSTERED)) {
for (i = 0; i < n_unique; i++) {
if (UNIV_SQL_NULL == dfield_get_len(
dtuple_get_nth_field(entry, i))) {
return(FALSE);
}
}
}
if (!rec_get_deleted_flag(rec)) {
return(TRUE);
}
return(FALSE);
}
/*******************************************************************
Scans a unique non-clustered index at a given index entry to determine
whether a uniqueness violation has occurred for the key value of the entry.
@ -978,7 +1021,8 @@ row_ins_scan_sec_index_for_duplicate(
/* Try to place a lock on the index record */
err = row_ins_set_shared_rec_lock(rec, index, thr);
err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, index,
thr);
if (err != DB_SUCCESS) {
@ -1082,8 +1126,8 @@ row_ins_duplicate_error_in_clust(
sure that in roll-forward we get the same duplicate
errors as in original execution */
err = row_ins_set_shared_rec_lock(rec, cursor->index,
thr);
err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP,
rec, cursor->index, thr);
if (err != DB_SUCCESS) {
return(err);
@ -1105,8 +1149,8 @@ row_ins_duplicate_error_in_clust(
if (rec != page_get_supremum_rec(page)) {
err = row_ins_set_shared_rec_lock(rec, cursor->index,
thr);
err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP,
rec, cursor->index, thr);
if (err != DB_SUCCESS) {
return(err);

View File

@ -27,6 +27,7 @@ Created 9/17/2000 Heikki Tuuri
#include "lock0lock.h"
#include "rem0cmp.h"
#include "log0log.h"
#include "btr0sea.h"
/* A dummy variable used to fool the compiler */
ibool row_mysql_identically_false = FALSE;
@ -203,7 +204,6 @@ row_mysql_handle_errors(
que_thr_t* thr, /* in: query thread */
trx_savept_t* savept) /* in: savepoint or NULL */
{
ibool timeout_expired;
ulint err;
handle_new_error:
@ -240,11 +240,9 @@ handle_new_error:
/* MySQL will roll back the latest SQL statement */
} else if (err == DB_LOCK_WAIT) {
timeout_expired = srv_suspend_mysql_thread(thr);
if (timeout_expired) {
trx->error_state = DB_LOCK_WAIT_TIMEOUT;
srv_suspend_mysql_thread(thr);
if (trx->error_state != DB_SUCCESS) {
que_thr_stop_for_mysql(thr);
goto handle_new_error;
@ -1146,7 +1144,7 @@ row_mysql_lock_data_dictionary(void)
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks or lock waits can occur then in these operations */
rw_lock_x_lock(&(dict_foreign_key_check_lock));
rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
}
@ -1161,7 +1159,7 @@ row_mysql_unlock_data_dictionary(void)
no deadlocks can occur then in these operations */
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
rw_lock_x_unlock(&dict_operation_lock);
}
/*************************************************************************
@ -1184,6 +1182,7 @@ row_create_table_for_mysql(
ulint err;
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
ut_ad(mutex_own(&(dict_sys->mutex)));
if (srv_created_new_raw) {
@ -1384,6 +1383,7 @@ row_create_index_for_mysql(
ulint keywordlen;
ulint err;
ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
ut_ad(mutex_own(&(dict_sys->mutex)));
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
@ -1464,6 +1464,7 @@ row_table_add_foreign_constraints(
ulint err;
ut_ad(mutex_own(&(dict_sys->mutex)));
ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
ut_a(sql_string);
trx->op_info = (char *) "adding foreign keys";
@ -1846,12 +1847,16 @@ row_drop_table_for_mysql(
no deadlocks can occur then in these operations */
if (!has_dict_mutex) {
/* Prevent foreign key checks while we are dropping the table */
rw_lock_x_lock(&(dict_foreign_key_check_lock));
/* Prevent foreign key checks etc. while we are dropping the
table */
rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
}
ut_ad(mutex_own(&(dict_sys->mutex)));
ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
graph = pars_sql(buf);
ut_a(graph);
@ -1861,9 +1866,6 @@ row_drop_table_for_mysql(
graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
/* Prevent purge from running while we are dropping the table */
rw_lock_s_lock(&(purge_sys->purge_is_running));
table = dict_table_get_low(name);
if (!table) {
@ -1945,11 +1947,10 @@ row_drop_table_for_mysql(
}
}
funct_exit:
rw_lock_s_unlock(&(purge_sys->purge_is_running));
if (!has_dict_mutex) {
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
rw_lock_x_unlock(&dict_operation_lock);
}
que_graph_free(graph);
@ -1985,7 +1986,7 @@ row_drop_database_for_mysql(
trx_start_if_not_started(trx);
loop:
rw_lock_x_lock(&(dict_foreign_key_check_lock));
rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
while ((table_name = dict_get_first_table_name_in_db(name))) {
@ -2000,7 +2001,7 @@ loop:
if (table->n_mysql_handles_opened > 0) {
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
rw_lock_x_unlock(&dict_operation_lock);
ut_print_timestamp(stderr);
fprintf(stderr,
@ -2028,7 +2029,7 @@ loop:
}
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
rw_lock_x_unlock(&dict_operation_lock);
trx_commit_for_mysql(trx);
@ -2165,7 +2166,7 @@ row_rename_table_for_mysql(
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations */
rw_lock_x_lock(&(dict_foreign_key_check_lock));
rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
table = dict_table_get_low(old_name);
@ -2249,7 +2250,7 @@ row_rename_table_for_mysql(
}
funct_exit:
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
rw_lock_x_unlock(&dict_operation_lock);
que_graph_free(graph);
@ -2399,9 +2400,19 @@ row_check_table_for_mysql(
ulint n_rows;
ulint n_rows_in_table = ULINT_UNDEFINED;
ulint ret = DB_SUCCESS;
ulint old_isolation_level;
prebuilt->trx->op_info = (char *) "checking table";
old_isolation_level = prebuilt->trx->isolation_level;
/* We must run the index record counts at an isolation level
>= READ COMMITTED, because a dirty read can see a wrong number
of records in some index; to play safe, we use always
REPEATABLE READ here */
prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
index = dict_table_get_first_index(table);
while (index != NULL) {
@ -2433,6 +2444,9 @@ row_check_table_for_mysql(
index = dict_table_get_next_index(index);
}
/* Restore the original isolation level */
prebuilt->trx->isolation_level = old_isolation_level;
/* We validate also the whole adaptive hash index for all tables
at every CHECK TABLE */

View File

@ -453,7 +453,9 @@ static
ibool
row_purge_parse_undo_rec(
/*=====================*/
/* out: TRUE if purge operation required */
/* out: TRUE if purge operation required:
NOTE that then the CALLER must s-unlock
dict_operation_lock! */
purge_node_t* node, /* in: row undo node */
ibool* updated_extern,
/* out: TRUE if an externally stored field
@ -493,18 +495,20 @@ row_purge_parse_undo_rec(
return(FALSE);
}
/* Prevent DROP TABLE etc. from running when we are doing the purge
for this row */
rw_lock_s_lock(&dict_operation_lock);
mutex_enter(&(dict_sys->mutex));
node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr));
rw_lock_x_lock(&(purge_sys->purge_is_running));
mutex_exit(&(dict_sys->mutex));
if (node->table == NULL) {
/* The table has been dropped: no need to do purge */
rw_lock_x_unlock(&(purge_sys->purge_is_running));
rw_lock_s_unlock(&dict_operation_lock);
return(FALSE);
}
@ -514,7 +518,7 @@ row_purge_parse_undo_rec(
if (clust_index == NULL) {
/* The table was corrupt in the data dictionary */
rw_lock_x_unlock(&(purge_sys->purge_is_running));
rw_lock_s_unlock(&dict_operation_lock);
return(FALSE);
}
@ -573,6 +577,8 @@ row_purge(
} else {
purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
thr);
/* If purge_needed == TRUE, we must also remember to unlock
dict_operation_lock! */
}
if (purge_needed) {
@ -594,7 +600,7 @@ row_purge(
btr_pcur_close(&(node->pcur));
}
rw_lock_x_unlock(&(purge_sys->purge_is_running));
rw_lock_s_unlock(&dict_operation_lock);
}
/* Do some cleanup */

View File

@ -606,7 +606,7 @@ row_sel_get_clust_rec(
/* Try to place a lock on the index record */
err = lock_clust_rec_read_check_and_lock(0, clust_rec, index,
node->row_lock_mode, thr);
node->row_lock_mode, LOCK_ORDINARY, thr);
if (err != DB_SUCCESS) {
return(err);
@ -678,16 +678,17 @@ sel_set_rec_lock(
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index */
ulint mode, /* in: lock mode */
ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP */
que_thr_t* thr) /* in: query thread */
{
ulint err;
if (index->type & DICT_CLUSTERED) {
err = lock_clust_rec_read_check_and_lock(0, rec, index, mode,
thr);
type, thr);
} else {
err = lock_sec_rec_read_check_and_lock(0, rec, index, mode,
thr);
type, thr);
}
return(err);
@ -1154,7 +1155,7 @@ rec_loop:
if (!consistent_read) {
err = sel_set_rec_lock(page_rec_get_next(rec), index,
node->row_lock_mode, thr);
node->row_lock_mode, LOCK_ORDINARY, thr);
if (err != DB_SUCCESS) {
/* Note that in this case we will store in pcur
the PREDECESSOR of the record we are waiting
@ -1180,8 +1181,8 @@ rec_loop:
if (!consistent_read) {
/* Try to place a lock on the index record */
err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr);
err = sel_set_rec_lock(rec, index, node->row_lock_mode,
LOCK_ORDINARY, thr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
@ -2200,6 +2201,7 @@ row_sel_get_clust_rec_for_mysql(
rec_t* old_vers;
ulint err;
trx_t* trx;
char err_buf[1000];
*out_rec = NULL;
@ -2213,14 +2215,40 @@ row_sel_get_clust_rec_for_mysql(
clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
ut_ad(page_rec_is_user_rec(clust_rec));
if (!page_rec_is_user_rec(clust_rec)) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: error clustered record for sec rec not found\n"
"InnoDB: index %s table %s\n", sec_index->name,
sec_index->table->name);
rec_sprintf(err_buf, 900, rec);
fprintf(stderr, "InnoDB: sec index record %s\n", err_buf);
rec_sprintf(err_buf, 900, clust_rec);
fprintf(stderr, "InnoDB: clust index record %s\n", err_buf);
trx_print(err_buf, trx);
fprintf(stderr,
"%s\nInnoDB: Make a detailed bug report and send it\n",
err_buf);
fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n");
clust_rec = NULL;
goto func_exit;
}
if (prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a lock on the index record */
/* Try to place a lock on the index record; we are searching
the clust rec with a unique condition, hence
we set a LOCK_REC_NOT_GAP type lock */
err = lock_clust_rec_read_check_and_lock(0, clust_rec,
clust_index,
prebuilt->select_lock_type, thr);
prebuilt->select_lock_type,
LOCK_REC_NOT_GAP, thr);
if (err != DB_SUCCESS) {
return(err);
@ -2233,7 +2261,11 @@ row_sel_get_clust_rec_for_mysql(
old_vers = NULL;
if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index,
/* If the isolation level allows reading of uncommitted data,
then we never look for an earlier version */
if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
&& !lock_clust_rec_cons_read_sees(clust_rec, clust_index,
trx->read_view)) {
err = row_sel_build_prev_vers_for_mysql(
@ -2275,6 +2307,7 @@ row_sel_get_clust_rec_for_mysql(
}
}
func_exit:
*out_rec = clust_rec;
if (prebuilt->select_lock_type == LOCK_X) {
@ -2407,7 +2440,7 @@ row_sel_push_cache_row_for_mysql(
/*************************************************************************
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the hash index if possible (not always). We assume that the search
mode is PAGE_CUR_GE, it is a consistent read, trx has already a read view,
mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
btr search latch has been locked in S-mode. */
static
ulint
@ -2516,17 +2549,22 @@ row_search_for_mysql(
ibool was_lock_wait;
ulint ret;
ulint shortcut;
ibool unique_search = FALSE;
ibool unique_search_from_clust_index = FALSE;
ibool mtr_has_extra_clust_latch = FALSE;
ibool moves_up = FALSE;
ibool set_also_gap_locks = TRUE;
/* if the query is a plain
locking SELECT, and the isolation
level is <= TRX_ISO_READ_COMMITTED,
then this is set to FALSE */
ibool success;
ulint cnt = 0;
mtr_t mtr;
ut_ad(index && pcur && search_tuple);
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_ad(sync_thread_levels_empty_gen(FALSE));
if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
fprintf(stderr,
"InnoDB: Error: trying to free a corrupt\n"
@ -2543,6 +2581,9 @@ row_search_for_mysql(
printf("N tables locked %lu\n", trx->mysql_n_tables_locked);
*/
/*-------------------------------------------------------------*/
/* PHASE 1: Try to pop the row from the prefetch cache */
if (direction == 0) {
trx->op_info = (char *) "starting index read";
@ -2608,18 +2649,35 @@ row_search_for_mysql(
mtr_start(&mtr);
/* Since we must release the search system latch when we retrieve an
externally stored field, we cannot use the adaptive hash index in a
search in the case the row may be long and there may be externally
stored fields */
/* In a search where at most one record in the index may match, we
can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete
marked matching record.
Note that in a unique secondary index there may be different delete
marked versions of a record where only the primary key values differ:
thus in a secondary index we must use next-key locks when locking
delete marked records. */
if (match_mode == ROW_SEL_EXACT
&& index->type & DICT_UNIQUE
&& index->type & DICT_CLUSTERED
&& !prebuilt->templ_contains_blob
&& (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)
&& dtuple_get_n_fields(search_tuple)
== dict_index_get_n_unique(index)) {
unique_search = TRUE;
}
/*-------------------------------------------------------------*/
/* PHASE 2: Try fast adaptive hash index search if possible */
/* Next test if this is the special case where we can use the fast
adaptive hash index to try the search. Since we must release the
search system latch when we retrieve an externally stored field, we
cannot use the adaptive hash index in a search in the case the row
may be long and there may be externally stored fields */
if (unique_search
&& index->type & DICT_CLUSTERED
&& !prebuilt->templ_contains_blob
&& (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
if (direction == ROW_SEL_NEXT) {
/* MySQL sometimes seems to do fetch next even
@ -2642,8 +2700,9 @@ row_search_for_mysql(
unique_search_from_clust_index = TRUE;
if (trx->mysql_n_tables_locked == 0
&& !prebuilt->sql_stat_start) {
if (prebuilt->select_lock_type == LOCK_NONE
&& trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
&& trx->read_view) {
/* This is a SELECT query done as a consistent read,
and the read view has already been allocated:
@ -2722,7 +2781,11 @@ row_search_for_mysql(
mtr_start(&mtr);
}
}
no_shortcut:
/*-------------------------------------------------------------*/
/* PHASE 3: Open or restore index cursor position */
if (trx->has_search_latch) {
rw_lock_s_unlock(&btr_search_latch);
trx->has_search_latch = FALSE;
@ -2730,6 +2793,23 @@ no_shortcut:
trx_start_if_not_started(trx);
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
&& prebuilt->select_lock_type != LOCK_NONE
&& trx->mysql_query_str) {
/* Scan the MySQL query string; check if SELECT is the first
word there */
dict_accept(*trx->mysql_query_str, "SELECT", &success);
if (success) {
/* It is a plain locking SELECT and the isolation
level is low: do not lock gaps */
set_also_gap_locks = FALSE;
}
}
/* Note that if the search mode was GE or G, then the cursor
naturally moves upward (in fetch next) in alphabetical order,
otherwise downward */
@ -2793,8 +2873,10 @@ no_shortcut:
prebuilt->sql_stat_start = FALSE;
}
/*-------------------------------------------------------------*/
rec_loop:
/*-------------------------------------------------------------*/
/* PHASE 4: Look for matching records in a loop */
cons_read_requires_clust_rec = FALSE;
rec = btr_pcur_get_rec(pcur);
@ -2813,21 +2895,23 @@ rec_loop:
goto next_rec;
}
if (prebuilt->select_lock_type != LOCK_NONE) {
if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
if (prebuilt->select_lock_type != LOCK_NONE
&& set_also_gap_locks) {
/* Try to place a lock on the index record */
err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type,
thr);
err = sel_set_rec_lock(rec, index,
prebuilt->select_lock_type,
LOCK_ORDINARY, thr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
}
if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
/* A page supremum record cannot be in the result set: skip
it now when we have placed a possible lock on it */
it now that we have placed a possible lock on it */
goto next_rec;
}
@ -2850,6 +2934,19 @@ rec_loop:
if (0 != cmp_dtuple_rec(search_tuple, rec)) {
if (prebuilt->select_lock_type != LOCK_NONE
&& set_also_gap_locks) {
/* Try to place a lock on the index record */
err = sel_set_rec_lock(rec, index,
prebuilt->select_lock_type,
LOCK_GAP, thr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
}
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
@ -2862,6 +2959,19 @@ rec_loop:
if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) {
if (prebuilt->select_lock_type != LOCK_NONE
&& set_also_gap_locks) {
/* Try to place a lock on the index record */
err = sel_set_rec_lock(rec, index,
prebuilt->select_lock_type,
LOCK_GAP, thr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
}
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
@ -2874,16 +2984,39 @@ rec_loop:
/* We are ready to look at a possible new index entry in the result
set: the cursor is now placed on a user record */
/* Get the right version of the row in a consistent read */
if (prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a lock on the index record; note that delete
marked records are a special case in a unique search. If there
is a non-delete marked record, then it is enough to lock its
existence with LOCK_REC_NOT_GAP. */
if (prebuilt->select_lock_type == LOCK_NONE) {
if (!set_also_gap_locks
|| (unique_search && !rec_get_deleted_flag(rec))) {
err = sel_set_rec_lock(rec, index,
prebuilt->select_lock_type,
LOCK_REC_NOT_GAP, thr);
} else {
err = sel_set_rec_lock(rec, index,
prebuilt->select_lock_type,
LOCK_ORDINARY, thr);
}
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
} else {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
cons_read_requires_clust_rec = FALSE;
if (index == clust_index) {
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
/* Do nothing: we let a non-locking SELECT read the
latest version of the record */
} else if (index == clust_index) {
if (!lock_clust_rec_cons_read_sees(rec, index,
trx->read_view)) {
@ -3020,8 +3153,11 @@ got_row:
ret = DB_SUCCESS;
goto normal_return;
/*-------------------------------------------------------------*/
next_rec:
/*-------------------------------------------------------------*/
/* PHASE 5: Move the cursor to the next index record */
if (mtr_has_extra_clust_latch) {
/* We must commit mtr if we are moving to the next
non-clustered index record, because we could break the
@ -3064,8 +3200,10 @@ next_rec:
cnt++;
goto rec_loop;
/*-------------------------------------------------------------*/
lock_wait_or_error:
/*-------------------------------------------------------------*/
btr_pcur_store_position(pcur, &mtr);
mtr_commit(&mtr);
@ -3096,6 +3234,7 @@ lock_wait_or_error:
return(err);
normal_return:
/*-------------------------------------------------------------*/
que_thr_stop_for_mysql_no_error(thr, trx);
mtr_commit(&mtr);
@ -3156,10 +3295,12 @@ row_search_check_if_query_cache_permitted(
ret = TRUE;
/* Assign a read view for the transaction if it does not yet
have one */
/* If the isolation level is high, assign a read view for the
transaction if it does not yet have one */
if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
&& !trx->read_view) {
if (!trx->read_view) {
trx->read_view = read_view_open_now(trx,
trx->read_view_heap);
}

View File

@ -254,6 +254,7 @@ row_undo_ins_parse_undo_rec(
node->table = dict_table_get_on_id(table_id, node->trx);
if (node->table == NULL) {
return;
}
@ -292,6 +293,7 @@ row_undo_ins(
if (!found) {
trx_undo_rec_release(node->trx, node->undo_no);
return(DB_SUCCESS);
}

View File

@ -211,7 +211,6 @@ row_undo(
if (node->state == UNDO_NODE_FETCH_NEXT) {
/* The call below also starts &mtr */
node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
trx->roll_limit,
&roll_ptr,
@ -254,6 +253,10 @@ row_undo(
}
}
/* Prevent DROP TABLE etc. while we are rolling back this row */
rw_lock_s_lock(&dict_operation_lock);
if (node->state == UNDO_NODE_INSERT) {
err = row_undo_ins(node, thr);
@ -264,6 +267,8 @@ row_undo(
err = row_undo_mod(node, thr);
}
rw_lock_s_unlock(&dict_operation_lock);
/* Do some cleanup */
btr_pcur_close(&(node->pcur));

View File

@ -79,7 +79,7 @@ ibool
row_upd_index_is_referenced(
/*========================*/
/* out: TRUE if referenced; NOTE that since
we do not hold dict_foreign_key_check_lock
we do not hold dict_operation_lock
when leaving the function, it may be that
the referencing table has been dropped when
we leave this function: this function is only
@ -95,8 +95,8 @@ row_upd_index_is_referenced(
return(FALSE);
}
if (!trx->has_dict_foreign_key_check_lock) {
rw_lock_s_lock(&dict_foreign_key_check_lock);
if (!trx->has_dict_operation_lock) {
rw_lock_s_lock(&dict_operation_lock);
}
foreign = UT_LIST_GET_FIRST(table->referenced_list);
@ -104,8 +104,8 @@ row_upd_index_is_referenced(
while (foreign) {
if (foreign->referenced_index == index) {
if (!trx->has_dict_foreign_key_check_lock) {
rw_lock_s_unlock(&dict_foreign_key_check_lock);
if (!trx->has_dict_operation_lock) {
rw_lock_s_unlock(&dict_operation_lock);
}
return(TRUE);
@ -114,8 +114,8 @@ row_upd_index_is_referenced(
foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
}
if (!trx->has_dict_foreign_key_check_lock) {
rw_lock_s_unlock(&dict_foreign_key_check_lock);
if (!trx->has_dict_operation_lock) {
rw_lock_s_unlock(&dict_operation_lock);
}
return(FALSE);
@ -162,12 +162,12 @@ row_upd_check_references_constraints(
mtr_start(mtr);
if (!trx->has_dict_foreign_key_check_lock) {
if (!trx->has_dict_operation_lock) {
got_s_lock = TRUE;
rw_lock_s_lock(&dict_foreign_key_check_lock);
rw_lock_s_lock(&dict_operation_lock);
trx->has_dict_foreign_key_check_lock = TRUE;
trx->has_dict_operation_lock = TRUE;
}
foreign = UT_LIST_GET_FIRST(table->referenced_list);
@ -189,7 +189,7 @@ row_upd_check_references_constraints(
}
/* NOTE that if the thread ends up waiting for a lock
we will release dict_foreign_key_check_lock
we will release dict_operation_lock
temporarily! But the counter on the table
protects 'foreign' from being dropped while the check
is running. */
@ -212,8 +212,8 @@ row_upd_check_references_constraints(
if (err != DB_SUCCESS) {
if (got_s_lock) {
rw_lock_s_unlock(
&dict_foreign_key_check_lock);
trx->has_dict_foreign_key_check_lock
&dict_operation_lock);
trx->has_dict_operation_lock
= FALSE;
}
@ -227,8 +227,8 @@ row_upd_check_references_constraints(
}
if (got_s_lock) {
rw_lock_s_unlock(&dict_foreign_key_check_lock);
trx->has_dict_foreign_key_check_lock = FALSE;
rw_lock_s_unlock(&dict_operation_lock);
trx->has_dict_operation_lock = FALSE;
}
mem_heap_free(heap);

View File

@ -136,8 +136,6 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1
, 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF
};
ibool srv_use_native_aio = FALSE;
ulint srv_pool_size = ULINT_MAX; /* size in database pages;
MySQL originally sets this
value in megabytes */
@ -151,8 +149,9 @@ dulint srv_archive_recovery_limit_lsn;
ulint srv_lock_wait_timeout = 1024 * 1024 * 1024;
char* srv_unix_file_flush_method_str = NULL;
ulint srv_unix_file_flush_method = 0;
char* srv_file_flush_method_str = NULL;
ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
/* If the following is != 0 we do not allow inserts etc. This protects
the user from forgetting the innodb_force_recovery keyword to my.cnf */
@ -281,6 +280,9 @@ time_t srv_last_monitor_time;
mutex_t srv_innodb_monitor_mutex;
ulint srv_main_thread_process_no = 0;
ulint srv_main_thread_id = 0;
/*
IMPLEMENTATION OF THE SERVER MAIN PROGRAM
=========================================
@ -2046,13 +2048,15 @@ srv_table_reserve_slot_for_mysql(void)
}
/*******************************************************************
Puts a MySQL OS thread to wait for a lock to be released. */
Puts a MySQL OS thread to wait for a lock to be released. If an error
occurs during the wait trx->error_state associated with thr is
!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
are possible errors. DB_DEADLOCK is returned if selective deadlock
resolution chose this transaction as a victim. */
ibool
void
srv_suspend_mysql_thread(
/*=====================*/
/* out: TRUE if the lock wait timeout was
exceeded */
que_thr_t* thr) /* in: query thread associated with the MySQL
OS thread */
{
@ -2069,13 +2073,15 @@ srv_suspend_mysql_thread(
mutex_enter(&kernel_mutex);
trx->error_state = DB_SUCCESS;
if (thr->state == QUE_THR_RUNNING) {
/* The lock has already been released: no need to suspend */
mutex_exit(&kernel_mutex);
return(FALSE);
return;
}
slot = srv_table_reserve_slot_for_mysql();
@ -2101,18 +2107,18 @@ srv_suspend_mysql_thread(
srv_conc_force_exit_innodb(thr_get_trx(thr));
/* Release possible foreign key check latch */
if (trx->has_dict_foreign_key_check_lock) {
if (trx->has_dict_operation_lock) {
rw_lock_s_unlock(&dict_foreign_key_check_lock);
rw_lock_s_unlock(&dict_operation_lock);
}
/* Wait for the release */
os_event_wait(event);
if (trx->has_dict_foreign_key_check_lock) {
if (trx->has_dict_operation_lock) {
rw_lock_s_lock(&dict_foreign_key_check_lock);
rw_lock_s_lock(&dict_operation_lock);
}
/* Return back inside InnoDB */
@ -2131,10 +2137,9 @@ srv_suspend_mysql_thread(
if (srv_lock_wait_timeout < 100000000 &&
wait_time > (double)srv_lock_wait_timeout) {
return(TRUE);
}
return(FALSE);
trx->error_state = DB_LOCK_WAIT_TIMEOUT;
}
}
/************************************************************************
@ -2300,9 +2305,19 @@ srv_sprintf_innodb_monitor(
"ROW OPERATIONS\n"
"--------------\n");
buf += sprintf(buf,
"%ld queries inside InnoDB, %ld queries in queue; main thread: %s\n",
srv_conc_n_threads, srv_conc_n_waiting_threads,
"%ld queries inside InnoDB, %ld queries in queue\n",
srv_conc_n_threads, srv_conc_n_waiting_threads);
#ifdef UNIV_LINUX
buf += sprintf(buf,
"Main thread process no %lu, state: %s\n",
srv_main_thread_process_no,
srv_main_thread_op_info);
#else
buf += sprintf(buf,
"Main thread id %lu, state: %s\n",
srv_main_thread_id,
srv_main_thread_op_info);
#endif
buf += sprintf(buf,
"Number of rows inserted %lu, updated %lu, deleted %lu, read %lu\n",
srv_n_rows_inserted,
@ -2636,6 +2651,9 @@ srv_master_thread(
UT_NOT_USED(arg);
srv_main_thread_process_no = os_proc_get_number();
srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
srv_table_reserve_slot(SRV_MASTER);
mutex_enter(&kernel_mutex);

View File

@ -515,7 +515,7 @@ srv_calc_high32(
}
/*************************************************************************
Creates or opens the log files. */
Creates or opens the log files and closes them. */
static
ulint
open_or_create_log_file(
@ -640,7 +640,7 @@ open_or_create_log_file(
}
/*************************************************************************
Creates or opens database data files. */
Creates or opens database data files and closes them. */
static
ulint
open_or_create_data_files(
@ -965,31 +965,63 @@ innobase_start_or_create_for_mysql(void)
srv_is_being_started = TRUE;
srv_startup_is_before_trx_rollback_phase = TRUE;
os_aio_use_native_aio = FALSE;
#ifdef __WIN__
if (os_get_os_version() == OS_WIN95
|| os_get_os_version() == OS_WIN31
|| os_get_os_version() == OS_WINNT) {
/* On Win 95, 98, ME, Win32 subsystem for Windows 3.1,
and NT use simulated aio. In NT Windows provides async i/o,
but when run in conjunction with InnoDB Hot Backup, it seemed
to corrupt the data files. */
os_aio_use_native_aio = FALSE;
} else {
/* On Win 2000 and XP use async i/o */
os_aio_use_native_aio = TRUE;
}
#endif
if (srv_file_flush_method_str == NULL) {
/* These are the default options */
if (0 == ut_strcmp(srv_unix_file_flush_method_str, "fdatasync")) {
srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
} else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "O_DSYNC")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
#ifndef __WIN__
} else if (0 == ut_strcmp(srv_file_flush_method_str, "fdatasync")) {
srv_unix_file_flush_method = SRV_UNIX_FDATASYNC;
} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) {
srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
} else if (0 == ut_strcmp(srv_unix_file_flush_method_str,
} else if (0 == ut_strcmp(srv_file_flush_method_str,
"littlesync")) {
srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
} else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "nosync")) {
} else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
#else
} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
os_aio_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
os_aio_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str,
"async_unbuffered")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
#endif
} else {
fprintf(stderr,
"InnoDB: Unrecognized value %s for innodb_flush_method\n",
srv_unix_file_flush_method_str);
srv_file_flush_method_str);
return(DB_ERROR);
}
/*
printf("srv_unix set to %lu\n", srv_unix_file_flush_method);
*/
os_aio_use_native_aio = srv_use_native_aio;
err = srv_boot();
if (err != DB_SUCCESS) {
@ -999,34 +1031,15 @@ innobase_start_or_create_for_mysql(void)
/* Restrict the maximum number of file i/o threads */
if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
}
#if !(defined(WIN_ASYNC_IO) || defined(POSIX_ASYNC_IO))
if (!os_aio_use_native_aio) {
/* In simulated aio we currently have use only for 4 threads */
os_aio_use_native_aio = FALSE;
srv_n_file_io_threads = 4;
#endif
#ifdef __WIN__
if (os_get_os_version() == OS_WIN95
|| os_get_os_version() == OS_WIN31) {
/* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use
simulated aio */
os_aio_use_native_aio = FALSE;
srv_n_file_io_threads = 4;
} else {
/* On NT and Win 2000 always use aio */
os_aio_use_native_aio = TRUE;
}
#endif
os_aio_use_native_aio = FALSE;
if (!os_aio_use_native_aio) {
os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
srv_n_file_io_threads,
@ -1047,15 +1060,6 @@ innobase_start_or_create_for_mysql(void)
lock_sys_create(srv_lock_table_size);
#ifdef POSIX_ASYNC_IO
if (os_aio_use_native_aio) {
/* There is only one thread per async io array:
one for ibuf i/o, one for log i/o, one for ordinary reads,
one for ordinary writes; we need only 4 i/o threads */
srv_n_file_io_threads = 4;
}
#endif
/* Create i/o-handler threads: */
for (i = 0; i < srv_n_file_io_threads; i++) {

View File

@ -663,7 +663,8 @@ rw_lock_own(
/*========*/
/* out: TRUE if locked */
rw_lock_t* lock, /* in: rw-lock */
ulint lock_type) /* in: lock type */
ulint lock_type) /* in: lock type: RW_LOCK_SHARED,
RW_LOCK_EX */
{
rw_lock_debug_t* info;

View File

@ -901,8 +901,7 @@ sync_thread_levels_empty_gen(
if (slot->latch != NULL && (!dict_mutex_allowed ||
(slot->level != SYNC_DICT
&& slot->level != SYNC_FOREIGN_KEY_CHECK
&& slot->level != SYNC_PURGE_IS_RUNNING))) {
&& slot->level != SYNC_DICT_OPERATION))) {
lock = slot->latch;
mutex = slot->latch;
@ -1087,12 +1086,10 @@ sync_thread_add_level(
SYNC_IBUF_PESS_INSERT_MUTEX));
} else if (level == SYNC_DICT_AUTOINC_MUTEX) {
ut_a(sync_thread_levels_g(array, SYNC_DICT_AUTOINC_MUTEX));
} else if (level == SYNC_FOREIGN_KEY_CHECK) {
ut_a(sync_thread_levels_g(array, SYNC_FOREIGN_KEY_CHECK));
} else if (level == SYNC_DICT_OPERATION) {
ut_a(sync_thread_levels_g(array, SYNC_DICT_OPERATION));
} else if (level == SYNC_DICT_HEADER) {
ut_a(sync_thread_levels_g(array, SYNC_DICT_HEADER));
} else if (level == SYNC_PURGE_IS_RUNNING) {
ut_a(sync_thread_levels_g(array, SYNC_PURGE_IS_RUNNING));
} else if (level == SYNC_DICT) {
ut_a(buf_debug_prints
|| sync_thread_levels_g(array, SYNC_DICT));

View File

@ -209,9 +209,6 @@ trx_purge_sys_create(void)
purge_sys->purge_undo_no = ut_dulint_zero;
purge_sys->next_stored = FALSE;
rw_lock_create(&(purge_sys->purge_is_running));
rw_lock_set_level(&(purge_sys->purge_is_running),
SYNC_PURGE_IS_RUNNING);
rw_lock_create(&(purge_sys->latch));
rw_lock_set_level(&(purge_sys->latch), SYNC_PURGE_LATCH);

View File

@ -23,7 +23,7 @@ Created 3/26/1996 Heikki Tuuri
#include "srv0srv.h"
#include "thr0loc.h"
#include "btr0sea.h"
#include "os0proc.h"
/* Copy of the prototype for innobase_mysql_print_thd: this
copy MUST be equal to the one in mysql/sql/ha_innobase.cc ! */
@ -85,12 +85,14 @@ trx_create(
trx->conc_state = TRX_NOT_STARTED;
trx->start_time = time(NULL);
trx->isolation_level = TRX_ISO_REPEATABLE_READ;
trx->check_foreigns = TRUE;
trx->check_unique_secondary = TRUE;
trx->dict_operation = FALSE;
trx->mysql_thd = NULL;
trx->mysql_query_str = NULL;
trx->n_mysql_tables_in_use = 0;
trx->mysql_n_tables_locked = 0;
@ -132,7 +134,7 @@ trx_create(
trx->lock_heap = mem_heap_create_in_buffer(256);
UT_LIST_INIT(trx->trx_locks);
trx->has_dict_foreign_key_check_lock = FALSE;
trx->has_dict_operation_lock = FALSE;
trx->has_search_latch = FALSE;
trx->search_latch_timeout = BTR_SEA_TIMEOUT;
@ -176,6 +178,8 @@ trx_allocate_for_mysql(void)
trx->mysql_thread_id = os_thread_get_curr_id();
trx->mysql_process_no = os_proc_get_number();
return(trx);
}
@ -1497,9 +1501,12 @@ trx_print(
default: buf += sprintf(buf, " state %lu", trx->conc_state);
}
#ifdef UNIV_LINUX
buf += sprintf(buf, ", process no %lu", trx->mysql_process_no);
#else
buf += sprintf(buf, ", OS thread id %lu",
os_thread_pf(trx->mysql_thread_id));
#endif
if (ut_strlen(trx->op_info) > 0) {
buf += sprintf(buf, " %s", trx->op_info);
}

View File

@ -97,6 +97,8 @@ are determined in innobase_init below: */
char* innobase_data_home_dir = NULL;
char* innobase_log_group_home_dir = NULL;
char* innobase_log_arch_dir = NULL;
/* The following has a midleading name: starting from 4.0.5 this also
affects Windows */
char* innobase_unix_file_flush_method = NULL;
/* Below we have boolean-valued start-up parameters, and their default
@ -346,6 +348,7 @@ check_trx_exists(
trx = trx_allocate_for_mysql();
trx->mysql_thd = thd;
trx->mysql_query_str = &((*thd).query);
thd->transaction.all.innobase_tid = trx;
@ -713,9 +716,10 @@ innobase_init(void)
DBUG_RETURN(TRUE);
}
srv_unix_file_flush_method_str = (innobase_unix_file_flush_method ?
srv_file_flush_method_str = (innobase_unix_file_flush_method ?
innobase_unix_file_flush_method :
(char*)"fdatasync");
NULL);
srv_n_log_groups = (ulint) innobase_mirrored_log_groups;
srv_n_log_files = (ulint) innobase_log_files_in_group;
@ -725,8 +729,6 @@ innobase_init(void)
srv_log_buffer_size = (ulint) innobase_log_buffer_size;
srv_flush_log_at_trx_commit = (ulint) innobase_flush_log_at_trx_commit;
srv_use_native_aio = 0;
srv_pool_size = (ulint) innobase_buffer_pool_size;
srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
@ -2179,8 +2181,16 @@ convert_search_mode_to_innobase(
case HA_READ_AFTER_KEY: return(PAGE_CUR_G);
case HA_READ_BEFORE_KEY: return(PAGE_CUR_L);
case HA_READ_PREFIX: return(PAGE_CUR_GE);
case HA_READ_PREFIX_LAST: return(PAGE_CUR_LE);
/* HA_READ_PREFIX_LAST does not yet work in InnoDB! */
case HA_READ_PREFIX_LAST:
/* ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Warning: Using HA_READ_PREFIX_LAST\n"); */
return(PAGE_CUR_LE);
/* InnoDB does not yet support ..PREFIX_LAST!
We have to add a new search flag
PAGE_CUR_LE_OR_PREFIX to InnoDB. */
/* the above PREFIX flags mean that the last
field in the key value may just be a prefix
of the complete fixed length field */
@ -3639,7 +3649,6 @@ ha_innobase::reset(void)
return(0);
}
/**********************************************************************
When we create a temporary table inside MySQL LOCK TABLES, MySQL will
not call external_lock for the temporary table when it uses it. Instead,
@ -3661,6 +3670,14 @@ ha_innobase::start_stmt(
innobase_release_stat_resources(trx);
trx_mark_sql_stat_end(trx);
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
&& trx->read_view) {
/* At low transaction isolation levels we let
each consistent read set its own snapshot */
read_view_close_for_mysql(trx);
}
auto_inc_counter_for_this_stat = 0;
prebuilt->sql_stat_start = TRUE;
prebuilt->hint_no_need_to_fetch_extra_cols = TRUE;
@ -3680,6 +3697,24 @@ ha_innobase::start_stmt(
return(0);
}
/**********************************************************************
Maps a MySQL trx isolation level code to the InnoDB isolation level code */
inline
ulint
innobase_map_isolation_level(
/*=========================*/
/* out: InnoDB isolation level */
enum_tx_isolation iso) /* in: MySQL isolation level code */
{
switch(iso) {
case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED);
case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ);
case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE);
case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED);
default: ut_a(0); return(0);
}
}
/**********************************************************************
As MySQL will execute an external lock for every new table it uses when it
starts to process an SQL statement (an exception is when MySQL calls
@ -3726,7 +3761,13 @@ ha_innobase::external_lock(
thd->transaction.all.innodb_active_trans = 1;
trx->n_mysql_tables_in_use++;
if (thd->variables.tx_isolation == ISO_SERIALIZABLE
if (thd->variables.tx_isolation != ISO_REPEATABLE_READ) {
trx->isolation_level = innobase_map_isolation_level(
(enum_tx_isolation)
thd->variables.tx_isolation);
}
if (trx->isolation_level == TRX_ISO_SERIALIZABLE
&& prebuilt->select_lock_type == LOCK_NONE) {
/* To get serializable execution we let InnoDB
@ -3753,6 +3794,15 @@ ha_innobase::external_lock(
innobase_release_stat_resources(trx);
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
&& trx->read_view) {
/* At low transaction isolation levels we let
each consistent read set its own snapshot */
read_view_close_for_mysql(trx);
}
if (!(thd->options
& (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
@ -3779,7 +3829,6 @@ innodb_show_status(
DBUG_ENTER("innodb_show_status");
if (innodb_skip) {
fprintf(stderr,
"Cannot call SHOW INNODB STATUS because skip-innodb is defined\n");

View File

@ -96,7 +96,7 @@ class ha_innobase: public handler
ulong index_flags(uint idx) const
{
return (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
HA_KEY_READ_ONLY | HA_NOT_READ_PREFIX_LAST);
HA_KEY_READ_ONLY);
}
uint max_record_length() const { return HA_MAX_REC_LENGTH; }
uint max_keys() const { return MAX_KEY; }

View File

@ -3879,7 +3879,7 @@ static void set_options(void)
/* Set default values for some variables */
global_system_variables.table_type=DB_TYPE_MYISAM;
global_system_variables.tx_isolation=ISO_READ_COMMITTED;
global_system_variables.tx_isolation=ISO_REPEATABLE_READ;
global_system_variables.select_limit= (ulong) HA_POS_ERROR;
max_system_variables.select_limit= (ulong) HA_POS_ERROR;
global_system_variables.max_join_size= (ulong) HA_POS_ERROR;
@ -4351,7 +4351,7 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
}
global_system_variables.tx_isolation= ((opt_sql_mode & MODE_SERIALIZABLE) ?
ISO_SERIALIZABLE :
ISO_READ_COMMITTED);
ISO_REPEATABLE_READ);
break;
}
case OPT_MASTER_PASSWORD: