diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index a96830c24ff..b9253562fe6 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -570,6 +570,19 @@ btr_page_get_father_for_rec( node_ptr = btr_cur_get_rec(&cursor); + if (btr_node_ptr_get_child_page_no(node_ptr) != + buf_frame_get_page_no(page)) { + fprintf(stderr, +"InnoDB: Corruption of an index tree: table %s, index %s,\n" +"InnoDB: father ptr page no %lu, child page no %lu\n", + (UT_LIST_GET_FIRST(tree->tree_indexes))->table_name, + (UT_LIST_GET_FIRST(tree->tree_indexes))->name, + btr_node_ptr_get_child_page_no(node_ptr), + buf_frame_get_page_no(page)); + page_rec_print(page_rec_get_next(page_get_infimum_rec(page))); + page_rec_print(node_ptr); + } + ut_a(btr_node_ptr_get_child_page_no(node_ptr) == buf_frame_get_page_no(page)); mem_heap_free(heap); diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index a078c843159..ddcf5e8faf2 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -204,7 +204,7 @@ btr_cur_search_to_nth_level( the caller uses his search latch to protect the record! */ btr_cur_t* cursor, /* in/out: tree cursor; the cursor page is - s- or x-latched, but see also above! */ + s- or x-latched, but see also above! 
*/ ulint has_search_latch,/* in: info on the latch mode the caller currently has on btr_search_latch: RW_S_LATCH, or 0 */ diff --git a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c index ca8589813ca..aac86f45ec9 100644 --- a/innobase/btr/btr0sea.c +++ b/innobase/btr/btr0sea.c @@ -743,7 +743,7 @@ btr_search_guess_on_hash( #ifdef notdefined /* These lines of code can be used in a debug version to check - correctness of the searched cursor position: */ + the correctness of the searched cursor position: */ info->last_hash_succ = FALSE; diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index 055d76e6d81..f1a2d915d46 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -220,6 +220,10 @@ buf_calc_page_checksum( { ulint checksum; + /* Since the fields FIL_PAGE_FILE_FLUSH_LSN and ..._ARCH_LOG_NO + are written outside the buffer pool to the first pages of data + files, we have to skip them in page checksum calculation */ + checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA @@ -279,8 +283,9 @@ buf_page_print( ut_sprintf_buf(buf, read_buf, UNIV_PAGE_SIZE); + ut_print_timestamp(stderr); fprintf(stderr, - "InnoDB: Page dump in ascii and hex (%lu bytes):\n%s", + " InnoDB: Page dump in ascii and hex (%lu bytes):\n%s", UNIV_PAGE_SIZE, buf); fprintf(stderr, "InnoDB: End of page dump\n"); @@ -288,7 +293,8 @@ buf_page_print( checksum = buf_calc_page_checksum(read_buf); - fprintf(stderr, "InnoDB: Page checksum %lu stored checksum %lu\n", + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Page checksum %lu stored checksum %lu\n", checksum, mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN)); @@ -1358,47 +1364,87 @@ buf_page_io_complete( /*=================*/ buf_block_t* block) /* in: pointer to the block in question */ { - dulint id; dict_index_t* index; + dulint id; ulint io_type; - + ulint read_page_no; + ut_ad(block); io_type = block->io_fix; if (io_type == 
BUF_IO_READ) { + /* If this page is not uninitialized and not in the + doublewrite buffer, then the page number should be the + same as in block */ + + read_page_no = mach_read_from_4((block->frame) + + FIL_PAGE_OFFSET); + if (read_page_no != 0 + && !trx_doublewrite_page_inside(read_page_no) + && read_page_no != block->offset) { + + fprintf(stderr, +"InnoDB: Error: page n:o stored in the page read in is %lu, should be %lu!\n", + read_page_no, block->offset); + } +#ifdef notdefined + if (block->offset != 0 && read_page_no == 0) { + /* Check that the page is really uninited */ + + for (i = 0; i < UNIV_PAGE_SIZE; i++) { + + if (*((block->frame) + i) != '\0') { + fprintf(stderr, +"InnoDB: Error: page n:o in the page read in is 0, but page %lu is inited!\n", + block->offset); + break; + } + } + } +#endif /* From version 3.23.38 up we store the page checksum - to the 4 upper bytes of the page end lsn field */ + to the 4 first bytes of the page end lsn field */ if (buf_page_is_corrupted(block->frame)) { fprintf(stderr, - "InnoDB: Database page corruption or a failed\n" - "InnoDB: file read of page %lu.\n", block->offset); + "InnoDB: Database page corruption on disk or a failed\n" + "InnoDB: file read of page %lu.\n", block->offset); fprintf(stderr, - "InnoDB: You may have to recover from a backup.\n"); + "InnoDB: You may have to recover from a backup.\n"); buf_page_print(block->frame); fprintf(stderr, - "InnoDB: Database page corruption or a failed\n" - "InnoDB: file read of page %lu.\n", block->offset); + "InnoDB: Database page corruption on disk or a failed\n" + "InnoDB: file read of page %lu.\n", block->offset); fprintf(stderr, - "InnoDB: You may have to recover from a backup.\n"); + "InnoDB: You may have to recover from a backup.\n"); fprintf(stderr, - "InnoDB: It is also possible that your operating\n" - "InnoDB: system has corrupted its own file cache\n" - "InnoDB: and rebooting your computer removes the\n" - "InnoDB: error.\n"); + "InnoDB: It is also possible that 
your operating\n" + "InnoDB: system has corrupted its own file cache\n" + "InnoDB: and rebooting your computer removes the\n" + "InnoDB: error.\n" + "InnoDB: If the corrupt page is an index page\n" + "InnoDB: you can also try to fix the corruption\n" + "InnoDB: by dumping, dropping, and reimporting\n" + "InnoDB: the corrupt table. You can use CHECK\n" + "InnoDB: TABLE to scan your table for corruption.\n" + "InnoDB: Look also at section 6.1 of\n" + "InnoDB: http://www.innodb.com/ibman.html about\n" + "InnoDB: forcing recovery.\n"); - if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) { + if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) { + fprintf(stderr, + "InnoDB: Ending processing because of a corrupt database page.\n"); exit(1); } } if (recv_recovery_is_on()) { - recv_recover_page(TRUE, block->frame, block->space, - block->offset); + recv_recover_page(FALSE, TRUE, block->frame, + block->space, block->offset); } if (!recv_no_ibuf_operations) { diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 8184f10d6e9..4c6850af078 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -327,6 +327,34 @@ try_again: mutex_exit(&(trx_doublewrite->mutex)); } +/************************************************************************ +Initializes a page for writing to the tablespace. 
*/ + +void +buf_flush_init_for_writing( +/*=======================*/ + byte* page, /* in: page */ + dulint newest_lsn, /* in: newest modification lsn to the page */ + ulint space, /* in: space id */ + ulint page_no) /* in: page number */ +{ + /* Write the newest modification lsn to the page */ + mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); + + mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, newest_lsn); + + /* Write to the page the space id and page number */ + + mach_write_to_4(page + FIL_PAGE_SPACE, space); + mach_write_to_4(page + FIL_PAGE_OFFSET, page_no); + + /* We overwrite the first 4 bytes of the end lsn field to store + a page checksum */ + + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, + buf_calc_page_checksum(page)); +} + /************************************************************************ Does an asynchronous write of a buffer page. NOTE: in simulated aio and also when the doublewrite buffer is used, we must call @@ -349,23 +377,8 @@ buf_flush_write_block_low( /* Force the log to the disk before writing the modified block */ log_flush_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS); #endif - /* Write the newest modification lsn to the page */ - mach_write_to_8(block->frame + FIL_PAGE_LSN, - block->newest_modification); - mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, - block->newest_modification); - - /* Write to the page the space id and page number */ - - mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space); - mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset); - - /* We overwrite the first 4 bytes of the end lsn field to store - a page checksum */ - - mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN, - buf_calc_page_checksum(block->frame)); - + buf_flush_init_for_writing(block->frame, block->newest_modification, + block->space, block->offset); if (!trx_doublewrite) { fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, block->space, block->offset, 0, 
UNIV_PAGE_SIZE, diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index 657ce2d6809..d52483074cd 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -281,7 +281,8 @@ dict_table_autoinc_initialize( } /************************************************************************ -Gets the next autoinc value, 0 if not yet initialized. */ +Gets the next autoinc value, 0 if not yet initialized. If initialized, +increments the counter by 1. */ ib_longlong dict_table_autoinc_get( @@ -306,6 +307,32 @@ dict_table_autoinc_get( return(value); } +/************************************************************************ +Reads the autoinc counter value, 0 if not yet initialized. Does not +increment the counter. */ + +ib_longlong +dict_table_autoinc_read( +/*====================*/ + /* out: value of the counter */ + dict_table_t* table) /* in: table */ +{ + ib_longlong value; + + mutex_enter(&(table->autoinc_mutex)); + + if (!table->autoinc_inited) { + + value = 0; + } else { + value = table->autoinc; + } + + mutex_exit(&(table->autoinc_mutex)); + + return(value); +} + /************************************************************************ Updates the autoinc counter if the value supplied is bigger than the current value. If not inited, does nothing. */ @@ -644,7 +671,10 @@ dict_table_rename_in_cache( /*=======================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table */ - char* new_name) /* in: new name */ + char* new_name, /* in: new name */ + ibool rename_also_foreigns)/* in: in ALTER TABLE we want + to preserve the original table name + in constraints which reference it */ { dict_foreign_t* foreign; dict_index_t* index; @@ -702,6 +732,41 @@ dict_table_rename_in_cache( index = dict_table_get_next_index(index); } + if (!rename_also_foreigns) { + /* In ALTER TABLE we think of the rename table operation + in the direction table -> temporary table (#sql...) 
+ as dropping the table with the old name and creating + a new with the new name. Thus we kind of drop the + constraints from the dictionary cache here. The foreign key + constraints will be inherited to the new table from the + system tables through a call of dict_load_foreigns. */ + + /* Remove the foreign constraints from the cache */ + foreign = UT_LIST_GET_LAST(table->foreign_list); + + while (foreign != NULL) { + dict_foreign_remove_from_cache(foreign); + foreign = UT_LIST_GET_LAST(table->foreign_list); + } + + /* Reset table field in referencing constraints */ + + foreign = UT_LIST_GET_FIRST(table->referenced_list); + + while (foreign != NULL) { + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + + foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + } + + /* Make the list of referencing constraints empty */ + + UT_LIST_INIT(table->referenced_list); + + return(TRUE); + } + /* Update the table name fields in foreign constraints */ foreign = UT_LIST_GET_FIRST(table->foreign_list); @@ -768,8 +833,6 @@ dict_table_remove_from_cache( foreign = UT_LIST_GET_LAST(table->foreign_list); while (foreign != NULL) { - ut_a(0 == ut_strcmp(foreign->foreign_table_name, table->name)); - dict_foreign_remove_from_cache(foreign); foreign = UT_LIST_GET_LAST(table->foreign_list); } @@ -779,8 +842,6 @@ dict_table_remove_from_cache( foreign = UT_LIST_GET_FIRST(table->referenced_list); while (foreign != NULL) { - ut_a(0 == ut_strcmp(foreign->referenced_table_name, - table->name)); foreign->referenced_table = NULL; foreign->referenced_index = NULL; @@ -1628,8 +1689,9 @@ dict_foreign_add_to_cache( { dict_table_t* for_table; dict_table_t* ref_table; - dict_foreign_t* for_in_cache = NULL; + dict_foreign_t* for_in_cache = NULL; dict_index_t* index; + ibool added_to_referenced_list = FALSE; ut_ad(mutex_own(&(dict_sys->mutex))); @@ -1673,6 +1735,7 @@ dict_foreign_add_to_cache( UT_LIST_ADD_LAST(referenced_list, ref_table->referenced_list, for_in_cache); + 
added_to_referenced_list = TRUE; } if (for_in_cache->foreign_table == NULL && for_table) { @@ -1683,6 +1746,12 @@ dict_foreign_add_to_cache( if (index == NULL) { if (for_in_cache == foreign) { + if (added_to_referenced_list) { + UT_LIST_REMOVE(referenced_list, + ref_table->referenced_list, + for_in_cache); + } + mem_heap_free(foreign->heap); } @@ -1802,9 +1871,14 @@ dict_scan_col( return(ptr); } + if (*ptr == '`') { + ptr++; + } + old_ptr = ptr; - while (!isspace(*ptr) && *ptr != ',' && *ptr != ')') { + while (!isspace(*ptr) && *ptr != ',' && *ptr != ')' && *ptr != '`') { + ptr++; } @@ -1825,6 +1899,10 @@ dict_scan_col( } } + if (*ptr == '`') { + ptr++; + } + return(ptr); } @@ -1855,9 +1933,13 @@ dict_scan_table_name( return(ptr); } + if (*ptr == '`') { + ptr++; + } + old_ptr = ptr; - while (!isspace(*ptr) && *ptr != '(') { + while (!isspace(*ptr) && *ptr != '(' && *ptr != '`') { if (*ptr == '.') { dot_ptr = ptr; } @@ -1898,6 +1980,10 @@ dict_scan_table_name( *table = dict_table_get_low(second_table_name); + if (*ptr == '`') { + ptr++; + } + return(ptr); } @@ -1940,8 +2026,8 @@ dict_create_foreign_constraints( /*============================*/ /* out: error code or DB_SUCCESS */ trx_t* trx, /* in: transaction */ - char* sql_string, /* in: table create statement where - foreign keys are declared like: + char* sql_string, /* in: table create or ALTER TABLE + statement where foreign keys are declared like: FOREIGN KEY (a, b) REFERENCES table2(c, d), table2 can be written also with the database name before it: test.table2; the default @@ -1967,10 +2053,11 @@ dict_create_foreign_constraints( if (table == NULL) { return(DB_ERROR); } + loop: ptr = dict_scan_to(ptr, "FOREIGN"); - if (*ptr == '\0' || dict_bracket_count(sql_string, ptr) != 1) { + if (*ptr == '\0') { /* The following call adds the foreign key constraints to the data dictionary system tables on disk */ @@ -2884,19 +2971,21 @@ dict_field_print_low( } 
/************************************************************************** -Sprintfs to a string info on foreign keys of a table. */ - +Sprintfs to a string info on foreign keys of a table in a format suitable +for CREATE TABLE. */ +static void -dict_print_info_on_foreign_keys( -/*============================*/ +dict_print_info_on_foreign_keys_in_create_format( +/*=============================================*/ + char* buf, /* in: auxiliary buffer of 10000 chars */ char* str, /* in/out: pointer to a string */ ulint len, /* in: space in str available for info */ dict_table_t* table) /* in: table */ { + dict_foreign_t* foreign; ulint i; char* buf2; - char buf[10000]; buf2 = buf; @@ -2911,11 +3000,93 @@ dict_print_info_on_foreign_keys( } while (foreign != NULL) { - buf2 += sprintf(buf2, "; ("); - + buf2 += sprintf(buf2, ",\n FOREIGN KEY ("); + + for (i = 0; i < foreign->n_fields; i++) { + buf2 += sprintf(buf2, "`%s`", + foreign->foreign_col_names[i]); + + if (i + 1 < foreign->n_fields) { + buf2 += sprintf(buf2, ", "); + } + } + + buf2 += sprintf(buf2, ") REFERENCES `%s` (", + foreign->referenced_table_name); + /* Change the '/' in the table name to '.' */ + + for (i = ut_strlen(buf); i > 0; i--) { + if (buf[i] == '/') { + + buf[i] = '.'; + + break; + } + } + + for (i = 0; i < foreign->n_fields; i++) { + buf2 += sprintf(buf2, "`%s`", + foreign->referenced_col_names[i]); + if (i + 1 < foreign->n_fields) { + buf2 += sprintf(buf2, ", "); + } + } + + buf2 += sprintf(buf2, ")"); + + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + mutex_exit(&(dict_sys->mutex)); + + buf[len - 1] = '\0'; + ut_memcpy(str, buf, len); +} + +/************************************************************************** +Sprintfs to a string info on foreign keys of a table. 
*/ + +void +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /* in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + char* str, /* in/out: pointer to a string */ + ulint len, /* in: space in str available for info */ + dict_table_t* table) /* in: table */ +{ + dict_foreign_t* foreign; + ulint i; + char* buf2; + char buf[10000]; + + if (create_table_format) { + dict_print_info_on_foreign_keys_in_create_format( + buf, str, len, table); + return; + } + + buf2 = buf; + + mutex_enter(&(dict_sys->mutex)); + + foreign = UT_LIST_GET_FIRST(table->foreign_list); + + if (foreign == NULL) { + mutex_exit(&(dict_sys->mutex)); + + return; + } + + while (foreign != NULL) { + buf2 += sprintf(buf2, "; ("); + for (i = 0; i < foreign->n_fields; i++) { buf2 += sprintf(buf2, "%s", foreign->foreign_col_names[i]); + if (i + 1 < foreign->n_fields) { buf2 += sprintf(buf2, " "); } diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c index dcdc9ee01cd..29c98db4a53 100644 --- a/innobase/dict/dict0load.c +++ b/innobase/dict/dict0load.c @@ -688,7 +688,16 @@ dict_load_indexes( dict_load_fields(table, index, heap); - dict_index_add_to_cache(table, index); + if ((index->type & DICT_CLUSTERED) == 0 + && NULL == dict_table_get_first_index(table)) { + + fprintf(stderr, + "InnoDB: Error: trying to load index %s for table %s\n" + "InnoDB: but the first index was not clustered\n", + index->name, table->name); + } else { + dict_index_add_to_cache(table, index); + } } btr_pcur_move_to_next_user_rec(&pcur, &mtr); diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index 138f1a78985..ca508785497 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -89,8 +89,8 @@ struct fil_node_struct { char* name; /* the file name or path */ ibool open; /* TRUE if file open */ os_file_t handle; /* OS handle to the file, if file open */ - ulint size; /* size of
the file in database blocks - (where the possible last incomplete block + ulint size; /* size of the file in database pages + (where the possible last incomplete megabyte is ignored) */ ulint n_pending; /* count of pending i/o-ops on this file */ @@ -945,6 +945,76 @@ fil_node_complete_io( } } +/************************************************************************** +Tries to extend a data file by the number of pages given. Any fractions of a +megabyte are ignored. */ + +ibool +fil_extend_last_data_file( +/*======================*/ + /* out: TRUE if success, also if we run + out of disk space we may return TRUE */ + ulint* actual_increase,/* out: number of pages we were able to + extend, here the original size of the file and + the resulting size of the file are rounded + downwards to a full megabyte, and the + difference expressed in pages is returned */ + ulint size_increase) /* in: try to extend this many pages */ +{ + fil_node_t* node; + fil_space_t* space; + fil_system_t* system = fil_system; + byte* buf; + ibool success; + ulint i; + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, 0, space, space->id == 0); + + ut_a(space); + + node = UT_LIST_GET_LAST(space->chain); + + fil_node_prepare_for_io(node, system, space); + + buf = mem_alloc(1024 * 1024); + + memset(buf, '\0', 1024 * 1024); + + for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) { + + success = os_file_write(node->name, node->handle, buf, + (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF, + node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), + 1024 * 1024); + + if (!success) { + + break; + } + + node->size += ((1024 * 1024) / UNIV_PAGE_SIZE); + space->size += ((1024 * 1024) / UNIV_PAGE_SIZE); + + os_has_said_disk_full = FALSE; + } + + mem_free(buf); + + fil_node_complete_io(node, system, OS_FILE_WRITE); + + mutex_exit(&(system->mutex)); + + *actual_increase = i * ((1024 * 1024) / UNIV_PAGE_SIZE); + + fil_flush(0); + + srv_data_file_sizes[srv_n_data_files - 1] +=
*actual_increase; + + return(TRUE); +} + /************************************************************************ Reads or writes data. This operation is asynchronous (aio). */ @@ -966,9 +1036,9 @@ fil_io( ulint byte_offset, /* in: remainder of offset in bytes; in aio this must be divisible by the OS block size */ - ulint len, /* in: how many bytes to read; this must - not cross a file boundary; in aio this must - be a block size multiple */ + ulint len, /* in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ void* buf, /* in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index ccc13f15fde..e823fe62259 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -50,7 +50,7 @@ descriptor page, but used only in the first. */ #define FSP_FREE_LIMIT 12 /* Minimum page number for which the free list has not been initialized: the pages >= this limit are, by - definition free */ + definition, free */ #define FSP_LOWEST_NO_WRITE 16 /* The lowest page offset for which the page has not been written to disk (if it has been written, we know that @@ -898,6 +898,106 @@ fsp_header_inc_size( mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES, mtr); } +/************************************************************************** +Gets the current free limit of a tablespace. The free limit means the +place of the first page which has never been put to the free list +for allocation. The space above that address is initialized to zero. +Sets also the global variable log_fsp_current_free_limit. */ + +ulint +fsp_header_get_free_limit( +/*======================*/ + /* out: free limit in megabytes */ + ulint space) /* in: space id */ +{ + fsp_header_t* header; + ulint limit; + mtr_t mtr; + + ut_a(space == 0); /* We have only one log_fsp_current_...
variable */ + + mtr_start(&mtr); + + mtr_x_lock(fil_space_get_latch(space), &mtr); + + header = fsp_get_space_header(space, &mtr); + + limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, &mtr); + + limit = limit / ((1024 * 1024) / UNIV_PAGE_SIZE); + + log_fsp_current_free_limit_set_and_checkpoint(limit); + + mtr_commit(&mtr); + + return(limit); +} + +/*************************************************************************** +Tries to extend the last data file if it is defined as auto-extending. */ +static +ibool +fsp_try_extend_last_file( +/*=====================*/ + /* out: FALSE if not auto-extending */ + ulint* actual_increase,/* out: actual increase in pages */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint size; + ulint size_increase; + ibool success; + + ut_a(space == 0); + + *actual_increase = 0; + + if (!srv_auto_extend_last_data_file) { + + return(FALSE); + } + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + + if (srv_last_file_size_max != 0) { + if (srv_last_file_size_max + < srv_data_file_sizes[srv_n_data_files - 1]) { + + fprintf(stderr, +"InnoDB: Error: Last data file size is %lu, max size allowed %lu\n", + srv_data_file_sizes[srv_n_data_files - 1], + srv_last_file_size_max); + } + + size_increase = srv_last_file_size_max + - srv_data_file_sizes[srv_n_data_files - 1]; + if (size_increase > SRV_AUTO_EXTEND_INCREMENT) { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } + } else { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } + + if (size_increase == 0) { + return(TRUE); + } + + /* Extend the data file. If we are not able to extend + the full requested length, the function tells us + the number of full megabytes (but the unit is pages!) + we were able to extend.
*/ + + success = fil_extend_last_data_file(actual_increase, size_increase); + + if (success) { + mlog_write_ulint(header + FSP_SIZE, size + *actual_increase, + MLOG_4BYTES, mtr); + } + + return(TRUE); +} + /************************************************************************** Puts new extents to the free list if there are free extents above the free limit. If an extent happens to contain an extent descriptor page, the extent @@ -917,8 +1017,9 @@ fsp_fill_free_list( ulint frag_n_used; page_t* descr_page; page_t* ibuf_page; - mtr_t ibuf_mtr; + ulint actual_increase; ulint i; + mtr_t ibuf_mtr; ut_ad(header && mtr); @@ -926,12 +1027,28 @@ fsp_fill_free_list( size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr); + if (srv_auto_extend_last_data_file + && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) { + + /* Try to increase the last data file size */ + fsp_try_extend_last_file(&actual_increase, space, header, + mtr); + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + } + i = limit; while ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD)) { mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE, MLOG_4BYTES, mtr); + + /* Update the free limit info in the log system and make + a checkpoint */ + log_fsp_current_free_limit_set_and_checkpoint( + (i + FSP_EXTENT_SIZE) + / ((1024 * 1024) / UNIV_PAGE_SIZE)); + if (0 == i % XDES_DESCRIBED_PER_PAGE) { /* We are going to initialize a new descriptor page @@ -1172,6 +1289,7 @@ fsp_free_page( xdes_t* descr; ulint state; ulint frag_n_used; + char buf[1000]; ut_ad(mtr); @@ -1183,10 +1301,38 @@ fsp_free_page( state = xdes_get_state(descr, mtr); - ut_a((state == XDES_FREE_FRAG) || (state == XDES_FULL_FRAG)); + if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) { + fprintf(stderr, +"InnoDB: Error: File space extent descriptor of page %lu has state %lu\n", + page, state); + ut_sprintf_buf(buf, ((byte*)descr) - 50, 200); - 
ut_a(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr) - == FALSE); + fprintf(stderr, "InnoDB: Dump of descriptor: %s\n", buf); + + if (state == XDES_FREE) { + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } + + ut_a(0); + } + + if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr) + == TRUE) { + fprintf(stderr, +"InnoDB: Error: File space extent descriptor of page %lu says it is free\n", + page); + ut_sprintf_buf(buf, ((byte*)descr) - 50, 200); + + fprintf(stderr, "InnoDB: Dump of descriptor: %s\n", buf); + + /* We put here some fault tolerance: if the page + is already free, return without doing anything! */ + + return; + } xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr); @@ -2243,13 +2389,15 @@ fsp_reserve_free_extents( mtr_t* mtr) /* in: mtr */ { fsp_header_t* space_header; + rw_lock_t* latch; ulint n_free_list_ext; ulint free_limit; ulint size; ulint n_free; ulint n_free_up; ulint reserve; - rw_lock_t* latch; + ibool success; + ulint n_pages_added; ut_ad(mtr); ut_ad(!mutex_own(&kernel_mutex) @@ -2260,7 +2408,7 @@ fsp_reserve_free_extents( mtr_x_lock(latch, mtr); space_header = fsp_get_space_header(space, mtr); - +try_again: size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr); n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr); @@ -2291,7 +2439,7 @@ fsp_reserve_free_extents( if (n_free <= reserve + n_ext) { - return(FALSE); + goto try_to_extend; } } else if (alloc_type == FSP_UNDO) { /* We reserve 1 % of the space size to cleaning operations */ @@ -2300,13 +2448,26 @@ fsp_reserve_free_extents( if (n_free <= reserve + n_ext) { - return(FALSE); + goto try_to_extend; } } else { ut_a(alloc_type == FSP_CLEANING); } - return(fil_space_reserve_free_extents(space, n_free, n_ext)); + success = fil_space_reserve_free_extents(space, n_free, n_ext); + + if 
(success) { + return(TRUE); + } +try_to_extend: + success = fsp_try_extend_last_file(&n_pages_added, space, + space_header, mtr); + if (success && n_pages_added > 0) { + + goto try_again; + } + + return(FALSE); } /************************************************************************** diff --git a/innobase/include/buf0flu.h b/innobase/include/buf0flu.h index cb1c0965a65..1b40acaa269 100644 --- a/innobase/include/buf0flu.h +++ b/innobase/include/buf0flu.h @@ -28,6 +28,16 @@ a margin of replaceable pages there. */ void buf_flush_free_margin(void); /*=======================*/ +/************************************************************************ +Initializes a page for writing to the tablespace. */ + +void +buf_flush_init_for_writing( +/*=======================*/ + byte* page, /* in: page */ + dulint newest_lsn, /* in: newest modification lsn to the page */ + ulint space, /* in: space id */ + ulint page_no); /* in: page number */ /*********************************************************************** This utility flushes dirty blocks from the end of the LRU list or flush_list. NOTE 1: in the case of an LRU flush the calling thread may own latches to diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index 0f6f516c2cb..fd79e17090a 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -105,7 +105,8 @@ dict_table_autoinc_initialize( dict_table_t* table, /* in: table */ ib_longlong value); /* in: value which was assigned to a row */ /************************************************************************ -Gets the next autoinc value, 0 if not yet initialized. */ +Gets the next autoinc value, 0 if not yet initialized. If initialized, +increments the counter by 1. 
*/ ib_longlong dict_table_autoinc_get( @@ -113,6 +114,15 @@ dict_table_autoinc_get( /* out: value for a new row, or 0 */ dict_table_t* table); /* in: table */ /************************************************************************ +Reads the autoinc counter value, 0 if not yet initialized. Does not +increment the counter. */ + +ib_longlong +dict_table_autoinc_read( +/*====================*/ + /* out: value of the counter */ + dict_table_t* table); /* in: table */ +/************************************************************************ Updates the autoinc counter if the value supplied is bigger than the current value. If not inited, does nothing. */ @@ -143,7 +153,10 @@ dict_table_rename_in_cache( /*=======================*/ /* out: TRUE if success */ dict_table_t* table, /* in: table */ - char* new_name); /* in: new name */ + char* new_name, /* in: new name */ + ibool rename_also_foreigns);/* in: in ALTER TABLE we want + to preserve the original table name + in constraints which reference it */ /************************************************************************** Adds a foreign key constraint object to the dictionary cache. May free the object if there already is an object with the same identifier in. @@ -284,6 +297,10 @@ Sprintfs to a string info on foreign keys of a table. 
*/ void dict_print_info_on_foreign_keys( /*============================*/ + ibool create_table_format, /* in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ char* str, /* in/out: pointer to a string */ ulint len, /* in: space in str available for info */ dict_table_t* table); /* in: table */ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index ca74ea4cb2c..63e20221c16 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -64,8 +64,10 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_DATA 38 /* start of the data on the page */ /* File page trailer */ -#define FIL_PAGE_END_LSN 8 /* this should be same as - FIL_PAGE_LSN */ +#define FIL_PAGE_END_LSN 8 /* the first 4 bytes of this are used + to store the page checksum, the + last 4 bytes should be identical + to the last 4 bytes of FIL_PAGE_LSN */ #define FIL_PAGE_DATA_END 8 /* File page types */ @@ -134,6 +136,21 @@ fil_space_truncate_start( ulint trunc_len); /* in: truncate by this much; it is an error if this does not equal to the combined size of some initial files in the space */ +/************************************************************************** +Tries to extend a data file by the number of pages given. Any fractions of a +megabyte are ignored. */ + +ibool +fil_extend_last_data_file( +/*======================*/ + /* out: TRUE if success, also if we run + out of disk space we may return TRUE */ + ulint* actual_increase,/* out: number of pages we were able to + extend, here the original size of the file and + the resulting size of the file are rounded + downwards to a full megabyte, and the + difference expressed in pages is returned */ + ulint size_increase); /* in: try to extend this many pages */ /*********************************************************************** Frees a space object from a file system. Closes the files in the chain but does not delete them.
*/ diff --git a/innobase/include/fsp0fsp.h b/innobase/include/fsp0fsp.h index e7f9eab330b..a0197ec2d97 100644 --- a/innobase/include/fsp0fsp.h +++ b/innobase/include/fsp0fsp.h @@ -46,6 +46,17 @@ void fsp_init(void); /*==========*/ /************************************************************************** +Gets the current free limit of a tablespace. The free limit means the +place of the first page which has never been put to the free list +for allocation. The space above that address is initialized to zero. +Sets also the global variable log_fsp_current_free_limit. */ + +ulint +fsp_header_get_free_limit( +/*======================*/ + /* out: free limit in megabytes */ + ulint space); /* in: space id */ +/************************************************************************** Initializes the space header of a new created space. */ void diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h index adff9fae544..eeb4f2e45f1 100644 --- a/innobase/include/log0log.h +++ b/innobase/include/log0log.h @@ -26,6 +26,32 @@ extern ibool log_debug_writes; #define LOG_WAIT_ALL_GROUPS 93 #define LOG_MAX_N_GROUPS 32 +/******************************************************************** +Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint, +so that we know that the limit has been written to a log checkpoint field +on disk. */ + +void +log_fsp_current_free_limit_set_and_checkpoint( +/*==========================================*/ + ulint limit); /* in: limit to set */ +/*********************************************************************** +Calculates where in log files we find a specified lsn.
*/ + +ulint +log_calc_where_lsn_is( +/*==================*/ + /* out: log file number */ + ib_longlong* log_file_offset, /* out: offset in that file + (including the header) */ + dulint first_header_lsn, /* in: first log file start + lsn */ + dulint lsn, /* in: lsn whose position to + determine */ + ulint n_log_files, /* in: total number of log + files */ + ib_longlong log_file_size); /* in: log file size + (including the header) */ /**************************************************************** Writes to the log the string given. The log must be released with log_release. */ @@ -225,6 +251,16 @@ Writes checkpoint info to groups. */ void log_groups_write_checkpoint_info(void); /*==================================*/ +/********************************************************** +Writes info to a buffer of a log group when log files are created in +backup restoration. */ + +void +log_reset_first_header_and_checkpoint( +/*==================================*/ + byte* hdr_buf,/* in: buffer which will be written to the start + of the first log file */ + dulint lsn); /* in: lsn of the start of the first log file */ /************************************************************************ Starts an archiving operation. 
*/ @@ -507,7 +543,16 @@ extern log_t* log_sys; + LOG_MAX_N_GROUPS * 8) #define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END #define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END) -#define LOG_CHECKPOINT_SIZE (8 + LOG_CHECKPOINT_ARRAY_END) +#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END) + /* current fsp free limit in the + tablespace, in units of one megabyte */ +#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END) + /* this magic number tells if the + checkpoint contains the above field: + the field was added to InnoDB-3.23.50 */ +#define LOG_CHECKPOINT_SIZE (16 + LOG_CHECKPOINT_ARRAY_END) + +#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243 /* Offsets of a log file header */ #define LOG_GROUP_ID 0 /* log group number */ diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h index 8f896756db9..0825325965d 100644 --- a/innobase/include/log0recv.h +++ b/innobase/include/log0recv.h @@ -15,6 +15,39 @@ Created 9/20/1997 Heikki Tuuri #include "hash0hash.h" #include "log0log.h" +/*********************************************************************** +Reads the checkpoint info needed in hot backup. */ + +ibool +recv_read_cp_info_for_backup( +/*=========================*/ + /* out: TRUE if success */ + byte* hdr, /* in: buffer containing the log group header */ + dulint* lsn, /* out: checkpoint lsn */ + ulint* offset, /* out: checkpoint offset in the log group */ + ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database + is running with < version 3.23.50 of InnoDB */ + dulint* cp_no, /* out: checkpoint number */ + dulint* first_header_lsn); + /* out: lsn of the start of the first log file */ +/*********************************************************************** +Scans the log segment and n_bytes_scanned is set to the length of valid +log scanned.
*/ + +void +recv_scan_log_seg_for_backup( +/*=========================*/ + byte* buf, /* in: buffer containing log data */ + ulint buf_len, /* in: data length in that buffer */ + dulint* scanned_lsn, /* in/out: lsn of buffer start, + we return scanned lsn */ + ulint* scanned_checkpoint_no, + /* in/out: 4 lowest bytes of the + highest scanned checkpoint number so + far */ + ulint* n_bytes_scanned);/* out: how much we were able to + scan, smaller than buf_len if log + data ended here */ /*********************************************************************** Returns TRUE if recovery is currently running. */ UNIV_INLINE @@ -35,6 +68,10 @@ read in, or also for a page already in the buffer pool. */ void recv_recover_page( /*==============*/ + ibool recover_backup, /* in: TRUE if we are recovering a backup + page: then we do not acquire any latches + since the page was read in outside the + buffer pool */ ibool just_read_in, /* in: TRUE if the i/o-handler calls this for a freshly read page */ page_t* page, /* in: buffer page */ @@ -69,8 +106,15 @@ recv_scan_log_recs( /*===============*/ /* out: TRUE if limit_lsn has been reached, or not able to scan any more in this log group */ + ibool apply_automatically,/* in: TRUE if we want this function to + apply log records automatically when the + hash table becomes full; in the hot backup tool + the tool does the applying, not this + function */ + ulint available_memory,/* in: we let the hash table of recs to grow + to this size, at the maximum */ ibool store_to_hash, /* in: TRUE if the records should be stored - to the hash table; this is set FALSE if just + to the hash table; this is set to FALSE if just debug checking is needed */ byte* buf, /* in: buffer containing a log segment or garbage */ @@ -92,6 +136,16 @@ recv_reset_logs( ibool new_logs_created);/* in: TRUE if resetting logs is done at the log creation; FALSE if it is done after archive recovery */ +/********************************************************** +Creates 
new log files after a backup has been restored. */ + +void +recv_reset_log_files_for_backup( +/*============================*/ + char* log_dir, /* in: log file directory path */ + ulint n_log_files, /* in: number of log files */ + ulint log_file_size, /* in: log file size */ + dulint lsn); /* in: new start lsn */ /************************************************************ Creates the recovery system. */ @@ -102,8 +156,11 @@ recv_sys_create(void); Inits the recovery system for a recovery operation. */ void -recv_sys_init(void); -/*===============*/ +recv_sys_init( +/*==========*/ + ibool recover_from_backup, /* in: TRUE if this is called + to recover from a hot backup */ + ulint available_memory); /* in: available memory in bytes */ /*********************************************************************** Empties the hash table of stored log records, applying them to appropriate pages. */ @@ -118,6 +175,17 @@ recv_apply_hashed_log_recs( disk and invalidated in buffer pool: this alternative means that no new log records can be generated during the application */ +/*********************************************************************** +Applies log records in the hash table to a backup. */ + +void +recv_apply_log_recs_for_backup( +/*===========================*/ + ulint n_data_files, /* in: number of data files */ + char** data_files, /* in: array containing the paths to the + data files */ + ulint* file_sizes); /* in: sizes of the data files in database + pages */ /************************************************************ Recovers from archived log files, and also from log files, if they exist. */ @@ -260,6 +328,14 @@ extern ibool recv_recovery_on; extern ibool recv_no_ibuf_operations; extern ibool recv_needed_recovery; +/* Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many +times! 
*/ +#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024) + +/* Size of block reads when the log groups are scanned forward to do a +roll-forward */ +#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE) + /* States of recv_addr_struct */ #define RECV_NOT_PROCESSED 71 #define RECV_BEING_READ 72 diff --git a/innobase/include/mem0mem.h b/innobase/include/mem0mem.h index 89c5428f054..bfd25f5bdbe 100644 --- a/innobase/include/mem0mem.h +++ b/innobase/include/mem0mem.h @@ -41,11 +41,11 @@ page buffer pool; the latter method is used for very big heaps */ /* The following start size is used for the first block in the memory heap if the size is not specified, i.e., 0 is given as the parameter in the call of -create. The standard size is the maximum size of the blocks used for +create. The standard size is the maximum (payload) size of the blocks used for allocations of small buffers. */ #define MEM_BLOCK_START_SIZE 64 -#define MEM_BLOCK_STANDARD_SIZE 8192 +#define MEM_BLOCK_STANDARD_SIZE 8000 /* If a memory heap is allowed to grow into the buffer pool, the following is the maximum size for a single allocated buffer: */ diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index 411a9fb2c21..67d76814936 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -11,6 +11,12 @@ Created 10/21/1995 Heikki Tuuri #include "univ.i" + +/* If the following is set to TRUE, we do not call os_file_flush in every +os_file_write */ +extern ibool os_do_not_call_flush_at_each_write; +extern ibool os_has_said_disk_full; + #ifdef __WIN__ /* We define always WIN_ASYNC_IO, and check at run-time whether @@ -55,6 +61,9 @@ log. 
*/ #define OS_FILE_CREATE 52 #define OS_FILE_OVERWRITE 53 +#define OS_FILE_READ_ONLY 333 +#define OS_FILE_READ_WRITE 444 + /* Options for file_create */ #define OS_FILE_AIO 61 #define OS_FILE_NORMAL 62 @@ -118,6 +127,27 @@ os_get_os_version(void); /*===================*/ /* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */ /******************************************************************** +Creates the seek mutexes used in positioned reads and writes. */ + +void +os_io_init_simple(void); +/*===================*/ +/******************************************************************** +A simple function to open or create a file. */ + +os_file_t +os_file_create_simple( +/*==================*/ + /* out, own: handle to the file, not defined if error, + error number can be retrieved with os_get_last_error */ + char* name, /* in: name of the file or path as a null-terminated + string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened + (if does not exist, error), or OS_FILE_CREATE if a new + file is created (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ + ibool* success);/* out: TRUE if succeed, FALSE if error */ +/******************************************************************** Opens an existing file or creates a new. 
*/ os_file_t diff --git a/innobase/include/row0mysql.h b/innobase/include/row0mysql.h index 48b6ba8a715..92693319175 100644 --- a/innobase/include/row0mysql.h +++ b/innobase/include/row0mysql.h @@ -402,13 +402,13 @@ struct row_prebuilt_struct { byte* ins_upd_rec_buff;/* buffer for storing data converted to the Innobase format from the MySQL format */ - ibool in_update_remember_pos; - /* if an update is processed, then if - this flag is set to TRUE, it means - that the stored cursor position in - SELECT is the right position also - for the update: we can just restore - the cursor and save CPU time */ + ibool hint_no_need_to_fetch_extra_cols; + /* normally this is TRUE, but + MySQL will set this to FALSE + if we might be required to fetch also + other columns than mentioned in the + query: the clustered index column(s), + or an auto-increment column*/ upd_node_t* upd_node; /* Innobase SQL update node used to perform updates and deletes */ que_fork_t* ins_graph; /* Innobase SQL query graph used diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 01986c759d7..903dd9afc90 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -24,10 +24,13 @@ extern char srv_fatal_errbuf[]; thread starts running */ extern os_event_t srv_lock_timeout_thread_event; +/* If the last data file is auto-extended, we add this many pages to it +at a time */ +#define SRV_AUTO_EXTEND_INCREMENT (8 * ((1024 * 1024) / UNIV_PAGE_SIZE)) + /* Server parameters which are read from the initfile */ extern char* srv_data_home; -extern char* srv_logs_home; extern char* srv_arch_dir; extern ulint srv_n_data_files; @@ -35,6 +38,9 @@ extern char** srv_data_file_names; extern ulint* srv_data_file_sizes; extern ulint* srv_data_file_is_raw_partition; +extern ibool srv_auto_extend_last_data_file; +extern ulint srv_last_file_size_max; + extern ibool srv_created_new_raw; #define SRV_NEW_RAW 1 @@ -185,6 +191,19 @@ srv_boot(void); /*==========*/ /* out: DB_SUCCESS or error 
code */ /************************************************************************* +Initializes the server. */ + +void +srv_init(void); +/*==========*/ +/************************************************************************* +Initializes the synchronization primitives, memory system, and the thread +local storage. */ + +void +srv_general_init(void); +/*==================*/ +/************************************************************************* Gets the number of threads in the system. */ ulint diff --git a/innobase/include/srv0start.h b/innobase/include/srv0start.h index 6dbdcd27250..01ac063e1c9 100644 --- a/innobase/include/srv0start.h +++ b/innobase/include/srv0start.h @@ -12,6 +12,56 @@ Created 10/10/1995 Heikki Tuuri #include "univ.i" +/************************************************************************* +Normalizes a directory path for Windows: converts slashes to backslashes. */ + +void +srv_normalize_path_for_win( +/*=======================*/ + char* str); /* in/out: null-terminated character string */ +/************************************************************************* +Adds a slash or a backslash to the end of a string if it is missing +and the string is not empty. */ + +char* +srv_add_path_separator_if_needed( +/*=============================*/ + /* out, own: string which has the separator if the + string is not empty */ + char* str); /* in: null-terminated character string */ +/************************************************************************* +Reads the data files and their sizes from a character string given in +the .cnf file. 
*/ + +ibool +srv_parse_data_file_paths_and_sizes( +/*================================*/ + /* out: TRUE if ok, FALSE if parsing + error */ + char* str, /* in: the data file path string */ + char*** data_file_names, /* out, own: array of data file + names */ + ulint** data_file_sizes, /* out, own: array of data file sizes + in megabytes */ + ulint** data_file_is_raw_partition,/* out, own: array of flags + showing which data files are raw + partitions */ + ulint* n_data_files, /* out: number of data files */ + ibool* is_auto_extending, /* out: TRUE if the last data file is + auto-extending */ + ulint* max_auto_extend_size); /* out: max auto extend size for the + last file if specified, 0 if not */ +/************************************************************************* +Reads log group home directories from a character string given in +the .cnf file. */ + +ibool +srv_parse_log_group_home_dirs( +/*==========================*/ + /* out: TRUE if ok, FALSE if parsing + error */ + char* str, /* in: character string */ + char*** log_group_home_dirs); /* out, own: log group home dirs */ /******************************************************************** Starts Innobase and creates a new database if database files are not found and the user wants. Server parameters are diff --git a/innobase/include/trx0sys.h b/innobase/include/trx0sys.h index f2eded697ec..98c726fb118 100644 --- a/innobase/include/trx0sys.h +++ b/innobase/include/trx0sys.h @@ -44,6 +44,15 @@ half-written pages in the data files. */ void trx_sys_doublewrite_restore_corrupt_pages(void); /*===========================================*/ +/******************************************************************** +Determines if a page number is located inside the doublewrite buffer. 
*/ + +ibool +trx_doublewrite_page_inside( +/*========================*/ + /* out: TRUE if the location is inside + the two blocks of the doublewrite buffer */ + ulint page_no); /* in: page number */ /******************************************************************* Checks if a page address is the trx sys header page. */ UNIV_INLINE diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h index b45f2160392..4fb45221899 100644 --- a/innobase/include/ut0byte.h +++ b/innobase/include/ut0byte.h @@ -55,6 +55,15 @@ ut_dulint_get_low( /* out: 32 bits in ulint */ dulint d); /* in: dulint */ /*********************************************************** +Converts a dulint (a struct of 2 ulints) to ib_longlong, which is a 64-bit +integer type. */ +UNIV_INLINE +ib_longlong +ut_conv_dulint_to_longlong( +/*=======================*/ + /* out: value in ib_longlong type */ + dulint d); /* in: dulint */ +/*********************************************************** Tests if a dulint is zero. */ UNIV_INLINE ibool diff --git a/innobase/include/ut0rnd.h b/innobase/include/ut0rnd.h index a30251e6da0..c8ef0dd4001 100644 --- a/innobase/include/ut0rnd.h +++ b/innobase/include/ut0rnd.h @@ -35,7 +35,7 @@ ut_rnd_gen_next_ulint( /************************************************************* The following function generates 'random' ulint integers which enumerate the value space (let there be N of them) of ulint integers -in a pseudo random fashion. Note that the same integer is repeated +in a pseudo-random fashion. Note that the same integer is repeated always after N calls to the generator. */ UNIV_INLINE ulint diff --git a/innobase/include/ut0ut.h b/innobase/include/ut0ut.h index 4366b832ff6..338460d7de9 100644 --- a/innobase/include/ut0ut.h +++ b/innobase/include/ut0ut.h @@ -17,6 +17,16 @@ Created 1/20/1994 Heikki Tuuri typedef time_t ib_time_t; +/************************************************************ +Gets the high 32 bits in a ulint. 
That is makes a shift >> 32, +but since there seem to be compiler bugs in both gcc and Visual C++, +we do this by a special conversion. */ + +ulint +ut_get_high32( +/*==========*/ + /* out: a >> 32 */ + ulint a); /* in: ulint */ /********************************************************** Calculates the minimum of two ulints. */ UNIV_INLINE @@ -144,6 +154,15 @@ void ut_print_timestamp( /*===============*/ FILE* file); /* in: file where to print */ +/************************************************************** +Returns current year, month, day. */ + +void +ut_get_year_month_day( +/*==================*/ + ulint* year, /* out: current year */ + ulint* month, /* out: month */ + ulint* day); /* out: day */ /***************************************************************** Runs an idle loop on CPU. The argument gives the desired delay in microseconds on 100 MHz Pentium + Visual C++. */ diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c index 4fcf625242b..2ba035d1eb2 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -1,7 +1,7 @@ /****************************************************** Database log -(c) 1995-1997 InnoDB Oy +(c) 1995-1997 Innobase Oy Created 12/9/1995 Heikki Tuuri *******************************************************/ @@ -24,6 +24,9 @@ Created 12/9/1995 Heikki Tuuri #include "trx0sys.h" #include "trx0trx.h" +/* Current free limit; protected by the log sys mutex; 0 means uninitialized */ +ulint log_fsp_current_free_limit = 0; + /* Global log system variable */ log_t* log_sys = NULL; @@ -95,6 +98,32 @@ void log_archive_margin(void); /*====================*/ +/******************************************************************** +Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint, +so that we know that the limit has been written to a log checkpoint field +on disk. 
*/ + +void +log_fsp_current_free_limit_set_and_checkpoint( +/*==========================================*/ + ulint limit) /* in: limit to set */ +{ + ibool success; + + mutex_enter(&(log_sys->mutex)); + + log_fsp_current_free_limit = limit; + + mutex_exit(&(log_sys->mutex)); + + /* Try to make a synchronous checkpoint */ + + success = FALSE; + + while (!success) { + success = log_checkpoint(TRUE, TRUE); + } +} /******************************************************************** Returns the oldest modified block lsn in the pool, or log_sys->lsn if none @@ -436,6 +465,51 @@ log_group_calc_lsn_offset( return(log_group_calc_real_offset(offset, group)); } +/*********************************************************************** +Calculates where in log files we find a specified lsn. */ + +ulint +log_calc_where_lsn_is( +/*==================*/ + /* out: log file number */ + ib_longlong* log_file_offset, /* out: offset in that file + (including the header) */ + dulint first_header_lsn, /* in: first log file start + lsn */ + dulint lsn, /* in: lsn whose position to + determine */ + ulint n_log_files, /* in: total number of log + files */ + ib_longlong log_file_size) /* in: log file size + (including the header) */ +{ + ib_longlong ib_lsn; + ib_longlong ib_first_header_lsn; + ib_longlong capacity = log_file_size - LOG_FILE_HDR_SIZE; + ulint file_no; + ib_longlong add_this_many; + + ib_lsn = ut_conv_dulint_to_longlong(lsn); + ib_first_header_lsn = ut_conv_dulint_to_longlong(first_header_lsn); + + if (ib_lsn < ib_first_header_lsn) { + add_this_many = 1 + (ib_first_header_lsn - ib_lsn) + / (capacity * (ib_longlong)n_log_files); + ib_lsn += add_this_many + * capacity * (ib_longlong)n_log_files; + } + + ut_a(ib_lsn >= ib_first_header_lsn); + + file_no = ((ulint)((ib_lsn - ib_first_header_lsn) / capacity)) + % n_log_files; + *log_file_offset = (ib_lsn - ib_first_header_lsn) % capacity; + + *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE; + + return(file_no); +} + 
/************************************************************ Sets the field values in group to correspond to a given lsn. For this function to work, the values must already be correctly initialized to correspond to @@ -653,7 +727,7 @@ log_init(void) #ifdef UNIV_LOG_DEBUG recv_sys_create(); - recv_sys_init(); + recv_sys_init(FALSE, buf_pool_get_curr_size()); recv_sys->parse_start_lsn = log_sys->lsn; recv_sys->scanned_lsn = log_sys->lsn; @@ -961,7 +1035,7 @@ log_group_write_buf( ibool sync; ibool write_header; ulint next_offset; - + ut_ad(mutex_own(&(log_sys->mutex))); ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0); ut_ad(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); @@ -1002,9 +1076,28 @@ loop: } if (log_debug_writes) { + ulint i; + printf( - "Writing log file segment to group %lu offset %lu len %lu\n", - group->id, next_offset, write_len); + "Writing log file segment to group %lu offset %lu len %lu\n" + "start lsn %lu %lu\n", + group->id, next_offset, write_len, + ut_dulint_get_high(start_lsn), + ut_dulint_get_low(start_lsn)); + printf( + "First block n:o %lu last block n:o %lu\n", + log_block_get_hdr_no(buf), + log_block_get_hdr_no( + buf + write_len - OS_FILE_LOG_BLOCK_SIZE)); + ut_a(log_block_get_hdr_no(buf) + == log_block_convert_lsn_to_no(start_lsn)); + + for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) { + + ut_a(log_block_get_hdr_no(buf) + i + == log_block_get_hdr_no(buf + + i * OS_FILE_LOG_BLOCK_SIZE)); + } } if (log_do_write) { @@ -1346,7 +1439,7 @@ log_group_checkpoint( ulint i; ut_ad(mutex_own(&(log_sys->mutex))); - ut_ad(LOG_CHECKPOINT_SIZE <= OS_FILE_LOG_BLOCK_SIZE); + ut_a(LOG_CHECKPOINT_SIZE <= OS_FILE_LOG_BLOCK_SIZE); buf = group->checkpoint_buf; @@ -1394,6 +1487,15 @@ log_group_checkpoint( LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold); + /* Starting from InnoDB-3.23.50, we also write info on allocated + size in the tablespace */ + + mach_write_to_4(buf + 
LOG_CHECKPOINT_FSP_FREE_LIMIT, + log_fsp_current_free_limit); + + mach_write_to_4(buf + LOG_CHECKPOINT_FSP_MAGIC_N, + LOG_CHECKPOINT_FSP_MAGIC_N_VAL); + /* We alternate the physical place of the checkpoint info in the first log file */ @@ -1428,6 +1530,48 @@ log_group_checkpoint( } } +/********************************************************** +Writes info to a buffer of a log group when log files are created in +backup restoration. */ + +void +log_reset_first_header_and_checkpoint( +/*==================================*/ + byte* hdr_buf,/* in: buffer which will be written to the start + of the first log file */ + dulint lsn) /* in: lsn of the start of the first log file + + LOG_BLOCK_HDR_SIZE */ +{ + ulint fold; + byte* buf; + + mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0); + mach_write_to_8(hdr_buf + LOG_FILE_START_LSN, lsn); + + buf = hdr_buf + LOG_CHECKPOINT_1; + + mach_write_to_8(buf + LOG_CHECKPOINT_NO, ut_dulint_zero); + mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn); + + mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET, + LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE); + + mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024); + + mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, ut_dulint_max); + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold); + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold); + + /* Starting from InnoDB-3.23.50, we should also write info on + allocated size in the tablespace, but unfortunately we do not + know it here */ +} + /********************************************************** Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. 
*/ @@ -2795,7 +2939,10 @@ log_check_log_recs( ut_memcpy(scan_buf, start, end - start); - recv_scan_log_recs(FALSE, scan_buf, end - start, + recv_scan_log_recs(TRUE, + buf_pool_get_curr_size() - + RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + FALSE, scan_buf, end - start, ut_dulint_align_down(buf_start_lsn, OS_FILE_LOG_BLOCK_SIZE), &contiguous_lsn, &scanned_lsn); diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index 29e87c7572b..c31719f7bb0 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -1,7 +1,7 @@ /****************************************************** Recovery -(c) 1997 InnoDB Oy +(c) 1997 Innobase Oy Created 9/20/1997 Heikki Tuuri *******************************************************/ @@ -33,13 +33,6 @@ Created 9/20/1997 Heikki Tuuri #include "dict0boot.h" #include "fil0fil.h" -/* Size of block reads when the log groups are scanned forward to do a -roll-forward */ -#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE) - -/* Size of the parsing buffer */ -#define RECV_PARSING_BUF_SIZE LOG_BUFFER_SIZE - /* Log records are stored in the hash table in chunks at most of this size; this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */ #define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t)) @@ -69,6 +62,9 @@ ibool recv_no_ibuf_operations = FALSE; log scan */ ulint recv_scan_print_counter = 0; +ibool recv_is_from_backup = FALSE; + + /************************************************************ Creates the recovery system. */ @@ -94,8 +90,11 @@ recv_sys_create(void) Inits the recovery system for a recovery operation. 
*/ void -recv_sys_init(void) -/*===============*/ +recv_sys_init( +/*==========*/ + ibool recover_from_backup, /* in: TRUE if this is called + to recover from a hot backup */ + ulint available_memory) /* in: available memory in bytes */ { if (recv_sys->heap != NULL) { @@ -104,13 +103,18 @@ recv_sys_init(void) mutex_enter(&(recv_sys->mutex)); - recv_sys->heap = mem_heap_create_in_buffer(256); + if (!recover_from_backup) { + recv_sys->heap = mem_heap_create_in_buffer(256); + } else { + recv_sys->heap = mem_heap_create(256); + recv_is_from_backup = TRUE; + } recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE); recv_sys->len = 0; recv_sys->recovered_offset = 0; - recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 64); + recv_sys->addr_hash = hash_create(available_memory / 64); recv_sys->n_addrs = 0; recv_sys->apply_log_recs = FALSE; @@ -337,7 +341,7 @@ recv_synchronize_groups( start_lsn = ut_dulint_align_down(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); end_lsn = ut_dulint_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); - ut_ad(ut_dulint_cmp(start_lsn, end_lsn) != 0); + ut_a(ut_dulint_cmp(start_lsn, end_lsn) != 0); log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block, up_to_date_group, start_lsn, end_lsn); @@ -377,6 +381,35 @@ recv_synchronize_groups( mutex_enter(&(log_sys->mutex)); } +/*************************************************************************** +Checks the consistency of the checkpoint info */ +static +ibool +recv_check_cp_is_consistent( +/*========================*/ + /* out: TRUE if ok */ + byte* buf) /* in: buffer containing checkpoint info */ +{ + ulint fold; + + fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); + + if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf + + LOG_CHECKPOINT_CHECKSUM_1)) { + return(FALSE); + } + + fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, + LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); + + if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf + + LOG_CHECKPOINT_CHECKSUM_2)) { + return(FALSE); + } + + 
return(TRUE); +} + /************************************************************ Looks for the maximum consistent checkpoint from the log groups. */ static @@ -392,7 +425,6 @@ recv_find_max_checkpoint( dulint max_no; dulint checkpoint_no; ulint field; - ulint fold; byte* buf; group = UT_LIST_GET_FIRST(log_sys->log_groups); @@ -410,17 +442,11 @@ recv_find_max_checkpoint( log_group_read_checkpoint_info(group, field); - /* Check the consistency of the checkpoint info */ - fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); - - if ((fold & 0xFFFFFFFF) - != mach_read_from_4(buf - + LOG_CHECKPOINT_CHECKSUM_1)) { + if (!recv_check_cp_is_consistent(buf)) { if (log_debug_writes) { fprintf(stderr, - "InnoDB: Checkpoint in group %lu at %lu invalid, %lu, %lu\n", + "InnoDB: Checkpoint in group %lu at %lu invalid, %lu\n", group->id, field, - fold & 0xFFFFFFFF, mach_read_from_4(buf + LOG_CHECKPOINT_CHECKSUM_1)); @@ -429,23 +455,6 @@ recv_find_max_checkpoint( goto not_consistent; } - fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, - LOG_CHECKPOINT_CHECKSUM_2 - - LOG_CHECKPOINT_LSN); - if ((fold & 0xFFFFFFFF) - != mach_read_from_4(buf - + LOG_CHECKPOINT_CHECKSUM_2)) { - if (log_debug_writes) { - fprintf(stderr, - "InnoDB: Checkpoint in group %lu at %lu invalid, %lu, %lu\n", - group->id, field, - fold & 0xFFFFFFFF, - mach_read_from_4(buf - + LOG_CHECKPOINT_CHECKSUM_2)); - } - goto not_consistent; - } - group->state = LOG_GROUP_OK; group->lsn = mach_read_from_8(buf @@ -476,7 +485,13 @@ recv_find_max_checkpoint( if (*max_group == NULL) { - fprintf(stderr, "InnoDB: No valid checkpoint found\n"); + fprintf(stderr, +"InnoDB: No valid checkpoint found.\n" +"InnoDB: If this error appears when you are creating an InnoDB database,\n" +"InnoDB: the problem may be that during an earlier attempt you managed\n" +"InnoDB: to create the InnoDB data files, but log file creation failed.\n" +"InnoDB: If that is the case, please refer to section 3.1 of\n" +"InnoDB: 
http://www.innodb.com/ibman.html\n"); return(DB_ERROR); } @@ -484,6 +499,162 @@ recv_find_max_checkpoint( return(DB_SUCCESS); } +/*********************************************************************** +Reads the checkpoint info needed in hot backup. */ + +ibool +recv_read_cp_info_for_backup( +/*=========================*/ + /* out: TRUE if success */ + byte* hdr, /* in: buffer containing the log group header */ + dulint* lsn, /* out: checkpoint lsn */ + ulint* offset, /* out: checkpoint offset in the log group */ + ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database + is running with < version 3.23.50 of InnoDB */ + dulint* cp_no, /* out: checkpoint number */ + dulint* first_header_lsn) + /* out: lsn of of the start of the first log file */ +{ + ulint max_cp = 0; + dulint max_cp_no = ut_dulint_zero; + byte* cp_buf; + + cp_buf = hdr + LOG_CHECKPOINT_1; + + if (recv_check_cp_is_consistent(cp_buf)) { + max_cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO); + max_cp = LOG_CHECKPOINT_1; + } + + cp_buf = hdr + LOG_CHECKPOINT_2; + + if (recv_check_cp_is_consistent(cp_buf)) { + if (ut_dulint_cmp(mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO), + max_cp_no) > 0) { + max_cp = LOG_CHECKPOINT_2; + } + } + + if (max_cp == 0) { + return(FALSE); + } + + cp_buf = hdr + max_cp; + + *lsn = mach_read_from_8(cp_buf + LOG_CHECKPOINT_LSN); + *offset = mach_read_from_4(cp_buf + LOG_CHECKPOINT_OFFSET); + + /* If the user is running a pre-3.23.50 version of InnoDB, its + checkpoint data does not contain the fsp limit info */ + if (mach_read_from_4(cp_buf + LOG_CHECKPOINT_FSP_MAGIC_N) + == LOG_CHECKPOINT_FSP_MAGIC_N_VAL) { + + *fsp_limit = mach_read_from_4( + cp_buf + LOG_CHECKPOINT_FSP_FREE_LIMIT); + + if (*fsp_limit == 0) { + *fsp_limit = 1000000000; + } + } else { + *fsp_limit = 1000000000; + } + +/* printf("fsp limit %lu MB\n", *fsp_limit); */ + + *cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO); + + *first_header_lsn = mach_read_from_8(hdr + LOG_FILE_START_LSN); + + 
return(TRUE); +} + +/*********************************************************************** +Scans the log segment and n_bytes_scanned is set to the length of valid +log scanned. */ + +void +recv_scan_log_seg_for_backup( +/*=========================*/ + byte* buf, /* in: buffer containing log data */ + ulint buf_len, /* in: data length in that buffer */ + dulint* scanned_lsn, /* in/out: lsn of buffer start, + we return scanned lsn */ + ulint* scanned_checkpoint_no, + /* in/out: 4 lowest bytes of the + highest scanned checkpoint number so + far */ + ulint* n_bytes_scanned)/* out: how much we were able to + scan, smaller than buf_len if log + data ended here */ +{ + ulint data_len; + byte* log_block; + ulint no; + + *n_bytes_scanned = 0; + + for (log_block = buf; log_block < buf + buf_len; + log_block += OS_FILE_LOG_BLOCK_SIZE) { + + no = log_block_get_hdr_no(log_block); + + /* fprintf(stderr, "Log block header no %lu\n", no); */ + + if (no != log_block_get_trl_no(log_block) + || no != log_block_convert_lsn_to_no(*scanned_lsn)) { + +/* printf( +"Log block n:o %lu, trailer n:o %lu, scanned lsn n:o %lu\n", + no, log_block_get_trl_no(log_block), + log_block_convert_lsn_to_no(*scanned_lsn)); +*/ + /* Garbage or an incompletely written log block */ + + log_block += OS_FILE_LOG_BLOCK_SIZE; + +/* printf( +"Next log block n:o %lu, trailer n:o %lu\n", + log_block_get_hdr_no(log_block), + log_block_get_trl_no(log_block)); +*/ + break; + } + + if (*scanned_checkpoint_no > 0 + && log_block_get_checkpoint_no(log_block) + < *scanned_checkpoint_no + && *scanned_checkpoint_no + - log_block_get_checkpoint_no(log_block) + > 0x80000000) { + + /* Garbage from a log buffer flush which was made + before the most recent database recovery */ + + printf("Scanned cp n:o %lu, block cp n:o %lu\n", + *scanned_checkpoint_no, + log_block_get_checkpoint_no(log_block)); + + break; + } + + data_len = log_block_get_data_len(log_block); + + *scanned_checkpoint_no + = 
log_block_get_checkpoint_no(log_block); + *scanned_lsn = ut_dulint_add(*scanned_lsn, data_len); + + *n_bytes_scanned += data_len; + + if (data_len < OS_FILE_LOG_BLOCK_SIZE) { + /* Log data ends here */ + + /* printf("Log block data len %lu\n", data_len); */ + + break; + } + } +} + /*********************************************************************** Tries to parse a single log record body and also applies it to a page if specified. */ @@ -625,7 +796,6 @@ recv_get_fil_addr_struct( recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, recv_hash(space, page_no)); - while (recv_addr) { if ((recv_addr->space == space) && (recv_addr->page_no == page_no)) { @@ -755,6 +925,10 @@ read in, or also for a page already in the buffer pool. */ void recv_recover_page( /*==============*/ + ibool recover_backup, /* in: TRUE if we are recovering a backup + page: then we do not acquire any latches + since the page was read in outside the + buffer pool */ ibool just_read_in, /* in: TRUE if the i/o-handler calls this for a freshly read page */ page_t* page, /* in: buffer page */ @@ -799,39 +973,48 @@ recv_recover_page( mutex_exit(&(recv_sys->mutex)); - block = buf_block_align(page); - - if (just_read_in) { - /* Move the ownership of the x-latch on the page to this OS - thread, so that we can acquire a second x-latch on it. This - is needed for the operations to the page to pass the debug - checks. */ - - rw_lock_x_lock_move_ownership(&(block->lock)); - } - mtr_start(&mtr); - mtr_set_log_mode(&mtr, MTR_LOG_NONE); - success = buf_page_get_known_nowait(RW_X_LATCH, page, BUF_KEEP_OLD, + if (!recover_backup) { + block = buf_block_align(page); + + if (just_read_in) { + /* Move the ownership of the x-latch on the page to this OS + thread, so that we can acquire a second x-latch on it. This + is needed for the operations to the page to pass the debug + checks. 
*/ + + rw_lock_x_lock_move_ownership(&(block->lock)); + } + + success = buf_page_get_known_nowait(RW_X_LATCH, page, + BUF_KEEP_OLD, IB__FILE__, __LINE__, &mtr); - ut_a(success); + ut_a(success); - buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); + buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); + } /* Read the newest modification lsn from the page */ page_lsn = mach_read_from_8(page + FIL_PAGE_LSN); - /* It may be that the page has been modified in the buffer pool: read - the newest modification lsn there */ + if (!recover_backup) { + /* It may be that the page has been modified in the buffer + pool: read the newest modification lsn there */ - page_newest_lsn = buf_frame_get_newest_modification(page); + page_newest_lsn = buf_frame_get_newest_modification(page); - if (!ut_dulint_is_zero(page_newest_lsn)) { + if (!ut_dulint_is_zero(page_newest_lsn)) { - page_lsn = page_newest_lsn; + page_lsn = page_newest_lsn; + } + } else { + /* In recovery from a backup we do not use the buffer + pool */ + + page_newest_lsn = ut_dulint_zero; } modification_to_page = FALSE; @@ -852,13 +1035,13 @@ recv_recover_page( buf = ((byte*)(recv->data)) + sizeof(recv_data_t); } - if ((recv->type == MLOG_INIT_FILE_PAGE) - || (recv->type == MLOG_FULL_PAGE)) { - /* A new file page may has been taken into use, + if (recv->type == MLOG_INIT_FILE_PAGE + || recv->type == MLOG_FULL_PAGE) { + /* A new file page may have been taken into use, or we have stored the full contents of the page: in this case it may be that the original log record type was MLOG_INIT_FILE_PAGE, and we replaced it - with MLOG_FULL_PAGE, thus to we have to apply + with MLOG_FULL_PAGE, thus we have to apply any record of type MLOG_FULL_PAGE */ page_lsn = page_newest_lsn; @@ -885,6 +1068,13 @@ recv_recover_page( recv_parse_or_apply_log_rec_body(recv->type, buf, buf + recv->len, page, &mtr); + mach_write_to_8(page + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN, + ut_dulint_add(recv->start_lsn, + recv->len)); + mach_write_to_8(page + 
FIL_PAGE_LSN, + ut_dulint_add(recv->start_lsn, + recv->len)); } if (recv->len > RECV_DATA_BLOCK_SIZE) { @@ -903,7 +1093,7 @@ recv_recover_page( mutex_exit(&(recv_sys->mutex)); - if (modification_to_page) { + if (!recover_backup && modification_to_page) { buf_flush_recv_note_modification(block, start_lsn, end_lsn); } @@ -1038,8 +1228,8 @@ loop: buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); - recv_recover_page(FALSE, page, space, - page_no); + recv_recover_page(FALSE, FALSE, page, + space, page_no); mtr_commit(&mtr); } else { recv_read_in_area(space, page_no); @@ -1111,6 +1301,95 @@ loop: mutex_exit(&(recv_sys->mutex)); } +/*********************************************************************** +Applies log records in the hash table to a backup. */ + +void +recv_apply_log_recs_for_backup( +/*===========================*/ + ulint n_data_files, /* in: number of data files */ + char** data_files, /* in: array containing the paths to the + data files */ + ulint* file_sizes) /* in: sizes of the data files in database + pages */ +{ + recv_addr_t* recv_addr; + os_file_t data_file; + ulint n_pages_total = 0; + ulint nth_file = 0; + ulint nth_page_in_file= 0; + byte* page; + ibool success; + ulint i; + + recv_sys->apply_log_recs = TRUE; + recv_sys->apply_batch_on = TRUE; + + page = buf_pool->frame_zero; + + for (i = 0; i < n_data_files; i++) { + n_pages_total += file_sizes[i]; + } + + printf( +"InnoDB: Starting an apply batch of log records to the database...\n" +"InnoDB: Progress in percents: "); + + for (i = 0; i < n_pages_total; i++) { + + if (i == 0 || nth_page_in_file == file_sizes[nth_file]) { + if (i != 0) { + nth_file++; + nth_page_in_file = 0; + os_file_flush(data_file); + os_file_close(data_file); + } + + data_file = os_file_create_simple(data_files[nth_file], + OS_FILE_OPEN, + OS_FILE_READ_WRITE, + &success); + ut_a(success); + } + + recv_addr = recv_get_fil_addr_struct(0, i); + + if (recv_addr != NULL) { + os_file_read(data_file, page, + (nth_page_in_file << 
UNIV_PAGE_SIZE_SHIFT) + & 0xFFFFFFFF, + nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT), + UNIV_PAGE_SIZE); + + recv_recover_page(TRUE, FALSE, page, 0, i); + + buf_flush_init_for_writing(page, + mach_read_from_8(page + FIL_PAGE_LSN), + 0, i); + + os_file_write(data_files[nth_file], + data_file, page, + (nth_page_in_file << UNIV_PAGE_SIZE_SHIFT) + & 0xFFFFFFFF, + nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT), + UNIV_PAGE_SIZE); + } + + if ((100 * i) / n_pages_total + != (100 * (i + 1)) / n_pages_total) { + printf("%lu ", (100 * i) / n_pages_total); + fflush(stdout); + } + + nth_page_in_file++; + } + + os_file_flush(data_file); + os_file_close(data_file); + + recv_sys_empty_hash(); +} + /*********************************************************************** In the debug version, updates the replica of a file page, based on a log record. */ @@ -1430,12 +1709,13 @@ recv_check_incomplete_log_recs( /*********************************************************** Parses log records from a buffer and stores them to a hash table to wait -merging to file pages. If the hash table becomes too full, applies it -automatically to file pages. */ - -void +merging to file pages. 
*/ +static +ibool recv_parse_log_recs( /*================*/ + /* out: TRUE if the hash table of parsed log + records became full */ ibool store_to_hash) /* in: TRUE if the records should be stored to the hash table; this is set to FALSE if just debug checking is needed */ @@ -1462,7 +1742,7 @@ loop: if (ptr == end_ptr) { - return; + return(FALSE); } single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG; @@ -1476,7 +1756,7 @@ loop: &page_no, &body); if (len == 0) { - return; + return(FALSE); } new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len); @@ -1487,7 +1767,7 @@ loop: that also the next log block should have been scanned in */ - return; + return(FALSE); } recv_sys->recovered_offset += len; @@ -1529,7 +1809,7 @@ loop: &page_no, &body); if (len == 0) { - return; + return(FALSE); } if ((!store_to_hash) && (type != MLOG_MULTI_REC_END)) { @@ -1570,27 +1850,9 @@ loop: that also the next log block should have been scanned in */ - return; + return(FALSE); } - if (2 * n_recs * (sizeof(recv_t) + sizeof(recv_addr_t)) - + total_len - + mem_heap_get_size(recv_sys->heap) - + RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE - > buf_pool_get_curr_size()) { - - /* Hash table of log records will grow too big: - empty it */ - - recv_apply_hashed_log_recs(FALSE); - } - - ut_ad(2 * n_recs * (sizeof(recv_t) + sizeof(recv_addr_t)) - + total_len - + mem_heap_get_size(recv_sys->heap) - + RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE - < buf_pool_get_curr_size()); - /* Add all the records to the hash table */ ptr = recv_sys->buf + recv_sys->recovered_offset; @@ -1627,18 +1889,7 @@ loop: ptr += len; } } - - if (store_to_hash && buf_get_free_list_len() - < RECV_POOL_N_FREE_BLOCKS) { - - /* Hash table of log records has grown too big: empty it; - FALSE means no ibuf operations allowed, as we cannot add - new records to the log yet: they would be produced by ibuf - operations */ - - recv_apply_hashed_log_recs(FALSE); - } - + goto loop; } @@ -1713,7 +1964,7 @@ recv_sys_add_to_parsing_buf( recv_sys->len 
+= end_offset - start_offset; - ut_ad(recv_sys->len <= RECV_PARSING_BUF_SIZE); + ut_a(recv_sys->len <= RECV_PARSING_BUF_SIZE); } return(TRUE); @@ -1743,6 +1994,13 @@ recv_scan_log_recs( /*===============*/ /* out: TRUE if limit_lsn has been reached, or not able to scan any more in this log group */ + ibool apply_automatically,/* in: TRUE if we want this function to + apply log records automatically when the + hash table becomes full; in the hot backup tool + the tool does the applying, not this + function */ + ulint available_memory,/* in: we let the hash table of recs to grow + to this size, at the maximum */ ibool store_to_hash, /* in: TRUE if the records should be stored to the hash table; this is set to FALSE if just debug checking is needed */ @@ -1764,7 +2022,9 @@ recv_scan_log_recs( ut_ad(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0); ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0); ut_ad(len > 0); - + ut_a(apply_automatically <= TRUE); + ut_a(store_to_hash <= TRUE); + finished = FALSE; log_block = buf; @@ -1845,6 +2105,13 @@ recv_scan_log_recs( /* We were able to find more log data: add it to the parsing buffer if parse_start_lsn is already non-zero */ + if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE + >= RECV_PARSING_BUF_SIZE) { + fprintf(stderr, +"InnoDB: Error: log parsing buffer overflow. 
Recovery may have failed!\n"); + finished = TRUE; + } + more_data = recv_sys_add_to_parsing_buf(log_block, scanned_lsn); recv_sys->scanned_lsn = scanned_lsn; @@ -1863,25 +2130,36 @@ recv_scan_log_recs( *group_scanned_lsn = scanned_lsn; - if (more_data) { + if (recv_needed_recovery || recv_is_from_backup) { recv_scan_print_counter++; - if (recv_scan_print_counter < 10 - || (recv_scan_print_counter % 10 == 0)) { + if (finished || (recv_scan_print_counter % 80 == 0)) { + fprintf(stderr, "InnoDB: Doing recovery: scanned up to log sequence number %lu %lu\n", ut_dulint_get_high(*group_scanned_lsn), ut_dulint_get_low(*group_scanned_lsn)); - if (recv_scan_print_counter == 10) { - fprintf(stderr, -"InnoDB: After this prints a line for every 10th scan sweep:\n"); - } } + } + if (more_data) { /* Try to parse more log records */ recv_parse_log_recs(store_to_hash); + if (store_to_hash && mem_heap_get_size(recv_sys->heap) + > available_memory + && apply_automatically) { + + /* Hash table of log records has grown too big: + empty it; FALSE means no ibuf operations + allowed, as we cannot add new records to the + log yet: they would be produced by ibuf + operations */ + + recv_apply_hashed_log_recs(FALSE); + } + if (recv_sys->recovered_offset > RECV_PARSING_BUF_SIZE / 4) { /* Move parsing buffer data to the buffer start */ @@ -1918,10 +2196,12 @@ recv_group_scan_log_recs( log_group_read_log_seg(LOG_RECOVER, log_sys->buf, group, start_lsn, end_lsn); - finished = recv_scan_log_recs(TRUE, log_sys->buf, - RECV_SCAN_SIZE, start_lsn, - contiguous_lsn, - group_scanned_lsn); + finished = recv_scan_log_recs(TRUE, + buf_pool_get_curr_size() + - RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + TRUE, log_sys->buf, + RECV_SCAN_SIZE, start_lsn, + contiguous_lsn, group_scanned_lsn); start_lsn = end_lsn; } @@ -1969,7 +2249,7 @@ recv_recovery_from_checkpoint_start( if (type == LOG_CHECKPOINT) { recv_sys_create(); - recv_sys_init(); + recv_sys_init(FALSE, buf_pool_get_curr_size()); } if 
(srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { @@ -2280,6 +2560,84 @@ recv_reset_logs( mutex_enter(&(log_sys->mutex)); } +/********************************************************** +Creates new log files after a backup has been restored. */ + +void +recv_reset_log_files_for_backup( +/*============================*/ + char* log_dir, /* in: log file directory path */ + ulint n_log_files, /* in: number of log files */ + ulint log_file_size, /* in: log file size */ + dulint lsn) /* in: new start lsn, must be divisible by + OS_FILE_LOG_BLOCK_SIZE */ +{ + os_file_t log_file; + ibool success; + byte* buf; + ulint i; + char name[5000]; + + buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + + for (i = 0; i < n_log_files; i++) { + + sprintf(name, "%sib_logfile%lu", log_dir, i); + + log_file = os_file_create_simple(name, OS_FILE_CREATE, + OS_FILE_READ_WRITE, &success); + if (!success) { + printf( +"InnoDB: Cannot create %s. Check that the file does not exist yet.\n", name); + + exit(1); + } + + printf( +"Setting log file size to %lu %lu\n", ut_get_high32(log_file_size), + log_file_size & 0xFFFFFFFF); + + success = os_file_set_size(name, log_file, + log_file_size & 0xFFFFFFFF, + ut_get_high32(log_file_size)); + + if (!success) { + printf( +"InnoDB: Cannot set %s size to %lu %lu\n", name, ut_get_high32(log_file_size), + log_file_size & 0xFFFFFFFF); + exit(1); + } + + os_file_flush(log_file); + os_file_close(log_file); + } + + /* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */ + + log_reset_first_header_and_checkpoint(buf, + ut_dulint_add(lsn, LOG_BLOCK_HDR_SIZE)); + + log_block_init(buf + LOG_FILE_HDR_SIZE, lsn); + log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE, + LOG_BLOCK_HDR_SIZE); + sprintf(name, "%sib_logfile%lu", log_dir, 0); + + log_file = os_file_create_simple(name, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &success); + if (!success) { + printf("InnoDB: Cannot open %s.\n", name); + + exit(1); + } + + os_file_write(name, log_file, buf, 0, 
0, + LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE); + os_file_flush(log_file); + os_file_close(log_file); + + ut_free(buf); +} + /********************************************************** Reads from the archive of a log group and performs recovery. */ static @@ -2296,13 +2654,13 @@ log_group_recover_from_archive_file( dulint dummy_lsn; dulint scanned_lsn; ulint len; - char name[10000]; ibool ret; byte* buf; ulint read_offset; ulint file_size; ulint file_size_high; int input_char; + char name[10000]; try_open_again: buf = log_sys->buf; @@ -2438,9 +2796,11 @@ ask_again: group->archive_space_id, read_offset / UNIV_PAGE_SIZE, read_offset % UNIV_PAGE_SIZE, len, buf, NULL); - - ret = recv_scan_log_recs(TRUE, buf, len, start_lsn, - &dummy_lsn, &scanned_lsn); + ret = recv_scan_log_recs(TRUE, + buf_pool_get_curr_size() - + RECV_POOL_N_FREE_BLOCKS * UNIV_PAGE_SIZE, + TRUE, buf, len, start_lsn, + &dummy_lsn, &scanned_lsn); if (ut_dulint_cmp(scanned_lsn, file_end_lsn) == 0) { @@ -2485,7 +2845,7 @@ recv_recovery_from_archive_start( ulint err; recv_sys_create(); - recv_sys_init(); + recv_sys_init(FALSE, buf_pool_get_curr_size()); sync_order_checks_on = TRUE; diff --git a/innobase/mem/mem0mem.c b/innobase/mem/mem0mem.c index 0680968a7eb..94cf85dfd63 100644 --- a/innobase/mem/mem0mem.c +++ b/innobase/mem/mem0mem.c @@ -234,7 +234,8 @@ mem_heap_add_block( new_size = 2 * mem_block_get_len(block); if (heap->type != MEM_HEAP_DYNAMIC) { - ut_ad(n <= MEM_MAX_ALLOC_IN_BUF); + /* From the buffer pool we allocate buffer frames */ + ut_a(n <= MEM_MAX_ALLOC_IN_BUF); if (new_size > MEM_MAX_ALLOC_IN_BUF) { new_size = MEM_MAX_ALLOC_IN_BUF; @@ -249,7 +250,7 @@ mem_heap_add_block( } new_block = mem_heap_create_block(heap, new_size, NULL, heap->type, - heap->file_name, heap->line); + heap->file_name, heap->line); if (new_block == NULL) { return(NULL); diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 2ae59e26f35..dd51227bbf6 100644 --- a/innobase/os/os0file.c +++ 
b/innobase/os/os0file.c @@ -10,17 +10,22 @@ Created 10/21/1995 Heikki Tuuri #include "os0sync.h" #include "ut0mem.h" #include "srv0srv.h" -#include "trx0sys.h" #include "fil0fil.h" #undef HAVE_FDATASYNC +#undef UNIV_NON_BUFFERED_IO + #ifdef POSIX_ASYNC_IO /* We assume in this case that the OS has standard Posix aio (at least SunOS 2.6, HP-UX 11i and AIX 4.3 have) */ #endif +/* If the following is set to TRUE, we do not call os_file_flush in every +os_file_write. We can set this TRUE if the doublewrite buffer is used. */ +ibool os_do_not_call_flush_at_each_write = FALSE; + /* We use these mutexes to protect lseek + file i/o operation, if the OS does not provide an atomic pread or pwrite, or similar */ #define OS_FILE_N_SEEK_MUTEXES 16 @@ -118,6 +123,9 @@ ulint os_n_file_writes_old = 0; ulint os_n_fsyncs_old = 0; time_t os_last_printout; +ibool os_has_said_disk_full = FALSE; + + /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -167,27 +175,28 @@ os_file_get_last_error(void) err = (ulint) GetLastError(); - if (err != ERROR_FILE_EXISTS) { - fprintf(stderr, - "InnoDB: Operating system error number %li in a file operation.\n" + if (err != ERROR_FILE_EXISTS && err != ERROR_DISK_FULL) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %li in a file operation.\n" "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n", (long) err); - if (err == ERROR_PATH_NOT_FOUND) { + if (err == ERROR_PATH_NOT_FOUND) { fprintf(stderr, "InnoDB: The error means the system cannot find the path specified.\n" "InnoDB: In installation you must create directories yourself, InnoDB\n" "InnoDB: does not create them.\n"); - } else if (err == ERROR_ACCESS_DENIED) { + } else if (err == ERROR_ACCESS_DENIED) { fprintf(stderr, "InnoDB: The error means mysqld does not have the access rights to\n" "InnoDB: the directory. 
It may also be you have created a subdirectory\n" "InnoDB: of the same name as a data file.\n"); - } else { + } else { fprintf(stderr, "InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.html\n" "InnoDB: what the error number means.\n"); - } + } } if (err == ERROR_FILE_NOT_FOUND) { @@ -202,26 +211,28 @@ os_file_get_last_error(void) #else err = (ulint) errno; - if (err != EEXIST) { - fprintf(stderr, - "InnoDB: Operating system error number %li in a file operation.\n" + if (err != EEXIST && err != ENOSPC ) { + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Operating system error number %li in a file operation.\n" "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n", (long) err); - if (err == ENOENT) { + if (err == ENOENT) { fprintf(stderr, "InnoDB: The error means the system cannot find the path specified.\n" "InnoDB: In installation you must create directories yourself, InnoDB\n" "InnoDB: does not create them.\n"); - } else if (err == EACCES) { + } else if (err == EACCES) { fprintf(stderr, "InnoDB: The error means mysqld does not have the access rights to\n" "InnoDB: the directory.\n"); - } else { + } else { fprintf(stderr, "InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.html\n" "InnoDB: what the error number means or use the perror program of MySQL.\n"); - } + } } if (err == ENOSPC ) { @@ -259,18 +270,26 @@ os_file_handle_error( err = os_file_get_last_error(); if (err == OS_FILE_DISK_FULL) { - fprintf(stderr, "\n"); - if (name) { - fprintf(stderr, - "InnoDB: Encountered a problem with file %s.\n", - name); - } - fprintf(stderr, - "InnoDB: Cannot continue operation.\n" - "InnoDB: Disk is full. 
Try to clean the disk to free space.\n" - "InnoDB: Delete a possible created file and restart.\n"); + /* We only print a warning about disk full once */ - exit(1); + if (os_has_said_disk_full) { + + return(FALSE); + } + + if (name) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Encountered a problem with file %s\n", name); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Disk is full. Try to clean the disk to free space.\n"); + + os_has_said_disk_full = TRUE; + + return(FALSE); } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) { return(TRUE); @@ -290,6 +309,130 @@ os_file_handle_error( return(FALSE); } +/******************************************************************** +Creates the seek mutexes used in positioned reads and writes. */ + +void +os_io_init_simple(void) +/*===================*/ +{ + ulint i; + + for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_file_seek_mutexes[i] = os_mutex_create(NULL); + } +} + +/******************************************************************** +A simple function to open or create a file. 
*/ + +os_file_t +os_file_create_simple( +/*==================*/ + /* out, own: handle to the file, not defined if error, + error number can be retrieved with os_get_last_error */ + char* name, /* in: name of the file or path as a null-terminated + string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened + (if does not exist, error), or OS_FILE_CREATE if a new + file is created (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD create_flag; + DWORD access; + DWORD attributes = 0; + ibool retry; + +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + create_flag = 0; + ut_error; + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else { + access = 0; + ut_error; + } + + file = CreateFile(name, + access, + FILE_SHARE_READ | FILE_SHARE_WRITE, + /* file can be read and written + also by other processes */ + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /* no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + + retry = os_file_handle_error(file, name); + + if (retry) { + goto try_again; + } + } else { + *success = TRUE; + } + + return(file); +#else + os_file_t file; + int create_flag; + ibool retry; + +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else { + create_flag = 0; + ut_error; + } + + if (create_mode == OS_FILE_CREATE) { + file = open(name, create_flag, S_IRUSR | S_IWUSR | S_IRGRP + | 
S_IWGRP | S_IROTH | S_IWOTH); + } else { + file = open(name, create_flag); + } + + if (file == -1) { + *success = FALSE; + + retry = os_file_handle_error(file, name); + + if (retry) { + goto try_again; + } + } else { + *success = TRUE; + } + + return(file); +#endif +} /******************************************************************** Opens an existing file or creates a new. */ @@ -355,8 +498,9 @@ try_again: file = CreateFile(name, GENERIC_READ | GENERIC_WRITE, /* read and write access */ - FILE_SHARE_READ,/* file can be read by other - processes */ + FILE_SHARE_READ | FILE_SHARE_WRITE, + /* file can be read and written + also by other processes */ NULL, /* default security attributes */ create_flag, attributes, @@ -494,6 +638,11 @@ os_file_get_size( offs = lseek(file, 0, SEEK_END); + if (offs == ((off_t)-1)) { + + return(FALSE); + } + if (sizeof(off_t) > 4) { *size = (ulint)(offs & 0xFFFFFFFF); *size_high = (ulint)(offs >> 32); @@ -524,13 +673,11 @@ os_file_set_size( ib_longlong low; ulint n_bytes; ibool ret; - ibool retry; byte* buf; ulint i; ut_a(size == (size & 0xFFFFFFFF)); -try_again: /* We use a very big 8 MB buffer in writing because Linux may be extremely slow in fsync on 1 MB writes */ @@ -571,14 +718,6 @@ try_again: } error_handling: - retry = os_file_handle_error(file, name); - - if (retry) { - goto try_again; - } - - ut_error; - return(FALSE); } @@ -725,8 +864,7 @@ os_file_pwrite( 64-bit address */ if (sizeof(off_t) > 4) { - offs = (off_t)offset + (((off_t)offset_high) << 32); - + offs = (off_t)offset + (((off_t)offset_high) << 32); } else { offs = (off_t)offset; @@ -743,8 +881,8 @@ os_file_pwrite( if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC && srv_unix_file_flush_method != SRV_UNIX_NOSYNC - && !trx_doublewrite) { - + && !os_do_not_call_flush_at_each_write) { + /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially physically written to disk. 
*/ @@ -774,7 +912,7 @@ os_file_pwrite( if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC && srv_unix_file_flush_method != SRV_UNIX_NOSYNC - && !trx_doublewrite) { + && !os_do_not_call_flush_at_each_write) { /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially @@ -899,13 +1037,12 @@ os_file_write( DWORD ret2; DWORD low; DWORD high; - ibool retry; ulint i; ut_a((offset & 0xFFFFFFFF) == offset); os_n_file_writes++; -try_again: + ut_ad(file); ut_ad(buf); ut_ad(n > 0); @@ -924,7 +1061,15 @@ try_again: os_mutex_exit(os_file_seek_mutexes[i]); - goto error_handling; + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: File pointer positioning to file %s failed at\n" +"InnoDB: offset %lu %lu. Operating system error number %lu.\n", + name, offset_high, offset, + (ulint)GetLastError()); + + return(FALSE); } ret = WriteFile(file, buf, n, &len, NULL); @@ -932,38 +1077,61 @@ try_again: /* Always do fsync to reduce the probability that when the OS crashes, a database page is only partially physically written to disk. 
*/ - if (!trx_doublewrite) { + if (!os_do_not_call_flush_at_each_write) { ut_a(TRUE == os_file_flush(file)); } os_mutex_exit(os_file_seek_mutexes[i]); if (ret && len == n) { + return(TRUE); } + + if (!os_has_said_disk_full) { + + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: Write to file %s failed at offset %lu %lu.\n" +"InnoDB: %lu bytes should have been written, only %lu were written.\n" +"InnoDB: Operating system error number %lu.\n" +"InnoDB: Check that your OS and file system support files of this size.\n" +"InnoDB: Check also the disk is not full or a disk quota exceeded.\n", + name, offset_high, offset, n, len, + (ulint)GetLastError()); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); #else - ibool retry; ssize_t ret; -try_again: ret = os_file_pwrite(file, buf, n, offset, offset_high); if ((ulint)ret == n) { + return(TRUE); } -#endif -#ifdef __WIN__ -error_handling: -#endif - retry = os_file_handle_error(file, name); - if (retry) { - goto try_again; - } + if (!os_has_said_disk_full) { - ut_error; + ut_print_timestamp(stderr); - return(FALSE); + fprintf(stderr, +" InnoDB: Error: Write to file %s failed at offset %lu %lu.\n" +"InnoDB: %lu bytes should have been written, only %lu were written.\n" +"InnoDB: Operating system error number %lu.\n" +"InnoDB: Check that your OS and file system support files of this size.\n" +"InnoDB: Check also the disk is not full or a disk quota exceeded.\n", + name, offset_high, offset, n, ret, (ulint)errno); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#endif } /******************************************************************** @@ -1034,7 +1202,8 @@ os_aio_array_create( } /**************************************************************************** -Initializes the asynchronous io system. Creates separate aio array for +Initializes the asynchronous io system. Calls also os_io_init_simple. 
+Creates a separate aio array for non-ibuf read and write, a third aio array for the ibuf i/o, with just one segment, two aio arrays for log reads and writes with one segment, and a synchronous aio array of the specified size. The combined number of segments @@ -1061,6 +1230,8 @@ os_aio_init( ut_ad(n % n_segments == 0); ut_ad(n_segments >= 4); + os_io_init_simple(); + n_per_seg = n / n_segments; n_write_segs = (n_segments - 2) / 2; n_read_segs = n_segments - 2 - n_write_segs; @@ -1081,10 +1252,6 @@ os_aio_init( os_aio_validate(); - for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { - os_file_seek_mutexes[i] = os_mutex_create(NULL); - } - os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*)); for (i = 0; i < n_segments; i++) { @@ -1742,7 +1909,8 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - if (slot->type == OS_FILE_WRITE && !trx_doublewrite) { + if (slot->type == OS_FILE_WRITE + && !os_do_not_call_flush_at_each_write) { ut_a(TRUE == os_file_flush(slot->file)); } } else { @@ -1827,7 +1995,8 @@ os_aio_posix_handle( *message1 = slot->message1; *message2 = slot->message2; - if (slot->type == OS_FILE_WRITE && !trx_doublewrite) { + if (slot->type == OS_FILE_WRITE + && !os_do_not_call_flush_at_each_write) { ut_a(TRUE == os_file_flush(slot->file)); } diff --git a/innobase/rem/rem0cmp.c b/innobase/rem/rem0cmp.c index 60828b122ba..e4779b5f26b 100644 --- a/innobase/rem/rem0cmp.c +++ b/innobase/rem/rem0cmp.c @@ -55,7 +55,8 @@ cmp_debug_dtuple_rec_with_match( contains the value for current comparison */ /***************************************************************** This function is used to compare two data fields for which the data type -is such that we must use MySQL code to compare them. */ +is such that we must use MySQL code to compare them. The prototype here +must be a copy of the the one in ha_innobase.cc! 
*/ int innobase_mysql_cmp( diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index 3321c288e39..04df66c3bbd 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -391,7 +391,7 @@ row_ins_check_foreign_constraint( /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ - ibool check_ref,/* in: TRUE If we want to check that + ibool check_ref,/* in: TRUE if we want to check that the referenced table is ok, FALSE if we want to to check the foreign key table */ dict_foreign_t* foreign,/* in: foreign constraint; NOTE that the @@ -411,10 +411,23 @@ row_ins_check_foreign_constraint( ibool moved; int cmp; ulint err; + ulint i; mtr_t mtr; ut_ad(rw_lock_own(&dict_foreign_key_check_lock, RW_LOCK_SHARED)); + /* If any of the foreign key fields in entry is SQL NULL, we + suppress the foreign key check: this is compatible with Oracle, + for example */ + + for (i = 0; i < foreign->n_fields; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(DB_SUCCESS); + } + } + if (check_ref) { check_table = foreign->referenced_table; check_index = foreign->referenced_index; @@ -591,6 +604,8 @@ row_ins_scan_sec_index_for_duplicate( dtuple_t* entry, /* in: index entry */ que_thr_t* thr) /* in: query thread */ { + ulint n_unique; + ulint i; int cmp; ulint n_fields_cmp; rec_t* rec; @@ -599,6 +614,20 @@ row_ins_scan_sec_index_for_duplicate( ibool moved; mtr_t mtr; + n_unique = dict_index_get_n_unique(index); + + /* If the secondary index is unique, but one of the fields in the + n_unique first fields is NULL, a unique key violation cannot occur, + since we define NULL != NULL in this case */ + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(DB_SUCCESS); + } + } + mtr_start(&mtr); /* Store old value on n_fields_cmp */ diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index c4a844b3fa9..2aca698eebd 100644 --- 
a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -1881,6 +1881,28 @@ loop: return(err); } +/************************************************************************* +Checks if a table name contains the string "/#sql" which denotes temporary +tables in MySQL. */ +static +ibool +row_is_mysql_tmp_table_name( +/*========================*/ + /* out: TRUE if temporary table */ + char* name) /* in: table name in the form 'database/tablename' */ +{ + ulint i; + + for (i = 0; i <= ut_strlen(name) - 5; i++) { + if (ut_memcmp(name + i, "/#sql", 5) == 0) { + + return(TRUE); + } + } + + return(FALSE); +} + /************************************************************************* Renames a table for MySQL. */ @@ -1944,16 +1966,27 @@ row_rename_table_for_mysql( str2 = "';\nold_table_name := '"; - str3 = - "';\n" - "UPDATE SYS_TABLES SET NAME = new_table_name\n" - "WHERE NAME = old_table_name;\n" - "UPDATE SYS_FOREIGN SET FOR_NAME = new_table_name\n" - "WHERE FOR_NAME = old_table_name;\n" - "UPDATE SYS_FOREIGN SET REF_NAME = new_table_name\n" - "WHERE REF_NAME = old_table_name;\n" - "COMMIT WORK;\n" - "END;\n"; + if (row_is_mysql_tmp_table_name(new_name)) { + + /* We want to preserve the original foreign key + constraint definitions despite the name change */ + + str3 = + "';\n" + "UPDATE SYS_TABLES SET NAME = new_table_name\n" + "WHERE NAME = old_table_name;\n" + "END;\n"; + } else { + str3 = + "';\n" + "UPDATE SYS_TABLES SET NAME = new_table_name\n" + "WHERE NAME = old_table_name;\n" + "UPDATE SYS_FOREIGN SET FOR_NAME = new_table_name\n" + "WHERE FOR_NAME = old_table_name;\n" + "UPDATE SYS_FOREIGN SET REF_NAME = new_table_name\n" + "WHERE REF_NAME = old_table_name;\n" + "END;\n"; + } len = ut_strlen(str1); @@ -2028,7 +2061,32 @@ row_rename_table_for_mysql( trx_general_rollback_for_mysql(trx, FALSE, NULL); trx->error_state = DB_SUCCESS; } else { - ut_a(dict_table_rename_in_cache(table, new_name)); + ut_a(dict_table_rename_in_cache(table, new_name, + 
!row_is_mysql_tmp_table_name(new_name))); + + if (row_is_mysql_tmp_table_name(old_name)) { + + err = dict_load_foreigns(new_name); + + if (err != DB_SUCCESS) { + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: in ALTER TABLE table %s\n" + "InnoDB: has or is referenced in foreign key constraints\n" + "InnoDB: which are not compatible with the new table definition.\n", + new_name); + + ut_a(dict_table_rename_in_cache(table, + old_name, FALSE)); + + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, + NULL); + trx->error_state = DB_SUCCESS; + } + } } funct_exit: mutex_exit(&(dict_sys->mutex)); diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index 663a544faac..9ebd47c25bf 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -2233,7 +2233,7 @@ row_sel_get_clust_rec_for_mysql( (or old_vers) is not rec; in that case we must ignore such row because in our snapshot rec would not have existed. Remember that from rec we cannot see directly which transaction - id corrsponds to it: we have to go to the clustered index + id corresponds to it: we have to go to the clustered index record. 
A query where we want to fetch all rows where the secondary index value is in some interval would return a wrong result if we would not drop rows which we come to @@ -2244,6 +2244,12 @@ row_sel_get_clust_rec_for_mysql( && !row_sel_sec_rec_is_for_clust_rec(rec, sec_index, clust_rec, clust_index)) { clust_rec = NULL; + } else { +#ifdef UNIV_SEARCH_DEBUG + ut_a(clust_rec == NULL || + row_sel_sec_rec_is_for_clust_rec(rec, sec_index, + clust_rec, clust_index)); +#endif } } @@ -2399,7 +2405,12 @@ row_sel_try_search_shortcut_for_mysql( btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, pcur, - RW_S_LATCH, mtr); +#ifndef UNIV_SEARCH_DEBUG + RW_S_LATCH, +#else + 0, +#endif + mtr); rec = btr_pcur_get_rec(pcur); if (!page_rec_is_user_rec(rec)) { @@ -2623,15 +2634,18 @@ row_search_for_mysql( goto no_shortcut; } - +#ifndef UNIV_SEARCH_DEBUG if (!trx->has_search_latch) { rw_lock_s_lock(&btr_search_latch); trx->has_search_latch = TRUE; } - +#endif shortcut = row_sel_try_search_shortcut_for_mysql(&rec, prebuilt, &mtr); if (shortcut == SEL_FOUND) { +#ifdef UNIV_SEARCH_DEBUG + ut_a(0 == cmp_dtuple_rec(search_tuple, rec)); +#endif row_sel_store_mysql_rec(buf, prebuilt, rec); mtr_commit(&mtr); @@ -2793,7 +2807,9 @@ rec_loop: /* The record matches enough */ ut_ad(mode == PAGE_CUR_GE); - +#ifdef UNIV_SEARCH_DEBUG + ut_a(0 == cmp_dtuple_rec(search_tuple, rec)); +#endif } else if (match_mode == ROW_SEL_EXACT) { /* Test if the index record matches completely to search_tuple in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index 435cfa3485e..a566e29f2c3 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -142,7 +142,7 @@ try_again: /************************************************************************* Checks if possible foreign key constraints hold after a delete of the record -under pcur. NOTE that this function will temporarily commit mtr and lose +under pcur. 
NOTE that this function will temporarily commit mtr and lose the pcur position! */ static ulint diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index 553c012bf85..8afe1396f1b 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -69,13 +69,19 @@ char* srv_main_thread_op_info = ""; names, where the file name itself may also contain a path */ char* srv_data_home = NULL; -char* srv_logs_home = NULL; char* srv_arch_dir = NULL; ulint srv_n_data_files = 0; char** srv_data_file_names = NULL; ulint* srv_data_file_sizes = NULL; /* size in database pages */ +ibool srv_auto_extend_last_data_file = FALSE; /* if TRUE, then we + auto-extend the last data + file */ +ulint srv_last_file_size_max = 0; /* if != 0, this tells + the max size auto-extending + may increase the last data + file size */ ulint* srv_data_file_is_raw_partition = NULL; /* If the following is TRUE we do not allow inserts etc. This protects @@ -1596,7 +1602,7 @@ srv_read_initfile( /************************************************************************* Initializes the server. */ -static + void srv_init(void) /*==========*/ @@ -1664,7 +1670,7 @@ srv_init(void) /************************************************************************* Initializes the synchronization primitives, memory system, and the thread local storage. 
*/ -static + void srv_general_init(void) /*==================*/ @@ -1686,6 +1692,7 @@ srv_conc_enter_innodb( trx_t* trx) /* in: transaction object associated with the thread */ { + ibool has_slept = FALSE; srv_conc_slot_t* slot; ulint i; @@ -1703,7 +1710,7 @@ srv_conc_enter_innodb( return; } - +retry: os_fast_mutex_lock(&srv_conc_mutex); if (srv_conc_n_threads < (lint)srv_thread_concurrency) { @@ -1716,7 +1723,23 @@ srv_conc_enter_innodb( return; } + + /* If the transaction is not holding resources, let it sleep + for 100 milliseconds, and try again then */ + if (!has_slept && !trx->has_search_latch + && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) { + + has_slept = TRUE; /* We let it sleep only once to avoid + starvation */ + + os_fast_mutex_unlock(&srv_conc_mutex); + + os_thread_sleep(100000); + + goto retry; + } + /* Too many threads inside: put the current thread to a queue */ for (i = 0; i < OS_THREAD_MAX_N; i++) { @@ -1908,6 +1931,9 @@ srv_normalize_init_values(void) * ((1024 * 1024) / UNIV_PAGE_SIZE); } + srv_last_file_size_max = srv_last_file_size_max + * ((1024 * 1024) / UNIV_PAGE_SIZE); + srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE; srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE; diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index f9a13944bb5..1fcf8c76a5f 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -84,6 +84,308 @@ we may get an assertion failure in os0file.c */ #define SRV_LOG_SPACE_FIRST_ID 1000000000 +/************************************************************************* +Reads the data files and their sizes from a character string given in +the .cnf file. 
*/ + +ibool +srv_parse_data_file_paths_and_sizes( +/*================================*/ + /* out: TRUE if ok, FALSE if parsing + error */ + char* str, /* in: the data file path string */ + char*** data_file_names, /* out, own: array of data file + names */ + ulint** data_file_sizes, /* out, own: array of data file sizes + in megabytes */ + ulint** data_file_is_raw_partition,/* out, own: array of flags + showing which data files are raw + partitions */ + ulint* n_data_files, /* out: number of data files */ + ibool* is_auto_extending, /* out: TRUE if the last data file is + auto-extending */ + ulint* max_auto_extend_size) /* out: max auto extend size for the + last file if specified, 0 if not */ +{ + char* input_str; + char* endp; + char* path; + ulint size; + ulint i = 0; + + *is_auto_extending = FALSE; + *max_auto_extend_size = 0; + + input_str = str; + + /* First calculate the number of data files and check syntax: + path:size[M | G];path:size[M | G]... . Note that a Windows path may + contain a drive name and a ':'. 
*/ + + while (*str != '\0') { + path = str; + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/'))) { + str++; + } + + if (*str == '\0') { + return(FALSE); + } + + str++; + + size = strtoul(str, &endp, 10); + + str = endp; + + if (*str != 'M' && *str != 'G') { + size = size / (1024 * 1024); + } else if (*str == 'G') { + size = size * 1024; + str++; + } else { + str++; + } + + if (strlen(str) >= ut_strlen(":autoextend") + && 0 == ut_memcmp(str, ":autoextend", + ut_strlen(":autoextend"))) { + + str += ut_strlen(":autoextend"); + + if (strlen(str) >= ut_strlen(":max:") + && 0 == ut_memcmp(str, ":max:", + ut_strlen(":max:"))) { + + str += ut_strlen(":max:"); + + size = strtoul(str, &endp, 10); + + str = endp; + + if (*str != 'M' && *str != 'G') { + size = size / (1024 * 1024); + } else if (*str == 'G') { + size = size * 1024; + str++; + } else { + str++; + } + } + + if (*str != '\0') { + + return(FALSE); + } + } + + if (strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + str += 3; + } + + if (strlen(str) >= 3 + && *str == 'r' + && *(str + 1) == 'a' + && *(str + 2) == 'w') { + str += 3; + } + + if (size == 0) { + return(FALSE); + } + + i++; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + + return(FALSE); + } + } + + *data_file_names = (char**)ut_malloc(i * sizeof(void*)); + *data_file_sizes = (ulint*)ut_malloc(i * sizeof(ulint)); + *data_file_is_raw_partition = (ulint*)ut_malloc(i * sizeof(ulint)); + + *n_data_files = i; + + /* Then store the actual values to our arrays */ + + str = input_str; + i = 0; + + while (*str != '\0') { + path = str; + + /* Note that we must ignore the ':' in a Windows path */ + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/'))) { + str++; + } + + if (*str == ':') { + /* Make path a null-terminated string */ + *str = '\0'; + str++; + } + + size = strtoul(str, &endp, 10); + + str = endp; + + 
if ((*str != 'M') && (*str != 'G')) { + size = size / (1024 * 1024); + } else if (*str == 'G') { + size = size * 1024; + str++; + } else { + str++; + } + + (*data_file_names)[i] = path; + (*data_file_sizes)[i] = size; + + if (strlen(str) >= ut_strlen(":autoextend") + && 0 == ut_memcmp(str, ":autoextend", + ut_strlen(":autoextend"))) { + + *is_auto_extending = TRUE; + + str += ut_strlen(":autoextend"); + + if (strlen(str) >= ut_strlen(":max:") + && 0 == ut_memcmp(str, ":max:", + ut_strlen(":max:"))) { + + str += ut_strlen(":max:"); + + size = strtoul(str, &endp, 10); + + str = endp; + + if (*str != 'M' && *str != 'G') { + size = size / (1024 * 1024); + } else if (*str == 'G') { + size = size * 1024; + str++; + } else { + str++; + } + + *max_auto_extend_size = size; + } + + if (*str != '\0') { + + return(FALSE); + } + } + + (*data_file_is_raw_partition)[i] = 0; + + if (strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + str += 3; + (*data_file_is_raw_partition)[i] = SRV_NEW_RAW; + } + + if (strlen(str) >= 3 + && *str == 'r' + && *(str + 1) == 'a' + && *(str + 2) == 'w') { + str += 3; + + if ((*data_file_is_raw_partition)[i] == 0) { + (*data_file_is_raw_partition)[i] = SRV_OLD_RAW; + } + } + + i++; + + if (*str == ';') { + str++; + } + } + + return(TRUE); +} + +/************************************************************************* +Reads log group home directories from a character string given in +the .cnf file. */ + +ibool +srv_parse_log_group_home_dirs( +/*==========================*/ + /* out: TRUE if ok, FALSE if parsing + error */ + char* str, /* in: character string */ + char*** log_group_home_dirs) /* out, own: log group home dirs */ +{ + char* input_str; + char* path; + ulint i = 0; + + input_str = str; + + /* First calculate the number of directories and check syntax: + path;path;... 
*/ + + while (*str != '\0') { + path = str; + + while (*str != ';' && *str != '\0') { + str++; + } + + i++; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + + return(FALSE); + } + } + + *log_group_home_dirs = (char**) ut_malloc(i * sizeof(void*)); + + /* Then store the actual values to our array */ + + str = input_str; + i = 0; + + while (*str != '\0') { + path = str; + + while (*str != ';' && *str != '\0') { + str++; + } + + if (*str == ';') { + *str = '\0'; + str++; + } + + (*log_group_home_dirs)[i] = path; + + i++; + } + + return(TRUE); +} + /************************************************************************ I/o-handler thread function. */ static @@ -127,7 +429,7 @@ io_handler_thread( /************************************************************************* Normalizes a directory path for Windows: converts slashes to backslashes. */ -static + void srv_normalize_path_for_win( /*=======================*/ @@ -148,7 +450,7 @@ srv_normalize_path_for_win( /************************************************************************* Adds a slash or a backslash to the end of a string if it is missing and the string is not empty. */ -static + char* srv_add_path_separator_if_needed( /*=============================*/ @@ -354,6 +656,7 @@ open_or_create_data_files( ibool one_created = FALSE; ulint size; ulint size_high; + ulint rounded_size_pages; char name[10000]; if (srv_n_data_files >= 1000) { @@ -433,17 +736,35 @@ open_or_create_data_files( ret = os_file_get_size(files[i], &size, &size_high); ut_a(ret); + /* Round size downward to megabytes */ - /* File sizes in srv_... 
are given in - database pages */ + rounded_size_pages = (size / (1024 * 1024) + + 4096 * size_high) + << (20 - UNIV_PAGE_SIZE_SHIFT); - if (size != srv_calc_low32( - srv_data_file_sizes[i]) - || size_high != srv_calc_high32( - srv_data_file_sizes[i])) { + if (i == srv_n_data_files - 1 + && srv_auto_extend_last_data_file) { + + if (srv_data_file_sizes[i] > + rounded_size_pages + || (srv_last_file_size_max > 0 + && srv_last_file_size_max < + rounded_size_pages)) { + + fprintf(stderr, + "InnoDB: Error: data file %s is of a different size\n" + "InnoDB: than specified in the .cnf file!\n", name); + } + + srv_data_file_sizes[i] = + rounded_size_pages; + } + + if (rounded_size_pages + != srv_data_file_sizes[i]) { fprintf(stderr, - "InnoDB: Error: data file %s is of different size\n" + "InnoDB: Error: data file %s is of a different size\n" "InnoDB: than specified in the .cnf file!\n", name); return(DB_ERROR); @@ -477,7 +798,7 @@ open_or_create_data_files( >> (20 - UNIV_PAGE_SIZE_SHIFT))); fprintf(stderr, - "InnoDB: Database physically writes the file full: wait...\n"); + "InnoDB: Database physically writes the file full: wait...\n"); ret = os_file_set_size(name, files[i], srv_calc_low32(srv_data_file_sizes[i]), @@ -675,6 +996,8 @@ innobase_start_or_create_for_mysql(void) os_aio_use_native_aio = TRUE; } #endif + os_aio_use_native_aio = FALSE; + if (!os_aio_use_native_aio) { os_aio_init(4 * SRV_N_PENDING_IOS_PER_THREAD * srv_n_file_io_threads, @@ -721,12 +1044,10 @@ innobase_start_or_create_for_mysql(void) return(DB_ERROR); } - if (sizeof(ulint) == 4 - && srv_n_log_files * srv_log_file_size >= 262144) { + if (srv_n_log_files * srv_log_file_size >= 262144) { fprintf(stderr, - "InnoDB: Error: combined size of log files must be < 4 GB\n" - "InnoDB: on 32-bit computers\n"); + "InnoDB: Error: combined size of log files must be < 4 GB\n"); return(DB_ERROR); } @@ -758,7 +1079,6 @@ innobase_start_or_create_for_mysql(void) &max_flushed_lsn, &max_arch_log_no, &sum_of_new_sizes); if 
(err != DB_SUCCESS) { - fprintf(stderr, "InnoDB: Could not open data files\n"); return((int) err); @@ -797,9 +1117,9 @@ innobase_start_or_create_for_mysql(void) || (log_opened && log_created)) { fprintf(stderr, "InnoDB: Error: all log files must be created at the same time.\n" - "InnoDB: If you want bigger or smaller log files,\n" - "InnoDB: shut down the database and make sure there\n" - "InnoDB: were no errors in shutdown.\n" + "InnoDB: All log files must be created also in database creation.\n" + "InnoDB: If you want bigger or smaller log files, shut down the\n" + "InnoDB: database and make sure there were no errors in shutdown.\n" "InnoDB: Then delete the existing log files. Edit the .cnf file\n" "InnoDB: and start the database again.\n"); @@ -835,9 +1155,7 @@ innobase_start_or_create_for_mysql(void) mutex_enter(&(log_sys->mutex)); - recv_reset_logs(ut_dulint_align_down(max_flushed_lsn, - OS_FILE_LOG_BLOCK_SIZE), - max_arch_log_no + 1, TRUE); + recv_reset_logs(max_flushed_lsn, max_arch_log_no + 1, TRUE); mutex_exit(&(log_sys->mutex)); } @@ -877,6 +1195,10 @@ innobase_start_or_create_for_mysql(void) srv_startup_is_before_trx_rollback_phase = FALSE; + /* Initialize the fsp free limit global variable in the log + system */ + fsp_header_get_free_limit(0); + recv_recovery_from_archive_finish(); } else { /* We always try to do a recovery, even if the database had @@ -893,6 +1215,7 @@ innobase_start_or_create_for_mysql(void) /* Since ibuf init is in dict_boot, and ibuf is needed in any disk i/o, first call dict_boot */ + dict_boot(); trx_sys_init_at_db_start(); @@ -900,6 +1223,11 @@ innobase_start_or_create_for_mysql(void) trx_sys_init_at_db_start */ srv_startup_is_before_trx_rollback_phase = FALSE; + + /* Initialize the fsp free limit global variable in the log + system */ + fsp_header_get_free_limit(0); + recv_recovery_from_checkpoint_finish(); } @@ -969,7 +1297,7 @@ innobase_start_or_create_for_mysql(void) if (err != DB_SUCCESS) { return((int)DB_ERROR); } - + /* 
Create the master thread which monitors the database server, and does purge and other utility operations */ diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index b29ffb4b3bf..6c9776560bd 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -20,11 +20,42 @@ Created 3/26/1996 Heikki Tuuri #include "srv0srv.h" #include "trx0purge.h" #include "log0log.h" +#include "os0file.h" /* The transaction system */ trx_sys_t* trx_sys = NULL; trx_doublewrite_t* trx_doublewrite = NULL; +/******************************************************************** +Determines if a page number is located inside the doublewrite buffer. */ + +ibool +trx_doublewrite_page_inside( +/*========================*/ + /* out: TRUE if the location is inside + the two blocks of the doublewrite buffer */ + ulint page_no) /* in: page number */ +{ + if (trx_doublewrite == NULL) { + + return(FALSE); + } + + if (page_no >= trx_doublewrite->block1 + && page_no < trx_doublewrite->block1 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + if (page_no >= trx_doublewrite->block2 + && page_no < trx_doublewrite->block2 + + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + return(TRUE); + } + + return(FALSE); +} + /******************************************************************** Creates or initialializes the doublewrite buffer at a database start. */ static @@ -36,6 +67,11 @@ trx_doublewrite_init( { trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); + /* When we have the doublewrite buffer in use, we do not need to + call os_file_flush (Unix fsync) after every write. 
*/ + + os_do_not_call_flush_at_each_write = TRUE; + mutex_create(&(trx_doublewrite->mutex)); mutex_set_level(&(trx_doublewrite->mutex), SYNC_DOUBLEWRITE); diff --git a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c index 79351ff120f..a1320e8b5bc 100644 --- a/innobase/ut/ut0mem.c +++ b/innobase/ut/ut0mem.c @@ -121,6 +121,7 @@ ut_malloc( { return(ut_malloc_low(n, TRUE)); } + /************************************************************************** Frees a memory block allocated with ut_malloc. */ diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c index 964d5bca567..7ee32b9a8e2 100644 --- a/innobase/ut/ut0ut.c +++ b/innobase/ut/ut0ut.c @@ -16,6 +16,24 @@ Created 5/11/1994 Heikki Tuuri ibool ut_always_false = FALSE; +/************************************************************ +Gets the high 32 bits in a ulint. That is, it makes a shift >> 32, +but since there seem to be compiler bugs in both gcc and Visual C++, +we do this by a special conversion. */ + +ulint +ut_get_high32( +/*==========*/ + /* out: a >> 32 */ + ulint a) /* in: ulint */ +{ + if (sizeof(ulint) == 4) { + return(0); + } + + return(a >> 32); +} + /************************************************************ The following function returns a clock time in milliseconds. 
*/ @@ -58,11 +76,11 @@ ut_print_timestamp( FILE* file) /* in: file where to print */ { #ifdef __WIN__ - SYSTEMTIME cal_tm; + SYSTEMTIME cal_tm; - GetLocalTime(&cal_tm); + GetLocalTime(&cal_tm); - fprintf(file,"%02d%02d%02d %2d:%02d:%02d", + fprintf(file,"%02d%02d%02d %2d:%02d:%02d", (int)cal_tm.wYear % 100, (int)cal_tm.wMonth, (int)cal_tm.wDay, @@ -70,23 +88,21 @@ ut_print_timestamp( (int)cal_tm.wMinute, (int)cal_tm.wSecond); #else + struct tm cal_tm; + struct tm* cal_tm_ptr; + time_t tm; - struct tm cal_tm; - struct tm* cal_tm_ptr; - time_t tm; - - time(&tm); + time(&tm); #ifdef HAVE_LOCALTIME_R - localtime_r(&tm, &cal_tm); - cal_tm_ptr = &cal_tm; + localtime_r(&tm, &cal_tm); + cal_tm_ptr = &cal_tm; #else - cal_tm_ptr = localtime(&tm); + cal_tm_ptr = localtime(&tm); #endif - - fprintf(file,"%02d%02d%02d %2d:%02d:%02d", + fprintf(file,"%02d%02d%02d %2d:%02d:%02d", cal_tm_ptr->tm_year % 100, - cal_tm_ptr->tm_mon+1, + cal_tm_ptr->tm_mon + 1, cal_tm_ptr->tm_mday, cal_tm_ptr->tm_hour, cal_tm_ptr->tm_min, @@ -94,6 +110,39 @@ ut_print_timestamp( #endif } +/************************************************************** +Returns current year, month, day. */ + +void +ut_get_year_month_day( +/*==================*/ + ulint* year, /* out: current year */ + ulint* month, /* out: month */ + ulint* day) /* out: day */ +{ +#ifdef __WIN__ + SYSTEMTIME cal_tm; + + GetLocalTime(&cal_tm); + + *year = (ulint)cal_tm.wYear; + *month = (ulint)cal_tm.wMonth; + *day = (ulint)cal_tm.wDay; +#else + struct tm cal_tm; + struct tm* cal_tm_ptr; + time_t tm; + + time(&tm); + + cal_tm_ptr = localtime(&tm); + + *year = (ulint)cal_tm_ptr->tm_year; + *month = (ulint)cal_tm_ptr->tm_mon + 1; + *day = (ulint)cal_tm_ptr->tm_mday; +#endif +} + /***************************************************************** Runs an idle loop on CPU. The argument gives the desired delay in microseconds on 100 MHz Pentium + Visual C++. 
*/ diff --git a/sql/ha_innobase.cc b/sql/ha_innobase.cc index 2cdf15ce974..9159ef3f1c1 100644 --- a/sql/ha_innobase.cc +++ b/sql/ha_innobase.cc @@ -76,20 +76,34 @@ bool innodb_skip = 0; uint innobase_init_flags = 0; ulong innobase_cache_size = 0; +/* The default values for the following, type long, start-up parameters +are declared in mysqld.cc: */ + long innobase_mirrored_log_groups, innobase_log_files_in_group, innobase_log_file_size, innobase_log_buffer_size, innobase_buffer_pool_size, innobase_additional_mem_pool_size, innobase_file_io_threads, innobase_lock_wait_timeout, - innobase_thread_concurrency, innobase_force_recovery; + innobase_thread_concurrency, innobase_force_recovery; -char *innobase_data_home_dir, *innobase_data_file_path; -char *innobase_log_group_home_dir, *innobase_log_arch_dir; -char *innobase_unix_file_flush_method; -my_bool innobase_flush_log_at_trx_commit, innobase_log_archive, - innobase_use_native_aio, innobase_fast_shutdown; +/* The default values for the following char* start-up parameters +are determined in innobase_init below: */ /* innobase_data_file_path=ibdata:15,idata2:1,... 
*/ +char* innobase_data_home_dir = NULL; +char* innobase_data_file_path = NULL; +char* innobase_log_group_home_dir = NULL; +char* innobase_log_arch_dir = NULL; +char* innobase_unix_file_flush_method = NULL; + +/* Below we have boolean-valued start-up parameters, and their default +values */ + +my_bool innobase_flush_log_at_trx_commit = FALSE; +my_bool innobase_log_archive = FALSE; +my_bool innobase_use_native_aio = FALSE; +my_bool innobase_fast_shutdown = TRUE; + /* The following counter is used to convey information to InnoDB about server activity: in selects it is not sensible to call srv_active_wake_master_thread after each fetch or search, we only do @@ -331,227 +345,6 @@ ha_innobase::update_thd( return(0); } -/************************************************************************* -Reads the data files and their sizes from a character string given in -the .cnf file. */ -static -bool -innobase_parse_data_file_paths_and_sizes(void) -/*==========================================*/ - /* out: TRUE if ok, FALSE if parsing - error */ -{ - char* str; - char* endp; - char* path; - ulint size; - ulint i = 0; - - str = innobase_data_file_path; - - /* First calculate the number of data files and check syntax: - path:size[M];path:size[M]... . Note that a Windows path may - contain a drive name and a ':'. 
*/ - - while (*str != '\0') { - path = str; - - while ((*str != ':' && *str != '\0') - || (*str == ':' - && (*(str + 1) == '\\' || *(str + 1) == '/'))) { - str++; - } - - if (*str == '\0') { - return(FALSE); - } - - str++; - - size = strtoul(str, &endp, 10); - - str = endp; - - if ((*str != 'M') && (*str != 'G')) { - size = size / (1024 * 1024); - } else if (*str == 'G') { - size = size * 1024; - str++; - } else { - str++; - } - - if (strlen(str) >= 6 - && *str == 'n' - && *(str + 1) == 'e' - && *(str + 2) == 'w') { - str += 3; - } - - if (strlen(str) >= 3 - && *str == 'r' - && *(str + 1) == 'a' - && *(str + 2) == 'w') { - str += 3; - } - - if (size == 0) { - return(FALSE); - } - - i++; - - if (*str == ';') { - str++; - } else if (*str != '\0') { - - return(FALSE); - } - } - - srv_data_file_names = (char**)ut_malloc(i * sizeof(void*)); - srv_data_file_sizes = (ulint*)ut_malloc(i * sizeof(ulint)); - srv_data_file_is_raw_partition = (ulint*)ut_malloc(i * sizeof(ulint)); - - srv_n_data_files = i; - - /* Then store the actual values to our arrays */ - - str = innobase_data_file_path; - i = 0; - - while (*str != '\0') { - path = str; - - /* Note that we must ignore the ':' in a Windows path */ - - while ((*str != ':' && *str != '\0') - || (*str == ':' - && (*(str + 1) == '\\' || *(str + 1) == '/'))) { - str++; - } - - if (*str == ':') { - /* Make path a null-terminated string */ - *str = '\0'; - str++; - } - - size = strtoul(str, &endp, 10); - - str = endp; - - if ((*str != 'M') && (*str != 'G')) { - size = size / (1024 * 1024); - } else if (*str == 'G') { - size = size * 1024; - str++; - } else { - str++; - } - - srv_data_file_is_raw_partition[i] = 0; - - if (strlen(str) >= 6 - && *str == 'n' - && *(str + 1) == 'e' - && *(str + 2) == 'w') { - str += 3; - srv_data_file_is_raw_partition[i] = SRV_NEW_RAW; - } - - if (strlen(str) >= 3 - && *str == 'r' - && *(str + 1) == 'a' - && *(str + 2) == 'w') { - str += 3; - - if (srv_data_file_is_raw_partition[i] == 0) { - 
srv_data_file_is_raw_partition[i] = SRV_OLD_RAW; - } - } - - srv_data_file_names[i] = path; - srv_data_file_sizes[i] = size; - - i++; - - if (*str == ';') { - str++; - } - } - - return(TRUE); -} - -/************************************************************************* -Reads log group home directories from a character string given in -the .cnf file. */ -static -bool -innobase_parse_log_group_home_dirs(void) -/*====================================*/ - /* out: TRUE if ok, FALSE if parsing - error */ -{ - char* str; - char* path; - ulint i = 0; - - str = innobase_log_group_home_dir; - - /* First calculate the number of directories and check syntax: - path;path;... */ - - while (*str != '\0') { - path = str; - - while (*str != ';' && *str != '\0') { - str++; - } - - i++; - - if (*str == ';') { - str++; - } else if (*str != '\0') { - - return(FALSE); - } - } - - if (i != (ulint) innobase_mirrored_log_groups) { - - return(FALSE); - } - - srv_log_group_home_dirs = (char**) ut_malloc(i * sizeof(void*)); - - /* Then store the actual values to our array */ - - str = innobase_log_group_home_dir; - i = 0; - - while (*str != '\0') { - path = str; - - while (*str != ';' && *str != '\0') { - str++; - } - - if (*str == ';') { - *str = '\0'; - str++; - } - - srv_log_group_home_dirs[i] = path; - - i++; - } - - return(TRUE); -} - /************************************************************************* Opens an InnoDB database. 
*/ @@ -565,7 +358,7 @@ innobase_init(void) bool ret; DBUG_ENTER("innobase_init"); - + if (specialflag & SPECIAL_NO_PRIOR) { srv_set_thread_priorities = FALSE; } else { @@ -574,49 +367,62 @@ innobase_init(void) } /* Use current_dir if no paths are set */ - current_dir[0]=FN_CURLIB; - current_dir[1]=FN_LIBCHAR; - current_dir[2]=0; + current_dir[0] = FN_CURLIB; + current_dir[1] = FN_LIBCHAR; + current_dir[2] = 0; /* Set InnoDB initialization parameters according to the values read from MySQL .cnf file */ - if (!innobase_data_file_path) - { - fprintf(stderr, + if (!innobase_data_file_path) { + fprintf(stderr, "Cannot initialize InnoDB as 'innodb_data_file_path' is not set.\n" "If you do not want to use transactional InnoDB tables, add a line\n" "skip-innodb\n" "to the [mysqld] section of init parameters in your my.cnf\n" "or my.ini. If you want to use InnoDB tables, add for example,\n" - "innodb_data_file_path = ibdata1:30M\n" + "innodb_data_file_path = ibdata1:30M:autoextend\n" "But to get good performance you should adjust for your hardware\n" "the InnoDB startup options listed in section 2 at\n" "http://www.innodb.com/ibman.html\n"); - innodb_skip=1; - DBUG_RETURN(FALSE); // Continue without innobase + innodb_skip=1; + DBUG_RETURN(FALSE); /* Continue without InnoDB */ } srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir : current_dir); - srv_logs_home = (char*) ""; srv_arch_dir = (innobase_log_arch_dir ? 
innobase_log_arch_dir : current_dir); - ret = innobase_parse_data_file_paths_and_sizes(); - + ret = (bool) + srv_parse_data_file_paths_and_sizes(innobase_data_file_path, + &srv_data_file_names, + &srv_data_file_sizes, + &srv_data_file_is_raw_partition, + &srv_n_data_files, + &srv_auto_extend_last_data_file, + &srv_last_file_size_max); if (ret == FALSE) { - fprintf(stderr, "InnoDB: syntax error in innodb_data_file_path\n"); - DBUG_RETURN(TRUE); + fprintf(stderr, + "InnoDB: syntax error in innodb_data_file_path\n"); + DBUG_RETURN(TRUE); } - if (!innobase_log_group_home_dir) - innobase_log_group_home_dir= current_dir; - ret = innobase_parse_log_group_home_dirs(); + if (!innobase_log_group_home_dir) { + innobase_log_group_home_dir = current_dir; + } - if (ret == FALSE) { - DBUG_RETURN(TRUE); + ret = (bool) + srv_parse_log_group_home_dirs(innobase_log_group_home_dir, + &srv_log_group_home_dirs); + + if (ret == FALSE || innobase_mirrored_log_groups != 1) { + fprintf(stderr, + "InnoDB: syntax error in innodb_log_group_home_dir\n" + "InnoDB: or a wrong number of mirrored log groups\n"); + + DBUG_RETURN(TRUE); } srv_unix_file_flush_method_str = (innobase_unix_file_flush_method ? 
@@ -658,10 +464,11 @@ innobase_init(void) if (err != DB_SUCCESS) { - DBUG_RETURN(1); + DBUG_RETURN(1); } + (void) hash_init(&innobase_open_tables,32,0,0, - (hash_get_key) innobase_get_key,0,0); + (hash_get_key) innobase_get_key,0,0); pthread_mutex_init(&innobase_mutex,MY_MUTEX_INIT_FAST); DBUG_RETURN(0); } @@ -1340,33 +1147,43 @@ build_template( clust_index = dict_table_get_first_index_noninline(prebuilt->table); - if (!prebuilt->in_update_remember_pos) { + if (!prebuilt->hint_no_need_to_fetch_extra_cols) { + /* We have a hint that we should at least fetch all + columns in the key, or all columns in the table */ + if (prebuilt->read_just_key) { + /* MySQL has instructed us that it is enough to + fetch the columns in the key */ + fetch_all_in_key = TRUE; } else { /* We are building a temporary table: fetch all - columns */ + columns; the reason is that MySQL may use the + clustered index key to store rows, but the mechanism + we use below to detect required columns does not + reveal that. Actually, it might be enough to + fetch only all in the key also in this case! */ templ_type = ROW_MYSQL_WHOLE_ROW; } } if (prebuilt->select_lock_type == LOCK_X) { - /* TODO: should fix the code in sql_update so that we could do - with fetching only the needed columns */ + /* We always retrieve the whole clustered index record if we + use exclusive row level locks, for example, if the read is + done in an UPDATE statement. */ templ_type = ROW_MYSQL_WHOLE_ROW; } if (templ_type == ROW_MYSQL_REC_FIELDS) { + /* In versions < 3.23.50 we always retrieved the clustered + index record if prebuilt->select_lock_type == LOCK_S, + but there is really no need for that, and in some cases + performance could be seriously degraded because the MySQL + optimizer did not know about our convention! 
*/ - if (prebuilt->select_lock_type != LOCK_NONE) { - /* Let index be the clustered index */ - - index = clust_index; - } else { - index = prebuilt->index; - } + index = prebuilt->index; } else { index = clust_index; } @@ -1462,12 +1279,6 @@ skip_field: (index->table->cols + templ->col_no)->clust_pos; } } - - if (templ_type == ROW_MYSQL_REC_FIELDS - && prebuilt->select_lock_type != LOCK_NONE) { - - prebuilt->need_to_access_clustered = TRUE; - } } /************************************************************************ @@ -1500,7 +1311,9 @@ ha_innobase::write_row( } if (table->next_number_field && record == table->record[0]) { - + /* This is the case where the table has an + auto-increment column */ + /* Fetch the value the user possibly has set in the autoincrement field */ @@ -1584,12 +1397,6 @@ ha_innobase::write_row( } } - /* Set the 'in_update_remember_pos' flag to FALSE to - make sure all columns are fetched in the select done by - update_auto_increment */ - - prebuilt->in_update_remember_pos = FALSE; - update_auto_increment(); if (auto_inc == 0) { @@ -1613,7 +1420,7 @@ ha_innobase::write_row( } /* We have to set sql_stat_start to TRUE because - update_auto_increment has called a select, and + update_auto_increment may have called a select, and has reset that flag; row_insert_for_mysql has to know to set the IX intention lock on the table, something it only does at the start of each statement */ @@ -1853,9 +1660,7 @@ ha_innobase::update_row( /* This is not a delete */ prebuilt->upd_node->is_delete = FALSE; - if (!prebuilt->in_update_remember_pos) { - assert(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); - } + assert(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); srv_conc_enter_innodb(prebuilt->trx); @@ -1901,7 +1706,6 @@ ha_innobase::delete_row( /* This is a delete */ prebuilt->upd_node->is_delete = TRUE; - prebuilt->in_update_remember_pos = TRUE; srv_conc_enter_innodb(prebuilt->trx); @@ -2616,7 +2420,9 @@ ha_innobase::create( /* Create the table 
definition in InnoDB */ - if (error = create_table_def(trx, form, norm_name)) { + error = create_table_def(trx, form, norm_name); + + if (error) { trx_commit_for_mysql(trx); @@ -3203,13 +3009,59 @@ ha_innobase::update_table_comment( pos += sprintf(pos, "InnoDB free: %lu kB", (ulong) innobase_get_free_space()); - /* We assume 150 bytes of space to print info */ + /* We assume 450 - length bytes of space to print info */ - dict_print_info_on_foreign_keys(pos, 500, prebuilt->table); + if (length < 450) { + dict_print_info_on_foreign_keys(FALSE, pos, 450 - length, + prebuilt->table); + } return(str); } +/*********************************************************************** +Gets the foreign key create info for a table stored in InnoDB. */ + +char* +ha_innobase::get_foreign_key_create_info(void) +/*==========================================*/ + /* out, own: character string in the form which + can be inserted to the CREATE TABLE statement, + MUST be freed with ::free_foreign_key_create_info */ +{ + row_prebuilt_t* prebuilt = (row_prebuilt_t*)innobase_prebuilt; + char* str; + + if (prebuilt == NULL) { + fprintf(stderr, +"InnoDB: Error: cannot get create info for foreign keys\n"); + + return(NULL); + } + + str = (char*)ut_malloc(10000); + + str[0] = '\0'; + + dict_print_info_on_foreign_keys(TRUE, str, 9000, prebuilt->table); + + return(str); +} + +/*********************************************************************** +Frees the foreign key create info for a table stored in InnoDB, if it is +non-NULL. */ + +void +ha_innobase::free_foreign_key_create_info( +/*======================================*/ + char* str) /* in, own: create info string to free */ +{ + if (str) { + ut_free(str); + } +} + /*********************************************************************** Tells something additional to the handler about how to do things. 
*/ @@ -3235,7 +3087,7 @@ ha_innobase::extra( prebuilt->read_just_key = 0; break; case HA_EXTRA_DONT_USE_CURSOR_TO_UPDATE: - prebuilt->in_update_remember_pos = FALSE; + prebuilt->hint_no_need_to_fetch_extra_cols = FALSE; break; case HA_EXTRA_KEYREAD: prebuilt->read_just_key = 1; @@ -3282,7 +3134,7 @@ ha_innobase::external_lock( trx = prebuilt->trx; prebuilt->sql_stat_start = TRUE; - prebuilt->in_update_remember_pos = TRUE; + prebuilt->hint_no_need_to_fetch_extra_cols = TRUE; prebuilt->read_just_key = 0; @@ -3301,6 +3153,16 @@ ha_innobase::external_lock( thd->transaction.all.innodb_active_trans = 1; trx->n_mysql_tables_in_use++; + if (thd->tx_isolation == ISO_SERIALIZABLE + && prebuilt->select_lock_type == LOCK_NONE) { + + /* To get serializable execution we let InnoDB + conceptually add 'LOCK IN SHARE MODE' to all SELECTs + which otherwise would have been consistent reads */ + + prebuilt->select_lock_type = LOCK_S; + } + if (prebuilt->select_lock_type != LOCK_NONE) { trx->mysql_n_tables_locked++; @@ -3407,8 +3269,8 @@ ha_innobase::store_lock( lock_type == TL_READ_NO_INSERT) { /* This is a SELECT ... IN SHARE MODE, or we are doing a complex SQL statement like - INSERT INTO ... SELECT ... and the logical logging - requires the use of a locking read */ + INSERT INTO ... SELECT ... and the logical logging (MySQL + binlog) requires the use of a locking read */ prebuilt->select_lock_type = LOCK_S; } else { @@ -3448,37 +3310,59 @@ ha_innobase::get_auto_increment() /*=============================*/ /* out: the next auto-increment column value */ { - row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt; - longlong nr; - int error; + row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt; + longlong nr; + int error; - (void) extra(HA_EXTRA_KEYREAD); - index_init(table->next_number_index); + /* Also SHOW TABLE STATUS calls this function. 
Previously, when we + always read the max autoinc key value, setting x-locks, users were + surprised that SHOW TABLE STATUS could end up in a deadlock with + ordinary SQL queries. We avoid these deadlocks if the auto-inc + counter for the table has been initialized by fetching the value + from the table struct in the dictionary cache. */ - /* We use an exclusive lock when we read the max key value from the - auto-increment column index. This is because then build_template will - advise InnoDB to fetch all columns. In SHOW TABLE STATUS the query - id of the auto-increment column is not changed, and previously InnoDB - did not fetch it, causing SHOW TABLE STATUS to show wrong values - for the autoinc column. */ + assert(prebuilt->table); + + nr = dict_table_autoinc_read(prebuilt->table); - prebuilt->select_lock_type = LOCK_X; - prebuilt->trx->mysql_n_tables_locked += 1; + if (nr != 0) { + + return(nr + 1); + } + + (void) extra(HA_EXTRA_KEYREAD); + index_init(table->next_number_index); + + /* We use an exclusive lock when we read the max key value from the + auto-increment column index. This is because then build_template will + advise InnoDB to fetch all columns. In SHOW TABLE STATUS the query + id of the auto-increment column is not changed, and previously InnoDB + did not fetch it, causing SHOW TABLE STATUS to show wrong values + for the autoinc column. 
*/ + + prebuilt->select_lock_type = LOCK_X; + + /* Play safe and also give in another way the hint to fetch + all columns in the key: */ + + prebuilt->hint_no_need_to_fetch_extra_cols = FALSE; + + prebuilt->trx->mysql_n_tables_locked += 1; - error=index_last(table->record[1]); + error = index_last(table->record[1]); - if (error) { - nr = 1; - } else { - nr = (longlong) table->next_number_field-> - val_int_offset(table->rec_buff_length) + 1; - } + if (error) { + nr = 1; + } else { + nr = (longlong) table->next_number_field-> + val_int_offset(table->rec_buff_length) + 1; + } - (void) extra(HA_EXTRA_NO_KEYREAD); + (void) extra(HA_EXTRA_NO_KEYREAD); - index_end(); + index_end(); - return(nr); + return(nr); } #endif /* HAVE_INNOBASE_DB */ diff --git a/sql/ha_innobase.h b/sql/ha_innobase.h index ec77cd1a70f..9f752dd2eda 100644 --- a/sql/ha_innobase.h +++ b/sql/ha_innobase.h @@ -154,7 +154,8 @@ class ha_innobase: public handler int rename_table(const char* from, const char* to); int check(THD* thd, HA_CHECK_OPT* check_opt); char* update_table_comment(const char* comment); - + char* get_foreign_key_create_info(); + void free_foreign_key_create_info(char* str); THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type); longlong get_auto_increment();