
Bug#38804: Query deadlock causes all tables to be inaccessible.

The problem was a mutex added in bug#27405 to solve a problem
with auto_increment in partitioned InnoDB tables
(held in ha_partition::write_row over the partitions' file->ha_write_row calls).

The solution is to use the patch for bug#33479, which refines the
usage of mutexes for auto_increment.

Backport of bug#33479 from 6.0:

Bug#33479: auto_increment failures in partitioning

Several problems with auto_increment in partitioning
(with MyISAM and InnoDB): locking issues, not handling
multi-row INSERTs properly, etc.

Changed the auto_increment handling for partitioning:
added an ha_data variable in TABLE_SHARE for storage-engine-specific data,
such as the auto_increment value handling in partitioning (see also WL#4305),
and used the table share's mutex to lock around read + update.
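
As a rough sketch (the real declarations live in ha_partition.h and are
not part of the ha_partition.cc diff shown below, so names and details
here are reconstructed from how the diff uses them):

  /* Per-share state stored in table_share->ha_data. */
  typedef struct st_ha_data_partition
  {
    ulonglong next_auto_inc_val;   /* first non-reserved value */
    bool auto_inc_initialized;     /* TRUE after all partitions were scanned */
  } HA_DATA_PARTITION;

  /* Lock helpers used throughout the diff; no-ops for temporary tables,
     where table_share->mutex may not be initialized. */
  void ha_partition::lock_auto_increment()
  {
    DBUG_ASSERT(!auto_increment_lock);
    if (table_share->tmp_table == NO_TMP_TABLE)
    {
      auto_increment_lock= TRUE;
      pthread_mutex_lock(&table_share->mutex);
    }
  }

  void ha_partition::unlock_auto_increment()
  {
    /* Keep holding the mutex while auto_increment_safe_stmt_log_lock is
       set, so an unsafe multi-row statement keeps it until
       release_auto_increment(). */
    if (auto_increment_lock && !auto_increment_safe_stmt_log_lock)
    {
      pthread_mutex_unlock(&table_share->mutex);
      auto_increment_lock= FALSE;
    }
  }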

The idea is this:
Store the table's reserved auto_increment value in the TABLE_SHARE and
use a mutex to lock it, read and update it, and unlock it in one block,
accessing all partitions only when the value is not yet initialized.
Also allow reservation of ranges; if no one has made a reservation
afterwards, lower the reservation to what was actually used once
the statement is done (via release_auto_increment from WL#3146).
The lock is kept from the first reservation onwards if statement-based
replication is in use and the statement is a multi-row INSERT where the
number of candidate rows to insert is not known in advance (INSERT SELECT
or LOAD DATA, unlike INSERT VALUES (row1), (row2), ..., (rowN)).
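
In simplified pseudo-C++ (a sketch of the flow, not the exact patch code;
see the get_auto_increment() and release_auto_increment() hunks below):

  /* Reserve nb_desired_values values in one locked block. */
  lock_auto_increment();
  if (!ha_data->auto_inc_initialized)
    info(HA_STATUS_AUTO);            /* scan all partitions exactly once */
  *first_value= ha_data->next_auto_inc_val;
  ha_data->next_auto_inc_val+= nb_desired_values * increment;
  unlock_auto_increment();

  /* When the statement is done: hand back the unused tail of the range,
     but only if no one has reserved values after us. */
  lock_auto_increment();
  if (next_insert_id < ha_data->next_auto_inc_val &&
      auto_inc_interval_for_cur_row.maximum() >= ha_data->next_auto_inc_val)
    ha_data->next_auto_inc_val= next_insert_id;
  unlock_auto_increment();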

This should also lead to better concurrency (no need for mutex
protection around write_row in all cases)
and work with any local storage engine.
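
The statement-long lock is only taken when the binary log actually needs
consecutive values; the condition (mirrored from the get_auto_increment()
hunk below) is roughly:

  if (!auto_increment_safe_stmt_log_lock &&
      thd->lex->sql_command != SQLCOM_INSERT &&  /* row count not known */
      mysql_bin_log.is_open() &&
      !thd->current_stmt_binlog_row_based &&     /* statement-based logging */
      (thd->options & OPTION_BIN_LOG))
    auto_increment_safe_stmt_log_lock= TRUE;     /* cleared and mutex released
                                                    in release_auto_increment() */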
This commit is contained in:
Mattias Jonsson
2008-09-08 15:30:01 +02:00
parent 59edfa9261
commit 5b164964e2
17 changed files with 5535 additions and 124 deletions


@@ -160,7 +160,8 @@ const uint ha_partition::NO_CURRENT_PART_ID= 0xFFFFFFFF;
ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share)
:handler(hton, share), m_part_info(NULL), m_create_handler(FALSE),
m_is_sub_partitioned(0), is_clone(FALSE)
m_is_sub_partitioned(0), is_clone(FALSE), auto_increment_lock(FALSE),
auto_increment_safe_stmt_log_lock(FALSE)
{
DBUG_ENTER("ha_partition::ha_partition(table)");
init_handler_variables();
@@ -182,7 +183,8 @@ ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share)
ha_partition::ha_partition(handlerton *hton, partition_info *part_info)
:handler(hton, NULL), m_part_info(part_info),
m_create_handler(TRUE),
m_is_sub_partitioned(m_part_info->is_sub_partitioned()), is_clone(FALSE)
m_is_sub_partitioned(m_part_info->is_sub_partitioned()), is_clone(FALSE),
auto_increment_lock(FALSE), auto_increment_safe_stmt_log_lock(FALSE)
{
DBUG_ENTER("ha_partition::ha_partition(part_info)");
init_handler_variables();
@@ -1248,7 +1250,7 @@ int ha_partition::prepare_new_partition(TABLE *tbl,
assumes that external_lock() is last call that may fail here.
Otherwise see description for cleanup_new_partition().
*/
if ((error= file->ha_external_lock(current_thd, m_lock_type)))
if ((error= file->ha_external_lock(ha_thd(), m_lock_type)))
goto error;
DBUG_RETURN(0);
@@ -1336,8 +1338,8 @@ void ha_partition::cleanup_new_partition(uint part_count)
int ha_partition::change_partitions(HA_CREATE_INFO *create_info,
const char *path,
ulonglong *copied,
ulonglong *deleted,
ulonglong * const copied,
ulonglong * const deleted,
const uchar *pack_frm_data
__attribute__((unused)),
size_t pack_frm_len
@@ -1354,7 +1356,7 @@ int ha_partition::change_partitions(HA_CREATE_INFO *create_info,
int error= 1;
bool first;
uint temp_partitions= m_part_info->temp_partitions.elements;
THD *thd= current_thd;
THD *thd= ha_thd();
DBUG_ENTER("ha_partition::change_partitions");
/*
@@ -1628,7 +1630,8 @@ int ha_partition::change_partitions(HA_CREATE_INFO *create_info,
partitions.
*/
int ha_partition::copy_partitions(ulonglong *copied, ulonglong *deleted)
int ha_partition::copy_partitions(ulonglong * const copied,
ulonglong * const deleted)
{
uint reorg_part= 0;
int result= 0;
@@ -1674,13 +1677,13 @@ int ha_partition::copy_partitions(ulonglong *copied, ulonglong *deleted)
table since it doesn't fit into any partition any longer due to
changed partitioning ranges or list values.
*/
deleted++;
(*deleted)++;
}
else
{
THD *thd= ha_thd();
/* Copy record to new handler */
copied++;
(*copied)++;
tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
result= m_new_file[new_part]->ha_write_row(m_rec0);
reenable_binlog(thd);
@@ -1804,7 +1807,7 @@ uint ha_partition::del_ren_cre_table(const char *from,
handler **file, **abort_file;
DBUG_ENTER("del_ren_cre_table()");
if (get_from_handler_file(from, current_thd->mem_root))
if (get_from_handler_file(from, ha_thd()->mem_root))
DBUG_RETURN(TRUE);
DBUG_ASSERT(m_file_buffer);
DBUG_PRINT("enter", ("from: (%s) to: (%s)", from, to));
@@ -1931,7 +1934,7 @@ int ha_partition::set_up_table_before_create(TABLE *tbl,
{
int error= 0;
const char *partition_name;
THD *thd= current_thd;
THD *thd= ha_thd();
DBUG_ENTER("set_up_table_before_create");
if (!part_elem)
@@ -2327,7 +2330,7 @@ bool ha_partition::get_from_handler_file(const char *name, MEM_ROOT *mem_root)
tot_partition_words= (m_tot_parts + 3) / 4;
engine_array= (handlerton **) my_alloca(m_tot_parts * sizeof(handlerton*));
for (i= 0; i < m_tot_parts; i++)
engine_array[i]= ha_resolve_by_legacy_type(current_thd,
engine_array[i]= ha_resolve_by_legacy_type(ha_thd(),
(enum legacy_db_type)
*(uchar *) ((file_buffer) + 12 + i));
address_tot_name_len= file_buffer + 12 + 4 * tot_partition_words;
@@ -2398,8 +2401,10 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
uint alloc_len;
handler **file;
char name_buff[FN_REFLEN];
bool is_not_tmp_table= (table_share->tmp_table == NO_TMP_TABLE);
DBUG_ENTER("ha_partition::open");
DBUG_ASSERT(table->s == table_share);
ref_length= 0;
m_mode= mode;
m_open_test_lock= test_if_locked;
@@ -2408,9 +2413,9 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
DBUG_RETURN(1);
m_start_key.length= 0;
m_rec0= table->record[0];
m_rec_length= table->s->reclength;
m_rec_length= table_share->reclength;
alloc_len= m_tot_parts * (m_rec_length + PARTITION_BYTES_IN_POS);
alloc_len+= table->s->max_key_length;
alloc_len+= table_share->max_key_length;
if (!m_ordered_rec_buffer)
{
if (!(m_ordered_rec_buffer= (uchar*)my_malloc(alloc_len, MYF(MY_WME))))
@@ -2482,6 +2487,29 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
0, key_rec_cmp, (void*)this)))
goto err_handler;
/*
Use table_share->ha_data to share auto_increment_value among all handlers
for the same table.
*/
if (is_not_tmp_table)
pthread_mutex_lock(&table_share->mutex);
if (!table_share->ha_data)
{
HA_DATA_PARTITION *ha_data;
/* currently only needed for auto_increment */
table_share->ha_data= ha_data= (HA_DATA_PARTITION*)
alloc_root(&table_share->mem_root,
sizeof(HA_DATA_PARTITION));
if (!ha_data)
{
pthread_mutex_unlock(&table_share->mutex);
goto err_handler;
}
DBUG_PRINT("info", ("table_share->ha_data 0x%p", ha_data));
bzero(ha_data, sizeof(HA_DATA_PARTITION));
}
if (is_not_tmp_table)
pthread_mutex_unlock(&table_share->mutex);
/*
Some handlers update statistics as part of the open call. This will in
some cases corrupt the statistics of the partition handler and thus
@@ -2539,6 +2567,7 @@ int ha_partition::close(void)
handler **file;
DBUG_ENTER("ha_partition::close");
DBUG_ASSERT(table->s == table_share);
delete_queue(&m_queue);
if (!is_clone)
bitmap_free(&(m_part_info->used_partitions));
@@ -2607,6 +2636,7 @@ int ha_partition::external_lock(THD *thd, int lock_type)
handler **file;
DBUG_ENTER("ha_partition::external_lock");
DBUG_ASSERT(!auto_increment_lock && !auto_increment_safe_stmt_log_lock);
file= m_file;
m_lock_type= lock_type;
@@ -2825,8 +2855,9 @@ int ha_partition::write_row(uchar * buf)
uint32 part_id;
int error;
longlong func_value;
bool autoincrement_lock= FALSE;
bool have_auto_increment= table->next_number_field && buf == table->record[0];
my_bitmap_map *old_map;
HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
THD *thd= ha_thd();
timestamp_auto_set_type orig_timestamp_type= table->timestamp_field_type;
#ifdef NOT_NEEDED
@@ -2844,28 +2875,16 @@ int ha_partition::write_row(uchar * buf)
If we have an auto_increment column and we are writing a changed row
or a new row, then update the auto_increment value in the record.
*/
if (table->next_number_field && buf == table->record[0])
if (have_auto_increment)
{
/*
Some engines (InnoDB for example) can change autoincrement
counter only after 'table->write_row' operation.
So if another thread gets inside the ha_partition::write_row
before it is complete, it gets same auto_increment value,
which means DUP_KEY error (bug #27405)
Here we separate the access using table_share->mutex, and
use autoincrement_lock variable to avoid unnecessary locks.
Probably not an ideal solution.
*/
if (table_share->tmp_table == NO_TMP_TABLE)
if (!ha_data->auto_inc_initialized &&
!table->s->next_number_keypart)
{
/*
Bug#30878 crash when alter table from non partitioned table
to partitioned.
Checking if tmp table then there is no need to lock,
and the table_share->mutex may not be initialised.
If auto_increment in table_share is not initialized, start by
initializing it.
*/
autoincrement_lock= TRUE;
pthread_mutex_lock(&table_share->mutex);
info(HA_STATUS_AUTO);
}
error= update_auto_increment();
@@ -2903,11 +2922,11 @@ int ha_partition::write_row(uchar * buf)
DBUG_PRINT("info", ("Insert in partition %d", part_id));
tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
error= m_file[part_id]->ha_write_row(buf);
if (have_auto_increment && !table->s->next_number_keypart)
set_auto_increment_if_higher(table->next_number_field->val_int());
reenable_binlog(thd);
exit:
table->timestamp_field_type= orig_timestamp_type;
if (autoincrement_lock)
pthread_mutex_unlock(&table_share->mutex);
DBUG_RETURN(error);
}
@@ -2931,13 +2950,6 @@ exit:
Keep in mind that the server can do updates based on ordering if an
ORDER BY clause was used. Consecutive ordering is not guarenteed.
Currently new_data will not have an updated auto_increament record, or
and updated timestamp field. You can do these for partition by doing these:
if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
table->timestamp_field->set_time();
if (table->next_number_field && record == table->record[0])
update_auto_increment();
Called from sql_select.cc, sql_acl.cc, sql_update.cc, and sql_insert.cc.
new_data is always record[0]
old_data is normally record[1] but may be anything
@@ -2969,17 +2981,23 @@ int ha_partition::update_row(const uchar *old_data, uchar *new_data)
goto exit;
}
/*
TODO:
set_internal_auto_increment=
max(set_internal_auto_increment, new_data->auto_increment)
*/
m_last_part= new_part_id;
if (new_part_id == old_part_id)
{
DBUG_PRINT("info", ("Update in partition %d", new_part_id));
tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
error= m_file[new_part_id]->ha_update_row(old_data, new_data);
/*
if updating an auto_increment column, update
table_share->ha_data->next_auto_inc_val if needed.
(not to be used if auto_increment on secondary field in a multi-
column index)
mysql_update does not set table->next_number_field, so we use
table->found_next_number_field instead.
*/
if (table->found_next_number_field && new_data == table->record[0] &&
!table->s->next_number_keypart)
set_auto_increment_if_higher(table->found_next_number_field->val_int());
reenable_binlog(thd);
goto exit;
}
@@ -2989,6 +3007,9 @@ int ha_partition::update_row(const uchar *old_data, uchar *new_data)
old_part_id, new_part_id));
tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
error= m_file[new_part_id]->ha_write_row(new_data);
if (table->found_next_number_field && new_data == table->record[0] &&
!table->s->next_number_keypart)
set_auto_increment_if_higher(table->found_next_number_field->val_int());
reenable_binlog(thd);
if (error)
goto exit;
@@ -3084,8 +3105,17 @@ int ha_partition::delete_all_rows()
{
int error;
handler **file;
THD *thd= ha_thd();
DBUG_ENTER("ha_partition::delete_all_rows");
if (thd->lex->sql_command == SQLCOM_TRUNCATE)
{
HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
lock_auto_increment();
ha_data->next_auto_inc_val= 0;
ha_data->auto_inc_initialized= FALSE;
unlock_auto_increment();
}
file= m_file;
do
{
@@ -4544,21 +4574,54 @@ int ha_partition::handle_ordered_prev(uchar *buf)
int ha_partition::info(uint flag)
{
handler *file, **file_array;
DBUG_ENTER("ha_partition:info");
DBUG_ENTER("ha_partition::info");
if (flag & HA_STATUS_AUTO)
{
ulonglong auto_increment_value= 0;
bool auto_inc_is_first_in_idx= (table_share->next_number_keypart == 0);
HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
DBUG_PRINT("info", ("HA_STATUS_AUTO"));
file_array= m_file;
do
if (!table->found_next_number_field)
stats.auto_increment_value= 0;
else if (ha_data->auto_inc_initialized)
{
file= *file_array;
file->info(HA_STATUS_AUTO);
set_if_bigger(auto_increment_value, file->stats.auto_increment_value);
} while (*(++file_array));
stats.auto_increment_value= auto_increment_value;
lock_auto_increment();
stats.auto_increment_value= ha_data->next_auto_inc_val;
unlock_auto_increment();
}
else
{
lock_auto_increment();
/* to avoid two concurrent initializations, check again when locked */
if (ha_data->auto_inc_initialized)
stats.auto_increment_value= ha_data->next_auto_inc_val;
else
{
handler *file, **file_array;
ulonglong auto_increment_value= 0;
file_array= m_file;
DBUG_PRINT("info",
("checking all partitions for auto_increment_value"));
do
{
file= *file_array;
file->info(HA_STATUS_AUTO);
set_if_bigger(auto_increment_value,
file->stats.auto_increment_value);
} while (*(++file_array));
DBUG_ASSERT(auto_increment_value);
stats.auto_increment_value= auto_increment_value;
if (auto_inc_is_first_in_idx)
{
set_if_bigger(ha_data->next_auto_inc_val, auto_increment_value);
ha_data->auto_inc_initialized= TRUE;
DBUG_PRINT("info", ("initializing next_auto_inc_val to %lu",
(ulong) ha_data->next_auto_inc_val));
}
}
unlock_auto_increment();
}
}
if (flag & HA_STATUS_VARIABLE)
{
@@ -4582,6 +4645,7 @@ int ha_partition::info(uint flag)
check_time: Time of last check (only applicable to MyISAM)
We report last time of all underlying handlers
*/
handler *file, **file_array;
stats.records= 0;
stats.deleted= 0;
stats.data_file_length= 0;
@@ -4663,6 +4727,7 @@ int ha_partition::info(uint flag)
So we calculate these constants by using the variables on the first
handler.
*/
handler *file;
file= m_file[0];
file->info(HA_STATUS_CONST);
@@ -4684,6 +4749,7 @@ int ha_partition::info(uint flag)
}
if (flag & HA_STATUS_TIME)
{
handler *file, **file_array;
DBUG_PRINT("info", ("info: HA_STATUS_TIME"));
/*
This flag is used to set the latest update time of the table.
@@ -5744,19 +5810,33 @@ int ha_partition::cmp_ref(const uchar *ref1, const uchar *ref2)
MODULE auto increment
****************************************************************************/
void ha_partition::restore_auto_increment(ulonglong)
{
DBUG_ENTER("ha_partition::restore_auto_increment");
DBUG_VOID_RETURN;
int ha_partition::reset_auto_increment(ulonglong value)
{
handler **file= m_file;
int res;
HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
DBUG_ENTER("ha_partition::reset_auto_increment");
lock_auto_increment();
ha_data->auto_inc_initialized= FALSE;
ha_data->next_auto_inc_val= 0;
do
{
if ((res= (*file)->ha_reset_auto_increment(value)) != 0)
break;
} while (*(++file));
unlock_auto_increment();
DBUG_RETURN(res);
}
/*
/**
This method is called by update_auto_increment which in turn is called
by the individual handlers as part of write_row. We will always let
the first handler keep track of the auto increment value for all
partitions.
by the individual handlers as part of write_row. We use the
table_share->ha_data->next_auto_inc_val, or search all
partitions for the highest auto_increment_value if not initialized or
if auto_increment field is a secondary part of a key, we must search
every partition when holding a mutex to be sure of correctness.
*/
void ha_partition::get_auto_increment(ulonglong offset, ulonglong increment,
@@ -5764,59 +5844,88 @@ void ha_partition::get_auto_increment(ulonglong offset, ulonglong increment,
ulonglong *first_value,
ulonglong *nb_reserved_values)
{
ulonglong first_value_part, last_value_part, nb_reserved_values_part,
last_value= ~ (ulonglong) 0;
handler **pos, **end;
bool retry= TRUE;
DBUG_ENTER("ha_partition::get_auto_increment");
DBUG_PRINT("info", ("offset: %lu inc: %lu desired_values: %lu "
"first_value: %lu", (ulong) offset, (ulong) increment,
(ulong) nb_desired_values, (ulong) *first_value));
DBUG_ASSERT(increment && nb_desired_values);
*first_value= 0;
if (table->s->next_number_keypart)
{
/*
next_number_keypart is != 0 if the auto_increment column is a secondary
column in the index (it is allowed in MyISAM)
*/
DBUG_PRINT("info", ("next_number_keypart != 0"));
ulonglong nb_reserved_values_part;
ulonglong first_value_part, max_first_value;
handler **file= m_file;
first_value_part= max_first_value= *first_value;
/* Must lock and find highest value among all partitions. */
lock_auto_increment();
do
{
/* Only nb_desired_values = 1 makes sense */
(*file)->get_auto_increment(offset, increment, 1,
&first_value_part, &nb_reserved_values_part);
if (first_value_part == ~(ulonglong)(0)) // error in one partition
{
*first_value= first_value_part;
/* log that the error was between table/partition handler */
sql_print_error("Partition failed to reserve auto_increment value");
unlock_auto_increment();
DBUG_VOID_RETURN;
}
DBUG_PRINT("info", ("first_value_part: %lu", (ulong) first_value_part));
set_if_bigger(max_first_value, first_value_part);
} while (*(++file));
*first_value= max_first_value;
*nb_reserved_values= 1;
unlock_auto_increment();
}
else
{
THD *thd= ha_thd();
HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
/*
This is initialized in the beginning of the first write_row call.
*/
DBUG_ASSERT(ha_data->auto_inc_initialized);
/*
Get a lock for handling the auto_increment in table_share->ha_data
for avoiding two concurrent statements getting the same number.
*/
again:
for (pos=m_file, end= m_file+ m_tot_parts; pos != end ; pos++)
{
first_value_part= *first_value;
(*pos)->get_auto_increment(offset, increment, nb_desired_values,
&first_value_part, &nb_reserved_values_part);
if (first_value_part == ~(ulonglong)(0)) // error in one partition
lock_auto_increment();
/*
In a multi-row insert statement like INSERT SELECT and LOAD DATA
where the number of candidate rows to insert is not known in advance
we must hold a lock/mutex for the whole statement if we have statement
based replication. Because the statement-based binary log contains
only the first generated value used by the statement, and slaves assumes
all other generated values used by this statement were consecutive to
this first one, we must exclusively lock the generator until the statement
is done.
*/
if (!auto_increment_safe_stmt_log_lock &&
thd->lex->sql_command != SQLCOM_INSERT &&
mysql_bin_log.is_open() &&
!thd->current_stmt_binlog_row_based &&
(thd->options & OPTION_BIN_LOG))
{
*first_value= first_value_part;
sql_print_error("Partition failed to reserve auto_increment value");
DBUG_VOID_RETURN;
DBUG_PRINT("info", ("locking auto_increment_safe_stmt_log_lock"));
auto_increment_safe_stmt_log_lock= TRUE;
}
/*
Partition has reserved an interval. Intersect it with the intervals
already reserved for the previous partitions.
*/
last_value_part= (nb_reserved_values_part == ULONGLONG_MAX) ?
ULONGLONG_MAX : (first_value_part + nb_reserved_values_part * increment);
set_if_bigger(*first_value, first_value_part);
set_if_smaller(last_value, last_value_part);
/* this gets corrected (for offset/increment) in update_auto_increment */
*first_value= ha_data->next_auto_inc_val;
ha_data->next_auto_inc_val+= nb_desired_values * increment;
unlock_auto_increment();
DBUG_PRINT("info", ("*first_value: %lu", (ulong) *first_value));
*nb_reserved_values= nb_desired_values;
}
if (last_value < *first_value) /* empty intersection, error */
{
/*
When we have an empty intersection, it means that one or more
partitions may have a significantly different autoinc next value.
We should not fail here - it just means that we should try to
find a new reservation making use of the current *first_value
wbich should now be compatible with all partitions.
*/
if (retry)
{
retry= FALSE;
last_value= ~ (ulonglong) 0;
release_auto_increment();
goto again;
}
/*
We should not get here.
*/
sql_print_error("Failed to calculate auto_increment value for partition");
*first_value= ~(ulonglong)(0);
}
if (increment) // If not check for values
*nb_reserved_values= (last_value == ULONGLONG_MAX) ?
ULONGLONG_MAX : ((last_value - *first_value) / increment);
DBUG_VOID_RETURN;
}
@@ -5824,9 +5933,31 @@ void ha_partition::release_auto_increment()
{
DBUG_ENTER("ha_partition::release_auto_increment");
for (uint i= 0; i < m_tot_parts; i++)
if (table->s->next_number_keypart)
{
m_file[i]->ha_release_auto_increment();
for (uint i= 0; i < m_tot_parts; i++)
m_file[i]->ha_release_auto_increment();
}
else if (next_insert_id)
{
HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data;
ulonglong next_auto_inc_val;
lock_auto_increment();
next_auto_inc_val= ha_data->next_auto_inc_val;
if (next_insert_id < next_auto_inc_val &&
auto_inc_interval_for_cur_row.maximum() >= next_auto_inc_val)
ha_data->next_auto_inc_val= next_insert_id;
DBUG_PRINT("info", ("ha_data->next_auto_inc_val: %lu",
(ulong) ha_data->next_auto_inc_val));
/* Unlock the multi row statement lock taken in get_auto_increment */
if (auto_increment_safe_stmt_log_lock)
{
auto_increment_safe_stmt_log_lock= FALSE;
DBUG_PRINT("info", ("unlocking auto_increment_safe_stmt_log_lock"));
}
unlock_auto_increment();
}
DBUG_VOID_RETURN;
}