From fe9450676fe684875fb7a900ab26c878c743c095 Mon Sep 17 00:00:00 2001
From: Sujatha
Date: Wed, 12 May 2021 13:46:51 +0530
Subject: [PATCH] MDEV-25502: rpl.rpl_perfschema_applier_status_by_worker
 failed in bb with: Test assertion failed

Problem:
=======
The test fails with 3 different symptoms:

connection slave;
Assertion text: 'Last_Seen_Transaction should show .'
Assertion condition: '"0-1-1" = ""'
Assertion condition, interpolated: '"0-1-1" = ""'
Assertion result: '0'

connection slave;
Assertion text: 'Value returned by SSS and PS table for Last_Error_Number should be same.'
Assertion condition: '"1146" = "0"'
Assertion condition, interpolated: '"1146" = "0"'
Assertion result: '0'

connection slave;
Assertion text: 'Value returned by PS table for worker_idle_time should be >= 1'
Assertion condition: '"0" >= "1"'
Assertion condition, interpolated: '"0" >= "1"'
Assertion result: '0'

Fix1:
====
The performance schema table's Last_Seen_Transaction column is compared with
'SELECT @@gtid_slave_pos'. Since DDLs are not transactional, changes to the
user table and to the gtid_slave_pos table are not guaranteed to be
synchronous. To fix the issue, the Gtid_IO_Pos value from the SHOW SLAVE
STATUS command is used instead to verify the correctness of the performance
schema specific Last_Seen_Transaction column.

Fix2:
====
On error, the worker thread information is preserved in a backup pool. Access
to this backup pool must be protected by the 'LOCK_rpl_thread_pool' mutex, so
that a concurrent START SLAVE cannot destroy the backup pool while it is
being queried by the performance schema.

Fix3:
====
When the performance schema table is queried while a worker is waiting for
events, it currently returns only the difference between current_time and
start_time. This is incorrect; it should return
worker_idle_time + (current_time - start_time). For example, a worker thread
was idle for 10 seconds and then received events to process. Once it has
applied them it goes back to the idle state; if the P_S table is queried now,
it should return the current idle time plus the accumulated worker_idle_time.
---
 .../rpl_perfschema_applier_status_by_worker.result |  4 +++-
 .../t/rpl_perfschema_applier_status_by_worker.test | 14 +++++++-------
 sql/rpl_parallel.h                                 |  2 +-
 .../table_replication_applier_status_by_worker.cc  |  9 ++++++---
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/mysql-test/suite/rpl/r/rpl_perfschema_applier_status_by_worker.result b/mysql-test/suite/rpl/r/rpl_perfschema_applier_status_by_worker.result
index a2d336ddad7..268ab0104a6 100644
--- a/mysql-test/suite/rpl/r/rpl_perfschema_applier_status_by_worker.result
+++ b/mysql-test/suite/rpl/r/rpl_perfschema_applier_status_by_worker.result
@@ -22,9 +22,11 @@ include/assert.inc [Service_State should be "ON" on a fresh slave server.]
 include/assert.inc [Last_Seen_Transaction should show "" if no transaction applierd]
 connection master;
 CREATE TABLE t1 (a INT) ENGINE=InnoDB;
+include/save_master_gtid.inc
 connection slave;
+include/sync_with_master_gtid.inc
 include/assert.inc [Channel_name must be slave1]
-include/assert.inc [Last_Seen_Transaction should show 0-1-1.]
+include/assert.inc [Last_Seen_Transaction should show 0-1-1]
 include/assert.inc [Value returned by SSS and PS table for Last_Error_Number should be same.]
 include/assert.inc [Value returned by SSS and PS table for Last_Error_Message should both be empty.]
 include/assert.inc [Value returned by PS table for Last_Error_Timestamp should be 0000-00-00 00:00:00.]
diff --git a/mysql-test/suite/rpl/t/rpl_perfschema_applier_status_by_worker.test b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status_by_worker.test
index 16e33708c55..857979d00c4 100644
--- a/mysql-test/suite/rpl/t/rpl_perfschema_applier_status_by_worker.test
+++ b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status_by_worker.test
@@ -29,7 +29,7 @@
 # STOP SLAVE.
 #
 # ==== Related Bugs and Worklogs ====
-# 
+#
 # MDEV-20220: Merge 5.7 P_S replication table 'replication_applier_status_by_worker
 #
 --source include/have_perfschema.inc
@@ -83,10 +83,10 @@ source include/assert.inc;
 
 --connection master
 CREATE TABLE t1 (a INT) ENGINE=InnoDB;
---save_master_pos
+--source include/save_master_gtid.inc
 
---connection slave
---sync_with_master 0,'slave1'
+connection slave;
+--source include/sync_with_master_gtid.inc
 
 let $ps_value= query_get_value(select channel_name from performance_schema.replication_applier_status_by_worker, channel_name, 1);
 let $assert_text= Channel_name must be slave1;
@@ -94,9 +94,9 @@ let $assert_cond= "$ps_value"= "slave1";
 source include/assert.inc;
 
 let $ps_value= query_get_value(select Last_Seen_Transaction from performance_schema.replication_applier_status_by_worker, Last_Seen_Transaction, 1);
-let $slave_gtid_pos= `SELECT @@gtid_slave_pos`;
-let $assert_text= Last_Seen_Transaction should show $slave_gtid_pos.;
-let $assert_cond= "$ps_value" = "$slave_gtid_pos";
+let $sss_value= query_get_value(SHOW SLAVE STATUS, Gtid_IO_Pos, 1);
+let $assert_text= Last_Seen_Transaction should show $sss_value;
+let $assert_cond= "$ps_value" = "$sss_value";
 source include/assert.inc;
 
 let $sss_value= query_get_value(SHOW SLAVE STATUS, Last_SQL_Errno, 1);
diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h
index cc7795b4b0d..d3c46301ff8 100644
--- a/sql/rpl_parallel.h
+++ b/sql/rpl_parallel.h
@@ -187,7 +187,7 @@ struct rpl_parallel_thread {
   ulonglong get_worker_idle_time()
   {
     if (start_time)
-      return compute_time_lapsed();
+      return (worker_idle_time + compute_time_lapsed());
     else
       return worker_idle_time;
   }
diff --git a/storage/perfschema/table_replication_applier_status_by_worker.cc b/storage/perfschema/table_replication_applier_status_by_worker.cc
index 6d7866bc318..bb7ec50a533 100644
--- a/storage/perfschema/table_replication_applier_status_by_worker.cc
+++ b/storage/perfschema/table_replication_applier_status_by_worker.cc
@@ -92,9 +92,9 @@ ha_rows table_replication_applier_status_by_worker::get_row_count()
 
 int table_replication_applier_status_by_worker::rnd_next(void)
 {
-  if (global_rpl_thread_pool.inited && global_rpl_thread_pool.count)
+  rpl_parallel_thread_pool *pool= &global_rpl_thread_pool;
+  if (pool->inited && pool->count)
   {
-    rpl_parallel_thread_pool *pool= &global_rpl_thread_pool;
     mysql_mutex_lock(&pool->LOCK_rpl_thread_pool);
     uint worker_count= pool->count;
     for (m_pos.set_at(&m_next_pos);
@@ -111,7 +111,8 @@ int table_replication_applier_status_by_worker::rnd_next(void)
   }
   else
   {
-    struct pool_bkp_for_pfs *bkp_pool= &global_rpl_thread_pool.pfs_bkp;
+    mysql_mutex_lock(&pool->LOCK_rpl_thread_pool);
+    struct pool_bkp_for_pfs *bkp_pool= &pool->pfs_bkp;
     if (bkp_pool->inited && bkp_pool->count)
     {
       for (m_pos.set_at(&m_next_pos);
@@ -121,9 +122,11 @@ int table_replication_applier_status_by_worker::rnd_next(void)
         rpl_parallel_thread *rpt= bkp_pool->rpl_thread_arr[m_pos.m_index];
         make_row(rpt);
         m_next_pos.set_after(&m_pos);
+        mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool);
         return 0;
       }
     }
+    mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool);
   }
   return HA_ERR_END_OF_FILE;
 }
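
Reviewer note, not part of the patch: below is a minimal standalone C++ sketch of
the cumulative idle-time accounting that Fix3 describes. All names here
(idle_time_tracker, start_idle, stop_idle, now) are hypothetical stand-ins for
illustration only; they are not the real rpl_parallel_thread members.

// Sketch of the Fix3 idle-time accounting, using illustrative names only.
#include <cstdint>
#include <iostream>

struct idle_time_tracker
{
  uint64_t worker_idle_time= 0; // idle seconds accumulated from past idle periods
  uint64_t start_time= 0;       // non-zero while the worker is currently idle

  void start_idle(uint64_t now) { start_time= now; }

  void stop_idle(uint64_t now)
  {
    worker_idle_time+= now - start_time; // fold the finished idle period in
    start_time= 0;
  }

  // What the P_S table should report: past idle time plus the still-running
  // idle period, if the worker is idle right now.
  uint64_t get_worker_idle_time(uint64_t now) const
  {
    if (start_time)
      return worker_idle_time + (now - start_time);
    return worker_idle_time;
  }
};

int main()
{
  idle_time_tracker w;
  w.start_idle(0);   // worker becomes idle at t=0
  w.stop_idle(10);   // events arrive at t=10: 10s of idle accumulated
  w.start_idle(15);  // idle again at t=15
  // Queried at t=18 the table should report 10 + 3 = 13 seconds, not just 3.
  std::cout << w.get_worker_idle_time(18) << "\n";
  return 0;
}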