mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
Improved handling of marking processes as dead
Run ndb_mgmd as daemon. Make an extra attempt to check if processes are still alive.
This commit is contained in:
@@ -272,40 +272,17 @@ sub spawn_parent_impl {
|
|||||||
last;
|
last;
|
||||||
}
|
}
|
||||||
|
|
||||||
# If one of the mysqld processes died, we want to
|
# If one of the processes died, we want to
|
||||||
# mark this, and kill the mysqltest process.
|
# mark this, and kill the mysqltest process.
|
||||||
|
|
||||||
foreach my $idx (0..1)
|
mark_process_dead($ret_pid);
|
||||||
{
|
|
||||||
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
|
|
||||||
{
|
|
||||||
mtr_debug("child $ret_pid was master[$idx], " .
|
|
||||||
"exit during mysqltest run");
|
|
||||||
$::master->[$idx]->{'pid'}= 0;
|
|
||||||
last;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach my $idx (0..2)
|
|
||||||
{
|
|
||||||
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
|
|
||||||
{
|
|
||||||
mtr_debug("child $ret_pid was slave[$idx], " .
|
|
||||||
"exit during mysqltest run");
|
|
||||||
$::slave->[$idx]->{'pid'}= 0;
|
|
||||||
last;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
mtr_debug("waitpid() caught exit of unknown child $ret_pid, " .
|
|
||||||
"exit during mysqltest run");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( $ret_pid != $pid )
|
if ( $ret_pid != $pid )
|
||||||
{
|
{
|
||||||
# We terminated the waiting because a "mysqld" process died.
|
# We terminated the waiting because a "mysqld" process died.
|
||||||
# Kill the mysqltest process.
|
# Kill the mysqltest process.
|
||||||
|
mtr_verbose("Kill mysqltest because another process died");
|
||||||
kill(9,$pid);
|
kill(9,$pid);
|
||||||
|
|
||||||
$ret_pid= waitpid($pid,0);
|
$ret_pid= waitpid($pid,0);
|
||||||
@@ -639,13 +616,19 @@ sub mtr_check_stop_servers ($) {
|
|||||||
mtr_warning("couldn't delete $file");
|
mtr_warning("couldn't delete $file");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
$srv->{'pid'}= 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ( $errors )
|
if ( $errors )
|
||||||
{
|
{
|
||||||
# We are in trouble, just die....
|
# There were errors killing processes
|
||||||
mtr_error("we could not kill or clean up all processes");
|
# do one last attempt to ping the servers
|
||||||
|
# and if they can't be pinged, assume they are dead
|
||||||
|
if ( ! mtr_ping_with_timeout( \@$spec ) )
|
||||||
|
{
|
||||||
|
mtr_error("we could not kill or clean up all processes");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -773,6 +756,49 @@ sub mtr_ping_with_timeout($) {
|
|||||||
return $res;
|
return $res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Loop through our list of processes and look for an entry
|
||||||
|
# with the provided pid
|
||||||
|
# Set the pid of that process to 0 if found
|
||||||
|
#
|
||||||
|
sub mark_process_dead($)
|
||||||
|
{
|
||||||
|
my $ret_pid= shift;
|
||||||
|
|
||||||
|
foreach my $mysqld (@{$::master}, @{$::slave})
|
||||||
|
{
|
||||||
|
if ( $mysqld->{'pid'} eq $ret_pid )
|
||||||
|
{
|
||||||
|
mtr_verbose("$mysqld->{'type'} $mysqld->{'idx'} exited, pid: $ret_pid");
|
||||||
|
$mysqld->{'pid'}= 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach my $cluster (@{$::clusters})
|
||||||
|
{
|
||||||
|
if ( $cluster->{'pid'} eq $ret_pid )
|
||||||
|
{
|
||||||
|
mtr_verbose("$cluster->{'name'} cluster ndb_mgmd exited, pid: $ret_pid");
|
||||||
|
$cluster->{'pid'}= 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach my $ndbd (@{$cluster->{'ndbds'}})
|
||||||
|
{
|
||||||
|
if ( $ndbd->{'pid'} eq $ret_pid )
|
||||||
|
{
|
||||||
|
mtr_verbose("$cluster->{'name'} cluster ndbd exited, pid: $ret_pid");
|
||||||
|
$ndbd->{'pid'}= 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mtr_warning("mark_process_dead couldn't find an entry for pid: $ret_pid");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
#
|
#
|
||||||
# The operating system will keep information about dead children,
|
# The operating system will keep information about dead children,
|
||||||
@@ -789,45 +815,8 @@ sub mtr_record_dead_children () {
|
|||||||
# -1 or 0 means there are no more processes to wait for
|
# -1 or 0 means there are no more processes to wait for
|
||||||
while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1)
|
while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1)
|
||||||
{
|
{
|
||||||
mtr_warning("waitpid() caught exit of child $ret_pid");
|
mtr_warning("mtr_record_dead_children: $ret_pid");
|
||||||
foreach my $idx (0..1)
|
mark_process_dead($ret_pid);
|
||||||
{
|
|
||||||
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
|
|
||||||
{
|
|
||||||
mtr_warning("child $ret_pid was master[$idx]");
|
|
||||||
$::master->[$idx]->{'pid'}= 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach my $idx (0..2)
|
|
||||||
{
|
|
||||||
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
|
|
||||||
{
|
|
||||||
mtr_warning("child $ret_pid was slave[$idx]");
|
|
||||||
$::slave->[$idx]->{'pid'}= 0;
|
|
||||||
last;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach my $cluster (@{$::clusters})
|
|
||||||
{
|
|
||||||
if ( $cluster->{'pid'} eq $ret_pid )
|
|
||||||
{
|
|
||||||
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndb_mgmd");
|
|
||||||
$cluster->{'pid'}= 0;
|
|
||||||
last;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach my $ndbd (@{$cluster->{'ndbds'}})
|
|
||||||
{
|
|
||||||
if ( $ndbd->{'pid'} eq $ret_pid )
|
|
||||||
{
|
|
||||||
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndbd");
|
|
||||||
$ndbd->{'pid'}= 0;
|
|
||||||
last;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -843,7 +832,8 @@ sub start_reap_all {
|
|||||||
my $pid;
|
my $pid;
|
||||||
while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1)
|
while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1)
|
||||||
{
|
{
|
||||||
print "start_reap_all: pid: $pid.\n";
|
mtr_warning("start_reap_all pid: $pid");
|
||||||
|
mark_process_dead($pid);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -903,6 +893,7 @@ sub sleep_until_file_created ($$$) {
|
|||||||
# Check if it died after the fork() was successful
|
# Check if it died after the fork() was successful
|
||||||
if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid )
|
if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid )
|
||||||
{
|
{
|
||||||
|
mtr_warning("Process $pid died");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1687,6 +1687,7 @@ sub ndbcluster_wait_started($){
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
sub mysqld_wait_started($){
|
sub mysqld_wait_started($){
|
||||||
my $mysqld= shift;
|
my $mysqld= shift;
|
||||||
|
|
||||||
@@ -1706,6 +1707,7 @@ sub ndb_mgmd_start ($) {
|
|||||||
mtr_init_args(\$args);
|
mtr_init_args(\$args);
|
||||||
mtr_add_arg($args, "--no-defaults");
|
mtr_add_arg($args, "--no-defaults");
|
||||||
mtr_add_arg($args, "--core");
|
mtr_add_arg($args, "--core");
|
||||||
|
mtr_add_arg($args, "--nodaemon");
|
||||||
mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini");
|
mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini");
|
||||||
|
|
||||||
|
|
||||||
@@ -1716,9 +1718,23 @@ sub ndb_mgmd_start ($) {
|
|||||||
"",
|
"",
|
||||||
{ append_log_file => 1 });
|
{ append_log_file => 1 });
|
||||||
|
|
||||||
|
|
||||||
|
# FIXME Should not be needed
|
||||||
|
# Unfortunately the cluster nodes will fail to start
|
||||||
|
# if ndb_mgmd has not started properly
|
||||||
|
sleep(1);
|
||||||
|
|
||||||
|
# if (!sleep_until_file_created($cluster->{'path_pid'},
|
||||||
|
# 30, # Seconds
|
||||||
|
# $pid))
|
||||||
|
# {
|
||||||
|
# mtr_warning("Failed to start ndb_mgmd for $cluster->{'name'} cluster");
|
||||||
|
# return 1;
|
||||||
|
# }
|
||||||
|
|
||||||
# Remember pid of ndb_mgmd
|
# Remember pid of ndb_mgmd
|
||||||
$cluster->{'pid'}= $pid;
|
$cluster->{'pid'}= $pid;
|
||||||
mtr_verbose("ndb_mgmd_start, pid: $pid");
|
|
||||||
return $pid;
|
return $pid;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1774,19 +1790,6 @@ sub ndbcluster_start ($$) {
|
|||||||
|
|
||||||
my $pid= ndb_mgmd_start($cluster);
|
my $pid= ndb_mgmd_start($cluster);
|
||||||
|
|
||||||
# FIXME Should not be needed
|
|
||||||
# Unfortunately cluster will fail
|
|
||||||
# if ndb_mgmd has not started properly
|
|
||||||
# Wait for the ndb_mgmd pid file to be created
|
|
||||||
if (!sleep_until_file_created($cluster->{'path_pid'},
|
|
||||||
60,
|
|
||||||
$pid))
|
|
||||||
{
|
|
||||||
mtr_warning("Failed to start ndb_mgmd for $cluster->{'name'} cluster");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ )
|
for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ )
|
||||||
{
|
{
|
||||||
ndbd_start($cluster, $idx, $extra_args);
|
ndbd_start($cluster, $idx, $extra_args);
|
||||||
|
Reference in New Issue
Block a user