mirror of
				https://github.com/MariaDB/server.git
				synced 2025-11-03 14:33:32 +03:00 
			
		
		
		
	The problem was a race condition on shutdown -- when IM got shutdown request while a guarded mysqld is starting. In this case the Guardian thread tried to stop the mysqld, but might fail if the mysqld hadn't created pid-file so far. When this happened, the mysqld-monitor thread didn't stop, so the assert in Thread_registry happened. The fix is to make several attempts to stop mysqld if it is active.
		
			
				
	
	
		
			497 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			497 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/* Copyright (C) 2004 MySQL AB
 | 
						|
 | 
						|
   This program is free software; you can redistribute it and/or modify
 | 
						|
   it under the terms of the GNU General Public License as published by
 | 
						|
   the Free Software Foundation; version 2 of the License.
 | 
						|
 | 
						|
   This program is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
   GNU General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU General Public License
 | 
						|
   along with this program; if not, write to the Free Software
 | 
						|
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 | 
						|
 | 
						|
 | 
						|
#if defined(__GNUC__) && defined(USE_PRAGMA_IMPLEMENTATION)
 | 
						|
#pragma implementation
 | 
						|
#endif
 | 
						|
 | 
						|
#include "guardian.h"
 | 
						|
#include <string.h>
 | 
						|
#include <sys/types.h>
 | 
						|
#include <signal.h>
 | 
						|
 | 
						|
#include "instance.h"
 | 
						|
#include "instance_map.h"
 | 
						|
#include "log.h"
 | 
						|
#include "mysql_manager_error.h"
 | 
						|
#include "options.h"
 | 
						|
 | 
						|
 | 
						|
/*************************************************************************
 | 
						|
 {{{ Constructor & destructor.
 | 
						|
*************************************************************************/
 | 
						|
 | 
						|
/**
 | 
						|
  Guardian constructor.
 | 
						|
 | 
						|
  SYNOPSIS
 | 
						|
    Guardian()
 | 
						|
    thread_registry_arg
 | 
						|
    instance_map_arg
 | 
						|
 | 
						|
  DESCRIPTION
 | 
						|
    Nominal contructor intended for assigning references and initialize
 | 
						|
    trivial objects. Real initialization is made by init() method.
 | 
						|
*/
 | 
						|
 | 
						|
Guardian::Guardian(Thread_registry *thread_registry_arg,
 | 
						|
                   Instance_map *instance_map_arg)
 | 
						|
  :shutdown_requested(FALSE),
 | 
						|
  stopped(FALSE),
 | 
						|
  thread_registry(thread_registry_arg),
 | 
						|
  instance_map(instance_map_arg)
 | 
						|
{
 | 
						|
  pthread_mutex_init(&LOCK_guardian, 0);
 | 
						|
  pthread_cond_init(&COND_guardian, 0);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
Guardian::~Guardian()
 | 
						|
{
 | 
						|
  /*
 | 
						|
    NOTE: it's necessary to synchronize here, because Guiardian thread can be
 | 
						|
    still alive an hold the mutex (because it is detached and we have no
 | 
						|
    control over it).
 | 
						|
  */
 | 
						|
 | 
						|
  lock();
 | 
						|
  unlock();
 | 
						|
 | 
						|
  pthread_mutex_destroy(&LOCK_guardian);
 | 
						|
  pthread_cond_destroy(&COND_guardian);
 | 
						|
}
 | 
						|
 | 
						|
/*************************************************************************
 | 
						|
  }}}
 | 
						|
*************************************************************************/
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Send request to stop Guardian.
 | 
						|
 | 
						|
  SYNOPSIS
 | 
						|
    request_shutdown()
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::request_shutdown()
 | 
						|
{
 | 
						|
  stop_instances();
 | 
						|
 | 
						|
  lock();
 | 
						|
  shutdown_requested= TRUE;
 | 
						|
  unlock();
 | 
						|
 | 
						|
  ping();
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Process an instance.
 | 
						|
 | 
						|
  SYNOPSIS
 | 
						|
    process_instance()
 | 
						|
    instance  a pointer to the instance for processing
 | 
						|
 | 
						|
  MT-NOTE:
 | 
						|
    - the given instance must be locked before calling this operation;
 | 
						|
    - Guardian must be locked before calling this operation.
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::process_instance(Instance *instance)
 | 
						|
{
 | 
						|
  int restart_retry= 100;
 | 
						|
  time_t current_time= time(NULL);
 | 
						|
 | 
						|
  if (instance->get_state() == Instance::STOPPING)
 | 
						|
  {
 | 
						|
    /* This brach is executed during shutdown. */
 | 
						|
 | 
						|
    /* This returns TRUE if and only if an instance was stopped for sure. */
 | 
						|
    if (instance->is_crashed())
 | 
						|
    {
 | 
						|
      log_info("Guardian: '%s' stopped.",
 | 
						|
               (const char *) instance->get_name()->str);
 | 
						|
 | 
						|
      instance->set_state(Instance::STOPPED);
 | 
						|
    }
 | 
						|
    else if ((uint) (current_time - instance->last_checked) >=
 | 
						|
             instance->options.get_shutdown_delay())
 | 
						|
    {
 | 
						|
      log_info("Guardian: '%s' hasn't stopped within %d secs.",
 | 
						|
               (const char *) instance->get_name()->str,
 | 
						|
               (int) instance->options.get_shutdown_delay());
 | 
						|
 | 
						|
      instance->kill_mysqld(SIGKILL);
 | 
						|
 | 
						|
      log_info("Guardian: pretend that '%s' is killed.",
 | 
						|
               (const char *) instance->get_name()->str);
 | 
						|
 | 
						|
      instance->set_state(Instance::STOPPED);
 | 
						|
    }
 | 
						|
    else
 | 
						|
    {
 | 
						|
      log_info("Guardian: waiting for '%s' to stop (%d secs left).",
 | 
						|
               (const char *) instance->get_name()->str,
 | 
						|
               (int) (instance->options.get_shutdown_delay() -
 | 
						|
                      current_time + instance->last_checked));
 | 
						|
    }
 | 
						|
 | 
						|
    return;
 | 
						|
  }
 | 
						|
 | 
						|
  if (instance->is_mysqld_running())
 | 
						|
  {
 | 
						|
    /* The instance can be contacted  on it's port */
 | 
						|
 | 
						|
    /* If STARTING also check that pidfile has been created */
 | 
						|
    if (instance->get_state() == Instance::STARTING &&
 | 
						|
        instance->options.load_pid() == 0)
 | 
						|
    {
 | 
						|
      /* Pid file not created yet, don't go to STARTED state yet  */
 | 
						|
    }
 | 
						|
    else if (instance->get_state() != Instance::STARTED)
 | 
						|
    {
 | 
						|
      /* clear status fields */
 | 
						|
      log_info("Guardian: '%s' is running, set state to STARTED.",
 | 
						|
               (const char *) instance->options.instance_name.str);
 | 
						|
      instance->reset_stat();
 | 
						|
      instance->set_state(Instance::STARTED);
 | 
						|
    }
 | 
						|
  }
 | 
						|
  else
 | 
						|
  {
 | 
						|
    switch (instance->get_state()) {
 | 
						|
    case Instance::NOT_STARTED:
 | 
						|
      log_info("Guardian: starting '%s'...",
 | 
						|
               (const char *) instance->options.instance_name.str);
 | 
						|
 | 
						|
      /* NOTE: set state to STARTING _before_ start() is called. */
 | 
						|
      instance->set_state(Instance::STARTING);
 | 
						|
      instance->last_checked= current_time;
 | 
						|
 | 
						|
      instance->start_mysqld();
 | 
						|
 | 
						|
      return;
 | 
						|
 | 
						|
    case Instance::STARTED:     /* fallthrough */
 | 
						|
    case Instance::STARTING:    /* let the instance start or crash */
 | 
						|
      if (!instance->is_crashed())
 | 
						|
        return;
 | 
						|
 | 
						|
      instance->crash_moment= current_time;
 | 
						|
      instance->last_checked= current_time;
 | 
						|
      instance->set_state(Instance::JUST_CRASHED);
 | 
						|
      /* fallthrough -- restart an instance immediately */
 | 
						|
 | 
						|
    case Instance::JUST_CRASHED:
 | 
						|
      if (current_time - instance->crash_moment <= 2)
 | 
						|
      {
 | 
						|
        if (instance->is_crashed())
 | 
						|
        {
 | 
						|
          instance->start_mysqld();
 | 
						|
          log_info("Guardian: starting '%s'...",
 | 
						|
                   (const char *) instance->options.instance_name.str);
 | 
						|
        }
 | 
						|
      }
 | 
						|
      else
 | 
						|
        instance->set_state(Instance::CRASHED);
 | 
						|
 | 
						|
      return;
 | 
						|
 | 
						|
    case Instance::CRASHED:    /* just regular restarts */
 | 
						|
      if ((ulong) (current_time - instance->last_checked) <=
 | 
						|
          (ulong) Options::Main::monitoring_interval)
 | 
						|
        return;
 | 
						|
 | 
						|
      if (instance->restart_counter < restart_retry)
 | 
						|
      {
 | 
						|
        if (instance->is_crashed())
 | 
						|
        {
 | 
						|
          instance->start_mysqld();
 | 
						|
          instance->last_checked= current_time;
 | 
						|
 | 
						|
          log_info("Guardian: restarting '%s'...",
 | 
						|
                   (const char *) instance->options.instance_name.str);
 | 
						|
        }
 | 
						|
      }
 | 
						|
      else
 | 
						|
      {
 | 
						|
        log_info("Guardian: can not start '%s'. "
 | 
						|
                 "Abandoning attempts to (re)start it",
 | 
						|
                 (const char *) instance->options.instance_name.str);
 | 
						|
 | 
						|
        instance->set_state(Instance::CRASHED_AND_ABANDONED);
 | 
						|
      }
 | 
						|
 | 
						|
      return;
 | 
						|
 | 
						|
    case Instance::CRASHED_AND_ABANDONED:
 | 
						|
      return; /* do nothing */
 | 
						|
 | 
						|
    default:
 | 
						|
      DBUG_ASSERT(0);
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Main function of Guardian thread.
 | 
						|
 | 
						|
  SYNOPSIS
 | 
						|
    run()
 | 
						|
 | 
						|
  DESCRIPTION
 | 
						|
    Check for all guarded instances and restart them if needed.
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::run()
 | 
						|
{
 | 
						|
  struct timespec timeout;
 | 
						|
 | 
						|
  log_info("Guardian: started.");
 | 
						|
 | 
						|
  thread_registry->register_thread(&thread_info);
 | 
						|
 | 
						|
  /* Loop, until all instances were shut down at the end. */
 | 
						|
 | 
						|
  while (true)
 | 
						|
  {
 | 
						|
    Instance_map::Iterator instances_it(instance_map);
 | 
						|
    Instance *instance;
 | 
						|
    bool all_instances_stopped= TRUE;
 | 
						|
 | 
						|
    instance_map->lock();
 | 
						|
 | 
						|
    while ((instance= instances_it.next()))
 | 
						|
    {
 | 
						|
      instance->lock();
 | 
						|
 | 
						|
      if (!instance->is_guarded() ||
 | 
						|
          instance->get_state() == Instance::STOPPED)
 | 
						|
      {
 | 
						|
        instance->unlock();
 | 
						|
        continue;
 | 
						|
      }
 | 
						|
 | 
						|
      process_instance(instance);
 | 
						|
 | 
						|
      if (instance->get_state() != Instance::STOPPED)
 | 
						|
        all_instances_stopped= FALSE;
 | 
						|
 | 
						|
      instance->unlock();
 | 
						|
    }
 | 
						|
 | 
						|
    instance_map->unlock();
 | 
						|
 | 
						|
    lock();
 | 
						|
 | 
						|
    if (shutdown_requested && all_instances_stopped)
 | 
						|
    {
 | 
						|
      log_info("Guardian: all guarded mysqlds stopped.");
 | 
						|
 | 
						|
      stopped= TRUE;
 | 
						|
      unlock();
 | 
						|
      break;
 | 
						|
    }
 | 
						|
 | 
						|
    set_timespec(timeout, Options::Main::monitoring_interval);
 | 
						|
 | 
						|
    thread_registry->cond_timedwait(&thread_info, &COND_guardian,
 | 
						|
                                    &LOCK_guardian, &timeout);
 | 
						|
    unlock();
 | 
						|
  }
 | 
						|
 | 
						|
  log_info("Guardian: stopped.");
 | 
						|
 | 
						|
  /* Now, when the Guardian is stopped we can stop the IM. */
 | 
						|
 | 
						|
  thread_registry->unregister_thread(&thread_info);
 | 
						|
  thread_registry->request_shutdown();
 | 
						|
 | 
						|
  log_info("Guardian: finished.");
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Return the value of stopped flag.
 | 
						|
*/
 | 
						|
 | 
						|
bool Guardian::is_stopped()
 | 
						|
{
 | 
						|
  int var;
 | 
						|
 | 
						|
  lock();
 | 
						|
  var= stopped;
 | 
						|
  unlock();
 | 
						|
 | 
						|
  return var;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Wake up Guardian thread.
 | 
						|
 | 
						|
  MT-NOTE: though usually the mutex associated with condition variable should
 | 
						|
  be acquired before signalling the variable, here this is not needed.
 | 
						|
  Signalling under locked mutex is used to avoid lost signals. In the current
 | 
						|
  logic however locking mutex does not guarantee that the signal will not be
 | 
						|
  lost.
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::ping()
 | 
						|
{
 | 
						|
  pthread_cond_signal(&COND_guardian);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Prepare list of instances.
 | 
						|
 | 
						|
  SYNOPSIS
 | 
						|
    init()
 | 
						|
 | 
						|
  MT-NOTE: Instance Map must be locked before calling the operation.
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::init()
 | 
						|
{
 | 
						|
  Instance *instance;
 | 
						|
  Instance_map::Iterator iterator(instance_map);
 | 
						|
 | 
						|
  while ((instance= iterator.next()))
 | 
						|
  {
 | 
						|
    instance->lock();
 | 
						|
 | 
						|
    instance->reset_stat();
 | 
						|
    instance->set_state(Instance::NOT_STARTED);
 | 
						|
 | 
						|
    instance->unlock();
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  An internal method which is called at shutdown to unregister instances and
 | 
						|
  attempt to stop them if requested.
 | 
						|
 | 
						|
  SYNOPSIS
 | 
						|
    stop_instances()
 | 
						|
 | 
						|
  DESCRIPTION
 | 
						|
    Loops through the guarded_instances list and prepares them for shutdown.
 | 
						|
    For each instance we issue a stop command and change the state
 | 
						|
    accordingly.
 | 
						|
 | 
						|
  NOTE
 | 
						|
    Guardian object should be locked by the caller.
 | 
						|
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::stop_instances()
 | 
						|
{
 | 
						|
  static const int NUM_STOP_ATTEMPTS = 100;
 | 
						|
 | 
						|
  Instance_map::Iterator instances_it(instance_map);
 | 
						|
  Instance *instance;
 | 
						|
 | 
						|
  instance_map->lock();
 | 
						|
 | 
						|
  while ((instance= instances_it.next()))
 | 
						|
  {
 | 
						|
    instance->lock();
 | 
						|
 | 
						|
    if (!instance->is_guarded() ||
 | 
						|
        instance->get_state() == Instance::STOPPED)
 | 
						|
    {
 | 
						|
      instance->unlock();
 | 
						|
      continue;
 | 
						|
    }
 | 
						|
 | 
						|
    /*
 | 
						|
      If instance is running or was running (and now probably hanging),
 | 
						|
      request stop.
 | 
						|
    */
 | 
						|
 | 
						|
    if (instance->is_mysqld_running() ||
 | 
						|
        instance->get_state() == Instance::STARTED)
 | 
						|
    {
 | 
						|
      instance->set_state(Instance::STOPPING);
 | 
						|
      instance->last_checked= time(NULL);
 | 
						|
    }
 | 
						|
    else
 | 
						|
    {
 | 
						|
      /* Otherwise mark it as STOPPED. */
 | 
						|
      instance->set_state(Instance::STOPPED);
 | 
						|
    }
 | 
						|
 | 
						|
    /* Request mysqld to stop. */
 | 
						|
 | 
						|
    bool instance_stopped= FALSE;
 | 
						|
 | 
						|
    for (int cur_attempt= 0; cur_attempt < NUM_STOP_ATTEMPTS; ++cur_attempt)
 | 
						|
    {
 | 
						|
      if (!instance->kill_mysqld(SIGTERM))
 | 
						|
      {
 | 
						|
        instance_stopped= TRUE;
 | 
						|
        break;
 | 
						|
      }
 | 
						|
 | 
						|
      if (!instance->is_active())
 | 
						|
      {
 | 
						|
        instance_stopped= TRUE;
 | 
						|
        break;
 | 
						|
      }
 | 
						|
 | 
						|
      /* Sleep for 0.3 sec and check again. */
 | 
						|
 | 
						|
      my_sleep(300000);
 | 
						|
    }
 | 
						|
 | 
						|
    /*
 | 
						|
      Abort if we failed to stop mysqld instance. That should not happen,
 | 
						|
      but if it happened, we don't know what to do and prefer to have clear
 | 
						|
      failure with coredump.
 | 
						|
    */
 | 
						|
 | 
						|
    DBUG_ASSERT(instance_stopped);
 | 
						|
 | 
						|
    instance->unlock();
 | 
						|
  }
 | 
						|
 | 
						|
  instance_map->unlock();
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Lock Guardian.
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::lock()
 | 
						|
{
 | 
						|
  pthread_mutex_lock(&LOCK_guardian);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/**
 | 
						|
  Unlock Guardian.
 | 
						|
*/
 | 
						|
 | 
						|
void Guardian::unlock()
 | 
						|
{
 | 
						|
  pthread_mutex_unlock(&LOCK_guardian);
 | 
						|
}
 |