mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-04-18 21:44:02 +03:00
* MCOL-5594: Interactive "mcs cluster stop" command for CMAPI. [add] NodeProcessController class to handle Node operations [add] two endpoints: stop_dmlproc (PUT) and is_process_running (GET) [add] NodeProcessController.put_stop_dmlproc method to separately stop DMLProc on the primary Node [add] NodeProcessController.get_process_running method to check whether the specified process is running or not [add] build_url function to helpers.py; it is needed to build URLs with query_params [add] MCSProcessManager.gracefully_stop_dmlproc method [add] MCSProcessManager.is_service_running method as a top-level wrapper for the same method in the dispatcher [fix] MCSProcessManager.stop by using the new gracefully_stop_dmlproc [add] interactive option and mode to the mcs cluster stop command [fix] requirements.txt: typer version set to 0.9.0, which supports a variety of features including "Annotated" [fix] requirements.txt: click version (8.1.3 -> 8.1.7) and typing-extensions (4.3.0 -> 4.8.0); these are dependencies of the typer package [fix] multiple minor formatting, docstring and comment issues * MCOL-5594: Add new CMAPI transaction manager. - [add] TransactionManager ContextDecorator to manage transactions with less code and in one place - [add] TransactionManager to the cli cluster stop command and to the API cluster shutdown command - [fix] id -> txn_id in ClusterHandler class - [fix] ClusterHandler.shutdown to be used inside an existing transaction - [add] docstrings in multiple places * MCOL-5594: Review fixes.
464 lines
16 KiB
Python
464 lines
16 KiB
Python
from __future__ import annotations
|
|
import logging
|
|
import os.path
|
|
import socket
|
|
from time import sleep
|
|
|
|
import psutil
|
|
|
|
from cmapi_server.exceptions import CMAPIBasicError
|
|
from cmapi_server.constants import MCS_INSTALL_BIN, ALL_MCS_PROGS
|
|
from cmapi_server.process_dispatchers.systemd import SystemdDispatcher
|
|
from cmapi_server.process_dispatchers.container import (
|
|
ContainerDispatcher
|
|
)
|
|
from mcs_node_control.models.dbrm import DBRM
|
|
from mcs_node_control.models.dbrm_socket import SOCK_TIMEOUT
|
|
from mcs_node_control.models.misc import get_workernodes
|
|
from mcs_node_control.models.process import Process
|
|
|
|
|
|
# Supported process dispatcher implementations, looked up by the
# dispatcher name passed to MCSProcessManager.detect().
PROCESS_DISPATCHERS = dict(
    systemd=SystemdDispatcher,
    # could be used in docker containers and OSes w/o systemd
    container=ContainerDispatcher,
)
# Programs that are left out of the program set built for non-primary nodes.
PRIMARY_PROGS = ('controllernode', 'DMLProc', 'DDLProc')
|
|
|
|
|
|
class MCSProcessManager:
    """Class to run process operations.

    e.g. re/-start or stop systemd services, run executable.

    All state lives on the class itself and every method is a classmethod.
    :meth:`detect` fills ``mcs_progs``, ``mcs_version_info``,
    ``dispatcher_name`` and ``process_dispatcher``; the other methods read
    those attributes.
    """
    # max number of connection attempts used by the ``_wait_for_*`` helpers
    CONTROLLER_MAX_RETRY = 30
    # detected programs: {program name: program info from ALL_MCS_PROGS}
    mcs_progs = {}
    # human readable MCS version guess, set by detect()
    mcs_version_info = None
    # name of the dispatcher in use (a key of PROCESS_DISPATCHERS)
    dispatcher_name = None
    # dispatcher class taken from PROCESS_DISPATCHERS, set by detect()
    process_dispatcher = None

    @classmethod
    def _get_prog_name(cls, name: str) -> str:
        """Get proper service name for systemd or non-systemd installations.

        :param name: service name
        :type name: str
        :return: correct service name
        :rtype: str
        """
        if cls.dispatcher_name == 'systemd':
            return ALL_MCS_PROGS[name].service_name
        return name

    @classmethod
    def _get_sorted_progs(
        cls, is_primary: bool, reverse: bool = False
    ) -> dict:
        """Get services dict in start or stop order.

        :param is_primary: is primary node or not; for a non-primary node
            programs listed in PRIMARY_PROGS are left out
        :type is_primary: bool
        :param reverse: build the stop sequence (ordered by
            ``stop_priority``) instead of the start sequence,
            defaults to False
        :type reverse: bool, optional
        :return: dict with services in correct start/stop order
        :rtype: dict
        """
        unsorted_progs: dict
        if is_primary:
            # NOTE: this is the shared class-level dict, not a copy
            unsorted_progs = cls.mcs_progs
        else:
            unsorted_progs = {
                prog_name: prog_info
                for prog_name, prog_info in cls.mcs_progs.items()
                if prog_name not in PRIMARY_PROGS
            }
        if reverse:
            # stop sequence builds using stop_priority property
            return dict(
                sorted(
                    unsorted_progs.items(),
                    key=lambda item: item[1].stop_priority,
                )
            )
        # start up sequence is a dict default sequence
        return unsorted_progs

    @classmethod
    def _detect_processes(cls) -> None:
        """Detect existing mcs services. Depends on MCS version.

        A program counts as installed when a file with its name exists in
        MCS_INSTALL_BIN. Calling this more than once only logs a warning
        and runs the detection again.
        """
        if cls.mcs_progs:
            logging.warning('Mcs ProcessHandler already detected processes.')

        for prog_name, prog_info in ALL_MCS_PROGS.items():
            if os.path.exists(os.path.join(MCS_INSTALL_BIN, prog_name)):
                cls.mcs_progs[prog_name] = prog_info

    @classmethod
    def detect(cls, dispatcher_name: str, dispatcher_path: str = None) -> None:
        """Detect mcs version info and installed processes.

        :param dispatcher_name: process dispatcher name
        :type dispatcher_name: str
        :param dispatcher_path: path to custom dispatcher,
                                for next releases, defaults to None
        :type dispatcher_path: str, optional
        :raises CMAPIBasicError: if custom dispatcher path doesn't exists
        :raises CMAPIBasicError: Not implemented custom dispatcher error
        """
        cls._detect_processes()
        # detect mcs version info by processes
        if len(cls.mcs_progs) == 8:
            cls.mcs_version_info = '6.4.x and lower'
        elif len(cls.mcs_progs) == 7 and 'ExeMgr' not in cls.mcs_progs:
            cls.mcs_version_info = '22.08.x and higher'
        else:
            cls.mcs_version_info = 'Undefined'
            logging.warning(
                'MCS version haven\'t been detected properly. '
                'Please try to update your CMAPI version or contact support.'
            )
        logging.info(
            f'Detected {len(cls.mcs_progs)} MCS services. '
            f'MCS version is {cls.mcs_version_info}'
        )
        # TODO: For next releases. Do we really need custom dispatchers?
        if dispatcher_name not in PROCESS_DISPATCHERS:
            logging.warning(
                f'Custom process dispatcher with name "{dispatcher_name}" '
                f'and path "{dispatcher_path}" used.'
            )
            if not dispatcher_path or not os.path.exists(dispatcher_path):
                err_msg = 'Wrong dispatcher path in cmapi_config file.'
                logging.error(err_msg)
                raise CMAPIBasicError(err_msg)
            cls.dispatcher_name = 'custom'
            raise CMAPIBasicError('Custom dispatchers yet not implemented!')

        cls.dispatcher_name = dispatcher_name
        cls.process_dispatcher = PROCESS_DISPATCHERS[dispatcher_name]
        cls.process_dispatcher.init()

    @classmethod
    def _wait_for_workernodes(cls) -> bool:
        """Wait for workernodes processes.

        Waiting for all workernodes to come up before starting
        controllernode on a primary.

        Each round tries a TCP connect to every workernode that has not
        answered yet; at most CONTROLLER_MAX_RETRY rounds are made.

        :return: True on success
        :rtype: bool
        """
        logging.debug(
            'Waiting for all workernodes to come up before starting '
            'controllernode on a primary.'
        )
        workernodes = get_workernodes()
        attempts = cls.CONTROLLER_MAX_RETRY
        while attempts > 0 and len(workernodes) > 0:
            logging.debug(f'Waiting for "{list(workernodes)}"....{attempts}')
            # creating a separated list with workernode names
            # for safe deleting items from source dict
            for name in list(workernodes):
                try:
                    # the context manager closes the socket on every path
                    # and never touches an unbound name if creating the
                    # socket itself fails
                    with socket.socket(
                        socket.AF_INET, socket.SOCK_STREAM
                    ) as sock:
                        sock.settimeout(SOCK_TIMEOUT)
                        sock.connect(
                            (
                                workernodes[name]['IPAddr'],
                                workernodes[name]['Port']
                            )
                        )
                # NOTE(review): only timeouts are treated as "not started
                # yet"; other connection errors (e.g. refused) propagate
                # to the caller - confirm this is intended.
                except socket.timeout:
                    logging.debug(
                        f'"{name}" {workernodes[name]["IPAddr"]}:'
                        f'{workernodes[name]["Port"]} not started yet.'
                    )
                else:
                    # delete started workernode from workernodes dict
                    del workernodes[name]
            attempts -= 1

        if workernodes:
            logging.error(
                f'Some workernodes: "{workernodes}" are not reachable after '
                f'{cls.CONTROLLER_MAX_RETRY} attempts to connect with '
                f'{SOCK_TIMEOUT} seconds timeout. '
                'Starting mcs-controllernode anyway.'
            )
            return False
        return True

    @classmethod
    def _wait_for_controllernode(cls) -> bool:
        """Waiting for controllernode to come up on a primary.

        Tries to open a DBRM connection up to CONTROLLER_MAX_RETRY times.

        :return: True on success
        :rtype: bool
        """
        logging.debug(
            'Waiting for controllernode to come up before starting '
            'ddlproc/dmlproc on non-primary nodes.'
        )
        attempts = cls.CONTROLLER_MAX_RETRY
        success = False
        while attempts > 0:
            try:
                with DBRM():
                    # check connection
                    success = True
            except (ConnectionRefusedError, RuntimeError, socket.error):
                logging.info(
                    'Cannot establish connection to controllernode. '
                    f'Controller node still not started. Waiting...{attempts}'
                )
            else:
                break
            attempts -= 1

        if not success:
            logging.error(
                'Controllernode is not reachable after '
                f'{cls.CONTROLLER_MAX_RETRY} attempts to connect with '
                f'{SOCK_TIMEOUT} seconds timeout. '
                'Starting mcs-dmlproc/mcs-ddlproc anyway.'
            )
            return False
        return True

    @classmethod
    def _wait_for_DMLProc_stop(cls, timeout: int = 10) -> bool:
        """Waiting DMLProc process to stop.

        Checks once per second whether DMLProc is still alive.

        :param timeout: timeout to wait, defaults to 10
        :type timeout: int, optional
        :return: True on success
        :rtype: bool
        """
        logging.info(f'Waiting for DMLProc to stop in {timeout} seconds')
        # count down on a copy so the failure message below can still
        # report the timeout that was actually requested
        seconds_left = timeout
        while seconds_left > 0:
            logging.info(
                f'Waiting for DMLProc to stop. Seconds left {seconds_left}.'
            )
            if not Process.check_process_alive('DMLProc'):
                logging.info('DMLProc gracefully stopped by DBRM command.')
                return True
            sleep(1)
            seconds_left -= 1
        logging.error(
            f'DMLProc did not stopped gracefully by DBRM command within '
            f'{timeout} seconds. Will be stopped directly.'
        )
        return False

    @classmethod
    def noop(cls, *args, **kwargs):
        """No operation. TODO: looks like useless.

        Delegates to the dispatcher ``noop``; all arguments are ignored.
        """
        cls.process_dispatcher.noop()

    @classmethod
    def gracefully_stop_dmlproc(cls) -> None:
        """Gracefully stop DMLProc using DBRM commands.

        Sets the SS_ROLLBACK and SS_SHUTDOWN_PENDING system states via DBRM.

        :raises ConnectionRefusedError: re-raised after logging
        :raises RuntimeError: re-raised after logging
        """
        logging.info(
            'Trying to gracefully stop DMLProc using DBRM commands.'
        )
        try:
            with DBRM() as dbrm:
                dbrm.set_system_state(
                    ['SS_ROLLBACK', 'SS_SHUTDOWN_PENDING']
                )
        except (ConnectionRefusedError, RuntimeError):
            logging.error(
                'Cannot set SS_ROLLBACK and SS_SHUTDOWN_PENDING via DBRM, '
                'graceful auto stop of DMLProc failed. '
                'Try a regular stop method.'
            )
            raise

    @classmethod
    def is_service_running(cls, name: str, use_sudo: bool = True) -> bool:
        """Check if MCS process is running.

        :param name: mcs process name
        :type name: str
        :param use_sudo: use sudo or not, defaults to True
        :type use_sudo: bool, optional
        :return: True if mcs process is running, otherwise False
        :rtype: bool
        """
        return cls.process_dispatcher.is_service_running(
            cls._get_prog_name(name), use_sudo
        )

    @classmethod
    def start(cls, name: str, is_primary: bool, use_sudo: bool) -> bool:
        """Start mcs process.

        :param name: mcs process name
        :type name: str
        :param is_primary: is node primary or not
        :type is_primary: bool
        :param use_sudo: use sudo or not
        :type use_sudo: bool
        :return: True if process started successfully
        :rtype: bool
        """
        return cls.process_dispatcher.start(
            cls._get_prog_name(name), is_primary, use_sudo
        )

    @classmethod
    def stop(
        cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = 10
    ) -> bool:
        """Stop mcs process.

        DMLProc on a primary node is first asked to stop via DBRM; the
        dispatcher stop is used when that request fails or DMLProc is
        still alive after ``timeout`` seconds. Every other process is
        stopped by the dispatcher directly.

        :param name: mcs process name
        :type name: str
        :param is_primary: is node primary or not
        :type is_primary: bool
        :param use_sudo: use sudo or not
        :type use_sudo: bool
        :param timeout: timeout for DMLProc gracefully stop using DBRM, seconds
        :type timeout: int
        :return: True if process stopped successfully
        :rtype: bool
        """
        # TODO: do we need here force stop DMLProc as a method argument?

        if is_primary and name == 'DMLProc':
            try:
                cls.gracefully_stop_dmlproc()
            except (ConnectionRefusedError, RuntimeError):
                # stop DMLProc using regular signals or systemd
                return cls.process_dispatcher.stop(
                    cls._get_prog_name(name), is_primary, use_sudo
                )
            # DMLProc gracefully stopped using DBRM commands otherwise
            # continue with a regular stop method
            if cls._wait_for_DMLProc_stop(timeout):
                return True
        return cls.process_dispatcher.stop(
            cls._get_prog_name(name), is_primary, use_sudo
        )

    @classmethod
    def restart(cls, name: str, is_primary: bool, use_sudo: bool) -> bool:
        """Restart mcs process.

        :param name: mcs process name
        :type name: str
        :param is_primary: is node primary or not
        :type is_primary: bool
        :param use_sudo: use sudo or not
        :type use_sudo: bool
        :return: True if process restarted successfully
        :rtype: bool
        """
        return cls.process_dispatcher.restart(
            cls._get_prog_name(name), is_primary, use_sudo
        )

    @classmethod
    def get_running_mcs_procs(cls) -> list[dict]:
        """Search for mcs processes.

        The method returns PIDs of MCS services in both container or systemd
        environments.

        :return: list of dicts with name and pid of mcs process
        :rtype: list[dict]
        """
        # use the values pre-fetched by process_iter instead of querying
        # each process again, which could fail for a process that exited
        # in the meantime
        return [
            {'name': proc.info['name'], 'pid': proc.info['pid']}
            for proc in psutil.process_iter(['pid', 'name'])
            if proc.info['name'] in cls.mcs_progs
        ]

    @classmethod
    def is_node_processes_ok(
        cls, is_primary: bool, node_stopped: bool
    ) -> bool:
        """Check if needed processes exists or not.

        :param is_primary: is node primary or not
        :type is_primary: bool
        :param node_stopped: is node stopped or started
        :type node_stopped: bool
        :return: True if there are expected value of processes, else False
        :rtype: bool

        ..NOTE: For next releases. Now only used in tests.
        """
        running_procs = cls.get_running_mcs_procs()
        if node_stopped:
            return len(running_procs) == 0
        node_progs = cls._get_sorted_progs(is_primary)
        return set(node_progs) == {p['name'] for p in running_procs}

    @classmethod
    def start_node(cls, is_primary: bool, use_sudo: bool = True):
        """Start mcs node processes.

        :param is_primary: is node primary or not
        :type is_primary: bool
        :param use_sudo: use sudo or not, defaults to True
        :type use_sudo: bool, optional
        :raises CMAPIBasicError: immediately if one mcs process not started
        """
        for prog_name in cls._get_sorted_progs(is_primary):
            if (
                cls.dispatcher_name == 'systemd'
                and prog_name == 'StorageManager'
            ):
                # TODO: MCOL-5458
                logging.info(
                    f'Skip starting {prog_name} with systemd dispatcher.'
                )
                continue
            # TODO: additional error handling
            # the results of the waits are ignored on purpose: the process
            # is started even if its dependencies did not come up
            if prog_name == 'controllernode':
                cls._wait_for_workernodes()
            if prog_name in ('DMLProc', 'DDLProc'):
                cls._wait_for_controllernode()
            if not cls.start(prog_name, is_primary, use_sudo):
                logging.error(f'Process "{prog_name}" not started properly.')
                raise CMAPIBasicError(f'Error while starting "{prog_name}".')

    @classmethod
    def stop_node(
        cls, is_primary: bool, use_sudo: bool = True, timeout: int = 10
    ):
        """Stop mcs node processes.

        :param is_primary: is node primary or not
        :type is_primary: bool
        :param use_sudo: use sudo or not, defaults to True
        :type use_sudo: bool, optional
        :param timeout: timeout for DMLProc gracefully stop using DBRM, seconds
        :type timeout: int
        :raises CMAPIBasicError: immediately if one mcs process not stopped
        """
        # Every time try to stop all processes no matter primary it or slave,
        # so use full available list of processes. Otherwise, it could cause
        # undefined behaviour when primary gone and then recovers (failover
        # triggered 2 times).
        for prog_name in cls._get_sorted_progs(True, reverse=True):
            # forward the caller's timeout so it is actually honoured
            if not cls.stop(prog_name, is_primary, use_sudo, timeout):
                logging.error(f'Process "{prog_name}" not stopped properly.')
                raise CMAPIBasicError(f'Error while stopping "{prog_name}"')

    @classmethod
    def restart_node(cls, is_primary: bool, use_sudo: bool):
        """TODO: For next releases.

        Stops the node if any mcs process is running, then starts it.
        """
        if cls.get_running_mcs_procs():
            cls.stop_node(is_primary, use_sudo)
        cls.start_node(is_primary, use_sudo)
|