1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-11-19 22:02:09 +03:00

fix(cmapi): MCOL-6091 CMAPI gives DMLProc only 10 seconds for a greceful stop

Fix default timeout for stop node, stop dml proc, shutdown Controller, put_config handler etc.
All places that could cause reducing dmlproc graceful stop timeout are fixed:
- cluster and node shutdown
- stop_dmlproc
- start_transaction
- put_config
- toggle_cluster_state
This commit is contained in:
mariadb-AlanMologorsky
2025-10-17 15:53:39 +03:00
committed by Alan Mologorsky
parent 63415f28d0
commit c5e3b847ab
8 changed files with 98 additions and 49 deletions

View File

@@ -114,7 +114,10 @@ MCS_BACKUP_MANAGER_SH = os.path.join(MCS_INSTALL_BIN, 'mcs_backup_manager.sh')
CMAPI_PORT = 8640 #TODO: use it in all places CMAPI_PORT = 8640 #TODO: use it in all places
CURRENT_NODE_CMAPI_URL = f'https://localhost:{CMAPI_PORT}' CURRENT_NODE_CMAPI_URL = f'https://localhost:{CMAPI_PORT}'
REQUEST_TIMEOUT: float = 30.0 REQUEST_TIMEOUT: float = 30.0
TRANSACTION_TIMEOUT: float = 300.0 # 5 minutes
DMLPROC_SHUTDOWN_TIMEOUT: float = 300.0 # 5 minutes, should be less then LONG_REQUEST_TIMEOUT
LONG_REQUEST_TIMEOUT: float = 400.0 # should be less than TRANSACTION_TIMEOUT
TRANSACTION_TIMEOUT: float = 600.0 # 10 minutes
# API version # API version
_version = '0.4.0' _version = '0.4.0'

View File

@@ -16,10 +16,18 @@ from mcs_node_control.models.node_status import NodeStatus
from pydantic import ValidationError from pydantic import ValidationError
from cmapi_server.constants import ( from cmapi_server.constants import (
CMAPI_PACKAGE_NAME, CMAPI_PORT, DEFAULT_MCS_CONF_PATH, CMAPI_PACKAGE_NAME,
DEFAULT_SM_CONF_PATH, EM_PATH_SUFFIX, MCS_BRM_CURRENT_PATH, MCS_EM_PATH, CMAPI_PORT,
MDB_CS_PACKAGE_NAME, MDB_SERVER_PACKAGE_NAME, REQUEST_TIMEOUT, DEFAULT_MCS_CONF_PATH,
S3_BRM_CURRENT_PATH, SECRET_KEY, DMLPROC_SHUTDOWN_TIMEOUT,
EM_PATH_SUFFIX,
MCS_BRM_CURRENT_PATH,
MCS_EM_PATH,
MDB_CS_PACKAGE_NAME,
MDB_SERVER_PACKAGE_NAME,
REQUEST_TIMEOUT,
S3_BRM_CURRENT_PATH,
SECRET_KEY,
) )
from cmapi_server.controllers.api_clients import NodeControllerClient from cmapi_server.controllers.api_clients import NodeControllerClient
from cmapi_server.controllers.error import APIError from cmapi_server.controllers.error import APIError
@@ -725,7 +733,7 @@ class ShutdownController:
req = cherrypy.request req = cherrypy.request
use_sudo = get_use_sudo(req.app.config) use_sudo = get_use_sudo(req.app.config)
request_body = cherrypy.request.json request_body = cherrypy.request.json
timeout = request_body.get('timeout', 0) timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
node_config = NodeConfig() node_config = NodeConfig()
try: try:
MCSProcessManager.stop_node( MCSProcessManager.stop_node(
@@ -894,7 +902,7 @@ class ClusterController:
request = cherrypy.request request = cherrypy.request
request_body = request.json request_body = request.json
timeout = request_body.get('timeout', None) timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
force = request_body.get('force', False) force = request_body.get('force', False)
config = request_body.get('config', DEFAULT_MCS_CONF_PATH) config = request_body.get('config', DEFAULT_MCS_CONF_PATH)
in_transaction = request_body.get('in_transaction', False) in_transaction = request_body.get('in_transaction', False)
@@ -904,7 +912,7 @@ class ClusterController:
with TransactionManager(): with TransactionManager():
response = ClusterHandler.shutdown(config, timeout) response = ClusterHandler.shutdown(config, timeout)
else: else:
response = ClusterHandler.shutdown(config) response = ClusterHandler.shutdown(config, timeout)
except CMAPIBasicError as err: except CMAPIBasicError as err:
raise_422_error(module_logger, func_name, err.message) raise_422_error(module_logger, func_name, err.message)
@@ -1594,7 +1602,7 @@ class NodeProcessController():
request = cherrypy.request request = cherrypy.request
request_body = request.json request_body = request.json
timeout = request_body.get('timeout', 10) timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
force = request_body.get('force', False) force = request_body.get('force', False)
if force: if force:

View File

@@ -16,7 +16,11 @@ from mcs_node_control.models.node_config import NodeConfig
from tracing.traced_session import get_traced_session from tracing.traced_session import get_traced_session
from cmapi_server.constants import ( from cmapi_server.constants import (
CMAPI_CONF_PATH, CMAPI_PORT, DEFAULT_MCS_CONF_PATH, REQUEST_TIMEOUT, CMAPI_CONF_PATH,
CMAPI_PORT,
DEFAULT_MCS_CONF_PATH,
DMLPROC_SHUTDOWN_TIMEOUT,
REQUEST_TIMEOUT,
) )
from cmapi_server.exceptions import CMAPIBasicError, exc_to_cmapi_error from cmapi_server.exceptions import CMAPIBasicError, exc_to_cmapi_error
from cmapi_server.controllers.api_clients import NodeControllerClient from cmapi_server.controllers.api_clients import NodeControllerClient
@@ -44,7 +48,7 @@ class ClusterAction(Enum):
def toggle_cluster_state( def toggle_cluster_state(
action: ClusterAction, config: str) -> dict: action: ClusterAction, config: str, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> dict:
"""Toggle the state of the cluster (start or stop). """Toggle the state of the cluster (start or stop).
:param action: The cluster action to perform. :param action: The cluster action to perform.
@@ -64,7 +68,7 @@ def toggle_cluster_state(
switch_node_maintenance(maintainance_flag) switch_node_maintenance(maintainance_flag)
update_revision_and_manager() update_revision_and_manager()
broadcast_new_config(config, distribute_secrets=True) broadcast_new_config(config, distribute_secrets=True, timeout=timeout)
class ClusterHandler: class ClusterHandler:
@@ -161,7 +165,7 @@ class ClusterHandler:
@staticmethod @staticmethod
def shutdown( def shutdown(
config: str = DEFAULT_MCS_CONF_PATH, timeout: Optional[int] = None config: str = DEFAULT_MCS_CONF_PATH, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT,
) -> dict: ) -> dict:
"""Method to stop the MCS Cluster. """Method to stop the MCS Cluster.
@@ -169,7 +173,7 @@ class ClusterHandler:
defaults to DEFAULT_MCS_CONF_PATH defaults to DEFAULT_MCS_CONF_PATH
:type config: str, optional :type config: str, optional
:param timeout: timeout in seconds to gracefully stop DMLProc, :param timeout: timeout in seconds to gracefully stop DMLProc,
defaults to None defaults to DMLPROC_SHUTDOWN_TIMEOUT
:type timeout: Optional[int], optional :type timeout: Optional[int], optional
:raises CMAPIBasicError: if no nodes in the cluster :raises CMAPIBasicError: if no nodes in the cluster
:return: start timestamp :return: start timestamp
@@ -180,7 +184,7 @@ class ClusterHandler:
'Cluster shutdown command called. Shutting down the cluster.' 'Cluster shutdown command called. Shutting down the cluster.'
) )
operation_start_time = str(datetime.now()) operation_start_time = str(datetime.now())
toggle_cluster_state(ClusterAction.STOP, config) toggle_cluster_state(ClusterAction.STOP, config, timeout=timeout)
logger.debug('Successfully finished shutting down the cluster.') logger.debug('Successfully finished shutting down the cluster.')
return {'timestamp': operation_start_time} return {'timestamp': operation_start_time}

View File

@@ -27,8 +27,15 @@ from cmapi_server.exceptions import CMAPIBasicError
requests.packages.urllib3.disable_warnings() # pylint: disable=no-member requests.packages.urllib3.disable_warnings() # pylint: disable=no-member
from cmapi_server.constants import ( from cmapi_server.constants import (
CMAPI_CONF_PATH, CMAPI_DEFAULT_CONF_PATH, DEFAULT_MCS_CONF_PATH, CMAPI_CONF_PATH,
DEFAULT_SM_CONF_PATH, LOCALHOSTS, _version CMAPI_DEFAULT_CONF_PATH,
DEFAULT_MCS_CONF_PATH,
DEFAULT_SM_CONF_PATH,
DMLPROC_SHUTDOWN_TIMEOUT,
LOCALHOSTS,
LONG_REQUEST_TIMEOUT,
TRANSACTION_TIMEOUT,
_version
) )
from cmapi_server.handlers.cej import CEJPasswordHandler from cmapi_server.handlers.cej import CEJPasswordHandler
from cmapi_server.managers.process import MCSProcessManager from cmapi_server.managers.process import MCSProcessManager
@@ -54,7 +61,7 @@ def start_transaction(
remove_nodes: Optional[list] = None, remove_nodes: Optional[list] = None,
optional_nodes: Optional[list] = None, optional_nodes: Optional[list] = None,
txn_id: Optional[int] = None, txn_id: Optional[int] = None,
timeout: float = 300.0 timeout: float = TRANSACTION_TIMEOUT
): ):
"""Start internal CMAPI transaction. """Start internal CMAPI transaction.
@@ -78,7 +85,7 @@ def start_transaction(
:param txn_id: id for transaction to start, defaults to None :param txn_id: id for transaction to start, defaults to None
:type txn_id: Optional[int], optional :type txn_id: Optional[int], optional
:param timeout: time in seconds for cmapi transaction lock before it ends :param timeout: time in seconds for cmapi transaction lock before it ends
automatically, defaults to 300 automatically, defaults to TRANSACTION_TIMEOUT
:type timeout: float, optional :type timeout: float, optional
:return: (success, txn_id, nodes) :return: (success, txn_id, nodes)
:rtype: tuple[bool, int, list[str]] :rtype: tuple[bool, int, list[str]]
@@ -315,8 +322,7 @@ def broadcast_new_config(
defaults to DEFAULT_SM_CONF_PATH defaults to DEFAULT_SM_CONF_PATH
:param test_mode: for test purposes, defaults to False TODO: remove :param test_mode: for test purposes, defaults to False TODO: remove
:param nodes: nodes list for config put, defaults to None :param nodes: nodes list for config put, defaults to None
:param timeout: timeout passing to gracefully stop DMLProc TODO: for next :param timeout: timeout passing to gracefully stop DMLProc process,
releases. Could affect all logic of broadcacting new config
:param distribute_secrets: flag to distribute secrets to nodes :param distribute_secrets: flag to distribute secrets to nodes
:param stateful_config_dict: stateful config update dict to distribute to nodes :param stateful_config_dict: stateful config update dict to distribute to nodes
:raises CMAPIBasicError: If Broadcasting config to nodes failed with errors :raises CMAPIBasicError: If Broadcasting config to nodes failed with errors
@@ -332,7 +338,7 @@ def broadcast_new_config(
headers = {'x-api-key': key} headers = {'x-api-key': key}
if stateful_config_dict: if stateful_config_dict:
body = { body = {
'timeout': 300, 'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout,
'stateful_config_dict': stateful_config_dict, 'stateful_config_dict': stateful_config_dict,
'only_stateful_config': True, 'only_stateful_config': True,
} }
@@ -348,7 +354,7 @@ def broadcast_new_config(
body = { body = {
'manager': root.find('./ClusterManager').text, 'manager': root.find('./ClusterManager').text,
'revision': root.find('./ConfigRevision').text, 'revision': root.find('./ConfigRevision').text,
'timeout': 300, 'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout,
'config': config_text, 'config': config_text,
'mcs_config_filename': cs_config_filename, 'mcs_config_filename': cs_config_filename,
'sm_config_filename': sm_config_filename, 'sm_config_filename': sm_config_filename,
@@ -386,7 +392,7 @@ def broadcast_new_config(
async with create_traced_async_session() as session: async with create_traced_async_session() as session:
try: try:
async with session.put( async with session.put(
url, headers=headers, json=body, ssl=False, timeout=120 url, headers=headers, json=body, ssl=False, timeout=LONG_REQUEST_TIMEOUT
) as response: ) as response:
resp_json = await response.json(encoding='utf-8') resp_json = await response.json(encoding='utf-8')
response.raise_for_status() response.raise_for_status()

View File

@@ -3,11 +3,18 @@ from __future__ import annotations
import logging import logging
import os.path import os.path
import socket import socket
import time
from time import sleep from time import sleep
import psutil import psutil
from cmapi_server.constants import ALL_MCS_PROGS, MCS_INSTALL_BIN, MCSProgs, ProgInfo from cmapi_server.constants import (
ALL_MCS_PROGS,
DMLPROC_SHUTDOWN_TIMEOUT,
MCS_INSTALL_BIN,
MCSProgs,
ProgInfo,
)
from cmapi_server.exceptions import CMAPIBasicError from cmapi_server.exceptions import CMAPIBasicError
from cmapi_server.process_dispatchers.base import BaseDispatcher from cmapi_server.process_dispatchers.base import BaseDispatcher
from cmapi_server.process_dispatchers.container import ContainerDispatcher from cmapi_server.process_dispatchers.container import ContainerDispatcher
@@ -238,32 +245,49 @@ class MCSProcessManager:
return True return True
@classmethod @classmethod
def _wait_for_DMLProc_stop(cls, timeout: int = 10) -> bool: def _wait_for_DMLProc_stop(cls, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> bool:
"""Waiting DMLProc process to stop. """Waiting DMLProc process to stop.
:param timeout: timeout to wait, defaults to 10 :param timeout: timeout to wait in seconds, defaults to DMLPROC_SHUTDOWN_TIMEOUT
:type timeout: int, optional :type timeout: int, optional
:return: True on success :return: True on success
:rtype: bool :rtype: bool
""" """
logging.info(f'Waiting for DMLProc to stop in {timeout} seconds') logging.info(f'Waiting for DMLProc to stop in {timeout} seconds')
dmlproc_stopped = False # Use a deadline-based loop with throttled logging to reduce noise.
while timeout > 0: deadline = time.monotonic() + max(1, int(timeout))
logging.info( LOG_INTERVAL = 30 # seconds
f'Waiting for DMLProc to stop. Seconds left {timeout}.' next_log_in = 0 # log immediately on first iteration
)
while True:
remaining = int(deadline - time.monotonic())
if remaining <= 0:
break
if not Process.check_process_alive('DMLProc'): if not Process.check_process_alive('DMLProc'):
logging.info('DMLProc gracefully stopped by DBRM command.') logging.info('DMLProc gracefully stopped by DBRM command.')
dmlproc_stopped = True return True
break
sleep(1) # Throttle waiting logs to roughly once every LOG_INTERVAL seconds
timeout -= 1 if next_log_in <= 0:
else: sleep_for = min(10, remaining)
logging.error( logging.info(
f'DMLProc did not stopped gracefully by DBRM command within ' (
f'{timeout} seconds. Will be stopped directly.' f'Waiting for DMLProc to stop. Seconds left ~{remaining}. '
) f'Sleeping {sleep_for} seconds before next check.'
return dmlproc_stopped )
)
next_log_in = LOG_INTERVAL
sleep_for = min(10, remaining)
sleep(sleep_for)
next_log_in -= sleep_for
logging.error(
'DMLProc didn\'t stop gracefully by DBRM command within '
f'{timeout} seconds. Will be stopped directly.'
)
return False
@classmethod @classmethod
def noop(cls, *args, **kwargs): def noop(cls, *args, **kwargs):
@@ -324,7 +348,7 @@ class MCSProcessManager:
@classmethod @classmethod
def stop( def stop(
cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = 10 cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT
) -> bool: ) -> bool:
"""Stop mcs process. """Stop mcs process.
@@ -455,7 +479,7 @@ class MCSProcessManager:
cls, cls,
is_primary: bool, is_primary: bool,
use_sudo: bool = True, use_sudo: bool = True,
timeout: int = 10, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT,
): ):
"""Stop mcs node processes. """Stop mcs node processes.
@@ -472,7 +496,7 @@ class MCSProcessManager:
# undefined behaviour when primary gone and then recovers (failover # undefined behaviour when primary gone and then recovers (failover
# triggered 2 times). # triggered 2 times).
for prog_name in cls._get_sorted_progs(is_primary=True, reverse=True): for prog_name in cls._get_sorted_progs(is_primary=True, reverse=True):
if not cls.stop(prog_name, is_primary, use_sudo): if not cls.stop(prog_name, is_primary, use_sudo, timeout=timeout):
logging.error(f'Process "{prog_name}" not stopped properly.') logging.error(f'Process "{prog_name}" not stopped properly.')
raise CMAPIBasicError(f'Error while stopping "{prog_name}"') raise CMAPIBasicError(f'Error while stopping "{prog_name}"')

View File

@@ -154,7 +154,6 @@ class MariaDBESRepoManager:
:raises CMAPIBasicError: no latest version matched with latest tested :raises CMAPIBasicError: no latest version matched with latest tested
:raises CMAPIBasicError: if request error :raises CMAPIBasicError: if request error
:return: latest MDB version matched with latest tested major :return: latest MDB version matched with latest tested major
:rtype: str
""" """
try: try:
# Download the keyring file # Download the keyring file
@@ -174,7 +173,7 @@ class MariaDBESRepoManager:
) )
latest_version_num = sorted(latest_version_nums, reverse=True)[0] latest_version_num = sorted(latest_version_nums, reverse=True)[0]
logging.debug( logging.debug(
'Succesfully got latest MBD version number: ' 'Succesfully got latest MDB version number: '
f'{latest_version_num}' f'{latest_version_num}'
) )
except requests.RequestException as exc: except requests.RequestException as exc:

View File

@@ -159,6 +159,7 @@ def stop(
# could affect put_config (helpers.py broadcast_config) operation # could affect put_config (helpers.py broadcast_config) operation
timeout = 0 timeout = 0
#TODO: bypass timeout here
resp = client.shutdown_cluster({'in_transaction': True}) resp = client.shutdown_cluster({'in_transaction': True})
return {'timestamp': start_time} return {'timestamp': start_time}

View File

@@ -19,8 +19,12 @@ from rich.table import Table
from cmapi_server.constants import ( from cmapi_server.constants import (
MCS_DATA_PATH, MCS_SECRETS_FILENAME, REQUEST_TIMEOUT, TRANSACTION_TIMEOUT, CMAPI_CONF_PATH,
CMAPI_CONF_PATH, CMAPI_PORT, CMAPI_PORT,
MCS_DATA_PATH,
MCS_SECRETS_FILENAME,
REQUEST_TIMEOUT,
TRANSACTION_TIMEOUT,
) )
from cmapi_server.controllers.api_clients import ( from cmapi_server.controllers.api_clients import (
AppControllerClient, ClusterControllerClient, NodeControllerClient AppControllerClient, ClusterControllerClient, NodeControllerClient