You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-11-19 22:02:09 +03:00
fix(cmapi): MCOL-6091 CMAPI gives DMLProc only 10 seconds for a graceful stop
Fix the default timeout for stop node, stop DMLProc, shutdown controller, the put_config handler, etc. All places that could reduce the DMLProc graceful stop timeout are fixed: - cluster and node shutdown - stop_dmlproc - start_transaction - put_config - toggle_cluster_state
This commit is contained in:
committed by
Alan Mologorsky
parent
63415f28d0
commit
c5e3b847ab
@@ -114,7 +114,10 @@ MCS_BACKUP_MANAGER_SH = os.path.join(MCS_INSTALL_BIN, 'mcs_backup_manager.sh')
|
||||
CMAPI_PORT = 8640 #TODO: use it in all places
|
||||
CURRENT_NODE_CMAPI_URL = f'https://localhost:{CMAPI_PORT}'
|
||||
REQUEST_TIMEOUT: float = 30.0
|
||||
TRANSACTION_TIMEOUT: float = 300.0 # 5 minutes
|
||||
|
||||
DMLPROC_SHUTDOWN_TIMEOUT: float = 300.0 # 5 minutes, should be less than LONG_REQUEST_TIMEOUT
|
||||
LONG_REQUEST_TIMEOUT: float = 400.0 # should be less than TRANSACTION_TIMEOUT
|
||||
TRANSACTION_TIMEOUT: float = 600.0 # 10 minutes
|
||||
|
||||
# API version
|
||||
_version = '0.4.0'
|
||||
|
||||
@@ -16,10 +16,18 @@ from mcs_node_control.models.node_status import NodeStatus
|
||||
from pydantic import ValidationError
|
||||
|
||||
from cmapi_server.constants import (
|
||||
CMAPI_PACKAGE_NAME, CMAPI_PORT, DEFAULT_MCS_CONF_PATH,
|
||||
DEFAULT_SM_CONF_PATH, EM_PATH_SUFFIX, MCS_BRM_CURRENT_PATH, MCS_EM_PATH,
|
||||
MDB_CS_PACKAGE_NAME, MDB_SERVER_PACKAGE_NAME, REQUEST_TIMEOUT,
|
||||
S3_BRM_CURRENT_PATH, SECRET_KEY,
|
||||
CMAPI_PACKAGE_NAME,
|
||||
CMAPI_PORT,
|
||||
DEFAULT_MCS_CONF_PATH,
|
||||
DMLPROC_SHUTDOWN_TIMEOUT,
|
||||
EM_PATH_SUFFIX,
|
||||
MCS_BRM_CURRENT_PATH,
|
||||
MCS_EM_PATH,
|
||||
MDB_CS_PACKAGE_NAME,
|
||||
MDB_SERVER_PACKAGE_NAME,
|
||||
REQUEST_TIMEOUT,
|
||||
S3_BRM_CURRENT_PATH,
|
||||
SECRET_KEY,
|
||||
)
|
||||
from cmapi_server.controllers.api_clients import NodeControllerClient
|
||||
from cmapi_server.controllers.error import APIError
|
||||
@@ -725,7 +733,7 @@ class ShutdownController:
|
||||
req = cherrypy.request
|
||||
use_sudo = get_use_sudo(req.app.config)
|
||||
request_body = cherrypy.request.json
|
||||
timeout = request_body.get('timeout', 0)
|
||||
timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
|
||||
node_config = NodeConfig()
|
||||
try:
|
||||
MCSProcessManager.stop_node(
|
||||
@@ -894,7 +902,7 @@ class ClusterController:
|
||||
|
||||
request = cherrypy.request
|
||||
request_body = request.json
|
||||
timeout = request_body.get('timeout', None)
|
||||
timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
|
||||
force = request_body.get('force', False)
|
||||
config = request_body.get('config', DEFAULT_MCS_CONF_PATH)
|
||||
in_transaction = request_body.get('in_transaction', False)
|
||||
@@ -904,7 +912,7 @@ class ClusterController:
|
||||
with TransactionManager():
|
||||
response = ClusterHandler.shutdown(config, timeout)
|
||||
else:
|
||||
response = ClusterHandler.shutdown(config)
|
||||
response = ClusterHandler.shutdown(config, timeout)
|
||||
except CMAPIBasicError as err:
|
||||
raise_422_error(module_logger, func_name, err.message)
|
||||
|
||||
@@ -1594,7 +1602,7 @@ class NodeProcessController():
|
||||
|
||||
request = cherrypy.request
|
||||
request_body = request.json
|
||||
timeout = request_body.get('timeout', 10)
|
||||
timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT)
|
||||
force = request_body.get('force', False)
|
||||
|
||||
if force:
|
||||
|
||||
@@ -16,7 +16,11 @@ from mcs_node_control.models.node_config import NodeConfig
|
||||
from tracing.traced_session import get_traced_session
|
||||
|
||||
from cmapi_server.constants import (
|
||||
CMAPI_CONF_PATH, CMAPI_PORT, DEFAULT_MCS_CONF_PATH, REQUEST_TIMEOUT,
|
||||
CMAPI_CONF_PATH,
|
||||
CMAPI_PORT,
|
||||
DEFAULT_MCS_CONF_PATH,
|
||||
DMLPROC_SHUTDOWN_TIMEOUT,
|
||||
REQUEST_TIMEOUT,
|
||||
)
|
||||
from cmapi_server.exceptions import CMAPIBasicError, exc_to_cmapi_error
|
||||
from cmapi_server.controllers.api_clients import NodeControllerClient
|
||||
@@ -44,7 +48,7 @@ class ClusterAction(Enum):
|
||||
|
||||
|
||||
def toggle_cluster_state(
|
||||
action: ClusterAction, config: str) -> dict:
|
||||
action: ClusterAction, config: str, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> dict:
|
||||
"""Toggle the state of the cluster (start or stop).
|
||||
|
||||
:param action: The cluster action to perform.
|
||||
@@ -64,7 +68,7 @@ def toggle_cluster_state(
|
||||
|
||||
switch_node_maintenance(maintainance_flag)
|
||||
update_revision_and_manager()
|
||||
broadcast_new_config(config, distribute_secrets=True)
|
||||
broadcast_new_config(config, distribute_secrets=True, timeout=timeout)
|
||||
|
||||
|
||||
class ClusterHandler:
|
||||
@@ -161,7 +165,7 @@ class ClusterHandler:
|
||||
|
||||
@staticmethod
|
||||
def shutdown(
|
||||
config: str = DEFAULT_MCS_CONF_PATH, timeout: Optional[int] = None
|
||||
config: str = DEFAULT_MCS_CONF_PATH, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT,
|
||||
) -> dict:
|
||||
"""Method to stop the MCS Cluster.
|
||||
|
||||
@@ -169,7 +173,7 @@ class ClusterHandler:
|
||||
defaults to DEFAULT_MCS_CONF_PATH
|
||||
:type config: str, optional
|
||||
:param timeout: timeout in seconds to gracefully stop DMLProc,
|
||||
defaults to None
|
||||
defaults to DMLPROC_SHUTDOWN_TIMEOUT
|
||||
:type timeout: Optional[int], optional
|
||||
:raises CMAPIBasicError: if no nodes in the cluster
|
||||
:return: start timestamp
|
||||
@@ -180,7 +184,7 @@ class ClusterHandler:
|
||||
'Cluster shutdown command called. Shutting down the cluster.'
|
||||
)
|
||||
operation_start_time = str(datetime.now())
|
||||
toggle_cluster_state(ClusterAction.STOP, config)
|
||||
toggle_cluster_state(ClusterAction.STOP, config, timeout=timeout)
|
||||
logger.debug('Successfully finished shutting down the cluster.')
|
||||
return {'timestamp': operation_start_time}
|
||||
|
||||
|
||||
@@ -27,8 +27,15 @@ from cmapi_server.exceptions import CMAPIBasicError
|
||||
requests.packages.urllib3.disable_warnings() # pylint: disable=no-member
|
||||
|
||||
from cmapi_server.constants import (
|
||||
CMAPI_CONF_PATH, CMAPI_DEFAULT_CONF_PATH, DEFAULT_MCS_CONF_PATH,
|
||||
DEFAULT_SM_CONF_PATH, LOCALHOSTS, _version
|
||||
CMAPI_CONF_PATH,
|
||||
CMAPI_DEFAULT_CONF_PATH,
|
||||
DEFAULT_MCS_CONF_PATH,
|
||||
DEFAULT_SM_CONF_PATH,
|
||||
DMLPROC_SHUTDOWN_TIMEOUT,
|
||||
LOCALHOSTS,
|
||||
LONG_REQUEST_TIMEOUT,
|
||||
TRANSACTION_TIMEOUT,
|
||||
_version
|
||||
)
|
||||
from cmapi_server.handlers.cej import CEJPasswordHandler
|
||||
from cmapi_server.managers.process import MCSProcessManager
|
||||
@@ -54,7 +61,7 @@ def start_transaction(
|
||||
remove_nodes: Optional[list] = None,
|
||||
optional_nodes: Optional[list] = None,
|
||||
txn_id: Optional[int] = None,
|
||||
timeout: float = 300.0
|
||||
timeout: float = TRANSACTION_TIMEOUT
|
||||
):
|
||||
"""Start internal CMAPI transaction.
|
||||
|
||||
@@ -78,7 +85,7 @@ def start_transaction(
|
||||
:param txn_id: id for transaction to start, defaults to None
|
||||
:type txn_id: Optional[int], optional
|
||||
:param timeout: time in seconds for cmapi transaction lock before it ends
|
||||
automatically, defaults to 300
|
||||
automatically, defaults to TRANSACTION_TIMEOUT
|
||||
:type timeout: float, optional
|
||||
:return: (success, txn_id, nodes)
|
||||
:rtype: tuple[bool, int, list[str]]
|
||||
@@ -315,8 +322,7 @@ def broadcast_new_config(
|
||||
defaults to DEFAULT_SM_CONF_PATH
|
||||
:param test_mode: for test purposes, defaults to False TODO: remove
|
||||
:param nodes: nodes list for config put, defaults to None
|
||||
:param timeout: timeout passing to gracefully stop DMLProc TODO: for next
|
||||
releases. Could affect all logic of broadcacting new config
|
||||
:param timeout: timeout passed on to gracefully stop the DMLProc process
|
||||
:param distribute_secrets: flag to distribute secrets to nodes
|
||||
:param stateful_config_dict: stateful config update dict to distribute to nodes
|
||||
:raises CMAPIBasicError: If Broadcasting config to nodes failed with errors
|
||||
@@ -332,7 +338,7 @@ def broadcast_new_config(
|
||||
headers = {'x-api-key': key}
|
||||
if stateful_config_dict:
|
||||
body = {
|
||||
'timeout': 300,
|
||||
'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout,
|
||||
'stateful_config_dict': stateful_config_dict,
|
||||
'only_stateful_config': True,
|
||||
}
|
||||
@@ -348,7 +354,7 @@ def broadcast_new_config(
|
||||
body = {
|
||||
'manager': root.find('./ClusterManager').text,
|
||||
'revision': root.find('./ConfigRevision').text,
|
||||
'timeout': 300,
|
||||
'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout,
|
||||
'config': config_text,
|
||||
'mcs_config_filename': cs_config_filename,
|
||||
'sm_config_filename': sm_config_filename,
|
||||
@@ -386,7 +392,7 @@ def broadcast_new_config(
|
||||
async with create_traced_async_session() as session:
|
||||
try:
|
||||
async with session.put(
|
||||
url, headers=headers, json=body, ssl=False, timeout=120
|
||||
url, headers=headers, json=body, ssl=False, timeout=LONG_REQUEST_TIMEOUT
|
||||
) as response:
|
||||
resp_json = await response.json(encoding='utf-8')
|
||||
response.raise_for_status()
|
||||
|
||||
@@ -3,11 +3,18 @@ from __future__ import annotations
|
||||
import logging
|
||||
import os.path
|
||||
import socket
|
||||
import time
|
||||
from time import sleep
|
||||
|
||||
import psutil
|
||||
|
||||
from cmapi_server.constants import ALL_MCS_PROGS, MCS_INSTALL_BIN, MCSProgs, ProgInfo
|
||||
from cmapi_server.constants import (
|
||||
ALL_MCS_PROGS,
|
||||
DMLPROC_SHUTDOWN_TIMEOUT,
|
||||
MCS_INSTALL_BIN,
|
||||
MCSProgs,
|
||||
ProgInfo,
|
||||
)
|
||||
from cmapi_server.exceptions import CMAPIBasicError
|
||||
from cmapi_server.process_dispatchers.base import BaseDispatcher
|
||||
from cmapi_server.process_dispatchers.container import ContainerDispatcher
|
||||
@@ -238,32 +245,49 @@ class MCSProcessManager:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def _wait_for_DMLProc_stop(cls, timeout: int = 10) -> bool:
|
||||
def _wait_for_DMLProc_stop(cls, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> bool:
|
||||
"""Waiting DMLProc process to stop.
|
||||
|
||||
:param timeout: timeout to wait, defaults to 10
|
||||
:param timeout: timeout to wait in seconds, defaults to DMLPROC_SHUTDOWN_TIMEOUT
|
||||
:type timeout: int, optional
|
||||
:return: True on success
|
||||
:rtype: bool
|
||||
"""
|
||||
logging.info(f'Waiting for DMLProc to stop in {timeout} seconds')
|
||||
dmlproc_stopped = False
|
||||
while timeout > 0:
|
||||
logging.info(
|
||||
f'Waiting for DMLProc to stop. Seconds left {timeout}.'
|
||||
)
|
||||
# Use a deadline-based loop with throttled logging to reduce noise.
|
||||
deadline = time.monotonic() + max(1, int(timeout))
|
||||
LOG_INTERVAL = 30 # seconds
|
||||
next_log_in = 0 # log immediately on first iteration
|
||||
|
||||
while True:
|
||||
remaining = int(deadline - time.monotonic())
|
||||
if remaining <= 0:
|
||||
break
|
||||
|
||||
if not Process.check_process_alive('DMLProc'):
|
||||
logging.info('DMLProc gracefully stopped by DBRM command.')
|
||||
dmlproc_stopped = True
|
||||
break
|
||||
sleep(1)
|
||||
timeout -= 1
|
||||
else:
|
||||
logging.error(
|
||||
f'DMLProc did not stopped gracefully by DBRM command within '
|
||||
f'{timeout} seconds. Will be stopped directly.'
|
||||
)
|
||||
return dmlproc_stopped
|
||||
return True
|
||||
|
||||
# Throttle waiting logs to roughly once every LOG_INTERVAL seconds
|
||||
if next_log_in <= 0:
|
||||
sleep_for = min(10, remaining)
|
||||
logging.info(
|
||||
(
|
||||
f'Waiting for DMLProc to stop. Seconds left ~{remaining}. '
|
||||
f'Sleeping {sleep_for} seconds before next check.'
|
||||
)
|
||||
)
|
||||
next_log_in = LOG_INTERVAL
|
||||
|
||||
sleep_for = min(10, remaining)
|
||||
sleep(sleep_for)
|
||||
next_log_in -= sleep_for
|
||||
|
||||
logging.error(
|
||||
'DMLProc didn\'t stop gracefully by DBRM command within '
|
||||
f'{timeout} seconds. Will be stopped directly.'
|
||||
)
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def noop(cls, *args, **kwargs):
|
||||
@@ -324,7 +348,7 @@ class MCSProcessManager:
|
||||
|
||||
@classmethod
|
||||
def stop(
|
||||
cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = 10
|
||||
cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT
|
||||
) -> bool:
|
||||
"""Stop mcs process.
|
||||
|
||||
@@ -455,7 +479,7 @@ class MCSProcessManager:
|
||||
cls,
|
||||
is_primary: bool,
|
||||
use_sudo: bool = True,
|
||||
timeout: int = 10,
|
||||
timeout: int = DMLPROC_SHUTDOWN_TIMEOUT,
|
||||
):
|
||||
"""Stop mcs node processes.
|
||||
|
||||
@@ -472,7 +496,7 @@ class MCSProcessManager:
|
||||
# undefined behaviour when primary gone and then recovers (failover
|
||||
# triggered 2 times).
|
||||
for prog_name in cls._get_sorted_progs(is_primary=True, reverse=True):
|
||||
if not cls.stop(prog_name, is_primary, use_sudo):
|
||||
if not cls.stop(prog_name, is_primary, use_sudo, timeout=timeout):
|
||||
logging.error(f'Process "{prog_name}" not stopped properly.')
|
||||
raise CMAPIBasicError(f'Error while stopping "{prog_name}"')
|
||||
|
||||
|
||||
@@ -154,7 +154,6 @@ class MariaDBESRepoManager:
|
||||
:raises CMAPIBasicError: no latest version matched with latest tested
|
||||
:raises CMAPIBasicError: if request error
|
||||
:return: latest MDB version matched with latest tested major
|
||||
:rtype: str
|
||||
"""
|
||||
try:
|
||||
# Download the keyring file
|
||||
@@ -174,7 +173,7 @@ class MariaDBESRepoManager:
|
||||
)
|
||||
latest_version_num = sorted(latest_version_nums, reverse=True)[0]
|
||||
logging.debug(
|
||||
'Succesfully got latest MBD version number: '
|
||||
'Succesfully got latest MDB version number: '
|
||||
f'{latest_version_num}'
|
||||
)
|
||||
except requests.RequestException as exc:
|
||||
|
||||
@@ -159,6 +159,7 @@ def stop(
|
||||
# could affect put_config (helpers.py broadcast_config) operation
|
||||
timeout = 0
|
||||
|
||||
#TODO: bypass timeout here
|
||||
resp = client.shutdown_cluster({'in_transaction': True})
|
||||
return {'timestamp': start_time}
|
||||
|
||||
|
||||
@@ -19,8 +19,12 @@ from rich.table import Table
|
||||
|
||||
|
||||
from cmapi_server.constants import (
|
||||
MCS_DATA_PATH, MCS_SECRETS_FILENAME, REQUEST_TIMEOUT, TRANSACTION_TIMEOUT,
|
||||
CMAPI_CONF_PATH, CMAPI_PORT,
|
||||
CMAPI_CONF_PATH,
|
||||
CMAPI_PORT,
|
||||
MCS_DATA_PATH,
|
||||
MCS_SECRETS_FILENAME,
|
||||
REQUEST_TIMEOUT,
|
||||
TRANSACTION_TIMEOUT,
|
||||
)
|
||||
from cmapi_server.controllers.api_clients import (
|
||||
AppControllerClient, ClusterControllerClient, NodeControllerClient
|
||||
|
||||
Reference in New Issue
Block a user