diff --git a/cmapi/cmapi_server/constants.py b/cmapi/cmapi_server/constants.py index 0e66e4eca..838fde7e7 100644 --- a/cmapi/cmapi_server/constants.py +++ b/cmapi/cmapi_server/constants.py @@ -114,7 +114,10 @@ MCS_BACKUP_MANAGER_SH = os.path.join(MCS_INSTALL_BIN, 'mcs_backup_manager.sh') CMAPI_PORT = 8640 #TODO: use it in all places CURRENT_NODE_CMAPI_URL = f'https://localhost:{CMAPI_PORT}' REQUEST_TIMEOUT: float = 30.0 -TRANSACTION_TIMEOUT: float = 300.0 # 5 minutes + +DMLPROC_SHUTDOWN_TIMEOUT: float = 300.0 # 5 minutes, should be less than LONG_REQUEST_TIMEOUT +LONG_REQUEST_TIMEOUT: float = 400.0 # should be less than TRANSACTION_TIMEOUT +TRANSACTION_TIMEOUT: float = 600.0 # 10 minutes # API version _version = '0.4.0' diff --git a/cmapi/cmapi_server/controllers/endpoints.py b/cmapi/cmapi_server/controllers/endpoints.py index 528fa1154..f6dba520d 100644 --- a/cmapi/cmapi_server/controllers/endpoints.py +++ b/cmapi/cmapi_server/controllers/endpoints.py @@ -16,10 +16,18 @@ from mcs_node_control.models.node_status import NodeStatus from pydantic import ValidationError from cmapi_server.constants import ( - CMAPI_PACKAGE_NAME, CMAPI_PORT, DEFAULT_MCS_CONF_PATH, - DEFAULT_SM_CONF_PATH, EM_PATH_SUFFIX, MCS_BRM_CURRENT_PATH, MCS_EM_PATH, - MDB_CS_PACKAGE_NAME, MDB_SERVER_PACKAGE_NAME, REQUEST_TIMEOUT, - S3_BRM_CURRENT_PATH, SECRET_KEY, + CMAPI_PACKAGE_NAME, + CMAPI_PORT, + DEFAULT_MCS_CONF_PATH, + DMLPROC_SHUTDOWN_TIMEOUT, + EM_PATH_SUFFIX, + MCS_BRM_CURRENT_PATH, + MCS_EM_PATH, + MDB_CS_PACKAGE_NAME, + MDB_SERVER_PACKAGE_NAME, + REQUEST_TIMEOUT, + S3_BRM_CURRENT_PATH, + SECRET_KEY, ) from cmapi_server.controllers.api_clients import NodeControllerClient from cmapi_server.controllers.error import APIError @@ -725,7 +733,7 @@ class ShutdownController: req = cherrypy.request use_sudo = get_use_sudo(req.app.config) request_body = cherrypy.request.json - timeout = request_body.get('timeout', 0) + timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT) node_config = 
NodeConfig() try: MCSProcessManager.stop_node( @@ -894,7 +902,7 @@ class ClusterController: request = cherrypy.request request_body = request.json - timeout = request_body.get('timeout', None) + timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT) force = request_body.get('force', False) config = request_body.get('config', DEFAULT_MCS_CONF_PATH) in_transaction = request_body.get('in_transaction', False) @@ -904,7 +912,7 @@ class ClusterController: with TransactionManager(): response = ClusterHandler.shutdown(config, timeout) else: - response = ClusterHandler.shutdown(config) + response = ClusterHandler.shutdown(config, timeout) except CMAPIBasicError as err: raise_422_error(module_logger, func_name, err.message) @@ -1594,7 +1602,7 @@ class NodeProcessController(): request = cherrypy.request request_body = request.json - timeout = request_body.get('timeout', 10) + timeout = request_body.get('timeout', DMLPROC_SHUTDOWN_TIMEOUT) force = request_body.get('force', False) if force: diff --git a/cmapi/cmapi_server/handlers/cluster.py b/cmapi/cmapi_server/handlers/cluster.py index d93fee043..d9dcf49b2 100644 --- a/cmapi/cmapi_server/handlers/cluster.py +++ b/cmapi/cmapi_server/handlers/cluster.py @@ -16,7 +16,11 @@ from mcs_node_control.models.node_config import NodeConfig from tracing.traced_session import get_traced_session from cmapi_server.constants import ( - CMAPI_CONF_PATH, CMAPI_PORT, DEFAULT_MCS_CONF_PATH, REQUEST_TIMEOUT, + CMAPI_CONF_PATH, + CMAPI_PORT, + DEFAULT_MCS_CONF_PATH, + DMLPROC_SHUTDOWN_TIMEOUT, + REQUEST_TIMEOUT, ) from cmapi_server.exceptions import CMAPIBasicError, exc_to_cmapi_error from cmapi_server.controllers.api_clients import NodeControllerClient @@ -44,7 +48,7 @@ class ClusterAction(Enum): def toggle_cluster_state( - action: ClusterAction, config: str) -> dict: + action: ClusterAction, config: str, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> dict: """Toggle the state of the cluster (start or stop). 
:param action: The cluster action to perform. @@ -64,7 +68,7 @@ def toggle_cluster_state( switch_node_maintenance(maintainance_flag) update_revision_and_manager() - broadcast_new_config(config, distribute_secrets=True) + broadcast_new_config(config, distribute_secrets=True, timeout=timeout) class ClusterHandler: @@ -161,7 +165,7 @@ class ClusterHandler: @staticmethod def shutdown( - config: str = DEFAULT_MCS_CONF_PATH, timeout: Optional[int] = None + config: str = DEFAULT_MCS_CONF_PATH, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT, ) -> dict: """Method to stop the MCS Cluster. @@ -169,7 +173,7 @@ class ClusterHandler: defaults to DEFAULT_MCS_CONF_PATH :type config: str, optional :param timeout: timeout in seconds to gracefully stop DMLProc, - defaults to None + defaults to DMLPROC_SHUTDOWN_TIMEOUT :type timeout: Optional[int], optional :raises CMAPIBasicError: if no nodes in the cluster :return: start timestamp @@ -180,7 +184,7 @@ class ClusterHandler: 'Cluster shutdown command called. Shutting down the cluster.' 
) operation_start_time = str(datetime.now()) - toggle_cluster_state(ClusterAction.STOP, config) + toggle_cluster_state(ClusterAction.STOP, config, timeout=timeout) logger.debug('Successfully finished shutting down the cluster.') return {'timestamp': operation_start_time} diff --git a/cmapi/cmapi_server/helpers.py b/cmapi/cmapi_server/helpers.py index c1adc6143..8b10b457c 100644 --- a/cmapi/cmapi_server/helpers.py +++ b/cmapi/cmapi_server/helpers.py @@ -27,8 +27,15 @@ from cmapi_server.exceptions import CMAPIBasicError requests.packages.urllib3.disable_warnings() # pylint: disable=no-member from cmapi_server.constants import ( - CMAPI_CONF_PATH, CMAPI_DEFAULT_CONF_PATH, DEFAULT_MCS_CONF_PATH, - DEFAULT_SM_CONF_PATH, LOCALHOSTS, _version + CMAPI_CONF_PATH, + CMAPI_DEFAULT_CONF_PATH, + DEFAULT_MCS_CONF_PATH, + DEFAULT_SM_CONF_PATH, + DMLPROC_SHUTDOWN_TIMEOUT, + LOCALHOSTS, + LONG_REQUEST_TIMEOUT, + TRANSACTION_TIMEOUT, + _version ) from cmapi_server.handlers.cej import CEJPasswordHandler from cmapi_server.managers.process import MCSProcessManager @@ -54,7 +61,7 @@ def start_transaction( remove_nodes: Optional[list] = None, optional_nodes: Optional[list] = None, txn_id: Optional[int] = None, - timeout: float = 300.0 + timeout: float = TRANSACTION_TIMEOUT ): """Start internal CMAPI transaction. 
@@ -78,7 +85,7 @@ def start_transaction( :param txn_id: id for transaction to start, defaults to None :type txn_id: Optional[int], optional :param timeout: time in seconds for cmapi transaction lock before it ends - automatically, defaults to 300 + automatically, defaults to TRANSACTION_TIMEOUT :type timeout: float, optional :return: (success, txn_id, nodes) :rtype: tuple[bool, int, list[str]] @@ -315,8 +322,7 @@ def broadcast_new_config( defaults to DEFAULT_SM_CONF_PATH :param test_mode: for test purposes, defaults to False TODO: remove :param nodes: nodes list for config put, defaults to None - :param timeout: timeout passing to gracefully stop DMLProc TODO: for next - releases. Could affect all logic of broadcacting new config + :param timeout: timeout passed to gracefully stop the DMLProc process :param distribute_secrets: flag to distribute secrets to nodes :param stateful_config_dict: stateful config update dict to distribute to nodes :raises CMAPIBasicError: If Broadcasting config to nodes failed with errors @@ -332,7 +338,7 @@ def broadcast_new_config( headers = {'x-api-key': key} if stateful_config_dict: body = { - 'timeout': 300, + 'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout, 'stateful_config_dict': stateful_config_dict, 'only_stateful_config': True, } @@ -348,7 +354,7 @@ def broadcast_new_config( body = { 'manager': root.find('./ClusterManager').text, 'revision': root.find('./ConfigRevision').text, - 'timeout': 300, + 'timeout': DMLPROC_SHUTDOWN_TIMEOUT if timeout is None else timeout, 'config': config_text, 'mcs_config_filename': cs_config_filename, 'sm_config_filename': sm_config_filename, @@ -386,7 +392,7 @@ def broadcast_new_config( async with create_traced_async_session() as session: try: async with session.put( - url, headers=headers, json=body, ssl=False, timeout=120 + url, headers=headers, json=body, ssl=False, timeout=LONG_REQUEST_TIMEOUT ) as response: resp_json = await response.json(encoding='utf-8') 
response.raise_for_status() diff --git a/cmapi/cmapi_server/managers/process.py b/cmapi/cmapi_server/managers/process.py index 8e9f7ddd4..1aa18a15f 100644 --- a/cmapi/cmapi_server/managers/process.py +++ b/cmapi/cmapi_server/managers/process.py @@ -3,11 +3,18 @@ from __future__ import annotations import logging import os.path import socket +import time from time import sleep import psutil -from cmapi_server.constants import ALL_MCS_PROGS, MCS_INSTALL_BIN, MCSProgs, ProgInfo +from cmapi_server.constants import ( + ALL_MCS_PROGS, + DMLPROC_SHUTDOWN_TIMEOUT, + MCS_INSTALL_BIN, + MCSProgs, + ProgInfo, +) from cmapi_server.exceptions import CMAPIBasicError from cmapi_server.process_dispatchers.base import BaseDispatcher from cmapi_server.process_dispatchers.container import ContainerDispatcher @@ -238,32 +245,49 @@ class MCSProcessManager: return True @classmethod - def _wait_for_DMLProc_stop(cls, timeout: int = 10) -> bool: + def _wait_for_DMLProc_stop(cls, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT) -> bool: """Waiting DMLProc process to stop. - :param timeout: timeout to wait, defaults to 10 + :param timeout: timeout to wait in seconds, defaults to DMLPROC_SHUTDOWN_TIMEOUT :type timeout: int, optional :return: True on success :rtype: bool """ logging.info(f'Waiting for DMLProc to stop in {timeout} seconds') - dmlproc_stopped = False - while timeout > 0: - logging.info( - f'Waiting for DMLProc to stop. Seconds left {timeout}.' - ) + # Use a deadline-based loop with throttled logging to reduce noise. 
+ deadline = time.monotonic() + max(1, int(timeout)) + LOG_INTERVAL = 30 # seconds + next_log_in = 0 # log immediately on first iteration + + while True: + remaining = int(deadline - time.monotonic()) + if remaining <= 0: + break + if not Process.check_process_alive('DMLProc'): logging.info('DMLProc gracefully stopped by DBRM command.') - dmlproc_stopped = True - break - sleep(1) - timeout -= 1 - else: - logging.error( - f'DMLProc did not stopped gracefully by DBRM command within ' - f'{timeout} seconds. Will be stopped directly.' - ) - return dmlproc_stopped + return True + + # Throttle waiting logs to roughly once every LOG_INTERVAL seconds + if next_log_in <= 0: + sleep_for = min(10, remaining) + logging.info( + ( + f'Waiting for DMLProc to stop. Seconds left ~{remaining}. ' + f'Sleeping {sleep_for} seconds before next check.' + ) + ) + next_log_in = LOG_INTERVAL + + sleep_for = min(10, remaining) + sleep(sleep_for) + next_log_in -= sleep_for + + logging.error( + 'DMLProc didn\'t stop gracefully by DBRM command within ' + f'{timeout} seconds. Will be stopped directly.' + ) + return False @classmethod def noop(cls, *args, **kwargs): @@ -324,7 +348,7 @@ class MCSProcessManager: @classmethod def stop( - cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = 10 + cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = DMLPROC_SHUTDOWN_TIMEOUT ) -> bool: """Stop mcs process. @@ -455,7 +479,7 @@ class MCSProcessManager: cls, is_primary: bool, use_sudo: bool = True, - timeout: int = 10, + timeout: int = DMLPROC_SHUTDOWN_TIMEOUT, ): """Stop mcs node processes. @@ -472,7 +496,7 @@ class MCSProcessManager: # undefined behaviour when primary gone and then recovers (failover # triggered 2 times). 
for prog_name in cls._get_sorted_progs(is_primary=True, reverse=True): - if not cls.stop(prog_name, is_primary, use_sudo): + if not cls.stop(prog_name, is_primary, use_sudo, timeout=timeout): logging.error(f'Process "{prog_name}" not stopped properly.') raise CMAPIBasicError(f'Error while stopping "{prog_name}"') diff --git a/cmapi/cmapi_server/managers/upgrade/repo.py b/cmapi/cmapi_server/managers/upgrade/repo.py index ce5c227d4..956edfaeb 100644 --- a/cmapi/cmapi_server/managers/upgrade/repo.py +++ b/cmapi/cmapi_server/managers/upgrade/repo.py @@ -154,7 +154,6 @@ class MariaDBESRepoManager: :raises CMAPIBasicError: no latest version matched with latest tested :raises CMAPIBasicError: if request error :return: latest MDB version matched with latest tested major - :rtype: str """ try: # Download the keyring file @@ -174,7 +173,7 @@ class MariaDBESRepoManager: ) latest_version_num = sorted(latest_version_nums, reverse=True)[0] logging.debug( - 'Succesfully got latest MBD version number: ' + 'Succesfully got latest MDB version number: ' f'{latest_version_num}' ) except requests.RequestException as exc: diff --git a/cmapi/mcs_cluster_tool/cluster_app.py b/cmapi/mcs_cluster_tool/cluster_app.py index 776555ca9..d7dfe7165 100644 --- a/cmapi/mcs_cluster_tool/cluster_app.py +++ b/cmapi/mcs_cluster_tool/cluster_app.py @@ -159,6 +159,7 @@ def stop( # could affect put_config (helpers.py broadcast_config) operation timeout = 0 + #TODO: bypass timeout here resp = client.shutdown_cluster({'in_transaction': True}) return {'timestamp': start_time} diff --git a/cmapi/mcs_cluster_tool/tools_commands.py b/cmapi/mcs_cluster_tool/tools_commands.py index 8a9d63209..7a64098b3 100644 --- a/cmapi/mcs_cluster_tool/tools_commands.py +++ b/cmapi/mcs_cluster_tool/tools_commands.py @@ -19,8 +19,12 @@ from rich.table import Table from cmapi_server.constants import ( - MCS_DATA_PATH, MCS_SECRETS_FILENAME, REQUEST_TIMEOUT, TRANSACTION_TIMEOUT, - CMAPI_CONF_PATH, CMAPI_PORT, + CMAPI_CONF_PATH, + 
CMAPI_PORT, + MCS_DATA_PATH, + MCS_SECRETS_FILENAME, + REQUEST_TIMEOUT, + TRANSACTION_TIMEOUT, ) from cmapi_server.controllers.api_clients import ( AppControllerClient, ClusterControllerClient, NodeControllerClient