1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-11-02 06:13:16 +03:00
Files
mariadb-columnstore-engine/cmapi/failover/config.py
mariadb-AlanMologorsky c86586c228 feat(cmapi,failover): MCOL-6006 Disable failover when shared storage not detected
- Add SharedStorageMonitor thread to periodically verify shared storage:
  * Writes a temp file to the shared location and validates MD5 from all nodes.
  * Skips nodes with unstable recent heartbeats; retries once; defers decision if any node is unreachable.
  * Updates a cluster-wide stateful flag (shared_storage_on) only on conclusive checks.
- New CMAPI endpoints:
  * PUT /cmapi/{ver}/cluster/check-shared-storage — orchestrates cross-node checks.
  * GET /cmapi/{ver}/node/check-shared-file — validates a given file’s MD5 on a node.
  * PUT /cmapi/{ver}/node/stateful-config — fast path to distribute stateful config updates.
- Introduce in-memory stateful config (AppStatefulConfig) with versioned flags (term/seq) and shared_storage_on flag:
  * Broadcast via helpers.broadcast_stateful_config and enhanced broadcast_new_config.
  * Config PUT is now validated with Pydantic models; supports stateful-only updates and set_mode requests.
- Failover behavior:
  * NodeMonitor keeps failover inactive when shared_storage_on is false or cluster size < 3.
  * Rebalancing DBRoots becomes a no-op when shared storage is OFF (safety guard).
- mcl status improvements: per-node 'state' (online/offline), better timeouts and error reporting.
- Routing/wiring: add dispatcher routes for new endpoints; add ClusterModeEnum.
- Tests: cover shared-storage monitor (unreachable nodes, HB-based skipping), node manipulation with shared storage ON/OFF, and server/config flows.
- Dependencies: add pydantic; minor cleanups and logging.
2025-10-01 21:10:34 +04:00

161 lines
4.8 KiB
Python

import configparser
import logging
import threading
from os.path import getmtime
import lxml
from cmapi_server.constants import DEFAULT_MCS_CONF_PATH, DEFAULT_SM_CONF_PATH
from mcs_node_control.models.node_config import NodeConfig
class Config:
    """Cached view of the cluster node lists stored in the MCS config file.

    The getters compare the config file's modification time with the one
    recorded at the last successful load and re-parse the file when they
    differ, so callers always see the most recently loaded valid values.

    All getters serialize on ``config_lock``. The lock is a class attribute,
    so it is shared by every ``Config`` instance. ``check_reload()`` and
    ``load_config()`` do not take the lock themselves.
    """

    config_file = ''

    # params read from the config file
    _desired_nodes = []
    _active_nodes = []
    _inactive_nodes = []
    _primary_node = ''
    _my_name = None    # derived from config file

    config_lock = threading.Lock()
    # mtime of the config file at the last successful load_config()
    last_mtime = 0
    # NOTE(review): not read anywhere in this class; presumably a shutdown
    # flag consumed elsewhere -- confirm before removing.
    die = False
    logger = None

    def __init__(self, config_file=DEFAULT_MCS_CONF_PATH):
        """Remember the config file path; the file is parsed lazily.

        :param config_file: path of the config file to watch
        """
        self.config_file = config_file
        self.logger = logging.getLogger()

    def getDesiredNodes(self):
        """Return the sorted list of nodes from the DesiredNodes section."""
        # ``with`` guarantees the lock is released even if check_reload()
        # raises (e.g. the config file cannot be stat'ed).
        with self.config_lock:
            self.check_reload()
            return self._desired_nodes

    def getActiveNodes(self):
        """Return the sorted list of nodes from the ActiveNodes section."""
        with self.config_lock:
            self.check_reload()
            return self._active_nodes

    def getInactiveNodes(self):
        """Return the sorted list of nodes from the InactiveNodes section."""
        with self.config_lock:
            self.check_reload()
            return self._inactive_nodes

    def getAllNodes(self):
        """Returns a 3-element tuple describing the status of all nodes.

        index 0 = all nodes in the cluster
        index 1 = all active nodes
        index 2 = all inactive nodes
        """
        with self.config_lock:
            self.check_reload()
            return (
                self._desired_nodes,
                self._active_nodes,
                self._inactive_nodes,
            )

    def getPrimaryNode(self):
        """Return the text of the PrimaryNode entry."""
        with self.config_lock:
            self.check_reload()
            return self._primary_node

    def check_reload(self):
        """Reload the config if the file's mtime differs from the cached one.

        Returns True if the mtime changed and a reload was attempted (even
        when load_config() rejected the file), False otherwise.

        :raises OSError: if the config file cannot be stat'ed
        """
        if self.last_mtime != getmtime(self.config_file):
            self.load_config()
            return True
        return False

    def who_am_I(self):
        """Return this node's name as listed in DesiredNodes, or None."""
        with self.config_lock:
            self.check_reload()
            return self._my_name

    def load_config(self):
        """Parse the config file and replace the cached values.

        The cached state is only replaced when the whole file has been
        validated; on any failure a warning is logged and the previously
        loaded values (and ``last_mtime``) are left untouched.

        Returns True on success, False if the file could not be parsed, has
        no DesiredNodes entries, or has no usable PrimaryNode entry.
        """
        try:
            node_config = NodeConfig()
            root = node_config.get_current_config_root(self.config_file)
            last_mtime = getmtime(self.config_file)
        except Exception:
            self.logger.warning(
                f'Failed to parse config file {self.config_file}.',
                exc_info=True
            )
            return False

        node_tmp = root.findall('./DesiredNodes/Node')
        if not node_tmp:
            self.logger.warning(
                f'The config file {self.config_file} is missing entries '
                'in the DesiredNodes section'
            )
            return False

        desired_nodes = [node.text for node in node_tmp]
        active_nodes = [
            node.text for node in root.findall('./ActiveNodes/Node')
        ]
        inactive_nodes = [
            node.text for node in root.findall('./InactiveNodes/Node')
        ]

        node_tmp = root.find('./PrimaryNode')
        # Keep the explicit ``is None`` test: an element without children is
        # falsy. ``.text`` is None for an empty element, so test truthiness
        # instead of calling len() on it.
        if node_tmp is None or not node_tmp.text:
            self.logger.warning(
                f'The config file {self.config_file} is missing a valid '
                'PrimaryNode entry'
            )
            return False
        primary_node = node_tmp.text

        # find my name in this cluster
        names = set(node_config.get_network_addresses_and_names())
        all_nodes = set(desired_nodes)
        intersection = all_nodes & names
        if len(intersection) > 1:
            my_name = intersection.pop()
            self.logger.warning(
                'This node has multiple names in the list of desired nodes, '
                'was it added more than once?  Some things may not work in '
                f'this configuration.  Using {my_name} as the name for this '
                'node.'
            )
        elif not intersection:
            self.logger.warning(
                'This node has no entry in the list of desired nodes.'
            )
            my_name = None
        else:
            # exactly one match
            my_name = intersection.pop()
            # handles the initial 0-node special case
            if my_name == '127.0.0.1':
                my_name = None

        self.logger.info(f'Loaded the config file, my name is {my_name}')

        desired_nodes.sort()
        active_nodes.sort()
        inactive_nodes.sort()

        # Publish the new state only after everything validated.
        self._desired_nodes = desired_nodes
        self._active_nodes = active_nodes
        self._inactive_nodes = inactive_nodes
        self._primary_node = primary_node
        self.last_mtime = last_mtime
        self._my_name = my_name
        return True