1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2026-01-06 08:21:10 +03:00

MCOL-6094: reset shmem locks before stopping workernode

This commit is contained in:
Alexander Presnyakov
2025-08-22 12:52:51 +00:00
committed by Leonid Fedorov
parent a7496ac9d0
commit 48e14ed5e5
4 changed files with 72 additions and 1 deletions

View File

@@ -11,7 +11,7 @@ from time import sleep
import psutil
from cmapi_server.constants import (
IFLAG, LIBJEMALLOC_DEFAULT_PATH, MCS_INSTALL_BIN, ALL_MCS_PROGS
IFLAG, LIBJEMALLOC_DEFAULT_PATH, MCS_INSTALL_BIN, ALL_MCS_PROGS,
)
from cmapi_server.exceptions import CMAPIBasicError
from cmapi_server.process_dispatchers.base import BaseDispatcher
@@ -219,6 +219,20 @@ class ContainerDispatcher(BaseDispatcher):
service_proc = cls._get_proc_object(service)
if service == 'workernode':
# Run pre-stop lock reset before saving BRM
# These stale locks can occur if the controllernode couldn't stop correctly
# and they cause mcs-savebrm.py to hang
logger.debug('Pre-stop: inspecting and resetting shmem locks.')
prestop_path = os.path.join(MCS_INSTALL_BIN, 'mcs-prestop-workernode.sh')
prestop_logpath = cls._create_mcs_process_logfile(
'mcs-prestop-workernode.log'
)
with open(prestop_logpath, 'a', encoding='utf-8') as prestop_logfh:
_success, _ = cls.exec_command(
prestop_path, stdout=prestop_logfh
)
# start mcs-savebrm.py before stopping workernode
logger.debug('Waiting to save BRM.')
savebrm_path = os.path.join(MCS_INSTALL_BIN, 'mcs-savebrm.py')
@@ -289,6 +303,7 @@ class ContainerDispatcher(BaseDispatcher):
...TODO: for next releases. Additional error handling.
"""
stop_success = True
if cls.is_service_running(service):
# TODO: retry?
stop_success = cls.stop(service, is_primary, use_sudo)

View File

@@ -163,6 +163,7 @@ set(SHMEM_FILE_GLOB "MCS-shm-")
# Render the *.in templates into ready-to-install scripts in the source
# directory. @ONLY restricts substitution to @VAR@ references, so shell-style
# ${...} expansions inside the templates are left untouched.
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py" @ONLY)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py" @ONLY)
# The same mcs-savebrm.py.in template is rendered a second time under the
# name mcssavebrm.py.
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcssavebrm.py" @ONLY)
# Pre-stop helper that inspects/resets BRM shmem locks before workernode stops.
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-prestop-workernode.sh.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-prestop-workernode.sh" @ONLY)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog.in" "${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog" @ONLY)
# NOTE(review): columnstore_install_program is defined elsewhere; presumably it
# installs the named program into the given directory - confirm.
columnstore_install_program(columnstore-post-install ${ENGINE_BINDIR})
@@ -174,6 +175,7 @@ columnstore_install_program(columnstoreSyslogSetup.sh ${ENGINE_BINDIR})
# Helper scripts registered for ENGINE_BINDIR. mcs-prestop-workernode.sh must
# be listed here because both the cmapi container dispatcher and the
# workernode service unit invoke it from that directory.
# NOTE(review): columnstore_install_program is defined elsewhere; presumably it
# installs the named program into the given directory - confirm.
columnstore_install_program(mcs-stop-controllernode.sh ${ENGINE_BINDIR})
columnstore_install_program(mcs-loadbrm.py ${ENGINE_BINDIR})
columnstore_install_program(mcs-savebrm.py ${ENGINE_BINDIR})
columnstore_install_program(mcs-prestop-workernode.sh ${ENGINE_BINDIR})
columnstore_install_program(mariadb-columnstore-start.sh ${ENGINE_BINDIR})
columnstore_install_program(mariadb-columnstore-stop.sh ${ENGINE_BINDIR})
columnstore_install_program(loop_process_starter.sh ${ENGINE_BINDIR})

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env bash
#
# Pre-stop helper for mcs-workernode.
#
# Inspects the BRM shared-memory locks and resets them when any are held, so
# that mcs-savebrm.py does not hang on a stale lock while the node shuts down.

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Where this helper writes its own log.
LOG_DIR="/var/log/mariadb/columnstore"
LOG_FILE="${LOG_DIR}/prestop-workernode.log"

# Install directory of the engine binaries; the placeholder below is replaced
# when the build renders this template with configure_file.
BIN_DIR='@ENGINE_BINDIR@'

# Tools used to inspect and to reset the shmem locks.
SHMEM_LOCKS_BIN="${BIN_DIR}/mcs-shmem-locks"
RESET_LOCKS_BIN="${BIN_DIR}/reset_locks"
# log LEVEL MESSAGE...
#
# Append one line of the form "<ISO-8601 timestamp> [LEVEL] MESSAGE" to
# LOG_FILE. The message is rendered with printf's %b, so callers may embed
# escapes such as "\n" to produce multi-line entries.
#
# Logging is best-effort: if LOG_FILE cannot be appended to (for example the
# log directory could not be created), the line goes to stderr instead and the
# function still returns 0. Without this, a failed redirection would make the
# function return non-zero and `set -e` would abort the whole pre-stop script
# before any lock could be reset.
log() {
    # $1 - level, $2... - message
    local level="$1"; shift
    local stamp
    # A failing `date` leaves the timestamp empty rather than aborting.
    stamp="$(date -Is)" || true
    { printf '%s [%s] %b\n' "${stamp}" "${level}" "$*" >> "${LOG_FILE}"; } 2>/dev/null \
        || printf '%s [%s] %b\n' "${stamp}" "${level}" "$*" >&2 \
        || true
}
# Main flow: make sure the log directory exists, verify the helper binaries,
# capture the current lock state, and reset the locks only when some are held.

# Best-effort: a missing or unwritable log directory must not block shutdown.
mkdir -p "${LOG_DIR}" 2>/dev/null || true
log INFO "Pre-stop: checking BRM shmem locks before stopping workernode."

# This must not happen, but check that the binaries exist just in case
for required_bin in "${SHMEM_LOCKS_BIN}" "${RESET_LOCKS_BIN}"; do
    if [[ ! -x "${required_bin}" ]]; then
        log ERROR "${required_bin} not found; aborting."
        exit 1
    fi
done

# Capture current lock state. The binary path is quoted so a path containing
# whitespace or glob characters is not split; a non-zero exit is tolerated
# because the output is only used for inspection.
# NOTE(review): the exact meaning of "--lock-id 0" is defined by
# mcs-shmem-locks, not visible here - confirm it covers every lock of interest.
OUT="$("${SHMEM_LOCKS_BIN}" --lock-id 0 2>&1)" || true
log INFO "Current lock state:\n${OUT}"

# Determine if any readers/writers are active: sum the numeric value of every
# "readers = N" / "writers = N" line. gsub() strips the spaces from the record,
# which re-splits it on '=', so $2 is the count. printf is used instead of echo
# so captured output is passed through verbatim.
ACTIVE_TOTAL="$(printf '%s\n' "${OUT}" \
    | awk -F'=' '/^[[:space:]]*readers =|^[[:space:]]*writers =/ {gsub(/ /, ""); total += $2} END {print total + 0}')"

if [[ "${ACTIVE_TOTAL}" -gt 0 ]]; then
    log WARN "Detected active shmem locks (sum readers+writers=${ACTIVE_TOTAL}). Attempting reset."
    # A failed reset is logged but does not abort: the stop must still proceed.
    "${RESET_LOCKS_BIN}" -s >/dev/null 2>&1 || log ERROR "reset_locks failed to run."
    # Brief pause before re-reading the lock state for the log.
    sleep 1
    OUT2="$("${SHMEM_LOCKS_BIN}" --lock-id 0 2>&1)" || true
    log INFO "Post-reset lock state:\n${OUT2}"
else
    log INFO "No active shmem locks detected."
fi

log INFO "Pre-stop lock inspection/reset finished."
# Always report success once the inspection ran; the reset itself is best-effort.
exit 0

View File

@@ -13,6 +13,7 @@ LimitCORE=@CORE_DUMPS@
Environment="@WORKERNODE_ALLOC_CONFIG@"
ExecStart=@ENGINE_BINDIR@/workernode DBRM_Worker%i
# systemd has no ExecStopPre= directive (an unknown key is ignored with a
# warning, so the helper would never run). ExecStop= commands run first when
# the unit is stopped; any processes still running afterwards are terminated
# according to KillMode=. The leading "-" keeps a non-zero exit of the helper
# from marking the stop as failed, since the lock reset is best-effort.
ExecStop=-@ENGINE_BINDIR@/mcs-prestop-workernode.sh
ExecStopPost=@ENGINE_BINDIR@/mcs-savebrm.py
ExecStopPost=/usr/bin/env bash -c "clearShm > /dev/null 2>&1"