You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2026-01-06 08:21:10 +03:00
MCOL-6094: reset shmem locks before stopping workernode
This commit is contained in:
committed by
Leonid Fedorov
parent
a7496ac9d0
commit
48e14ed5e5
@@ -11,7 +11,7 @@ from time import sleep
|
||||
import psutil
|
||||
|
||||
from cmapi_server.constants import (
|
||||
IFLAG, LIBJEMALLOC_DEFAULT_PATH, MCS_INSTALL_BIN, ALL_MCS_PROGS
|
||||
IFLAG, LIBJEMALLOC_DEFAULT_PATH, MCS_INSTALL_BIN, ALL_MCS_PROGS,
|
||||
)
|
||||
from cmapi_server.exceptions import CMAPIBasicError
|
||||
from cmapi_server.process_dispatchers.base import BaseDispatcher
|
||||
@@ -219,6 +219,20 @@ class ContainerDispatcher(BaseDispatcher):
|
||||
service_proc = cls._get_proc_object(service)
|
||||
|
||||
if service == 'workernode':
|
||||
# Run pre-stop lock reset before saving BRM
|
||||
# These stale locks can occur if the controllernode couldn't stop correctly
|
||||
# and they cause mcs-savebrm.py to hang
|
||||
|
||||
logger.debug('Pre-stop: inspecting and resetting shmem locks.')
|
||||
prestop_path = os.path.join(MCS_INSTALL_BIN, 'mcs-prestop-workernode.sh')
|
||||
prestop_logpath = cls._create_mcs_process_logfile(
|
||||
'mcs-prestop-workernode.log'
|
||||
)
|
||||
with open(prestop_logpath, 'a', encoding='utf-8') as prestop_logfh:
|
||||
_success, _ = cls.exec_command(
|
||||
prestop_path, stdout=prestop_logfh
|
||||
)
|
||||
|
||||
# start mcs-savebrm.py before stopping workernode
|
||||
logger.debug('Waiting to save BRM.')
|
||||
savebrm_path = os.path.join(MCS_INSTALL_BIN, 'mcs-savebrm.py')
|
||||
@@ -289,6 +303,7 @@ class ContainerDispatcher(BaseDispatcher):
|
||||
|
||||
...TODO: for next releases. Additional error handling.
|
||||
"""
|
||||
stop_success = True
|
||||
if cls.is_service_running(service):
|
||||
# TODO: retry?
|
||||
stop_success = cls.stop(service, is_primary, use_sudo)
|
||||
|
||||
@@ -163,6 +163,7 @@ set(SHMEM_FILE_GLOB "MCS-shm-")
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py" @ONLY)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py" @ONLY)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcssavebrm.py" @ONLY)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-prestop-workernode.sh.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-prestop-workernode.sh" @ONLY)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog.in" "${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog" @ONLY)
|
||||
|
||||
columnstore_install_program(columnstore-post-install ${ENGINE_BINDIR})
|
||||
@@ -174,6 +175,7 @@ columnstore_install_program(columnstoreSyslogSetup.sh ${ENGINE_BINDIR})
|
||||
columnstore_install_program(mcs-stop-controllernode.sh ${ENGINE_BINDIR})
|
||||
columnstore_install_program(mcs-loadbrm.py ${ENGINE_BINDIR})
|
||||
columnstore_install_program(mcs-savebrm.py ${ENGINE_BINDIR})
|
||||
columnstore_install_program(mcs-prestop-workernode.sh ${ENGINE_BINDIR})
|
||||
columnstore_install_program(mariadb-columnstore-start.sh ${ENGINE_BINDIR})
|
||||
columnstore_install_program(mariadb-columnstore-stop.sh ${ENGINE_BINDIR})
|
||||
columnstore_install_program(loop_process_starter.sh ${ENGINE_BINDIR})
|
||||
|
||||
53
oam/install_scripts/mcs-prestop-workernode.sh.in
Normal file
53
oam/install_scripts/mcs-prestop-workernode.sh.in
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
# Pre-stop helper for mcs-workernode: inspect and reset BRM shmem locks
# to avoid mcs-savebrm.py hanging on locks.
#
# Exit status:
#   0 - inspection (and the reset attempt, if one was needed) has run
#   1 - one of the required helper binaries is missing or not executable
set -euo pipefail

LOG_DIR="/var/log/mariadb/columnstore"
LOG_FILE="${LOG_DIR}/prestop-workernode.log"
BIN_DIR='@ENGINE_BINDIR@'

SHMEM_LOCKS_BIN="${BIN_DIR}/mcs-shmem-locks"
RESET_LOCKS_BIN="${BIN_DIR}/reset_locks"

# Append one timestamped record to LOG_FILE.
#   $1    - level (INFO, WARN, ERROR)
#   $2... - message; backslash escapes such as \n are expanded by %b
# Logging is best-effort: under `set -e` a failed append (missing or
# read-only log directory) would otherwise abort the whole script before
# the locks are inspected, so the failure is tolerated here.
log() {
    local level="$1"; shift
    printf "%s [%s] %b\n" "$(date -Is)" "${level}" "$*" >> "${LOG_FILE}" || true
}

# The directory may already exist or be uncreatable; either way keep going.
mkdir -p "${LOG_DIR}" 2>/dev/null || true
log INFO "Pre-stop: checking BRM shmem locks before stopping workernode."

# This must not happen, but check that the binaries exist just in case
if [[ ! -x "${SHMEM_LOCKS_BIN}" ]]; then
    log ERROR "${SHMEM_LOCKS_BIN} not found; aborting."
    exit 1
fi

if [[ ! -x "${RESET_LOCKS_BIN}" ]]; then
    log ERROR "${RESET_LOCKS_BIN} not found; aborting."
    exit 1
fi

# Capture current lock state. A failing inspection is logged rather than
# silently treated as "no locks": its output is still recorded below.
if ! OUT="$("${SHMEM_LOCKS_BIN}" --lock-id 0 2>&1)"; then
    log WARN "${SHMEM_LOCKS_BIN} exited with an error; lock state may be incomplete."
fi
log INFO "Current lock state:\n${OUT}"

# Determine if any readers/writers are active: sum the values of every
# line that starts with "readers =" or "writers =" (leading blanks allowed).
ACTIVE_TOTAL="$(printf '%s\n' "${OUT}" | awk -F'=' '
    /^[[:space:]]*(readers|writers) =/ { gsub(/ /, ""); total += $2 }
    END { print total + 0 }
')"

if [[ "${ACTIVE_TOTAL}" -gt 0 ]]; then
    log WARN "Detected active shmem locks (sum readers+writers=${ACTIVE_TOTAL}). Attempting reset."
    # Keep the tool's output so that a failed reset can be diagnosed from the log.
    if ! RESET_OUT="$("${RESET_LOCKS_BIN}" -s 2>&1)"; then
        log ERROR "reset_locks failed to run.\n${RESET_OUT}"
    fi
    sleep 1
    OUT2="$("${SHMEM_LOCKS_BIN}" --lock-id 0 2>&1)" || true
    log INFO "Post-reset lock state:\n${OUT2}"
else
    log INFO "No active shmem locks detected."
fi

log INFO "Pre-stop lock inspection/reset finished."

exit 0
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ LimitCORE=@CORE_DUMPS@
|
||||
|
||||
Environment="@WORKERNODE_ALLOC_CONFIG@"
|
||||
ExecStart=@ENGINE_BINDIR@/workernode DBRM_Worker%i
|
||||
# systemd has no "ExecStopPre=" directive (unknown keys are ignored with a
# warning), so the pre-stop helper has to be hooked in via ExecStop=. Once it
# finishes, systemd terminates the remaining workernode process according to
# KillMode=, and the ExecStopPost= commands below run afterwards.
# The "-" prefix keeps the helper best-effort: a non-zero exit is not treated
# as a unit failure.
ExecStop=-@ENGINE_BINDIR@/mcs-prestop-workernode.sh
|
||||
ExecStopPost=@ENGINE_BINDIR@/mcs-savebrm.py
|
||||
ExecStopPost=/usr/bin/env bash -c "clearShm > /dev/null 2>&1"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user