You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-08-01 06:46:55 +03:00
feat(save_brm)!: MCOL-5709: protect from S3/NFS IO errors (#3206)
* feat(save_brm)!: protect from S3/NFS IO errors
* feat(save_brm)!: future refactoring
* cleanup
* feat(save_brm)!: forgotten template
* feat(save-brm,ci)!: python3 package for rocky8

---------

Co-authored-by: Roman Nozdrin <roman.nozdrin@mariadb.com>
This commit is contained in:
@ -2,6 +2,7 @@
|
||||
import configparser
|
||||
import fcntl
|
||||
import json
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
@ -9,6 +10,7 @@ import ssl
|
||||
import struct
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import HTTPError, URLError
|
||||
@ -21,7 +23,11 @@ MCS_CONFIG_PATH = os.path.join(MCS_ETC_PATH, 'Columnstore.xml')
|
||||
SM_CONFIG_PATH = os.path.join(MCS_ETC_PATH, 'storagemanager.cnf')
|
||||
MCS_BIN_DIR = '@ENGINE_BINDIR@'
|
||||
SAVEBRM = os.path.join(MCS_BIN_DIR, 'save_brm')
|
||||
EM_FILE_SUFFIX = '_em'
|
||||
EM_FILE_SIZE_THRESHOLD = 1000
|
||||
HALF_A_MINUTE = 30
|
||||
NUMBER_OF_FILES_TO_KEEP = 40
|
||||
DEFAULT_EM_LOCAL_PATH_PREFIX = ''
|
||||
LOCALHOST = '127.0.0.1'
|
||||
# according to https://www.ibm.com/docs/en/storage-sentinel/1.1.2?topic=installation-map-your-local-host-loopback-address
|
||||
LOCALHOSTS = (
|
||||
@ -33,6 +39,8 @@ LOCALHOSTS = (
|
||||
)
|
||||
API_VERSION = '0.4.0'
|
||||
API_PORT = '8640'
|
||||
BRM_BACKUP_PATH = '/tmp/columnstore_tmp_files/rdwrscratch/'
|
||||
BRM_BACKUP_PATH_PART = '{}_BRM_saves'
|
||||
|
||||
|
||||
def get_api_key():
|
||||
@ -161,16 +169,13 @@ def is_node_primary(conf_root):
|
||||
success = True
|
||||
except HTTPError as exc:
|
||||
logging.warning(
|
||||
'Something goes wrong while requesting primary status ',
|
||||
'through api.',
|
||||
'Got response code "{}" with reason "{}".'.format(
|
||||
'Something goes wrong while requesting primary status through api. Got response code "{}" with reason "{}".'.format(
|
||||
exc.code, exc.reason
|
||||
)
|
||||
)
|
||||
except URLError:
|
||||
logging.warning(
|
||||
'CMAPI became unavailable while trying',
|
||||
'to request primary status.'
|
||||
'CMAPI became unavailable while trying to request primary status.'
|
||||
)
|
||||
except Exception:
|
||||
logging.error(
|
||||
@ -192,42 +197,191 @@ def is_node_primary(conf_root):
|
||||
return is_primary_fallback(conf_root.find('./DBRM_Controller/IPAddr').text)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
master_addr = ''
|
||||
pm_count = 0
|
||||
logging.basicConfig(
|
||||
format='%(levelname)s: %(message)s', level=logging.DEBUG
|
||||
)
|
||||
logging.debug('Loading Columnstore.xml.')
|
||||
def get_file_size(file_path):
    """Return the size of a file in bytes.

    :param file_path: path of the file to inspect
    :return: size in bytes, or None when ``os.path.getsize`` raises OSError
        (the error is logged)
    :rtype: int or None
    """
    try:
        return os.path.getsize(file_path)
    except OSError as e:
        logging.error('OSError in get_file_size(): {}.'.format(e))
        return None
|
||||
|
||||
|
||||
def em_is_empty(file_path_prefix):
    """Tell whether the EM file of a BRM save set is missing or too small.

    The EM file path is ``file_path_prefix + EM_FILE_SUFFIX``. The file is
    treated as empty when the prefix is None, when its size cannot be
    obtained (``get_file_size`` returned None), or when the size is below
    EM_FILE_SIZE_THRESHOLD bytes. An error is logged in each of these cases.

    :param file_path_prefix: path prefix of the saved BRM files, or None
    :return: True if the EM file is considered empty, False otherwise
    :rtype: bool
    """
    filesize = None
    if file_path_prefix is not None:
        # Only build the path when there is a prefix: None + str raises.
        filesize = get_file_size(file_path_prefix + EM_FILE_SUFFIX)

    # A None size (no prefix, or the stat failed) cannot be compared with an
    # int in Python 3, so it has to be tested explicitly.
    is_em_empty = filesize is None or filesize < EM_FILE_SIZE_THRESHOLD
    if is_em_empty:
        logging.error('EM file is none or its size {} is less than {} bytes.'.format(filesize, EM_FILE_SIZE_THRESHOLD))
    return is_em_empty
|
||||
|
||||
|
||||
def clean_up_backup_brm_files(save_brm_dir_path):
    """Remove old entries from the BRM backup directory.

    Directory entries are sorted by name in descending order; the first
    NUMBER_OF_FILES_TO_KEEP of them are kept and every remaining entry is
    removed. Failures are logged and never raised.

    NOTE(review): the count is per file, not per set of BRM files, and no
    check is made that a kept set has a non-empty EM file -- confirm that
    this matches the intended retention policy.

    :param save_brm_dir_path: directory holding the local BRM backups
    :rtype: None
    """
    try:
        filenames = sorted(os.listdir(save_brm_dir_path), reverse=True)
    except OSError as e:
        # A missing or unreadable directory must not abort the caller.
        logging.error('OSError exception happens listing {}: {}.'.format(save_brm_dir_path, e))
        return

    for filename in filenames[NUMBER_OF_FILES_TO_KEEP:]:
        file_path = os.path.join(save_brm_dir_path, filename)
        logging.debug('Clean up {}.'.format(file_path))
        try:
            os.remove(file_path)
        except OSError as e:
            logging.error('OSError exception happens removing {}: {}.'.format(file_path, e))
|
||||
|
||||
|
||||
def remove_files_by_prefix_if_exist(file_path_prefix):
    """Remove every file whose path starts with the given prefix.

    The prefix is matched literally (glob metacharacters in it are escaped).
    A failure to remove one file is logged and does not prevent the removal
    of the remaining ones.

    :param file_path_prefix: path prefix of the files to remove, or None
    :rtype: None
    """
    if file_path_prefix is None:
        # No exception is active here, so exc_info would only add noise.
        logging.error('file_path_prefix is None. Cannot remove files.')
        return

    for file_path in glob.glob(glob.escape(file_path_prefix) + '*'):
        try:
            os.remove(file_path)
        except OSError as e:
            logging.error(
                'Error removing file: {} - {}'.format(file_path, e.strerror),
                exc_info=True
            )
|
||||
|
||||
|
||||
def get_config_root_from_file(file_path):
    """Return the XML root element parsed from a config file.

    Any failure to read or parse the file is logged and reported as None so
    that the caller can continue without a config.

    :param file_path: xml config path
    :return: XML root element or None
    :rtype: Element or None
    """
    try:
        return ET.parse(file_path).getroot()
    except (OSError, ET.ParseError, AttributeError, ValueError):
        # OSError covers a missing/unreadable file; ET.ParseError (a
        # SyntaxError subclass) covers malformed XML.
        logging.error(
            'Exception while loading Columnstore.xml. Continue anyway.',
            exc_info=True
        )
        return None
|
||||
|
||||
logging.debug('Reading SM config.')
|
||||
sm_config = configparser.ConfigParser()
|
||||
files_read = len(sm_config.read(SM_CONFIG_PATH))
|
||||
storage = sm_config.get(
|
||||
'ObjectStorage', 'service', fallback='LocalStorage'
|
||||
)
|
||||
def get_epoch_prefix():
    """Return a backup name prefix built from the current epoch time.

    :return: the string ``backup_`` followed by the current time in whole
        seconds since the epoch
    :rtype: str
    """
    epoch_time = int(time.time())
    return 'backup_{}'.format(epoch_time)
|
||||
|
||||
|
||||
def get_save_brm_dir_path(a_mcs_config_root):
    """Return the directory used for local BRM backups.

    The path is the concatenation of the ``SystemConfig/SystemTempFileDir``
    and ``SystemConfig/hdfsRdwrScratch`` values from the given config. When
    the config is None, or either element is missing or empty, the default
    BRM_BACKUP_PATH is returned instead.

    :param a_mcs_config_root: Columnstore.xml root element, or None
    :return: directory path for local BRM backups
    :rtype: str
    """
    save_brm_dir_path = BRM_BACKUP_PATH
    if a_mcs_config_root is not None:
        try:
            system_temp_file_dir = a_mcs_config_root.find('./SystemConfig/SystemTempFileDir').text
            hdfs_rdwr_scratch = a_mcs_config_root.find('./SystemConfig/hdfsRdwrScratch').text
            # There is a danger to have no '/' in the end of
            # system_temp_file_dir or have two of them there. In both cases
            # save_brm will fail to store files locally.
            save_brm_dir_path = system_temp_file_dir + hdfs_rdwr_scratch
        except (AttributeError, TypeError):
            # AttributeError: find() returned None (element is missing).
            # TypeError: the element exists but is empty, so .text is None.
            logging.error('Exception while getting SystemTempFileDir and hdfsRdwrScratch from Columnstore.xml', exc_info=True)

    return save_brm_dir_path
|
||||
|
||||
|
||||
def get_save_brm_path_prefix(a_mcs_config_root):
    """Return a fresh path prefix for one local set of BRM backup files.

    The prefix is the local backup directory followed by a name derived from
    BRM_BACKUP_PATH_PART and the current epoch prefix.

    :param a_mcs_config_root: Columnstore.xml root element, or None
    :return: path prefix for the backup files
    :rtype: str
    """
    epoch_prefix = get_epoch_prefix()
    # The directory may already end with '/' (the default BRM_BACKUP_PATH
    # does); strip it so the joined path never contains a doubled separator.
    save_brm_dir_path = get_save_brm_dir_path(a_mcs_config_root).rstrip('/')
    return save_brm_dir_path + '/' + BRM_BACKUP_PATH_PART.format(epoch_prefix)
|
||||
|
||||
|
||||
def call_save_brm(path):
    """Run save_brm, optionally passing it a path prefix.

    Failures are logged and reported through the return value, never raised.

    :param path: path prefix handed to save_brm as its single argument; an
        empty string runs save_brm without arguments
    :return: ``path`` when save_brm exits with 0, None otherwise
    :rtype: str or None
    """
    # Kept only for the log message below.
    savebrm_cmd = SAVEBRM + ' ' + path
    # Pass an argument list instead of a shell string so that whitespace or
    # shell metacharacters in the path cannot alter the command.
    savebrm_args = [SAVEBRM]
    if path:
        savebrm_args.append(path)
    try:
        subprocess.check_call(savebrm_args)
    except subprocess.CalledProcessError as exc:
        logging.error('The call to {} exits with {}.'.format(savebrm_cmd, exc.returncode))
        return None
    except OSError:
        logging.error('Os error while calling savebrm', exc_info=True)
        return None
    return path
|
||||
|
||||
|
||||
def call_save_brm_locally(a_mcs_config_root):
    """Run save_brm against a freshly generated local backup path prefix.

    :param a_mcs_config_root: Columnstore.xml root element, or None
    :return: whatever ``call_save_brm`` returns for the generated prefix
    """
    return call_save_brm(get_save_brm_path_prefix(a_mcs_config_root))
|
||||
|
||||
|
||||
def call_save_brm_with_local_fallback(a_mcs_config_root):
    """Run save_brm and, if it fails, retry it with a local backup path.

    When the primary call exits with a non-zero status, save_brm is run
    again with a local backup path prefix and the process then exits with
    status 1 regardless of the backup outcome. An OSError on the primary
    call also exits with status 1.

    NOTE(review): block nesting was reconstructed from a view without
    indentation -- confirm the placement of the ``sys.exit(1)`` calls.

    :param a_mcs_config_root: Columnstore.xml root element, or None
    :rtype: None
    """
    try:
        subprocess.check_call(SAVEBRM, shell=True)
    except subprocess.CalledProcessError as exc:
        logging.error('The primary call to {} exits with {}.'.format(exc.cmd, exc.returncode))
        backup_path = get_save_brm_path_prefix(a_mcs_config_root)
        logging.debug('Back up BRM files locally to {}.'.format(backup_path))
        backup_cmd = SAVEBRM + ' ' + backup_path
        try:
            subprocess.check_call(backup_cmd, shell=True)
        except subprocess.CalledProcessError as backup_exc:
            # Report the backup command's own failure, not the primary one.
            logging.error('The backup call to {} exits with {}.'.format(backup_exc.cmd, backup_exc.returncode))
        except OSError:
            logging.error('Os error while calling savebrm during the backup', exc_info=True)

        sys.exit(1)
    except OSError:
        logging.error('Os error while calling savebrm', exc_info=True)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # The parsed config root may be None when Columnstore.xml fails to load.
    mcs_config_root = get_config_root_from_file(MCS_CONFIG_PATH)
    if is_node_primary(mcs_config_root):
        # Save a local copy first and make sure it is usable.
        em_local_path_prefix = call_save_brm_locally(mcs_config_root)
        local_save_failed = not em_local_path_prefix
        if local_save_failed or em_is_empty(em_local_path_prefix):
            # remove_files_by_prefix_if_exist(em_local_path_prefix)
            logging.error('Exiting with error.')
            sys.exit(1)

        # Drop old local backups.
        backup_dir_path = get_save_brm_dir_path(mcs_config_root)
        clean_up_backup_brm_files(backup_dir_path)

        # Then run save_brm with the default (empty) prefix.
        call_save_brm(DEFAULT_EM_LOCAL_PATH_PREFIX)

    sys.exit(0)
|
||||
|
Reference in New Issue
Block a user