1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-01 06:46:55 +03:00

feat(save_brm)!: MCOL-5709: protect from S3/NFS IO errors (#3206)

* feat(save_brm)!: protect from S3/NFS IO errors

* feat(save_brm)!: future refactoring

* cleanup

* feat(save_brm)!: forgotten template

* feat(save-brm,ci)!: python3 package for rocky8

---------

Co-authored-by: Roman Nozdrin <roman.nozdrin@mariadb.com>
This commit is contained in:
Alan Mologorsky
2024-06-18 20:42:01 +03:00
committed by GitHub
parent 0e17e14afb
commit 4da9bcdccf
5 changed files with 312 additions and 38 deletions

View File

@ -2,6 +2,7 @@
import configparser
import fcntl
import json
import glob
import logging
import os
import socket
@ -9,6 +10,7 @@ import ssl
import struct
import subprocess
import sys
import time
import xml.etree.ElementTree as ET
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
@ -21,7 +23,11 @@ MCS_CONFIG_PATH = os.path.join(MCS_ETC_PATH, 'Columnstore.xml')
SM_CONFIG_PATH = os.path.join(MCS_ETC_PATH, 'storagemanager.cnf')
MCS_BIN_DIR = '@ENGINE_BINDIR@'
SAVEBRM = os.path.join(MCS_BIN_DIR, 'save_brm')
EM_FILE_SUFFIX = '_em'
EM_FILE_SIZE_THRESHOLD = 1000
HALF_A_MINUTE = 30
NUMBER_OF_FILES_TO_KEEP = 40
DEFAULT_EM_LOCAL_PATH_PREFIX = ''
LOCALHOST = '127.0.0.1'
# according to https://www.ibm.com/docs/en/storage-sentinel/1.1.2?topic=installation-map-your-local-host-loopback-address
LOCALHOSTS = (
@ -33,6 +39,8 @@ LOCALHOSTS = (
)
API_VERSION = '0.4.0'
API_PORT = '8640'
BRM_BACKUP_PATH = '/tmp/columnstore_tmp_files/rdwrscratch/'
BRM_BACKUP_PATH_PART = '{}_BRM_saves'
def get_api_key():
@ -161,16 +169,13 @@ def is_node_primary(conf_root):
success = True
except HTTPError as exc:
logging.warning(
'Something goes wrong while requesting primary status ',
'through api.',
'Got response code "{}" with reason "{}".'.format(
'Something goes wrong while requesting primary status through api. Got response code "{}" with reason "{}".'.format(
exc.code, exc.reason
)
)
except URLError:
logging.warning(
'CMAPI became unavailable while trying',
'to request primary status.'
'CMAPI became unavailable while trying to request primary status.'
)
except Exception:
logging.error(
@ -192,42 +197,191 @@ def is_node_primary(conf_root):
return is_primary_fallback(conf_root.find('./DBRM_Controller/IPAddr').text)
if __name__ == '__main__':
master_addr = ''
pm_count = 0
logging.basicConfig(
format='%(levelname)s: %(message)s', level=logging.DEBUG
)
logging.debug('Loading Columnstore.xml.')
def get_file_size(file_path):
    """Return the size of a file in bytes.

    :param file_path: path of the file to inspect
    :return: file size in bytes, or None if the size cannot be read
    :rtype: int or None
    """
    try:
        return os.path.getsize(file_path)
    except OSError as e:
        # A missing or unreadable file is reported, not raised: the caller
        # decides what an unknown size means.
        logging.error('OSError in get_file_size(): {}.'.format(e))
        return None
def em_is_empty(file_path_prefix):
    """Tell whether the EM file of a BRM save is missing or too small.

    The EM file path is ``file_path_prefix + EM_FILE_SUFFIX``.

    :param file_path_prefix: path prefix of the saved BRM files, or None
    :return: True if the prefix is None, the EM file size cannot be read,
        or the size is less than EM_FILE_SIZE_THRESHOLD bytes
    :rtype: bool
    """
    filesize = None
    if file_path_prefix is not None:
        # Only build the path when there is a prefix: None + str would raise.
        filesize = get_file_size(file_path_prefix + EM_FILE_SUFFIX)
    # get_file_size() returns None when the file cannot be stat'ed; an EM file
    # of unknown size is treated the same way as an empty one.
    is_em_empty = filesize is None or filesize < EM_FILE_SIZE_THRESHOLD
    if is_em_empty:
        logging.error('EM file is none or its size {} is less than {} bytes.'.format(filesize, EM_FILE_SIZE_THRESHOLD))
    return is_em_empty
def clean_up_backup_brm_files(save_brm_dir_path):
    """Remove all but the first NUMBER_OF_FILES_TO_KEEP entries of a directory.

    Entries are ordered by name, descending, before the cut is made.
    NOTE(review): presumably the names start with 'backup_<epoch>', so this
    order keeps the most recent backups -- confirm against save_brm output.
    Listing and removal errors are logged and never raised, so a failed
    clean-up does not stop the caller.

    :param save_brm_dir_path: directory that holds the BRM backup files
    :rtype: None
    """
    try:
        filenames = os.listdir(save_brm_dir_path)
    except OSError as e:
        logging.error('OSError exception happens listing {}: {}.'.format(save_brm_dir_path, e))
        return
    filenames.sort(reverse=True)
    for filename in filenames[NUMBER_OF_FILES_TO_KEEP:]:
        file_path = os.path.join(save_brm_dir_path, filename)
        logging.debug('Clean up {}.'.format(file_path))
        try:
            os.remove(file_path)
        except OSError as e:
            # Keep going: one entry that cannot be removed must not block
            # the removal of the others.
            logging.error('OSError exception happens removing {}: {}.'.format(file_path, e))
def remove_files_by_prefix_if_exist(file_path_prefix):
    """Remove every file whose path starts with the given prefix.

    Removal errors are logged per file and do not stop the remaining files
    from being removed.

    :param file_path_prefix: path prefix of the files to remove; when None
        an error is logged and nothing is removed
    :rtype: None
    """
    if file_path_prefix is None:
        logging.error('file_path_prefix is None. Cannot remove files.')
        return
    # Escape the prefix so glob metacharacters in it are matched literally;
    # only the trailing '*' acts as a wildcard.
    for file_path in glob.glob(glob.escape(file_path_prefix) + '*'):
        try:
            os.remove(file_path)
        except OSError as e:
            logging.error(
                'Error removing file: {} - {}'.format(file_path, e.strerror),
                exc_info=True
            )
def get_config_root_from_file(file_path):
    """Return the XML root element parsed from a config file.

    Any failure to read or parse the file is logged and reported as None.

    :param file_path: xml config path
    :return: XML root element or None
    :rtype: Element or None
    """
    try:
        return ET.parse(file_path).getroot()
    except (OSError, ET.ParseError, AttributeError, ValueError):
        # OSError covers a missing or unreadable file; ET.ParseError covers
        # malformed XML, which is not a ValueError.
        logging.error(
            'Exception while loading Columnstore.xml. Continue anyway.',
            exc_info=True
        )
        return None
logging.debug('Reading SM config.')
sm_config = configparser.ConfigParser()
files_read = len(sm_config.read(SM_CONFIG_PATH))
storage = sm_config.get(
'ObjectStorage', 'service', fallback='LocalStorage'
)
def get_epoch_prefix():
    """Return a backup name prefix that carries the current epoch time.

    :return: 'backup_' followed by the current time in whole seconds
    :rtype: str
    """
    return 'backup_{}'.format(int(time.time()))
def get_save_brm_dir_path(a_mcs_config_root):
    """Return a directory path that SM treats as local.

    The path is the text of SystemConfig/SystemTempFileDir concatenated with
    the text of SystemConfig/hdfsRdwrScratch. BRM_BACKUP_PATH is returned
    when the config root is None or either value is missing or empty.

    :param a_mcs_config_root: Columnstore.xml root element or None
    :return: directory path for local BRM saves
    :rtype: str
    """
    save_brm_dir_path = BRM_BACKUP_PATH
    if a_mcs_config_root is not None:
        try:
            system_temp_file_dir = a_mcs_config_root.find('./SystemConfig/SystemTempFileDir').text
            hdfs_rdwr_scratch = a_mcs_config_root.find('./SystemConfig/hdfsRdwrScratch').text
            # There is a danger to have no '/' in the end of system_temp_file_dir
            # or have two of them there. In both cases save_brm will fail to store
            # files locally.
            save_brm_dir_path = system_temp_file_dir + hdfs_rdwr_scratch
        except (AttributeError, TypeError):
            # AttributeError: find() returned None for a missing element.
            # TypeError: an element with no text yields None, which cannot
            # be concatenated.
            logging.error('Exception while getting SystemTempFileDir and hdfsRdwrScratch from Columnstore.xml', exc_info=True)
    return save_brm_dir_path
def get_save_brm_path_prefix(a_mcs_config_root):
    """Build the path prefix for a new, epoch-stamped local BRM save.

    :param a_mcs_config_root: Columnstore.xml root element or None
    :return: the local save directory, a '/' and the stamped backup name
    :rtype: String
    """
    backup_name = BRM_BACKUP_PATH_PART.format(get_epoch_prefix())
    backup_dir = get_save_brm_dir_path(a_mcs_config_root)
    return '/'.join((backup_dir, backup_name))
def call_save_brm(path):
    """Run save_brm, passing it the given path as its only argument.

    :param path: path prefix handed to save_brm; with an empty string
        save_brm is run without arguments
    :return: path if save_brm exited with zero, None if it exited with a
        non-zero code or could not be started
    :rtype: String or None
    """
    # Command line as text, used for the log message only.
    savebrm_cmd = SAVEBRM + ' ' + path
    # Run the binary directly with an argument list: no shell is needed and
    # the path is never re-parsed as shell syntax.
    savebrm_args = [SAVEBRM, path] if path else [SAVEBRM]
    try:
        subprocess.check_call(savebrm_args)
    except subprocess.CalledProcessError as exc:
        logging.error('The call to {} exits with {}.'.format(savebrm_cmd, exc.returncode))
        return None
    except OSError:
        logging.error('Os error while calling savebrm', exc_info=True)
        return None
    return path
def call_save_brm_locally(a_mcs_config_root):
    """Run save_brm against a freshly generated local path prefix.

    :param a_mcs_config_root: Columnstore.xml root element or None
    :return: whatever call_save_brm() returns for the generated prefix
    """
    return call_save_brm(get_save_brm_path_prefix(a_mcs_config_root))
def call_save_brm_with_local_fallback(a_mcs_config_root):
    """Call save_brm and, if it fails, retry it with a local backup path.

    The process exits with code 1 whenever the primary call fails, whether
    or not the local backup succeeds.

    :param a_mcs_config_root: Columnstore.xml root element or None
    :rtype: None
    """
    try:
        subprocess.check_call(SAVEBRM, shell=True)
    except subprocess.CalledProcessError as exc:
        logging.error('The primary call to {} exits with {}.'.format(exc.cmd, exc.returncode))
        backup_path = get_save_brm_path_prefix(a_mcs_config_root)
        logging.debug('Back up BRM files locally to {}.'.format(backup_path))
        backup_cmd = SAVEBRM + ' ' + backup_path
        try:
            subprocess.check_call(backup_cmd, shell=True)
        except subprocess.CalledProcessError as backup_exc:
            # Report the backup call's own command and exit code, not the
            # primary call's.
            logging.error('The backup call to {} exits with {}.'.format(backup_exc.cmd, backup_exc.returncode))
        except OSError:
            logging.error('Os error while calling savebrm during the backup', exc_info=True)
        sys.exit(1)
    except OSError:
        logging.error('Os error while calling savebrm', exc_info=True)
        sys.exit(1)
if __name__ == '__main__':
    # The root is None when Columnstore.xml could not be loaded.
    mcs_config_root = get_config_root_from_file(MCS_CONFIG_PATH)
    if is_node_primary(mcs_config_root):
        # Save BRM to a local path first and check the result before
        # saving to the default location.
        em_local_path_prefix = call_save_brm_locally(mcs_config_root)
        local_save_is_usable = (
            bool(em_local_path_prefix)
            and not em_is_empty(em_local_path_prefix)
        )
        if not local_save_is_usable:
            # remove_files_by_prefix_if_exist(em_local_path_prefix)
            logging.error('Exiting with error.')
            sys.exit(1)

        backup_dir_path = get_save_brm_dir_path(mcs_config_root)
        clean_up_backup_brm_files(backup_dir_path)
        call_save_brm(DEFAULT_EM_LOCAL_PATH_PREFIX)
    sys.exit(0)