1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

feat(save_brm)!: MCOL-5709: protect from S3/NFS IO errors (#3206)

* feat(save_brm)!: protect from S3/NFS IO errors

* feat(save_brm)!: future refactoring

* cleanup

* feat(save_brm)!: forgotten template

* feat(save-brm,ci)!: python3 package for rocky8

---------

Co-authored-by: Roman Nozdrin <roman.nozdrin@mariadb.com>
This commit is contained in:
Alan Mologorsky
2024-06-18 20:42:01 +03:00
committed by GitHub
parent 0e17e14afb
commit 4da9bcdccf
5 changed files with 312 additions and 38 deletions

View File

@ -40,7 +40,7 @@ local clang_update_alternatives = 'update-alternatives --install /usr/bin/clang
local rpm_build_deps = 'install -y lz4 systemd-devel git make libaio-devel openssl-devel boost-devel bison ' + local rpm_build_deps = 'install -y lz4 systemd-devel git make libaio-devel openssl-devel boost-devel bison ' +
'snappy-devel flex libcurl-devel libxml2-devel ncurses-devel automake libtool ' + 'snappy-devel flex libcurl-devel libxml2-devel ncurses-devel automake libtool ' +
'policycoreutils-devel rpm-build lsof iproute pam-devel perl-DBI cracklib-devel ' + 'policycoreutils-devel rpm-build lsof iproute pam-devel perl-DBI cracklib-devel ' +
'expect createrepo '; 'expect createrepo python3 ';
local centos7_build_deps = 'yum install -y epel-release centos-release-scl ' + local centos7_build_deps = 'yum install -y epel-release centos-release-scl ' +
'&& yum install -y pcre2-devel devtoolset-' + gcc_version + ' devtoolset-' + gcc_version + '-gcc cmake3 lz4-devel ' + '&& yum install -y pcre2-devel devtoolset-' + gcc_version + ' devtoolset-' + gcc_version + '-gcc cmake3 lz4-devel ' +

View File

@ -89,6 +89,7 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-storagemanager.service.in" "${CM
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-stop-controllernode.sh.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-stop-controllernode.sh" @ONLY) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-stop-controllernode.sh.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-stop-controllernode.sh" @ONLY)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py" @ONLY) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.py" @ONLY)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py" @ONLY) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py" @ONLY)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-savebrm.py.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcssavebrm.py" @ONLY)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog.in" "${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog" @ONLY) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog.in" "${CMAKE_CURRENT_SOURCE_DIR}/columnstoreSyslog" @ONLY)
install(PROGRAMS columnstore-post-install install(PROGRAMS columnstore-post-install
@ -124,3 +125,6 @@ install(FILES mariadb-columnstore.service
DESTINATION ${ENGINE_SUPPORTDIR} COMPONENT columnstore-engine) DESTINATION ${ENGINE_SUPPORTDIR} COMPONENT columnstore-engine)
install(FILES module DESTINATION ${ENGINE_DATADIR}/local COMPONENT columnstore-engine) install(FILES module DESTINATION ${ENGINE_DATADIR}/local COMPONENT columnstore-engine)
find_package (Python3 COMPONENTS Interpreter REQUIRED)
add_test(NAME PythonUnitTests COMMAND ${Python3_EXECUTABLE} -m unittest test_mcs-savebrm.py)

View File

@ -309,9 +309,9 @@ if __name__ == '__main__':
if s3_enabled: if s3_enabled:
# start SM using systemd # start SM using systemd
if use_systemd: if use_systemd:
cmd = 'systemctl start mcs-storagemanager' CMD = 'systemctl start mcs-storagemanager'
try: try:
subprocess.check_call(cmd, shell=True) subprocess.check_call(CMD, shell=True)
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
logging.error( logging.error(
'Failed to start storagemanager. ' 'Failed to start storagemanager. '
@ -393,8 +393,7 @@ if __name__ == '__main__':
shutil.chown(current_name, USER, GROUP) shutil.chown(current_name, USER, GROUP)
else: else:
logging.info( logging.info(
'Cmapi is not running on primary node. ' 'Cmapi is not running on primary node. Skip loading metafiles.'
'Skip loading metafiles.'
) )
except Exception as exc: except Exception as exc:
logging.error( logging.error(

View File

@ -2,6 +2,7 @@
import configparser import configparser
import fcntl import fcntl
import json import json
import glob
import logging import logging
import os import os
import socket import socket
@ -9,6 +10,7 @@ import ssl
import struct import struct
import subprocess import subprocess
import sys import sys
import time
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError from urllib.error import HTTPError, URLError
@ -21,7 +23,11 @@ MCS_CONFIG_PATH = os.path.join(MCS_ETC_PATH, 'Columnstore.xml')
SM_CONFIG_PATH = os.path.join(MCS_ETC_PATH, 'storagemanager.cnf') SM_CONFIG_PATH = os.path.join(MCS_ETC_PATH, 'storagemanager.cnf')
MCS_BIN_DIR = '@ENGINE_BINDIR@' MCS_BIN_DIR = '@ENGINE_BINDIR@'
SAVEBRM = os.path.join(MCS_BIN_DIR, 'save_brm') SAVEBRM = os.path.join(MCS_BIN_DIR, 'save_brm')
EM_FILE_SUFFIX = '_em'
EM_FILE_SIZE_THRESHOLD = 1000
HALF_A_MINUTE = 30 HALF_A_MINUTE = 30
NUMBER_OF_FILES_TO_KEEP = 40
DEFAULT_EM_LOCAL_PATH_PREFIX = ''
LOCALHOST = '127.0.0.1' LOCALHOST = '127.0.0.1'
# according to https://www.ibm.com/docs/en/storage-sentinel/1.1.2?topic=installation-map-your-local-host-loopback-address # according to https://www.ibm.com/docs/en/storage-sentinel/1.1.2?topic=installation-map-your-local-host-loopback-address
LOCALHOSTS = ( LOCALHOSTS = (
@ -33,6 +39,8 @@ LOCALHOSTS = (
) )
API_VERSION = '0.4.0' API_VERSION = '0.4.0'
API_PORT = '8640' API_PORT = '8640'
BRM_BACKUP_PATH = '/tmp/columnstore_tmp_files/rdwrscratch/'
BRM_BACKUP_PATH_PART = '{}_BRM_saves'
def get_api_key(): def get_api_key():
@ -161,16 +169,13 @@ def is_node_primary(conf_root):
success = True success = True
except HTTPError as exc: except HTTPError as exc:
logging.warning( logging.warning(
'Something goes wrong while requesting primary status ', 'Something goes wrong while requesting primary status through api. Got response code "{}" with reason "{}".'.format(
'through api.',
'Got response code "{}" with reason "{}".'.format(
exc.code, exc.reason exc.code, exc.reason
) )
) )
except URLError: except URLError:
logging.warning( logging.warning(
'CMAPI became unavailable while trying', 'CMAPI became unavailable while trying to request primary status.'
'to request primary status.'
) )
except Exception: except Exception:
logging.error( logging.error(
@ -192,42 +197,191 @@ def is_node_primary(conf_root):
return is_primary_fallback(conf_root.find('./DBRM_Controller/IPAddr').text) return is_primary_fallback(conf_root.find('./DBRM_Controller/IPAddr').text)
if __name__ == '__main__': def get_file_size(file_path):
master_addr = '' """ Returns the size of the file in bytes. """
pm_count = 0
logging.basicConfig(
format='%(levelname)s: %(message)s', level=logging.DEBUG
)
logging.debug('Loading Columnstore.xml.')
try: try:
cs_config = ET.parse(MCS_CONFIG_PATH) size = os.path.getsize(file_path)
config_root = cs_config.getroot() return size
master_addr = config_root.find('./DBRM_Controller/IPAddr').text except OSError as e:
pm_count = int( logging.error('OSError in get_file_size(): {}.'.format(e))
config_root.find('./SystemModuleConfig/ModuleCount3').text return None
def em_is_empty(file_path_prefix):
"""Returns True if EM file size is less than EM_FILE_SIZE_THRESHOLD
or its argument is None.
:rtype: Bool
"""
# Add error message if EM is empty
is_none = file_path_prefix is None
filesize = get_file_size(file_path_prefix + EM_FILE_SUFFIX)
is_em_empty = is_none or filesize < EM_FILE_SIZE_THRESHOLD
if is_em_empty:
logging.error('EM file is none or its size {} is less than {} bytes.'.format(filesize, EM_FILE_SIZE_THRESHOLD))
return is_em_empty
def clean_up_backup_brm_files(save_brm_dir_path):
""" Removes all but 5 last usable sets of BRM files in the specified directory.
Usable in the context means having non-empty EM.
"""
filenames = os.listdir(save_brm_dir_path)
filenames.sort(reverse=True)
files_to_remove = filenames[NUMBER_OF_FILES_TO_KEEP:]
for filename in files_to_remove:
file_path = os.path.join(save_brm_dir_path, filename)
logging.debug('Clean up {}.'.format(file_path))
try:
os.remove(file_path)
except OSError as e:
logging.error('OSError exception happens removing {}: {}.'.format(file_path, e))
def remove_files_by_prefix_if_exist(file_path_prefix):
""" Removes files with the given prefix if they exist. """
if file_path_prefix is None:
logging.error(
'file_path_prefix is None. Cannot remove files.',
exc_info=True
) )
logging.debug('Succesfully loaded Columnstore.xml.') return
except (FileNotFoundError, AttributeError, ValueError) as e: try:
# is it correct case?
files_paths = glob.glob(file_path_prefix + '*')
for file_path in files_paths:
os.remove(file_path)
except OSError as e:
logging.error(
'Error removing file: {} - {}'.format(file_path, e.strerror),
exc_info=True
)
def get_config_root_from_file(file_path):
"""Returns XML root element from file.
:param file_path: xml config path
:return: XML root element or None
:rtype: Element or None
"""
try:
cs_config = ET.parse(file_path)
return cs_config.getroot()
except (FileNotFoundError, AttributeError, ValueError):
logging.error( logging.error(
'Exception while loading Columnstore.xml. Continue anyway.', 'Exception while loading Columnstore.xml. Continue anyway.',
exc_info=True exc_info=True
) )
return None
logging.debug('Reading SM config.') def get_epoch_prefix():
sm_config = configparser.ConfigParser() """Returns a prefix with epoch time
files_read = len(sm_config.read(SM_CONFIG_PATH))
storage = sm_config.get(
'ObjectStorage', 'service', fallback='LocalStorage'
)
if is_node_primary(config_root): :rtype: String
"""
epoch_time = int(time.time())
return 'backup_{}'.format(epoch_time)
def get_save_brm_dir_path(a_mcs_config_root):
"""Returns a path that SM treats as local
:param file_path: xml config XML root
:rtype
"""
save_brm_dir_path = BRM_BACKUP_PATH
if a_mcs_config_root is not None:
try: try:
retcode = subprocess.check_call(SAVEBRM, shell=True) system_temp_file_dir = a_mcs_config_root.find('./SystemConfig/SystemTempFileDir').text
except subprocess.CalledProcessError as exc: hdfs_rdwr_scratch = a_mcs_config_root.find('./SystemConfig/hdfsRdwrScratch').text
logging.error('{} exits with {}.'.format(exc.cmd, exc.returncode)) # There is a danger to have no '/' in the end of system_temp_file_dir
sys.exit(1) # or have two of them there. In both cases save_brm will fail to store
# files locally.
save_brm_dir_path = system_temp_file_dir + hdfs_rdwr_scratch
except AttributeError:
logging.error('Exception while getting SystemTempFileDir and hdfsRdwrScratch from Columnstore.xml', exc_info=True)
return save_brm_dir_path
def get_save_brm_path_prefix(a_mcs_config_root):
"""Returns a path that SM treats as local
:param file_path: xml config XML root
:rtype: String
"""
epoch_prefix = get_epoch_prefix()
return get_save_brm_dir_path(a_mcs_config_root) + '/' + BRM_BACKUP_PATH_PART.format(epoch_prefix)
def call_save_brm(path):
"""Calls save_brm first and then tries to call it with local path.
:param file_path: xml config XML root
:rtype: None
"""
savebrm_cmd = SAVEBRM + ' ' + path
try:
subprocess.check_call(savebrm_cmd, shell=True)
except subprocess.CalledProcessError as exc:
logging.error('The call to {} exits with {}.'.format(savebrm_cmd, exc.returncode))
return None
except OSError:
logging.error('Os error while calling savebrm', exc_info=True)
return None
return path
def call_save_brm_locally(a_mcs_config_root):
"""Calls save_brm first and then tries to call it with local path.
:param file_path: xml config XML root
:rtype: None
"""
local_path = get_save_brm_path_prefix(a_mcs_config_root)
return call_save_brm(local_path)
def call_save_brm_with_local_fallback(a_mcs_config_root):
"""Calls save_brm first and then tries to call it with local path.
:param file_path: xml config XML root
:rtype: None
"""
try:
subprocess.check_call(SAVEBRM, shell=True)
except subprocess.CalledProcessError as exc:
logging.error('The primary call to {} exits with {}.'.format(exc.cmd, exc.returncode))
backup_path = get_save_brm_path_prefix(a_mcs_config_root)
logging.debug('Back up BRM files locally to {}.'.format(backup_path))
backup_cmd = SAVEBRM + ' ' + backup_path
try:
subprocess.check_call(backup_cmd, shell=True)
except subprocess.CalledProcessError:
logging.error('The backup call to {} exits with {}.'.format(exc.cmd, exc.returncode))
except OSError: except OSError:
logging.error('Os error while calling savebrm', exc_info=True) logging.error('Os error while calling savebrm during the backup', exc_info=True)
sys.exit(0)
sys.exit(1)
except OSError:
logging.error('Os error while calling savebrm', exc_info=True)
sys.exit(1)
if __name__ == '__main__':
mcs_config_root = get_config_root_from_file(MCS_CONFIG_PATH)
# config_root can be None
if is_node_primary(mcs_config_root):
em_local_path_prefix = call_save_brm_locally(mcs_config_root)
if not em_local_path_prefix or em_is_empty(em_local_path_prefix):
# remove_files_by_prefix_if_exist(em_local_path_prefix)
logging.error('Exiting with error.')
sys.exit(1)
clean_up_backup_brm_files(get_save_brm_dir_path(mcs_config_root))
call_save_brm(DEFAULT_EM_LOCAL_PATH_PREFIX)
sys.exit(0) sys.exit(0)

View File

@ -0,0 +1,117 @@
import unittest
from unittest.mock import patch, mock_open, MagicMock
import socket
import ssl
import json
import os
import subprocess
from urllib.error import HTTPError, URLError
from urllib.request import Request
from xml.etree.ElementTree import Element
import importlib.util
import mcssavebrm
class TestMcsSavebrmFunctions(unittest.TestCase):
@patch('mcssavebrm.configparser.ConfigParser.get', return_value='test_api_key')
@patch('mcssavebrm.configparser.ConfigParser.read')
def test_get_api_key(self, mock_read, mock_get):
self.assertEqual(mcssavebrm.get_api_key(), 'test_api_key')
mock_read.assert_called_once_with(mcssavebrm.CMAPI_CONFIG_PATH)
def test_get_unverified_context(self):
ctx = mcssavebrm.get_unverified_context()
self.assertFalse(ctx.check_hostname)
self.assertEqual(ctx.verify_mode, ssl.CERT_NONE)
@patch('mcssavebrm.urlopen')
@patch('mcssavebrm.get_unverified_context')
def test_cmapi_available(self, mock_get_unverified_context, mock_urlopen):
mock_get_unverified_context.return_value = ssl._create_unverified_context()
mock_urlopen.side_effect = HTTPError(None, 404, 'Not Found', None, None)
self.assertTrue(mcssavebrm.cmapi_available())
@patch('mcssavebrm.fcntl.ioctl')
@patch('mcssavebrm.socket.socket')
def test_get_ip_address_by_nic(self, mock_socket, mock_ioctl):
mock_socket_inst = MagicMock()
mock_socket.return_value = mock_socket_inst
mock_ioctl.return_value = b'\x00' * 20 + b'\x7f\x00\x00\x01'
self.assertEqual(mcssavebrm.get_ip_address_by_nic('lo'), '127.0.0.1')
@patch('mcssavebrm.get_ip_address_by_nic', return_value='127.0.0.1')
@patch('mcssavebrm.socket.gethostbyaddr', return_value=('localhost', [], []))
@patch('mcssavebrm.socket.if_nameindex', return_value=[(1, 'lo')])
def test_is_primary_fallback(self, mock_if_nameindex, mock_gethostbyaddr, mock_get_ip_address_by_nic):
self.assertTrue(mcssavebrm.is_primary_fallback('localhost'))
@patch('mcssavebrm.cmapi_available', return_value=True)
@patch('mcssavebrm.urlopen')
@patch('mcssavebrm.get_unverified_context')
@patch('mcssavebrm.get_api_key', return_value='test_api_key')
def test_is_node_primary(self, mock_get_api_key, mock_get_unverified_context, mock_urlopen, mock_cmapi_available):
mock_get_unverified_context.return_value = ssl._create_unverified_context()
mock_response = MagicMock()
mock_response.read.return_value = json.dumps({'is_primary': 'True'}).encode('utf-8')
mock_urlopen.return_value.__enter__.return_value = mock_response
root = MagicMock()
self.assertTrue(mcssavebrm.is_node_primary(root))
@patch('os.path.getsize', return_value=1024)
def test_get_file_size(self, mock_getsize):
self.assertEqual(mcssavebrm.get_file_size('test_file'), 1024)
@patch('mcssavebrm.get_file_size', return_value=500)
def test_em_is_empty(self, mock_get_file_size):
self.assertTrue(mcssavebrm.em_is_empty('test_prefix'))
@patch('os.remove')
@patch('os.listdir', return_value=[f'test_file{i}' for i in range(50)])
def test_clean_up_backup_brm_files(self, mock_listdir, mock_remove):
mcssavebrm.clean_up_backup_brm_files('/dummy/path')
self.assertEqual(mock_remove.call_count, 10)
@patch('os.remove')
@patch('mcssavebrm.glob.glob', return_value=['test_file1', 'test_file2'])
def test_remove_files_by_prefix_if_exist(self, mock_glob, mock_remove):
mcssavebrm.remove_files_by_prefix_if_exist('test_prefix')
mock_remove.assert_any_call('test_file1')
mock_remove.assert_any_call('test_file2')
@patch('xml.etree.ElementTree.parse')
def test_get_config_root_from_file(self, mock_parse):
mock_tree = MagicMock()
mock_parse.return_value = mock_tree
self.assertEqual(mcssavebrm.get_config_root_from_file('test_file'), mock_tree.getroot())
@patch('time.time', return_value=1624478400)
def test_get_epoch_prefix(self, mock_time):
self.assertEqual(mcssavebrm.get_epoch_prefix(), 'backup_1624478400')
@patch('mcssavebrm.get_epoch_prefix', return_value='backup_1624478400')
@patch('mcssavebrm.get_save_brm_dir_path', return_value='/tmp/columnstore_tmp_files/rdwrscratch/')
def test_get_save_brm_path_prefix(self, mock_get_epoch_prefix, mock_get_save_brm_dir_path):
root = MagicMock()
self.assertIn('backup_1624478400_BRM_saves', mcssavebrm.get_save_brm_path_prefix(root))
@patch('subprocess.check_call')
def test_call_save_brm(self, mock_check_call):
self.assertEqual(mcssavebrm.call_save_brm('test_path'), 'test_path')
@patch('mcssavebrm.call_save_brm', return_value='test_path')
@patch('mcssavebrm.get_save_brm_path_prefix', return_value='test_path')
def test_call_save_brm_locally(self, mock_get_save_brm_path_prefix, mock_call_save_brm):
root = MagicMock()
self.assertEqual(mcssavebrm.call_save_brm_locally(root), 'test_path')
@patch('subprocess.check_call')
@patch('mcssavebrm.get_save_brm_path_prefix', return_value='test_path')
def test_call_save_brm_with_local_fallback(self, mock_get_save_brm_path_prefix, mock_check_call):
root = MagicMock()
mock_check_call.side_effect = [subprocess.CalledProcessError(1, 'test'), None]
with self.assertRaises(SystemExit):
mcssavebrm.call_save_brm_with_local_fallback(root)
if __name__ == '__main__':
unittest.main()