diff --git a/.drone.jsonnet b/.drone.jsonnet index dd7e220f4..182765ce4 100644 --- a/.drone.jsonnet +++ b/.drone.jsonnet @@ -31,7 +31,7 @@ local cmakeflags = '-DCMAKE_BUILD_TYPE=RelWithDebInfo -DBUILD_CONFIG=mysql_relea '-DPLUGIN_GSSAPI=NO -DPLUGIN_SPIDER=NO -DPLUGIN_OQGRAPH=NO -DPLUGIN_SPHINX=NO ' + '-DWITH_EMBEDDED_SERVER=NO -DWITH_WSREP=NO -DWITH_COREDUMPS=ON'; -local clang_version = '14'; +local clang_version = '16'; local gcc_version = '11'; local clang_update_alternatives = 'update-alternatives --install /usr/bin/clang clang /usr/bin/clang-' + clang_version + ' 100 --slave /usr/bin/clang++ clang++ /usr/bin/clang++-' + clang_version + ' && update-alternatives --install /usr/bin/cc cc /usr/bin/clang 100 && update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++ 100 '; @@ -57,17 +57,12 @@ local rockylinux9_build_deps = "dnf install -y 'dnf-command(config-manager)' " + '&& dnf install -y pcre2-devel lz4-devel gcc gcc-c++'; local debian11_deps = 'apt update && apt install -y gnupg wget && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-' + clang_version + ' main" >> /etc/apt/sources.list && wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && apt update && apt install -y clang-' + clang_version + ' && ' + clang_update_alternatives; -local ubuntu20_04_deps = 'apt update && apt install -y gnupg wget && echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-' + clang_version + ' main" >> /etc/apt/sources.list && wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && apt update && apt install -y clang-' + clang_version + ' &&' + clang_update_alternatives; +local ubuntu20_04_deps = 'apt update && apt install -y gnupg wget && echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-' + clang_version + ' main" >> /etc/apt/sources.list && wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && apt update && apt install -y clang-' + clang_version + ' && ' + clang_update_alternatives; local deb_build_deps = 'apt update --yes && apt install --yes --no-install-recommends build-essential devscripts git ccache equivs eatmydata libssl-dev && mk-build-deps debian/control -t "apt-get -y -o Debug::pkgProblemResolver=yes --no-install-recommends" -r -i '; local turnon_clang = 'export CC=/usr/bin/clang; export CXX=/usr/bin/clang++ '; local bootstrap_deps = 'apt-get -y update && apt-get -y install build-essential automake libboost-all-dev bison cmake libncurses5-dev libaio-dev libsystemd-dev libpcre2-dev libperl-dev libssl-dev libxml2-dev libkrb5-dev flex libpam-dev git libsnappy-dev libcurl4-openssl-dev libgtest-dev libcppunit-dev googletest libsnappy-dev libjemalloc-dev liblz-dev liblzo2-dev liblzma-dev liblz4-dev libbz2-dev libbenchmark-dev libdistro-info-perl '; -local core_dump_format = 'https://raw.githubusercontent.com/mariadb-corporation/mariadb-columnstore-engine/develop/core_dumps/core_dump_format.sh'; -local core_dump_check = 'https://raw.githubusercontent.com/mariadb-corporation/mariadb-columnstore-engine/develop/core_dumps/core_dump_check.sh'; -local core_dump_drop = 'https://raw.githubusercontent.com/mariadb-corporation/mariadb-columnstore-engine/develop/core_dumps/core_dump_drop.sh'; -local ansi2html = 'https://raw.githubusercontent.com/mariadb-corporation/mariadb-columnstore-engine/develop/core_dumps/ansi2html.sh'; -local logs = 'https://raw.githubusercontent.com/mariadb-corporation/mariadb-columnstore-engine/with_service_logs/core_dumps/logs.sh'; local mtr_suite_list = 'basic,bugfixes'; local mtr_full_set = 
'basic,bugfixes,devregression,autopilot,extended,multinode,oracle,1pmonly'; @@ -81,7 +76,7 @@ local platformMap(platform, arch) = 'ubuntu:22.04': bootstrap_deps + ' && ' + deb_build_deps + " && sleep $${BUILD_DELAY_SECONDS:-1s} && CMAKEFLAGS='" + cmakeflags + " -DDEB=jammy' debian/autobake-deb.sh", }; local result = std.strReplace(std.strReplace(platform, ':', ''), '/', '-'); - platform_map[platform] + ' | tee ' + result + '/build.log'; + 'export CLICOLOR_FORCE=1; ' + platform_map[platform] + ' | storage/columnstore/columnstore/build/ansi2txt.sh ' + result + '/build.log'; local testRun(platform) = @@ -112,6 +107,8 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') local pkg_format = if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'rpm' else 'deb', local init = if (pkg_format == 'rpm') then '/usr/lib/systemd/systemd' else 'systemd', local mtr_path = if (pkg_format == 'rpm') then '/usr/share/mysql-test' else '/usr/share/mysql/mysql-test', + local cmapi_path = '/usr/share/columnstore/cmapi', + local etc_path = '/etc/columnstore', local socket_path = if (pkg_format == 'rpm') then '/var/lib/mysql/mysql.sock' else '/run/mysqld/mysqld.sock', local config_path_prefix = if (pkg_format == 'rpm') then '/etc/my.cnf.d/' else '/etc/mysql/mariadb.conf.d/50-', local img = if (platform == 'centos:7' || platform == 'rockylinux:8') then platform else 'romcheck/' + std.strReplace(platform, '/', '-'), @@ -122,6 +119,10 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') local brancht = if (branch == '**') then '' else branch + '-', local result = std.strReplace(std.strReplace(platform, ':', ''), '/', '-'), + local publish_pkg_url = 'https://cspkg.s3.amazonaws.com/index.html?prefix=' + branchp + event + '/${DRONE_BUILD_NUMBER}/' + server + '/' + arch + '/' + result + '/', + + local packages_url = 'https://cspkg.s3.amazonaws.com/' + branchp + event + '/${DRONE_BUILD_NUMBER}/' + server, + local container_tags = if (event == 'cron') then [brancht + std.strReplace(event, '_', '-') + '${DRONE_BUILD_NUMBER}', brancht] else [brancht + std.strReplace(event, '_', '-') + '${DRONE_BUILD_NUMBER}'], local container_version = branchp + event + '/${DRONE_BUILD_NUMBER}/' + server + '/' + arch, @@ -202,11 +203,12 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') }, }, commands: [ - 'docker exec --env PRESERVE_LOGS=true -t --workdir /mariadb-columnstore-regression-test/mysql/queries/nightly/alltest regression$${DRONE_BUILD_NUMBER} timeout -k 1m -s SIGKILL --preserve-status $${REGRESSION_TIMEOUT} ./go.sh --sm_unit_test_dir=/storage-manager --tests=' + name, + 'docker exec -t --workdir /mariadb-columnstore-regression-test/mysql/queries/nightly/alltest regression$${DRONE_BUILD_NUMBER} mkdir -p reg-logs', + "docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c 'sleep 4800 && eu-stack -p `pidof PrimProc` -n 0 | tee /mariadb-columnstore-regression-test/mysql/queries/nightly/alltest/reg-logs/prim_proc_callstacks.txt' & ", + 'docker exec --env PRESERVE_LOGS=true -t --workdir /mariadb-columnstore-regression-test/mysql/queries/nightly/alltest regression$${DRONE_BUILD_NUMBER} bash -c "timeout -k 1m -s SIGKILL --preserve-status $${REGRESSION_TIMEOUT} ./go.sh --sm_unit_test_dir=/storage-manager --tests=' + name + ' || ./regression_logs.sh ' + name + '"', 'docker exec -t --workdir /mariadb-columnstore-regression-test/mysql/queries/nightly/alltest regression$${DRONE_BUILD_NUMBER} cat go.log || echo 
"missing go.log"', ], }, - _volumes:: { mdb: { name: 'mdb', @@ -219,30 +221,20 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') }, smoke:: { name: 'smoke', - depends_on: ['pkg'], + depends_on: ['publish pkg'], image: 'docker', volumes: [pipeline._volumes.docker], commands: [ - 'docker run --volume /sys/fs/cgroup:/sys/fs/cgroup:ro --env DEBIAN_FRONTEND=noninteractive --env MCS_USE_S3_STORAGE=0 --name smoke$${DRONE_BUILD_NUMBER} --ulimit core=-1 --privileged --detach ' + img + ' ' + init + ' --unit=basic.target', - 'docker cp ' + result + ' smoke$${DRONE_BUILD_NUMBER}:/', - if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "yum install -y wget procps-ng"', + 'docker run --volume /sys/fs/cgroup:/sys/fs/cgroup:ro --env OS=' + result + ' --env PACKAGES_URL=' + packages_url + ' --env DEBIAN_FRONTEND=noninteractive --env MCS_USE_S3_STORAGE=0 --name smoke$${DRONE_BUILD_NUMBER} --ulimit core=-1 --privileged --detach ' + img + ' ' + init + ' --unit=basic.target', + if (pkg_format == 'rpm') then 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "yum install -y wget gdb gawk epel-release which rsyslog hostname procps-ng"' else 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y gdb gawk rsyslog hostname procps wget"', if (pkg_format == 'deb') then 'docker exec -t smoke$${DRONE_BUILD_NUMBER} sed -i "s/exit 101/exit 0/g" /usr/sbin/policy-rc.d', - if (pkg_format == 'deb') then 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y procps wget"', 'docker exec -t smoke$${DRONE_BUILD_NUMBER} mkdir core', 'docker exec -t smoke$${DRONE_BUILD_NUMBER} chmod 777 core', 'docker exec -t smoke$${DRONE_BUILD_NUMBER} sysctl -w kernel.core_pattern="/core/%E_smoke_core_dump.%p"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_format + '"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_check + '"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_drop + '"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "wget ' + ansi2html + '"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "wget ' + logs + '"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_format.sh"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_drop.sh"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_check.sh"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "chmod +x ansi2html.sh"', - 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "chmod +x logs.sh"', - if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "yum install -y gdb gawk epel-release which rsyslog hostname procps-ng && yum install -y /' + result + '/*.' + pkg_format + '"' else '', - if (pkg_format == 'deb') then 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y gdb gawk rsyslog hostname && apt install -y -f /' + result + '/*.' + pkg_format + '"', + 'docker cp core_dumps/. 
smoke$${DRONE_BUILD_NUMBER}:/', + 'docker cp setup-repo.sh smoke$${DRONE_BUILD_NUMBER}:/', + 'docker exec -t smoke$${DRONE_BUILD_NUMBER} /setup-repo.sh', + if (pkg_format == 'deb') then 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "apt install -y mariadb-plugin-columnstore"' else 'docker exec -t smoke$${DRONE_BUILD_NUMBER} bash -c "yum install -y MariaDB-columnstore-engine"', 'sleep $${SMOKE_DELAY_SECONDS:-1s}', // start mariadb and mariadb-columnstore services and run simple query 'docker exec -t smoke$${DRONE_BUILD_NUMBER} systemctl start mariadb', @@ -265,33 +257,26 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') MTR_FULL_SUITE: '${MTR_FULL_SUITE:-false}', }, commands: [ - 'docker run --volume /sys/fs/cgroup:/sys/fs/cgroup:ro --shm-size=500m --env MYSQL_TEST_DIR=' + mtr_path + ' --env DEBIAN_FRONTEND=noninteractive --env MCS_USE_S3_STORAGE=0 --name mtr$${DRONE_BUILD_NUMBER} --ulimit core=-1 --privileged --detach ' + img + ' ' + init + ' --unit=basic.target', - 'docker cp ' + result + ' mtr$${DRONE_BUILD_NUMBER}:/', + 'docker run --volume /sys/fs/cgroup:/sys/fs/cgroup:ro --shm-size=500m --env MYSQL_TEST_DIR=' + mtr_path + ' --env OS=' + result + ' --env PACKAGES_URL=' + packages_url + ' --env DEBIAN_FRONTEND=noninteractive --env MCS_USE_S3_STORAGE=0 --name mtr$${DRONE_BUILD_NUMBER} --ulimit core=-1 --privileged --detach ' + img + ' ' + init + ' --unit=basic.target', if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "yum install -y wget tar lz4 procps-ng"', if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} sed -i "s/exit 101/exit 0/g" /usr/sbin/policy-rc.d', if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y procps wget tar liblz4-tool"', 'docker exec -t mtr$${DRONE_BUILD_NUMBER} mkdir core', 'docker exec -t mtr$${DRONE_BUILD_NUMBER} chmod 777 core', 'docker exec -t mtr$${DRONE_BUILD_NUMBER} sysctl -w kernel.core_pattern="/core/%E_mtr_core_dump.%p"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_format + '"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_check + '"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_drop + '"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "wget ' + ansi2html + '"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "wget ' + logs + '"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_format.sh"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_check.sh"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_drop.sh"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "chmod +x ansi2html.sh"', - 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "chmod +x logs.sh"', - if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "yum install -y wget gawk gdb epel-release diffutils which rsyslog hostname patch perl cracklib-dicts procps-ng && yum install -y /' + result + '/*.' + pkg_format + '"' else '', + 'docker cp core_dumps/. 
mtr$${DRONE_BUILD_NUMBER}:/', + if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "yum install -y wget gawk gdb epel-release diffutils which rsyslog hostname patch perl cracklib-dicts procps-ng"' else '', if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} sed -i "s/exit 101/exit 0/g" /usr/sbin/policy-rc.d', - if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y wget gawk gdb rsyslog hostname patch && apt install -y -f /' + result + '/*.' + pkg_format + '"' else '', + if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y wget gawk gdb rsyslog hostname patch"' else '', + 'docker cp setup-repo.sh mtr$${DRONE_BUILD_NUMBER}:/', + 'docker exec -t mtr$${DRONE_BUILD_NUMBER} /setup-repo.sh', + if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "apt install -y mariadb-plugin-columnstore mariadb-test"' else 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "yum install -y MariaDB-columnstore-engine MariaDB-test"', 'docker cp mysql-test/columnstore mtr$${DRONE_BUILD_NUMBER}:' + mtr_path + '/suite/', 'docker exec -t mtr$${DRONE_BUILD_NUMBER} chown -R mysql:mysql ' + mtr_path, // disable systemd 'ProtectSystem' (we need to write to /usr/share/) "docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c 'sed -i /ProtectSystem/d $(systemctl show --property FragmentPath mariadb | sed s/FragmentPath=//)'", - if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo \"character_set_server=latin1\" >> /etc/mysql/mariadb.conf.d/columnstore.cnf"' else 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo \"character_set_server=latin1\" >> /etc/my.cnf.d/columnstore.cnf"', - if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo \"collation_server=latin1_swedish_ci\" >> /etc/mysql/mariadb.conf.d/columnstore.cnf"' else 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo \"collation_server=latin1_swedish_ci\" >> /etc/my.cnf.d/columnstore.cnf"', + if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo character_set_server=latin1 >> /etc/mysql/mariadb.conf.d/columnstore.cnf"' else 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo character_set_server=latin1 >> /etc/my.cnf.d/columnstore.cnf"', + if (pkg_format == 'deb') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo collation_server=latin1_swedish_ci >> /etc/mysql/mariadb.conf.d/columnstore.cnf"' else 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "echo collation_server=latin1_swedish_ci >> /etc/my.cnf.d/columnstore.cnf"', 'docker exec -t mtr$${DRONE_BUILD_NUMBER} systemctl daemon-reload', 'docker exec -t mtr$${DRONE_BUILD_NUMBER} systemctl start mariadb', 'docker exec -t mtr$${DRONE_BUILD_NUMBER} mariadb -e "create database if not exists test;"', @@ -305,7 +290,7 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') 'MTR_SUITE_LIST=$([ "$MTR_FULL_SUITE" == true ] && echo "' + mtr_full_set + '" || echo "$MTR_SUITE_LIST")', if (event == 'custom' || event == 'cron') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "wget -qO- https://cspkg.s3.amazonaws.com/mtr-test-data.tar.lz4 | lz4 -dc - | tar xf - -C /"', if (event == 'custom' || event == 'cron') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "cd ' + mtr_path + ' && ./mtr --extern socket=' + 
socket_path + ' --force --print-core=detailed --print-method=gdb --max-test-fail=0 --suite=columnstore/setup"',
-      if (event == 'cron') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "cd ' + mtr_path + ' && ./mtr --extern socket=' + socket_path + ' --force --print-core=detailed --print-method=gdb --max-test-fail=0 --suite=' + std.join(",", std.map(function(x) "columnstore/" + x, std.split(mtr_full_set, ","))) + '"' else 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "cd ' + mtr_path + ' && ./mtr --extern socket=' + socket_path + ' --force --print-core=detailed --print-method=gdb --max-test-fail=0 --suite=columnstore/$${MTR_SUITE_LIST//,/,columnstore/}"',
+      if (event == 'cron') then 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "cd ' + mtr_path + ' && ./mtr --extern socket=' + socket_path + ' --force --print-core=detailed --print-method=gdb --max-test-fail=0 --suite=' + std.join(',', std.map(function(x) 'columnstore/' + x, std.split(mtr_full_set, ','))) + '"' else 'docker exec -t mtr$${DRONE_BUILD_NUMBER} bash -c "cd ' + mtr_path + ' && ./mtr --extern socket=' + socket_path + ' --force --print-core=detailed --print-method=gdb --max-test-fail=0 --suite=columnstore/$${MTR_SUITE_LIST//,/,columnstore/}"',
     ],
   },
   mtrlog:: {
@@ -336,52 +321,51 @@
   },
   prepare_regression:: {
     name: 'prepare regression',
-    depends_on: ['mtr'],
+    depends_on: ['mtr', 'publish pkg', 'publish cmapi build'],
     when: {
       status: ['success', 'failure'],
     },
     image: 'docker:git',
     volumes: [pipeline._volumes.docker, pipeline._volumes.mdb],
     environment: {
-      REGRESSION_REF: '${REGRESSION_REF:-' + regression_ref + '}',
+      REGRESSION_BRANCH_REF: '${DRONE_SOURCE_BRANCH}',
+      REGRESSION_REF_AUX: regression_ref,
     },
     commands: [
+      // compute the regression branch to use.
+      'echo "$$REGRESSION_REF"',
+      'echo "$$REGRESSION_BRANCH_REF"',
+      // if REGRESSION_REF is empty, check whether the regression repository has a branch named like the PR source branch.
+      'export REGRESSION_REF=$${REGRESSION_REF:-$$(git ls-remote https://github.com/mariadb-corporation/mariadb-columnstore-regression-test --h --sort origin "refs/heads/$$REGRESSION_BRANCH_REF" | grep -E -o "[^/]+$$")}',
+      'echo "$$REGRESSION_REF"',
+      // REGRESSION_REF can still be empty if the regression repository has no matching branch;
+      // fall back to the default ref in that case.
+      'export REGRESSION_REF=$${REGRESSION_REF:-$$REGRESSION_REF_AUX}',
+      'echo "$$REGRESSION_REF"',
      // clone regression test repo
      'git clone --recurse-submodules --branch $$REGRESSION_REF --depth 1 https://github.com/mariadb-corporation/mariadb-columnstore-regression-test',
      // where are we now?
'cd mariadb-columnstore-regression-test', 'git rev-parse --abbrev-ref HEAD && git rev-parse HEAD', 'cd ..', - 'docker run --shm-size=500m --volume /sys/fs/cgroup:/sys/fs/cgroup:ro --env DEBIAN_FRONTEND=noninteractive --env MCS_USE_S3_STORAGE=0 --ulimit core=-1 --name regression$${DRONE_BUILD_NUMBER} --privileged --detach ' + img + ' ' + init + ' --unit=basic.target', - // copy packages, regresssion test suite and storage manager unit test binary to the instance - 'docker cp ' + result + ' regression$${DRONE_BUILD_NUMBER}:/', - if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "yum install -y procps-ng wget"', + 'docker run --shm-size=500m --volume /sys/fs/cgroup:/sys/fs/cgroup:ro --env OS=' + result + ' --env PACKAGES_URL=' + packages_url + ' --env DEBIAN_FRONTEND=noninteractive --env MCS_USE_S3_STORAGE=0 --name regression$${DRONE_BUILD_NUMBER} --ulimit core=-1 --privileged --detach ' + img + ' ' + init + ' --unit=basic.target', + if (pkg_format == 'rpm') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "yum install -y wget gawk gdb gcc-c++ epel-release diffutils tar findutils lz4 wget which rsyslog hostname procps-ng elfutils"' else 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y wget tar liblz4-tool procps wget findutils gawk gdb rsyslog hostname g++"', if (pkg_format == 'deb') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} sed -i "s/exit 101/exit 0/g" /usr/sbin/policy-rc.d', - if (pkg_format == 'deb') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y procps wget"', 'docker exec -t regression$${DRONE_BUILD_NUMBER} mkdir core', 'docker exec -t regression$${DRONE_BUILD_NUMBER} chmod 777 core', 'docker exec -t regression$${DRONE_BUILD_NUMBER} sysctl -w kernel.core_pattern="/core/%E_regression_core_dump.%p"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_format + '"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_check + '"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "wget ' + core_dump_drop + '"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "wget ' + ansi2html + '"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "wget ' + logs + '"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_format.sh"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_check.sh"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "chmod +x core_dump_drop.sh"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "chmod +x ansi2html.sh"', - 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "chmod +x logs.sh"', + 'docker cp core_dumps/. 
regression$${DRONE_BUILD_NUMBER}:/', 'docker cp mariadb-columnstore-regression-test regression$${DRONE_BUILD_NUMBER}:/', // list storage manager binary 'ls -la /mdb/' + builddir + '/storage/columnstore/columnstore/storage-manager', 'docker cp /mdb/' + builddir + '/storage/columnstore/columnstore/storage-manager regression$${DRONE_BUILD_NUMBER}:/', // check storage-manager unit test binary file 'docker exec -t regression$${DRONE_BUILD_NUMBER} ls -l /storage-manager', - if (std.split(platform, ':')[0] == 'centos' || std.split(platform, ':')[0] == 'rockylinux') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "yum install -y wget gawk gdb gcc-c++ epel-release diffutils tar lz4 wget which rsyslog hostname procps-ng && yum install -y /' + result + '/*.' + pkg_format + '"' else '', - if (pkg_format == 'deb') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} sed -i "s/exit 101/exit 0/g" /usr/sbin/policy-rc.d', - if (pkg_format == 'deb') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y wget tar liblz4-tool wget gawk gdb rsyslog hostname && apt install -y -f g++ /' + result + '/*.' + pkg_format + '"' else '', // copy test data for regression test suite 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "wget -qO- https://cspkg.s3.amazonaws.com/testData.tar.lz4 | lz4 -dc - | tar xf - -C mariadb-columnstore-regression-test/"', + 'docker cp setup-repo.sh regression$${DRONE_BUILD_NUMBER}:/', + 'docker exec -t regression$${DRONE_BUILD_NUMBER} /setup-repo.sh', + if (pkg_format == 'deb') then 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "apt install -y mariadb-plugin-columnstore"' else 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "yum install -y MariaDB-columnstore-engine"', // set mariadb lower_case_table_names=1 config option 'docker exec -t regression$${DRONE_BUILD_NUMBER} sed -i "/^.mariadb.$/a lower_case_table_names=1" ' + config_path_prefix + 'server.cnf', // set default client character set to utf-8 @@ -401,7 +385,6 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') 'docker exec -t regression$${DRONE_BUILD_NUMBER} /usr/bin/g++ /mariadb-columnstore-regression-test/mysql/queries/queryTester.cpp -O2 -o /mariadb-columnstore-regression-test/mysql/queries/queryTester', ], }, - smokelog:: { name: 'smokelog', depends_on: ['smoke'], @@ -427,6 +410,29 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') status: ['success', 'failure'], }, }, + cmapilog:: { + name: 'cmapi log', + depends_on: ['cmapi test'], + image: 'docker', + volumes: [pipeline._volumes.docker], + commands: [ + 'echo "---------- start mariadb service logs ----------"', + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} journalctl -u mariadb --no-pager || echo "mariadb service failure"', + 'echo "---------- end mariadb service logs ----------"', + 'echo', + 'echo "---------- start columnstore debug log ----------"', + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} cat /var/log/mariadb/columnstore/debug.log || echo "missing columnstore debug.log"', + 'echo "---------- end columnstore debug log ----------"', + 'echo "---------- start cmapi log ----------"', + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} cat /var/log/mariadb/columnstore/cmapi_server.log || echo "missing cmapi cmapi_server.log"', + 'echo "---------- end cmapi log ----------"', + 'ls -l /drone/src/' + result, + 'docker stop cmapi$${DRONE_BUILD_NUMBER} && docker rm cmapi$${DRONE_BUILD_NUMBER} || echo "cleanup cmapi failure"', + ], + 
when: { + status: ['success', 'failure'], + }, + }, regressionlog:: { name: 'regressionlog', depends_on: [regression_tests[std.length(regression_tests) - 1]], @@ -437,13 +443,16 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') 'docker exec -t --workdir /mariadb-columnstore-regression-test/mysql/queries/nightly/alltest regression$${DRONE_BUILD_NUMBER} cat go.log || echo "missing go.log"', 'echo "---------- end columnstore regression short report ----------"', 'echo', + 'docker cp regression$${DRONE_BUILD_NUMBER}:/mariadb-columnstore-regression-test/mysql/queries/nightly/alltest/reg-logs/ /drone/src/' + result + '/', 'docker cp regression$${DRONE_BUILD_NUMBER}:/mariadb-columnstore-regression-test/mysql/queries/nightly/alltest/testErrorLogs.tgz /drone/src/' + result + '/ || echo "missing testErrorLogs.tgz"', + 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "tar czf regressionQueries.tgz /mariadb-columnstore-regression-test/mysql/queries/"', 'docker exec -t --workdir /mariadb-columnstore-regression-test/mysql/queries/nightly/alltest regression$${DRONE_BUILD_NUMBER} bash -c "tar czf testErrorLogs2.tgz *.log /var/log/mariadb/columnstore" || echo "failed to grab regression results"', 'docker cp regression$${DRONE_BUILD_NUMBER}:/mariadb-columnstore-regression-test/mysql/queries/nightly/alltest/testErrorLogs2.tgz /drone/src/' + result + '/ || echo "missing testErrorLogs.tgz"', 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "/logs.sh regression"', 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "/core_dump_check.sh core /core/ Regression"', 'docker cp regression$${DRONE_BUILD_NUMBER}:/unit_logs/ /drone/src/' + result + '/', 'docker cp regression$${DRONE_BUILD_NUMBER}:/core/ /drone/src/' + result + '/', + 'docker cp regression$${DRONE_BUILD_NUMBER}:regressionQueries.tgz /drone/src/' + result + '/', 'ls -l /drone/src/' + result, 'docker exec -t regression$${DRONE_BUILD_NUMBER} bash -c "/core_dump_drop.sh core"', 'docker stop regression$${DRONE_BUILD_NUMBER} && docker rm regression$${DRONE_BUILD_NUMBER} || echo "cleanup regression failure"', @@ -454,7 +463,7 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') }, dockerfile:: { name: 'dockerfile', - depends_on: ['publish pkg'], + depends_on: ['publish pkg', 'publish cmapi build'], //failure: 'ignore', image: 'alpine/git', commands: [ @@ -474,7 +483,7 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') // branchp has slash if not empty MCS_BASEURL: 'https://cspkg.s3.amazonaws.com/' + branchp + event + '/${DRONE_BUILD_NUMBER}/' + server + '/' + arch + '/' + result + '/', CMAPI_REPO: 'cmapi', - CMAPI_BASEURL: 'https://cspkg.s3.amazonaws.com/cmapi/develop/latest/' + arch + '/', + CMAPI_BASEURL: 'https://cspkg.s3.amazonaws.com/' + branchp + event + '/${DRONE_BUILD_NUMBER}/' + server + '/' + arch + '/' + result + '/', }, settings: { repo: 'mariadb/enterprise-columnstore-dev', @@ -490,6 +499,76 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') }, }, }, + cmapipython:: { + name: 'cmapi python', + image: img, + volumes: [pipeline._volumes.mdb], + environment: { + PYTHON_URL_AMD64: 'https://github.com/indygreg/python-build-standalone/releases/download/20220802/cpython-3.9.13+20220802-x86_64_v3-unknown-linux-gnu-pgo+lto-full.tar.zst', + PYTHON_URL_ARM64: 'https://github.com/indygreg/python-build-standalone/releases/download/20220802/cpython-3.9.13+20220802-aarch64-unknown-linux-gnu-noopt-full.tar.zst', + }, + 
commands: [ + 'cd cmapi', + '%s install -y wget zstd findutils gcc' % if (pkg_format == 'rpm') then 'yum install -y epel-release && yum makecache && yum ' else 'apt update && apt', + 'wget -qO- $${PYTHON_URL_' + std.asciiUpper(arch) + '} | tar --use-compress-program=unzstd -xf - -C ./', + 'mv python pp && mv pp/install python', + 'chown -R root:root python', + if (platform == 'rockylinux:9') then 'yum install -y libxcrypt-compat', + if (arch == 'arm64') then 'export CC=gcc', + 'python/bin/pip3 install -t deps --only-binary :all -r requirements.txt', + './cleanup.sh', + 'cp cmapi_server/cmapi_server.conf cmapi_server/cmapi_server.conf.default', + ], + }, + cmapibuild:: { + name: 'cmapi build', + depends_on: ['cmapi python'], + image: img, + volumes: [pipeline._volumes.mdb], + environment: { + DEBIAN_FRONTEND: 'noninteractive', + }, + commands: [ + 'cd cmapi', + if (pkg_format == 'rpm') then 'yum install -y cmake make rpm-build libarchive createrepo findutils' else 'apt update && apt install --no-install-recommends -y cmake make dpkg-dev', + if (platform == 'centos:7') then 'yum install -y epel-release && yum install -y cmake3 && ln -sf /usr/bin/cmake3 /usr/bin/cmake', + './cleanup.sh', + 'cmake -D' + std.asciiUpper(pkg_format) + '=1 . && make package', + 'mkdir ./' + result, + 'mv -v *.%s ./%s/' % [pkg_format, result], + if (pkg_format == 'rpm') then 'createrepo ./' + result else 'dpkg-scanpackages %s | gzip > ./%s/Packages.gz' % [result, result], + 'mkdir /drone/src/' + result, + 'yes | cp -vr ./%s /drone/src/' % result, + ], + }, + cmapitest:: { + name: 'cmapi test', + depends_on: ['publish cmapi build', 'smoke'], + image: 'docker:git', + volumes: [pipeline._volumes.docker], + environment: { + PYTHONPATH: '/usr/share/columnstore/cmapi/deps', + }, + commands: [ + 'docker run --volume /sys/fs/cgroup:/sys/fs/cgroup:ro --env OS=' + result + ' --env PACKAGES_URL=' + packages_url + ' --env DEBIAN_FRONTEND=noninteractive --env MCS_USE_S3_STORAGE=0 --env PYTHONPATH=$${PYTHONPATH} --name cmapi$${DRONE_BUILD_NUMBER} --ulimit core=-1 --privileged --detach ' + img + ' ' + init + ' --unit=basic.target', + if (pkg_format == 'rpm') then 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "yum install -y iproute sudo epel-release which rsyslog hostname procps-ng"' else 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "apt update --yes && apt install -y iproute2 rsyslog hostname procps sudo"', + if (pkg_format == 'deb') then 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} sed -i "s/exit 101/exit 0/g" /usr/sbin/policy-rc.d', + if (platform == 'rockylinux:9') then 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "yum install -y libxcrypt-compat"', + 'docker cp setup-repo.sh cmapi$${DRONE_BUILD_NUMBER}:/', + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} /setup-repo.sh', + if (pkg_format == 'deb') then 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "apt install -y mariadb-plugin-columnstore mariadb-columnstore-cmapi"' else 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "yum install -y MariaDB-columnstore-engine MariaDB-columnstore-cmapi"', + 'cd cmapi', + 'for i in mcs_node_control cmapi_server failover; do docker cp $${i}/test cmapi$${DRONE_BUILD_NUMBER}:' + cmapi_path + '/$${i}/; done', + 'docker cp run_tests.py cmapi$${DRONE_BUILD_NUMBER}:' + cmapi_path + '/', + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} systemctl start mariadb-columnstore-cmapi', + // set API key to /etc/columnstore/cmapi_server.conf + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "mcs cluster set api-key --key 
somekey123"', + // copy cmapi conf file for test purposes (there are api key already set inside) + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "cp %s/cmapi_server.conf %s/cmapi_server/"' % [etc_path, cmapi_path], + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} systemctl stop mariadb-columnstore-cmapi', + 'docker exec -t cmapi$${DRONE_BUILD_NUMBER} bash -c "cd ' + cmapi_path + ' && python/bin/python3 run_tests.py"', + ], + }, multi_node_mtr:: { name: 'mtr', depends_on: ['dockerhub'], @@ -606,8 +685,9 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') if (platform == 'ubuntu:22.04') then 'apt install -y lto-disabled-list && for i in mariadb-plugin-columnstore mariadb-server mariadb-server-core mariadb mariadb-10.6; do echo "$i any" >> /usr/share/lto-disabled-list/lto-disabled-list; done && grep mariadb /usr/share/lto-disabled-list/lto-disabled-list', platformMap(platform, arch), 'sccache --show-stats', - if (pkg_format == 'rpm') then 'mv *.' + pkg_format + ' ' + result + '/' else 'mv ../*.' + pkg_format + ' ' + result + '/', - if (pkg_format == 'rpm') then 'createrepo ' + result else 'dpkg-scanpackages ' + result + ' | gzip > ' + result + '/Packages.gz', + // move engine and cmapi packages to one dir to make a repo + 'mv -v -t ./%s/ %s/*.%s /drone/src/cmapi/%s/*.%s ' % [result, if (pkg_format == 'rpm') then '.' else '..', pkg_format, result, pkg_format], + if (pkg_format == 'rpm') then 'createrepo ./' + result else 'dpkg-scanpackages %s | gzip > ./%s/Packages.gz' % [result, result], // list storage manager binary 'ls -la /mdb/' + builddir + '/storage/columnstore/columnstore/storage-manager', ], @@ -629,7 +709,7 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') { name: 'pkg', depends_on: ['unittests'], - image: 'docker:git', + image: 'alpine/git', when: { status: ['success', 'failure'], }, @@ -639,19 +719,33 @@ local Pipeline(branch, platform, event, arch='amd64', server='10.6-enterprise') 'echo "engine: $DRONE_COMMIT" > buildinfo.txt', 'echo "server: $$(git rev-parse HEAD)" >> buildinfo.txt', 'echo "buildNo: $DRONE_BUILD_NUMBER" >> buildinfo.txt', - 'mv buildinfo.txt ' + result + '/', - 'mv ' + result + ' /drone/src/', + 'mv buildinfo.txt ./%s/' % result, + 'yes | cp -vr ./%s/. 
/drone/src/%s/' % [result, result], 'ls -l /drone/src/' + result, 'echo "check columnstore package:"', - 'ls -l /drone/src/' + result + ' | grep columnstore', + 'ls -l /drone/src/%s | grep columnstore' % result, ], }, ] + + [pipeline.cmapipython] + [pipeline.cmapibuild] + + [pipeline.publish('cmapi build')] + [pipeline.publish()] + + [ + { + name: 'publish pkg url', + depends_on: ['publish pkg'], + image: 'alpine/git', + commands: [ + "echo -e '\\e]8;;" + publish_pkg_url + '\\e\\\\' + publish_pkg_url + "\\e]8;;\\e\\\\'", + ], + }, + ] + (if (event == 'cron') then [pipeline.publish('pkg latest', 'latest')] else []) + [pipeline.smoke] + [pipeline.smokelog] + [pipeline.publish('smokelog')] + + [pipeline.cmapitest] + + [pipeline.cmapilog] + (if (platform == 'rockylinux:8' && arch == 'amd64') then [pipeline.dockerfile] + [pipeline.dockerhub] + [pipeline.multi_node_mtr] else [pipeline.mtr] + [pipeline.publish('mtr')] + [pipeline.mtrlog] + [pipeline.publish('mtrlog')]) + (if (event == 'cron' && platform == 'rockylinux:8' && arch == 'amd64') then [pipeline.publish('mtr latest', 'latest')] else []) + [pipeline.prepare_regression] + @@ -697,7 +791,6 @@ local FinalPipeline(branch, event) = { std.map(function(p) std.join(' ', [branch, p, event, 'arm64', '10.6-enterprise']), platforms_arm.develop), }; - [ Pipeline(b, p, e, 'amd64', s) for b in std.objectFields(platforms) diff --git a/build/ansi2txt.sh b/build/ansi2txt.sh new file mode 100755 index 000000000..009cd669c --- /dev/null +++ b/build/ansi2txt.sh @@ -0,0 +1,2 @@ +bash -c "tee >(sed $'s/\033[[][^A-Za-z]*m//g' > $1)" + diff --git a/build/bootstrap_mcs.sh b/build/bootstrap_mcs.sh index 3ead2a489..40e3ef3a5 100755 --- a/build/bootstrap_mcs.sh +++ b/build/bootstrap_mcs.sh @@ -22,7 +22,7 @@ DISTRO_OPTIONS=("Ubuntu" "CentOS" "Debian" "Rocky") cd $SCRIPT_LOCATION CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD) BRANCHES=($(git branch --list --no-color| grep "[^* ]+" -Eo)) -cd - +cd - > /dev/null optparse.define short=t long=build-type desc="Build Type: ${BUILD_TYPE_OPTIONS[*]}" variable=MCS_BUILD_TYPE @@ -34,10 +34,17 @@ optparse.define short=u long=skip-unit-tests desc="Skip UnitTests" variable=SKIP optparse.define short=B long=run-microbench="Compile and run microbenchmarks " variable=RUN_BENCHMARKS default=false value=true optparse.define short=b long=branch desc="Choose git branch. 
For menu use -b \"\"" variable=BRANCH default=$CURRENT_BRANCH
 optparse.define short=D long=without-core-dumps desc="Do not produce core dumps" variable=WITHOUT_COREDUMPS default=false value=true
-optparse.define short=A long=asan desc="Build with ASAN" variable=ASAN default=false value=true
 optparse.define short=v long=verbose desc="Verbose makefile commands" variable=MAKEFILE_VERBOSE default=false value=true
+optparse.define short=A long=asan desc="Build with ASAN" variable=ASAN default=false value=true
+optparse.define short=T long=tsan desc="Build with TSAN" variable=TSAN default=false value=true
+optparse.define short=U long=ubsan desc="Build with UBSAN" variable=UBSAN default=false value=true
 optparse.define short=P long=report-path desc="Path for storing reports and profiles" variable=REPORT_PATH default="/core"
+optparse.define short=N long=ninja desc="Build with ninja" variable=USE_NINJA default=false value=true
+optparse.define short=G long=draw-deps desc="Draw dependencies graph" variable=DRAW_DEPS default=false value=true
+optparse.define short=M long=skip-smoke desc="Skip final smoke test" variable=SKIP_SMOKE default=false value=true
 optparse.define short=n long=no-clean-install desc="Do not perform a clean install (keep existing db files)" variable=NO_CLEAN default=false value=true
+optparse.define short=j long=parallel desc="Number of parallel jobs for the build" variable=CPUS default=$(getconf _NPROCESSORS_ONLN)
+optparse.define short=F long=show-build-flags desc="Print CMake flags during the build" variable=PRINT_CMAKE_FLAGS default=false

 source $( optparse.build )
@@ -79,15 +86,16 @@ select_branch()
         message "Turning off Columnstore submodule auto update via gitconfig"
         cd $MDB_SOURCE_PATH
         git config submodule.storage/columnstore/columnstore.update none
-        cd -
+        cd - > /dev/null
     fi
-    cd -
+    cd - > /dev/null
     message "Columnstore will be built from $color_yellow$CURRENT_BRANCH$color_normal branch"
 }

 install_deps()
 {
+    message_split
     message "Installing deps"
     if [[ $OS = 'Ubuntu' || $OS = 'Debian' ]]; then
         apt-get -y update
@@ -95,9 +103,9 @@ install_deps()
         libncurses5-dev libaio-dev libsystemd-dev libpcre2-dev \
         libperl-dev libssl-dev libxml2-dev libkrb5-dev flex libpam-dev git \
         libsnappy-dev libcurl4-openssl-dev libgtest-dev libcppunit-dev googletest libsnappy-dev libjemalloc-dev \
-        liblz-dev liblzo2-dev liblzma-dev liblz4-dev libbz2-dev libbenchmark-dev
+        liblz-dev liblzo2-dev liblzma-dev liblz4-dev libbz2-dev libbenchmark-dev graphviz
-    elif [[ $OS = 'CentOS' || $OS = 'Rocky' ]]; then
+    elif [[ $OS = 'CentOS' || $OS = 'Rocky' || $OS = 'Fedora' ]]; then
         if [[ "$OS_VERSION" == "7" ]]; then
             yum -y install cmake3 epel-release centos-release-scl
             CMAKE_BIN_NAME=cmake3
@@ -109,16 +117,23 @@ install_deps()
             yum -y groupinstall "Development Tools" && yum config-manager --set-enabled powertools
             yum install -y checkpolicy
         fi
-        yum -y install epel-release \
-        && yum -y install bison ncurses-devel readline-devel perl-devel openssl-devel libxml2-devel gperf libaio-devel libevent-devel tree wget pam-devel snappy-devel libicu \
-        && yum -y install vim wget strace ltrace gdb rsyslog net-tools openssh-server expect boost perl-DBI libicu boost-devel initscripts \
-        && yum -y install jemalloc-devel libcurl-devel gtest-devel cppunit-devel systemd-devel install lzo-devel xz-devel lz4-devel bzip2-devel \
-        && yum -y install pcre2-devel
+        if [[ $OS != 'Fedora' ]]; then
+            yum -y install epel-release
+        fi
+
+        yum install -y bison ncurses-devel readline-devel perl-devel openssl-devel libxml2-devel gperf libaio-devel libevent-devel tree wget pam-devel snappy-devel libicu \
+        vim wget strace ltrace gdb rsyslog net-tools openssh-server expect boost perl-DBI libicu boost-devel initscripts \
+        jemalloc-devel libcurl-devel gtest-devel cppunit-devel systemd-devel lzo-devel xz-devel lz4-devel bzip2-devel \
+        pcre2-devel flex graphviz libaio-devel openssl-devel
+    else
+        error "Unsupported OS $OS"
+        exit 17
     fi
 }
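Note: a hypothetical invocation exercising several of the options defined above (short flags as declared by the optparse stanzas; exact long-option syntax depends on the optparse helper library):

    # sketch: Ninja build, 8 parallel jobs, print CMake flags, skip the final smoke test
    ./build/bootstrap_mcs.sh -t RelWithDebInfo -N -j 8 -F -M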
 stop_service()
 {
+    message_split
     message "Stopping MariaDB services"
     systemctl stop mariadb
     systemctl stop mariadb-columnstore
@@ -127,25 +142,33 @@ check_service()
 check_service()
 {
     if systemctl is-active --quiet $1; then
-        message "$1 service started$color_green OK $color_normal"
+        message "$1 $color_normal[$color_green OK $color_normal]"
     else
-        error "$1 service failed"
+        message "$1 $color_normal[$color_red Fail $color_normal]"
         service $1 status
     fi
 }

 start_service()
 {
+    message_split
     message "Starting MariaDB services"
     systemctl start mariadb-columnstore
     systemctl start mariadb
-    check_service mariadb-columnstore
-    check_service mariadb
+    check_service mariadb
+    check_service mariadb-columnstore
+    check_service mcs-controllernode
+    check_service mcs-ddlproc
+    check_service mcs-dmlproc
+    check_service mcs-primproc
+    check_service mcs-workernode@1
+    check_service mcs-writeengineserver
 }

 clean_old_installation()
 {
+    message_split
     message "Cleaning old installation"
     rm -rf /var/lib/columnstore/data1/*
     rm -rf /var/lib/columnstore/data/
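Note: the build() function in the next hunk accumulates CMake options into MDB_CMAKE_FLAGS; as a rough sketch (flags as defined in this diff, default build type and report path assumed), a TSAN plus Ninja run boils down to a configure step like:

    # hypothetical composed configure call for -T -N
    cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DWITH_SYSTEMD=yes -GNinja \
          -DWITH_TSAN=ON -DWITH_COLUMNSTORE_REPORT_PATH=/core .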
$MDB_CMAKE_FLAGS" + if [[ $PRINT_CMAKE_FLAGS = true ]] ; then + message "Building with flags" + newline_array ${MDB_CMAKE_FLAGS[@]} + fi - local CPUS=$(getconf _NPROCESSORS_ONLN) - ${CMAKE_BIN_NAME} -DCMAKE_BUILD_TYPE=$MCS_BUILD_TYPE $MDB_CMAKE_FLAGS && \ | - make -j $CPUS - message "Installing silently" - make -j $CPUS install > /dev/null + message "Configuring cmake silently" + ${CMAKE_BIN_NAME} -DCMAKE_BUILD_TYPE=$MCS_BUILD_TYPE $MDB_CMAKE_FLAGS . | spinner + message_split + ${CMAKE_BIN_NAME} --build . -j $CPUS && \ + message "Installing silently" && + ${CMAKE_BIN_NAME} --install . | spinner 30 if [ $? -ne 0 ]; then + message_split error "!!!! BUILD FAILED !!!!" + message_split exit 1 fi - cd - + cd - > /dev/null } check_user_and_group() @@ -284,25 +335,27 @@ check_user_and_group() run_unit_tests() { + message_split if [[ $SKIP_UNIT_TESTS = true ]] ; then warn "Skipping unittests" else message "Running unittests" cd $MDB_SOURCE_PATH ${CTEST_BIN_NAME} . -R columnstore: -j $(nproc) --progress - cd - + cd - > /dev/null fi } run_microbenchmarks_tests() { + message_split if [[ $RUN_BENCHMARKS = false ]] ; then warn "Skipping microbenchmarks" else message "Runnning microbenchmarks" cd $MDB_SOURCE_PATH ${CTEST_BIN_NAME} . -V -R columnstore_microbenchmarks: -j $(nproc) --progress - cd - + cd - > /dev/null fi } @@ -324,8 +377,10 @@ fix_config_files() THREAD_STACK_SIZE="20M" SYSTEMD_SERVICE_DIR="/usr/lib/systemd/system" + MDB_SERVICE_FILE=$SYSTEMD_SERVICE_DIR/mariadb.service + COLUMNSTORE_CONFIG=$CONFIG_DIR/columnstore.cnf + if [[ $ASAN = true ]] ; then - COLUMNSTORE_CONFIG=$CONFIG_DIR/columnstore.cnf if grep -q thread_stack $COLUMNSTORE_CONFIG; then warn "MDB Server has thread_stack settings on $COLUMNSTORE_CONFIG check it's compatibility with ASAN" else @@ -333,19 +388,39 @@ fix_config_files() message "thread_stack was set to ${THREAD_STACK_SIZE} in $COLUMNSTORE_CONFIG" fi - MDB_SERVICE_FILE=$SYSTEMD_SERVICE_DIR/mariadb.service if grep -q ASAN $MDB_SERVICE_FILE; then warn "MDB Server has ASAN options in $MDB_SERVICE_FILE, check it's compatibility" else - echo Environment="'ASAN_OPTIONS=abort_on_error=1:disable_coredump=0,print_stats=false,detect_odr_violation=0,check_initialization_order=1,detect_stack_use_after_return=1,atexit=false,log_path=${ASAN_PATH}'" >> $MDB_SERVICE_FILE + echo Environment="'ASAN_OPTIONS=abort_on_error=1:disable_coredump=0,print_stats=false,detect_odr_violation=0,check_initialization_order=1,detect_stack_use_after_return=1,atexit=false,log_path=${REPORT_PATH}/asan.mariadb'" >> $MDB_SERVICE_FILE message "ASAN options were added to $MDB_SERVICE_FILE" fi fi + + if [[ $TSAN = true ]] ; then + if grep -q TSAN $MDB_SERVICE_FILE; then + warn "MDB Server has TSAN options in $MDB_SERVICE_FILE, check it's compatibility" + else + echo Environment="'TSAN_OPTIONS=abort_on_error=0,log_path=${REPORT_PATH}/tsan.mariadb'" >> $MDB_SERVICE_FILE + message "TSAN options were added to $MDB_SERVICE_FILE" + fi + fi + + if [[ $UBSAN = true ]] ; then + if grep -q UBSAN $MDB_SERVICE_FILE; then + warn "MDB Server has UBSAN options in $MDB_SERVICE_FILE, check it's compatibility" + else + echo Environment="'UBSAN_OPTIONS=abort_on_error=0,print_stacktrace=true,log_path=${REPORT_PATH}/ubsan.mariadb'" >> $MDB_SERVICE_FILE + message "UBSAN options were added to $MDB_SERVICE_FILE" + fi + fi + + message Reloading systemd systemctl daemon-reload } install() { + message_split message "Installing MariaDB" disable_plugins_for_bootstrap @@ -425,14 +500,29 @@ socket=/run/mysqld/mysqld.sock" > 
 install()
 {
+    message_split
     message "Installing MariaDB"

     disable_plugins_for_bootstrap
@@ -425,14 +500,29 @@
 socket=/run/mysqld/mysqld.sock" > /etc/my.cnf.d/socket.cnf'

 smoke()
 {
-    message "Creating test database"
-    mariadb -e "create database if not exists test;"
-    message "Selecting magic numbers"
-    MAGIC=`mysql -N test < $MDB_SOURCE_PATH/storage/columnstore/columnstore/tests/scripts/smoke.sql`
-    if [[ $MAGIC == '42' ]] ; then
-        message "Great answer correct"
-    else
-        warn "Smoke failed, answer is '$MAGIC'"
+    if [[ $SKIP_SMOKE = false ]] ; then
+        message_split
+        message "Creating test database"
+        mariadb -e "create database if not exists test;"
+        message "Selecting magic numbers"
+        MAGIC=`mysql -N test < $MDB_SOURCE_PATH/storage/columnstore/columnstore/tests/scripts/smoke.sql`
+        if [[ $MAGIC == '42' ]] ; then
+            message "Great, answer is correct!"
+        else
+            warn "Smoke failed, answer is '$MAGIC'"
+        fi
     fi
 }
+
+
+generate_svgs()
+{
+    if [[ $DRAW_DEPS = true ]] ; then
+        message_split
+        warn "Generating svgs with dependency graph to $REPORT_PATH"
+        for f in $MDB_SOURCE_PATH/mariadb.dot.*;
+        do dot -Tsvg -o $REPORT_PATH/`basename $f`.svg $f;
+        done
+    fi
 }

@@ -454,5 +544,6 @@
 run_microbenchmarks_tests
 install
 start_service
 smoke
+generate_svgs

-message "$color_green FINISHED $color_normal"
+message_splitted "FINISHED"
diff --git a/build/utils.sh b/build/utils.sh
index 133bfb036..9e9b72bf0 100644
--- a/build/utils.sh
+++ b/build/utils.sh
@@ -1,10 +1,11 @@
 color_normal=$(tput sgr0)
 color_bold=$(tput bold)
 color_red="$color_bold$(tput setaf 1)"
-color_green=$(tput setaf 2)
+color_green="$color_bold$(tput setaf 2)"
 color_fawn=$(tput setaf 3); color_beige="$color_fawn"
 color_yellow="$color_bold$color_fawn"
 color_darkblue=$(tput setaf 4)
+
 color_blue="$color_bold$color_darkblue"
 color_purple=$(tput setaf 5); color_magenta="$color_purple"
 color_pink="$color_bold$color_purple"
@@ -14,21 +15,89 @@
 color_gray=$(tput setaf 7)
 color_darkgray="$color_bold"$(tput setaf 0)
 color_white="$color_bold$color_gray"
+
+
+if [[ $(tput colors) == '256' ]]; then
+    color_red=$(tput setaf 196)
+    color_yellow=$(tput setaf 228)
+    color_cyan=$(tput setaf 87)
+    color_green=$(tput setaf 156)
+    color_darkgray=$(tput setaf 59)
+fi
+
 message()
 {
-    echo $color_cyan -- $@$color_normal
+    echo $color_cyan ・ $@$color_normal
 }

 warn()
 {
-    echo $color_yellow -- $@$color_normal
+    echo $color_yellow ・ $@$color_normal
 }

 error()
 {
-    echo $color_red -- $@$color_normal
+    echo $color_red ・ $@$color_normal
 }

+message_split()
+{
+    echo $color_darkgray ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ $color_normal
+}
+
+message_splitted()
+{
+    message_split
+    echo $color_green ・ $@$color_normal
+    message_split
+}
+
+
+colorify_array()
+{
+    PROMPT=""
+    for a in "$@"
+    do
+        # cycle i through the 256-color codes 106..122
+        i=$((((i+1) % (123-106)) + 106))
+        if [[ $(tput colors) == '256' ]]; then
+            PROMPT="$PROMPT $(tput setaf $i)$a$color_normal"
+        else
+            PROMPT="$PROMPT $a"
+        fi
+    done
+    echo $PROMPT
+}
+
+
+newline_array()
+{
+    PROMPT=""
+    for a in "$@"
+    do
+        PROMPT="$PROMPT$a\n"
+    done
+    echo -e $PROMPT
+}
+
+
+function spinner
+{
+    freq=${1:-10}
+    points=(⣾ ⣽ ⣻ ⢿ ⡿ ⣟ ⣯ ⣷)
+    colored_points=($(colorify_array ${points[@]}))
+    len=${#points[@]}
+    point_num=0
+    line_num=0
+    # consume stdin, advancing the spinner one frame every $freq lines read
+    while read data; do
+        line_num=$((line_num+1))
+        if [[ $((line_num % freq)) = 0 ]]; then
+            point_num=$(((point_num + 1) % len ))
+            echo -ne "\r${colored_points[point_num]}"
+        fi
+    done;
+    echo
+}
+
+
 detect_distro()
 {
     if [ -f /etc/os-release ]; then
diff --git a/cmapi/.gitignore b/cmapi/.gitignore
new file mode 100644
index 000000000..bdbe67f3f
--- /dev/null
+++ b/cmapi/.gitignore
@@ -0,0 +1,89 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ +venv/ +.python-version + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +#Ipython Notebook +.ipynb_checkpoints + +*.py.swp +python/ +deps/ +engine/ +cmapi_server/test/tmp.xml +systemd.env +mariadb-columnstore-cmapi.service +prerm +postinst +conffiles + +CMakeCache.txt +CMakeFiles +CMakeScripts +Makefile +cmake_install.cmake +install_manifest.txt +*CPack* +*.rpm +*.deb +result +centos8 +ubuntu20.04 +buildinfo.txt diff --git a/cmapi/CMakeLists.txt b/cmapi/CMakeLists.txt new file mode 100644 index 000000000..bde8bbaa4 --- /dev/null +++ b/cmapi/CMakeLists.txt @@ -0,0 +1,128 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 3.11) +PROJECT(cmapi NONE) + +SET(CPACK_PACKAGE_NAME "MariaDB-columnstore-cmapi") + +FILE(STRINGS VERSION CRUDE_CMAPI_VERSION_MAJOR REGEX "^CMAPI_VERSION_MAJOR=") +FILE(STRINGS VERSION CRUDE_CMAPI_VERSION_MINOR REGEX "^CMAPI_VERSION_MINOR=") +FILE(STRINGS VERSION CRUDE_CMAPI_VERSION_PATCH REGEX "^CMAPI_VERSION_PATCH=") +STRING(REPLACE "CMAPI_VERSION_MAJOR=" "" CMAPI_VERSION_MAJOR ${CRUDE_CMAPI_VERSION_MAJOR}) +STRING(REPLACE "CMAPI_VERSION_MINOR=" "" CMAPI_VERSION_MINOR ${CRUDE_CMAPI_VERSION_MINOR}) +STRING(REPLACE "CMAPI_VERSION_PATCH=" "" CMAPI_VERSION_PATCH ${CRUDE_CMAPI_VERSION_PATCH}) +SET(PACKAGE_VERSION "${CMAPI_VERSION_MAJOR}.${CMAPI_VERSION_MINOR}.${CMAPI_VERSION_PATCH}") +SET(CMAPI_USER "root") + +SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "MariaDB ColumnStore CMAPI: cluster management API and command line tool.") +SET(CPACK_PACKAGE_URL "http://www.mariadb.com") +SET(CPACK_PACKAGE_CONTACT "MariaDB Corporation Ab") +SET(CPACK_PACKAGE_SUMMARY "MariaDB ColumnStore CMAPI: cluster management API and command line tool.") +SET(CPACK_PACKAGE_VENDOR "MariaDB Corporation Ab") +SET(CPACK_PACKAGE_LICENSE "Copyright (c) 2023 MariaDB Corporation Ab.; redistributable under the terms of the GPLv2, see the file LICENSE.GPL2 for details.") + +SET(BIN_DIR "/usr/bin") +SET(ETC_DIR "/etc/columnstore") +SET(SHARE_DIR "/usr/share/columnstore") +SET(CMAPI_DIR "${SHARE_DIR}/cmapi") +SET(SYSTEMD_UNIT_DIR "/usr/lib/systemd/system") +SET(SYSTEMD_ENGINE_UNIT_NAME "mariadb-columnstore") +SET(CMAPI_CONF_FILEPATH "${ETC_DIR}/cmapi_server.conf") + +STRING(TOLOWER ${CPACK_PACKAGE_NAME} SYSTEMD_UNIT_NAME) + +CONFIGURE_FILE(service.template ${SYSTEMD_UNIT_NAME}.service) +CONFIGURE_FILE(systemd.env.template systemd.env) +CONFIGURE_FILE(postinst.template postinst) +CONFIGURE_FILE(prerm.template prerm) +CONFIGURE_FILE(conffiles.template conffiles) +CONFIGURE_FILE(mcs.template mcs) + +INSTALL(DIRECTORY python deps mcs_node_control failover cmapi_server engine_files mcs_cluster_tool + DESTINATION ${CMAPI_DIR} + USE_SOURCE_PERMISSIONS + PATTERN "test" EXCLUDE + PATTERN "cmapi_server.conf" EXCLUDE) +INSTALL(FILES LICENSE.GPL2 VERSION + DESTINATION ${CMAPI_DIR}) +INSTALL(FILES check_ready.sh + PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ + DESTINATION 
${CMAPI_DIR})
+INSTALL(FILES cmapi_server/cmapi_server.conf systemd.env
+        DESTINATION ${ETC_DIR})
+INSTALL(FILES ${SYSTEMD_UNIT_NAME}.service
+        DESTINATION ${SYSTEMD_UNIT_DIR})
+INSTALL(FILES mcs
+        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+        DESTINATION ${BIN_DIR})
+
+OPTION(RPM "Build an RPM" OFF)
+IF(RPM)
+    SET(CPACK_GENERATOR "RPM")
+    # cmake wants to byte-compile all .py files.
+    # the lines below prevent it from doing so, since byte-compiling
+    # can fail on systems without python3, like centos7
+    # more info:
+    # https://fedoraproject.org/wiki/Changes/No_more_automagic_Python_bytecompilation#Status_quo
+    # https://stackoverflow.com/questions/69988093/cmake-brp-python-bytecompile-and-python3
+    SET(CPACK_RPM_SPEC_INSTALL_POST "%global _python_bytecompile_extra 0")
+    SET(CPACK_RPM_SPEC_MORE_DEFINE "%global _python_bytecompile_extra 0")
+    # Turn off the brp-python-bytecompile script
+    # for every release including EPEL
+    # more info here:
+    # https://pagure.io/packaging-committee/issue/755
+    SET(CPACK_RPM_SPEC_INSTALL_POST "%global __os_install_post %(echo '%{__os_install_post}' | sed -e 's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g')")
+    SET(CPACK_RPM_SPEC_MORE_DEFINE "%global __os_install_post %(echo '%{__os_install_post}' | sed -e 's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g')")
+    # alternatively this could be solved by installing python3 on centos7 and adding this line:
+    # SET(CPACK_RPM_SPEC_MORE_DEFINE "%define __python %{__python3}")
+    # example here:
+    # https://github.com/irods/irods/pull/6347/files
+    # but that doesn't work because some CPack versions don't add definitions
+    # to the spec file using CPACK_RPM_SPEC_MORE_DEFINE
+
+    SET(CPACK_RPM_PACKAGE_VERSION ${PACKAGE_VERSION})
+    SET(CPACK_RPM_PACKAGE_NAME ${CPACK_PACKAGE_NAME})
+
+    SET(CPACK_RPM_PACKAGE_LICENSE "GPLv2")
+    SET(CPACK_RPM_PACKAGE_GROUP "Applications/Databases")
+    SET(CPACK_RPM_PACKAGE_URL ${CPACK_PACKAGE_URL})
+    SET(CPACK_RPM_PACKAGE_SUMMARY ${CPACK_PACKAGE_SUMMARY})
+    SET(CPACK_RPM_PACKAGE_VENDOR ${CPACK_PACKAGE_VENDOR})
+    SET(CPACK_RPM_PACKAGE_LICENSE ${CPACK_PACKAGE_LICENSE})
+    SET(CPACK_RPM_PACKAGE_DESCRIPTION ${CPACK_PACKAGE_DESCRIPTION_SUMMARY})
+
+    SET(CPACK_RPM_SPEC_MORE_DEFINE "%undefine __brp_mangle_shebangs")
+    SET(CPACK_RPM_PACKAGE_AUTOREQ "no")
+
+    SET(CPACK_RPM_POST_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/postinst)
+    SET(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/prerm)
+    SET(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION ${ETC_DIR} ${SHARE_DIR})
+    SET(CPACK_RPM_USER_FILELIST "%config(noreplace) ${CMAPI_CONF_FILEPATH}")
+
+    SET(CPACK_RPM_PACKAGE_OBSOLETES "mariadb-columnstore-cmapi")
+    SET(CPACK_RPM_PACKAGE_REQUIRES "curl")
+ENDIF()
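Note: the CI wires this file up in the cmapi build step above; the same packaging can be reproduced locally roughly as follows (generator flag depends on the target platform):

    cd cmapi && ./cleanup.sh
    cmake -DRPM=1 . && make package    # or: cmake -DDEB=1 . && make package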
+  STRING(TOLOWER ${CPACK_PACKAGE_NAME} CPACK_DEBIAN_PACKAGE_NAME) + STRING(TOLOWER ${CPACK_PACKAGE_NAME} CPACK_PACKAGE_NAME) + SET(CPACK_DEBIAN_PACKAGE_LICENSE "GPLv2") + SET(CPACK_DEBIAN_PACKAGE_URL ${CPACK_PACKAGE_URL}) + SET(CPACK_DEBIAN_PACKAGE_SUMMARY ${CPACK_PACKAGE_SUMMARY}) + SET(CPACK_DEBIAN_PACKAGE_VENDOR ${CPACK_PACKAGE_VENDOR}) + SET(CPACK_DEBIAN_PACKAGE_LICENSE ${CPACK_PACKAGE_LICENSE}) + SET(CPACK_DEBIAN_PACKAGE_DESCRIPTION ${CPACK_PACKAGE_DESCRIPTION_SUMMARY}) + + SET(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/prerm;${CMAKE_CURRENT_SOURCE_DIR}/postinst;${CMAKE_CURRENT_SOURCE_DIR}/conffiles") + + SET(CPACK_DEBIAN_PACKAGE_REPLACES "mariadb-columnstore-cmapi") + SET(CPACK_DEBIAN_PACKAGE_DEPENDS "curl") +ENDIF() + +SET(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${PACKAGE_VERSION}.${CMAKE_HOST_SYSTEM_PROCESSOR}") +INCLUDE (CPack) diff --git a/cmapi/LICENSE.GPL2 b/cmapi/LICENSE.GPL2 new file mode 100644 index 000000000..cd0782919 --- /dev/null +++ b/cmapi/LICENSE.GPL2 @@ -0,0 +1,339 @@ +GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents.
We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. 
+However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. 
Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year>  <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/cmapi/README.md b/cmapi/README.md new file mode 100644 index 000000000..0afef4ab1 --- /dev/null +++ b/cmapi/README.md @@ -0,0 +1,61 @@ +# CMAPI REST server +[![Build Status](https://ci.columnstore.mariadb.net/api/badges/mariadb-corporation/mariadb-columnstore-cmapi/status.svg)](https://ci.columnstore.mariadb.net/mariadb-corporation/mariadb-columnstore-cmapi) + +## Overview +This RESTful server enables multi-node setups for MCS. + +## Requirements + +See the requirements.txt file. + +All the Python package prerequisites are shipped with a pre-built Python interpreter. + +## Usage + +To run the server using the defaults, call: +```sh +python3 -m cmapi_server +``` +There is a configuration file inside cmapi_server. + +## Testing + +To launch the integration and unit tests, use unittest discovery mode: +```sh +python3 -m unittest discover -v mcs_node_control +python3 -m unittest discover -v cmapi_server +python3 -m unittest discover -v failover +``` + +The mcs_node_control unit tests require root privileges and an additional systemd unit +to run smoothly. + +## Build packages + +Packages bundle a Python interpreter and the Python dependencies. + +## Get dependencies + +```sh +# get portable python +wget -qO- https://cspkg.s3.amazonaws.com/python-dist-no-nis.tar.gz | tar xzf - -C ./ + +# install python dependencies +python/bin/pip3 install -t deps --only-binary :all -r requirements.txt +``` + +## RPM + +```sh +./cleanup.sh +yum install -y wget cmake make rpm-build +cmake -DRPM=1 . +make package +``` + +## DEB + +```sh +./cleanup.sh +DEBIAN_FRONTEND=noninteractive apt update && apt install -y cmake make +cmake -DDEB=1 .
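+# "make package" invokes CPack; with the VERSION file shipped in this change +# the artifact is named mariadb-columnstore-cmapi-23.03.1b.<arch>.deb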
+make package +``` diff --git a/cmapi/VERSION b/cmapi/VERSION new file mode 100644 index 000000000..d19fa468e --- /dev/null +++ b/cmapi/VERSION @@ -0,0 +1,3 @@ +CMAPI_VERSION_MAJOR=23 +CMAPI_VERSION_MINOR=03 +CMAPI_VERSION_PATCH=1b diff --git a/cmapi/check_ready.sh b/cmapi/check_ready.sh new file mode 100755 index 000000000..69e7c4034 --- /dev/null +++ b/cmapi/check_ready.sh @@ -0,0 +1,19 @@ +SEC_TO_WAIT=15 +echo -n "Waiting CMAPI to finish startup" +success=false +for i in $(seq 1 $SEC_TO_WAIT); do + echo -n "..$i" + if ! $(curl -k -s --output /dev/null --fail https://127.0.0.1:8640/cmapi/ready); then + sleep 1 + else + success=true + break + fi +done + +echo +if $success; then + echo "CMAPI ready to handle requests." +else + echo "CMAPI not ready after waiting $SEC_TO_WAIT seconds. Check log file for further details." +fi diff --git a/cmapi/cleanup.sh b/cmapi/cleanup.sh new file mode 100755 index 000000000..6423dcdf2 --- /dev/null +++ b/cmapi/cleanup.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -euo pipefail +IFS=$'\n\t' + +rm -rf \ + cmapi_server/test/tmp.xml \ + systemd.env \ + *.service \ + prerm \ + postinst \ + CMakeCache.txt \ + CMakeFiles \ + CMakeScripts \ + Makefile \ + cmake_install.cmake \ + install_manifest.txt \ + *CPack* \ +# buildinfo.txt + +find . -type d -name __pycache__ -exec rm -rf {} + +find . -type f -iname '*.swp' -exec rm -rf {} + diff --git a/cmapi/cmapi_server/SingleNode.xml b/cmapi/cmapi_server/SingleNode.xml new file mode 100644 index 000000000..67c8637bd --- /dev/null +++ b/cmapi/cmapi_server/SingleNode.xml @@ -0,0 +1,249 @@ + + + + 127.0.0.1 + 8601 + unassigned + + + 127.0.0.1 + 8630 + + + 127.0.0.1 + 8612 + + + 127.0.0.1 + 8614 + + + 10000 + + + 1 + 2 + 128 + 10K + 0 + 512 + 512 + + 1 + 0 + n + + + + y + + + + + + 127.0.0.1 + 8620 + + + columnstore-1 + pm1 + pm1 + + 1 + /var/lib/columnstore/data1 + /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves + /var/lib/columnstore/data1/systemFiles/dbrm/tablelocks + 15 + 100000 + 10 + 95 + OFF + + /rdwrscratch + + /tmp/columnstore_tmp_files + + + um + User Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + pm + Performance Module + 1 + 127.0.0.1 + localhost + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + 1 + 1 + + + 1000 + /var/lib/columnstore/data1/systemFiles/dbrm/SMTxnID + + + + 1GB + + + + /var/lib/columnstore/data1/systemFiles/dbrm/oidbitmap + + 3000 + + + /var/log/mariadb/columnstore/data/bulk + /var/lib/columnstore/data1/systemFiles/bulkRollback + 98 + 1 + n + + + 1 + 127.0.0.1 + 8616 + + + + 127.0.0.1 + 8700 + pm1 + + + + + + + 1 + 0 + 0 + 65536 + 2K + 200 + 0 + 50 + + + 2 + y + n + internal + internal + /etc/profile.d/columnstoreAlias.sh + + + + + 4 + 0x0 + + + 128 + 128K + 1G + 25% + 100 + N + Y + Snappy + + + 16K + 16 + 1 + + + + + 100 + + + + + + N + + + 127.0.0.1 + 3306 + root + + + + + + + N + + + N + + + Y + Snappy + + + 127.0.0.1 + 0 + + + 30 + N + + + + + diff --git a/cmapi/cmapi_server/__init__.py b/cmapi/cmapi_server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/cmapi_server/__main__.py b/cmapi/cmapi_server/__main__.py new file mode 100644 index 000000000..e5bf5b753 --- /dev/null +++ b/cmapi/cmapi_server/__main__.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 + +""" +CherryPy-based webservice daemon with background threads +""" + +import logging +import os +import threading +import time +from datetime import datetime, timedelta + +import cherrypy +from 
cherrypy.process import plugins +from cryptography import x509 +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives import serialization, hashes +from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography.x509.oid import NameOID + +# TODO: fix dispatcher selection logic: endpoints.py executes code at +# import time, which misconfigures the module logger +from cmapi_server.logging_management import config_cmapi_server_logging +config_cmapi_server_logging() + +from cmapi_server import helpers +from cmapi_server.constants import DEFAULT_MCS_CONF_PATH, CMAPI_CONF_PATH +from cmapi_server.controllers.dispatcher import dispatcher, jsonify_error +from cmapi_server.failover_agent import FailoverAgent +from cmapi_server.managers.process import MCSProcessManager +from cmapi_server.managers.application import AppManager +from failover.node_monitor import NodeMonitor +from mcs_node_control.models.dbrm_socket import SOCK_TIMEOUT, DBRMSocketHandler +from mcs_node_control.models.node_config import NodeConfig + + +cert_filename = './cmapi_server/self-signed.crt' + + +def worker(): + """Background timer that runs clean_txn_by_timeout() every 5 seconds. + + TODO: this needs to be fixed/optimized; creating the thread + repeatedly is wasteful. + """ + while True: + t = threading.Timer(5.0, clean_txn_by_timeout) + t.start() + t.join() + + +def clean_txn_by_timeout(): + txn_section = app.config.get('txn', None) + timeout_timestamp = txn_section.get('timeout') if txn_section is not None else None + current_timestamp = int(datetime.now().timestamp()) + if timeout_timestamp is not None and current_timestamp > timeout_timestamp: + txn_config_changed = txn_section.get('config_changed', None) + if txn_config_changed is True: + node_config = NodeConfig() + node_config.rollback_config() + node_config.apply_config( + xml_string=node_config.get_current_config() + ) + app.config.update({ + 'txn': { + 'id': 0, + 'timeout': 0, + 'manager_address': '', + 'config_changed': False, + }, + }) + + +class TxnBackgroundThread(plugins.SimplePlugin): + """CherryPy plugin to create a background worker thread""" + app = None + + def __init__(self, bus, app): + super(TxnBackgroundThread, self).__init__(bus) + self.t = None + self.app = app + + def start(self): + """Plugin entrypoint""" + + self.t = threading.Thread(target=worker, name='TxnBackgroundThread') + self.t.daemon = True + self.t.start() + + # Start at a higher priority than "Daemonize" (which we're not using + # yet but may in the future) + start.priority = 85 + + +class FailoverBackgroundThread(plugins.SimplePlugin): + """CherryPy plugin to start the thread for failover monitoring.""" + + def __init__(self, bus, turned_on): + super().__init__(bus) + self.node_monitor = NodeMonitor(agent=FailoverAgent()) + self.running = False + self.turned_on = turned_on + if self.turned_on: + logging.info( + 'Failover is turned ON by default or in CMAPI config file.'
+            ) + else: + logging.info('Failover is turned OFF in CMAPI config file.') + + def _start(self): + if self.running: + return + self.bus.log('Starting Failover monitor thread.') + self.node_monitor.start() + self.running = True + + def _stop(self): + if not self.running: + return + self.bus.log('Stopping Failover monitor thread.') + self.node_monitor.stop() + self.running = False + + def _subscriber(self, run_failover: bool): + if not self.turned_on: + return + if not isinstance(run_failover, bool): + self.bus.log(f'Got wrong object in failover channel: {run_failover}') + return + if run_failover: + self._start() + else: + self._stop() + + def start(self): + self.bus.subscribe('failover', self._subscriber) + + def stop(self): + cherrypy.engine.unsubscribe('failover', self._subscriber) + self._stop() + + +def create_self_signed_certificate(): + key_filename = './cmapi_server/self-signed.key' + + key = rsa.generate_private_key( + public_exponent=65537, + key_size=2048, + backend=default_backend() + ) + + with open(key_filename, "wb") as f: + f.write(key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.TraditionalOpenSSL, + encryption_algorithm=serialization.NoEncryption()), + ) + + subject = issuer = x509.Name([ + x509.NameAttribute(NameOID.COUNTRY_NAME, 'US'), + x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, 'California'), + x509.NameAttribute(NameOID.LOCALITY_NAME, 'Redwood City'), + x509.NameAttribute(NameOID.ORGANIZATION_NAME, 'MariaDB'), + x509.NameAttribute(NameOID.COMMON_NAME, 'mariadb.com'), + ]) + + basic_constraints = x509.BasicConstraints(ca=True, path_length=0) + + cert = x509.CertificateBuilder( + ).subject_name( + subject + ).issuer_name( + issuer + ).public_key( + key.public_key() + ).serial_number( + x509.random_serial_number() + ).not_valid_before( + datetime.utcnow() + ).not_valid_after( + datetime.utcnow() + timedelta(days=365) + ).add_extension( + basic_constraints, + False + ).add_extension( + x509.SubjectAlternativeName([x509.DNSName('localhost')]), + critical=False + ).sign(key, hashes.SHA256(), default_backend()) + + with open(cert_filename, 'wb') as f: + f.write(cert.public_bytes(serialization.Encoding.PEM)) + + +if __name__ == '__main__': + logging.info(f'CMAPI Version: {AppManager.get_version()}') + + # TODO: read cmapi config filepath as an argument + helpers.cmapi_config_check() + + if not os.path.exists(cert_filename): + create_self_signed_certificate() + + app = cherrypy.tree.mount(root=None, config=CMAPI_CONF_PATH) + app.config.update({ + '/': { + 'request.dispatch': dispatcher, + 'error_page.default': jsonify_error, + }, + 'config': { + 'path': CMAPI_CONF_PATH, + }, + }) + + cherrypy.config.update(CMAPI_CONF_PATH) + cfg_parser = helpers.get_config_parser(CMAPI_CONF_PATH) + dispatcher_name, dispatcher_path = helpers.get_dispatcher_name_and_path( + cfg_parser + ) + MCSProcessManager.detect(dispatcher_name, dispatcher_path) + # If there is no auto_failover flag in the config, turn it ON by default.
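+    # The flag is read from an [application] section of the CMAPI config file; +    # the cmapi_server.conf shipped here only has a [global] section, so e.g. +    #   [application] +    #   auto_failover = False +    # would have to be added by hand to turn failover off.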
+    turn_on_failover = cfg_parser.getboolean( + 'application', 'auto_failover', fallback=True + ) + TxnBackgroundThread(cherrypy.engine, app).subscribe() + # Subscribe the FailoverBackgroundThread plugin to the bus channels. + # The line below does not start the "real" failover background thread. + FailoverBackgroundThread(cherrypy.engine, turn_on_failover).subscribe() + cherrypy.engine.start() + cherrypy.engine.wait(cherrypy.engine.states.STARTED) + + success = False + config_mtime = os.path.getmtime(DEFAULT_MCS_CONF_PATH) + # if the mtime changed, we infer that a put_config was run on this node, + # and we now have a current config file. + # TODO: Research all affected cases and remove/rewrite the loop below. + # Previously it caused an endless wait while starting the + # application after an upgrade. + # Are there any cases where we need to keep retrying to sync the + # config with other nodes forever? + if not helpers.in_maintenance_state(DEFAULT_MCS_CONF_PATH): + while ( + not success + and config_mtime == os.path.getmtime(DEFAULT_MCS_CONF_PATH) + ): + try: + success = helpers.get_current_config_file() + except Exception: + logging.info( + 'Main got an exception in get_current_config_file', + exc_info=True + ) + success = False + if not success: + delay = 10 + logging.warning( + 'Failed to fetch the current config file, ' + f'retrying in {delay}s' + ) + time.sleep(delay) + + config_mtime = os.path.getmtime(DEFAULT_MCS_CONF_PATH) + helpers.wait_for_deactivation_or_put_config(config_mtime) + + dbrm_socket = DBRMSocketHandler() + # TODO: fix the DBRM message shown on node restarts. + # Use DBRM() context manager. + try: + dbrm_socket.connect() + dbrm_socket._detect_protocol() + dbrm_socket.close() + except Exception: + logging.warning( + 'Something went wrong while trying to detect the DBRM protocol.\n' + 'It seems the "controllernode" process is not started.\n' + 'This is just a notification, not a problem.\n' + 'The next detection attempt will run at the first node/cluster ' + 'status check.\n' + f'This can add an extra {SOCK_TIMEOUT} seconds of delay to the\n' + 'first attempt to get the status.', + exc_info=True + ) + else: + logging.info( + 'In maintenance state, not syncing config from other nodes.'
+ ) + + if turn_on_failover: + if not helpers.in_maintenance_state(DEFAULT_MCS_CONF_PATH): + cherrypy.engine.publish('failover', True) + else: + logging.info('In maintenance state, not starting Failover.') + + AppManager.started = True + cherrypy.engine.block() diff --git a/cmapi/cmapi_server/cmapi_logger.conf b/cmapi/cmapi_server/cmapi_logger.conf new file mode 100644 index 000000000..2bc3d383d --- /dev/null +++ b/cmapi/cmapi_server/cmapi_logger.conf @@ -0,0 +1,83 @@ +{ + "version": 1, + "filters": { + "add_ip_filter": { + "()": "cmapi_server.logging_management.AddIpFilter" + } + }, + "formatters": { + "cmapi_server": { + "format": "%(asctime)s [%(levelname)s] (%(name)s) {%(threadName)s} %(ip)s %(message)s", + "datefmt": "%d/%b/%Y %H:%M:%S" + }, + "default": { + "format": "%(asctime)s [%(levelname)s] (%(name)s) {%(threadName)s} %(message)s", + "datefmt": "%d/%b/%Y %H:%M:%S" + }, + "container_sh": { + "format" : "`%(asctime)s`: %(message)s", + "datefmt": "%a %d %b %Y %I:%M:%S %p %Z" + } + }, + "handlers": { + "cmapi_server": { + "level": "DEBUG", + "class": "logging.StreamHandler", + "filters": ["add_ip_filter"], + "formatter": "cmapi_server", + "stream": "ext://sys.stdout" + }, + "console": { + "level": "DEBUG", + "class": "logging.StreamHandler", + "formatter": "default", + "stream": "ext://sys.stdout" + }, + "file": { + "level": "DEBUG", + "class": "logging.handlers.RotatingFileHandler", + "formatter": "default", + "filename": "/var/log/mariadb/columnstore/cmapi_server.log", + "mode": "a", + "maxBytes": 1048576, + "backupCount": 10, + "encoding": "utf8" + }, + "container_sh_file": { + "level": "DEBUG", + "class": "logging.handlers.RotatingFileHandler", + "formatter": "container_sh", + "filename": "/var/log/mariadb/columnstore/container-sh.log", + "mode": "a", + "maxBytes": 1024, + "backupCount": 3, + "encoding": "utf8" + } + }, + "loggers": { + "cherrypy.access": { + "handlers": ["console", "file"], + "level": "INFO", + "propagate": false + }, + "cherrypy.error": { + "handlers": ["console", "file"], + "level": "INFO", + "propagate": false + }, + "cmapi_server": { + "handlers": ["cmapi_server", "file"], + "level": "DEBUG", + "propagate": false + }, + "container_sh": { + "handlers": ["file", "container_sh_file"], + "level": "DEBUG", + "propagate": false + }, + "": { + "handlers": ["console", "file"], + "level": "DEBUG" + } + } +} diff --git a/cmapi/cmapi_server/cmapi_server.conf b/cmapi/cmapi_server/cmapi_server.conf new file mode 100644 index 000000000..3f86c0a91 --- /dev/null +++ b/cmapi/cmapi_server/cmapi_server.conf @@ -0,0 +1,9 @@ +[global] +server.socket_host = '0.0.0.0' +server.socket_port = 8640 +server.ssl_module = 'builtin' +server.ssl_certificate = './cmapi_server/self-signed.crt' +server.ssl_private_key = './cmapi_server/self-signed.key' +engine.autoreload.on = False +log.access_file = '' +log.error_file = '' diff --git a/cmapi/cmapi_server/constants.py b/cmapi/cmapi_server/constants.py new file mode 100644 index 000000000..a1e4142b9 --- /dev/null +++ b/cmapi/cmapi_server/constants.py @@ -0,0 +1,84 @@ +"""Module contains constants values for cmapi, failover and other .py files. + +TODO: move main constant paths here and replace in files in next releases. 
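+ +The path constants below mirror the engine's compiled-in defaults +(e.g. MCSDATADIR); a build that relocates those directories would +presumably need these values adjusted to match.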
+""" +import os +from typing import NamedTuple + + +# default MARIADB ColumnStore config path +MCS_ETC_PATH = '/etc/columnstore' +DEFAULT_MCS_CONF_PATH = os.path.join(MCS_ETC_PATH, 'Columnstore.xml') + +# default Storage Manager config path +DEFAULT_SM_CONF_PATH = os.path.join(MCS_ETC_PATH, 'storagemanager.cnf') + +# MCSDATADIR (in mcs engine code) and related paths +MCS_DATA_PATH = '/var/lib/columnstore' +MCS_MODULE_FILE_PATH = os.path.join(MCS_DATA_PATH, 'local/module') +EM_PATH_SUFFIX = 'data1/systemFiles/dbrm' +MCS_EM_PATH = os.path.join(MCS_DATA_PATH, EM_PATH_SUFFIX) +MCS_BRM_CURRENT_PATH = os.path.join(MCS_EM_PATH, 'BRM_saves_current') +S3_BRM_CURRENT_PATH = os.path.join(EM_PATH_SUFFIX, 'BRM_saves_current') +# keys file for CEJ password encryption\decryption +# (CrossEngineSupport section in Columnstore.xml) +MCS_SECRETS_FILE_PATH = os.path.join(MCS_DATA_PATH, '.secrets') + +# CMAPI SERVER +CMAPI_CONFIG_FILENAME = 'cmapi_server.conf' +CMAPI_ROOT_PATH = os.path.dirname(__file__) +PROJECT_PATH = os.path.dirname(CMAPI_ROOT_PATH) +# path to VERSION file +VERSION_PATH = os.path.join(PROJECT_PATH, 'VERSION') +CMAPI_LOG_CONF_PATH = os.path.join(CMAPI_ROOT_PATH, 'cmapi_logger.conf') +# path to CMAPI default config +CMAPI_DEFAULT_CONF_PATH = os.path.join(CMAPI_ROOT_PATH, CMAPI_CONFIG_FILENAME) +# CMAPI config path +CMAPI_CONF_PATH = os.path.join(MCS_ETC_PATH, CMAPI_CONFIG_FILENAME) + +# TOTP secret key +SECRET_KEY = 'MCSIsTheBestEver' # not just a random string! (base32) + + +# network constants +LOCALHOSTS = ('localhost', '127.0.0.1', '::1') + +CMAPI_INSTALL_PATH = '/usr/share/columnstore/cmapi/' +CMAPI_PYTHON_BIN = os.path.join(CMAPI_INSTALL_PATH, "python/bin/python3") +CMAPI_PYTHON_DEPS_PATH = os.path.join(CMAPI_INSTALL_PATH, "deps") +CMAPI_PYTHON_BINARY_DEPS_PATH = os.path.join(CMAPI_PYTHON_DEPS_PATH, "bin") +CMAPI_SINGLE_NODE_XML = os.path.join( + CMAPI_INSTALL_PATH, 'cmapi_server/SingleNode.xml' +) + +# constants for dispatchers +class ProgInfo(NamedTuple): + """NamedTuple for some additional info about handling mcs processes.""" + stop_priority: int # priority for building stop sequence + service_name: str # systemd service name + subcommand: str # subcommand for process run in docker container + only_primary: bool # use this process only on primary + delay: int = 0 # delay after process start in docker container + +# mcs-loadbrm and mcs-savebrm are dependencies for workernode and resolved +# on top level of process handling +# mcs-storagemanager starts conditionally inside mcs-loadbrm, but should be +# stopped using cmapi +ALL_MCS_PROGS = { + # workernode starts on primary and non primary node with 1 or 2 added + # to subcommand (DBRM_Worker1 - on primary, DBRM_Worker2 - non primary) + 'StorageManager': ProgInfo(15, 'mcs-storagemanager', '', False, 1), + 'workernode': ProgInfo(13, 'mcs-workernode', 'DBRM_Worker{}', False, 1), + 'controllernode': ProgInfo(11, 'mcs-controllernode', 'fg', True), + 'PrimProc': ProgInfo(5, 'mcs-primproc', '', False, 1), + 'ExeMgr': ProgInfo(9, 'mcs-exemgr', '', False, 1), + 'WriteEngineServer': ProgInfo(7, 'mcs-writeengineserver', '', False, 3), + 'DMLProc': ProgInfo(3, 'mcs-dmlproc', '', False), + 'DDLProc': ProgInfo(1, 'mcs-ddlproc', '', False), +} + +# constants for docker container dispatcher +MCS_INSTALL_BIN = '/usr/bin' +IFLAG = os.path.join(MCS_ETC_PATH, 'container-initialized') +LIBJEMALLOC_DEFAULT_PATH = os.path.join(MCS_DATA_PATH, 'libjemalloc.so.2') +MCS_LOG_PATH = '/var/log/mariadb/columnstore' diff --git 
a/cmapi/cmapi_server/controllers/__init__.py b/cmapi/cmapi_server/controllers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/cmapi_server/controllers/dispatcher.py b/cmapi/cmapi_server/controllers/dispatcher.py new file mode 100644 index 000000000..76da33f2b --- /dev/null +++ b/cmapi/cmapi_server/controllers/dispatcher.py @@ -0,0 +1,262 @@ +import json + +import cherrypy + +from cmapi_server.controllers.endpoints import ( + StatusController, ConfigController, BeginController, CommitController, + RollbackController, StartController, ShutdownController, + ExtentMapController, ClusterController, ApiKeyController, + LoggingConfigController, AppController +) + +from cmapi_server.controllers.s3dataload import S3DataLoadController + +_version = '0.4.0' +dispatcher = cherrypy.dispatch.RoutesDispatcher() + + +# /_version/status (GET) +dispatcher.connect(name = 'status', + route = f'/cmapi/{_version}/node/status', + action = 'get_status', + controller = StatusController(), + conditions = {'method': ['GET']}) + + +# /_version/master (GET) +dispatcher.connect(name = 'get_primary', + route = f'/cmapi/{_version}/node/primary', + action = 'get_primary', + controller = StatusController(), + conditions = {'method': ['GET']}) + + +# /_version/new_primary (GET) +dispatcher.connect(name = 'get_new_primary', + route = f'/cmapi/{_version}/node/new_primary', + action = 'get_new_primary', + controller = StatusController(), + conditions = {'method': ['GET']}) + + +# /_version/config/ (GET) +dispatcher.connect(name = 'get_config', # what does this name is used for? + route = f'/cmapi/{_version}/node/config', + action = 'get_config', + controller = ConfigController(), + conditions = {'method': ['GET']}) + + +# /_version/config/ (PUT) +dispatcher.connect(name = 'put_config', + route = f'/cmapi/{_version}/node/config', + action = 'put_config', + controller = ConfigController(), + conditions = {'method': ['PUT']}) + + +# /_version/begin/ (PUT) +dispatcher.connect(name = 'put_begin', + route = f'/cmapi/{_version}/node/begin', + action = 'put_begin', + controller = BeginController(), + conditions = {'method': ['PUT']}) + + +# /_version/rollback/ (PUT) +dispatcher.connect(name = 'put_rollback', + route = f'/cmapi/{_version}/node/rollback', + action = 'put_rollback', + controller = RollbackController(), + conditions = {'method': ['PUT']}) + + +# /_version/commit/ (PUT) +dispatcher.connect(name = 'put_commit', + route = f'/cmapi/{_version}/node/commit', + action = 'put_commit', + controller = CommitController(), + conditions = {'method': ['PUT']}) + + +# /_version/start/ (PUT) +dispatcher.connect(name = 'start', + route = f'/cmapi/{_version}/node/start', + action = 'put_start', + controller = StartController(), + conditions = {'method': ['PUT']}) + + +# /_version/shutdown/ (PUT) +dispatcher.connect(name = 'shutdown', + route = f'/cmapi/{_version}/node/shutdown', + action = 'put_shutdown', + controller = ShutdownController(), + conditions = {'method': ['PUT']}) + + +# /_version/meta/em/ (GET) +dispatcher.connect(name = 'get_em', + route = f'/cmapi/{_version}/node/meta/em', + action = 'get_em', + controller = ExtentMapController(), + conditions = {'method': ['GET']}) + + +# /_version/meta/journal/ (GET) +dispatcher.connect(name = 'get_journal', + route = f'/cmapi/{_version}/node/meta/journal', + action = 'get_journal', + controller = ExtentMapController(), + conditions = {'method': ['GET']}) + + +# /_version/meta/vss/ (GET) +dispatcher.connect(name = 'get_vss', + route = 
f'/cmapi/{_version}/node/meta/vss', + action = 'get_vss', + controller = ExtentMapController(), + conditions = {'method': ['GET']}) + + +# /_version/meta/vbbm/ (GET) +dispatcher.connect(name = 'get_vbbm', + route = f'/cmapi/{_version}/node/meta/vbbm', + action = 'get_vbbm', + controller = ExtentMapController(), + conditions = {'method': ['GET']}) + + +# /_version/meta/footprint/ (GET) +dispatcher.connect(name = 'get_footprint', + route = f'/cmapi/{_version}/node/meta/footprint', + action = 'get_footprint', + controller = ExtentMapController(), + conditions = {'method': ['GET']}) + + +# /_version/cluster/start/ (PUT) +dispatcher.connect(name = 'cluster_start', + route = f'/cmapi/{_version}/cluster/start', + action = 'put_start', + controller = ClusterController(), + conditions = {'method': ['PUT']}) + + +# /_version/cluster/shutdown/ (PUT) +dispatcher.connect(name = 'cluster_shutdown', + route = f'/cmapi/{_version}/cluster/shutdown', + action = 'put_shutdown', + controller = ClusterController(), + conditions = {'method': ['PUT']}) + + +# /_version/cluster/mode-set/ (PUT) +dispatcher.connect(name = 'cluster_mode_set', + route = f'/cmapi/{_version}/cluster/mode-set', + action = 'put_mode_set', + controller = ClusterController(), + conditions = {'method': ['PUT']}) + + +# /_version/cluster/node/ (POST, PUT) +dispatcher.connect(name = 'cluster_add_node', + route = f'/cmapi/{_version}/cluster/node', + action = 'put_add_node', + controller = ClusterController(), + conditions = {'method': ['POST', 'PUT']}) + + +# /_version/cluster/node/ (DELETE) +dispatcher.connect(name = 'cluster_remove_node', + route = f'/cmapi/{_version}/cluster/node', + action = 'delete_remove_node', + controller = ClusterController(), + conditions = {'method': ['DELETE']}) + + +# /_version/cluster/status/ (GET) +dispatcher.connect(name = 'cluster_status', + route = f'/cmapi/{_version}/cluster/status', + action = 'get_status', + controller = ClusterController(), + conditions = {'method': ['GET']}) + + +# /_version/node/apikey-set/ (PUT) +dispatcher.connect( + name = 'node_set_api_key', + route = f'/cmapi/{_version}/node/apikey-set', + action = 'set_api_key', + controller = ApiKeyController(), + conditions = {'method': ['PUT']} +) + + +# /_version/cluster/apikey-set/ (PUT) +dispatcher.connect( + name = 'cluster_set_api_key', + route = f'/cmapi/{_version}/cluster/apikey-set', + action = 'set_api_key', + controller = ClusterController(), + conditions = {'method': ['PUT']} +) + + +# /_version/cluster/node/ (POST, PUT) +dispatcher.connect(name = 'cluster_load_s3data', + route = f'/cmapi/{_version}/cluster/load_s3data', + action = 'load_s3data', + controller = S3DataLoadController(), + conditions = {'method': ['POST', 'PUT']}) + + +# /_version/node/log-config/ (PUT) +dispatcher.connect( + name = 'node_set_log_level', + route = f'/cmapi/{_version}/node/log-level', + action = 'set_log_level', + controller = LoggingConfigController(), + conditions = {'method': ['PUT']} +) + + +# /_version/cluster/log-config'/ (PUT) +dispatcher.connect( + name = 'cluster_set_log_level', + route = f'/cmapi/{_version}/cluster/log-level', + action = 'set_log_level', + controller = ClusterController(), + conditions = {'method': ['PUT']} +) + + +# /ready (GET) +dispatcher.connect( + name = 'app_ready', + route = '/cmapi/ready', + action = 'ready', + controller = AppController(), + conditions = {'method': ['GET']} +) + + +def jsonify_error(status, message, traceback, version): \ + # pylint: disable=unused-argument + """JSONify all CherryPy error responses 
(created by raising the + cherrypy.HTTPError exception) + """ + + cherrypy.response.headers['Content-Type'] = 'application/json' + response_body = json.dumps( + { + 'error': { + 'http_status': status, + 'message': message, + } + } + ) + + cherrypy.response.status = status + + return response_body diff --git a/cmapi/cmapi_server/controllers/endpoints.py b/cmapi/cmapi_server/controllers/endpoints.py new file mode 100644 index 000000000..0f190ba9f --- /dev/null +++ b/cmapi/cmapi_server/controllers/endpoints.py @@ -0,0 +1,1139 @@ +import logging + +import socket +import subprocess +import time + +from copy import deepcopy +from datetime import datetime +from pathlib import Path + +import cherrypy +import pyotp +import requests + +from cmapi_server.exceptions import CMAPIBasicError +from cmapi_server.constants import ( + DEFAULT_SM_CONF_PATH, EM_PATH_SUFFIX, DEFAULT_MCS_CONF_PATH, MCS_EM_PATH, + MCS_BRM_CURRENT_PATH, S3_BRM_CURRENT_PATH, CMAPI_CONF_PATH, SECRET_KEY, +) +from cmapi_server.controllers.error import APIError +from cmapi_server.handlers.cej import CEJError +from cmapi_server.handlers.cluster import ClusterHandler +from cmapi_server.helpers import ( + cmapi_config_check, get_config_parser, get_current_key, get_dbroots, + system_ready, save_cmapi_conf_file, dequote, in_maintenance_state, +) +from cmapi_server.logging_management import change_loggers_level +from cmapi_server.managers.process import MCSProcessManager +from cmapi_server.managers.application import AppManager +from cmapi_server.node_manipulation import is_master, switch_node_maintenance +from mcs_node_control.models.dbrm import set_cluster_mode +from mcs_node_control.models.node_config import NodeConfig +from mcs_node_control.models.node_status import NodeStatus + + +# Bug in pylint https://github.com/PyCQA/pylint/issues/4584 +requests.packages.urllib3.disable_warnings() # pylint: disable=no-member + + +module_logger = logging.getLogger('cmapi_server') + + +def log_begin(logger, func_name): + logger.debug(f"{func_name} starts") + + +def raise_422_error( + logger, func_name: str = '', err_msg: str = '', exc_info: bool = True +) -> None: + """Log an error and raise a 422 API error. + + :param logger: logger to use + :type logger: logging.Logger + :param func_name: name of the calling function, defaults to '' + :type func_name: str, optional + :param err_msg: error message, defaults to '' + :type err_msg: str, optional + :param exc_info: whether to write the traceback to the logs. + :type exc_info: bool + :raises APIError: every time, with a custom error message + """ + logger.error(f'{func_name} {err_msg}', exc_info=exc_info) + raise APIError(422, err_msg) + + +@cherrypy.tools.register('before_handler', priority=80) +def validate_api_key(): + """Validate API key. + + If there is no config file, create a new one by copying the default. + If there is no API key, set the API key from the request headers. + """ + # TODO: simplify validation, using preload and maybe a class controller + req = cherrypy.request + if 'X-Api-Key' not in req.headers: + error_message = 'No API key provided.' + module_logger.warning(error_message) + raise cherrypy.HTTPError(401, error_message) + + # we assume the api_key is the same as the quoted api_key + request_api_key = dequote(req.headers.get('X-Api-Key', '')) + if not request_api_key: + error_message = 'Empty API key.'
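+        # A header that is present but empty is rejected exactly like a +        # missing one; dequote() above strips any surrounding quotes first.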
+        module_logger.warning(error_message) + raise cherrypy.HTTPError(401, error_message) + + # Because of the architecture of the cherrypy config parser, values are + # turned into Python objects, which causes some non-standard behaviour: + # - string config values are automatically dequoted + # - config objects always give a dict object + # - strings containing only integers are always converted to int + inmemory_api_key = str( + req.app.config.get('Authentication', {}).get('x-api-key', '') + ) + if not inmemory_api_key: + module_logger.warning( + 'No API key in the configuration. Adding it into the config.' + ) + req.app.config.update( + {'Authentication': {'x-api-key': request_api_key}} + ) + # update the cmapi server config file + config_filepath = req.app.config['config']['path'] + cmapi_config_check(config_filepath) + cfg_parser = get_config_parser(config_filepath) + + if not cfg_parser.has_section('Authentication'): + cfg_parser.add_section('Authentication') + # TODO: Do not store the API key in the cherrypy config. + # It adds overhead for the custom ini file and its handling. + # For the cherrypy config, file values have to be Python objects, + # so strings have to be quoted. + cfg_parser['Authentication']['x-api-key'] = f"'{request_api_key}'" + save_cmapi_conf_file(cfg_parser, config_filepath) + + return + + if inmemory_api_key != request_api_key: + module_logger.warning(f'Incorrect API key [ {request_api_key} ]') + raise cherrypy.HTTPError(401, 'Incorrect API key') + + +@cherrypy.tools.register("before_handler", priority=81) +def active_operation(): + app = cherrypy.request.app + txn_section = app.config.get('txn', None) + txn_manager_address = None + if txn_section is not None: + txn_manager_address = app.config['txn'].get('manager_address', None) + if txn_manager_address is not None and len(txn_manager_address) > 0: + raise APIError(422, "There is an active operation.") + + +class TimingTool(cherrypy.Tool): + """Tool to measure incoming request processing time.""" + def __init__(self): + # If before_handler is used, we get a 500 on each error in the request + # body (e.g. wrong or no content in PUT requests): + # - wrong request body + # - the handler never runs + # - no before_handler event + # - cherrypy.request._time is never set + # - the before_finalize event errors reading cherrypy.request._time + # - a 500 is returned instead of a 415 error + super().__init__('before_request_body', self.start_timer, priority=90) + + def _setup(self): + """Method called by CherryPy when the tool is applied.""" + super()._setup() + cherrypy.request.hooks.attach( + 'before_finalize', self.end_timer, priority=5 + ) + + def start_timer(self): + """Save the time and log information about the incoming request.""" + cherrypy.request._time = time.time() + logger = logging.getLogger('access_logger') + request = cherrypy.request + remote = request.remote.name or request.remote.ip + logger.info( + f'Got incoming {request.method} request from "{remote}" ' + f'to "{request.path_info}". uid: {request.unique_id}' + ) + + def end_timer(self): + """Calculate the request processing duration and leave a log message.""" + duration = time.time() - cherrypy.request._time + logger = logging.getLogger('access_logger') + request = cherrypy.request + remote = request.remote.name or request.remote.ip + logger.info( + f'Finished processing incoming {request.method} ' + f'request from "{remote}" to "{request.path_info}" in ' + f'{duration:.4f} seconds.
uid: {request.unique_id}' + ) + + +cherrypy.tools.timeit = TimingTool() + + +class StatusController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def get_status(self): + """ + Handler for /status (GET) + """ + func_name = 'get_status' + log_begin(module_logger, func_name) + node_status = NodeStatus() + hostname = ( + cherrypy.request.headers.get('Host', '').split(':')[0] or + socket.gethostname() + ) + #TODO: add localhost condition check and another way to get FQDN + node_fqdn = socket.gethostbyaddr(hostname)[0] + + status_response = { + 'timestamp': str(datetime.now()), + 'uptime': node_status.get_host_uptime(), + 'dbrm_mode': node_status.get_dbrm_status(), + 'cluster_mode': node_status.get_cluster_mode(), + 'dbroots': sorted(get_dbroots(node_fqdn)), + 'module_id': int(node_status.get_module_id()), + 'services': MCSProcessManager.get_running_mcs_procs(), + } + + module_logger.debug(f'{func_name} returns {str(status_response)}') + return status_response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_out() + def get_primary(self): + """ + Handler for /primary (GET) + + ..WARNING: do not add api key validation here, this may cause + mcs-loadbrm.py (in MCS engine repo) failure + """ + func_name = 'get_primary' + log_begin(module_logger, func_name) + # TODO: convert this value to json bool (remove str() invoke here) + # to do so loadbrm and save brm have to be fixed + # + check other places + get_master_response = {'is_primary': str(NodeConfig().is_primary_node())} + module_logger.debug(f'{func_name} returns {str(get_master_response)}') + + return get_master_response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_out() + def get_new_primary(self): + """ + Handler for /new_primary (GET) + """ + func_name = 'get_new_primary' + log_begin(module_logger, func_name) + try: + get_master_response = {'is_primary': is_master()} + except CEJError as cej_error: + raise_422_error( + module_logger, func_name, cej_error.message + ) + module_logger.debug(f'{func_name} returns {str(get_master_response)}') + + return get_master_response + + +class ConfigController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def get_config(self): + """ + Handler for /config (GET) + """ + func_name = 'get_config' + log_begin(module_logger, func_name) + + mcs_config = NodeConfig() + config_response = {'timestamp': str(datetime.now()), + 'config': mcs_config.get_current_config(), + 'sm_config': mcs_config.get_current_sm_config(), + } + + if (module_logger.isEnabledFor(logging.DEBUG)): + dbg_config_response = deepcopy(config_response) + dbg_config_response.pop('config') + dbg_config_response['config'] = 'config was removed to reduce logs.' + dbg_config_response['sm_config'] = 'config was removed to reduce logs.' + module_logger.debug( + f'{func_name} returns {str(dbg_config_response)}' + ) + + return config_response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_config(self): + """ + Handler for /config (PUT) + """ + + func_name = 'put_config' + log_begin(module_logger, func_name) + + app = cherrypy.request.app + txn_section = app.config.get('txn', None) + + if txn_section is None: + raise_422_error( + module_logger, func_name, + 'PUT /config called outside of an operation.' 
+            ) + + req = cherrypy.request + use_sudo = get_use_sudo(req.app.config) + request_body = cherrypy.request.json + request_revision = request_body.get('revision', None) + request_manager = request_body.get('manager', None) + request_timeout = request_body.get('timeout', None) + + # TODO: remove is_test + # is_test = True means this should not save + # the config file or apply the changes + is_test = request_body.get('test', False) + + mandatory = [request_revision, request_manager, request_timeout] + if None in mandatory: + raise_422_error( + module_logger, func_name, 'Mandatory attribute is missing.') + + request_mode = request_body.get('cluster_mode', None) + request_config = request_body.get('config', None) + mcs_config_filename = request_body.get( + 'mcs_config_filename', DEFAULT_MCS_CONF_PATH + ) + sm_config_filename = request_body.get( + 'sm_config_filename', DEFAULT_SM_CONF_PATH + ) + + if request_mode is None and request_config is None: + raise_422_error( + module_logger, func_name, 'Mandatory attribute is missing.' + ) + + request_headers = cherrypy.request.headers + request_manager_address = request_headers.get('Remote-Addr', None) + if request_manager_address is None: + raise_422_error( + module_logger, func_name, + 'Cannot get Cluster Manager IP address.' + ) + txn_manager_address = app.config['txn'].get('manager_address', None) + if txn_manager_address is None or len(txn_manager_address) == 0: + raise_422_error( + module_logger, func_name, + 'PUT /config called outside of an operation.' + ) + txn_manager_address = dequote(txn_manager_address).lower() + request_manager_address = dequote(request_manager_address).lower() + + if request_manager_address in ['127.0.0.1', 'localhost', '::1']: + request_manager_address = socket.gethostbyname( + socket.gethostname() + ) + request_response = {'timestamp': str(datetime.now())} + + node_config = NodeConfig() + xml_config = request_body.get('config', None) + sm_config = request_body.get('sm_config', None) + if is_test: + return request_response + if request_mode is not None: + current_mode = set_cluster_mode( + request_mode, config_filename=mcs_config_filename + ) + if current_mode == request_mode: + # Normal exit + module_logger.debug( + f'{func_name} returns {str(request_response)}' + ) + return request_response + else: + raise_422_error( + module_logger, func_name, + ( + f'Error occurred setting cluster to "{request_mode}" ' + f'mode, got "{current_mode}"' + ) + ) + elif xml_config is not None: + node_config.apply_config( + config_filename=mcs_config_filename, + xml_string=xml_config, + sm_config_filename=sm_config_filename, + sm_config_string=sm_config + ) + # TODO: change stop/start to restart option. + try: + MCSProcessManager.stop_node( + is_primary=node_config.is_primary_node(), + use_sudo=use_sudo, + timeout=request_timeout + ) + except CMAPIBasicError as err: + raise_422_error( + module_logger, func_name, + f'Error while stopping node. Details: {err.message}.', + exc_info=False + ) + + # if not in the list of active nodes, + # then do not start the services + new_root = node_config.get_current_config_root( + mcs_config_filename + ) + if in_maintenance_state(): + module_logger.info( + 'Maintenance state is active in the new config. ' + 'MCS processes should not be started.'
+ ) + cherrypy.engine.publish('failover', False) + # skip all other operations below + return request_response + else: + cherrypy.engine.publish('failover', True) + if node_config.in_active_nodes(new_root): + try: + MCSProcessManager.start_node( + is_primary=node_config.is_primary_node(), + use_sudo=use_sudo, + ) + except CMAPIBasicError as err: + raise_422_error( + module_logger, func_name, + ( + 'Error while starting node. ' + f'Details: {err.message}.' + ), + exc_info=False + ) + else: + module_logger.info( + 'This node is not in the current ActiveNodes section. ' + 'Not starting Columnstore processes.' + ) + + attempts = 0 + # TODO: FIX IT. If we got a (False, False) result, e.g. in the case + # when no special CEJ user is set, this check loop + # is useless and does nothing. + try: + ready, retry = system_ready(mcs_config_filename) + except CEJError as cej_error: + raise_422_error( + module_logger, func_name, cej_error.message + ) + + while not ready: + if retry: + attempts += 1 + if attempts >= 10: + module_logger.debug( + 'Timed out waiting for node to be ready.' + ) + break + time.sleep(1) + else: + break + try: + ready, retry = system_ready(mcs_config_filename) + except CEJError as cej_error: + raise_422_error( + module_logger, func_name, cej_error.message + ) + else: + module_logger.debug('Node is ready to accept queries.') + + app.config['txn']['config_changed'] = True + + # We might want to raise an error here + return request_response + + # Unexpected exit + raise_422_error(module_logger, func_name, 'Unknown error.') + + +class BeginController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + @cherrypy.tools.active_operation() # pylint: disable=no-member + def put_begin(self): + """ + Handler for /begin (PUT) + """ + func_name = 'put_begin' + log_begin(module_logger, func_name) + + app = cherrypy.request.app + request_body = cherrypy.request.json + txn_id = request_body.get('id', None) + txn_timeout = request_body.get('timeout', None) + request_headers = cherrypy.request.headers + txn_manager_address = request_headers.get('Remote-Addr', None) + module_logger.debug(f'{func_name} JSON body {str(request_body)}') + + if txn_manager_address is None: + raise_422_error(module_logger, func_name, "Cannot get Cluster Manager IP address.") + txn_manager_address = dequote(txn_manager_address).lower() + if txn_manager_address in ['127.0.0.1', 'localhost', '::1']: + txn_manager_address = socket.gethostbyname(socket.gethostname()) + if txn_id is None or txn_timeout is None or txn_manager_address is None: + raise_422_error(module_logger, func_name, "id or timeout is not set.") + + app.config.update({ + 'txn': { + 'id': txn_id, + 'timeout': int(datetime.now().timestamp()) + txn_timeout, + 'manager_address': txn_manager_address, + 'config_changed': False, + }, + }) + + begin_response = {'timestamp': str(datetime.now())} + + module_logger.debug(f'{func_name} returns {str(begin_response)}') + return begin_response + + +class CommitController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_commit(self): + """ + Handler for /commit (PUT) + """ + func_name = 'put_commit' + log_begin(module_logger, func_name) + + commit_response = {'timestamp': str(datetime.now())} + app = cherrypy.request.app + txn_section = app.config.get('txn', None) + + if txn_section is None: + raise_422_error(module_logger,
func_name, "No operation to commit.") + + request_headers = cherrypy.request.headers + request_manager_address = request_headers.get('Remote-Addr', None) + if request_manager_address is None: + raise_422_error(module_logger, func_name, "Cannot get Cluster\ + Manager IP address.") + txn_manager_address = app.config['txn'].get('manager_address', None) + if txn_manager_address is None or len(txn_manager_address) == 0: + raise_422_error(module_logger, func_name, "No operation to commit.") + txn_manager_address = dequote(txn_manager_address).lower() + request_manager_address = dequote(request_manager_address).lower() + if request_manager_address in ['127.0.0.1', 'localhost', '::1']: + request_manager_address = socket.gethostbyname(socket.gethostname()) + # txn is active + app.config['txn']['id'] = 0 + app.config['txn']['timeout'] = 0 + app.config['txn']['manager_address'] = '' + app.config['txn']['config_changed'] = False + + module_logger.debug(f'{func_name} returns {str(commit_response)}') + + return commit_response + + +class RollbackController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_rollback(self): + """ + Handler for /rollback (PUT) + """ + rollback_response = {'timestamp': str(datetime.now())} + app = cherrypy.request.app + txn_section = app.config.get('txn', None) + + if txn_section is None: + raise APIError(422, 'No operation to rollback.') + + request_headers = cherrypy.request.headers + request_manager_address = request_headers.get('Remote-Addr', None) + if request_manager_address is None: + raise APIError(422, 'Cannot get Cluster Manager IP address.') + txn_manager_address = app.config['txn'].get('manager_address', None) + if txn_manager_address is None or len(txn_manager_address) == 0: + raise APIError(422, 'No operation to rollback.') + txn_manager_address = dequote(txn_manager_address).lower() + request_manager_address = dequote(request_manager_address).lower() + if request_manager_address in ['127.0.0.1', 'localhost', '::1']: + request_manager_address = socket.gethostbyname(socket.gethostname()) + + #TODO: add restart processes flag? + # txn is active + txn_config_changed = app.config['txn'].get('config_changed', None) + if txn_config_changed is True: + node_config = NodeConfig() + node_config.rollback_config() + # TODO: do we need to restart node here? + node_config.apply_config( + xml_string=node_config.get_current_config() + ) + app.config['txn']['id'] = 0 + app.config['txn']['timeout'] = 0 + app.config['txn']['manager_address'] = '' + app.config['txn']['config_changed'] = False + + return rollback_response + + +def get_use_sudo(app_config): + privileges_section = app_config.get('Privileges', None) + if privileges_section is not None: + use_sudo = privileges_section.get('use_sudo', False) + else: + use_sudo = False + return use_sudo + + +class StartController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_start(self): + func_name = 'put_start' + log_begin(module_logger, func_name) + + req = cherrypy.request + use_sudo = get_use_sudo(req.app.config) + node_config = NodeConfig() + try: + MCSProcessManager.start_node( + is_primary=node_config.is_primary_node(), + use_sudo=use_sudo + ) + except CMAPIBasicError as err: + raise_422_error( + module_logger, func_name, + f'Error while starting node processes. 
Details: {err.message}', + exc_info=False + ) + # TODO: should we change config revision here? Seem to be no. + # Do we need to change flag in a one node maintenance? + switch_node_maintenance(False) + cherrypy.engine.publish('failover', True) + start_response = {'timestamp': str(datetime.now())} + module_logger.debug(f'{func_name} returns {str(start_response)}') + return start_response + + +class ShutdownController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_shutdown(self): + func_name = 'put_shutdown' + log_begin(module_logger, func_name) + + req = cherrypy.request + use_sudo = get_use_sudo(req.app.config) + request_body = cherrypy.request.json + timeout = request_body.get('timeout', 0) + node_config = NodeConfig() + try: + MCSProcessManager.stop_node( + is_primary=node_config.is_primary_node(), + use_sudo=use_sudo, + timeout=timeout + ) + except CMAPIBasicError as err: + raise_422_error( + module_logger, func_name, + f'Error while stopping node processes. Details: {err.message}', + exc_info=False + ) + # TODO: should we change config revision here? Seem to be no. + # Do we need to change flag in a one node maintenance? + switch_node_maintenance(True) + cherrypy.engine.publish('failover', False) + shutdown_response = {'timestamp': str(datetime.now())} + module_logger.debug(f'{func_name} returns {str(shutdown_response)}') + return shutdown_response + + +class ExtentMapController: + def get_brm_bytes(self, element:str): + func_name = 'get_brm_bytes' + log_begin(module_logger, func_name) + node_config = NodeConfig() + result = b'' + # there must be sm available + if node_config.s3_enabled(): + success = False + retry_count = 0 + while not success and retry_count < 10: + module_logger.debug(f'{func_name} returns {element} from S3.') + + # TODO: Remove conditional once container dispatcher + # uses non-root by default + if MCSProcessManager.dispatcher_name == 'systemd': + args = [ + 'su', '-s', '/bin/sh', '-c', + f'smcat {S3_BRM_CURRENT_PATH}', 'mysql' + ] + else: + args = ['smcat', S3_BRM_CURRENT_PATH] + + ret = subprocess.run(args, stdout=subprocess.PIPE) + if ret.returncode != 0: + module_logger.warning(f"{func_name} got error code {ret.returncode} from smcat, retrying") + time.sleep(1) + retry_count += 1 + continue + elem_current_suffix = ret.stdout.decode("utf-8").rstrip() + elem_current_filename = f'{EM_PATH_SUFFIX}/{elem_current_suffix}_{element}' + + # TODO: Remove conditional once container dispatcher + # uses non-root by default + if MCSProcessManager.dispatcher_name == 'systemd': + args = [ + 'su', '-s', '/bin/sh', '-c', + f'smcat {elem_current_filename}', 'mysql' + ] + else: + args = ['smcat', elem_current_filename] + + ret = subprocess.run(args, stdout=subprocess.PIPE) + if ret.returncode != 0: + module_logger.warning(f"{func_name} got error code {ret.returncode} from smcat, retrying") + time.sleep(1) + retry_count += 1 + continue + result = ret.stdout + success = True + else: + module_logger.debug( + f'{func_name} returns {element} from local storage.' 
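# get_brm_bytes() backs the four BRM endpoints defined below (get_em,
# get_journal, get_vss, get_vbbm); a usage sketch, dump path illustrative:
#
#     em_bytes = ExtentMapController().get_brm_bytes('em')
#     with open('/tmp/em.dump', 'wb') as f:
#         f.write(em_bytes)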
+ ) + elem_current_name = Path(MCS_BRM_CURRENT_PATH) + elem_current_filename = elem_current_name.read_text().rstrip() + elem_current_file = Path( + f'{MCS_EM_PATH}/{elem_current_filename}_{element}' + ) + result = elem_current_file.read_bytes() + + module_logger.debug(f'{func_name} returns.') + return result + + @cherrypy.tools.timeit() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def get_em(self): + return self.get_brm_bytes('em') + + @cherrypy.tools.timeit() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def get_journal(self): + return self.get_brm_bytes('journal') + + @cherrypy.tools.timeit() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def get_vss(self): + return self.get_brm_bytes('vss') + + @cherrypy.tools.timeit() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def get_vbbm(self): + return self.get_brm_bytes('vbbm') + + @cherrypy.tools.timeit() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + @cherrypy.tools.json_out() + def get_footprint(self): + # Dummy footprint + result = {'em': '00f62e18637e1708b080b076ea6aa9b0', + 'journal': '00f62e18637e1708b080b076ea6aa9b0', + 'vss': '00f62e18637e1708b080b076ea6aa9b0', + 'vbbm': '00f62e18637e1708b080b076ea6aa9b0', + } + return result + + +class ClusterController: + _cp_config = { + "request.methods_with_bodies": ("POST", "PUT", "PATCH", "DELETE") + } + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_start(self): + func_name = 'put_start' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = request.json + config = request_body.get('config', DEFAULT_MCS_CONF_PATH) + + try: + response = ClusterHandler.start(config) + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_shutdown(self): + func_name = 'put_shutdown' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = request.json + config = request_body.get('config', DEFAULT_MCS_CONF_PATH) + + try: + response = ClusterHandler.shutdown(config) + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_mode_set(self): + func_name = 'put_mode_set' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = request.json + mode = request_body.get('mode', 'readonly') + config = request_body.get('config', DEFAULT_MCS_CONF_PATH) + + try: + response = ClusterHandler.set_mode(mode, config=config) + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_add_node(self): + func_name = 'add_node' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = request.json + node = 
request_body.get('node', None) + config = request_body.get('config', DEFAULT_MCS_CONF_PATH) + + if node is None: + raise_422_error(module_logger, func_name, 'missing node argument') + + try: + response = ClusterHandler.add_node(node, config) + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def delete_remove_node(self): + func_name = 'remove_node' + log_begin(module_logger, func_name) + request = cherrypy.request + request_body = request.json + node = request_body.get('node', None) + config = request_body.get('config', DEFAULT_MCS_CONF_PATH) + response = {'timestamp': str(datetime.now())} + + #TODO: add arguments verification decorator + if node is None: + raise_422_error(module_logger, func_name, 'missing node argument') + + try: + response = ClusterHandler.remove_node(node, config) + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_scan_for_attached_dbroots(self): + '''TODO: Based on doc, endpoint not exposed''' + func_name = 'put_scan_for_attached_dbroots' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = cherrypy.request.json + node = request_body.get('node', None) + response = {'timestamp': str(datetime.now())} + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_failover_master(self): + '''TODO: Based on doc, endpoint not exposed''' + func_name = 'put_failover_master' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = cherrypy.request.json + source = request_body.get('from', None) + dest = request_body.get('to', None) + response = {'timestamp': str(datetime.now())} + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_move_dbroot(self): + '''TODO: Based on doc, endpoint not exposed''' + func_name = 'put_move_dbroot' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = cherrypy.request.json + source = request_body.get('from', None) + dest = request_body.get('to', None) + response = {'timestamp': str(datetime.now())} + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def put_decommission_node(self): + '''TODO: Based on doc, endpoint not exposed''' + func_name = 'put_decommission_node' + log_begin(module_logger, func_name) + + request = cherrypy.request + request_body = cherrypy.request.json + node = request_body.get('node', None) + response = {'timestamp': str(datetime.now())} + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() 
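# A typical call, assuming this handler is mounted under the same
# /cmapi/<version> prefix as the node endpoints (key illustrative):
#
#     curl -k -s -H 'x-api-key: somekey123' \
#          https://mcs1:8640/cmapi/<version>/cluster/status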
+ @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def get_status(self): + func_name = 'get_status' + log_begin(module_logger, func_name) + + try: + response = ClusterHandler.status() + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + def set_api_key(self): + """Handler for /cluster/apikey-set (PUT) + + Only for cli tool usage. + """ + func_name = 'cluster_set_api_key' + module_logger.debug('Start setting API key to all nodes in cluster.') + request = cherrypy.request + request_body = request.json + new_api_key = dequote(request_body.get('api_key', '')) + totp_key = request_body.get('verification_key', '') + + if not totp_key or not new_api_key: + # not show which arguments in error message because endpoint for + # internal usage only + raise_422_error( + module_logger, func_name, 'Missing required arguments.' + ) + + totp = pyotp.TOTP(SECRET_KEY) + if not totp.verify(totp_key): + raise_422_error( + module_logger, func_name, 'Wrong verification key.' + ) + + try: + response = ClusterHandler.set_api_key(new_api_key, totp_key) + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + def set_log_level(self): + """Handler for /cluster/log-level (PUT) + + Only for develop purposes. + """ + func_name = 'cluster_set_log_level' + module_logger.debug( + 'Start setting new log level to all nodes in cluster.' + ) + request = cherrypy.request + request_body = request.json + new_level = request_body.get('level', None) + if not new_level: + raise_422_error( + module_logger, func_name, 'Missing required level argument.' + ) + module_logger.info(f'Start setting new logging level "{new_level}".') + + try: + response = ClusterHandler.set_log_level(new_level) + except CMAPIBasicError as err: + raise_422_error(module_logger, func_name, err.message) + + module_logger.debug(f'{func_name} returns {str(response)}') + return response + + +class ApiKeyController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + def set_api_key(self): + """Handler for /node/apikey-set (PUT) + + Only for cli tool usage. + """ + func_name = 'node_set_api_key' + module_logger.debug('Start setting new node API key.') + request = cherrypy.request + request_body = request.json + new_api_key = dequote(request_body.get('api_key', '')) + totp_key = request_body.get('verification_key', '') + + if not totp_key or not new_api_key: + # not show which arguments in error message because endpoint for + # internal usage only + raise_422_error( + module_logger, func_name, 'Missing required arguments.' + ) + + totp = pyotp.TOTP(SECRET_KEY) + if not totp.verify(totp_key): + raise_422_error( + module_logger, func_name, 'Wrong verification key.' + ) + + config_filepath = request.app.config['config']['path'] + cmapi_config_check(config_filepath) + cfg_parser = get_config_parser(config_filepath) + config_api_key = get_current_key(cfg_parser) + if config_api_key != new_api_key: + if not cfg_parser.has_section('Authentication'): + cfg_parser.add_section('Authentication') + # TODO: Do not store api key in cherrypy config. 
+ # It causes some overhead on custom ini file and handling it. + # For cherrypy config file values have to be python objects. + # So string have to be quoted. + cfg_parser['Authentication']['x-api-key'] = f"'{new_api_key}'" + save_cmapi_conf_file(cfg_parser, config_filepath) + else: + module_logger.info( + 'API key in config file is the same with new one.' + ) + + # anyway update inmemory api key + request.app.config.update( + {'Authentication': {'x-api-key': new_api_key}} + ) + + module_logger.info('API key successfully updated.') + return {'timestamp': str(datetime.now())} + + +class LoggingConfigController: + @cherrypy.tools.timeit() + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + def set_log_level(self): + """Handler for /node/log-level (PUT) + + Only for develop purposes. + """ + func_name = 'node_put_log_level' + request = cherrypy.request + request_body = request.json + new_level = request_body.get('level', None) + if not new_level: + raise_422_error( + module_logger, func_name, 'Missing required level argument.' + ) + module_logger.info(f'Start setting new logging level "{new_level}".') + try: + change_loggers_level(new_level) + except ValueError as exc: + raise_422_error( + module_logger, func_name, str(exc) + ) + except Exception: + raise_422_error( + module_logger, func_name, 'Unknown error' + ) + module_logger.debug( + f'Finished setting new logging level "{new_level}".' + ) + return {'new_level': new_level} + + +class AppController(): + + @cherrypy.tools.json_out() + def ready(self): + if AppManager.started: + return {'started': True} + else: + raise APIError(503, 'CMAPI not ready to handle requests.') diff --git a/cmapi/cmapi_server/controllers/error.py b/cmapi/cmapi_server/controllers/error.py new file mode 100644 index 000000000..eff784c50 --- /dev/null +++ b/cmapi/cmapi_server/controllers/error.py @@ -0,0 +1,12 @@ +import json +import cherrypy as cp + +class APIError(cp.HTTPError): + def __init__(self, status: int = 500, message: str = ''): + super().__init__(status=status) + self._error_message = message + + def set_response(self): + super().set_response() + response = cp.serving.response + response.body = json.dumps({'error': self._error_message}).encode() diff --git a/cmapi/cmapi_server/controllers/s3dataload.py b/cmapi/cmapi_server/controllers/s3dataload.py new file mode 100644 index 000000000..1c57ab575 --- /dev/null +++ b/cmapi/cmapi_server/controllers/s3dataload.py @@ -0,0 +1,335 @@ +import logging +import os +import re +import selectors +import tempfile +import uuid +from subprocess import PIPE, Popen, run, CalledProcessError + +import cherrypy +import furl +from cmapi_server.constants import ( + CMAPI_PYTHON_BIN, CMAPI_PYTHON_BINARY_DEPS_PATH, CMAPI_PYTHON_DEPS_PATH +) + +from cmapi_server.controllers.endpoints import raise_422_error + + +module_logger = logging.getLogger('cmapi_server') + + +def response_error(text): + raise_422_error(module_logger, 'load_s3data', text) + + +class S3DataLoadController: + @cherrypy.tools.json_in() + @cherrypy.tools.json_out() + @cherrypy.tools.validate_api_key() # pylint: disable=no-member + def load_s3data(self): + """ + Handler for /cluster/load_s3data (POST, PUT) + Invokes cpimport with passed params + This is internal columnstore engine handler + Not targeted for manual usage + + Waits for json dictionary params in request + bucket - S3 bucket with table data + table - table name to load data into + filename - name of file in S3 with table data + key - S3 secret key + secret - S3 secret + region - S3 region + 
database - db name to load data into + """ + + def checkShellParamsAreOK(param, paramname): + """Check shell params for dangerous symbols. + + As this params will be passed to shell, we should check, + there is no shell injection + AWS Access Key ID is 20 alpha-numeric characters + like022QF06E7MXBSH9DHM02 + AWS Secret Access Key is 40 alpha-numeric-slash-plus characters + like kWcrlUX5JEDGM/LtmEENI/aVmYvHNif5zB+d9+ct + AWS buckets names are alpha-numeric-dot-underscore + like log-delivery-march-2020.com + AWS regions names, table names, file names are also not allowed + for dangerous symbols so just raise error for injection dangerous + symbols in params. + """ + dangerous_symbols = ' &|;\n\r`$' + for symbol in dangerous_symbols: + if symbol in param: + response_error( + f'S3 configuration parameters wrong: {paramname}' + f'cannot contain "{symbol}"' + ) + + def getKey(keyname, request_body, skip_check=False, required=True): + value = request_body.get(keyname, None) + + if not value and required: + response_error( + f'Some S3 configuration parameters missing: {keyname} ' + 'not provided' + ) + + if not skip_check: + checkShellParamsAreOK(value, keyname) + + return value + + def prepare_aws(bucket, filename, secret, key, region): + """Prepare aws_cli popen object. + + Invoke aws_cli download, and return proc for further + use with cpimport. + + :param bucket: bucket name + :type bucket: str + :param filename: filename in bucket + :type filename: str + :param secret: aws secret + :type secret: str + :param key: aws key + :type key: str + :param region: aws region + :type region: str + :return: popen aws_cli object + :rtype: subprocess.Popen + """ + my_env = os.environ.copy() + my_env['AWS_ACCESS_KEY_ID'] = key + my_env['AWS_SECRET_ACCESS_KEY'] = secret + my_env['PYTHONPATH'] = CMAPI_PYTHON_DEPS_PATH + + aws_cli_binary = os.path.join(CMAPI_PYTHON_BINARY_DEPS_PATH, 'aws') + s3_url = furl.furl(bucket).add(path=filename).url + aws_command_line = [ + CMAPI_PYTHON_BIN, aws_cli_binary, + "s3", "cp", "--source-region", region, s3_url, "-" + ] + module_logger.debug( + f'AWS commandline: {" ".join(aws_command_line)}') + try: + aws_proc = Popen( + aws_command_line, env=my_env, stdout=PIPE, + stderr=PIPE, shell=False, encoding='utf-8' + ) + except CalledProcessError as exc: + response_error(exc.stderr.split('\n')[0]) + + return aws_proc + + def prepare_google_storage( + bucket, filename, secret, key, temporary_config + ): + """Prepare gsutil popen object. + + Invoke gsutil download, and return proc for further use + with cpimport. 
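Roughly the shell equivalent of the pipeline built by the caller
(a sketch; the real code uses the bundled gsutil binary and a
temporary BOTO config):

    gsutil cat gs://<bucket>/<filename> | cpimport <db> <table> -s <sep>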
+ + :param bucket: bucket name + :type bucket: str + :param filename: filename in bucket + :type filename: str + :param secret: gsutil secret + :type secret: str + :param key: gsutil key + :type key: str + :param temporary_config: temp config for gsutil + :type temporary_config: str + :return: popen gsutil object + :rtype: subprocess.Popen + """ + project_id = 'project_id' + gs_cli_binary = os.path.join( + CMAPI_PYTHON_BINARY_DEPS_PATH, 'gsutil' + ) + + commandline = ( + f'/usr/bin/bash -c ' + f'\'echo -e "{key}\n{secret}\n{project_id}"\' | ' + f'{CMAPI_PYTHON_BIN} {gs_cli_binary} ' + f'config -a -o {temporary_config}' + ) + + module_logger.debug( + f'gsutil config commandline: ' + f'{commandline.encode("unicode_escape").decode("utf-8")}' + ) + + my_env = os.environ.copy() + my_env['PYTHONPATH'] = CMAPI_PYTHON_DEPS_PATH + my_env['BOTO_CONFIG'] = temporary_config + + try: + p = run( + commandline, capture_output=True, + shell=True, encoding='utf-8', check=True, env=my_env + ) + except CalledProcessError as exc: + response_error(exc.stderr.split('\n')[0]) + + try: + check_commandline = [ + CMAPI_PYTHON_BIN, gs_cli_binary, 'version', '-l' + ] + p = run( + check_commandline, capture_output=True, + shell=False, encoding='utf-8', check=True, env=my_env + ) + module_logger.debug( + f'gsutil config check commandline: ' + f'{" ".join(check_commandline)}' + ) + module_logger.debug(f'gsutil config: {p.stdout}') + + except CalledProcessError as exc: + response_error(exc.stderr.split('\n')[0]) + + gs_url = furl.furl(bucket).add(path=filename).url + gs_command_line = [ + CMAPI_PYTHON_BIN, gs_cli_binary, 'cat', gs_url + ] + module_logger.debug( + f'gsutil cat commandline: {" ".join(gs_command_line)}' + ) + + try: + gs_process = Popen( + gs_command_line, env=my_env, stdout=PIPE, stderr=PIPE, + shell=False, encoding='utf-8' + ) + except CalledProcessError as exc: + response_error(exc.stderr.split('\n')[0]) + + return gs_process + + module_logger.debug('LOAD S3 Data') + request = cherrypy.request + request_body = request.json + + bucket = getKey('bucket', request_body) + + if bucket.startswith(r's3://'): + storage = 'aws' + elif bucket.startswith(r'gs://'): + storage = 'gs' + else: + error = ( + 'Incorrect bucket. Should start with s3:// for AWS S3 or ' + 'gs:// for Google Storage' + ) + response_error(error) + + table = getKey('table', request_body) + filename = getKey('filename', request_body) + key = getKey('key', request_body) + secret = getKey('secret', request_body) + region = getKey('region', request_body, required=storage=='aws') + database = getKey('database', request_body) + terminated_by = getKey('terminated_by', request_body, skip_check=True) + enclosed_by = getKey( + 'enclosed_by', request_body, skip_check=True, required=False + ) + escaped_by = getKey( + 'escaped_by', request_body, skip_check=True, required=False + ) + + if storage == 'aws': + download_proc = prepare_aws(bucket, filename, secret, key, region) + elif storage == 'gs': + temporary_config = os.path.join( + tempfile.gettempdir(), '.boto.' + str(uuid.uuid4()) + ) + + download_proc = prepare_google_storage( + bucket, filename, secret, key, temporary_config + ) + else: + response_error('Unknown storage detected.
Internal error') + + cpimport_command_line = [ + 'cpimport', database, table, '-s', terminated_by + ] + if escaped_by: + cpimport_command_line += ['-C', escaped_by] + if enclosed_by: + cpimport_command_line += ['-E', enclosed_by] + + module_logger.debug( + f'cpimport command line: {" ".join(cpimport_command_line)}' + ) + + cpimport_proc = Popen( + cpimport_command_line, shell=False, stdin=download_proc.stdout, + stdout=PIPE, stderr=PIPE, encoding='utf-8' + ) + + selector = selectors.DefaultSelector() + for stream in [ + download_proc.stderr, cpimport_proc.stderr, cpimport_proc.stdout + ]: + os.set_blocking(stream.fileno(), False) + + selector.register( + download_proc.stderr, selectors.EVENT_READ, data='downloader_error' + ) + selector.register( + cpimport_proc.stderr, selectors.EVENT_READ, data='cpimport_error' + ) + selector.register( + cpimport_proc.stdout, selectors.EVENT_READ, data='cpimport_output' + ) + + downloader_error = '' + cpimport_error = '' + cpimport_output = '' + + alive = 3 + while alive > 0: + events = selector.select() + for key, mask in events: + name = key.data + line = key.fileobj.readline().rstrip() + if not line: + # EOF + alive -= 1 + selector.unregister(key.fileobj) + continue + if name == 'downloader_error': + downloader_error += line + '\n' + if name == 'cpimport_error': + cpimport_error += line + '\n' + if name == 'cpimport_output': + cpimport_output += line + '\n' + + # clean after Prepare Google + if storage == 'gs' and os.path.exists(temporary_config): + os.remove(temporary_config) + + if downloader_error: + response_error(downloader_error) + + if cpimport_error: + response_error(cpimport_error) + + module_logger.debug(f'LOAD S3 Data stdout: {cpimport_output}') + + pattern = '([0-9]+) rows processed and ([0-9]+) rows inserted' + match = re.search(pattern, cpimport_output) + + if not match: + return { + 'success': False, + 'inserted': 0, + 'processed': 0 + } + + return { + 'success': True, + 'inserted': match.group(2), + 'processed': match.group(1) + } diff --git a/cmapi/cmapi_server/exceptions.py b/cmapi/cmapi_server/exceptions.py new file mode 100644 index 000000000..311e2b035 --- /dev/null +++ b/cmapi/cmapi_server/exceptions.py @@ -0,0 +1,22 @@ +"""Module contains custom exceptions.""" + + +class CMAPIBasicError(Exception): + """Basic exception raised for CMAPI related processes. + + Attributes: + message -- explanation of the error + """ + def __init__(self, message: str) -> None: + self.message = message + super().__init__(self.message) + def __str__(self) -> str: + return self.message + + +class CEJError(CMAPIBasicError): + """Exception raised for CEJ related processes. + + Attributes: + message -- explanation of the error + """ diff --git a/cmapi/cmapi_server/failover_agent.py b/cmapi/cmapi_server/failover_agent.py new file mode 100644 index 000000000..864715e09 --- /dev/null +++ b/cmapi/cmapi_server/failover_agent.py @@ -0,0 +1,185 @@ +''' +This class implements the interface used by the failover module to notify +the cluster of events like node-up / node-down, etc. 
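A failover monitor is expected to drive the agent roughly like this
(a sketch; the node name is illustrative):

    agent = FailoverAgent()
    txn_id, nodes = agent.startTransaction(remove_nodes=['mcs2'])
    agent.deactivateNodes(['mcs2'])
    agent.commitTransaction(txn_id, nodes)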
+''' + +import logging +import time + +import requests + +from cmapi_server import helpers, node_manipulation +from cmapi_server.constants import DEFAULT_MCS_CONF_PATH +from cmapi_server.exceptions import CMAPIBasicError +from cmapi_server.managers.process import MCSProcessManager +from failover.agent_comm import AgentBase +from mcs_node_control.models.node_config import NodeConfig + + +# Bug in pylint https://github.com/PyCQA/pylint/issues/4584 +requests.packages.urllib3.disable_warnings() # pylint: disable=no-member +logger = logging.getLogger('failover_agent') + + +class FailoverAgent(AgentBase): + + def activateNodes( + self, nodes, input_config_filename=DEFAULT_MCS_CONF_PATH, + output_config_filename=None, test_mode=False + ): + logger.info(f'FA.activateNodes(): activating nodes: {nodes}') + new_node_count = 0 + for node in nodes: + try: + logger.info(f'FA.activateNodes(): adding node {node}') + node_manipulation.add_node( + node, input_config_filename, output_config_filename + ) + new_node_count += 1 + except Exception: + logger.error(f'FA.activateNodes(): failed to add node {node}') + raise + return new_node_count + + def deactivateNodes( + self, nodes, input_config_filename=DEFAULT_MCS_CONF_PATH, + output_config_filename=None, test_mode=False + ): + logger.info(f'FA.deactivateNodes(): deactivating nodes: {nodes}') + + removed_node_count = 0 + for node in nodes: + try: + logger.info(f'FA.deactivateNodes(): deactivating node {node}') + node_manipulation.remove_node( + node, input_config_filename, output_config_filename, + deactivate_only=True, test_mode=test_mode + ) + removed_node_count += 1 + except Exception as err: + logger.error( + f'FA.deactivateNodes(): failed to deactivate node {node}, ' + f'got {str(err)}' + ) + raise + return removed_node_count + + + # the 'hack' parameter is a placeholder. When run by agent_comm, this function gets a first parameter + # of (). When that is the input_config_filename, that's bad. Need to fix. + def movePrimaryNode(self, hack, input_config_filename = None, output_config_filename = None, test_mode = False): + logger.info(f"FA.movePrimaryNode(): moving primary node functionality") + + # to save a little typing in testing + kwargs = { + "cs_config_filename": input_config_filename, + "input_config_filename" : input_config_filename, + "output_config_filename" : output_config_filename, + "test_mode" : test_mode + } + + try: + node_manipulation.move_primary_node(**kwargs) + except Exception as e: + logger.error(f"FA.movePrimaryNode(): failed to move primary node, got {str(e)}") + raise + + def enterStandbyMode(self, test_mode = False): + nc = NodeConfig() + node_name = nc.get_module_net_address(nc.get_current_config_root()) + logger.info( + f'FA.enterStandbyMode(): shutting down node "{node_name}"' + ) + + # this gets retried by the caller on error + try: + # TODO: remove test_mode condition and add mock for testing + if not test_mode: + MCSProcessManager.stop_node(is_primary=nc.is_primary_node()) + logger.info( + 'FA.enterStandbyMode(): successfully stopped node.' + ) + except CMAPIBasicError as err: + logger.error( + 'FA.enterStandbyMode(): caught error while stopping node.' + f'{err.message}' + ) + + + def raiseAlarm(self, msg): + logger.critical(msg) + + + # The start/commit/rollback transaction fcns use the active list to decide which + # nodes to send to; when we're adding a node the new node isn't in the active list yet + # extra_nodes gives us add'l hostnames/addrs to send the transaction to. + # Likewise for removing a node. 
Presumably that node is not reachable, so must be + removed from the list to send to. + def startTransaction(self, extra_nodes=None, remove_nodes=None): + got_txn = False + count = 0 + while not got_txn: + msg = None + try: + (got_txn, txn_id, nodes) = helpers.start_transaction( + extra_nodes=extra_nodes, remove_nodes=remove_nodes + ) + except Exception as e: + got_txn = False + msg = ( + f'FA.start_transaction(): attempt #{count+1}, ' + f'failed to get a transaction, got {str(e)}' + ) + + if not got_txn: + if msg is None: + msg = ( + f'FA.start_transaction(): attempt #{count+1}, ' + 'failed to get a transaction' + ) + if count < 5: + logger.warning(msg) + else: + logger.error(msg) + time.sleep(1) + count += 1 + logger.info(f'FA.startTransaction(): started transaction {txn_id}') + return (txn_id, nodes) + + + # These shouldn't throw for now + def commitTransaction(self, txn_id, nodes, **kwargs): + try: + helpers.update_revision_and_manager() + # broadcasting the new config invokes a node restart + helpers.broadcast_new_config(nodes=nodes) + helpers.commit_transaction(txn_id, nodes=nodes) + except Exception: + logger.error( + ( + 'FA.commitTransaction(): failed to commit transaction ' + f'{txn_id}' + ), + exc_info=True + ) + else: + logger.info( + f'FA.commitTransaction(): committed transaction {txn_id}' + ) + + + def rollbackTransaction(self, txn_id, nodes): + try: + helpers.rollback_transaction(txn_id, nodes=nodes) + except Exception: + logger.error( + ( + 'FA.rollbackTransaction(): failed to rollback transaction ' + f'{txn_id}. Got unrecognised error.' + ), + exc_info=True + ) + else: + logger.info( + f'FA.rollbackTransaction(): rolled back transaction {txn_id}' + ) diff --git a/cmapi/cmapi_server/handlers/__init__.py b/cmapi/cmapi_server/handlers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/cmapi_server/handlers/cej.py b/cmapi/cmapi_server/handlers/cej.py new file mode 100644 index 000000000..74c977d9a --- /dev/null +++ b/cmapi/cmapi_server/handlers/cej.py @@ -0,0 +1,119 @@ +"""Module contains all things related to working with .secrets file.""" +import json +import logging +import os + +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes +from cryptography.hazmat.primitives import padding + +from cmapi_server.constants import MCS_SECRETS_FILE_PATH +from cmapi_server.exceptions import CEJError + + +AES_BLOCK_SIZE_BITS = algorithms.AES.block_size +AES_IV_BIN_SIZE = int(AES_BLOCK_SIZE_BITS/8) +# two hex chars for each byte +AES_IV_HEX_SIZE = AES_IV_BIN_SIZE * 2 + + +class CEJPasswordHandler(): + """Handler for CrossEngineSupport password decryption.""" + + @classmethod + def secretsfile_exists(cls): + """Check the .secrets file in MCS_SECRETS_FILE_PATH. + + :return: True if file exists and not empty. + :rtype: bool + """ + try: + if ( + os.path.isfile(MCS_SECRETS_FILE_PATH) and + os.path.getsize(MCS_SECRETS_FILE_PATH) > 0 + ): + return True + except Exception: + # TODO: remove this guard once Python >= 3.8 is guaranteed in + # the package; isfile and getsize do not raise + # exceptions after 3.8 + logging.warning( + 'Something went wrong while detecting the .secrets file.', + exc_info=True + ) + return False + + @classmethod + def get_secrets_json(cls): + """Get json from .secrets file.
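The file is expected to hold at least the key consumed by
decrypt_password(), e.g. (value illustrative):

    {"encryption_key": "000102030405060708090a0b0c0d0e0f"}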
+ + :raises CEJError: on empty\corrupted\wrong format .secrets file + :return: json from .secrets file + :rtype: dict + """ + if not cls.secretsfile_exists(): + raise CEJError(f'{MCS_SECRETS_FILE_PATH} file does not exist.') + with open(MCS_SECRETS_FILE_PATH) as secrets_file: + try: + secrets_json = json.load(secrets_file) + except Exception: + logging.error( + 'Something went wrong while loading json from ' + f'{MCS_SECRETS_FILE_PATH}', + exc_info=True + ) + raise CEJError( + f'Looks like file {MCS_SECRETS_FILE_PATH} is corrupted or' + 'has wrong format.' + ) from None + return secrets_json + + @classmethod + def decrypt_password(cls, enc_data:str): + """Decrypt CEJ password if needed. + + :param enc_data: encrypted initialization vector + password in hex str + :type enc_data: str + :return: decrypted CEJ password + :rtype: str + """ + if not cls.secretsfile_exists(): + logging.warning('Unencrypted CrossEngineSupport password used.') + return enc_data + + logging.info('Encrypted CrossEngineSupport password found.') + + try: + iv = bytes.fromhex(enc_data[:AES_IV_HEX_SIZE]) + encrypted_passwd = bytes.fromhex(enc_data[AES_IV_HEX_SIZE:]) + except ValueError as value_error: + raise CEJError( + 'Non-hexadecimal number found in encrypted CEJ password.' + ) from value_error + + secrets_json = cls.get_secrets_json() + encryption_key_hex = secrets_json.get('encryption_key') + if not encryption_key_hex: + raise CEJError( + f'Empty "encryption key" found in {MCS_SECRETS_FILE_PATH}' + ) + try: + encryption_key = bytes.fromhex(encryption_key_hex) + except ValueError as value_error: + raise CEJError( + 'Non-hexadecimal number found in encryption key from ' + f'{MCS_SECRETS_FILE_PATH} file.' + ) from value_error + cipher = Cipher( + algorithms.AES(encryption_key), + modes.CBC(iv) + ) + decryptor = cipher.decryptor() + unpadder = padding.PKCS7(AES_BLOCK_SIZE_BITS).unpadder() + padded_passwd_bytes = ( + decryptor.update(encrypted_passwd) + + decryptor.finalize() + ) + passwd_bytes = ( + unpadder.update(padded_passwd_bytes) + unpadder.finalize() + ) + return passwd_bytes.decode() diff --git a/cmapi/cmapi_server/handlers/cluster.py b/cmapi/cmapi_server/handlers/cluster.py new file mode 100644 index 000000000..f8988ad9a --- /dev/null +++ b/cmapi/cmapi_server/handlers/cluster.py @@ -0,0 +1,579 @@ +"""Module contains Cluster business logic functions.""" +import logging +from datetime import datetime + +import requests + +from cmapi_server.constants import ( + CMAPI_CONF_PATH, DEFAULT_MCS_CONF_PATH, +) +from cmapi_server.exceptions import CMAPIBasicError +from cmapi_server.helpers import ( + broadcast_new_config, commit_transaction, get_active_nodes, get_dbroots, + get_config_parser, get_current_key, get_id, get_version, start_transaction, + rollback_transaction, update_revision_and_manager, +) +from cmapi_server.node_manipulation import ( + add_node, add_dbroot, remove_node, switch_node_maintenance, +) +from mcs_node_control.models.misc import get_dbrm_master +from mcs_node_control.models.node_config import NodeConfig + + +class ClusterHandler(): + """Class for handling MCS Cluster operations.""" + + @staticmethod + def status( + config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to get MCS Cluster status information + + :param config: columnstore xml config file path, + defaults to DEFAULT_MCS_CONF_PATH + :type config: str, optional + :param logger: logger, defaults to logging.getLogger('cmapi_server') + :type logger: logging.Logger, 
optional + :raises CMAPIBasicError: if catch some exception while getting status + from each node separately + :return: status result + :rtype: dict + """ + logger.debug('Cluster status command called. Getting status.') + + response = {'timestamp': str(datetime.now())} + active_nodes = get_active_nodes(config) + cmapi_cfg_parser = get_config_parser(CMAPI_CONF_PATH) + api_key = get_current_key(cmapi_cfg_parser) + headers = {'x-api-key': api_key} + num_nodes = 0 + + for node in active_nodes: + url = f'https://{node}:8640/cmapi/{get_version()}/node/status' + try: + r = requests.get(url, verify=False, headers=headers) + r.raise_for_status() + r_json = r.json() + if len(r_json.get('services', 0)) == 0: + r_json['dbrm_mode'] = 'offline' + + response[f'{str(node)}'] = r_json + num_nodes += 1 + except Exception as err: + raise CMAPIBasicError( + f'Got an error retrieving status from node {node}' + ) from err + + response['num_nodes'] = num_nodes + logger.debug('Successfully finished getting cluster status.') + return response + + @staticmethod + def start( + config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to start MCS Cluster. + + :param config: columnstore xml config file path, + defaults to DEFAULT_MCS_CONF_PATH + :type config: str, optional + :param logger: logger, defaults to logging.getLogger('cmapi_server') + :type logger: logging.Logger, optional + :raises CMAPIBasicError: on exception while starting transaction + :raises CMAPIBasicError: if transaction start isn't successful + :raises CMAPIBasicError: if no nodes in the cluster + :raises CMAPIBasicError: on exception while distributing new config + :raises CMAPIBasicError: on unsuccessful distibuting config file + :raises CMAPIBasicError: on exception while committing transaction + :return: start timestamp + :rtype: dict + """ + logger.debug('Cluster start command called. Starting the cluster.') + start_time = str(datetime.now()) + transaction_id = get_id() + + try: + suceeded, transaction_id, successes = start_transaction( + cs_config_filename=config, id=transaction_id + ) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while starting the transaction.' + ) from err + if not suceeded: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Starting transaction isn\'t successful.') + + if suceeded and len(successes) == 0: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('There are no nodes in the cluster.') + + switch_node_maintenance(False) + update_revision_and_manager() + + # TODO: move this from multiple places to one, eg to helpers + try: + broadcast_successful = broadcast_new_config(config) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while distributing config file.' + ) from err + + if not broadcast_successful: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Config distribution isn\'t successful.') + + try: + commit_transaction(transaction_id, cs_config_filename=config) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while committing transaction.' 
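# start()/shutdown()/add_node()/remove_node()/set_mode() all follow the
# same transactional skeleton; a condensed sketch (error messages elided):
#
#     txn_id = get_id()
#     try:
#         ok, txn_id, nodes = start_transaction(
#             cs_config_filename=config, id=txn_id
#         )
#         # ... mutate config and/or switch maintenance ...
#         broadcast_new_config(config)
#         commit_transaction(txn_id, cs_config_filename=config)
#     except Exception:
#         rollback_transaction(txn_id, cs_config_filename=config)
#         raise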
+ ) from err + + logger.debug('Successfully finished cluster start.') + return {'timestamp': start_time} + + @staticmethod + def shutdown( + config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to stop the MCS Cluster. + + :param config: columnstore xml config file path, + defaults to DEFAULT_MCS_CONF_PATH + :type config: str, optional + :param logger: logger, defaults to logging.getLogger('cmapi_server') + :type logger: logging.Logger, optional + :raises CMAPIBasicError: if no nodes in the cluster + :return: start timestamp + :rtype: dict + """ + logger.debug( + 'Cluster shutdown command called. Shutting down the cluster.' + ) + + start_time = str(datetime.now()) + transaction_id = get_id() + + try: + suceeded, transaction_id, successes = start_transaction( + cs_config_filename=config, id=transaction_id + ) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while starting the transaction.' + ) from err + if not suceeded: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Starting transaction isn\'t successful.') + + if suceeded and len(successes) == 0: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('There are no nodes in the cluster.') + + switch_node_maintenance(True) + update_revision_and_manager() + + # TODO: move this from multiple places to one, eg to helpers + try: + broadcast_successful = broadcast_new_config(config) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while distributing config file.' + ) from err + + if not broadcast_successful: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Config distribution isn\'t successful.') + + try: + commit_transaction(transaction_id, cs_config_filename=config) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while committing transaction.' + ) from err + + logger.debug('Successfully finished shutting down the cluster.') + return {'timestamp': start_time} + + @staticmethod + def add_node( + node: str, config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to add node to MCS CLuster. + + :param node: node IP or name or FQDN + :type node: str + :param config: columnstore xml config file path, + defaults to DEFAULT_MCS_CONF_PATH + :type config: str, optional + :param logger: logger, defaults to logging.getLogger('cmapi_server') + :type logger: logging.Logger, optional + :raises CMAPIBasicError: on exception while starting transaction + :raises CMAPIBasicError: if transaction start isn't successful + :raises CMAPIBasicError: on exception while adding node + :raises CMAPIBasicError: on exception while distributing new config + :raises CMAPIBasicError: on unsuccessful distibuting config file + :raises CMAPIBasicError: on exception while committing transaction + :return: result of adding node + :rtype: dict + """ + logger.debug(f'Cluster add node command called. 
Adding node {node}.') + + response = {'timestamp': str(datetime.now())} + transaction_id = get_id() + + try: + suceeded, transaction_id, successes = start_transaction( + cs_config_filename=config, extra_nodes=[node], + id=transaction_id + ) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while starting the transaction.' + ) from err + if not suceeded: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Starting transaction isn\'t successful.') + + try: + add_node( + node, input_config_filename=config, + output_config_filename=config + ) + if not get_dbroots(node, config): + add_dbroot( + host=node, input_config_filename=config, + output_config_filename=config + ) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Error while adding node.') from err + + response['node_id'] = node + update_revision_and_manager( + input_config_filename=config, output_config_filename=config + ) + + try: + broadcast_successful = broadcast_new_config(config) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while distributing config file.' + ) from err + + if not broadcast_successful: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Config distribution isn\'t successful.') + + try: + commit_transaction(transaction_id, cs_config_filename=config) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while committing transaction.' + ) from err + + logger.debug(f'Successfully finished adding node {node}.') + return response + + @staticmethod + def remove_node( + node: str, config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to remove node from MCS CLuster. + + :param node: node IP or name or FQDN + :type node: str + :param config: columnstore xml config file path, + defaults to DEFAULT_MCS_CONF_PATH + :type config: str, optional + :param logger: logger, defaults to logging.getLogger('cmapi_server') + :type logger: logging.Logger, optional + :raises CMAPIBasicError: on exception while starting transaction + :raises CMAPIBasicError: if transaction start isn't successful + :raises CMAPIBasicError: on exception while removing node + :raises CMAPIBasicError: on exception while distributing new config + :raises CMAPIBasicError: on unsuccessful distibuting config file + :raises CMAPIBasicError: on exception while committing transaction + :return: result of node removing + :rtype: dict + """ + logger.debug( + f'Cluster remove node command called. Removing node {node}.' + ) + response = {'timestamp': str(datetime.now())} + transaction_id = get_id() + + try: + suceeded, transaction_id, txn_nodes = start_transaction( + cs_config_filename=config, remove_nodes=[node], + id=transaction_id + ) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while starting the transaction.' 
+ ) from err + if not suceeded: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Starting transaction isn\'t successful.') + + try: + remove_node( + node, input_config_filename=config, + output_config_filename=config + ) + except Exception as err: + rollback_transaction( + transaction_id, nodes=txn_nodes, cs_config_filename=config + ) + raise CMAPIBasicError('Error while removing node.') from err + + response['node_id'] = node + if len(txn_nodes) > 0: + update_revision_and_manager( + input_config_filename=config, output_config_filename=config + ) + try: + broadcast_successful = broadcast_new_config( + config, nodes=txn_nodes + ) + except Exception as err: + rollback_transaction( + transaction_id, nodes=txn_nodes, cs_config_filename=config + ) + raise CMAPIBasicError( + 'Error while distributing config file.' + ) from err + if not broadcast_successful: + rollback_transaction( + transaction_id, nodes=txn_nodes, cs_config_filename=config + ) + raise CMAPIBasicError('Config distribution isn\'t successful.') + + try: + commit_transaction(transaction_id, cs_config_filename=config) + except Exception as err: + rollback_transaction( + transaction_id, nodes=txn_nodes, cs_config_filename=config + ) + raise CMAPIBasicError( + 'Error while committing transaction.' + ) from err + + logger.debug(f'Successfully finished removing node {node}.') + return response + + @staticmethod + def set_mode( + mode: str, timeout:int = 60, config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to set MCS CLuster mode. + + :param mode: cluster mode to set, can be only "readonly" or "readwrite" + :type mode: str + :param config: columnstore xml config file path, + defaults to DEFAULT_MCS_CONF_PATH + :type config: str, optional + :param logger: logger, defaults to logging.getLogger('cmapi_server') + :type logger: logging.Logger, optional + :raises CMAPIBasicError: if no master found in the cluster + :raises CMAPIBasicError: on exception while starting transaction + :raises CMAPIBasicError: if transaction start isn't successful + :raises CMAPIBasicError: on exception while adding node + :raises CMAPIBasicError: on exception while distributing new config + :raises CMAPIBasicError: on unsuccessful distibuting config file + :raises CMAPIBasicError: on exception while committing transaction + :return: result of adding node + :rtype: dict + """ + logger.debug( + f'Cluster mode set command called. Setting mode to {mode}.' + ) + + response = {'timestamp': str(datetime.now())} + cmapi_cfg_parser = get_config_parser(CMAPI_CONF_PATH) + api_key = get_current_key(cmapi_cfg_parser) + headers = {'x-api-key': api_key} + transaction_id = get_id() + + master = None + if len(get_active_nodes(config)) != 0: + master = get_dbrm_master(config) + + if master is None: + raise CMAPIBasicError('No master found in the cluster.') + else: + master = master['IPAddr'] + payload = {'cluster_mode': mode} + url = f'https://{master}:8640/cmapi/{get_version()}/node/config' + + try: + suceeded, transaction_id, successes = start_transaction( + cs_config_filename=config, id=transaction_id + ) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while starting the transaction.' 
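# The mode change itself is delegated to the DBRM master: the PUT to its
# /node/config endpoint assembled below carries a body shaped like this
# (values illustrative):
#
#     {'cluster_mode': 'readonly', 'revision': '42',
#      'manager': '10.0.0.1', 'timeout': 60}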
+ ) from err + if not suceeded: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError('Starting transaction isn\'t successful.') + + nc = NodeConfig() + root = nc.get_current_config_root(config_filename=config) + payload['manager'] = root.find('./ClusterManager').text + payload['revision'] = root.find('./ConfigRevision').text + payload['timeout'] = timeout + payload['cluster_mode'] = mode + + try: + r = requests.put(url, headers=headers, json=payload, verify=False) + r.raise_for_status() + response['cluster-mode'] = mode + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + f'Error while setting cluster mode to {mode}' + ) from err + + try: + commit_transaction(transaction_id, cs_config_filename=config) + except Exception as err: + rollback_transaction(transaction_id, cs_config_filename=config) + raise CMAPIBasicError( + 'Error while committing transaction.' + ) from err + + logger.debug(f'Successfully set cluster mode to {mode}.') + return response + + @staticmethod + def set_api_key( + api_key: str, verification_key: str, + config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to set API key for each CMAPI node in cluster. + + :param api_key: API key to set + :type api_key: str + :param verification_key: TOTP key to verify + :type verification_key: str + :param config: columnstore xml config file path, + defaults to DEFAULT_MCS_CONF_PATH + :type config: str, optional + :param logger: logger, defaults to logging.getLogger('cmapi_server') + :type logger: logging.Logger, optional + :raises CMAPIBasicError: if catch some exception while setting API key + to each node + :return: status result + :rtype: dict + """ + logger.debug('Cluster set API key command called.') + + active_nodes = get_active_nodes(config) + body = { + 'api_key': api_key, + 'verification_key': verification_key + } + response = {} + # only for changing response object below + active_nodes_count = len(active_nodes) + + if not active_nodes: + # set api key in configuration file on this node + logger.debug( + 'No active nodes found, set API key into current CMAPI conf.' + ) + active_nodes.append('localhost') + + for node in active_nodes: + logger.debug(f'Setting new api key to "{node}".') + url = f'https://{node}:8640/cmapi/{get_version()}/node/apikey-set' + try: + resp = requests.put(url, verify=False, json=body) + resp.raise_for_status() + r_json = resp.json() + if active_nodes_count > 0: + response[str(node)] = r_json + except Exception as err: + raise CMAPIBasicError( + f'Got an error setting API key to "{node}".' + ) from err + logger.debug(f'Successfully set new api key to "{node}".') + + response['timestamp'] = str(datetime.now()) + logger.debug( + 'Successfully finished setting new API key to all nodes.' + ) + return response + + @staticmethod + def set_log_level( + level: str, config: str = DEFAULT_MCS_CONF_PATH, + logger: logging.Logger = logging.getLogger('cmapi_server') + ) -> dict: + """Method to set level for loggers on each CMAPI node in cluster. 
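Typical usage (a sketch; this PUTs the new level to /node/log-level
on every active node):

    ClusterHandler.set_log_level('DEBUG')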
+
+        :param level: logging level, including custom
+        :type level: str
+        :param config: columnstore xml config file path,
+            defaults to DEFAULT_MCS_CONF_PATH
+        :type config: str, optional
+        :param logger: logger, defaults to logging.getLogger('cmapi_server')
+        :type logger: logging.Logger, optional
+        :return: status result
+        :rtype: dict
+        """
+        logger.debug('Cluster set new logging level called.')
+
+        active_nodes = get_active_nodes(config)
+        body = {'level': level}
+        response = {}
+        # remember the original count: per-node responses are only added
+        # to the response object when the cluster had active nodes
+        active_nodes_count = len(active_nodes)
+
+        if not active_nodes:
+            # set the log level only on this node
+            logger.debug(
+                'No active nodes found, set log level only for current node.'
+            )
+            active_nodes.append('localhost')
+
+        for node in active_nodes:
+            logger.debug(f'Setting new log level to "{node}".')
+            url = f'https://{node}:8640/cmapi/{get_version()}/node/log-level'
+            try:
+                resp = requests.put(url, verify=False, json=body)
+                resp.raise_for_status()
+                r_json = resp.json()
+                if active_nodes_count > 0:
+                    response[str(node)] = r_json
+            except Exception as err:
+                raise CMAPIBasicError(
+                    f'Got an error setting log level to "{node}".'
+                ) from err
+            logger.debug(f'Successfully set new log level to "{node}".')
+
+        response['timestamp'] = str(datetime.now())
+        logger.debug(
+            'Successfully finished setting new log level to all nodes.'
+        )
+        return response
diff --git a/cmapi/cmapi_server/helpers.py b/cmapi/cmapi_server/helpers.py
new file mode 100644
index 000000000..91045ca73
--- /dev/null
+++ b/cmapi/cmapi_server/helpers.py
@@ -0,0 +1,847 @@
+"""Module with helper functions.
+
+TODO: remove NodeConfig usage and move to arguments (eg. nc or root)
+"""
+
+import asyncio
+import concurrent.futures
+import configparser
+import datetime
+import logging
+import os
+import socket
+import subprocess
+import time
+from functools import partial
+from random import random
+from shutil import copyfile
+from typing import Tuple, Optional
+
+import lxml.objectify
+import requests
+
+from cmapi_server.exceptions import CMAPIBasicError
+# Bug in pylint https://github.com/PyCQA/pylint/issues/4584
+requests.packages.urllib3.disable_warnings()  # pylint: disable=no-member
+
+from cmapi_server.constants import (
+    CMAPI_CONF_PATH, CMAPI_DEFAULT_CONF_PATH, DEFAULT_MCS_CONF_PATH,
+    DEFAULT_SM_CONF_PATH, LOCALHOSTS
+)
+from cmapi_server.handlers.cej import CEJPasswordHandler
+from cmapi_server.managers.process import MCSProcessManager
+from mcs_node_control.models.node_config import NodeConfig
+
+
+def get_id():
+    return int(random() * 1000000)
+
+
+def start_transaction(
+    config_filename=CMAPI_CONF_PATH,
+    cs_config_filename=DEFAULT_MCS_CONF_PATH,
+    extra_nodes=None,
+    remove_nodes=None,
+    optional_nodes=None,
+    id=None
+):
+    """Start internal CMAPI transaction.
+
+    Returns (success, txnid, nodes). success = True means it successfully
+    started a transaction, False means it didn't. If True, txnid holds the
+    transaction ID and nodes holds the list of nodes the transaction was
+    started on. If False, txnid and nodes have undefined values.
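+
+    Typical caller pattern (an illustrative sketch mirroring the
+    ClusterHandler methods above):
+
+        success, txn_id, nodes = start_transaction(cs_config_filename=conf)
+        if not success:
+            rollback_transaction(txn_id, cs_config_filename=conf)
+            raise CMAPIBasicError("Starting transaction isn't successful.")
+        try:
+            ...  # mutate and broadcast the config file
+            commit_transaction(txn_id, cs_config_filename=conf)
+        except Exception:
+            rollback_transaction(txn_id, nodes=nodes, cs_config_filename=conf)
+            raise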
+
+    :param config_filename: cmapi config filepath,
+        defaults to CMAPI_CONF_PATH
+    :type config_filename: str
+    :param cs_config_filename: columnstore xml config filepath,
+        defaults to DEFAULT_MCS_CONF_PATH
+    :type cs_config_filename: str, optional
+    :param extra_nodes: extra nodes, defaults to None
+    :type extra_nodes: list, optional
+    :param remove_nodes: remove nodes, defaults to None
+    :type remove_nodes: list, optional
+    :param optional_nodes: optional nodes, defaults to None
+    :type optional_nodes: list, optional
+    :param id: transaction id, a random one is generated if not given
+    :type id: int, optional
+    :return: (success, txnid, nodes)
+    :rtype: tuple
+    """
+    # TODO: Somehow change that logic for eg using several input types
+    #       (str\list\set) and detect which one we got.
+    extra_nodes = extra_nodes or []
+    remove_nodes = remove_nodes or []
+    optional_nodes = optional_nodes or []
+    # generate the default at call time: a default argument would be
+    # evaluated once at import time and reused for every call
+    if id is None:
+        id = get_id()
+
+    cfg_parser = get_config_parser(config_filename)
+    api_key = get_current_key(cfg_parser)
+
+    version = get_version()
+
+    headers = {'x-api-key': api_key}
+    body = {'id': id}
+    final_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
+
+    success = False
+    while datetime.datetime.now() < final_time and not success:
+        successes = []
+
+        # it's painful to look at, but if this call fails to get a lock on
+        # every server, it may be because a node went down, and the config file
+        # was updated. So, update the list on every iteration.
+        #
+        # There is a race here between reading the config and getting the txn.
+        # What can stop it with the current design is using a mutex here,
+        # and having config updates come from only one node.
+        # For changes coming from failover, this will be true.
+        #
+        # There is also a race on the config file in general.
+        # Need to read it before you can get a lock, and need to lock it before
+        # it can be read reliably. Resolution TBD. File locking? Retries?
+
+        # TODO: need to work with data types of nodes here
+        unfiltered_nodes = [
+            *get_active_nodes(cs_config_filename),
+            *extra_nodes,
+            *optional_nodes
+        ]
+        tmp_active_nodes = {
+            node for node in unfiltered_nodes
+            if node not in remove_nodes
+        }
+        active_nodes = set()
+
+        # resolve localhost addrs
+        for node in tmp_active_nodes:
+            if node in ['127.0.0.1', 'localhost', '::1']:
+                active_nodes.add(socket.gethostbyname(socket.gethostname()))
+            else:
+                active_nodes.add(node)
+        # this copy will be updated if an optional node can't be reached
+        real_active_nodes = set(active_nodes)
+        logging.trace(f'Active nodes on start transaction: {active_nodes}')
+        for node in active_nodes:
+            url = f'https://{node}:8640/cmapi/{version}/node/begin'
+            node_success = False
+            logging.trace(f'Processing node "{node}"')
+            for retry in range(5):
+                logging.trace(
+                    f'Attempt {retry} for node {node}; active nodes: '
+                    f'{active_nodes}; real active nodes: '
+                    f'{real_active_nodes}'
+                )
+                try:
+                    # who knows how much time has gone by...
+                    # Update timeout to keep nodes in sync +/-
+                    body['timeout'] = (
+                        final_time - datetime.datetime.now()
+                    ).seconds
+                    r = requests.put(
+                        url, verify=False, headers=headers, json=body,
+                        timeout=10
+                    )
+
+                    # a 4xx error from our endpoint;
+                    # likely another txn is running.
+                    # Breaking here will cause a rollback on nodes we have
+                    # successfully started a txn on so far. Then it will try
+                    # again to get a transaction on all nodes. Put all
+                    # conditions where that is the desired behavior here.
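+                    # (int(status / 100) == 4 matches any 400-499 response)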
+                    if int(r.status_code / 100) == 4:
+                        logging.debug(
+                            'Got a 4xx error while beginning transaction '
+                            f'with response text {r.text}'
+                        )
+                        break  # TODO: useless, got break in finally statement
+                    # TODO: is there any case to separate 4xx
+                    #       from all other error codes?
+                    r.raise_for_status()
+                    node_success = True
+                    break
+                except requests.Timeout:
+                    logging.warning(
+                        f'start_transaction(): timeout on node {node}'
+                    )
+                except Exception:
+                    logging.warning(
+                        'start_transaction(): got error during request '
+                        f'to node {node}',
+                        exc_info=True
+                    )
+                finally:
+                    if not node_success and node in optional_nodes:
+                        logging.info(
+                            f'start_transaction(): node {node} is optional; '
+                            'ignoring the error'
+                        )
+                        real_active_nodes.remove(node)
+                        break
+
+                # wait 1 sec and try on this node again
+                time.sleep(1)
+
+            if not node_success and node not in optional_nodes:
+                rollback_txn_attempt(api_key, version, id, successes)
+                # wait up to 5 secs and try the whole thing again
+                time.sleep(random() * 5)
+                break
+            elif node_success:
+                successes.append(node)
+
+        # TODO: a little more work needs to be done here. If not all of the
+        # active nodes are up when start is called, this will fail. It should
+        # succeed if 'enough' nodes are up (> 50%).
+        success = (len(successes) == len(real_active_nodes))
+
+    return (success, id, successes)
+
+
+def rollback_txn_attempt(key, version, txnid, nodes):
+    headers = {'x-api-key': key}
+    body = {'id': txnid}
+    for node in nodes:
+        url = f"https://{node}:8640/cmapi/{version}/node/rollback"
+        for retry in range(5):
+            try:
+                r = requests.put(
+                    url, verify=False, headers=headers, json=body, timeout=5
+                )
+                r.raise_for_status()
+            except requests.Timeout:
+                logging.warning(
+                    f'rollback_txn_attempt(): timeout on node "{node}"'
+                )
+            except Exception:
+                logging.error(
+                    (
+                        f'rollback_txn_attempt(): got unrecognised error '
+                        f'during request to "{node}".'
+                    ),
+                    exc_info=True
+                )
+            else:
+                break
+            time.sleep(1)
+
+
+# on a failure to rollback or commit a txn on a subset of nodes, what are the options?
+# - open a new txn and revert the changes on the nodes that respond
+# - go forward with the subset. If those nodes are still up, they will have a config that is out of sync.
+# -> for now, going to assume that the node went down, and that when it comes back up, its config
+#    will be sync'd
+
+
+def rollback_transaction(
+    id, config_filename=CMAPI_CONF_PATH,
+    cs_config_filename=DEFAULT_MCS_CONF_PATH, nodes=None
+):
+    cfg_parser = get_config_parser(config_filename)
+    key = get_current_key(cfg_parser)
+    version = get_version()
+    if nodes is None:
+        nodes = get_active_nodes(cs_config_filename)
+    rollback_txn_attempt(key, version, id, nodes)
+
+
+def commit_transaction(
+    id, config_filename=CMAPI_CONF_PATH,
+    cs_config_filename=DEFAULT_MCS_CONF_PATH, nodes=None
+):
+    cfg_parser = get_config_parser(config_filename)
+    key = get_current_key(cfg_parser)
+    version = get_version()
+    if nodes is None:
+        nodes = get_active_nodes(cs_config_filename)
+
+    headers = {'x-api-key': key}
+    body = {'id': id}
+
+    for node in nodes:
+        url = f"https://{node}:8640/cmapi/{version}/node/commit"
+        for retry in range(5):
+            try:
+                r = requests.put(
+                    url, verify=False, headers=headers, json=body, timeout=5
+                )
+                r.raise_for_status()
+            except requests.Timeout:
+                logging.warning(f"commit_transaction(): timeout on node {node}")
+            except Exception as e:
+                logging.warning(f"commit_transaction(): got error during request to {node}: {str(e)}")
+            else:
+                break
+            time.sleep(1)
+
+
+def broadcast_new_config(
+    cs_config_filename: str = DEFAULT_MCS_CONF_PATH,
+    cmapi_config_filename: str = CMAPI_CONF_PATH,
+    sm_config_filename: str = DEFAULT_SM_CONF_PATH,
+    test_mode: bool = False,
+    nodes: Optional[list] = None,
+) -> bool:
+    """Send the new config to nodes, now asynchronously.
+
+    :param cs_config_filename: Columnstore.xml path,
+        defaults to DEFAULT_MCS_CONF_PATH
+    :type cs_config_filename: str, optional
+    :param cmapi_config_filename: cmapi config path,
+        defaults to CMAPI_CONF_PATH
+    :type cmapi_config_filename: str, optional
+    :param sm_config_filename: storage manager config path,
+        defaults to DEFAULT_SM_CONF_PATH
+    :type sm_config_filename: str, optional
+    :param test_mode: for test purposes, defaults to False TODO: remove
+    :type test_mode: bool, optional
+    :param nodes: nodes list for config put, defaults to None
+    :type nodes: Optional[list], optional
+    :return: success state
+    :rtype: bool
+    """
+    cfg_parser = get_config_parser(cmapi_config_filename)
+    key = get_current_key(cfg_parser)
+    version = get_version()
+    if nodes is None:
+        nodes = get_active_nodes(cs_config_filename)
+
+    nc = NodeConfig()
+    root = nc.get_current_config_root(config_filename=cs_config_filename)
+    with open(cs_config_filename) as f:
+        config_text = f.read()
+
+    with open(sm_config_filename) as f:
+        sm_config_text = f.read()
+
+    headers = {'x-api-key': key}
+    body = {
+        'manager': root.find('./ClusterManager').text,
+        'revision': root.find('./ConfigRevision').text,
+        'timeout': 300,
+        'config': config_text,
+        'cs_config_filename': cs_config_filename,
+        'sm_config_filename': sm_config_filename,
+        'sm_config': sm_config_text
+    }
+    # TODO: remove test mode here and replace it by mock in tests
+    if test_mode:
+        body['test'] = True
+
+    failed_nodes = []
+    success_nodes = []
+
+    async def update_config(node, success_nodes, failed_nodes, headers, body):
+        url = f'https://{node}:8640/cmapi/{version}/node/config'
+        request_put = partial(
+            requests.put, url, verify=False, headers=headers, json=body,
+            timeout=120
+        )
+        success = False
+        executor = concurrent.futures.ThreadPoolExecutor()
+        loop = asyncio.get_event_loop()
+
+        # TODO: remove this retry; it causes long waiting times when some MCS
+        #       processes couldn't properly start/stop. Fix error handling:
+        #       consider raising an error instead of returning a bool value.
+        for retry in range(5):
+            try:
+                r = await loop.run_in_executor(executor, request_put)
+                r.raise_for_status()
+            except requests.Timeout:
+                logging.warning(
+                    f'Timeout while pushing new config to "{node}"'
+                )
+            except Exception:
+                logging.warning(
+                    f'Got an unexpected error pushing new config to "{node}"',
+                    exc_info=True
+                )
+            else:
+                success_nodes.append(node)
+                success = True
+                break
+        if not success:
+            failed_nodes.append(node)
+
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    tasks = [
+        update_config(node, success_nodes, failed_nodes, headers, body)
+        for node in nodes
+    ]
+    loop.run_until_complete(asyncio.wait(tasks))
+    loop.close()
+
+    if len(success_nodes) > 0:
+        logging.info(
+            f'Successfully pushed new config file to {success_nodes}'
+        )
+    if len(failed_nodes) > 0:
+        logging.error(
+            f'Failed to push the new config to {failed_nodes}'
+        )
+        return False
+    return True
+
+
+# Might be more appropriate to put these in node_manipulation?
+def update_revision_and_manager(
+    input_config_filename: Optional[str] = None,
+    output_config_filename: Optional[str] = None
+):
+    """Update MCS xml config revision and cluster manager tags.
+
+    :param input_config_filename: input mcs config path, defaults to None
+    :type input_config_filename: Optional[str], optional
+    :param output_config_filename: output mcs config path, defaults to None
+    :type output_config_filename: Optional[str], optional
+    """
+    nc = NodeConfig()
+
+    if input_config_filename is None:
+        root = nc.get_current_config_root()
+    else:
+        root = nc.get_current_config_root(input_config_filename)
+
+    try:
+        rev_node = root.find('./ConfigRevision')
+        cur_revision = int(rev_node.text) + 1
+        rev_node.text = str(cur_revision)
+        root.find('./ClusterManager').text = str(
+            nc.get_module_net_address(root=root, module_id=1)
+        )
+    except Exception:
+        logging.error(
+            'Caught exception while updating MCS config revision cluster '
+            'manager tag, will not write new config',
+            exc_info=True
+        )
+    else:
+        if output_config_filename is None:
+            nc.write_config(root)
+        else:
+            nc.write_config(root, filename=output_config_filename)
+
+
+def get_config_parser(
+    config_filepath: str = CMAPI_CONF_PATH
+) -> configparser.ConfigParser:
+    """Get config parser from cmapi server ini config file.
+
+    :param config_filepath: cmapi server conf path,
+        defaults to CMAPI_CONF_PATH
+    :type config_filepath: str, optional
+    :return: config parser
+    :rtype: configparser.ConfigParser
+    """
+    cfg_parser = configparser.ConfigParser()
+    try:
+        with open(config_filepath, 'r', encoding='utf-8') as cfg_file:
+            cfg_parser.read_file(cfg_file)
+    except PermissionError:
+        # TODO: looks like it's useless here, because the config is created
+        #       from the default one on cmapi server startup.
+        #       Anyway, it probably should raise an error and then
+        #       return a 500 error.
+        logging.error(
+            'CMAPI cannot create configuration file. '
+            'API key stored in memory only.',
+            exc_info=True
+        )
+    return cfg_parser
+
+
+def save_cmapi_conf_file(cfg_parser, config_filepath: str = CMAPI_CONF_PATH):
+    """Save config file from config parser.
+
+    :param cfg_parser: config parser to save
+    :type cfg_parser: configparser.ConfigParser
+    :param config_filepath: cmapi config filepath, defaults to CMAPI_CONF_PATH
+    :type config_filepath: str, optional
+    """
+    try:
+        with open(config_filepath, 'w', encoding='utf-8') as cfg_file:
+            cfg_parser.write(cfg_file)
+    except PermissionError:
+        logging.error(
+            'CMAPI cannot save configuration file due to permissions. '
+            'Some values still can be stored in memory.',
+            exc_info=True
+        )
+
+
+def get_active_nodes(config: str = DEFAULT_MCS_CONF_PATH) -> list:
+    """Get active nodes from Columnstore.xml.
+
+    Note: these are only the names/addresses under which the nodes
+    were added.
+
+    :param config: xml config path, defaults to DEFAULT_MCS_CONF_PATH
+    :type config: str, optional
+    :return: active nodes
+    :rtype: list
+    """
+    nc = NodeConfig()
+    root = nc.get_current_config_root(config, upgrade=False)
+    nodes = root.findall('./ActiveNodes/Node')
+    return [node.text for node in nodes]
+
+
+def get_desired_nodes(config=DEFAULT_MCS_CONF_PATH):
+    nc = NodeConfig()
+    root = nc.get_current_config_root(config, upgrade=False)
+    nodes = root.findall("./DesiredNodes/Node")
+    return [node.text for node in nodes]
+
+
+def in_maintenance_state(config=DEFAULT_MCS_CONF_PATH):
+    nc = NodeConfig()
+    root = nc.get_current_config_root(config, upgrade=False)
+    raw_state = root.find('./Maintenance')
+    # False if no Maintenance tag is found in the xml config
+    state = False
+    if raw_state is not None:
+        # BoolElement is True for the string "true", otherwise False
+        state = lxml.objectify.BoolElement(raw_state.text)
+    return state
+
+
+def get_current_key(config_parser):
+    """Get API key for cmapi server endpoints from ini config.
+
+    :param config_parser: config parser
+    :type config_parser: configparser.ConfigParser
+    :return: api key
+    :rtype: str
+    """
+    # ConfigParser reads the value as is, e.g. including quotes
+    return config_parser.get('Authentication', 'x-api-key', fallback='')
+
+
+def get_version():
+    from cmapi_server.controllers.dispatcher import _version
+    return _version
+
+
+def get_dbroots(node, config=DEFAULT_MCS_CONF_PATH):
+    # TODO: somehow duplicated with NodeConfig.get_all_dbroots?
+    nc = NodeConfig()
+    root = nc.get_current_config_root(config)
+    dbroots = []
+    smc_node = root.find('./SystemModuleConfig')
+    mod_count = int(smc_node.find('./ModuleCount3').text)
+    for i in range(1, mod_count + 1):
+        ip_addr = smc_node.find(f'./ModuleIPAddr{i}-1-3').text
+        hostname = smc_node.find(f'./ModuleHostName{i}-1-3').text
+        node_fqdn = socket.gethostbyaddr(hostname)[0]
+
+        if node in LOCALHOSTS and hostname != 'localhost':
+            node = socket.gethostbyaddr(socket.gethostname())[0]
+        elif node not in LOCALHOSTS and hostname == 'localhost':
+            # hostname will only be localhost in a one-node cluster
+            hostname = socket.gethostbyaddr(socket.gethostname())[0]
+
+        if node == ip_addr or node == hostname or node == node_fqdn:
+            for j in range(
+                1, int(smc_node.find(f"./ModuleDBRootCount{i}-3").text) + 1
+            ):
+                dbroots.append(
+                    smc_node.find(f"./ModuleDBRootID{i}-{j}-3").text
+                )
+    return dbroots
+
+
+def get_current_config_file(
+    config_filename=DEFAULT_MCS_CONF_PATH,
+    cmapi_config_filename=CMAPI_CONF_PATH
+):
+    """Start a transaction on all DesiredNodes, which are all optional.
+
+    - the transaction prevents config changes from being made at the same time
+    - get the config from each node
+    - discard config files for different clusters
+    - call put_config on the config file with the highest revision number found
+    - end the transaction
+    """
+    logging.info('get_current_config_file(): seeking the current config file')
+
+    cfg_parser = get_config_parser(cmapi_config_filename)
+    key = get_current_key(cfg_parser)
+    nc = NodeConfig()
+    root = nc.get_current_config_root(config_filename=config_filename)
+    # TODO: here we get a set of ip addresses of DesiredNodes,
+    #       but after that we convert them to a list and send them as
+    #       the optional_nodes argument to start_transaction().
+    #       So we need to work with the data type of nodes.
+    desired_nodes = {
+        node.text for node in root.findall('./DesiredNodes/Node')
+    }
+    if len(desired_nodes) <= 1:
+        return True
+
+    current_rev = int(root.find('ConfigRevision').text)
+    cluster_name = root.find('ClusterName').text
+    highest_rev = current_rev
+    highest_node = 'localhost'
+    highest_config = nc.to_string(root)
+
+    # TODO: data type of optional_nodes set -> list
+    #       Need to work with it inside and outside of start_transaction
+    (success, txn_id, nodes) = start_transaction(
+        cs_config_filename=config_filename,
+        optional_nodes=list(desired_nodes)
+    )
+    localhost_aliases = set(nc.get_network_addresses_and_names())
+    other_nodes = set(nodes) - localhost_aliases
+    if not success or len(other_nodes) == 0:
+        if success:
+            commit_transaction(txn_id, nodes=nodes)
+        return False
+
+    nodes_in_same_cluster = 0
+    for node in nodes:
+        if node in localhost_aliases:
+            continue
+
+        headers = {'x-api-key': key}
+        url = f'https://{node}:8640/cmapi/{get_version()}/node/config'
+        try:
+            r = requests.get(url, verify=False, headers=headers, timeout=5)
+            r.raise_for_status()
+            config = r.json()['config']
+        except Exception as e:
+            logging.warning(
+                'get_current_config_file(): got an error fetching the '
+                f'config file from {node}: {str(e)}'
+            )
+            continue
+        tmp_root = nc.get_root_from_string(config)
+        name_node = tmp_root.find('ClusterName')
+        if name_node is None or name_node.text != cluster_name:
+            continue
+        nodes_in_same_cluster += 1
+        rev_node = tmp_root.find('ConfigRevision')
+        if rev_node is None or int(rev_node.text) <= highest_rev:
+            continue
+        highest_rev = int(rev_node.text)
+        highest_config = config
+        highest_node = node
+
+    nc.apply_config(config_filename=config_filename, xml_string=highest_config)
+    # TODO: do we need to restart the node here?
+    commit_transaction(txn_id, cs_config_filename=config_filename, nodes=nodes)
+
+    # TODO: we might want stronger criteria for a large cluster.
+    # Right now we want to reach at least one other node
+    # (if there is another node)
+    if len(desired_nodes) > 1 and nodes_in_same_cluster < 1:
+        logging.error(
+            'get_current_config_file(): failed to contact enough nodes '
+            f'in my cluster ({cluster_name}) to reliably retrieve a current '
+            'configuration file. Manual intervention may be required.'
+        )
+        # TODO: additional error handling.
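+        # Stop this node's MCS processes so it does not keep serving
+        # queries with a potentially stale configuration.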
+ try: + MCSProcessManager.stop_node(is_primary=nc.is_primary_node()) + except CMAPIBasicError as err: + logging.error(err.message) + return False + + if highest_rev != current_rev: + logging.info( + 'get_current_config_file(): Accepted the config file from' + f' {highest_node}' + ) + else: + logging.info( + 'get_current_config_file(): This node has the current config file' + ) + return True + + +def wait_for_deactivation_or_put_config( + config_mtime, config_filename=DEFAULT_MCS_CONF_PATH +): + ''' + if a multi-node cluster... + Wait for either a put_config operation (as determined by monitoring the mtime of config_filename), + or wait for this node to be removed from active_nodes, + or wait for a period long enough for this to be considered a 'long' outage (30s right now, as determined + by the failover code. TODO: make that time period configurable... + + Activating failover after one of these three events should allow this node to join the cluster either as part + of the failover behavior, or as part of the cluster-wide start cmd. + ''' + + my_names = set(NodeConfig().get_network_addresses_and_names()) + desired_nodes = get_desired_nodes(config_filename) + if len(desired_nodes) == 1 and desired_nodes[0] in my_names: + logging.info("wait_for_deactivation_or_put_config: Single-node cluster, safe to continue") + return + + final_time = datetime.datetime.now() + datetime.timedelta(seconds = 40) + while config_mtime == os.path.getmtime(config_filename) and \ + len(my_names.intersection(set(get_active_nodes(config_filename)))) > 0 and \ + datetime.datetime.now() < final_time: + logging.info("wait_for_deactivation_or_put_config: Waiting...") + time.sleep(5) + + if config_mtime != os.path.getmtime(config_filename): + logging.info("wait_for_deactivation_or_put_config: A new config was received, safe to continue.") + elif len(my_names.intersection(set(get_active_nodes(config_filename)))) == 0: + logging.info("wait_for_deactivation_or_put_config: Was removed from the cluster, safe to continue.") + else: + logging.info("wait_for_deactivation_or_put_config: Time limit reached, continuing.") + + +# This isn't used currently. Remove once we decide there is no need for it. +def if_primary_restart( + config_filename=DEFAULT_MCS_CONF_PATH, + cmapi_config_filename=CMAPI_CONF_PATH +): + nc = NodeConfig() + root = nc.get_current_config_root(config_filename = config_filename) + primary_node = root.find("./PrimaryNode").text + + if primary_node not in nc.get_network_addresses_and_names(): + return + + cfg_parser = get_config_parser(cmapi_config_filename) + key = get_current_key(cfg_parser) + headers = { "x-api-key" : key } + body = { "config": config_filename } + + logging.info("if_primary_restart(): restarting the cluster.") + url = f"https://{primary_node}:8640/cmapi/{get_version()}/cluster/start" + endtime = datetime.datetime.now() + datetime.timedelta(seconds = 600) # questionable how long to retry + success = False + while not success and datetime.datetime.now() < endtime: + try: + response = requests.put(url, verify = False, headers = headers, json = body, timeout = 60) + response.raise_for_status() + success = True + except Exception as e: + logging.warning(f"if_primary_restart(): failed to start the cluster, got {str(e)}") + time.sleep(10) + if not success: + logging.error(f"if_primary_restart(): failed to start the cluster. Manual intervention is required.") + + +def get_cej_info(config_root): + """Get CEJ (Cross Engine Join) info. 
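+
+    The section read here has this shape (values illustrative):
+
+        <CrossEngineSupport>
+            <Host>127.0.0.1</Host>
+            <Port>3306</Port>
+            <User>cej_user</User>
+            <Password>encrypted_or_plain</Password>
+        </CrossEngineSupport>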
+
+    Get credentials from the CrossEngineSupport section in Columnstore.xml.
+    Decrypt the CEJ user password if needed.
+
+    :param config_root: config root element from Columnstore.xml file
+    :type config_root: lxml.Element
+    :return: cej_host, cej_port, cej_username, cej_password
+    :rtype: tuple
+    """
+    cej_node = config_root.find('./CrossEngineSupport')
+    cej_host = cej_node.find('Host').text or '127.0.0.1'
+    cej_port = cej_node.find('Port').text or '3306'
+    cej_username = cej_node.find('./User').text
+    cej_password = cej_node.find('./Password').text or ''
+
+    if not cej_username:
+        logging.error(
+            'Columnstore.xml has an empty CrossEngineSupport.User tag'
+        )
+    if not cej_password:
+        logging.warning(
+            'Columnstore.xml has an empty CrossEngineSupport.Password tag'
+        )
+
+    if CEJPasswordHandler.secretsfile_exists():
+        cej_password = CEJPasswordHandler.decrypt_password(cej_password)
+
+    return cej_host, cej_port, cej_username, cej_password
+
+
+def system_ready(config_filename=DEFAULT_MCS_CONF_PATH):
+    """Indicates whether the node is ready to accept queries.
+
+    :param config_filename: columnstore xml config filepath,
+        defaults to DEFAULT_MCS_CONF_PATH
+    :type config_filename: str, optional
+    :return: (ready, retry) tuple of 2 booleans
+    :rtype: tuple
+    """
+    nc = NodeConfig()
+    root = nc.get_current_config_root(config_filename)
+    host, port, username, password = get_cej_info(root)
+
+    if username is None:
+        # Second False indicates not to retry inside calling function's
+        # retry loop
+        return False, False
+
+    cmd = (
+        f"/usr/bin/mariadb -h '{host}' "
+        f"-P '{port}' "
+        f"-u '{username}' "
+        f"--password='{password}' "
+        "-sN -e "
+        "\"SELECT mcssystemready();\""
+    )
+
+    ret = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True)
+    if ret.returncode == 0:
+        response = ret.stdout.decode("utf-8").strip()
+        if response == '1':
+            return True, False
+        else:
+            return False, True
+    return False, False
+
+
+def cmapi_config_check(cmapi_conf_path: str = CMAPI_CONF_PATH):
+    """Check if cmapi config file exists and copy default config if not.
+
+    :param cmapi_conf_path: cmapi conf path, defaults to CMAPI_CONF_PATH
+    :type cmapi_conf_path: str, optional
+    """
+    if not os.path.exists(cmapi_conf_path):
+        logging.info(
+            f'No config file found at "{cmapi_conf_path}". '
+            f'Copying the default config from {CMAPI_DEFAULT_CONF_PATH}.'
+        )
+        copyfile(CMAPI_DEFAULT_CONF_PATH, cmapi_conf_path)
+
+
+def dequote(input_str: str) -> str:
+    """Dequote input string.
+
+    If a string has single or double quotes around it, remove them.
+    Make sure the pair of quotes match.
+    If a matching pair of quotes is not found, return the string unchanged.
+
+    :param input_str: input, probably quoted, string
+    :type input_str: str
+    :return: unquoted string
+    :rtype: str
+    """
+    if (
+        len(input_str) >= 2 and
+        input_str[0] == input_str[-1]
+    ) and input_str.startswith(("'", '"')):
+        return input_str[1:-1]
+    return input_str
+
+
+def get_dispatcher_name_and_path(
+    config_parser: configparser.ConfigParser
+) -> Tuple[str, str]:
+    """Get dispatcher name and path from cmapi conf file.
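+
+    Expected conf section (illustrative values; surrounding quotes,
+    if any, are stripped with dequote above):
+
+        [Dispatcher]
+        name = 'systemd'
+        path = ''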
+
+    :param config_parser: cmapi conf file parser
+    :type config_parser: configparser.ConfigParser
+    :return: dispatcher name and path strings
+    :rtype: Tuple[str, str]
+    """
+    dispatcher_name = dequote(
+        config_parser.get('Dispatcher', 'name', fallback='systemd')
+    )
+    # TODO: used only in next releases for the CustomDispatcher class,
+    #       remove if useless
+    dispatcher_path = dequote(
+        config_parser.get('Dispatcher', 'path', fallback='')
+    )
+    return dispatcher_name, dispatcher_path
diff --git a/cmapi/cmapi_server/logging_management.py b/cmapi/cmapi_server/logging_management.py
new file mode 100644
index 000000000..fc1e6b583
--- /dev/null
+++ b/cmapi/cmapi_server/logging_management.py
@@ -0,0 +1,131 @@
+import json
+import logging
+import logging.config
+from functools import partial, partialmethod
+
+import cherrypy
+from cherrypy import _cperror
+
+from cmapi_server.constants import CMAPI_LOG_CONF_PATH
+
+
+class AddIpFilter(logging.Filter):
+    """Filter to add IP address to logging record."""
+    def filter(self, record):
+        record.ip = cherrypy.request.remote.name or cherrypy.request.remote.ip
+        return True
+
+
+def custom_cherrypy_error(
+    self, msg='', context='', severity=logging.INFO, traceback=False
+):
+    """Write the given ``msg`` to the error log [now without hardcoded time].
+
+    This is not just for errors! [looks awful, but kept as close to the
+    stock CherryPy implementation as possible]
+    Applications may call this at any time to log application-specific
+    information.
+
+    If ``traceback`` is True, the traceback of the current exception
+    (if any) will be appended to ``msg``.
+
+    ..Note:
+        All information is logged as in the stock CherryPy method; only
+        the hardcoded timestamp prefix is dropped.
+    """
+    exc_info = None
+    if traceback:
+        exc_info = _cperror._exc_info()
+
+    self.error_log.log(severity, ' '.join((context, msg)), exc_info=exc_info)
+
+
+def dict_config(config_filepath: str):
+    with open(config_filepath, 'r', encoding='utf-8') as json_config:
+        config_dict = json.load(json_config)
+    logging.config.dictConfig(config_dict)
+
+
+def add_logging_level(level_name, level_num, method_name=None):
+    """
+    Comprehensively adds a new logging level to the `logging` module and the
+    currently configured logging class.
+
+    `level_name` becomes an attribute of the `logging` module with the value
+    `level_num`.
+    `method_name` becomes a convenience method for both `logging` itself
+    and the class returned by `logging.getLoggerClass()` (usually just
+    `logging.Logger`).
+    If `method_name` is not specified, `level_name.lower()` is used.
+ + To avoid accidental clobberings of existing attributes, this method will + raise an `AttributeError` if the level name is already an attribute of the + `logging` module or if the method name is already present + + Example + ------- + >>> add_logging_level('TRACE', logging.DEBUG - 5) + >>> logging.getLogger(__name__).setLevel('TRACE') + >>> logging.getLogger(__name__).trace('that worked') + >>> logging.trace('so did this') + >>> logging.TRACE + 5 + + """ + if not method_name: + method_name = level_name.lower() + + if hasattr(logging, level_name): + raise AttributeError(f'{level_name} already defined in logging module') + if hasattr(logging, method_name): + raise AttributeError( + f'{method_name} already defined in logging module' + ) + if hasattr(logging.getLoggerClass(), method_name): + raise AttributeError(f'{method_name} already defined in logger class') + + # This method was inspired by the answers to Stack Overflow post + # http://stackoverflow.com/q/2183233/2988730, especially + # https://stackoverflow.com/a/35804945 + # https://stackoverflow.com/a/55276759 + logging.addLevelName(level_num, level_name) + setattr(logging, level_name, level_num) + setattr( + logging.getLoggerClass(), method_name, + partialmethod(logging.getLoggerClass().log, level_num) + ) + setattr(logging, method_name, partial(logging.log, level_num)) + + +def config_cmapi_server_logging(): + # add custom level TRACE only for develop purposes + # could be activated using API endpoints or cli tool without relaunching + add_logging_level('TRACE', 5) + cherrypy._cplogging.LogManager.error = custom_cherrypy_error + # reconfigure cherrypy.access log message format + # Default access_log_format '{h} {l} {u} {t} "{r}" {s} {b} "{f}" "{a}"' + # h - remote.name or remote.ip, l - "-", + # u - getattr(request, 'login', None) or '-', t - self.time(), + # r - request.request_line, s - status, + # b - dict.get(outheaders, 'Content-Length', '') or '-', + # f - dict.get(inheaders, 'Referer', ''), + # a - dict.get(inheaders, 'User-Agent', ''), + # o - dict.get(inheaders, 'Host', '-'), + # i - request.unique_id, z - LazyRfc3339UtcTime() + cherrypy._cplogging.LogManager.access_log_format = ( + '{h} ACCESS "{r}" code {s}, bytes {b}, user-agent "{a}"' + ) + dict_config(CMAPI_LOG_CONF_PATH) + + +def change_loggers_level(level: str): + """Set level for each custom logger except cherrypy library. + + :param level: logging level to set + :type level: str + """ + loggers = [ + logging.getLogger(name) for name in logging.root.manager.loggerDict + if 'cherrypy' not in name + ] + loggers.append(logging.getLogger()) # add RootLogger + for logger in loggers: + logger.setLevel(level) diff --git a/cmapi/cmapi_server/managers/__init__.py b/cmapi/cmapi_server/managers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/cmapi_server/managers/application.py b/cmapi/cmapi_server/managers/application.py new file mode 100644 index 000000000..d0adeab17 --- /dev/null +++ b/cmapi/cmapi_server/managers/application.py @@ -0,0 +1,29 @@ +import logging +from typing import Optional + +from cmapi_server.constants import VERSION_PATH + + +class AppManager: + started: bool = False + version: Optional[str] = None + + @classmethod + def get_version(cls) -> str: + """Get CMAPI version. 
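+
+        Assumes a VERSION file of ``KEY=value`` lines that are joined into
+        a dotted version string; e.g. (hypothetical keys):
+
+            CMAPI_VERSION_MAJOR=23
+            CMAPI_VERSION_MINOR=1
+            CMAPI_VERSION_PATCH=0
+
+        would yield '23.1.0'.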
+ + :return: cmapi version + :rtype: str + """ + if cls.version: + return cls.version + with open(VERSION_PATH, encoding='utf-8') as version_file: + version = '.'.join([ + i.strip().split('=')[1] + for i in version_file.read().splitlines() if i + ]) + if not version: + logging.error('Couldn\'t detect version from VERSION file!') + version = 'Undefined' + cls.version = version + return cls.version diff --git a/cmapi/cmapi_server/managers/process.py b/cmapi/cmapi_server/managers/process.py new file mode 100644 index 000000000..ef01f57aa --- /dev/null +++ b/cmapi/cmapi_server/managers/process.py @@ -0,0 +1,439 @@ +from __future__ import annotations +import logging +import os.path +import socket +from time import sleep + +import psutil + +from cmapi_server.exceptions import CMAPIBasicError +from cmapi_server.constants import MCS_INSTALL_BIN, ALL_MCS_PROGS +from cmapi_server.process_dispatchers.systemd import SystemdDispatcher +from cmapi_server.process_dispatchers.container import ( + ContainerDispatcher +) +from mcs_node_control.models.dbrm import DBRM +from mcs_node_control.models.dbrm_socket import SOCK_TIMEOUT +from mcs_node_control.models.misc import get_workernodes +from mcs_node_control.models.process import Process + + +PROCESS_DISPATCHERS = { + 'systemd': SystemdDispatcher, + # could be used in docker containers and OSes w/o systemd + 'container': ContainerDispatcher, +} +PRIMARY_PROGS = ('controllernode', 'DMLProc', 'DDLProc') + + +class MCSProcessManager: + """Class to run process operations. + + e.g. re/-start or stop systemd services, run executable. + """ + CONTROLLER_MAX_RETRY = 30 + mcs_progs = {} + mcs_version_info = None + dispatcher_name = None + process_dispatcher = None + + @classmethod + def _get_prog_name(cls, name: str) -> str: + """Get proper service name for systemd or non-systemd installations. + + :param name: service name + :type name: str + :return: correct service name + :rtype: str + """ + if cls.dispatcher_name == 'systemd': + return ALL_MCS_PROGS[name].service_name + return name + + @classmethod + def _get_sorted_progs( + cls, is_primary: bool, reverse: bool = False + ) -> dict: + """Get sorted services dict. + + :param is_primary: is primary node or not + :type is_primary: bool + :param reverse: reverse sort order, defaults to False + :type reverse: bool, optional + :return: dict with sorted services in correct start/stop order + :rtype: dict + """ + unsorted_progs: dict + if is_primary: + unsorted_progs = cls.mcs_progs + else: + unsorted_progs = { + prog_name: prog_info + for prog_name, prog_info in cls.mcs_progs.items() + if prog_name not in PRIMARY_PROGS + } + if reverse: + # stop sequence builds using stop_priority property + return dict( + sorted( + unsorted_progs.items(), + key=lambda item: item[1].stop_priority, + ) + ) + # start up sequence is a dict default sequence + return unsorted_progs + + @classmethod + def _detect_processes(cls) -> None: + """Detect existing mcs services. Depends on MCS version.""" + if cls.mcs_progs: + logging.warning('Mcs ProcessHandler already detected processes.') + + for prog_name, prog_info in ALL_MCS_PROGS.items(): + if os.path.exists(os.path.join(MCS_INSTALL_BIN, prog_name)): + cls.mcs_progs[prog_name] = prog_info + + @classmethod + def detect(cls, dispatcher_name: str, dispatcher_path: str = None) -> None: + """Detect mcs version info and installed processes. 
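+
+        Version detection heuristic (see the checks below): 8 detected
+        binaries means '6.4.x and lower'; 7 binaries without ExeMgr means
+        '22.08.x and higher'; any other combination is 'Undefined'.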
+
+        :param dispatcher_name: process dispatcher name
+        :type dispatcher_name: str
+        :param dispatcher_path: path to custom dispatcher,
+            for next releases, defaults to None
+        :type dispatcher_path: str, optional
+        :raises CMAPIBasicError: if the custom dispatcher path doesn't exist
+        :raises CMAPIBasicError: if a custom dispatcher is requested
+            (not implemented yet)
+        """
+        cls._detect_processes()
+        # detect mcs version info by processes
+        if len(cls.mcs_progs) == 8:
+            cls.mcs_version_info = '6.4.x and lower'
+        elif len(cls.mcs_progs) == 7 and 'ExeMgr' not in cls.mcs_progs:
+            cls.mcs_version_info = '22.08.x and higher'
+        else:
+            cls.mcs_version_info = 'Undefined'
+            logging.warning(
+                'MCS version hasn\'t been detected properly. '
+                'Please try to update your CMAPI version or contact support.'
+            )
+        logging.info(
+            f'Detected {len(cls.mcs_progs)} MCS services. '
+            f'MCS version is {cls.mcs_version_info}'
+        )
+        # TODO: For next releases. Do we really need custom dispatchers?
+        if dispatcher_name not in PROCESS_DISPATCHERS:
+            logging.warning(
+                f'Custom process dispatcher with name "{dispatcher_name}" '
+                f'and path "{dispatcher_path}" used.'
+            )
+            if not dispatcher_path or not os.path.exists(dispatcher_path):
+                err_msg = 'Wrong dispatcher path in cmapi_config file.'
+                logging.error(err_msg)
+                raise CMAPIBasicError(err_msg)
+            cls.dispatcher_name = 'custom'
+            raise CMAPIBasicError('Custom dispatchers are not implemented yet!')
+
+        cls.dispatcher_name = dispatcher_name
+        cls.process_dispatcher = PROCESS_DISPATCHERS[dispatcher_name]
+        cls.process_dispatcher.init()
+
+    @classmethod
+    def _wait_for_workernodes(cls) -> bool:
+        """Wait for workernode processes.
+
+        Waiting for all workernodes to come up before starting
+        controllernode on a primary.
+
+        :return: True on success
+        :rtype: bool
+        """
+        logging.debug(
+            'Waiting for all workernodes to come up before starting '
+            'controllernode on a primary.'
+        )
+        workernodes = get_workernodes()
+        attempts = cls.CONTROLLER_MAX_RETRY
+        while attempts > 0 and len(workernodes) > 0:
+            logging.debug(f'Waiting for "{list(workernodes)}"....{attempts}')
+            # creating a separated list with workernode names
+            # for safe deleting items from source dict
+            for name in list(workernodes):
+                try:
+                    sock = socket.socket(
+                        socket.AF_INET, socket.SOCK_STREAM
+                    )
+                    sock.settimeout(SOCK_TIMEOUT)
+                    sock.connect(
+                        (
+                            workernodes[name]['IPAddr'],
+                            workernodes[name]['Port']
+                        )
+                    )
+                except socket.timeout:
+                    logging.debug(
+                        f'"{name}" {workernodes[name]["IPAddr"]}:'
+                        f'{workernodes[name]["Port"]} not started yet.'
+                    )
+                else:
+                    # delete started workernode from workernodes dict
+                    del workernodes[name]
+                finally:
+                    sock.close()
+            attempts -= 1
+
+        if workernodes:
+            logging.error(
+                f'Some workernodes: "{workernodes}" are not reachable after '
+                f'{cls.CONTROLLER_MAX_RETRY} attempts to connect with '
+                f'{SOCK_TIMEOUT} seconds timeout. '
+                'Starting mcs-controllernode anyway.'
+            )
+            return False
+        return True
+
+    @classmethod
+    def _wait_for_controllernode(cls) -> bool:
+        """Wait for controllernode to come up on a primary.
+
+        :return: True on success
+        :rtype: bool
+        """
+        logging.debug(
+            'Waiting for controllernode to come up before starting '
+            'ddlproc/dmlproc on non-primary nodes.'
+        )
+        attempts = cls.CONTROLLER_MAX_RETRY
+        success = False
+        while attempts > 0:
+            try:
+                with DBRM():
+                    # check connection
+                    success = True
+            except (ConnectionRefusedError, RuntimeError, socket.error):
+                logging.info(
+                    'Cannot establish connection to controllernode. '
+                    f'Controller node still not started. Waiting...{attempts}'
+                )
+            else:
+                break
+            attempts -= 1
+
+        if not success:
+            logging.error(
+                'Controllernode is not reachable after '
+                f'{cls.CONTROLLER_MAX_RETRY} attempts to connect with '
+                f'{SOCK_TIMEOUT} seconds timeout. '
+                'Starting mcs-dmlproc/mcs-ddlproc anyway.'
+            )
+            return False
+        return True
+
+    @classmethod
+    def _wait_for_DMLProc_stop(cls, timeout: int = 10) -> bool:
+        """Wait for the DMLProc process to stop.
+
+        :param timeout: timeout to wait, defaults to 10
+        :type timeout: int, optional
+        :return: True on success
+        :rtype: bool
+        """
+        logging.info(f'Waiting for DMLProc to stop in {timeout} seconds')
+        dmlproc_stopped = False
+        while timeout > 0:
+            logging.info(
+                f'Waiting for DMLProc to stop. Seconds left {timeout}.'
+            )
+            if not Process.check_process_alive('DMLProc'):
+                logging.info('DMLProc gracefully stopped by DBRM command.')
+                dmlproc_stopped = True
+                break
+            sleep(1)
+            timeout -= 1
+        else:
+            logging.error(
+                'DMLProc did not stop gracefully by DBRM command in time. '
+                'Will be stopped directly.'
+            )
+        return dmlproc_stopped
+
+    @classmethod
+    def noop(cls, *args, **kwargs):
+        """No operation. TODO: looks useless."""
+        cls.process_dispatcher.noop()
+
+    @classmethod
+    def start(cls, name: str, is_primary: bool, use_sudo: bool) -> bool:
+        """Start mcs process.
+
+        :param name: mcs process name
+        :type name: str
+        :param is_primary: is node primary or not
+        :type is_primary: bool
+        :param use_sudo: use sudo or not
+        :type use_sudo: bool
+        :return: True if process started successfully
+        :rtype: bool
+        """
+        return cls.process_dispatcher.start(
+            cls._get_prog_name(name), is_primary, use_sudo
+        )
+
+    @classmethod
+    def stop(
+        cls, name: str, is_primary: bool, use_sudo: bool, timeout: int = 10
+    ) -> bool:
+        """Stop mcs process.
+
+        :param name: mcs process name
+        :type name: str
+        :param is_primary: is node primary or not
+        :type is_primary: bool
+        :param use_sudo: use sudo or not
+        :type use_sudo: bool
+        :param timeout: timeout in seconds for DMLProc to stop gracefully
+            using DBRM
+        :type timeout: int
+        :return: True if process stopped successfully
+        :rtype: bool
+        """
+        # TODO: do we need a force stop of DMLProc as a method argument here?
+
+        if is_primary and name == 'DMLProc':
+            logging.info(
+                'Trying to gracefully stop DMLProc using DBRM commands.'
+            )
+            try:
+                with DBRM() as dbrm:
+                    dbrm.set_system_state(
+                        ['SS_ROLLBACK', 'SS_SHUTDOWN_PENDING']
+                    )
+            except (ConnectionRefusedError, RuntimeError):
+                logging.error(
+                    'Cannot set SS_ROLLBACK and SS_SHUTDOWN_PENDING '
+                    'using DBRM while trying to gracefully auto stop '
+                    'DMLProc. Continue with a regular stop method.'
+                )
+                # stop DMLProc using regular signals or systemd
+                return cls.process_dispatcher.stop(
+                    cls._get_prog_name(name), is_primary, use_sudo
+                )
+            # DMLProc gracefully stopped using DBRM commands, otherwise
+            # continue with a regular stop method
+            if cls._wait_for_DMLProc_stop(timeout):
+                return True
+        return cls.process_dispatcher.stop(
+            cls._get_prog_name(name), is_primary, use_sudo
+        )
+
+    @classmethod
+    def restart(cls, name: str, is_primary: bool, use_sudo: bool) -> bool:
+        """Restart mcs process.
+
+        :param name: mcs process name
+        :type name: str
+        :param is_primary: is node primary or not
+        :type is_primary: bool
+        :param use_sudo: use sudo or not
+        :type use_sudo: bool
+        :return: True if process restarted successfully
+        :rtype: bool
+        """
+        return cls.process_dispatcher.restart(
+            cls._get_prog_name(name), is_primary, use_sudo
+        )
+
+    @classmethod
+    def get_running_mcs_procs(cls) -> list[dict]:
+        """Search for mcs processes.
+
+        The method returns PIDs of MCS services in both container and
+        systemd environments.
+
+        :return: list of dicts with name and pid of mcs process
+        :rtype: list[dict]
+        """
+        return [
+            {'name': proc.name(), 'pid': proc.pid}
+            for proc in psutil.process_iter(['pid', 'name'])
+            if proc.name() in cls.mcs_progs
+        ]
+
+    @classmethod
+    def is_node_processes_ok(
+        cls, is_primary: bool, node_stopped: bool
+    ) -> bool:
+        """Check whether the expected processes exist or not.
+
+        :param is_primary: is node primary or not
+        :type is_primary: bool
+        :param node_stopped: is node stopped or started
+        :type node_stopped: bool
+        :return: True if the expected set of processes is running, else False
+        :rtype: bool
+
+        ..NOTE: For next releases. Now only used in tests.
+        """
+        running_procs = cls.get_running_mcs_procs()
+        if node_stopped:
+            return len(running_procs) == 0
+        node_progs = cls._get_sorted_progs(is_primary)
+        return set(node_progs) == set(p['name'] for p in running_procs)
+
+    @classmethod
+    def start_node(cls, is_primary: bool, use_sudo: bool = True):
+        """Start mcs node processes.
+
+        :param is_primary: is node primary or not
+        :type is_primary: bool
+        :param use_sudo: use sudo or not, defaults to True
+        :type use_sudo: bool, optional
+        :raises CMAPIBasicError: immediately if one mcs process fails to start
+        """
+        for prog_name in cls._get_sorted_progs(is_primary):
+            if (
+                cls.dispatcher_name == 'systemd'
+                and prog_name == 'StorageManager'
+            ):
+                # TODO: MCOL-5458
+                logging.info(
+                    f'Skip starting {prog_name} with systemd dispatcher.'
+                )
+                continue
+            # TODO: additional error handling
+            if prog_name == 'controllernode':
+                cls._wait_for_workernodes()
+            if prog_name in ('DMLProc', 'DDLProc'):
+                cls._wait_for_controllernode()
+            if not cls.start(prog_name, is_primary, use_sudo):
+                logging.error(f'Process "{prog_name}" not started properly.')
+                raise CMAPIBasicError(f'Error while starting "{prog_name}".')
+
+    @classmethod
+    def stop_node(
+        cls, is_primary: bool, use_sudo: bool = True, timeout: int = 10
+    ):
+        """Stop mcs node processes.
+
+        :param is_primary: is node primary or not
+        :type is_primary: bool
+        :param use_sudo: use sudo or not, defaults to True
+        :type use_sudo: bool, optional
+        :param timeout: timeout in seconds for DMLProc to stop gracefully
+            using DBRM
+        :type timeout: int
+        :raises CMAPIBasicError: immediately if one mcs process fails to stop
+        """
+        # Always try to stop every process, no matter whether this node is
+        # a primary or a replica, so use the full list of processes.
+        # Otherwise it could cause undefined behaviour when the primary
+        # goes down and then recovers (failover triggered twice).
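+        # _get_sorted_progs(True) returns the full program list, and
+        # reverse=True yields the stop order (sorted by stop_priority).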
+        for prog_name in cls._get_sorted_progs(True, reverse=True):
+            if not cls.stop(prog_name, is_primary, use_sudo):
+                logging.error(f'Process "{prog_name}" not stopped properly.')
+                raise CMAPIBasicError(f'Error while stopping "{prog_name}"')
+
+    @classmethod
+    def restart_node(cls, is_primary: bool, use_sudo: bool):
+        """TODO: For next releases."""
+        if cls.get_running_mcs_procs():
+            cls.stop_node(is_primary, use_sudo)
+        cls.start_node(is_primary, use_sudo)
diff --git a/cmapi/cmapi_server/node_manipulation.py b/cmapi/cmapi_server/node_manipulation.py
new file mode 100644
index 000000000..d4e9240ed
--- /dev/null
+++ b/cmapi/cmapi_server/node_manipulation.py
@@ -0,0 +1,1124 @@
+'''
+This file contains functions to manipulate the CS config file to add a node,
+remove a node, etc. Should be synchronized externally.
+'''
+
+import datetime
+import logging
+import os
+import shutil
+import socket
+import subprocess
+import time
+from typing import Optional
+
+import requests
+from lxml import etree
+
+from cmapi_server import helpers
+from cmapi_server.constants import (
+    CMAPI_CONF_PATH, CMAPI_SINGLE_NODE_XML, DEFAULT_MCS_CONF_PATH, LOCALHOSTS,
+    MCS_DATA_PATH,
+)
+from mcs_node_control.models.node_config import NodeConfig
+
+
+PMS_NODE_PORT = '8620'
+EXEMGR_NODE_PORT = '8601'
+
+
+# TODO: add some description of the public interfaces...
+#       Split this file or change the structure and move functions
+#       out of here.
+
+
+def switch_node_maintenance(
+    maintenance_state: bool,
+    input_config_filename: str = DEFAULT_MCS_CONF_PATH,
+    output_config_filename: str = DEFAULT_MCS_CONF_PATH,
+):
+    """Change maintenance flag value in Columnstore xml config file.
+
+    :param maintenance_state: state of maintenance flag
+    :type maintenance_state: bool
+    :param input_config_filename: mcs input config path,
+        defaults to DEFAULT_MCS_CONF_PATH
+    :type input_config_filename: str, optional
+    :param output_config_filename: mcs output config path,
+        defaults to DEFAULT_MCS_CONF_PATH
+    :type output_config_filename: str, optional
+    """
+    node_config = NodeConfig()
+    config_root = node_config.get_current_config_root(input_config_filename)
+    maintenance_element = config_root.find('Maintenance')
+    # should be done in upgrade_from_v0, but it's better to double-check
+    if maintenance_element is None:
+        maintenance_element = etree.SubElement(config_root, 'Maintenance')
+    maintenance_element.text = str(maintenance_state).lower()
+    node_config.write_config(config_root, filename=output_config_filename)
+    # TODO: probably move publishing to the cherrypy.engine failover
+    #       channel here?
+
+
+def add_node(
+    node: str, input_config_filename: str = DEFAULT_MCS_CONF_PATH,
+    output_config_filename: Optional[str] = None,
+    rebalance_dbroots: bool = True
+):
+    """Add node to a cluster.
+
+    Check whether or not '127.0.0.1' or 'localhost' are in the config file,
+    and if so, replace those instances with this node's external hostname.
+    Do we need to detect IP addresses given as node, and use the hostname?
+    - if we're always using hostnames or always using addrs everywhere
+      it won't matter
+    Add to the PMS section
+    Add an ExeMgr section
+    Add the DBRM workers
+    Add the writeengineservers
+    Add "Module*" keys
+    Move DBRoots (moved to a separate function)
+    Update CEJ to point to ExeMgr1 (for now)
+    Update the list of active nodes
+
+    :param node: node address or hostname
+    :type node: str
+    :param input_config_filename: mcs input config path,
+        defaults to DEFAULT_MCS_CONF_PATH
+    :type input_config_filename: str, optional
+    :param output_config_filename: mcs output config path, defaults to None
+    :type output_config_filename: Optional[str], optional
+    :param rebalance_dbroots: rebalance dbroots or not, defaults to True
+    :type rebalance_dbroots: bool, optional
+    """
+    node_config = NodeConfig()
+    c_root = node_config.get_current_config_root(input_config_filename)
+
+    try:
+        if not _replace_localhost(c_root, node):
+            pm_num = _add_node_to_PMS(c_root, node)
+            _add_WES(c_root, pm_num, node)
+            _add_DBRM_Worker(c_root, node)
+            _add_Module_entries(c_root, node)
+            _add_active_node(c_root, node)
+            _add_node_to_ExeMgrs(c_root, node)
+        if rebalance_dbroots:
+            _rebalance_dbroots(c_root)
+            _move_primary_node(c_root)
+    except Exception:
+        logging.error(
+            'Caught exception while adding node, config file is unchanged',
+            exc_info=True
+        )
+        raise
+    else:
+        if output_config_filename is None:
+            node_config.write_config(c_root)
+        else:
+            node_config.write_config(c_root, filename=output_config_filename)
+
+
+# deactivate_only is a bool that indicates whether the node is being removed
+# completely from the cluster, or whether it has gone offline and should
+# still be monitored in case it comes back.
+# Note! this does not pick a new primary node; use the move_primary_node()
+# fcn to change that.
+def remove_node(
+    node, input_config_filename=DEFAULT_MCS_CONF_PATH,
+    output_config_filename=None, deactivate_only=False,
+    rebalance_dbroots=True, **kwargs
+):
+    node_config = NodeConfig()
+    c_root = node_config.get_current_config_root(input_config_filename)
+
+    '''
+    Rebuild the PMS section w/o node
+    Remove the DBRM_Worker entry
+    Remove the WES entry
+    Rebuild the "Module*" entries w/o node
+    Update the list of active / inactive / desired nodes
+    '''
+
+    try:
+        active_nodes = helpers.get_active_nodes(input_config_filename)
+
+        if len(active_nodes) > 1:
+            pm_num = _remove_node_from_PMS(c_root, node)
+            _remove_WES(c_root, pm_num)
+            _remove_DBRM_Worker(c_root, node)
+            _remove_Module_entries(c_root, node)
+            _remove_from_ExeMgrs(c_root, node)
+
+            if deactivate_only:
+                _deactivate_node(c_root, node)
+            else:
+                # TODO: unspecific name, need to think of a better one
+                _remove_node(c_root, node)
+
+            if rebalance_dbroots:
+                _rebalance_dbroots(c_root)
+                _move_primary_node(c_root)
+        else:
+            # TODO:
+            #   - IMO undefined behaviour here. Removing one single node
+            #     can in some cases produce a single-node cluster.
+            #   - No MCS services are stopped after removing a node.
+            #
+            # reproduce:
+            # add a node by IP, then remove it using localhost: one active
+            # node (127.0.0.1) remains. If the same name is used, there are
+            # no active nodes, but all mcs processes keep working.
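+            # Removing the last active node: fall back to the stock
+            # single-node config shipped with CMAPI instead of editing
+            # the current one.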
+            shutil.copyfile(
+                CMAPI_SINGLE_NODE_XML,
+                output_config_filename
+                if output_config_filename
+                else input_config_filename
+            )
+            return
+
+    except Exception:
+        logging.error(
+            'remove_node(): Caught exception, did not modify the config file',
+            exc_info=True
+        )
+        raise
+    else:
+        if output_config_filename is None:
+            node_config.write_config(c_root)
+        else:
+            node_config.write_config(c_root, filename=output_config_filename)
+
+
+def rebalance_dbroots(
+    input_config_filename: Optional[str] = None,
+    output_config_filename: Optional[str] = None
+) -> None:
+    """Rebalance dbroots between nodes.
+
+    :param input_config_filename: input mcs config path, defaults to None
+    :type input_config_filename: Optional[str], optional
+    :param output_config_filename: output mcs config path, defaults to None
+    :type output_config_filename: Optional[str], optional
+    :raises: Exception if an error happens while rebalancing.
+    """
+    node_config = NodeConfig()
+    if input_config_filename is None:
+        c_root = node_config.get_current_config_root()
+    else:
+        c_root = node_config.get_current_config_root(
+            config_filename=input_config_filename
+        )
+
+    try:
+        _rebalance_dbroots(c_root)
+    except Exception:
+        logging.error(
+            'Caught exception while rebalancing dbroots, did not modify '
+            'the config file.',
+            exc_info=True
+        )
+        raise
+    else:
+        if output_config_filename is None:
+            node_config.write_config(c_root)
+        else:
+            node_config.write_config(c_root, filename=output_config_filename)
+
+
+# all params are optional. If node_id is unset, it will add a dbroot but not
+# attach it to a node. If node_id is set, it will attach the new dbroot to
+# that node. Node_id should be either 'pm1', 'PM1', or '1'. Those three all
+# refer to node 1 as identified by the Module* entries in the config file.
+# TBD whether we need a different identifier for the node. Maybe the
+# hostname instead.
+#
+# returns the id of the new dbroot on success
+# raises an exception on error
+def add_dbroot(
+    input_config_filename=None, output_config_filename=None, host=None
+):
+    node_config = NodeConfig()
+    if input_config_filename is None:
+        c_root = node_config.get_current_config_root()
+    else:
+        c_root = node_config.get_current_config_root(
+            config_filename=input_config_filename
+        )
+
+    try:
+        ret = _add_dbroot(c_root, host)
+    except Exception as e:
+        logging.error(f"add_dbroot(): Caught exception: '{str(e)}', did not modify the config file")
+        raise
+
+    if output_config_filename is None:
+        node_config.write_config(c_root)
+    else:
+        node_config.write_config(c_root, filename=output_config_filename)
+    return ret
+
+
+def move_primary_node(
+    input_config_filename=None, output_config_filename=None, **kwargs
+):
+    node_config = NodeConfig()
+    if input_config_filename is None:
+        c_root = node_config.get_current_config_root()
+    else:
+        c_root = node_config.get_current_config_root(
+            config_filename=input_config_filename
+        )
+
+    try:
+        _move_primary_node(c_root)
+    except Exception:
+        logging.error(
+            'move_primary_node(): did not modify the config file',
+            exc_info=True
+        )
+        raise
+    else:
+        if output_config_filename is None:
+            node_config.write_config(c_root)
+        else:
+            node_config.write_config(c_root, filename=output_config_filename)
+
+
+def find_dbroot1(root):
+    smc_node = root.find("./SystemModuleConfig")
+    pm_count = int(smc_node.find("./ModuleCount3").text)
+    for pm_num in range(1, pm_count + 1):
+        dbroot_count = int(smc_node.find(f"./ModuleDBRootCount{pm_num}-3").text)
+        for dbroot_num in range(1, dbroot_count + 1):
+            dbroot = smc_node.find(f"./ModuleDBRootID{pm_num}-{dbroot_num}-3").text
+            if dbroot == "1":
+                name = smc_node.find(f"ModuleHostName{pm_num}-1-3").text
+                addr = smc_node.find(f"ModuleIPAddr{pm_num}-1-3").text
+                return (name, addr)
+    raise NodeNotFoundException("Could not find dbroot 1 in the list of dbroot assignments!")
+
+
+def _move_primary_node(root):
+    '''
+    Verify new_primary is in the list of active nodes
+
+    Change ExeMgr1
+    Change CEJ
+    Change DMLProc
+    Change DDLProc
+    Change Controllernode
+    Change PrimaryNode
+    '''
+    new_primary = find_dbroot1(root)
+    logging.info(f"_move_primary_node(): dbroot 1 is assigned to {new_primary}")
+    active_nodes = root.findall("./ActiveNodes/Node")
+    found = False
+    for node in active_nodes:
+        if node.text in new_primary:
+            found = True
+            break
+    if not found:
+        raise NodeNotFoundException(f"{new_primary} is not in the list of active nodes")
+
+    root.find("./ExeMgr1/IPAddr").text = new_primary[0]
+    root.find("./DMLProc/IPAddr").text = new_primary[0]
+    root.find("./DDLProc/IPAddr").text = new_primary[0]
+    # keep controllernode as hostname,
+    # b/c using the IP caused trouble on the SkySQL side;
+    # on the other hand, if hostname/fqdn is used, customers with broken
+    # address resolution can get into trouble on their side.
+    # related issues: MCOL-4804, MCOL-4440, MCOL-5017, DBAAS-7442
+    root.find("./DBRM_Controller/IPAddr").text = new_primary[0]
+    root.find("./PrimaryNode").text = new_primary[0]
+
+
+def _add_active_node(root, node):
+    '''
+    if in inactiveNodes, delete it there
+    if not in desiredNodes, add it there
+    if not in activeNodes, add it there
+    '''
+    nodes = root.findall("./DesiredNodes/Node")
+    found = False
+    for n in nodes:
+        if n.text == node:
+            found = True
+    if not found:
+        desired_nodes = root.find("./DesiredNodes")
+        etree.SubElement(desired_nodes, "Node").text = node
+
+    __remove_helper(root.find("./InactiveNodes"), node)
+
active_nodes = root.find("./ActiveNodes") + nodes = active_nodes.findall("./Node") + found = False + for n in nodes: + if n.text == node: + found = True + break + if not found: + etree.SubElement(active_nodes, "Node").text = node + + +def __remove_helper(parent_node, node): + nodes = list(parent_node.findall("./Node")) + for n in nodes: + if n.text == node: + parent_node.remove(n) + + +def _remove_node(root, node): + ''' + remove node from DesiredNodes, InactiveNodes, and ActiveNodes + ''' + + for n in (root.find("./DesiredNodes"), root.find("./InactiveNodes"), root.find("./ActiveNodes")): + __remove_helper(n, node) + + +# This moves a node from ActiveNodes to InactiveNodes +def _deactivate_node(root, node): + __remove_helper(root.find("./ActiveNodes"), node) + inactive_nodes = root.find("./InactiveNodes") + etree.SubElement(inactive_nodes, "Node").text = node + + +def _add_dbroot(root, host) -> int: + """Add a dbroot to the system. + + Attach it to the node identified by host, if host is specified. + Increment the nextdbrootid. + + :param root: xml config root + :type root: xml.etree.ElementTree.Element + :param host: IP address or hostname of the node to attach the new + dbroot to, or None to leave it unattached + :type host: Optional[str] + :raises NodeNotFoundException: if the node is not in the cluster + :return: Added dbroot number + :rtype: int + """ + sysconf_node = root.find('./SystemConfig') + dbroot_count_node = sysconf_node.find('./DBRootCount') + dbroot_count = int(dbroot_count_node.text) + dbroot_count += 1 + dbroot_count_node.text = str(dbroot_count) + + next_dbroot_node = root.find('./NextDBRootId') + next_dbroot_id = int(next_dbroot_node.text) + + # Use DBRoot path from Columnstore.xml + dbroot1_path = sysconf_node.find('./DBRoot1') + if dbroot1_path is not None: + dbroots_path = os.path.dirname(dbroot1_path.text) + else: + dbroots_path = MCS_DATA_PATH + + etree.SubElement( + sysconf_node, f'DBRoot{next_dbroot_id}' + ).text = os.path.join(dbroots_path, f'data{next_dbroot_id}') + + current_dbroot_id = next_dbroot_id + + # find an unused dbroot id from 1-99 + for i in range(1, 100): + if sysconf_node.find(f'./DBRoot{i}') is None: + next_dbroot_id = i + break + next_dbroot_node.text = str(next_dbroot_id) + + if host is None: + return current_dbroot_id + + # Attach it to the specified node + + # get the existing dbroot info for pm X + smc_node = root.find('./SystemModuleConfig') + + # find the node id we're trying to add to + mod_count = int(smc_node.find('./ModuleCount3').text) + node_id = 0 + for i in range(1, mod_count + 1): + ip_addr = smc_node.find(f'./ModuleIPAddr{i}-1-3').text + hostname = smc_node.find(f'./ModuleHostName{i}-1-3').text + if host == ip_addr or host == hostname: + node_id = i + break + if node_id == 0: + raise NodeNotFoundException( + f'Host {host} is not currently part of the cluster' + ) + + dbroot_count_node = smc_node.find(f'./ModuleDBRootCount{node_id}-3') + dbroot_count = int(dbroot_count_node.text) + dbroot_count += 1 + etree.SubElement( + smc_node, f'ModuleDBRootID{node_id}-{dbroot_count}-3' + ).text = str(current_dbroot_id) + dbroot_count_node.text = str(dbroot_count) + return current_dbroot_id + + +# check if this node is the master; maxscale might have chosen a new master, +# so use the CEJ user to check via mariadb whether the node has slave connections +def is_master(): + node_config = NodeConfig() + root = node_config.get_current_config_root() + host, port, username, password = helpers.get_cej_info(root) + + if username is None: + return False + + cmd = ( + f"/usr/bin/mariadb -h '{host}' " + f"-P '{port}' " + f"-u '{username}' " + f"--password='{password}' " + "-sN -e " + "\"SELECT
COUNT(1) AS slave_threads " + "FROM information_schema.PROCESSLIST " + "WHERE USER = 'system user' " + "AND COMMAND LIKE 'Slave%';\"" + ) + + ret = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True) + if ret.returncode == 0: + response = ret.stdout.decode("utf-8").strip() + # Primary will have no slave_threads + if response == '0': + return True + else: + return False + return None + + +def unassign_dbroot1(root): + smc_node = root.find("./SystemModuleConfig") + pm_count = int(smc_node.find("./ModuleCount3").text) + owner_id = 0 + for i in range(1, pm_count + 1): + dbroot_count_node = smc_node.find(f"./ModuleDBRootCount{i}-3") + dbroot_count = int(dbroot_count_node.text) + dbroot_list = [] + for j in range(1, dbroot_count + 1): + dbroot = smc_node.find(f"./ModuleDBRootID{i}-{j}-3").text + if dbroot == "1": + owner_id = i # this node has dbroot 1 + else: + dbroot_list.append(dbroot) # the dbroot assignments to keep + if owner_id != 0: + break + if owner_id == 0: + return # dbroot 1 is already unassigned (primary node must have gone down) + + # remove the dbroot entries for node owner_id + for i in range(1, dbroot_count + 1): + doomed_node = smc_node.find(f"./ModuleDBRootID{owner_id}-{i}-3") + smc_node.remove(doomed_node) + # create the new dbroot entries + dbroot_count_node.text = str(len(dbroot_list)) + i = 1 + for dbroot in dbroot_list: + etree.SubElement(smc_node, f"ModuleDBRootID{owner_id}-{i}-3").text = dbroot + i += 1 + + +def _rebalance_dbroots(root, test_mode=False): + # TODO: add code to detect whether we are using shared storage or not. If not, exit + # without doing anything. + + ''' + this will be a pita + identify unassigned dbroots + assign those to the node with the fewest dbroots + + then, + identify the nodes with the most dbroots and the least dbroots + when most - least <= 1, we're done + else, move a dbroot from the node with the most to the one with the least + + Not going to try to be clever about the alg. We're dealing with small lists. + Aiming for simplicity and comprehensibility. + ''' + + ''' + Borderline hack here. We are going to remove dbroot1 from its current host so that + it will always look for the current replication master and always resolve the discrepancy + between what maxscale and cmapi choose for the primary/master node. + + We know of 2 constraints around primary node selection. + 1) dbroot 1 has to be assigned to the primary node b/c controllernode and possibly + other processes try to access data1 directly + 2) The primary node has to be the same as the master replication node chosen by + Maxscale b/c there is a schema sync issue + + Right now the code is doing this because we discovered these restrictions late in the dev + process: + 1) unassign dbroot 1 to force new primary node selection + 2) look for the master repl node + 3) put dbroot 1 on it + 4) look for dbroot 1 + 5) make it the primary node + + Once we are done with the constraint discovery process, we should refactor this. + ''' + unassign_dbroot1(root) + + current_mapping = get_current_dbroot_mapping(root) + sysconf_node = root.find("./SystemConfig") + + # There can be holes in the dbroot numbering, so we can't just scan [1-dbroot_count]. + # Going to scan from 1-99 instead.
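+ # Illustrative walk-through of this pass (hypothetical state): existing + # dbroots [1, 2, 3, 4, 5] with node1=[2, 3, 4, 5] and node2=[]; dbroot 1 is + # unassigned, so it is assigned first (to the emptiest node, at the front of + # its list), then the balancing loop moves dbroot 5 over, ending with + # node1=[2, 3, 4] and node2=[1, 5].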
+ existing_dbroots = [] + for num in range(1, 100): + node = sysconf_node.find(f"./DBRoot{num}") + if node is not None: + existing_dbroots.append(num) + + # assign the unassigned dbroots + unassigned_dbroots = set(existing_dbroots) - set(current_mapping[0]) + + ''' + If dbroot 1 is in the unassigned list, then we need to put it on the node that will be the next + primary node. Need to choose the same node as maxscale here. For now, we will wait until + maxscale does the replication reconfig, then choose the new master. Later, + we will choose the node using the same method that maxscale does to avoid + the need to go through the mariadb client. + + If this process takes too long (currently 30s), then we will assume there is no maxscale, + so this code should choose where dbroot 1 goes itself. + ''' + if 1 in unassigned_dbroots: + logging.info("Waiting for Maxscale to choose the new repl master...") + smc_node = root.find("./SystemModuleConfig") + # Maybe iterate over the list of ModuleHostName tags instead + pm_count = int(smc_node.find("./ModuleCount3").text) + found_master = False + final_time = datetime.datetime.now() + datetime.timedelta(seconds=30) + + # skip this if in test mode + retry = True + while not found_master and datetime.datetime.now() < final_time and not test_mode: + for node_num in range(1, pm_count + 1): + node_ip = smc_node.find(f"./ModuleIPAddr{node_num}-1-3").text + node_name = smc_node.find(f"./ModuleHostName{node_num}-1-3").text + if pm_count == 1: + found_master = True + else: + cfg_parser = helpers.get_config_parser(CMAPI_CONF_PATH) + key = helpers.get_current_key(cfg_parser) + version = helpers.get_version() + headers = {'x-api-key': key} + url = f"https://{node_ip}:8640/cmapi/{version}/node/new_primary" + try: + r = requests.get(url, verify=False, headers=headers, timeout=10) + r.raise_for_status() + r = r.json() + is_primary = r['is_primary'] + if is_primary is None: + # neither True nor False; + # possibly the node is not ready, leave retry as-is + pass + elif is_primary: + found_master = True + except requests.exceptions.Timeout: + # timed out; + # possibly the node is not ready, leave retry as-is + pass + except Exception: + retry = False + + if not found_master: + if not retry: + logging.info("There was an error retrieving the replication master") + break + else: + continue + + # assign dbroot 1 to this node, put it at the front of the list + current_mapping[node_num].insert(0, 1) + unassigned_dbroots.remove(1) + logging.info(f"The new replication master is {node_name}") + break + if not found_master: + logging.info("New repl master has not been chosen yet") + time.sleep(1) + if not found_master: + logging.info("Maxscale has not reconfigured the repl master, continuing...") + + for dbroot in unassigned_dbroots: + (_min, min_index) = _find_min_max_length(current_mapping)[0] + if dbroot != 1: + current_mapping[min_index].append(dbroot) + else: + # make dbroot 1 move only if the new node goes down, by putting it at the front of the list + current_mapping[min_index].insert(0, dbroot) + + # balance the distribution + ((_min, min_index), (_max, max_index)) = _find_min_max_length(current_mapping) + while _max - _min > 1: + current_mapping[min_index].append(current_mapping[max_index].pop(-1)) + ((_min, min_index), (_max, max_index)) = _find_min_max_length(current_mapping) + + # write the new mapping + sysconf_node = root.find("./SystemModuleConfig") + for i in range(1, len(current_mapping)): + dbroot_count_node = sysconf_node.find(f"./ModuleDBRootCount{i}-3") + # delete the
original assignments for node i + for dbroot_num in range(1, int(dbroot_count_node.text) + 1): + old_node = sysconf_node.find(f"./ModuleDBRootID{i}-{dbroot_num}-3") + sysconf_node.remove(old_node) + + # write the new assignments for node i + dbroot_count_node.text = str(len(current_mapping[i])) + for dbroot_num in range(len(current_mapping[i])): + etree.SubElement(sysconf_node, f"ModuleDBRootID{i}-{dbroot_num+1}-3").text = str(current_mapping[i][dbroot_num]) + + +# returns ((min, index-of-min), (max, index-of-max)) +def _find_min_max_length(mappings): + _min = 100 + min_index = -1 + _max = -1 + max_index = -1 + for i in range(1, len(mappings)): + this_len = len(mappings[i]) + if this_len < _min: + _min = this_len + min_index = i + if this_len > _max: + _max = this_len + max_index = i + return ((_min, min_index), (_max, max_index)) + + +# returns a list indexed by node_num, where the value is a list of dbroot ids (ints) +# so, list[1] == [1, 2, 3] would mean that node 1 has dbroots 1, 2, & 3. +# To align the list with node IDs, element 0 is a list with all of the assigned dbroots +def get_current_dbroot_mapping(root): + ''' + get the current node count + iterate over the ModuleDBRootIDX-Y-3 entries to build the mapping + ''' + + smc_node = root.find("./SystemModuleConfig") + node_count = int(smc_node.find("./ModuleCount3").text) + current_mapping = [[]] + + for i in range(1, node_count + 1): + dbroot_count = int(smc_node.find(f"./ModuleDBRootCount{i}-3").text) + dbroots_on_this_node = [] + for dbroot_num in range(1, dbroot_count + 1): + dbroot_id = int(smc_node.find(f"./ModuleDBRootID{i}-{dbroot_num}-3").text) + dbroots_on_this_node.append(dbroot_id) + current_mapping[0].append(dbroot_id) + current_mapping.append(dbroots_on_this_node) + + return current_mapping + + +def _remove_Module_entries(root, node): + ''' + figure out which module_id node is + store info from the other modules + ModuleIPAddr + ModuleHostName + ModuleDBRootCount + ModuleDBRootIDs + delete all of those tags + write new versions + write new ModuleCount3 value + write new NextNodeID + ''' + smc_node = root.find("./SystemModuleConfig") + mod_count_node = smc_node.find("./ModuleCount3") + current_module_count = int(mod_count_node.text) + node_module_id = 0 + + for num in range(1, current_module_count + 1): + m_ip_node = smc_node.find(f"./ModuleIPAddr{num}-1-3") + m_name_node = smc_node.find(f"./ModuleHostName{num}-1-3") + if node == m_ip_node.text or node == m_name_node.text: + node_module_id = num + break + if node_module_id == 0: + logging.warning(f"remove_module_entries(): did not find node {node} in the Module* entries of the config file") + return + + # Get the existing info except for node, remove the existing nodes + new_module_info = [] + for num in range(1, current_module_count + 1): + m_ip_node = smc_node.find(f"./ModuleIPAddr{num}-1-3") + m_name_node = smc_node.find(f"./ModuleHostName{num}-1-3") + dbrc_node = smc_node.find(f"./ModuleDBRootCount{num}-3") + dbr_count = int(dbrc_node.text) + smc_node.remove(dbrc_node) + dbroots = [] + for i in range(1, dbr_count + 1): + dbr_node = smc_node.find(f"./ModuleDBRootID{num}-{i}-3") + dbroots.append(dbr_node.text) + smc_node.remove(dbr_node) + + if node != m_ip_node.text and node != m_name_node.text: + new_module_info.append((m_ip_node.text, m_name_node.text, dbroots)) + + smc_node.remove(m_ip_node) + smc_node.remove(m_name_node) + + # Regenerate these entries + current_module_count = len(new_module_info) + for num in range(1, current_module_count + 1): + (ip, name, 
dbroots) = new_module_info[num - 1] + etree.SubElement(smc_node, f"ModuleIPAddr{num}-1-3").text = ip + etree.SubElement(smc_node, f"ModuleHostName{num}-1-3").text = name + etree.SubElement(smc_node, f"ModuleDBRootCount{num}-3").text = str(len(dbroots)) + for i in range(1, len(dbroots) + 1): + etree.SubElement(smc_node, f"ModuleDBRootID{num}-{i}-3").text = dbroots[i - 1] + + # update NextNodeId and ModuleCount3 + nni_node = root.find("./NextNodeId") + nni_node.text = str(current_module_count + 1) + mod_count_node.text = str(current_module_count) + + +def _remove_WES(root, pm_num): + ''' + Avoid gaps in pm numbering where possible. + Read the existing pmX_WriteEngineServer entries except where X = pm_num, + delete them, + write new entries. + + Not sure yet, but I believe for the dbroot -> PM mapping to work, the node # in the Module + entries has to match the pm # in other fields. They should be written consistently and intact + already, but this is a guess at this point. Short-term, a couple of options: 1) Construct an argument + that they are maintained consistently right now. 2) Add consistency-checking logic, and on a mismatch, + remove all affected sections and reconstruct them with add_node() and add_dbroot(). + + Longer term, make the config file less stupid. Ex: + + <Nodes> + ... + <Node> + <Hostname>hostname</Hostname> + <IPAddr>hostname-or-ipv4</IPAddr> + <DBRoots>1,2,3</DBRoots> + </Node> + ... + </Nodes> + + ^^ The above is all we need to figure out where everything is and what each node should run + ''' + + pm_count = int(root.find("./PrimitiveServers/Count").text) + pms = [] + # This is a bit of a hack. We already decremented the pm count; need to add 2 to this loop instead of 1 + # to scan the full range of these entries [1, pm_count + 2) + for i in range(1, pm_count + 2): + node = root.find(f"./pm{i}_WriteEngineServer") + if node is not None: + if i != pm_num: + pms.append(node.find("./IPAddr").text) + root.remove(node) + + # Write the new entries + for i in range(1, len(pms) + 1): + wes = etree.SubElement(root, f"pm{i}_WriteEngineServer") + etree.SubElement(wes, "IPAddr").text = pms[i - 1] + etree.SubElement(wes, "Port").text = "8630" + + +def _remove_DBRM_Worker(root, node): + ''' + regenerate the DBRM_Worker list without node, + update NumWorkers + ''' + + num = 1 + workers = [] + while True: + w_node = root.find(f"./DBRM_Worker{num}") + if w_node is not None: + addr = w_node.find("./IPAddr").text + if addr != "0.0.0.0" and addr != node: + workers.append(addr) + root.remove(w_node) + else: + break + num += 1 + + for num in range(len(workers)): + w_node = etree.SubElement(root, f"DBRM_Worker{num+1}") + etree.SubElement(w_node, "IPAddr").text = workers[num] + etree.SubElement(w_node, "Port").text = "8700" + root.find("./DBRM_Controller/NumWorkers").text = str(len(workers)) + + +def _remove_from_ExeMgrs(root, node): + """Remove the corresponding ExeMgrX section from the config.""" + num = 1 + ems = [] + # TODO: loop over the node count instead of "while True" + while True: + em_node = root.find(f"./ExeMgr{num}") + if em_node is not None: + addr = em_node.find("./IPAddr").text + if addr != "0.0.0.0" and addr != node: + ems.append(addr) + root.remove(em_node) + else: + break + num += 1 + + for num in range(len(ems)): + em_node = etree.SubElement(root, f"ExeMgr{num+1}") + etree.SubElement(em_node, "IPAddr").text = ems[num] + etree.SubElement(em_node, "Port").text = "8601" + etree.SubElement(em_node, "Module").text = "unassigned" + + +def _remove_node_from_PMS(root, node): + ''' + find the PM number we're removing, + replace existing PMS entries + ''' + connections_per_pm
= int( + root.find("./PrimitiveServers/ConnectionsPerPrimProc").text + ) + count_node = root.find("./PrimitiveServers/Count") + pm_count = int(count_node.text) + + # get current list of PMs to avoid changing existing assignments + pm_list = [] + pm_num = 0 + for num in range(1, pm_count + 1): + addr = root.find(f"./PMS{num}/IPAddr") + if addr.text != node: + pm_list.append(addr.text) + else: + pm_num = num + + if pm_num == 0: + return 0 + + # remove the existing PMS entries + num = 1 + while True: + pmsnode = root.find(f"./PMS{num}") + if pmsnode is not None: + root.remove(pmsnode) + else: + break + num += 1 + + # generate new list + pm_count = len(pm_list) + count_node.text = str(pm_count) + pm_list.append(node) + for num in range(pm_count * connections_per_pm): + pmsnode = etree.SubElement(root, f"PMS{num+1}") + addrnode = etree.SubElement(pmsnode, "IPAddr") + addrnode.text = pm_list[num % pm_count] + portnode = etree.SubElement(pmsnode, "Port") + portnode.text = PMS_NODE_PORT + + return pm_num + +def _add_Module_entries(root, node): + ''' + get new node id + add ModuleIPAddr, ModuleHostName, ModuleDBRootCount (don't set ModuleDBRootID* here) + set ModuleCount3 and NextNodeId + no need to rewrite existing entries for this fcn + ''' + + # XXXPAT: No guarantee these are the values used in the rest of the system. + # This will work best with a simple network configuration where there is 1 IP addr + # and 1 host name for a node. + ip4 = socket.gethostbyname(node) + if ip4 == node: # node is an IP addr + node_name = socket.gethostbyaddr(node)[0] + else: + node_name = node # node is a hostname + + logging.info(f"_add_Module_entries(): using ip address {ip4} and hostname {node_name}") + + smc_node = root.find("./SystemModuleConfig") + mod_count_node = smc_node.find("./ModuleCount3") + nnid_node = root.find("./NextNodeId") + nnid = int(nnid_node.text) + current_module_count = int(mod_count_node.text) + + # look for existing entries and fix them if they exist + for i in range(1, nnid): + ip_node = smc_node.find(f"./ModuleIPAddr{i}-1-3") + name_node = smc_node.find(f"./ModuleHostName{i}-1-3") + # if we find a matching IP address, but it has a different hostname, update the addr + if ip_node is not None and ip_node.text == ip4: + logging.info(f"_add_Module_entries(): found ip address already at ModuleIPAddr{i}-1-3") + hostname = name_node.text + if hostname != node_name: + new_ip_addr = socket.gethostbyname(hostname) + logging.info(f"_add_Module_entries(): hostname doesn't match, updating address to {new_ip_addr}") + ip_node.text = new_ip_addr + else: + logging.info("_add_Module_entries(): no update is necessary") + return + + # if we find a matching hostname, update the ip addr + if name_node is not None and name_node.text == node_name: + logging.info(f"_add_Module_entries(): found existing entry for {node_name}, updating its address to {ip4}") + ip_node.text = ip4 + return + + etree.SubElement(smc_node, f"ModuleIPAddr{nnid}-1-3").text = ip4 + etree.SubElement(smc_node, f"ModuleHostName{nnid}-1-3").text = node_name + etree.SubElement(smc_node, f"ModuleDBRootCount{nnid}-3").text = "0" + mod_count_node.text = str(current_module_count + 1) + nnid_node.text = str(nnid + 1) + + +def _add_WES(root, pm_num, node): + wes_node = etree.SubElement(root, f"pm{pm_num}_WriteEngineServer") + etree.SubElement(wes_node, "IPAddr").text = node + etree.SubElement(wes_node, "Port").text = "8630" + + +def _add_DBRM_Worker(root, node): + ''' + find the highest
numbered DBRM_Worker entry, or one that isn't used at the moment, + prune unused entries, + add this node at the end + ''' + + num = 1 + already_exists = False + while True: + e_node = root.find(f"./DBRM_Worker{num}") + if e_node is None: + break + addr = e_node.find("./IPAddr").text + if addr == "0.0.0.0": + root.remove(e_node) + elif addr == node: + logging.info(f"_add_DBRM_Worker(): node {node} is already a worker node") + already_exists = True + num += 1 + + if already_exists: + return + + num_workers_node = root.find("./DBRM_Controller/NumWorkers") + num_workers = int(num_workers_node.text) + 1 + brm_node = etree.SubElement(root, f"DBRM_Worker{num_workers}") + etree.SubElement(brm_node, "Port").text = "8700" + etree.SubElement(brm_node, "IPAddr").text = node + num_workers_node.text = str(num_workers) + + +def _add_node_to_ExeMgrs(root, node): + """Find the highest numbered ExeMgr entry, add this node at the end.""" + num = 1 + while True: + e_node = root.find(f"./ExeMgr{num}") + if e_node is None: + break + addr = e_node.find("./IPAddr") + if addr.text == node: + logging.info(f"_add_node_to_ExeMgrs(): node {node} already exists") + return + num += 1 + e_node = etree.SubElement(root, f"ExeMgr{num}") + addr_node = etree.SubElement(e_node, "IPAddr") + addr_node.text = node + port_node = etree.SubElement(e_node, "Port") + port_node.text = EXEMGR_NODE_PORT + + +def _add_node_to_PMS(root, node): + ''' + the PMS section is a sequential list of connection descriptions + + For example, if ConnectionsPerPrimProc is 2, and the Count is 2, then + the PMS entries look like this: + + PMS1 = connection 1 of PM 1 + PMS2 = connection 1 of PM 2 + + The easiest way to add a node is probably to generate a whole new list. + ''' + count_node = root.find('./PrimitiveServers/Count') + pm_count = int(count_node.text) + + # get current list of PMs to avoid changing existing assignments + pm_list = {} + new_pm_num = 0 + for num in range(1, pm_count + 1): + addr = root.find(f'./PMS{num}/IPAddr') + if addr.text == node and new_pm_num == 0: + logging.info(f'_add_node_to_PMS(): node {node} already exists') + new_pm_num = num + else: + pm_list[num] = addr.text + + # remove the existing PMS entries + num = 1 + while True: + pmsnode = root.find(f'./PMS{num}') + if pmsnode is not None: + root.remove(pmsnode) + else: + break + num += 1 + + # generate new list + if new_pm_num == 0: + pm_count += 1 + count_node.text = str(pm_count) + pm_list[pm_count] = node + new_pm_num = pm_count + for num in range(pm_count): + pmsnode = etree.SubElement(root, f'PMS{num+1}') + addrnode = etree.SubElement(pmsnode, 'IPAddr') + addrnode.text = pm_list[(num % pm_count) + 1] + portnode = etree.SubElement(pmsnode, 'Port') + portnode.text = PMS_NODE_PORT + + return new_pm_num + +def _replace_localhost(root, node): + # if DBRM_Controller/IPAddr is 127.0.0.1 or localhost, + # then replace all instances, else do nothing. + controller_host = root.find('./DBRM_Controller/IPAddr') + if controller_host.text not in LOCALHOSTS: + logging.debug( + 'Nothing to replace, DBRM_Controller/IPAddr isn\'t localhost.'
+ ) + return False + + # getaddrinfo returns a list of 5-tuples (..., sockaddr); + # use sockaddr to retrieve the ip, sockaddr = (address, port) for AF_INET + ipaddr = socket.getaddrinfo(node, 8640, family=socket.AF_INET)[0][-1][0] + # signifies that node is an IP addr already + if ipaddr == node: + # use the primary hostname if given an ip addr + hostname = socket.gethostbyaddr(ipaddr)[0] + else: + hostname = node # use whatever name they gave us + logging.info( + f'add_node(): replacing 127.0.0.1/localhost with {ipaddr}/{hostname} ' + f'as this node\'s name. Be sure {hostname} resolves to {ipaddr} on ' + 'all other nodes in the cluster.' + ) + + nodes_to_reassign = [ + n for n in root.findall('.//') if n.text in LOCALHOSTS + ] + + # Host field is contained within CrossEngineSupport User and QueryTele. + # Leave these values as default (will be the local IP) + exclude = ['Host'] + for n in nodes_to_reassign: + if 'ModuleIPAddr' in n.tag: + n.text = ipaddr + elif 'ModuleHostName' in n.tag: + n.text = hostname + elif n.tag not in exclude: + # if the tag is neither ip nor hostname, then save it as node + n.text = node + + controller_host.text = hostname # keep controllernode as fqdn + + return True + +# New Exception types +class NodeNotFoundException(Exception): + pass diff --git a/cmapi/cmapi_server/process_dispatchers/__init__.py b/cmapi/cmapi_server/process_dispatchers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/cmapi_server/process_dispatchers/base.py b/cmapi/cmapi_server/process_dispatchers/base.py new file mode 100644 index 000000000..45d10c6e8 --- /dev/null +++ b/cmapi/cmapi_server/process_dispatchers/base.py @@ -0,0 +1,146 @@ +"""Module contains the base process dispatcher class implementation. + +Formally, this is the must-have interface for subclasses. +""" + +import logging +import os +import shlex +import subprocess +from datetime import datetime +from pathlib import Path +from typing import Dict, Optional, TextIO, Tuple + +from cmapi_server.constants import MCS_INSTALL_BIN, MCS_LOG_PATH + + +class BaseDispatcher: + """Class with base interfaces for dispatchers.""" + + @staticmethod + def _create_mcs_process_logfile(filename: str) -> str: + """Create a log file by name. + + :param filename: log filename + :type filename: str + :return: full path of the created log file + :rtype: str + """ + log_fullpath = os.path.join(MCS_LOG_PATH, filename) + Path(log_fullpath).touch(mode=0o666) + return log_fullpath + + @staticmethod + def exec_command( + command: str, daemonize: bool = False, silent: bool = False, + stdout: TextIO = subprocess.PIPE, env: Optional[Dict] = None + ) -> Tuple[bool, str]: + """Run command using subprocess.
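+ + Illustrative usage (hypothetical command; stdout and stderr come + back combined in one string): + + success, out = BaseDispatcher.exec_command('ls /tmp') + if not success: + logging.error(f'command failed: {out}')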
+ + :param command: command to run + :type command: str + :param daemonize: run command in detached mode, defaults to False + :type daemonize: bool, optional + :param silent: prevent error logs on non-zero exit status, + defaults to False + :type silent: bool, optional + :param stdout: stdout argument for Popen, defaults to subprocess.PIPE + :type stdout: TextIO, optional + :param env: environment argument for Popen, defaults to None + :type env: Optional[Dict], optional + :return: tuple with success status and output string from subprocess; + if there are multiple lines in the output, the caller should + split them + :rtype: Tuple[bool, str] + """ + output: str = '' + result: Tuple = (False, output) + try: + proc = subprocess.Popen( + shlex.split(command), + stdout=stdout, + stderr=subprocess.STDOUT, + start_new_session=daemonize, + env=env, + encoding='utf-8' + ) + except Exception: + logging.error(f'Failed to run command "{command}".', exc_info=True) + # TODO: does cmapi have to exit with an exception here + # to stop the docker container? + # raise + return result + if daemonize: + # remove the Popen object; optionally gc.collect() could be invoked. + # this is done to prevent eventually spawning duplicated "defunct" + # (zombie) python-parented processes. This could happen + # previously after a cluster restart. It didn't affect cluster + # condition, it only made the "mcs cluster status" command output + # confusing and ugly. + del proc + result = (True, output) + else: + logging.debug('Waiting for command to finish.') + stdout_str, _ = proc.communicate() + returncode = proc.wait() + if stdout_str is not None: + # output is guaranteed to be an empty string, not None + output = stdout_str + result = (True, output) + if returncode != 0: + if not silent: + logging.error( + f'Calling "{command}" finished with return code: ' + f'"{returncode}" and stderr+stdout "{output}".' + ) + result = (False, output) + return result + + @classmethod + def _run_dbbuilder(cls, use_su=False) -> None: + # attempt to run dbbuilder on the primary node, + # e.g. if s3 was set up after the columnstore install + logging.info('Attempting to run dbbuilder on the primary node') + dbbuilder_path = os.path.join(MCS_INSTALL_BIN, 'dbbuilder') + dbbuilder_arg = '7' + dbb_command = f'{dbbuilder_path} {dbbuilder_arg}' + if use_su: + # TODO: move the mysql user to constants + dbb_command = f'su -s /bin/sh -c "{dbb_command}" mysql' + dbb_log_path = cls._create_mcs_process_logfile('dbbuilder.log') + with open(dbb_log_path, 'a', encoding='utf-8') as dbb_log_fh: + dbb_start_time = datetime.now().strftime('%d/%b/%Y %H:%M:%S') + dbb_log_fh.write(f'-----Started at {dbb_start_time}.-----\n') + # TODO: error handling? + # check if it exists for next releases?
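+ # (note: function code '7' is what the ColumnStore install scripts + # pass to dbbuilder to create the system catalog if it does not + # already exist)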
+ success, _ = cls.exec_command(dbb_command, stdout=dbb_log_fh) + dbb_log_fh.write('-----Finished run.-----\n\n') + + @classmethod + def init(cls): + """Method for dispatcher initialisation.""" + pass + + @classmethod + def is_service_running(cls, service: str, use_sudo: bool) -> bool: + """Check if the process/service is running.""" + raise NotImplementedError + + @classmethod + def start(cls, service: str, is_primary: bool, use_sudo: bool) -> bool: + """Start process/service.""" + raise NotImplementedError + + @classmethod + def stop(cls, service: str, is_primary: bool, use_sudo: bool) -> bool: + """Stop process/service.""" + raise NotImplementedError + + @classmethod + def restart(cls, service: str, is_primary: bool, use_sudo: bool) -> bool: + """Restart process/service.""" + raise NotImplementedError + + @classmethod + def reload(cls, service: str, is_primary: bool, use_sudo: bool) -> bool: + """Reload process/service.""" + raise NotImplementedError diff --git a/cmapi/cmapi_server/process_dispatchers/container.py b/cmapi/cmapi_server/process_dispatchers/container.py new file mode 100644 index 000000000..0aeccae7a --- /dev/null +++ b/cmapi/cmapi_server/process_dispatchers/container.py @@ -0,0 +1,294 @@ +""" +Module contains the non-systemd/container process dispatcher class implementation. +""" + +import logging +import os.path +import re +from pathlib import Path +from time import sleep + +import psutil + +from cmapi_server.constants import ( + IFLAG, LIBJEMALLOC_DEFAULT_PATH, MCS_INSTALL_BIN, ALL_MCS_PROGS +) +from cmapi_server.exceptions import CMAPIBasicError +from cmapi_server.process_dispatchers.base import BaseDispatcher + + +class ContainerDispatcher(BaseDispatcher): + """Manipulates processes in a docker container. + + It can be used in any OS/container environment in cases when + we don't want to use systemd or don't have it. + """ + libjemalloc_path = None + + @staticmethod + def _set_iflag(): + """Create the IFLAG file. + + Signals that the Columnstore container init has finished. + """ + Path(IFLAG).touch() + + @classmethod + def _get_proc_object(cls, name: str) -> psutil.Process: + """Get a psutil Process object by service name. + + :param name: process name + :type name: str + :raises psutil.NoSuchProcess: if no process with such a name is present + :return: Process object with the specified name + :rtype: psutil.Process + + ...TODO: add types-psutil to requirements for mypy checks + """ + for proc in psutil.process_iter(['pid', 'name', 'username']): + if proc.name().lower() == name.lower(): + return proc + raise psutil.NoSuchProcess(pid=None, name=name) + + @classmethod + def get_libjemalloc_path(cls) -> str: + """Get the libjemalloc.so path. + + :raises CMAPIBasicError: raised if ldconfig execution returned non-zero + :raises FileNotFoundError: if no libjemalloc.so.2 found + :return: libjemalloc.so.2 path + :rtype: str + """ + logger = logging.getLogger('container_sh') + if cls.libjemalloc_path: + return cls.libjemalloc_path + # pylint: disable=line-too-long + # for reference: https://github.com/pyinstaller/pyinstaller/blob/f29b577df4e1659cf65aacb797034763308fd298/PyInstaller/depend/utils.py#L304 + + splitlines_count = 1 + pattern = re.compile(r'^\s+(\S+)(\s.*)?
=> (\S+)') + success, result = cls.exec_command('ldconfig -p') + if not success: + raise CMAPIBasicError('Failed executing ldconfig.') + + text = result.strip().splitlines()[splitlines_count:] + + for line in text: + # this assumes library names do not contain whitespace + p_match = pattern.match(line) + # Sanitize away any abnormal lines of output. + if p_match is None: + continue + + lib_path = p_match.groups()[-1] + lib_name = p_match.group(1) + if 'libjemalloc' in lib_name: + # use the first entry + # TODO: do we need path or name here? + # $(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') + cls.libjemalloc_path = lib_path + break + + if not cls.libjemalloc_path: + if not os.path.exists(LIBJEMALLOC_DEFAULT_PATH): + logger.error('No libjemalloc.so.2 found.') + raise FileNotFoundError + cls.libjemalloc_path = LIBJEMALLOC_DEFAULT_PATH + + return cls.libjemalloc_path + + @classmethod + def is_service_running(cls, service: str, use_sudo: bool = True) -> bool: + """Check if mcs process is running. + + :param service: service name + :type service: str + :param use_sudo: interface requirement, unused here, defaults to True + :type use_sudo: bool, optional + :return: True if service is running, otherwise False + :rtype: bool + """ + try: + cls._get_proc_object(service) + except psutil.NoSuchProcess: + return False + return True + + @staticmethod + def _make_cmd(service: str) -> str: + """Make shell command by service name. + + :param service: service name + :type service: str + :return: command with arguments if needed + :rtype: str + """ + service_info = ALL_MCS_PROGS[service] + command = os.path.join(MCS_INSTALL_BIN, service) + + if service_info.subcommand: + subcommand = service_info.subcommand + command = f'{command} {subcommand}' + + return command + + @classmethod + def start( + cls, service: str, is_primary: bool, use_sudo: bool = True + ) -> bool: + """Start process in docker container. + + :param service: process name + :type service: str + :param is_primary: is node primary or not + :type is_primary: bool, optional + :param use_sudo: interface required, unused here, defaults to True + :type use_sudo: bool, optional + :return: True if service started successfully + :rtype: bool + """ + logger = logging.getLogger('container_sh') + if cls.is_service_running(service): + return True + + logger.debug(f'Starting {service}') + env_vars = {"LD_PRELOAD": cls.get_libjemalloc_path()} + command = cls._make_cmd(service) + + if service == 'workernode': + # workernode starts on primary and non primary node with 1 or 2 + # added to the end of argument: + # DBRM_Worker1 - on primary, DBRM_Worker2 - non primary + command = command.format(1 if is_primary else 2) + + # start mcs-loadbrm.py before workernode + logger.debug('Waiting to load BRM.') + loadbrm_path = os.path.join(MCS_INSTALL_BIN, 'mcs-loadbrm.py') + loadbrm_logpath = cls._create_mcs_process_logfile( + 'mcs-loadbrm.log' + ) + with open(loadbrm_logpath, 'a', encoding='utf-8') as loadbrm_logfh: + success, _ = cls.exec_command( + f'{loadbrm_path} no', stdout=loadbrm_logfh, env=env_vars + ) + if not success: + logger.error('Error while loading BRM.') + else: + logger.debug('Successfully loaded BRM.') + + service_log_path = cls._create_mcs_process_logfile( + f'{service.lower()}.log' + ) + success, _ = cls.exec_command( + command, daemonize=True, + stdout=open(service_log_path, 'a', encoding='utf-8'), + env=env_vars + ) + # TODO: any other way to detect service finished its initialisation? 
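+ # A possible alternative to the fixed delay (sketch only, not used + # here): poll for the process instead of sleeping unconditionally, e.g. + # for _ in range(int(ALL_MCS_PROGS[service].delay * 10)): + # if cls.is_service_running(service): + # break + # sleep(0.1)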
+ sleep(ALL_MCS_PROGS[service].delay) + logger.debug(f'Started "{service}".') + + if is_primary and service == 'DDLProc': + cls._run_dbbuilder() + + return cls.is_service_running(service) + + @classmethod + def stop( + cls, service: str, is_primary: bool, use_sudo: bool = True + ) -> bool: + """Stop process in docker container. + + :param service: process name + :type service: str + :param is_primary: is node primary or not + :type is_primary: bool, optional + :param use_sudo: interface requirement, unused here, defaults to True + :type use_sudo: bool, optional + :return: True if service stopped successfully + :rtype: bool + """ + logger = logging.getLogger('container_sh') + if not cls.is_service_running(service): + return True + + logger.debug(f'Stopping {service}') + service_proc = cls._get_proc_object(service) + + if service == 'workernode': + # run mcs-savebrm.py before stopping workernode + logger.debug('Waiting to save BRM.') + savebrm_path = os.path.join(MCS_INSTALL_BIN, 'mcs-savebrm.py') + savebrm_logpath = cls._create_mcs_process_logfile( + 'mcs-savebrm.log' + ) + with open(savebrm_logpath, 'a', encoding='utf-8') as savebrm_logfh: + success, _ = cls.exec_command( + savebrm_path, stdout=savebrm_logfh + ) + if not success: + logger.error('Error while saving BRM.') + else: + logger.debug('Successfully saved BRM.') + + logger.debug('Start clearing SHM.') + clearshm_path = os.path.join(MCS_INSTALL_BIN, 'clearShm') + success, _ = cls.exec_command(clearshm_path) + if not success: + logger.error('Error while clearing SHM.') + else: + logger.debug('Successfully cleared SHM.') + + service_proc.terminate() + # timeout taken from the old container.sh + # TODO: this is still not enough for the controllernode process; + # it always has to be stopped by SIGKILL, need to investigate. + timeout = 3 + if service == 'StorageManager': + timeout = 300 # 5 minutes + logger.debug(f'Waiting to gracefully stop "{service}".') + # This function will return as soon as all processes terminate + # or when the timeout (seconds) occurs. + gone, alive = psutil.wait_procs([service_proc], timeout=timeout) + if alive: + logger.debug( + f'{service} not terminated with SIGTERM, sending SIGKILL.' + ) + # only one process can be in the list + alive[0].kill() + gone, alive = psutil.wait_procs([service_proc], timeout=timeout) + if gone: + logger.debug(f'Successfully killed "{service}".') + else: + logger.warning( + f'Service "{service}" still alive after sending "kill -9" ' + f'and waiting {timeout} seconds.' + ) + else: + logger.debug(f'Gracefully stopped "{service}".') + + return not cls.is_service_running(service) + + @classmethod + def restart( + cls, service: str, is_primary: bool, use_sudo: bool = True + ) -> bool: + """Restart process in docker container. + + :param service: process name + :type service: str + :param is_primary: is node primary or not + :type is_primary: bool, optional + :param use_sudo: interface requirement, unused here, defaults to True + :type use_sudo: bool, optional + :return: True if service restarted successfully + :rtype: bool + + ...TODO: for next releases. Additional error handling. + """ + stop_success = True + if cls.is_service_running(service): + # TODO: retry?
stop_success = cls.stop(service, is_primary, use_sudo) + start_success = cls.start(service, is_primary, use_sudo) + + return stop_success and start_success diff --git a/cmapi/cmapi_server/process_dispatchers/systemd.py b/cmapi/cmapi_server/process_dispatchers/systemd.py new file mode 100644 index 000000000..7d3e7e305 --- /dev/null +++ b/cmapi/cmapi_server/process_dispatchers/systemd.py @@ -0,0 +1,231 @@ +"""Module contains the systemd process dispatcher class implementation.""" + +import logging +import re +from typing import Union, Tuple + +from cmapi_server.process_dispatchers.base import BaseDispatcher + + +class SystemdDispatcher(BaseDispatcher): + """Manipulates systemd services.""" + systemctl_version: int = 219 # CentOS 7 version + + @classmethod + def _systemctl_call( + cls, command: str, service: str, use_sudo: bool = True, + return_output=False, *args, **kwargs + ) -> Union[Tuple[bool, str], bool]: + """Run "systemctl" with arguments. + + :param command: command for systemctl + :type command: str + :param service: systemd service name + :type service: str + :param use_sudo: use sudo or not, defaults to True + :type use_sudo: bool, optional + :return: status of the operation (True on success), plus the command + output if return_output is True + :rtype: Union[Tuple[bool, str], bool] + """ + cmd = f'systemctl {command} {service}' + if use_sudo: + cmd = f'sudo {cmd}' + logging.debug(f'Call "{command}" on service "{service}" with "{cmd}".') + success, output = cls.exec_command(cmd, *args, **kwargs) + if return_output: + return success, output + return success + + @classmethod + def init(cls): + cmd = 'systemctl --version' + success, output = cls.exec_command(cmd) + if success: + # the raw result will be like + # "systemd 239 (245.4-4ubuntu3.17)\n " + cls.systemctl_version = int( + re.search(r'systemd (\d+)', output).group(1) + ) + logging.info(f'Detected systemd version {cls.systemctl_version}.') + else: + logging.error('Couldn\'t detect systemd version.') + + @classmethod + def is_service_running(cls, service: str, use_sudo: bool = True) -> bool: + """Check if a systemd service is running. + + :param service: service name + :type service: str + :param use_sudo: use sudo or not, defaults to True + :type use_sudo: bool, optional + :return: True if service is running, otherwise False + :rtype: bool + + ..Note: + Does not work with multiple services at a time. + """ + logging.debug(f'Checking "{service}" is running.') + # TODO: remove the conditions below when we drop CentOS 7 support + cmd = 'show -p ActiveState --value' + if cls.systemctl_version < 230: # --value is not supported in old versions + cmd = 'show -p ActiveState' + _, output = cls._systemctl_call( + cmd, + service, use_sudo, return_output=True + ) + service_state = output.strip() + if cls.systemctl_version < 230: # result is like 'ActiveState=active' + service_state = service_state.split('=')[1] + logging.debug(f'Service "{service}" is in "{service_state}" state') + # interpret a non-"active" state as a not-running service + if service_state == 'active': + return True + # output could be inactive, activating or even empty if + # the command execution was unsuccessful + return False + + @staticmethod + def _workernode_get_service_name(is_primary: bool) -> str: + """Get the proper workernode service name based on primary status. + + :param is_primary: is the node we are running on the primary?
+ :type is_primary: bool + :return: correct workernode service name + :rtype: str + """ + service = 'mcs-workernode' + return f'{service}@1.service' if is_primary else f'{service}@2.service' + + @classmethod + def _workernode_enable(cls, enable: bool, use_sudo: bool = True) -> None: + """Enable or disable the workernode service. + + :param enable: enable or disable + :type enable: bool + :param use_sudo: use sudo or not, defaults to True + :type use_sudo: bool, optional + """ + sub_cmd = 'enable' if enable else 'disable' + service = 'mcs-workernode@1.service' + + if not cls._systemctl_call(sub_cmd, service, use_sudo): + # enabling/disabling the service is not critical, just log the failure + logging.warning(f'Failed to {sub_cmd} {service}') + + @classmethod + def start( + cls, service: str, is_primary: bool = True, use_sudo: bool = True + ) -> bool: + """Start systemd service. + + :param service: service name + :type service: str + :param is_primary: is node primary or not, defaults to True + :type is_primary: bool, optional + :param use_sudo: use sudo or not, defaults to True + :type use_sudo: bool, optional + :return: True if service started successfully + :rtype: bool + """ + service_name = service + if service_name == 'mcs-workernode': + service_name = cls._workernode_get_service_name(is_primary) + if is_primary: + cls._workernode_enable(True, use_sudo) + + if cls.is_service_running(service_name, use_sudo): + return True + + logging.debug(f'Starting "{service_name}".') + if not cls._systemctl_call('start', service_name, use_sudo): + logging.error(f'Failed while starting "{service_name}".') + return False + + if is_primary and service == 'mcs-ddlproc': + cls._run_dbbuilder(use_su=True) + + logging.debug(f'Successfully started {service_name}.') + return cls.is_service_running(service_name, use_sudo) + + @classmethod + def stop( + cls, service: str, is_primary: bool = True, use_sudo: bool = True + ) -> bool: + """Stop systemd service. + + :param service: service name + :type service: str + :param is_primary: is node primary or not, defaults to True + :type is_primary: bool, optional + :param use_sudo: use sudo or not, defaults to True + :type use_sudo: bool, optional + :return: True if service stopped successfully + :rtype: bool + """ + service_name = service + if service_name == 'mcs-workernode': + service_name = f'{service_name}@1.service {service_name}@2.service' + cls._workernode_enable(False, use_sudo) + + logging.debug(f'Stopping "{service_name}".') + if not cls._systemctl_call('stop', service_name, use_sudo): + logging.error(f'Failed while stopping "{service_name}".') + return False + + return not cls.is_service_running(service, use_sudo) + + @classmethod + def restart( + cls, service: str, is_primary: bool = True, use_sudo: bool = True + ) -> bool: + """Restart systemd service.
+ + :param service: service name + :type service: str, optional + :param is_primary: is node primary or not, defaults to True + :type is_primary: bool, optional + :param use_sudo: use sudo or not, defaults to True + :type use_sudo: bool, optional + :return: True if service restarted successfully + :rtype: bool + """ + service_name = service + if service_name == 'mcs-workernode': + service_name = cls._workernode_get_service_name(is_primary) + + logging.debug(f'Restarting "{service_name}".') + if not cls._systemctl_call('restart', service_name, use_sudo): + logging.error(f'Failed while restarting "{service_name}".') + return False + + return cls.is_service_running(service, use_sudo) + + @classmethod + def reload( + cls, service: str, is_primary: bool = True, use_sudo: bool=True + ) -> bool: + """Reload systemd service. + + :param service: service name, defaults to 'Unknown_service' + :type service: str, optional + :param is_primary: is node primary or not, defaults to True + :type is_primary: bool, optional + :param use_sudo: use sudo or not, defaults to True + :type use_sudo: bool, optional + :return: True if service reloaded successfully + :rtype: bool + + ..NOTE: For next releases. It should become important when we teach + MCS to add/remove nodes w/o whole cluster restart. + Additional error handling? + """ + service_name = service + if service_name == 'mcs-workernode': + service_name = cls._workernode_get_service_name(is_primary) + + logging.debug(f'Reloading "{service_name}".') + if not cls._systemctl_call('reload', service_name, use_sudo): + logging.error(f'Failed while reloading "{service_name}".') + return False + + return not cls.is_service_running(service, use_sudo) diff --git a/cmapi/cmapi_server/test/CS-config-test.xml b/cmapi/cmapi_server/test/CS-config-test.xml new file mode 100644 index 000000000..ac4995629 --- /dev/null +++ b/cmapi/cmapi_server/test/CS-config-test.xml @@ -0,0 +1,501 @@ + + + + 127.0.0.1 + 8601 + unassigned + + + 0.0.0.0 + 8602 + + + 127.0.0.1 + 8603 + + + 127.0.0.1 + 8606 + + + 127.0.0.1 + 8604 + + + 0.0.0.0 + 8605 + + + + + 127.0.0.1 + 8800 + + + 0.0.0.0 + 8800 + + + 0.0.0.0 + 8800 + + + 127.0.0.1 + 8800 + + + 0.0.0.0 + 8622 + + + 0.0.0.0 + 8622 + + + 127.0.0.1 + 8622 + + + 127.0.0.1 + 8630 + + + 127.0.0.1 + 8612 + + + 127.0.0.1 + 8614 + + + 10000 + + + 1 + 2 + 128 + 10K + 0 + 512 + 512 + + 1 + 0 + n + + + + y + + + + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + columnstore-1 + pm1 + pm1 + + 1 + /var/lib/columnstore/data1 + /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves + /var/lib/columnstore/data1/systemFiles/dbrm/tablelocks + 15 + 100000 + 10 + 95 + OFF + + /rdwrscratch + + /tmp/columnstore_tmp_files + + + dm + Director Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + um + User 
Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + pm + Performance Module + 1 + 127.0.0.1 + localhost + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + 1 + 1 + + + 0 + unassigned + 0.0.0.0 + ENABLED + + + 1000 + /var/lib/columnstore/data1/systemFiles/dbrm/SMTxnID + + + + 1GB + + + + /var/lib/columnstore/data1/systemFiles/dbrm/oidbitmap + + 3000 + + + /var/log/mariadb/columnstore/data/bulk + /var/lib/columnstore/data1/systemFiles/bulkRollback + 98 + 1 + n + + + 1 + 127.0.0.1 + 8616 + + + + 127.0.0.1 + 8700 + pm1 + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + + + + + 1 + 0 + 0 + 65536 + 2K + 200 + 0 + 50 + + + 2 + n + n + internal + internal + /etc/profile.d/columnstoreAlias.sh + + + + + 4 + 0x0 + + + 128 + 128K + 1G + 25% + 100 + N + Y + Snappy + + + 16K + 16 + 1 + + + + + 100 + + + + + + N + + + 127.0.0.1 + 3306 + root + + + + + + + N + + + N + + + Y + Snappy + + + 127.0.0.1 + 0 + + + 30 + N + + + + + diff --git a/cmapi/cmapi_server/test/Columnstore_apply_config.xml b/cmapi/cmapi_server/test/Columnstore_apply_config.xml new file mode 100644 index 000000000..580a829ae --- /dev/null +++ b/cmapi/cmapi_server/test/Columnstore_apply_config.xml @@ -0,0 +1,535 @@ + + + 0.0.0.0 + 2 + 3 + 5 + + 10.128.0.23 + 8601 + pm1 + + + 0.0.0.0 + 8602 + + + 10.128.0.23 + 8603 + + + 10.128.0.23 + 8606 + + + 10.128.0.23 + 8604 + + + 0.0.0.0 + 8605 + + + + + 127.0.0.1 + 8800 + + + 0.0.0.0 + 8800 + + + 0.0.0.0 + 8800 + + + 10.128.0.23 + 8800 + + + 0.0.0.0 + 8622 + + + 0.0.0.0 + 8622 + + + 10.128.0.23 + 8622 + + + 10.128.0.23 + 8630 + + + 10.128.0.23 + 8612 + + + 10.128.0.23 + 8614 + + + 10000 + + + 2 + 2 + 128 + 10K + 0 + 512 + 512 + + 1 + 0 + n + + + + y + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + 10.128.0.23 + 8620 + + + 172.30.0.144 + 8620 + + + C + columnstore-1 + pm1 + pm1 + + 2 + /var/lib/columnstore/data1 + /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves + /var/lib/columnstore/data1/systemFiles/dbrm/tablelocks + 20 + 100000 + 10 + 95 + OFF + + /rdwrscratch + + /tmp/columnstore_tmp_files + + /var/lib/columnstore/data2 + + + dm + Director Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + um + User Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + + pm + Performance Module + 2 + 10.128.0.23 + localhost + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 
90 + 80 + 70 + / + 1 + 1 + unassigned + 0.0.0.0 + unassigned + 0.0.0.0 + ENABLED + pm2 + 172.30.0.144 + unassigned + 0.0.0.0 + 3 + 4 + 5 + 3 + + + 0 + unassigned + 0.0.0.0 + ENABLED + + + 1000 + /var/lib/columnstore/data1/systemFiles/dbrm/SMTxnID + + + + 1GB + + + + /var/lib/columnstore/data1/systemFiles/dbrm/oidbitmap + + 3000 + + + /var/log/mariadb/columnstore/data/bulk + /var/lib/columnstore/data1/systemFiles/bulkRollback + 98 + 1 + n + + + 2 + 10.128.0.23 + 8616 + + + + 10.128.0.23 + 8700 + pm1 + + + 172.30.0.144 + 8700 + pm2 + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + + + + + 1 + 0 + 0 + 65536 + 2K + 200 + 0 + 50 + + + 2 + n + y + internal + internal + /etc/profile.d/columnstoreAlias.sh + + + + 4 + 0x0 + + + 128 + 128K + 1G + 25% + 100 + N + Y + Snappy + + + 16K + 16 + 1 + + + + + 100 + + + + + + N + + + 127.0.0.1 + 3306 + root + + + + + + + N + + + N + + + Y + Snappy + + + 127.0.0.1 + 0 + + + 30 + N + + + 10.128.0.23 + + + 172.30.0.144 + 8800 + + + 172.30.0.144 + 8622 + + + 172.30.0.144 + 8630 + + + 172.30.0.144 + 8601 + pm2 + + diff --git a/cmapi/cmapi_server/test/__init__.py b/cmapi/cmapi_server/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/cmapi_server/test/config_apply_example.py b/cmapi/cmapi_server/test/config_apply_example.py new file mode 100644 index 000000000..54cea2214 --- /dev/null +++ b/cmapi/cmapi_server/test/config_apply_example.py @@ -0,0 +1,50 @@ +import unittest +import requests +import configparser +from pathlib import Path +from datetime import datetime + +from cmapi_server.controllers.dispatcher import _version + +config_filename = './cmapi_server/cmapi_server.conf' + +url = f"https://localhost:8640/cmapi/{_version}/node/config" +begin_url = f"https://localhost:8640/cmapi/{_version}/node/begin" +config_path = './cmapi_server/test/Columnstore_apply_config.xml' + +# create tmp dir +tmp_prefix = '/tmp/mcs_config_test' +tmp_path = Path(tmp_prefix) +tmp_path.mkdir(parents = True, exist_ok = True) +copyfile(config_path_old, tmp_prefix + '/Columnstore.xml') + + +def get_current_key(): + app_config = configparser.ConfigParser() + try: + with open(config_filename, 'r') as _config_file: + app_config.read_file(_config_file) + except FileNotFoundError: + return '' + if 'Authentication' not in app_config.sections(): + return '' + return app_config['Authentication'].get('x-api-key', '') + +headers = {'x-api-key': get_current_key()} +body = {'id': 42, 'timeout': 120} +r = requests.put(begin_url, verify=False, headers=headers, json=body) + +config_file = Path(config_path) +config = config_file.read_text() + +body = { + 'revision': 42, + 'manager': '1.1.1.1', + 'timeout': 0, + 'config': config, +} + +#print(config) + +#r = requests.put(url, verify=False, headers=headers, json=body) + diff --git a/cmapi/cmapi_server/test/test_cej.py b/cmapi/cmapi_server/test/test_cej.py new file mode 100644 index 000000000..6b5167c4e --- /dev/null +++ b/cmapi/cmapi_server/test/test_cej.py @@ -0,0 +1,35 @@ +"""Tests for all the CEJ (Cross Engine Join) related stuff.""" +import os +import subprocess +import sys +import unittest +from shutil import which + +from cmapi_server.handlers.cej import CEJPasswordHandler +from cmapi_server.constants import MCS_SECRETS_FILE_PATH + + +class SecretsTestCase(unittest.TestCase): + """Test case 
for checking .secrets file related stuff.""" + + @unittest.skipIf(which('cskeys') is None, + 'This MCS version doesn\'t provide "cskeys" tool.') + def test_cspasswd_decrypt_algorithm(self) -> None: + """Test to check decrypt algorithm. + + Check that CEJ password decrypting algorithm is the same between + "cspasswd" tool in MCS and in CMAPI. + """ + + test_passwd = 'columstore is the best' + # create .secrets file using cskeys util + ret = subprocess.run( + 'cskeys', shell=True, stdout=subprocess.PIPE, check=True + ) + encrypted_passwd = subprocess.check_output( + ['cspasswd', test_passwd] + ).decode(sys.stdout.encoding).strip() + self.assertEqual( + test_passwd, CEJPasswordHandler.decrypt_password(encrypted_passwd) + ) + os.remove(MCS_SECRETS_FILE_PATH) diff --git a/cmapi/cmapi_server/test/test_cluster.py b/cmapi/cmapi_server/test/test_cluster.py new file mode 100644 index 000000000..f174f72ad --- /dev/null +++ b/cmapi/cmapi_server/test/test_cluster.py @@ -0,0 +1,236 @@ +import logging +import os +import socket +import subprocess +from shutil import copyfile + +import requests + +from cmapi_server.controllers.dispatcher import _version +from cmapi_server.managers.process import MCSProcessManager +from cmapi_server.test.unittest_global import ( + BaseServerTestCase, MCS_CONFIG_FILEPATH, COPY_MCS_CONFIG_FILEPATH, + TEST_MCS_CONFIG_FILEPATH, +) + + +logging.basicConfig(level='DEBUG') +requests.urllib3.disable_warnings() + + +class BaseClusterTestCase(BaseServerTestCase): + + @classmethod + def setUpClass(cls) -> None: + copyfile(MCS_CONFIG_FILEPATH, COPY_MCS_CONFIG_FILEPATH) + return super().setUpClass() + + @classmethod + def tearDownClass(cls) -> None: + copyfile(COPY_MCS_CONFIG_FILEPATH, MCS_CONFIG_FILEPATH) + os.remove(os.path.abspath(COPY_MCS_CONFIG_FILEPATH)) + MCSProcessManager.stop_node(is_primary=True) + MCSProcessManager.start_node(is_primary=True) + return super().tearDownClass() + + def setUp(self) -> None: + copyfile(TEST_MCS_CONFIG_FILEPATH, MCS_CONFIG_FILEPATH) + MCSProcessManager.stop_node(is_primary=True) + MCSProcessManager.start_node(is_primary=True) + return super().setUp() + + +class ClusterStartTestCase(BaseClusterTestCase): + URL = f'https://localhost:8640/cmapi/{_version}/cluster/start' + + def test_endpoint_with_no_api_key(self): + r = requests.put( + self.URL, verify=False, headers=self.NO_AUTH_HEADERS, + json={} + ) + self.assertEqual(r.status_code, 401) + + def test_endpoint_with_no_nodes_in_cluster(self): + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, + json={} + ) + error = r.json()['error'] + self.assertEqual(r.status_code, 422) + self.assertEqual(error, 'There are no nodes in the cluster.') + + def test_start_after_adding_a_node(self): + payload = {'node': socket.gethostname()} + resp = requests.post( + ClusterAddNodeTestCase.URL, verify=False, headers=self.HEADERS, + json=payload + ) + self.assertEqual(resp.status_code, 200) + + payload = {'node': None} + resp = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=payload + ) + self.assertEqual(resp.status_code, 200) + + # test_columnstore_started + controllernode = subprocess.check_output(['pgrep', 'controllernode']) + self.assertIsNotNone(controllernode) + + +class ClusterShutdownTestCase(BaseClusterTestCase): + URL = f'https://localhost:8640/cmapi/{_version}/cluster/shutdown' + + def test_endpoint_with_no_api_key(self): + r = requests.put( + self.URL, verify=False, headers=self.NO_AUTH_HEADERS, + json={} + ) + self.assertEqual(r.status_code, 401) + + def 
test_endpoint_with_no_nodes_in_cluster(self):
+        resp = requests.put(self.URL, verify=False, headers=self.HEADERS,
+            json={}
+        )
+        error = resp.json()['error']
+        self.assertEqual(resp.status_code, 422)
+        self.assertEqual(error, 'There are no nodes in the cluster.')
+
+    def test_add_node_and_shutdown(self):
+        payload = {'node': socket.gethostname()}
+        resp = requests.post(
+            ClusterAddNodeTestCase.URL, verify=False, headers=self.HEADERS,
+            json=payload
+        )
+        self.assertEqual(resp.status_code, 200)
+
+        # note: POST node starts up node
+        try:
+            controllernode = subprocess.check_output(
+                ['pgrep', 'controllernode']
+            )
+        except Exception:
+            controllernode = None
+        self.assertIsNotNone(controllernode)
+
+        payload = {'timeout': 60}
+        resp = requests.put(
+            self.URL, verify=False, headers=self.HEADERS,
+            json=payload
+        )
+        self.assertEqual(resp.status_code, 200)
+
+        # Check columnstore stopped
+        try:
+            controllernode = subprocess.check_output(
+                ['pgrep', 'controllernode']
+            )
+        except Exception:
+            controllernode = None
+        self.assertIsNone(controllernode)
+
+
+class ClusterModesetTestCase(BaseClusterTestCase):
+    URL = f'https://localhost:8640/cmapi/{_version}/cluster/mode-set'
+
+    def test_endpoint_with_no_api_key(self):
+        resp = requests.put(
+            self.URL, verify=False, headers=self.NO_AUTH_HEADERS,
+            json={}
+        )
+        self.assertEqual(resp.status_code, 401)
+
+    def test_endpoint_with_no_nodes_in_cluster(self):
+        resp = requests.put(
+            self.URL, verify=False, headers=self.HEADERS,
+            json={}
+        )
+        error = resp.json()['error']
+        self.assertEqual(resp.status_code, 422)
+        self.assertEqual(error, 'No master found in the cluster.')
+
+    def test_add_node_and_set_readonly(self):
+        payload = {'node': socket.gethostname()}
+        resp = requests.post(
+            ClusterAddNodeTestCase.URL, verify=False, headers=self.HEADERS,
+            json=payload
+        )
+        self.assertEqual(resp.status_code, 200)
+
+        payload = {'mode': 'readonly'}
+        resp = requests.put(
+            self.URL, verify=False, headers=self.HEADERS, json=payload
+        )
+        self.assertEqual(resp.status_code, 200)
+
+        # return readwrite mode back
+        payload = {'mode': 'readwrite'}
+        resp = requests.put(
+            self.URL, verify=False, headers=self.HEADERS, json=payload
+        )
+        self.assertEqual(resp.status_code, 200)
+
+
+class ClusterAddNodeTestCase(BaseClusterTestCase):
+    URL = f'https://localhost:8640/cmapi/{_version}/cluster/node'
+
+    def test_endpoint_with_no_apikey(self):
+        resp = requests.post(
+            self.URL, verify=False, headers=self.NO_AUTH_HEADERS,
+            json={}
+        )
+        self.assertEqual(resp.status_code, 401)
+
+    def test_endpoint_with_missing_node_parameter(self):
+        resp = requests.put(
+            self.URL, verify=False, headers=self.HEADERS,
+            json={}
+        )
+        error = resp.json()['error']
+        self.assertEqual(resp.status_code, 422)
+        self.assertEqual(error, 'missing node argument')
+
+    def test_endpoint(self):
+        payload = {'node': socket.gethostname()}
+        resp = requests.put(
+            self.URL, verify=False, headers=self.HEADERS,
+            json=payload
+        )
+        self.assertEqual(resp.status_code, 200)
+
+        # Check Columnstore started
+        controllernode = subprocess.check_output(
+            ['pgrep', 'controllernode'])
+        self.assertIsNotNone(controllernode)
+
+
+class ClusterRemoveNodeTestCase(BaseClusterTestCase):
+    URL = ClusterAddNodeTestCase.URL
+
+    def test_endpoint_with_no_apikey(self):
+        resp = requests.delete(
+            self.URL, verify=False, headers=self.NO_AUTH_HEADERS,
+            json={}
+        )
+        self.assertEqual(resp.status_code, 401)
+
+    def test_endpoint_with_missing_node_parameter(self):
+        resp = requests.delete(
+            self.URL, verify=False,
headers=self.HEADERS, + json={} + ) + error = resp.json()['error'] + self.assertEqual(resp.status_code, 422) + self.assertEqual(error, 'missing node argument') + + def test_add_node_and_remove(self): + payload = {'node': socket.gethostname()} + resp = requests.post( + ClusterAddNodeTestCase.URL, verify=False, headers=self.HEADERS, + json=payload + ) + self.assertEqual(resp.status_code, 200) + + resp = requests.delete( + self.URL, verify=False, headers=self.HEADERS, json=payload + ) + self.assertEqual(resp.status_code, 200) diff --git a/cmapi/cmapi_server/test/test_em_endpoints.py b/cmapi/cmapi_server/test/test_em_endpoints.py new file mode 100644 index 000000000..d6b92fd60 --- /dev/null +++ b/cmapi/cmapi_server/test/test_em_endpoints.py @@ -0,0 +1,217 @@ +import configparser +import subprocess +import unittest +from contextlib import contextmanager +from os import path, remove +from pathlib import Path +from shutil import copyfile + +import cherrypy +import requests +requests.packages.urllib3.disable_warnings() + +from cmapi_server.constants import ( + EM_PATH_SUFFIX, MCS_EM_PATH, MCS_BRM_CURRENT_PATH, S3_BRM_CURRENT_PATH +) +from cmapi_server.controllers.dispatcher import ( + dispatcher, jsonify_error,_version +) +from cmapi_server.test.unittest_global import ( + create_self_signed_certificate, cert_filename, cmapi_config_filename, + tmp_cmapi_config_filename +) +from mcs_node_control.models.node_config import NodeConfig + + +@contextmanager +def run_server(): + if not path.exists(cert_filename): + create_self_signed_certificate() + cherrypy.engine.start() + cherrypy.engine.wait(cherrypy.engine.states.STARTED) + yield + cherrypy.engine.exit() + cherrypy.engine.block() + + +def get_current_key(): + app_config = configparser.ConfigParser() + try: + with open(cmapi_config_filename, 'r') as _config_file: + app_config.read_file(_config_file) + except FileNotFoundError: + return '' + + if 'Authentication' not in app_config.sections(): + return '' + return app_config['Authentication'].get('x-api-key', '') + +class TestEMEndpoints(unittest.TestCase): + @classmethod + def setUpClass(cls): + if not path.exists(tmp_cmapi_config_filename): + f = open(tmp_cmapi_config_filename, 'x') + f.close() + copyfile(cmapi_config_filename, tmp_cmapi_config_filename) + + @classmethod + def tearDownClass(cls): + if path.exists(tmp_cmapi_config_filename): + copyfile(tmp_cmapi_config_filename, cmapi_config_filename) + remove(tmp_cmapi_config_filename) + + def get_examplar_bytes(self, element: str): + node_config = NodeConfig() + if node_config.s3_enabled(): + ret = subprocess.run( + ["smcat", S3_BRM_CURRENT_PATH], stdout=subprocess.PIPE + ) + element_current_suffix = ret.stdout.decode("utf-8").rstrip() + element_current_filename = f'{EM_PATH_SUFFIX}/{element_current_suffix}_{element}' + ret = subprocess.run( + ["smcat", element_current_filename], stdout=subprocess.PIPE + ) + result = ret.stdout + else: + element_current_name = Path(MCS_BRM_CURRENT_PATH) + element_current_filename = element_current_name.read_text().rstrip() + element_current_file = Path( + f'{MCS_EM_PATH}/{element_current_filename}_{element}' + ) + result = element_current_file.read_bytes() + return result + + def test_em(self): + app = cherrypy.tree.mount(root=None, + config=cmapi_config_filename) + app.config.update({ + '/': { + 'request.dispatch': dispatcher, + 'error_page.default': jsonify_error, + }, + 'config': { + 'path': cmapi_config_filename, + }, + }) + cherrypy.config.update(cmapi_config_filename) + + api_key = get_current_key() + try: + 
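+            # The meta tests below (em, journal, vss, vbbm) all follow one
+            # pattern: start the engine via run_server(), confirm a missing
+            # x-api-key yields 401, then GET the element over HTTPS and
+            # byte-compare the response body against the same element read
+            # straight from disk (or via smcat when S3 is enabled) by
+            # get_examplar_bytes().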
with run_server(): + url = f"https://localhost:8640/cmapi/{_version}/node/meta/em" + # Auth failure + headers = {'x-api-key': None} + r = requests.get(url, verify=False, headers=headers) + self.assertEqual(r.status_code, 401) + # OK + headers = {'x-api-key': api_key} + r = requests.get(url, verify=False, headers=headers) + extent_map = self.get_examplar_bytes('em') + self.assertEqual(r.status_code, 200) + self.assertEqual(r.content, extent_map) + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + + def test_journal(self): + app = cherrypy.tree.mount(root=None, + config=cmapi_config_filename) + app.config.update({ + '/': { + 'request.dispatch': dispatcher, + 'error_page.default': jsonify_error, + }, + 'config': { + 'path': cmapi_config_filename, + }, + }) + cherrypy.config.update(cmapi_config_filename) + + api_key = get_current_key() + try: + with run_server(): + url = f"https://localhost:8640/cmapi/{_version}/node/meta/journal" + # Auth failure + headers = {'x-api-key': None} + r = requests.get(url, verify=False, headers=headers) + self.assertEqual(r.status_code, 401) + # OK + headers = {'x-api-key': api_key} + r = requests.get(url, verify=False, headers=headers) + journal = self.get_examplar_bytes('journal') + self.assertEqual(r.status_code, 200) + self.assertEqual(r.content, journal) + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + + def test_vss(self): + app = cherrypy.tree.mount(root=None, + config=cmapi_config_filename) + app.config.update({ + '/': { + 'request.dispatch': dispatcher, + 'error_page.default': jsonify_error, + }, + 'config': { + 'path': cmapi_config_filename, + }, + }) + cherrypy.config.update(cmapi_config_filename) + + api_key = get_current_key() + try: + with run_server(): + url = f"https://localhost:8640/cmapi/{_version}/node/meta/vss" + # Auth failure + headers = {'x-api-key': None} + r = requests.get(url, verify=False, headers=headers) + self.assertEqual(r.status_code, 401) + # OK + headers = {'x-api-key': api_key} + r = requests.get(url, verify=False, headers=headers) + vss = self.get_examplar_bytes('vss') + self.assertEqual(r.status_code, 200) + self.assertEqual(r.content, vss) + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + + def test_vbbm(self): + app = cherrypy.tree.mount(root=None, + config=cmapi_config_filename) + app.config.update({ + '/': { + 'request.dispatch': dispatcher, + 'error_page.default': jsonify_error, + }, + 'config': { + 'path': cmapi_config_filename, + }, + }) + cherrypy.config.update(cmapi_config_filename) + + api_key = get_current_key() + try: + with run_server(): + url = f"https://localhost:8640/cmapi/{_version}/node/meta/vbbm" + # Auth failure + headers = {'x-api-key': None} + r = requests.get(url, verify=False, headers=headers) + self.assertEqual(r.status_code, 401) + # OK + headers = {'x-api-key': api_key} + r = requests.get(url, verify=False, headers=headers) + vbbm = self.get_examplar_bytes('vbbm') + self.assertEqual(r.status_code, 200) + self.assertEqual(r.content, vbbm) + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise diff --git a/cmapi/cmapi_server/test/test_failover_agent.py b/cmapi/cmapi_server/test/test_failover_agent.py new file mode 100644 index 000000000..9261e5575 --- /dev/null +++ b/cmapi/cmapi_server/test/test_failover_agent.py @@ -0,0 +1,124 @@ +import logging +import socket + +from cmapi_server.failover_agent import FailoverAgent +from cmapi_server.node_manipulation import add_node, remove_node +from mcs_node_control.models.node_config 
import NodeConfig
+from cmapi_server.test.unittest_global import (
+    tmp_mcs_config_filename, BaseNodeManipTestCase
+)
+
+
+logging.basicConfig(level='DEBUG')
+
+
+class TestFailoverAgent(BaseNodeManipTestCase):
+
+    def test_activateNodes(self):
+        self.tmp_files = ('./activate0.xml', './activate1.xml')
+        hostaddr = socket.gethostbyname(socket.gethostname())
+        fa = FailoverAgent()
+        fa.activateNodes(
+            [self.NEW_NODE_NAME], tmp_mcs_config_filename, self.tmp_files[0],
+            test_mode=True
+        )
+        add_node(
+            hostaddr, self.tmp_files[0], self.tmp_files[1]
+        )
+
+        nc = NodeConfig()
+        root = nc.get_current_config_root(self.tmp_files[1])
+        pm_count = int(root.find('./PrimitiveServers/Count').text)
+        self.assertEqual(pm_count, 2)
+        node = root.find('./PMS1/IPAddr')
+        self.assertEqual(node.text, self.NEW_NODE_NAME)
+        node = root.find('./pm1_WriteEngineServer/IPAddr')
+        self.assertEqual(node.text, self.NEW_NODE_NAME)
+        node = root.find('./PMS2/IPAddr')
+        self.assertEqual(node.text, hostaddr)
+        node = root.find('./pm2_WriteEngineServer/IPAddr')
+        self.assertEqual(node.text, hostaddr)
+        remove_node(self.NEW_NODE_NAME, self.tmp_files[1], self.tmp_files[1])
+
+    def test_deactivateNodes(self):
+        self.tmp_files = (
+            './deactivate0.xml', './deactivate1.xml', './deactivate2.xml'
+        )
+        fa = FailoverAgent()
+        hostname = socket.gethostname()
+        hostaddr = socket.gethostbyname(hostname)
+        add_node(
+            hostaddr, tmp_mcs_config_filename, self.tmp_files[0]
+        )
+        fa.activateNodes(
+            [self.NEW_NODE_NAME], self.tmp_files[0], self.tmp_files[1],
+            test_mode=True
+        )
+        fa.deactivateNodes(
+            [self.NEW_NODE_NAME], self.tmp_files[1], self.tmp_files[2],
+            test_mode=True
+        )
+
+        nc = NodeConfig()
+        root = nc.get_current_config_root(self.tmp_files[2])
+        pm_count = int(root.find('./PrimitiveServers/Count').text)
+        self.assertEqual(pm_count, 1)
+        node = root.find('./PMS1/IPAddr')
+        self.assertEqual(node.text, hostaddr)
+        # TODO: Fix node_manipulation add_node logic and _replace_localhost
+        # node = root.find('./PMS2/IPAddr')
+        # self.assertEqual(node, None)
+        node = root.find('./pm1_WriteEngineServer/IPAddr')
+        self.assertEqual(node.text, hostaddr)
+        node = root.find('./pm2_WriteEngineServer/IPAddr')
+        self.assertIsNone(node)
+        #node = root.find("./ConfigRevision")
+        #self.assertEqual(node.text, "3")
+
+        # make sure there are no traces of mysql.com,
+        # or an ip addr that isn't this host's name or address
+        all_nodes = root.findall('./')
+        for node in all_nodes:
+            self.assertFalse(node.text == self.NEW_NODE_NAME)
+            if node.tag in ['IPAddr', 'Node']:
+                self.assertTrue(node.text in [hostname, hostaddr])
+
+    def test_designatePrimaryNode(self):
+        self.tmp_files = (
+            './primary-node0.xml', './primary-node1.xml', './primary-node2.xml'
+        )
+        fa = FailoverAgent()
+        hostaddr = socket.gethostbyname(socket.gethostname())
+        fa.activateNodes(
+            [self.NEW_NODE_NAME], tmp_mcs_config_filename, self.tmp_files[0],
+            test_mode=True
+        )
+        add_node(
+            hostaddr, self.tmp_files[0], self.tmp_files[1]
+        )
+        fa.movePrimaryNode(
+            'placeholder', self.tmp_files[1], self.tmp_files[2], test_mode=True
+        )
+
+        nc = NodeConfig()
+        root = nc.get_current_config_root(self.tmp_files[2])
+        pm_count = int(root.find('./PrimitiveServers/Count').text)
+        self.assertEqual(pm_count, 2)
+        node = root.find('./PMS1/IPAddr')
+        self.assertEqual(node.text, self.NEW_NODE_NAME)
+        node = root.find('./PMS2/IPAddr')
+        self.assertEqual(node.text, hostaddr)
+        node = root.find('./pm1_WriteEngineServer/IPAddr')
+        self.assertEqual(node.text, self.NEW_NODE_NAME)
+        node =
root.find('./pm2_WriteEngineServer/IPAddr') + self.assertEqual(node.text, hostaddr) + + for tag in ['ExeMgr1', 'DMLProc', 'DDLProc']: + node = root.find(f'./{tag}/IPAddr') + self.assertEqual(node.text, self.NEW_NODE_NAME) + + self.assertEqual(self.NEW_NODE_NAME, root.find('./PrimaryNode').text) + + def test_enterStandbyMode(self): + fa = FailoverAgent() + fa.enterStandbyMode(test_mode=True) diff --git a/cmapi/cmapi_server/test/test_mcs_process_operations.py b/cmapi/cmapi_server/test/test_mcs_process_operations.py new file mode 100644 index 000000000..aa2efe800 --- /dev/null +++ b/cmapi/cmapi_server/test/test_mcs_process_operations.py @@ -0,0 +1,117 @@ +import os + +from cmapi_server.managers.process import MCSProcessManager +from cmapi_server.process_dispatchers.systemd import SystemdDispatcher +from cmapi_server.test.unittest_global import ( + DDL_SERVICE, CONTROLLERNODE_SERVICE, SYSTEMCTL, + BaseProcessDispatcherCase +) + +class SystemdTest(BaseProcessDispatcherCase): + + def test_systemd_status_start(self): + os.system(f'{SYSTEMCTL} stop {DDL_SERVICE}') + self.assertFalse( + SystemdDispatcher.is_service_running(DDL_SERVICE) + ) + self.assertTrue(SystemdDispatcher.start(DDL_SERVICE)) + + os.system(f'{SYSTEMCTL} stop {CONTROLLERNODE_SERVICE}') + self.assertFalse( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + result = SystemdDispatcher.start(CONTROLLERNODE_SERVICE) + self.assertTrue(result) + self.assertTrue( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + + def test_systemd_status_stop(self): + os.system(f'{SYSTEMCTL} start {CONTROLLERNODE_SERVICE}') + self.assertTrue( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + self.assertTrue(SystemdDispatcher.stop(CONTROLLERNODE_SERVICE)) + self.assertFalse( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + + def test_systemd_status_restart(self): + os.system(f'{SYSTEMCTL} start {CONTROLLERNODE_SERVICE}') + self.assertTrue( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + self.assertTrue(SystemdDispatcher.restart(CONTROLLERNODE_SERVICE)) + self.assertTrue( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + + os.system(f'{SYSTEMCTL} stop {CONTROLLERNODE_SERVICE}') + self.assertFalse( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + self.assertTrue(SystemdDispatcher.restart(CONTROLLERNODE_SERVICE)) + self.assertTrue( + SystemdDispatcher.is_service_running(CONTROLLERNODE_SERVICE) + ) + + +class MCSProcessManagerTest(BaseProcessDispatcherCase): + + def get_systemd_serv_name(self, service_name): + if service_name == 'mcs-workernode': + return f'{service_name}@1' + return service_name + + def test_mcs_process_manager(self): + MCSProcessManager.detect('systemd', '') + for prog in MCSProcessManager._get_sorted_progs(True, True).values(): + serv_name = self.get_systemd_serv_name(prog.service_name) + os.system(f'{SYSTEMCTL} stop {serv_name}') + self.assertIsNone(MCSProcessManager.start_node(True)) + + for prog in MCSProcessManager.mcs_progs.values(): + serv_name = self.get_systemd_serv_name(prog.service_name) + if serv_name == 'mcs-storagemanager': + continue + self.assertTrue( + MCSProcessManager.process_dispatcher.is_service_running( + serv_name + ) + ) + + self.assertIsNone(MCSProcessManager.stop_node(is_primary=True)) + for prog in MCSProcessManager.mcs_progs.values(): + serv_name = self.get_systemd_serv_name(prog.service_name) + self.assertFalse( + MCSProcessManager.process_dispatcher.is_service_running( + 
serv_name
+                )
+            )
+        self.assertEqual(len(MCSProcessManager.get_running_mcs_procs()), 0)
+        self.assertTrue(
+            MCSProcessManager.is_node_processes_ok(
+                is_primary=True, node_stopped=True
+            )
+        )
+
+        for prog in MCSProcessManager._get_sorted_progs(True).values():
+            serv_name = self.get_systemd_serv_name(prog.service_name)
+            os.system(f'{SYSTEMCTL} start {serv_name}')
+
+        for prog in MCSProcessManager.mcs_progs.values():
+            serv_name = self.get_systemd_serv_name(prog.service_name)
+            self.assertTrue(
+                MCSProcessManager.process_dispatcher.is_service_running(
+                    serv_name
+                )
+            )
+        self.assertEqual(
+            len(MCSProcessManager.get_running_mcs_procs()),
+            len(MCSProcessManager.mcs_progs.keys())
+        )
+        self.assertTrue(
+            MCSProcessManager.is_node_processes_ok(
+                is_primary=True, node_stopped=False
+            )
+        )
diff --git a/cmapi/cmapi_server/test/test_node_manip.py b/cmapi/cmapi_server/test/test_node_manip.py
new file mode 100644
index 000000000..22a35c64a
--- /dev/null
+++ b/cmapi/cmapi_server/test/test_node_manip.py
@@ -0,0 +1,211 @@
+import logging
+import socket
+
+from lxml import etree
+
+from cmapi_server import node_manipulation
+from cmapi_server.constants import MCS_DATA_PATH
+from cmapi_server.test.unittest_global import (
+    tmp_mcs_config_filename, BaseNodeManipTestCase
+)
+from mcs_node_control.models.node_config import NodeConfig
+
+
+logging.basicConfig(level='DEBUG')
+
+
+class NodeManipTester(BaseNodeManipTestCase):
+
+    def test_add_remove_node(self):
+        self.tmp_files = (
+            './test-output0.xml', './test-output1.xml', './test-output2.xml'
+        )
+        hostaddr = socket.gethostbyname(socket.gethostname())
+        node_manipulation.add_node(
+            self.NEW_NODE_NAME, tmp_mcs_config_filename, self.tmp_files[0]
+        )
+        node_manipulation.add_node(
+            hostaddr, self.tmp_files[0], self.tmp_files[1]
+        )
+
+        # get a NodeConfig, read test.xml
+        # look for some of the expected changes.
+        # Total verification will take too long to code up right now.
+        nc = NodeConfig()
+        root = nc.get_current_config_root(self.tmp_files[1])
+        pms_node_ipaddr = root.find('./PMS1/IPAddr')
+        self.assertEqual(pms_node_ipaddr.text, self.NEW_NODE_NAME)
+        pms_node_ipaddr = root.find('./PMS2/IPAddr')
+        self.assertEqual(pms_node_ipaddr.text, hostaddr)
+        node = root.find("./ExeMgr2/IPAddr")
+        self.assertEqual(node.text, hostaddr)
+
+        node_manipulation.remove_node(
+            self.NEW_NODE_NAME, self.tmp_files[1], self.tmp_files[2],
+            test_mode=True
+        )
+        nc = NodeConfig()
+        root = nc.get_current_config_root(self.tmp_files[2])
+        node = root.find('./PMS1/IPAddr')
+        self.assertEqual(node.text, hostaddr)
+        # TODO: Fix node_manipulation add_node logic and _replace_localhost
+        # node = root.find('./PMS2/IPAddr')
+        # self.assertEqual(node, None)
+
+    def test_add_dbroots_nodes_rebalance(self):
+        self.tmp_files = (
+            './extra-dbroots-0.xml', './extra-dbroots-1.xml',
+            './extra-dbroots-2.xml'
+        )
+        # add 2 dbroots, let's see what happens
+        nc = NodeConfig()
+        root = nc.get_current_config_root(tmp_mcs_config_filename)
+
+        sysconf_node = root.find('./SystemConfig')
+        dbroot_count_node = sysconf_node.find('./DBRootCount')
+        dbroot_count = int(dbroot_count_node.text) + 2
+        dbroot_count_node.text = str(dbroot_count)
+        etree.SubElement(sysconf_node, 'DBRoot2').text = '/dummy_path/data2'
+        etree.SubElement(sysconf_node, 'DBRoot10').text = '/dummy_path/data10'
+        nc.write_config(root, self.tmp_files[0])
+
+        node_manipulation.add_node(
+            self.NEW_NODE_NAME, self.tmp_files[0], self.tmp_files[1]
+        )
+
+        # get a NodeConfig, read test.xml
+        # look for some of the expected changes.
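+        # (The edits above bump DBRootCount by two but add DBRoot2 and
+        # DBRoot10, deliberately leaving a gap in the numbering -- presumably
+        # so the rebalance logic also gets exercised against non-contiguous
+        # dbroot IDs.)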
+ # Total verification will take too long to code up right now. + # Do eyeball verification for now. + nc = NodeConfig() + root = nc.get_current_config_root(self.tmp_files[1]) + node = root.find("./PMS2/IPAddr") + self.assertEqual(node.text, self.NEW_NODE_NAME) + + hostname = socket.gethostname() + # awesome, I saw dbroots 1 and 10 get assigned to node 1, + # and dbroot 2 assigned to node 2 + # now, remove node 1 (hostname) and see what we get + + node_manipulation.remove_node( + hostname, self.tmp_files[1], self.tmp_files[2], + test_mode=True + ) + + def test_add_dbroot(self): + self.tmp_files = ( + './dbroot-test0.xml', './dbroot-test1.xml', './dbroot-test2.xml', + './dbroot-test3.xml', './dbroot-test4.xml' + ) + # add a dbroot, verify it exists + + id = node_manipulation.add_dbroot( + tmp_mcs_config_filename, self.tmp_files[0] + ) + self.assertEqual(id, 2) + nc = NodeConfig() + root = nc.get_current_config_root(self.tmp_files[0]) + self.assertEqual(2, int(root.find('./SystemConfig/DBRootCount').text)) + self.assertEqual( + f'{MCS_DATA_PATH}/data2', + root.find('./SystemConfig/DBRoot2').text + ) + + # add a node, verify we can add a dbroot to each of them + hostname = socket.gethostname() + node_manipulation.add_node( + hostname, tmp_mcs_config_filename, self.tmp_files[1] + ) + node_manipulation.add_node( + self.NEW_NODE_NAME, self.tmp_files[1], self.tmp_files[2] + ) + id1 = node_manipulation.add_dbroot( + self.tmp_files[2], self.tmp_files[3], host=self.NEW_NODE_NAME + ) + id2 = node_manipulation.add_dbroot( + self.tmp_files[3], self.tmp_files[4], host=hostname + ) + self.assertEqual(id1, 2) + self.assertEqual(id2, 3) + + root = nc.get_current_config_root(self.tmp_files[4]) + dbroot_count1 = int( + root.find('./SystemModuleConfig/ModuleDBRootCount1-3').text + ) + dbroot_count2 = int( + root.find('./SystemModuleConfig/ModuleDBRootCount2-3').text + ) + self.assertEqual(dbroot_count1 + dbroot_count2, 3) + + unique_dbroots = set() + for i in range(1, dbroot_count1 + 1): + unique_dbroots.add(int( + root.find(f'./SystemModuleConfig/ModuleDBRootID1-{i}-3').text) + ) + for i in range(1, dbroot_count2 + 1): + unique_dbroots.add(int( + root.find(f'./SystemModuleConfig/ModuleDBRootID2-{i}-3').text) + ) + + self.assertEqual(list(unique_dbroots), [1, 2, 3]) + + def test_change_primary_node(self): + # add a node, make it the primary, verify expected result + self.tmp_files = ('./primary-node0.xml', './primary-node1.xml') + node_manipulation.add_node( + self.NEW_NODE_NAME, tmp_mcs_config_filename, self.tmp_files[0] + ) + node_manipulation.move_primary_node( + self.tmp_files[0], self.tmp_files[1] + ) + + root = NodeConfig().get_current_config_root(self.tmp_files[1]) + + self.assertEqual( + root.find('./ExeMgr1/IPAddr').text, self.NEW_NODE_NAME + ) + self.assertEqual( + root.find('./DMLProc/IPAddr').text, self.NEW_NODE_NAME + ) + self.assertEqual( + root.find('./DDLProc/IPAddr').text, self.NEW_NODE_NAME + ) + # This version doesn't support IPv6 + dbrm_controller_ip = root.find("./DBRM_Controller/IPAddr").text + self.assertEqual(dbrm_controller_ip, self.NEW_NODE_NAME) + self.assertEqual(root.find('./PrimaryNode').text, self.NEW_NODE_NAME) + + def test_unassign_dbroot1(self): + self.tmp_files = ( + './tud-0.xml', './tud-1.xml', './tud-2.xml', './tud-3.xml', + ) + node_manipulation.add_node( + self.NEW_NODE_NAME, tmp_mcs_config_filename, self.tmp_files[0] + ) + root = NodeConfig().get_current_config_root(self.tmp_files[0]) + (name, addr) = node_manipulation.find_dbroot1(root) + self.assertEqual(name, 
self.NEW_NODE_NAME) + + # add a second node and more dbroots to make the test slightly more robust + node_manipulation.add_node( + socket.gethostname(), self.tmp_files[0], self.tmp_files[1] + ) + node_manipulation.add_dbroot( + self.tmp_files[1], self.tmp_files[2], socket.gethostname() + ) + node_manipulation.add_dbroot( + self.tmp_files[2], self.tmp_files[3], self.NEW_NODE_NAME + ) + + root = NodeConfig().get_current_config_root(self.tmp_files[3]) + (name, addr) = node_manipulation.find_dbroot1(root) + self.assertEqual(name, self.NEW_NODE_NAME) + + node_manipulation.unassign_dbroot1(root) + caught_it = False + try: + node_manipulation.find_dbroot1(root) + except node_manipulation.NodeNotFoundException: + caught_it = True + + self.assertTrue(caught_it) diff --git a/cmapi/cmapi_server/test/test_server.py b/cmapi/cmapi_server/test/test_server.py new file mode 100644 index 000000000..f7eb5e4fd --- /dev/null +++ b/cmapi/cmapi_server/test/test_server.py @@ -0,0 +1,388 @@ +import logging +from datetime import datetime +from pathlib import Path + +import requests + +from cmapi_server.controllers.dispatcher import _version +from cmapi_server.test.unittest_global import BaseServerTestCase +from mcs_node_control.models.dbrm import DBRM + + +logging.basicConfig(level='DEBUG') +requests.urllib3.disable_warnings() + + +class ConfigTestCase(BaseServerTestCase): + URL = f'https://localhost:8640/cmapi/{_version}/node/config' + + def test_config(self): + for msg, headers, status_code in self.TEST_PARAMS: + with self.subTest( + msg=msg, headers=headers, status_code=status_code + ): + r = requests.get(self.URL, verify=False, headers=headers) + self.assertEqual(r.status_code, status_code) + + +class StatusTestCase(BaseServerTestCase): + URL = f'https://localhost:8640/cmapi/{_version}/node/status' + + def test_status(self): + for msg, headers, status_code in self.TEST_PARAMS: + with self.subTest( + msg=msg, headers=headers, status_code=status_code + ): + r = requests.get(self.URL, verify=False, headers=headers) + self.assertEqual(r.status_code, status_code) + + +class BeginTestCase(BaseServerTestCase): + URL = f'https://localhost:8640/cmapi/{_version}/node/begin' + + def test_wrong_content_type(self): + r = requests.put(self.URL, verify=False, headers=self.HEADERS) + self.assertEqual(r.status_code, 415) + + def test_no_timeout(self): + body = {'id': 42} + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 422) + self.assertEqual(r.json(), {'error': 'id or timeout is not set.'}) + + def test_no_auth(self): + body = {'id': 42, 'timeout': 300} + r = requests.put( + self.URL, verify=False, headers=self.NO_AUTH_HEADERS, json=body + ) + self.assertEqual(r.status_code, 401) + + def test_ok(self): + txn_id_local = 42 + txn_timeout = 300 + txn_timeout_local = 300 + int(datetime.now().timestamp()) + body = {'id': txn_id_local, 'timeout': txn_timeout} + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 200) + txn_section = self.app.config.get('txn', None) + self.assertTrue(txn_section is not None) + txn_id = txn_section.get('id', None) + txn_timeout = txn_section.get('timeout', None) + txn_manager_address = txn_section.get('manager_address', None) + txn_config_changed = txn_section.get('config_changed', None) + txn = [txn_id, txn_timeout, txn_manager_address, txn_config_changed] + self.assertTrue(None not in txn) + self.assertTrue(txn_id == txn_id_local) + self.assertTrue(txn_timeout - 
txn_timeout_local <= 2) + + def test_multiple_begin(self): + txn_id_local = 42 + txn_timeout = 300 + body = {'id': txn_id_local, 'timeout': txn_timeout} + _ = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 422) + self.assertEqual( + r.json(), {'error': 'There is an active operation.'} + ) + + +class CommitTestCase(BaseServerTestCase): + URL = f'https://localhost:8640/cmapi/{_version}/node/commit' + + def test_wrong_content_type(self): + r = requests.put(self.URL, verify=False, headers=self.HEADERS) + self.assertEqual(r.status_code, 415) + + def test_no_operation(self): + body = {'id': 42} + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 422) + self.assertEqual(r.json(), {'error': 'No operation to commit.'}) + + def test_begin_and_commit(self): + txn_timeout = 300 + txn_id = 42 + body = {'id': txn_id, 'timeout': txn_timeout} + r = requests.put( + BeginTestCase.URL, verify=False, headers=self.HEADERS, json=body + ) + txn_section = self.app.config.get('txn', None) + self.assertTrue(txn_section is not None) + self.assertEqual(r.status_code, 200) + body = {'id': 42} + r = requests.put(self.URL, verify=False, headers=self.HEADERS, json=body) + self.assertEqual(r.status_code, 200) + txn_id = txn_section.get('id', None) + txn_timeout = txn_section.get('timeout', None) + txn_manager_address = txn_section.get('manager_address', None) + txn_config_changed = txn_section.get('config_changed', None) + self.assertTrue(txn_id == 0) + self.assertEqual(txn_timeout, 0) + self.assertEqual(txn_manager_address, '') + self.assertFalse(txn_config_changed) + + def test_multiple_commit(self): + body = {'id': 42} + _ = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 422) + + +class RollbackTestCase(BaseServerTestCase): + URL = f"https://localhost:8640/cmapi/{_version}/node/rollback" + + def test_wrong_content_type(self): + r = requests.put(self.URL, verify=False, headers=self.HEADERS) + self.assertEqual(r.status_code, 415) + + def test_no_operation(self): + body = {'id': 42} + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 422) + self.assertEqual(r.json(), {'error': 'No operation to rollback.'}) + + def test_begin_and_rollback(self): + txn_timeout = 300 + txn_id = 42 + body = {'id': txn_id, 'timeout': txn_timeout} + r = requests.put( + BeginTestCase.URL, verify=False, headers=self.HEADERS, json=body + ) + txn_section = self.app.config.get('txn', None) + self.assertTrue(txn_section is not None) + self.assertEqual(r.status_code, 200) + body = {'id': 42} + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 200) + txn_id = txn_section.get('id', None) + txn_timeout = txn_section.get('timeout', None) + txn_manager_address = txn_section.get('manager_address', None) + txn_config_changed = txn_section.get('config_changed', None) + self.assertTrue(txn_id == 0) + self.assertEqual(txn_timeout, 0) + self.assertEqual(txn_manager_address, '') + self.assertFalse(txn_config_changed) + + def test_no_operation_again(self): + body = {'id': 42} + _ = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + r = requests.put( + self.URL, 
verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 422)
+
+
+class ConfigPutTestCase(BaseServerTestCase):
+    URL = ConfigTestCase.URL
+
+    def setUp(self):
+        if 'skip_setUp' not in self.shortDescription():
+            body = {'id': 42, 'timeout': 42}
+            _ = requests.put(
+                BeginTestCase.URL,
+                verify=False, headers=self.HEADERS, json=body
+            )
+        return super().setUp()
+
+    def tearDown(self):
+        body = {'id': 42}
+        _ = requests.put(
+            RollbackTestCase.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        return super().tearDown()
+
+    def test_wrong_content_type(self):
+        """Test wrong Content-Type."""
+        r = requests.put(self.URL, verify=False, headers=self.HEADERS)
+        self.assertEqual(r.status_code, 415)
+
+    def test_no_active_operation(self):
+        """Test no active operation. skip_setUp"""
+        body = {
+            'revision': 42,
+            'manager': '1.1.1.1',
+            'timeout': 42,
+            'config': "...",
+            'mcs_config_filename': self.mcs_config_filename
+        }
+
+        r = requests.put(
+            self.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 422)
+        self.assertEqual(
+            r.json(), {'error': 'PUT /config called outside of an operation.'}
+        )
+
+    def test_no_mandatory_attributes(self):
+        """Test no mandatory attributes. skip_setUp"""
+        body = {'id': 42, 'timeout': 42}
+        r = requests.put(
+            BeginTestCase.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 200)
+        body = {
+            'revision': 42,
+            'timeout': 42,
+            'config': "...",
+            'mcs_config_filename': self.mcs_config_filename
+        }
+        r = requests.put(
+            self.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 422)
+        self.assertEqual(
+            r.json(), {'error': 'Mandatory attribute is missing.'}
+        )
+        body = {
+            'manager': '1.1.1.1',
+            'revision': 42,
+            'config': "...",
+            'mcs_config_filename': self.mcs_config_filename
+        }
+        r = requests.put(
+            self.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 422)
+        self.assertEqual(
+            r.json(), {'error': 'Mandatory attribute is missing.'}
+        )
+        body = {
+            'manager': '1.1.1.1',
+            'revision': 42,
+            'timeout': 42,
+            'mcs_config_filename': self.mcs_config_filename
+        }
+        r = requests.put(
+            self.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 422)
+        self.assertEqual(
+            r.json(), {'error': 'Mandatory attribute is missing.'}
+        )
+
+    def test_no_auth(self):
+        """Test no auth."""
+        body = {
+            'revision': 42,
+            'manager': '1.1.1.1',
+            'timeout': 42,
+            'config': "...",
+            'mcs_config_filename': self.mcs_config_filename
+        }
+        r = requests.put(
+            self.URL, verify=False, headers=self.NO_AUTH_HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 401)
+
+    def test_send_rollback(self):
+        """Test send rollback."""
+        body = {'id': 42}
+        r = requests.put(
+            RollbackTestCase.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 200)
+
+    def test_wrong_cluster_mode(self):
+        """Test wrong cluster mode."""
+        body = {
+            'revision': 42,
+            'manager': '1.1.1.1',
+            'timeout': 42,
+            'cluster_mode': 'somemode',
+            'mcs_config_filename': self.mcs_config_filename
+        }
+        r = requests.put(
+            self.URL, verify=False, headers=self.HEADERS, json=body
+        )
+        self.assertEqual(r.status_code, 422)
+        self.assertTrue(
+            "Error occured setting cluster" in r.content.decode('ASCII')
+        )
+
+    def test_set_mode(self):
+        """Test set mode."""
+        mode = 'readwrite'
+        body = {
+            'revision': 42,
+            'manager': '1.1.1.1',
+            'timeout': 42,
+            'cluster_mode': mode,
'mcs_config_filename': self.mcs_config_filename + } + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + # DBRM controller must be up and running + self.assertEqual(r.status_code, 200) + r = requests.get( + StatusTestCase.URL, verify=False, headers=self.HEADERS + ) + self.assertEqual(r.status_code, 200) + + fake_mode = mode + with DBRM() as dbrm: + if dbrm.get_dbrm_status() != 'master': + fake_mode = 'readonly' + self.assertEqual(r.json()['cluster_mode'], fake_mode) + self.assertEqual(dbrm._get_cluster_mode(), mode) + + def test_apply_config(self): + """Test apply config.""" + body = {'id': 42, 'timeout': 42} + _ = requests.put( + BeginTestCase.URL, + verify=False, headers=self.HEADERS, json=body + ) + config_file = Path(self.mcs_config_filename) + config = config_file.read_text() + body = { + 'revision': 42, + 'manager': '1.1.1.1', + 'timeout': 15, + 'config': config, + 'mcs_config_filename': self.mcs_config_filename + } + r = requests.put( + self.URL, verify=False, headers=self.HEADERS, json=body + ) + self.assertEqual(r.status_code, 200) + txn_section = self.app.config.get('txn', None) + self.assertTrue(txn_section is not None) + txn_config_changed = txn_section.get('config_changed', None) + self.assertEqual(txn_config_changed, True) + r = requests.get( + ConfigTestCase.URL, verify=False, headers=self.HEADERS + ) + self.assertEqual(r.status_code, 200) + # commenting this out until we get global config + # self.assertEqual(r.json()['config'], config) + + +class PrimaryTestCase(BaseServerTestCase): + URL = f'https://localhost:8640/cmapi/{_version}/node/primary' + + def test_is_primary(self): + r = requests.get(self.URL, verify=False) + self.assertEqual(r.status_code, 200) diff --git a/cmapi/cmapi_server/test/test_txns.py b/cmapi/cmapi_server/test/test_txns.py new file mode 100644 index 000000000..a2f48f79c --- /dev/null +++ b/cmapi/cmapi_server/test/test_txns.py @@ -0,0 +1,160 @@ +import cherrypy +import unittest +import os +import socket +from shutil import copyfile +from contextlib import contextmanager + +from cmapi_server import helpers, node_manipulation +from mcs_node_control.models.node_config import NodeConfig +from cmapi_server.controllers.dispatcher import dispatcher, jsonify_error +from cmapi_server.test.unittest_global import create_self_signed_certificate, \ + cert_filename, mcs_config_filename, cmapi_config_filename, \ + tmp_mcs_config_filename, tmp_cmapi_config_filename + + +@contextmanager +def start_server(): + if not os.path.exists(cert_filename): + create_self_signed_certificate() + + app = cherrypy.tree.mount(root = None, config = cmapi_config_filename) + app.config.update({ + '/': { + 'request.dispatch': dispatcher, + 'error_page.default': jsonify_error, + }, + 'config': { + 'path': cmapi_config_filename, + }, + }) + cherrypy.config.update(cmapi_config_filename) + + cherrypy.engine.start() + cherrypy.engine.wait(cherrypy.engine.states.STARTED) + yield + + cherrypy.engine.exit() + cherrypy.engine.block() + +class TestTransactions(unittest.TestCase): + def setUp(self): + if not os.path.exists(tmp_mcs_config_filename): + f = open(tmp_mcs_config_filename, 'x') + f.close() + copyfile(mcs_config_filename, tmp_mcs_config_filename) + + def tearDown(self): + if os.path.exists(tmp_mcs_config_filename): + copyfile(tmp_mcs_config_filename, mcs_config_filename) + os.remove(tmp_mcs_config_filename) + + @classmethod + def setUpClass(cls): + if not os.path.exists(tmp_cmapi_config_filename): + f = open(tmp_cmapi_config_filename, 'x') + f.close() + 
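+        # setUpClass/tearDownClass snapshot and restore the CMAPI server
+        # config once per class; setUp/tearDown do the same per test for
+        # the MCS config, so every transaction test starts from a pristine
+        # pair of configs.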
copyfile(cmapi_config_filename, tmp_cmapi_config_filename) + + @classmethod + def tearDownClass(cls): + if os.path.exists(tmp_cmapi_config_filename): + copyfile(tmp_cmapi_config_filename, cmapi_config_filename) + os.remove(tmp_cmapi_config_filename) + + def test_start_commit(self): + print(" ******** Running TestTransactions.test_start_commit()") + with start_server(): + try: + hostname = socket.gethostname() + myaddr = socket.gethostbyname(hostname) + node_manipulation.add_node( + myaddr, mcs_config_filename, mcs_config_filename + ) + result = helpers.start_transaction( + cmapi_config_filename, mcs_config_filename, + optional_nodes = [myaddr] + ) + self.assertTrue(result[0]) + self.assertEqual(len(result[2]), 1) + self.assertEqual(result[2][0], myaddr) + helpers.commit_transaction(result[1], cmapi_config_filename, mcs_config_filename, nodes = result[2]) + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + def test_start_rollback(self): + print(" ******** Running TestTransactions.test_start_rollback()") + with start_server(): + try: + hostname = socket.gethostname() + myaddr = socket.gethostbyname(hostname) + node_manipulation.add_node( + myaddr, mcs_config_filename, mcs_config_filename + ) + result = helpers.start_transaction( + cmapi_config_filename, mcs_config_filename, + optional_nodes = [myaddr] + ) + self.assertTrue(result[0]) + self.assertEqual(len(result[2]), 1) + self.assertEqual(result[2][0], myaddr) + helpers.rollback_transaction(result[1], cmapi_config_filename, mcs_config_filename) # not specifying nodes here to exercise the nodes = None path + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + def test_broadcast_new_config(self): + print(" ******** Running TestTransactions.test_broadcast_new_config()") + with start_server(): + try: + myaddr = socket.gethostbyname(socket.gethostname()) + node_manipulation.add_node(myaddr, mcs_config_filename, mcs_config_filename) + + # Note, 1.2.3.4 is intentional -> doesn't exist, so shouldn't end up in the node list returned + print("\n\nNOTE! 
This is expected to pause here for ~10s, this isn't an error, yet.\n") + result = helpers.start_transaction( + cmapi_config_filename, mcs_config_filename, + optional_nodes = ['1.2.3.4'] + ) + self.assertTrue(result[0]) + self.assertEqual(len(result[2]), 1) + self.assertEqual(result[2][0], myaddr) + success = helpers.broadcast_new_config( + mcs_config_filename, + cmapi_config_filename=cmapi_config_filename, + test_mode=True, + nodes = result[2] + ) + # not specifying nodes here to exercise the nodes = None path + helpers.commit_transaction( + result[1], cmapi_config_filename, mcs_config_filename + ) + self.assertTrue(success) + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + def test_update_rev_and_manager(self): + print(" ******** Running TestTransactions.test_update_rev_and_manager()") + with start_server(): + try: + + myaddr = socket.gethostbyname(socket.gethostname()) + node_manipulation.add_node( + myaddr, mcs_config_filename, mcs_config_filename + ) + helpers.update_revision_and_manager(mcs_config_filename, "./update_rev1.xml") + nc = NodeConfig() + root = nc.get_current_config_root("./update_rev1.xml") + self.assertEqual(root.find("./ConfigRevision").text, "2") + self.assertEqual(root.find("./ClusterManager").text, socket.gethostbyname(socket.gethostname())) + except: + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + os.remove("./update_rev1.xml") diff --git a/cmapi/cmapi_server/test/unittest_global.py b/cmapi/cmapi_server/test/unittest_global.py new file mode 100644 index 000000000..8e1fad672 --- /dev/null +++ b/cmapi/cmapi_server/test/unittest_global.py @@ -0,0 +1,203 @@ +import logging +import os +import unittest +from contextlib import contextmanager +from datetime import datetime, timedelta +from shutil import copyfile +from tempfile import TemporaryDirectory + +import cherrypy +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography import x509 +from cryptography.x509.oid import NameOID +from cryptography.hazmat.primitives import hashes + +from cmapi_server import helpers +from cmapi_server.constants import CMAPI_CONF_PATH +from cmapi_server.controllers.dispatcher import dispatcher, jsonify_error +from cmapi_server.managers.process import MCSProcessManager + + +TEST_API_KEY = 'somekey123' +cert_filename = './cmapi_server/self-signed.crt' +MCS_CONFIG_FILEPATH = '/etc/columnstore/Columnstore.xml' +COPY_MCS_CONFIG_FILEPATH = './cmapi_server/test/original_Columnstore.xml' +TEST_MCS_CONFIG_FILEPATH = './cmapi_server/test/CS-config-test.xml' +# TODO: +# - rename after fix in all places +# - fix path to abs +mcs_config_filename = './cmapi_server/test/CS-config-test.xml' +tmp_mcs_config_filename = './cmapi_server/test/tmp.xml' +cmapi_config_filename = './cmapi_server/cmapi_server.conf' +tmp_cmapi_config_filename = './cmapi_server/test/tmp.conf' +# constants for process dispatchers +DDL_SERVICE = 'mcs-ddlproc' +CONTROLLERNODE_SERVICE = 'mcs-controllernode.service' +UNKNOWN_SERVICE = 'unknown_service' +SYSTEMCTL = 'sudo systemctl' + + +logging.basicConfig(level=logging.DEBUG) + + +def create_self_signed_certificate(): + key_filename = './cmapi_server/self-signed.key' + + key = rsa.generate_private_key( + public_exponent=65537, + key_size=2048, + backend=default_backend() + ) + + with open(key_filename, "wb") as f: + f.write(key.private_bytes( + encoding=serialization.Encoding.PEM, + 
format=serialization.PrivateFormat.TraditionalOpenSSL,
+            encryption_algorithm=serialization.NoEncryption()),
+        )
+
+    subject = issuer = x509.Name([
+        x509.NameAttribute(NameOID.COUNTRY_NAME, u"US"),
+        x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, u"California"),
+        x509.NameAttribute(NameOID.LOCALITY_NAME, u"Redwood City"),
+        x509.NameAttribute(NameOID.ORGANIZATION_NAME, u"MariaDB"),
+        x509.NameAttribute(NameOID.COMMON_NAME, u"mariadb.com"),
+    ])
+
+    basic_constraints = x509.BasicConstraints(ca=True, path_length=0)
+
+    cert = x509.CertificateBuilder(
+    ).subject_name(
+        subject
+    ).issuer_name(
+        issuer
+    ).public_key(
+        key.public_key()
+    ).serial_number(
+        x509.random_serial_number()
+    ).not_valid_before(
+        datetime.utcnow()
+    ).not_valid_after(
+        datetime.utcnow() + timedelta(days=365)
+    ).add_extension(
+        basic_constraints,
+        False
+    ).add_extension(
+        x509.SubjectAlternativeName([x509.DNSName(u"localhost")]),
+        critical=False
+    ).sign(key, hashes.SHA256(), default_backend())
+
+    with open(cert_filename, "wb") as f:
+        f.write(cert.public_bytes(serialization.Encoding.PEM))
+
+
+def run_detect_processes():
+    cfg_parser = helpers.get_config_parser(CMAPI_CONF_PATH)
+    d_name, d_path = helpers.get_dispatcher_name_and_path(cfg_parser)
+    MCSProcessManager.detect(d_name, d_path)
+
+
+@contextmanager
+def run_server():
+    if not os.path.exists(cert_filename):
+        create_self_signed_certificate()
+
+    cherrypy.engine.start()
+    cherrypy.engine.wait(cherrypy.engine.states.STARTED)
+    run_detect_processes()  # TODO: move this; detection slows each test by ~5s
+    yield
+
+    cherrypy.engine.exit()
+    cherrypy.engine.block()
+
+
+class BaseServerTestCase(unittest.TestCase):
+    HEADERS = {'x-api-key': TEST_API_KEY}
+    NO_AUTH_HEADERS = {'x-api-key': None}
+    TEST_PARAMS = (
+        ('auth ok', HEADERS, 200),
+        ('no auth', NO_AUTH_HEADERS, 401)
+    )
+
+    def run(self, result=None):
+        with TemporaryDirectory() as tmp_dir:
+            self.tmp_dir = tmp_dir
+            self.cmapi_config_filename = os.path.join(
+                tmp_dir, 'tmp_cmapi_config.conf'
+            )
+            self.mcs_config_filename = os.path.join(
+                tmp_dir, 'tmp_mcs_config.xml'
+            )
+            copyfile(cmapi_config_filename, self.cmapi_config_filename)
+            copyfile(TEST_MCS_CONFIG_FILEPATH, self.mcs_config_filename)
+            self.app = cherrypy.tree.mount(
+                root=None, config=self.cmapi_config_filename
+            )
+            self.app.config.update({
+                '/': {
+                    'request.dispatch': dispatcher,
+                    'error_page.default': jsonify_error,
+                },
+                'config': {
+                    'path': self.cmapi_config_filename,
+                },
+                'Authentication': self.HEADERS
+            })
+            cherrypy.config.update(self.cmapi_config_filename)
+
+            with run_server():
+                return super().run(result=result)
+
+
+class BaseNodeManipTestCase(unittest.TestCase):
+    NEW_NODE_NAME = 'mysql.com'  # something that has a DNS entry everywhere
+
+    def setUp(self):
+        self.tmp_files = []
+        copyfile(TEST_MCS_CONFIG_FILEPATH, tmp_mcs_config_filename)
+
+    def tearDown(self):
+        for tmp_file in self.tmp_files:
+            if os.path.exists(tmp_file):
+                os.remove(tmp_file)
+        if os.path.exists(tmp_mcs_config_filename):
+            os.remove(tmp_mcs_config_filename)
+
+
+class BaseProcessDispatcherCase(unittest.TestCase):
+    node_started = None
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        run_detect_processes()
+        cls.node_started = len(MCSProcessManager.get_running_mcs_procs()) != 0
+        return super().setUpClass()
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        if (len(MCSProcessManager.get_running_mcs_procs()) != 0) == cls.node_started:
+            return super().tearDownClass()
+        if cls.node_started:
+            MCSProcessManager.start_node(is_primary=True)
+        else:
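+            # the node was stopped before this class ran; put it back that way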
MCSProcessManager.stop_node(is_primary=True)
+        return super().tearDownClass()
+
+    def setUp(self) -> None:
+        if MCSProcessManager.process_dispatcher.is_service_running(
+            CONTROLLERNODE_SERVICE
+        ):
+            self.controller_node_cmd = 'start'
+        else:
+            self.controller_node_cmd = 'stop'
+        # prevent the 'start-limit-hit' systemd error, see MCOL-5186
+        os.system(f'{SYSTEMCTL} reset-failed')
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        os.system(
+            f'{SYSTEMCTL} {self.controller_node_cmd} {CONTROLLERNODE_SERVICE}'
+        )
+        return super().tearDown()
diff --git a/cmapi/conffiles.template b/cmapi/conffiles.template
new file mode 100644
index 000000000..f32c52c47
--- /dev/null
+++ b/cmapi/conffiles.template
@@ -0,0 +1 @@
+${ETC_DIR}/cmapi_server.conf
diff --git a/cmapi/engine_files/mariadb-columnstore-start.sh b/cmapi/engine_files/mariadb-columnstore-start.sh
new file mode 100644
index 000000000..4a8c6b6d0
--- /dev/null
+++ b/cmapi/engine_files/mariadb-columnstore-start.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# This script gracefully starts MCS
+
+/bin/systemctl start mcs-workernode
+/bin/systemctl start mcs-controllernode
+/bin/systemctl start mcs-primproc
+/bin/systemctl start mcs-writeengineserver
+/bin/systemctl start mcs-exemgr
+/bin/systemctl start mcs-dmlproc
+/bin/systemctl start mcs-ddlproc
+
+exit 0
diff --git a/cmapi/engine_files/mariadb-columnstore-stop.sh b/cmapi/engine_files/mariadb-columnstore-stop.sh
new file mode 100644
index 000000000..267a82646
--- /dev/null
+++ b/cmapi/engine_files/mariadb-columnstore-stop.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# This script gracefully shuts down MCS
+
+/bin/systemctl stop mcs-dmlproc
+/bin/systemctl stop mcs-ddlproc
+/bin/systemctl stop mcs-exemgr
+/bin/systemctl stop mcs-writeengineserver
+/bin/systemctl stop mcs-primproc
+/bin/systemctl stop mcs-controllernode
+/bin/systemctl stop mcs-workernode
+/bin/systemctl stop mcs-storagemanager
+
+exit 0
diff --git a/cmapi/engine_files/mcs-loadbrm.py b/cmapi/engine_files/mcs-loadbrm.py
new file mode 100755
index 000000000..762596bae
--- /dev/null
+++ b/cmapi/engine_files/mcs-loadbrm.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+import time
+import configparser
+import os
+import datetime
+
+API_CONFIG_PATH = '/etc/columnstore/cmapi_server.conf'
+BYPASS_SM_PATH = '/tmp/columnstore_tmp_files/rdwrscratch/BRM_saves'
+
+
+def get_key():
+    cmapi_config = configparser.ConfigParser()
+    cmapi_config.read(API_CONFIG_PATH)
+    if 'Authentication' not in cmapi_config.sections():
+        return ''
+    return cmapi_config['Authentication'].get('x-api-key', '')
+
+
+def get_version():
+    return '0.4.0'
+
+
+def get_port():
+    return '8640'
+
+
+if __name__ == '__main__':
+    # allow skipping systemd in container environments
+    use_systemd = True
+    if len(sys.argv) > 1:
+        use_systemd = sys.argv[1] != 'no'
+    sm_config = configparser.ConfigParser()
+
+    sm_config.read('/etc/columnstore/storagemanager.cnf')
+    cs_config = ET.parse('/etc/columnstore/Columnstore.xml')
+    config_root = cs_config.getroot()
+
+    storage = sm_config.get('ObjectStorage', 'service')
+    if storage is None:
+        storage = 'LocalStorage'
+    bucket = sm_config.get('S3', 'bucket')
+    if bucket is None:
+        bucket = 'some_bucket'
+
+    dbrmroot = config_root.find('./SystemConfig/DBRMRoot').text
+    pmCount = int(config_root.find('./SystemModuleConfig/ModuleCount3').text)
+    loadbrm = '/usr/bin/load_brm'
+
+    brm_saves_current = ''
+
+    if storage.lower() == 's3' and not
bucket.lower() == 'some_bucket': + # start SM using systemd + if use_systemd is True: + cmd = 'systemctl start mcs-storagemanager' + retcode = subprocess.call(cmd, shell=True) + if retcode < 0: + print('Failed to start storagemanager. \ +{} exits with {}.'.format(cmd, retcode)) + sys.exit(1) + # delay to allow storagemanager to init + time.sleep(1) + + brm = 'data1/systemFiles/dbrm/BRM_saves_current' + config_root.find('./Installation/DBRootStorageType').text = "StorageManager" + config_root.find('./StorageManager/Enabled').text = "Y" + + if config_root.find('./SystemConfig/DataFilePlugin') is None: + config_root.find('./SystemConfig').append(ET.Element("DataFilePlugin")) + + config_root.find('./SystemConfig/DataFilePlugin').text = "libcloudio.so" + + cs_config.write('/etc/columnstore/Columnstore.xml.loadbrm') + os.replace('/etc/columnstore/Columnstore.xml.loadbrm', '/etc/columnstore/Columnstore.xml') # atomic replacement + + # Single-node on S3 + if storage.lower() == 's3' and not bucket.lower() == 'some_bucket' and pmCount == 1: + try: + print("Running smcat") + brm_saves_current = subprocess.check_output(['smcat', brm]) + except subprocess.CalledProcessError as e: + # will happen when brm file does not exist + print('{} does not exist.'.format(brm), file=sys.stderr) + else: + brm = '{}_current'.format(dbrmroot) + # Multi-node + if pmCount > 1: + try: + import requests + requests.packages.urllib3.disable_warnings() + except ImportError as e: + print('requests Python module does not exist. \ +Please install CMAPI first.', file=sys.stderr) + sys.exit(1) + try: + primary_address = config_root.find('./DBRM_Controller/IPAddr').text + api_key = get_key() + if len(api_key) == 0: + print('Failed to find API key in {}.'.format(API_CONFIG_PATH), \ +file=sys.stderr) + sys.exit(1) + headers = {'x-api-key': api_key} + api_version = get_version() + api_port = get_port() + elems = ['em', 'journal', 'vbbm', 'vss'] + for e in elems: + print("Pulling {} from the primary node.".format(e)) + url = "https://{}:{}/cmapi/{}/node/meta/{}".format(primary_address, \ +api_port, api_version, e) + r = requests.get(url, verify=False, headers=headers, timeout=30) + if (r.status_code != 200): + raise RuntimeError("Error requesting {} from the primary \ +node.".format(e)) + + # To avoid SM storing BRM files + if storage.lower() == 's3' and bucket.lower() != 'some_bucket': + dbrmroot = BYPASS_SM_PATH + + if not os.path.exists(dbrmroot): + os.makedirs(dbrmroot) + + current_name = '{}_{}'.format(dbrmroot, e) + + print ("Saving {} to {}".format(e, current_name)) + path = Path(current_name) + path.write_bytes(r.content) + except Exception as e: + print(str(e)) + print('Failed to load BRM data from the primary \ +node {}.'.format(primary_address), file=sys.stderr) + sys.exit(1) + + brm_saves_current = b"BRM_saves\n" + else: + # load local dbrm + try: + brm_saves_current = subprocess.check_output(['cat', brm]) + except subprocess.CalledProcessError as e: + # will happen when brm file does not exist + print('{} does not exist.'.format(brm), file=sys.stderr) + + if brm_saves_current: + cmd = '{} {}{}'.format(loadbrm, dbrmroot, \ +brm_saves_current.decode("utf-8").replace("BRM_saves", "")) + print(f"{datetime.datetime.now()} : Running {cmd}") + try: + retcode = subprocess.call(cmd, shell=True) + if retcode < 0: + print('{} exits with {}.'.format(cmd, retcode)) + sys.exit(1) + except OSError as e: + sys.exit(1) diff --git a/cmapi/engine_files/mcs-savebrm.py b/cmapi/engine_files/mcs-savebrm.py new file mode 100755 index 
000000000..dfdbcb652
--- /dev/null
+++ b/cmapi/engine_files/mcs-savebrm.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+import configparser
+
+XML_CONFIG_PATH = '/etc/columnstore/Columnstore.xml'
+SM_CONFIG_PATH = '/etc/columnstore/storagemanager.cnf'
+REST_REQUEST_TO = 2
+
+
+def get_version():
+    return '0.4.0'
+
+
+def get_port():
+    return '8640'
+
+
+if __name__ == '__main__':
+    master_addr = ''
+    pm_count = 0
+    try:
+        cs_config = ET.parse(XML_CONFIG_PATH)
+        config_root = cs_config.getroot()
+        master_addr = config_root.find('./DBRM_Controller/IPAddr').text
+        pm_count = int(config_root.find('./SystemModuleConfig/ModuleCount3').text)
+    except (FileNotFoundError, AttributeError, ValueError) as e:
+        print("An exception was raised. Continuing anyway.")
+        print(str(e))
+
+    storage = 'LocalStorage'
+    sm_config = configparser.ConfigParser()
+    files_read = len(sm_config.read(SM_CONFIG_PATH))
+    if files_read == 1:
+        storage = sm_config.get('ObjectStorage', 'service')
+
+    default_addr = '127.0.0.1'
+    savebrm = 'save_brm'
+    is_primary = False
+
+    # For multi-node with local storage or default installations
+    if (storage.lower() != 's3' and master_addr != default_addr) or \
+master_addr == default_addr:
+        is_primary = True
+        print('Local storage or default install detected; assuming primary.')
+    else:
+        has_requests = False
+        try:
+            import requests
+            requests.packages.urllib3.disable_warnings()
+            has_requests = True
+        except ImportError as e:
+            print('requests Python module does not exist. \
+Please install CMAPI first.')
+        if has_requests is True:
+            try:
+                print('Requesting the primary node status.')
+                api_version = get_version()
+                api_port = get_port()
+                url = "https://{}:{}/cmapi/{}/node/primary".format(default_addr, \
+api_port, api_version)
+                resp = requests.get(url,
+                                    verify=False,
+                                    timeout=REST_REQUEST_TO)
+                if (resp.status_code != 200):
+                    print("Error sending GET /node/primary.")
+                else:
+                    is_primary = resp.json()['is_primary'] == 'True'
+            except Exception as e:
+                print('Failed to request.')
+                print(str(e))
+
+    if is_primary is True:
+        try:
+            retcode = subprocess.call(savebrm, shell=True)
+            if retcode < 0:
+                print('{} exits with {}.'.format(savebrm, retcode))
+                sys.exit(0)
+        except OSError as e:
+            print(str(e))
+            sys.exit(0)
+
+    sys.exit(0)
diff --git a/cmapi/engine_files/mcs-workernode.service b/cmapi/engine_files/mcs-workernode.service
new file mode 100644
index 000000000..959b19870
--- /dev/null
+++ b/cmapi/engine_files/mcs-workernode.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=mcs-workernode
+After=mcs-loadbrm.service
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/workernode DBRM_Worker1 fg
+Restart=on-failure
+ExecStop=/usr/bin/env bash -c "kill -15 $MAINPID"
+ExecStopPost=-/usr/bin/mcs-savebrm.py
+ExecStopPost=/usr/bin/env bash -c "clearShm > /dev/null 2>&1"
+TimeoutStopSec=120
+EnvironmentFile=-/etc/columnstore/systemd.env
diff --git a/cmapi/failover/__init__.py b/cmapi/failover/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cmapi/failover/agent_comm.py b/cmapi/failover/agent_comm.py
new file mode 100644
index 000000000..d9e169138
--- /dev/null
+++ b/cmapi/failover/agent_comm.py
@@ -0,0 +1,266 @@
+# this module handles the comm with the agent, whatever that ends up being
+
+import datetime
+import logging
+import threading
+import time
+
+
+logger = logging.getLogger('agent_comm')
+
+
+# First an agent base class
+class AgentBase:
+
+    def activateNodes(self, nodes):
+        print("AgentBase: Got activateNodes({})".format(nodes))
+
deactivateNodes(self, nodes): + print("AgentBase: Got deactivateNodes({})".format(nodes)) + + def movePrimaryNode(self, placeholder): + print("AgentBase: Got movePrimaryNode()") + + def enterStandbyMode(self): + print("AgentBase: Got enterStandbyMode()") + + def getNodeHealth(self): + print("AgentBase: Got getNodeHealth()") + return 0 + + def raiseAlarm(self, msg): + print("AgentBase: Got raiseAlarm({})".format(msg)) + + def startTransaction(self, extra_nodes = [], remove_nodes = []): + print(f"AgentBase: Got startTransaction, extra_nodes={extra_nodes}, remove_nodes={remove_nodes}") + # return a (txn id, nodes) pair; AgentComm's runner unpacks it + return (0, []) + + def commitTransaction(self, txnid, nodes): + print("AgentBase: Got commitTransaction") + + def rollbackTransaction(self, txnid, nodes): + print("AgentBase: Got rollbackTransaction") + + + +class OpAndArgs: + name = None # a callable in AgentBase + args = None # a tuple containing the args for the callable + + def __init__(self, name, *args): + self.name = name + self.args = args + + def __str__(self): + return f"{str(self.name.__qualname__)}{str(self.args)}" + + def __hash__(self): + return hash((self.name.__qualname__, str(self.args))) + + def __eq__(self, other): + return self.name == other.name and self.args == other.args + + def __ne__(self, other): + return not self.__eq__(other) + + def run(self): + self.name(*self.args) + + +# The AgentComm class +# Doesn't do anything but pass along events to the Agent yet +# TODO: implement an event queue and a thread to pluck events and issue them +# to the agent. Done? +# TODO: de-dup events as they come in from the node monitor, +# add to the event queue +# TODO: rewrite using builtin Queue class +class AgentComm: + + def __init__(self, agent = None): + if agent is None: + self._agent = AgentBase() + else: + self._agent = agent + + # deduper contains queue contents, events in progress, and finished + # events up to 10s after they finished + self._deduper = {} + self._die = False + self._queue = [] + self._mutex = threading.Lock() + self._thread = None + + def __del__(self): + self.die() + + def start(self): + self._die = False + self._thread = threading.Thread(target=self._runner, name='AgentComm') + self._thread.start() + + # TODO: rename to stop + def die(self): + self._die = True + self._thread.join() + + # returns (len-of-event-queue, len-of-deduper) + def getQueueSize(self): + self._mutex.acquire() + ret = (len(self._queue), len(self._deduper)) + self._mutex.release() + return ret + + def activateNodes(self, nodes): + self._addEvent(self._agent.activateNodes, (nodes)) + + def deactivateNodes(self, nodes): + self._addEvent(self._agent.deactivateNodes, (nodes)) + + def movePrimaryNode(self): + self._addEvent(self._agent.movePrimaryNode, ()) + + def enterStandbyMode(self): + # The other events are moot if this node has to enter standby mode + self._mutex.acquire() + op = OpAndArgs(self._agent.enterStandbyMode, ()) + self._queue = [ op ] + self._deduper = { op : datetime.datetime.now() } + self._mutex.release() + + def getNodeHealth(self): + return self._agent.getNodeHealth() + + def raiseAlarm(self, msg): + self._agent.raiseAlarm(msg) + + def _addEvent(self, name, args): + """Interface to the event queue.""" + op = OpAndArgs(name, args) + + self._mutex.acquire() + if op not in self._deduper: + self._deduper[op] = None + self._queue.append(op) + self._mutex.release() + + def _getEvents(self): + """ + This gets all queued events at once and prunes events older than + 10 seconds from the deduper.
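+ Illustrative timing, assuming the 10-second window used below: an op + that finished at t=0 still blocks a duplicate arriving at t=5, but it is + pruned on the first _getEvents() call after t=10, so the same op can be + queued again after that.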
+ """ + self._mutex.acquire() + ret = self._queue + self._queue = [] + + # prune events that finished more than 10 secs ago from the deduper + tenSecsAgo = datetime.datetime.now() - datetime.timedelta(seconds = 10) + for (op, finishTime) in list(self._deduper.items()): + if finishTime is not None and finishTime < tenSecsAgo: + del self._deduper[op] + + self._mutex.release() + return ret + + def _requeueEvents(self, events): + self._mutex.acquire() + # events has commands issued before what is currently in _queue + events.extend(self._queue) + self._queue = events + self._mutex.release() + + def _markEventsFinished(self, events): + self._mutex.acquire() + now = datetime.datetime.now() + for event in events: + self._deduper[event] = now + self._mutex.release() + + def _runner(self): + while not self._die: + try: + self.__runner() + except Exception: + logger.error( + 'AgentComm.runner(): got an unrecognised exception.', + exc_info=True + ) + if not self._die: + time.sleep(1) + logger.info('AgentComm.runner() exiting normally...') + + def __runner(self): + while not self._die: + events = self._getEvents() + logger.trace(f'Get events from queue "{events}".') + if len(events) == 0: + time.sleep(5) + continue + + nextPollTime = datetime.datetime.now() + datetime.timedelta(seconds = 5) + + nodes_added = set() + nodes_removed = set() + + # scan the list of events, put together the extra_nodes and remove_nodes parameters to + # startTransaction(). Note, we could consolidate the activate / deactivate calls here, + # but that's a minor optimization not worth doing yet. + needs_transaction = False + for event in events: # TODO: combine with loop below. + #print(f"got event: {event}") + + # determine whether we need a transaction at all. + # List the fcns that require a txn here. + if not needs_transaction and event.name in ( + self._agent.activateNodes, + self._agent.deactivateNodes, + self._agent.movePrimaryNode): + needs_transaction = True + + if event.name == self._agent.activateNodes: + nodes = event.args[0] + for node in nodes: + nodes_added.add(node) + elif event.name == self._agent.deactivateNodes: + nodes = event.args[0] + for node in nodes: + nodes_removed.add(node) + + if needs_transaction: + logger.debug( + 'Failover starts transaction to run upcoming event.' + ) + (txn_id, nodes) = self._agent.startTransaction( + extra_nodes=list(nodes_added), + remove_nodes=list(nodes_removed) + ) + + # The problem with this is that it's all-or-nothing + # It would be preferable to commit what has been done up to the point of failure + # and discard the event that failed. + # If the problem is with the event itself, then it may keep happening and block all + # progress. 
+ try: + for event in events: + #print(f"Running {event}") + event.run() + except Exception as e: + logger.error( + 'AgentComm.runner(): got an unrecognised exception.', + exc_info=True + ) + if needs_transaction: + logger.warning( + f'Aborting transaction {txn_id}', + exc_info=True + ) + self._agent.rollbackTransaction(txn_id, nodes=nodes) + # on failure, requeue the events in this batch to pick them up + # again on the next iteration + self._requeueEvents(events) + else: + if needs_transaction: + self._agent.commitTransaction(txn_id, nodes = nodes) + self._markEventsFinished(events) + finishTime = datetime.datetime.now() + if nextPollTime > finishTime: + time.sleep((nextPollTime - finishTime).seconds) diff --git a/cmapi/failover/config.py b/cmapi/failover/config.py new file mode 100644 index 000000000..0e87a4137 --- /dev/null +++ b/cmapi/failover/config.py @@ -0,0 +1,177 @@ +import configparser +import logging +import threading +from os.path import getmtime + +from cmapi_server.constants import DEFAULT_MCS_CONF_PATH, DEFAULT_SM_CONF_PATH +from mcs_node_control.models.node_config import NodeConfig + + +class Config: + config_file = '' + + # params read from the config file + _desired_nodes = [] + _active_nodes = [] + _inactive_nodes = [] + _primary_node = '' + _my_name = None # derived from config file + + config_lock = threading.Lock() + last_mtime = 0 + die = False + logger = None + + def __init__(self, config_file=DEFAULT_MCS_CONF_PATH): + self.config_file = config_file + self.logger = logging.getLogger() + + def getDesiredNodes(self): + self.config_lock.acquire() + self.check_reload() + ret = self._desired_nodes + self.config_lock.release() + return ret + + def getActiveNodes(self): + self.config_lock.acquire() + self.check_reload() + ret = self._active_nodes + self.config_lock.release() + return ret + + def getInactiveNodes(self): + self.config_lock.acquire() + self.check_reload() + ret = self._inactive_nodes + self.config_lock.release() + return ret + + def getAllNodes(self): + """Returns a 3-element tuple describing the status of all nodes. + + index 0 = all nodes in the cluster + index 1 = all active nodes + index 2 = all inactive nodes + """ + self.config_lock.acquire() + self.check_reload() + ret = (self._desired_nodes, self._active_nodes, self._inactive_nodes) + self.config_lock.release() + return ret + + def getPrimaryNode(self): + self.config_lock.acquire() + self.check_reload() + ret = self._primary_node + self.config_lock.release() + return ret + + def is_shared_storage(self, sm_config_file=DEFAULT_SM_CONF_PATH): + """Check if SM is S3 or not. + + :param sm_config_file: path to SM config, + defaults to DEFAULT_SM_CONF_PATH + :type sm_config_file: str, optional + :return: True if SM is S3 otherwise False + :rtype: bool + + TODO: remove in next releases, useless? + """ + sm_config = configparser.ConfigParser() + sm_config.read(sm_config_file) + # only LocalStorage or S3 can be returned for now + storage = sm_config.get( + 'ObjectStorage', 'service', fallback='LocalStorage' + ) + return storage.lower() == 's3' + + def check_reload(self): + """Check config reload. + + Returns True if reload happened, False otherwise. 
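+ Every getter takes config_lock before calling this, so a reload can only + happen between reads, never during one; e.g. getAllNodes() always returns + a mutually consistent (desired, active, inactive) triple.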
+ """ + if self.last_mtime != getmtime(self.config_file): + self.load_config() + return True + return False + + def who_am_I(self): + self.config_lock.acquire() + self.check_reload() + ret = self._my_name + self.config_lock.release() + return ret + + def load_config(self): + try: + node_config = NodeConfig() + root = node_config.get_current_config_root(self.config_file) + last_mtime = getmtime(self.config_file) + except Exception: + self.logger.warning( + f'Failed to parse config file {self.config_file}.', + exc_info=True + ) + return False + + node_tmp = root.findall('./DesiredNodes/Node') + if len(node_tmp) == 0: + self.logger.warning( + f'The config file {self.config_file} is missing entries ' + 'in the DesiredNodes section' + ) + return False + + desired_nodes = [node.text for node in node_tmp] + active_nodes = [ + node.text for node in root.findall('./ActiveNodes/Node') + ] + inactive_nodes = [ + node.text for node in root.findall('./InactiveNodes/Node') + ] + + node_tmp = root.find('./PrimaryNode') + if node_tmp is None or len(node_tmp.text) == 0: + self.logger.warning( + f'The config file {self.config_file} is missing a valid ' + 'PrimaryNode entry' + ) + return False + primary_node = node_tmp.text + + # find my name in this cluster + names = set(node_config.get_network_addresses_and_names()) + all_nodes = set(desired_nodes) + intersection = all_nodes & names + if len(intersection) > 1: + my_name = intersection.pop() + self.logger.warning( + 'This node has multiple names in the list of desired nodes, ' + 'was it added more than once? Some things may not work in ' + f'this configuration. Using {my_name} as the name for this ' + 'node.' + ) + elif len(intersection) == 0: + self.logger.warning( + 'This node has no entry in the list of desired nodes.' + ) + my_name = None + elif len(intersection) == 1: + my_name = intersection.pop() + # handles the initial 0-node special case + if my_name == '127.0.0.1': + my_name = None + + self.logger.info(f'Loaded the config file, my name is {my_name}') + + desired_nodes.sort() + active_nodes.sort() + inactive_nodes.sort() + self._desired_nodes = desired_nodes + self._active_nodes = active_nodes + self._inactive_nodes = inactive_nodes + self._primary_node = primary_node + self.last_mtime = last_mtime + self._my_name = my_name + return True diff --git a/cmapi/failover/heartbeat_history.py b/cmapi/failover/heartbeat_history.py new file mode 100644 index 000000000..3155a58c9 --- /dev/null +++ b/cmapi/failover/heartbeat_history.py @@ -0,0 +1,95 @@ +from array import array +from threading import Lock + +# for tracking the history of heartbeat responses + +class InvalidNode: + pass + +class HBHistory: + # consts to denote state of the responses + NoResponse = 1 + GoodResponse = 2 + LateResponse = -1 + NewNode = 0 + + # By default, keep a 600 heartbeat history for each node (10 mins @ 1hb/s) + # and consider a response late if it arrives 3+ ticks late. 3 is an arbitrary small value. + def __init__(self, tickWindow=600, lateWindow=3): + # a list of a heartbeats for each node. 
index = str, value = array of int, + # history flushes each time thread restarted + self.nodeHistory = {} + # current tick resets to zero each time thread restarted + self.currentTick = 0 + self.lateWindow = lateWindow + self.mutex = Lock() + self.tickWindow = tickWindow + + def _initNode(self, node, defaultValue = GoodResponse): + self.nodeHistory[node] = array( + 'b', [ defaultValue for _ in range(self.tickWindow) ] + ) + + def removeNode(self, node): + self.mutex.acquire() + if node in self.nodeHistory: + del self.nodeHistory[node] + self.mutex.release() + + def keepOnlyTheseNodes(self, nodes): + self.mutex.acquire() + nodesToKeep = set(nodes) + historicalNodes = set(self.nodeHistory.keys()) + for node in historicalNodes: + if node not in nodesToKeep: + del self.nodeHistory[node] + self.mutex.release() + + def setCurrentTick(self, tick): + self.mutex.acquire() + + self.currentTick = tick + for pongs in self.nodeHistory.values(): + pongs[tick % self.tickWindow] = self.NoResponse + + self.mutex.release() + + def gotHeartbeat(self, node, tickID): + if tickID <= self.currentTick - self.lateWindow: + status = self.LateResponse + else: + status = self.GoodResponse + + self.mutex.acquire() + if node not in self.nodeHistory: + self._initNode(node) + self.nodeHistory[node][tickID % self.tickWindow] = status + self.mutex.release() + + # defaultValue is used to init a fake history for a node this code is learning about + # 'now'. If a node is inserted into the active list, we do not want to remove + # it right away b/c it hasn't responded to any pings yet. Likewise, + # if a node is inserted into the inactive list, we do not want to activate it + # right away b/c it has responded to all pings sent so far (0). TBD if we want + # to add logic to handle an 'init' value in the history. + def getNodeHistory(self, node, tickInterval, defaultValue = GoodResponse): + self.mutex.acquire() + if node not in self.nodeHistory: + self._initNode(node, defaultValue = defaultValue) + + # We don't want to return values in the range where we are likely to be + # gathering responses.
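+ # (Illustrative numbers for the arithmetic below: with tickWindow=600, + # lateWindow=3, currentTick=100 and tickInterval=30, lastIndex is + # (100 - 3) % 600 = 97 and firstIndex is 97 - 30 = 67, so ticks 67..96 + # are returned.)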
+ # The return value is the range of heartbeat responses from node from + # tickInterval + lateWindow ticks ago to lateWindow ticks ago + + lastIndex = (self.currentTick - self.lateWindow) % self.tickWindow + firstIndex = lastIndex - tickInterval + history = self.nodeHistory[node] + if firstIndex < 0: + ret = history[firstIndex:] + ret.extend(history[:lastIndex]) + else: + ret = history[firstIndex:lastIndex] + + self.mutex.release() + return ret diff --git a/cmapi/failover/heartbeater.py b/cmapi/failover/heartbeater.py new file mode 100644 index 000000000..015b0c2f1 --- /dev/null +++ b/cmapi/failover/heartbeater.py @@ -0,0 +1,121 @@ +import logging +import threading +import time +from socket import socket, SOCK_DGRAM +from struct import pack, unpack_from + + +class HeartBeater: + port = 9051 + dieMsg = b'die!00' + areYouThereMsg = b'AYTM' + yesIAmMsg = b'YIAM' + + def __init__(self, config, history): + self.config = config + self.die = False + self.history = history + self.sequenceNum = 0 + self.responseThread = None + self.sock = None + self.sockMutex = threading.Lock() + self.logger = logging.getLogger('heartbeater') + + def start(self): + self.initSockets() + self.die = False + self.responseThread = threading.Thread( + target=self.listenAndRespond, name='HeartBeater' + ) + self.responseThread.start() + + def stop(self): + self.die = True + # break out of the recv loop + sock = socket(type=SOCK_DGRAM) + sock.sendto(self.dieMsg, ('localhost', self.port)) + time.sleep(1) + self.sock.close() + self.responseThread.join() + + def initSockets(self): + self.sock = socket(type=SOCK_DGRAM) + self.sock.bind(('0.0.0.0', self.port)) + + def listenAndRespond(self): + self.logger.info('Starting the heartbeat listener.') + while not self.die: + try: + self._listenAndRespond() + except Exception: + self.logger.warning( + 'Caught an exception while listening and responding.', + exc_info=True + ) + time.sleep(1) + self.logger.info('Heartbeat listener exiting normally...') + + def _listenAndRespond(self): + (data, remote) = self.sock.recvfrom(300) + if len(data) < 6: + return + (msg_type, seq) = unpack_from('4sH', data, 0) + if msg_type == self.areYouThereMsg: + self.logger.trace(f'Got "are you there?" from {remote[0]}') + name = self.config.who_am_I() + if name is None: + self.logger.warning( + 'Heartbeater: got an "are you there?" msg from ' + f'{remote[0]}, but this node is not in the list of ' + 'desired nodes for the cluster. ' + 'This node needs a config update.' 
+ ) + return + bname = name.encode('ascii') + if len(bname) > 255: + bname = bname[:255] + msg = pack(f'4sH{len(bname)}s', self.yesIAmMsg, seq, bname) + self.send(msg, remote[0]) + self.logger.trace(f'Send "yes I Am" to {remote[0]}') + elif msg_type == self.yesIAmMsg: + if len(data) > 6: + name = data[6:].decode('ascii') + self.logger.trace(f'Got "yes I am" from {name}') + self.history.gotHeartbeat(name, seq) + + def send(self, msg, destaddr): + self.sockMutex.acquire() + try: + self.sock.sendto(msg, (destaddr, self.port)) + except Exception: + self.logger.warning( + f'Heartbeater.send(): caught error sending msg to {destaddr}', + exc_info=True + ) + finally: + self.sockMutex.release() + + def sendHeartbeats(self): + nodes = self.config.getDesiredNodes() + my_name = self.config.who_am_I() + msg = pack('4sH', self.areYouThereMsg, self.sequenceNum) + self.sockMutex.acquire() + for node in nodes: + if node == my_name: + continue + try: + self.logger.trace(f'Send "are you there" node {node}') + self.sock.sendto(msg, (node, self.port)) + except Exception as e: + pass + # Suppressing these logs. + # In docker the whole dns entry gets removed when a container + # goes away. + # Ends up spamming the logs until the node is removed from + # the cluster via the rest endpoint, or the node comes back up. + # self.logger.warning("Heartbeater.sendHeartbeats(): + # caught an exception sending heartbeat to {}: {}". + # format(node, e)) + self.sockMutex.release() + self.sequenceNum = (self.sequenceNum + 1) % 65535 + self.history.setCurrentTick(self.sequenceNum) diff --git a/cmapi/failover/node_monitor.py b/cmapi/failover/node_monitor.py new file mode 100644 index 000000000..cbcf758de --- /dev/null +++ b/cmapi/failover/node_monitor.py @@ -0,0 +1,230 @@ +import logging +import time +import threading + +from .heartbeater import HeartBeater +from .config import Config +from .heartbeat_history import HBHistory +from .agent_comm import AgentComm + + +class NodeMonitor: + + def __init__( + self, agent=None, config=None, samplingInterval=30, + flakyNodeThreshold=0.5 + ): + self._agentComm = AgentComm(agent) + self._die = False + self._inStandby = False + self._testMode = False # TODO: remove + self._hbHistory = HBHistory() + self._logger = logging.getLogger('node_monitor') + self._runner = None + if config is not None: + self._config = config + else: + self._config = Config() + self._hb = HeartBeater(self._config, self._hbHistory) + self.samplingInterval = samplingInterval + # not used yet, KI-V-SS for V1 [old comment from Patrick] + self.flakyNodeThreshold = flakyNodeThreshold + self.myName = self._config.who_am_I() + #self._logger.info("Using {} as my name".format(self.myName)) + + def __del__(self): + self.stop() + + def start(self): + self._agentComm.start() + self._hb.start() + self._die = False + self._runner = threading.Thread( + target=self.monitor, name='NodeMonitor' + ) + self._runner.start() + + def stop(self): + self._die = True + self._agentComm.die() + if not self._testMode: + self._hb.stop() + self._runner.join() + + def _removeRemovedNodes(self, desiredNodes): + self._hbHistory.keepOnlyTheseNodes(desiredNodes) + + def _pickNewActor(self, nodes): + if not nodes: + return + if self.myName == nodes[0]: + self._isActorOfCohort = True + else: + self._isActorOfCohort = False + + def _chooseNewPrimaryNode(self): + self._agentComm.movePrimaryNode() + + def monitor(self): + while not self._die: + try: + self._logger.info('Starting the monitor logic') + self._monitor() + except Exception: + 
self._logger.error( + f'monitor() caught an exception.', + exc_info=True + ) + if not self._die: + time.sleep(1) + self._logger.info("node monitor logic exiting normally...") + + def _monitor(self): + """ + This works like the main loop of a game. + 1) check current state + 2) identify the differences + 3) update based on the differences + """ + + (desiredNodes, activeNodes, inactiveNodes) = self._config.getAllNodes() + self._pickNewActor(activeNodes) + + logged_idleness_msg = False + logged_active_msg = False + inStandbyMode = False + while not self._die: + # these things would normally go at the end of the loop; doing it here + # to reduce line count & chance of missing something as we add more code + oldActiveNodes = activeNodes + wasActorOfCohort = self._isActorOfCohort + self._logger.trace( + f'Previous actor of cohort state is {wasActorOfCohort}' + ) + time.sleep(1) + + # get config updates + (desiredNodes, activeNodes, inactiveNodes) = self._config.getAllNodes() + self.myName = self._config.who_am_I() + self.primaryNode = self._config.getPrimaryNode() + + # remove nodes from history that have been removed from the cluster + self._removeRemovedNodes(desiredNodes) + + # if there are less than 3 nodes in the cluster, do nothing + if len(desiredNodes) < 3: + if not logged_idleness_msg: + self._logger.info( + 'Failover support is inactive; ' + 'requires at least 3 nodes and a shared storage system' + ) + logged_idleness_msg = True + logged_active_msg = False + elif not logged_active_msg: + self._logger.info( + 'Failover support is active, ' + f'monitoring nodes {desiredNodes}' + ) + logged_active_msg = True + logged_idleness_msg = False + + # nothing to do in this case + if len(desiredNodes) == 1: + continue + + # has this node been reactivated? + if self.myName in activeNodes: + #TODO: remove useless flag or use it in future releases + self._inStandby = False + # has it been deactivated? + else: + self._logger.trace('Node not in active nodes, do nothing.') + self._inStandby = True + continue # wait to be activated + + # send heartbeats + self._hb.sendHeartbeats() + + # decide if action is necessary based on config changes + + # get the list of nodes no longer responding to heartbeats + # V1: only remove a node that hasn't responded to any pings in the sampling period + deactivateSet = set() + for node in activeNodes: + if node == self.myName: + continue + history = self._hbHistory.getNodeHistory(node, self.samplingInterval, HBHistory.GoodResponse) + self._logger.trace(f'Get history "{history}" for node {node}') + noResponses = [ x for x in history if x == HBHistory.NoResponse ] + if len(noResponses) == self.samplingInterval: + deactivateSet.add(node) + + # get the list of nodes that have started responding + # reactivate live nodes that have begun responding to heartbeats + # V1: only reactivate a node if we have good responses for the whole sampling period + activateSet = set() + for node in inactiveNodes: + history = self._hbHistory.getNodeHistory(node, self.samplingInterval, HBHistory.NoResponse) + goodResponses = [ x for x in history if x == HBHistory.GoodResponse ] + if len(goodResponses) == self.samplingInterval: + activateSet.add(node) + + # effectiveActiveNodeList can be described as activeNodes after pending config changes + # have been applied. Another way to view it is that it reflects current reality, whereas + # the config file reflects a fixed point in the recent past. 
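+ # Worked example with made-up node names: activeNodes = ['n1', 'n2', 'n3'], + # deactivateSet = {'n3'} and activateSet = {'n4'} yield an effective list + # of ['n1', 'n2', 'n4'], even though the config file still lists n1..n3 as active.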
+ effectiveActiveNodeList = sorted((set(activeNodes) - deactivateSet) | activateSet) + + # if there was a change to the list of active nodes + # decide if this node is the effective actor in the cohort. + if effectiveActiveNodeList != activeNodes: + self._pickNewActor(effectiveActiveNodeList) + self._logger.trace( + f'Effective list changed, actor state is {self._isActorOfCohort}' + ) + elif oldActiveNodes != activeNodes: + self._pickNewActor(activeNodes) + self._logger.trace( + f'Active list changed, actor state is {self._isActorOfCohort}' + ) + + + # if we are in a cohort that has <= 50% of the desired nodes, enter standby + if len(activeNodes)/len(desiredNodes) <= 0.5 and len(effectiveActiveNodeList)/len(desiredNodes) <= 0.5: + if not inStandbyMode: + msg = "Only {} out of {} nodes are active. At least {} are required. Entering standby mode to protect the system."\ + .format(len(activeNodes), len(desiredNodes), int(len(desiredNodes)/2) + 1) + self._agentComm.raiseAlarm(msg) + self._logger.critical(msg) + self._agentComm.enterStandbyMode() + inStandbyMode = True + continue + elif inStandbyMode and len(effectiveActiveNodeList)/len(desiredNodes) > 0.5: + self._logger.info("Exiting standby mode, waiting for config update") + inStandbyMode = False + + # (wasActorOfCohort and not isActorOfCohort) indicates that a new Actor has come online. + # To hand over the crown, perform one last act as Actor to add it back to the cluster + # and synchronize its config file. + + # if not the actor, nothing else for this node to do + if not self._isActorOfCohort and not wasActorOfCohort: + continue + + # as of here, this node is the actor of its quorum + + if len(deactivateSet) > 0: + self._agentComm.deactivateNodes(list(deactivateSet)) + + if len(activateSet) > 0: + self._agentComm.activateNodes(activateSet) + + # if the primary node is in this list to be deactivated, or its already on the inactive list + # choose a new primary node. The deadNode list is a sanity check for cases like the cluster + # starting with the primary node already in inactive-nodes. 
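+ # e.g. (hypothetical): if the cluster boots with the primary already in + # inactiveNodes and nothing fails this pass, deactivateSet is empty but + # the primary still lands in deadNodeList and a new primary is chosen.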
+ deadNodeList = list(deactivateSet) + inactiveNodes + if self.primaryNode in deadNodeList: + self._chooseNewPrimaryNode() + + # methods for testing + def turnOffHBResponder(self): + self.stop() diff --git a/cmapi/failover/test/config-test.xml b/cmapi/failover/test/config-test.xml new file mode 100644 index 000000000..3d728b198 --- /dev/null +++ b/cmapi/failover/test/config-test.xml @@ -0,0 +1,18 @@ +<Columnstore> + <DesiredNodes> + <Node>node1</Node> + <Node>node2</Node> + <Node>node3</Node> + <Node>node4</Node> + </DesiredNodes> + <ActiveNodes> + <Node>node1</Node> + <Node>node2</Node> + <Node>node3</Node> + </ActiveNodes> + <InactiveNodes> + <Node>node4</Node> + </InactiveNodes> + <PrimaryNode>node2</PrimaryNode> + <ConfigRevision>1</ConfigRevision> +</Columnstore> diff --git a/cmapi/failover/test/test_agent_comm.py b/cmapi/failover/test/test_agent_comm.py new file mode 100644 index 000000000..b4d21f114 --- /dev/null +++ b/cmapi/failover/test/test_agent_comm.py @@ -0,0 +1,129 @@ +import unittest +import time +import socket +import datetime +import cherrypy +import os.path +from contextlib import contextmanager +from ..agent_comm import AgentComm +from cmapi_server.failover_agent import FailoverAgent +from mcs_node_control.models.node_config import NodeConfig +from cmapi_server.controllers.dispatcher import dispatcher, jsonify_error +from cmapi_server.test.unittest_global import create_self_signed_certificate, cert_filename +from cmapi_server import helpers, node_manipulation + +config_filename = './cmapi_server/cmapi_server.conf' + +@contextmanager +def start_server(): + if not os.path.exists(cert_filename): + create_self_signed_certificate() + + app = cherrypy.tree.mount(root = None, config = config_filename) + app.config.update({ + '/': { + 'request.dispatch': dispatcher, + 'error_page.default': jsonify_error, + }, + 'config': { + 'path': config_filename, + }, + }) + cherrypy.config.update(config_filename) + + cherrypy.engine.start() + cherrypy.engine.wait(cherrypy.engine.states.STARTED) + yield + cherrypy.engine.exit() + cherrypy.engine.block() + + +class TestAgentComm(unittest.TestCase): + + def test_with_agent_base(self): + agent = AgentComm() + # Add events except for enterStandbyMode + agent.activateNodes(["mysql.com"]) + agent.activateNodes(["mysql.com"]) # an intentional dup + agent.movePrimaryNode() # AgentComm exposes movePrimaryNode(); it takes no target node + agent.deactivateNodes(["mysql.com"]) + agent.deactivateNodes(["mysql.com"]) + agent.movePrimaryNode() + + health = agent.getNodeHealth() + agent.raiseAlarm("Hello world!") + print("Waiting up to 20s for queued events to be processed and removed") + stop_time = datetime.datetime.now() + datetime.timedelta(seconds = 20) + success = False + while datetime.datetime.now() < stop_time and not success: + sizes = agent.getQueueSize() + if sizes != (0, 0): + time.sleep(1) + else: + print("Event queue & deduper are now empty") + success = True + + print("Waiting for the agent comm thread to die.") + agent.die() + self.assertTrue(success) + + + # This is the beginning of an integration test; it will need perms to modify the real config file + def test_with_failover_agent(self): + + print("\n\n") # make a little whitespace between tests + + # check for existence of and permissions to write to the real config file + try: + f = open("/etc/columnstore/Columnstore.xml", "a") + f.close() + except PermissionError: + print(f"Skipping {__name__}, got a permissions error opening /etc/columnstore/Columnstore.xml for writing") + return + + success = False + with start_server(): + try: + agent = FailoverAgent() + agentcomm = AgentComm(agent) + + # make sure the AC thread has a chance to start before we start issuing cmds. + # If it grabs jobs in the middle of this block, we'll try to send the config file + # to mysql.com.
:D + time.sleep(1) + + # do the same as above. + agentcomm.activateNodes(["mysql.com"]) + agentcomm.activateNodes(["mysql.com"]) # an intentional dup + agentcomm.movePrimaryNode() + agentcomm.deactivateNodes(["mysql.com"]) + agentcomm.deactivateNodes(["mysql.com"]) + agentcomm.movePrimaryNode() + + health = agent.getNodeHealth() + agent.raiseAlarm("Hello world!") + print("Waiting up to 30s for queued events to be processed and removed") + stop_time = datetime.datetime.now() + datetime.timedelta(seconds = 30) + + while datetime.datetime.now() < stop_time and not success: + sizes = agentcomm.getQueueSize() + if sizes != (0, 0): + time.sleep(1) + else: + print("Event queue & deduper are now empty") + success = True + if not success: + raise Exception("The event queue or de-duper did not empty within 30s") + agentcomm.die() + except Exception: + agentcomm.die() + cherrypy.engine.exit() + cherrypy.engine.block() + raise + + # clean up the config file, remove mysql.com + txnid = helpers.start_transaction() + node_manipulation.remove_node("mysql.com") + helpers.update_revision_and_manager() + helpers.broadcast_new_config() + helpers.commit_transaction(txnid) diff --git a/cmapi/failover/test/tester.py b/cmapi/failover/test/tester.py new file mode 100644 index 000000000..73e4ed03a --- /dev/null +++ b/cmapi/failover/test/tester.py @@ -0,0 +1,83 @@ +from .. import config +import time +from socket import * +import struct +import sys + +_config = config.Config("failover/test/config-test.xml") +print("got desired_nodes = {}".format(_config.getDesiredNodes())) +print("got active_nodes = {}".format(_config.getActiveNodes())) +print("got inactive_nodes = {}".format(_config.getInactiveNodes())) +print("got all nodes = {}".format(_config.getAllNodes())) +print("got primarynode = {}".format(_config.getPrimaryNode())) +print() + +from ..heartbeater import HeartBeater +from ..heartbeat_history import HBHistory + +hbh = HBHistory() +hb = HeartBeater(_config, hbh) +hb.start() +sock = socket(type = SOCK_DGRAM) +sock.bind(('localhost', 12345)) + +# Updated heartbeater to send the reply to its own port, rather than to +# the port of the sending socket. Need to update this. +#msg = struct.pack("4sH", hb.areYouThereMsg, 1234) +#sock.sendto(msg, ('localhost', hb.port)) +#print("sent the are-you-there msg") +#(data, remote) = sock.recvfrom(6) +#(data, seq) = struct.unpack("4sH", data) +#if data == hb.yesIAmMsg: +# print("got the yes-i-am msg, seq = {}".format(seq)) +#else: +# print("got something other than the yes-i-am-msg") + +hb.stop() + +#from heartbeat_history import HBHistory + +#hbh = HBHistory() +hbh.setCurrentTick(0) +hbh.gotHeartbeat("node1", 0) +hbh.setCurrentTick(1) +hbh.gotHeartbeat("node2", 1) +hbh.setCurrentTick(2) +hbh.setCurrentTick(10) +hbh.gotHeartbeat("node1", 9) +hbh.gotHeartbeat("node1", 2) +pongs = hbh.getNodeHistory("node1", 20) +print("Got pongs: {}".format(pongs)) + +print(''' + This is currently a 'manual' test, meaning the user should watch for the expected output. + In this case, because NM's identity checker will return 'node1', and that does not match + node[2-4], those nodes will appear to NodeMonitor to be offline. Our starting condition + is that nodes 1-3 are active, and node4 is inactive. After 15s, nodes 2 & 3 + should be deactivated, a new primary node will be chosen, and our AgentBase will start + printing these events.
+''') +def testNodeMonitor1(nm): + nm.start() + print("Waiting for 20 secs, watch for output from AgentBase") + time.sleep(20) + nm.stop() + time.sleep(1) + print("NodeMonitor was stopped, did it produce the right output?") + +from ..node_monitor import NodeMonitor +nm = NodeMonitor(config = _config, samplingInterval = 10) +# check whether node[1-4] are in the /etc/hosts file as localhost +addr1 = gethostbyname("node1") +addr2 = gethostbyname("node2") +addr3 = gethostbyname("node3") +addr4 = gethostbyname("node4") +if addr1 == '127.0.0.1' and addr2 == '127.0.0.1' and addr3 == '127.0.0.1' and addr4 == '127.0.0.1': + testNodeMonitor1(nm) +else: + print("Skipping testNodeMonitor1(). node[1-4] needs to be defined as 127.0.0.1 in /etc/hosts") + + + +print("tester is finished") + diff --git a/cmapi/mcs.template b/cmapi/mcs.template new file mode 100755 index 000000000..fbd489126 --- /dev/null +++ b/cmapi/mcs.template @@ -0,0 +1 @@ +PYTHONPATH="${CMAPI_DIR}:${CMAPI_DIR}/deps" ${CMAPI_DIR}/python/bin/python3 -m mcs_cluster_tool "$@" diff --git a/cmapi/mcs_cluster_tool/__init__.py b/cmapi/mcs_cluster_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/mcs_cluster_tool/__main__.py b/cmapi/mcs_cluster_tool/__main__.py new file mode 100644 index 000000000..773123854 --- /dev/null +++ b/cmapi/mcs_cluster_tool/__main__.py @@ -0,0 +1,30 @@ +import logging +import sys + +import typer + +from cmapi_server.logging_management import dict_config, add_logging_level +from mcs_cluster_tool import cluster_app +from mcs_cluster_tool.constants import MCS_CLI_LOG_CONF_PATH + + +# don't show --install-completion and --show-completion options in help message +app = typer.Typer( + add_completion=False, + help=( + 'The MCS Command Line Interface is a unified tool to manage your ' + 'MCS services' + ), +) +app.add_typer(cluster_app.app, name="cluster") + + +if __name__ == "__main__": + add_logging_level('TRACE', 5) #TODO: remove when standalone mode added. + dict_config(MCS_CLI_LOG_CONF_PATH) + logger = logging.getLogger('mcs_cli') + # add separator between cli command log entries + logger.debug(f'{"-":-^80}') + cl_args_line = ' '.join(sys.argv[1:]) + logger.debug(f'Called "mcs {cl_args_line}"') + app(prog_name='mcs') diff --git a/cmapi/mcs_cluster_tool/cluster_app.py b/cmapi/mcs_cluster_tool/cluster_app.py new file mode 100644 index 000000000..27963adf8 --- /dev/null +++ b/cmapi/mcs_cluster_tool/cluster_app.py @@ -0,0 +1,140 @@ +"""Cluster typer application. + +Formally this module contains all subcommands for the "mcs cluster" cli command. +""" +import logging +from typing import List, Optional + +import pyotp +import typer + +from cmapi_server.constants import SECRET_KEY +from cmapi_server.handlers.cluster import ClusterHandler +from mcs_cluster_tool.decorators import handle_output + + +logger = logging.getLogger('mcs_cli') +app = typer.Typer( + help='MariaDB Columnstore cluster management command line tool.'
+) +node_app = typer.Typer(help='Cluster nodes management.') +app.add_typer(node_app, name='node') +set_app = typer.Typer(help='Set cluster parameters.') +app.add_typer(set_app, name='set') + + +@app.command() +@handle_output +def status(): + """Get status information.""" + return ClusterHandler.status(logger=logger) + + +@app.command() +@handle_output +def stop(): + """Stop the Columnstore cluster.""" + return ClusterHandler.shutdown(logger=logger) + + +@app.command() +@handle_output +def start(): + """Start the Columnstore cluster.""" + return ClusterHandler.start(logger=logger) + + +@app.command() +@handle_output +def restart(): + """Restart the Columnstore cluster.""" + stop_result = ClusterHandler.shutdown(logger=logger) + if 'error' in stop_result: + return stop_result + result = ClusterHandler.start(logger=logger) + result['stop_timestamp'] = stop_result['timestamp'] + return result + + +@node_app.command() +@handle_output +def add( + nodes: Optional[List[str]] = typer.Option( + ..., + '--node', # command line argument name + help=( + 'node IP, name or FQDN. ' + 'Can be used multiple times to add several nodes at a time.' + ) + ) +): + """Add nodes to the Columnstore cluster.""" + result = [] + for node in nodes: + result.append(ClusterHandler.add_node(node, logger=logger)) + return result + + +@node_app.command() +@handle_output +def remove(nodes: Optional[List[str]] = typer.Option( + ..., + '--node', # command line argument name + help=( + 'node IP, name or FQDN. ' + 'Can be used multiple times to remove several nodes at a time.' + ) + ) +): + """Remove nodes from the Columnstore cluster.""" + result = [] + for node in nodes: + result.append(ClusterHandler.remove_node(node, logger=logger)) + return result + + +@set_app.command() +@handle_output +def mode(cluster_mode: str = typer.Option( + ..., + '--mode', + help=( + 'cluster mode to set. ' + '"readonly" or "readwrite" are the only acceptable values.' + ) + ) +): + """Set Columnstore cluster mode.""" + if cluster_mode not in ('readonly', 'readwrite'): + raise typer.BadParameter( + '"readonly" or "readwrite" are the only acceptable modes now.' + ) + return ClusterHandler.set_mode(cluster_mode, logger=logger) + + +@set_app.command() +@handle_output +def api_key(key: str = typer.Option(..., help='API key to set.')): + """Set API key for communication with cluster nodes via API. + + WARNING: this command will affect API key value on all cluster nodes. + """ + if not key: + raise typer.BadParameter('Empty API key not allowed.') + + totp = pyotp.TOTP(SECRET_KEY) + + return ClusterHandler.set_api_key(key, totp.now(), logger=logger) + + +@set_app.command() +@handle_output +def log_level(level: str = typer.Option(..., help='Logging level to set.')): + """Set logging level on all cluster nodes for develop purposes. + + WARNING: this could dramatically affect the number of log lines. 
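+ + Example invocation (hypothetical, with the command and option names + derived by typer from the function and parameter names): + mcs cluster set log-level --level DEBUG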
+ """ + if not level: + raise typer.BadParameter('Empty log level not allowed.') + + return ClusterHandler.set_log_level(level, logger=logger) diff --git a/cmapi/mcs_cluster_tool/constants.py b/cmapi/mcs_cluster_tool/constants.py new file mode 100644 index 000000000..796259ff5 --- /dev/null +++ b/cmapi/mcs_cluster_tool/constants.py @@ -0,0 +1,4 @@ +import os + +MCS_CLI_ROOT_PATH = os.path.dirname(__file__) +MCS_CLI_LOG_CONF_PATH = os.path.join(MCS_CLI_ROOT_PATH, 'mcs_cli_log.conf') diff --git a/cmapi/mcs_cluster_tool/decorators.py b/cmapi/mcs_cluster_tool/decorators.py new file mode 100644 index 000000000..e5ab0bb40 --- /dev/null +++ b/cmapi/mcs_cluster_tool/decorators.py @@ -0,0 +1,35 @@ +"""Module contains decorators for typer cli commands.""" +import json +import logging +from functools import wraps + +import typer + +from cmapi_server.exceptions import CMAPIBasicError + + +def handle_output(func): + """Decorator for handling output errors and add result to log file.""" + @wraps(func) + def wrapper(*args, **kwargs): + logger = logging.getLogger('mcs_cli') + return_code = 1 + try: + result = func(*args, **kwargs) + typer.echo(json.dumps(result, indent=2)) + logger.debug(f'Command returned: {result}') + return_code = 0 + except CMAPIBasicError as err: + typer.echo(err.message, err=True) + logger.error('Error while command execution', exc_info=True) + except typer.BadParameter as err: + logger.error('Bad command line parameter.') + raise err + except Exception: + logger.error( + 'Undefined error while command execution', + exc_info=True + ) + typer.echo('Unknown error, check the log file.', err=True) + raise typer.Exit(return_code) + return wrapper diff --git a/cmapi/mcs_cluster_tool/mcs_cli_log.conf b/cmapi/mcs_cluster_tool/mcs_cli_log.conf new file mode 100644 index 000000000..bf8b38a10 --- /dev/null +++ b/cmapi/mcs_cluster_tool/mcs_cli_log.conf @@ -0,0 +1,31 @@ +{ + "version": 1, + "disable_existing_loggers": true, + "formatters": { + "default": { + "format": "%(asctime)s [%(levelname)s] (%(name)s) %(message)s", + "datefmt": "%d/%b/%Y %H:%M:%S" + } + }, + "handlers": { + "file": { + "class" : "logging.handlers.RotatingFileHandler", + "formatter": "default", + "filename": "/var/log/mariadb/columnstore/mcs_cli.log", + "mode": "a", + "maxBytes": 1048576, + "backupCount": 10 + } + }, + "loggers": { + "": { + "level": "DEBUG", + "handlers": ["file"] + }, + "mcs_cli": { + "level": "DEBUG", + "handlers": ["file"], + "propagate": false + } + } +} diff --git a/cmapi/mcs_node_control/__init__.py b/cmapi/mcs_node_control/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/mcs_node_control/custom_dispatchers/container.sh b/cmapi/mcs_node_control/custom_dispatchers/container.sh new file mode 100755 index 000000000..43f57563e --- /dev/null +++ b/cmapi/mcs_node_control/custom_dispatchers/container.sh @@ -0,0 +1,269 @@ +#!/bin/bash + +# TODO: remove in next releases + +programname=$0 + +function usage { + echo "usage: $programname op [service_name] [is_primary]" + echo " op - operation name [start|stop]" + echo " service_name - [mcs-controllernode|mcs-workernode etc]" + echo " is_primary - [0|1]" + exit 1 +} + +operation=$1 +service_name=$2 +is_primary=$3 + +if [[ -z "$operation" || -z "$service_name" || $is_primary -ne 0 && $is_primary -ne 1 ]]; then + usage +fi + +LOG_FILE=/var/log/mariadb/columnstore/container-sh.log + +start_up_to_workernode() { + # Set Variables + IFLAG=/etc/columnstore/container-initialized + LOG_PREFIX=/var/log/mariadb/columnstore + 
MCS_INSTALL_PATH=/var/lib/columnstore + MCS_INSTALL_BIN=/usr/bin + PROGS='StorageManager mcs-loadbrm.py workernode' + JEMALLOC_PATH=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') + if [[ -z "$JEMALLOC_PATH" && -f $MCS_INSTALL_PATH/libjemalloc.so.2 ]]; then + JEMALLOC_PATH=$MCS_INSTALL_PATH/libjemalloc.so.2 + fi + export LD_PRELOAD=$JEMALLOC_PATH + + # Initialize Container If Necessary + if [ ! -e $IFLAG ]; then + $MCS_INSTALL_BIN/columnstore-init &>> $LOG_PREFIX/columnstore-init.log + fi + + # Verify All Programs Are Available + for i in $PROGS ; do + if [ ! -x $MCS_INSTALL_BIN/$i ] ; then + echo "$i doesn't exist." + exit 1 + fi + done + + # Start System + echo `date`: start_up_to_workernode\(\)... >> $LOG_FILE + + touch $LOG_PREFIX/storagemanager.log && chmod 666 $LOG_PREFIX/storagemanager.log + $MCS_INSTALL_BIN/StorageManager &>> $LOG_PREFIX/storagemanager.log & + echo `date`: StorageManager PID = $! >> $LOG_FILE + + sleep 1 + + echo `date`: loading BRM >> $LOG_FILE + touch $LOG_PREFIX/mcs-loadbrm.log && chmod 666 $LOG_PREFIX/mcs-loadbrm.log + # Argument "no" here means don't use systemd to start SM + $MCS_INSTALL_BIN/mcs-loadbrm.py no >> $LOG_PREFIX/mcs-loadbrm.log 2>&1 + + touch $LOG_PREFIX/workernode.log && chmod 666 $LOG_PREFIX/workernode.log + $MCS_INSTALL_BIN/workernode DBRM_Worker1 &>> $LOG_PREFIX/workernode.log & + echo `date`: workernode PID = $! >> $LOG_FILE + + exit 0 +} + +start_those_left_at_master() { + # Set Variables + LOG_PREFIX=/var/log/mariadb/columnstore + MCS_INSTALL_PATH=/var/lib/columnstore + MCS_INSTALL_BIN=/usr/bin + # TODO: remove this quick fix + # skip the binary check for ExeMgr + PROGS='controllernode PrimProc WriteEngineServer DMLProc DDLProc' + JEMALLOC_PATH=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') + if [[ -z "$JEMALLOC_PATH" && -f $MCS_INSTALL_PATH/libjemalloc.so.2 ]]; then + JEMALLOC_PATH=$MCS_INSTALL_PATH/libjemalloc.so.2 + fi + export LD_PRELOAD=$JEMALLOC_PATH + + # Verify All Programs Are Available (except ExeMgr) + for i in $PROGS ; do + if [ ! -x $MCS_INSTALL_BIN/$i ] ; then + echo "$i doesn't exist." + exit 1 + fi + done + + echo `date`: start_those_left_at_master\(\) >> $LOG_FILE + + if [[ $is_primary -eq 1 ]]; then + touch $LOG_PREFIX/controllernode.log && chmod 666 $LOG_PREFIX/controllernode.log + $MCS_INSTALL_BIN/controllernode fg &>> $LOG_PREFIX/controllernode.log & + echo `date`: controllernode PID = $! >> $LOG_FILE + fi + + touch $LOG_PREFIX/primproc.log && chmod 666 $LOG_PREFIX/primproc.log + $MCS_INSTALL_BIN/PrimProc &>> $LOG_PREFIX/primproc.log & + echo `date`: PrimProc PID = $! >> $LOG_FILE + + sleep 1 + + if [ -e $MCS_INSTALL_BIN/ExeMgr ] ; then + touch $LOG_PREFIX/exemgr.log && chmod 666 $LOG_PREFIX/exemgr.log + $MCS_INSTALL_BIN/ExeMgr &>> $LOG_PREFIX/exemgr.log & + echo `date`: ExeMgr PID = $! >> $LOG_FILE + fi + + touch $LOG_PREFIX/writeengineserver.log && chmod 666 $LOG_PREFIX/writeengineserver.log + $MCS_INSTALL_BIN/WriteEngineServer &>> $LOG_PREFIX/writeengineserver.log & + echo `date`: WriteEngineServer PID = $! >> $LOG_FILE + + sleep 3 + + touch $LOG_PREFIX/dmlproc.log && chmod 666 $LOG_PREFIX/dmlproc.log + $MCS_INSTALL_BIN/DMLProc &>> $LOG_PREFIX/dmlproc.log & + echo `date`: DMLProc PID = $! >> $LOG_FILE + + touch $LOG_PREFIX/ddlproc.log && chmod 666 $LOG_PREFIX/ddlproc.log + $MCS_INSTALL_BIN/DDLProc &>> $LOG_PREFIX/ddlproc.log & + echo `date`: DDLProc PID = $!
>> $LOG_FILE + + exit 0 +} + + + +start() { + # Set Variables + IFLAG=/etc/columnstore/container-initialized + LOG_PREFIX=/var/log/mariadb/columnstore + MCS_INSTALL_PATH=/var/lib/columnstore + MCS_INSTALL_BIN=/usr/bin + # TODO: remove this quick fix + # skip the binary check for ExeMgr + PROGS='StorageManager load_brm workernode controllernode PrimProc WriteEngineServer DMLProc DDLProc' + JEMALLOC_PATH=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') + if [[ -z "$JEMALLOC_PATH" && -f $MCS_INSTALL_PATH/libjemalloc.so.2 ]]; then + JEMALLOC_PATH=$MCS_INSTALL_PATH/libjemalloc.so.2 + fi + export LD_PRELOAD=$JEMALLOC_PATH + + # Initialize Container If Necessary + if [ ! -e $IFLAG ]; then + $MCS_INSTALL_BIN/columnstore-init &>> $LOG_PREFIX/columnstore-init.log + fi + + # Verify All Programs Are Available (except ExeMgr) + for i in $PROGS ; do + if [ ! -x $MCS_INSTALL_BIN/$i ] ; then + echo "$i doesn't exist." + exit 1 + fi + done + + # Start System + echo `date`: start\(\)... >> $LOG_FILE + + touch $LOG_PREFIX/storagemanager.log && chmod 666 $LOG_PREFIX/storagemanager.log + $MCS_INSTALL_BIN/StorageManager &>> $LOG_PREFIX/storagemanager.log & + echo `date`: StorageManager PID = $! >> $LOG_FILE + sleep 1 + + echo `date`: loading BRM >> $LOG_FILE + touch $LOG_PREFIX/mcs-loadbrm.log && chmod 666 $LOG_PREFIX/mcs-loadbrm.log + # Argument "no" here means don't use systemd to start SM + $MCS_INSTALL_BIN/mcs-loadbrm.py no >> $LOG_PREFIX/mcs-loadbrm.log 2>&1 + + touch $LOG_PREFIX/workernode.log && chmod 666 $LOG_PREFIX/workernode.log + $MCS_INSTALL_BIN/workernode DBRM_Worker2 &>> $LOG_PREFIX/workernode.log & + echo `date`: workernode PID = $! >> $LOG_FILE + + sleep 2 + + if [[ $is_primary -eq 1 ]]; then + touch $LOG_PREFIX/controllernode.log && chmod 666 $LOG_PREFIX/controllernode.log + $MCS_INSTALL_BIN/controllernode fg &>> $LOG_PREFIX/controllernode.log & + echo `date`: controllernode PID = $! >> $LOG_FILE + fi + + touch $LOG_PREFIX/primproc.log && chmod 666 $LOG_PREFIX/primproc.log + $MCS_INSTALL_BIN/PrimProc &>> $LOG_PREFIX/primproc.log & + echo `date`: PrimProc PID = $! >> $LOG_FILE + + sleep 1 + + if [ -e $MCS_INSTALL_BIN/ExeMgr ] ; then + touch $LOG_PREFIX/exemgr.log && chmod 666 $LOG_PREFIX/exemgr.log + $MCS_INSTALL_BIN/ExeMgr &>> $LOG_PREFIX/exemgr.log & + echo `date`: ExeMgr PID = $! >> $LOG_FILE + fi + + touch $LOG_PREFIX/writeengineserver.log && chmod 666 $LOG_PREFIX/writeengineserver.log + $MCS_INSTALL_BIN/WriteEngineServer &>> $LOG_PREFIX/writeengineserver.log & + echo `date`: WriteEngineServer PID = $! >> $LOG_FILE + + sleep 3 + + if [[ $is_primary -eq 1 ]]; then + touch $LOG_PREFIX/dmlproc.log && chmod 666 $LOG_PREFIX/dmlproc.log + $MCS_INSTALL_BIN/DMLProc &>> $LOG_PREFIX/dmlproc.log & + echo `date`: DMLProc PID = $! >> $LOG_FILE + touch $LOG_PREFIX/ddlproc.log && chmod 666 $LOG_PREFIX/ddlproc.log + $MCS_INSTALL_BIN/DDLProc &>> $LOG_PREFIX/ddlproc.log & + echo `date`: DDLProc PID = $! >> $LOG_FILE + fi + + exit 0 +} + +stop() { + # TODO: remove this quick fix + # skip the binary check for ExeMgr + PROGS='DMLProc DDLProc WriteEngineServer PrimProc workernode controllernode StorageManager' + MCS_INSTALL_BIN=/usr/bin + LOG_PREFIX=/var/log/mariadb/columnstore + + # Stop System + echo `date`: Stopping... >> $LOG_FILE + + if [[ !
-z "$(pidof controllernode)" ]]; then + $MCS_INSTALL_BIN/mcs-savebrm.py &>> $LOG_PREFIX/savebrm.log 2>&1 + fi + + echo `date`: Sending SIGTERM >> $LOG_FILE + kill $(pidof $PROGS) > /dev/null + sleep 3 + # Make sure StorageManager had a chance to shutdown clean + counter=1 + while [ -n "$(pidof StorageManager)" -a $counter -le 60 ] + do + sleep 1 + ((counter++)) + done + echo `date`: Sending SIGKILL >> $LOG_FILE + kill -9 $(pidof $PROGS) > /dev/null + fi + + echo `date`: Clearing SHM >> $LOG_FILE + $MCS_INSTALL_BIN/clearShm + + exit 0 +} + +case "$operation" in + 'start') + # We start everything when controllernode starts at primary node and with workernode at non-primary + if [[ $is_primary -eq 1 && "mcs-workernode" == "$service_name" ]]; then + start_up_to_workernode $is_primary + elif [[ $is_primary -eq 1 && "mcs-controllernode" == "$service_name" ]]; then + start_those_left_at_master $is_primary + elif [[ $is_primary -eq 0 && "mcs-workernode" == "$service_name" ]]; then + start $is_primary + fi + ;; + + 'stop') + if [[ $is_primary -eq 1 && "mcs-controllernode" == "$service_name" || $is_primary -eq 0 && "mcs-workernode" == "$service_name" ]]; then + stop + fi + ;; +esac diff --git a/cmapi/mcs_node_control/models/__init__.py b/cmapi/mcs_node_control/models/__init__.py new file mode 100644 index 000000000..d4549722e --- /dev/null +++ b/cmapi/mcs_node_control/models/__init__.py @@ -0,0 +1,2 @@ + +from mcs_node_control.models.node_status import NodeStatus diff --git a/cmapi/mcs_node_control/models/dbrm.py b/cmapi/mcs_node_control/models/dbrm.py new file mode 100644 index 000000000..f0d64cc2e --- /dev/null +++ b/cmapi/mcs_node_control/models/dbrm.py @@ -0,0 +1,220 @@ +import logging +import socket + +from cmapi_server.constants import DEFAULT_MCS_CONF_PATH +from mcs_node_control.models.dbrm_socket import ( + DBRM_COMMAND_BYTES, DEFAULT_HOST, DEFAULT_PORT, DBRMSocketHandler +) +from mcs_node_control.models.node_config import NodeConfig +from mcs_node_control.models.process import Process + + +# TODO: why we need bitwise shift here? May be constant values? +SYSTEM_STATE_FLAGS = { + "SS_READY": 1 << 0, # 1 + "SS_SUSPENDED": 1 << 1, # 2 + "SS_SUSPEND_PENDING": 1 << 2, # 4 + "SS_SHUTDOWN_PENDING": 1 << 3, # 8 + "SS_ROLLBACK": 1 << 4, # 16 + "SS_FORCE": 1 << 5, # 32 + "SS_QUERY_READY": 1 << 6, # 64 +} + + +module_logger = logging.getLogger() + + +class DBRM: + """Class DBRM commands""" + def __init__( + self, root=None, config_filename: str = DEFAULT_MCS_CONF_PATH + ): + self.dbrm_socket = DBRMSocketHandler() + self.root = root + self.config_filename = config_filename + + def connect(self): + node_config = NodeConfig() + root = self.root or node_config.get_current_config_root( + self.config_filename + ) + master_conn_info = node_config.get_dbrm_conn_info(root) + if master_conn_info is None: + module_logger.warning( + 'DBRB.connect: No DBRM info in the Columnstore.xml.' 
+ ) + master_conn_info = {} + dbrm_host = master_conn_info.get('IPAddr') or DEFAULT_HOST + dbrm_port = int(master_conn_info.get('Port') or DEFAULT_PORT) + self.dbrm_socket.connect(dbrm_host, dbrm_port) + + def close(self): + self.dbrm_socket.close() + + def __enter__(self): + self.connect() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + if exc_type: + return False + return True + + def _send_command(self, command_name, command_value=None): + if command_name not in DBRM_COMMAND_BYTES: + module_logger.warning( + f'DBRM._send_command: Wrong command requested {command_name}' + ) + return None + + module_logger.info( + f'DBRM._send_command: Command {command_name} ' + f'was requested with value {command_value}' + ) + + self.dbrm_socket.send(command_name, command_value) + response_value_bytes = self.dbrm_socket.receive() + + if command_name == 'readonly': + reply = int.from_bytes(response_value_bytes, 'little') + else: + # the first byte is an error code + err = int.from_bytes(response_value_bytes[:1], 'little') + + if err != 0: + module_logger.warning( + f'DBRM._send_command: Command {command_name} ' + 'returned error on server' + ) + raise RuntimeError( + f'Controller Node replied error with code {err} ' + f'for command {command_name}' + ) + + if len(response_value_bytes) < 2: + return None + + reply = int.from_bytes(response_value_bytes[1:], 'little') + return reply + + def get_system_state(self): + state = self._send_command('get_system_state') + return [ + flag_name for flag_name, flag_value in SYSTEM_STATE_FLAGS.items() + # TODO: this reads awkwardly; consider a clearer formulation + if flag_value & state + ] + + def _edit_system_state(self, states: list, command: str): + state = 0 + # TODO: why is this needed? The annotated type is list; + # accepting a plain str without the loop may be more appropriate. + if isinstance(states, str): + states = (states,) + + for state_name in states: + if state_name not in SYSTEM_STATE_FLAGS: + module_logger.warning( + f'DBRM.{command}: Wrong system state requested: ' + f'{state_name}' + ) + continue + # TODO: for distinct flags this equals simple addition, + # but bitwise OR stays correct if a flag repeats. + state |= SYSTEM_STATE_FLAGS[state_name] + + self._send_command(command, state) + + def set_system_state(self, states: list): + self._edit_system_state(states, 'set_system_state') + + def clear_system_state(self, states: list): + self._edit_system_state(states, 'clear_system_state') + + @staticmethod + def get_dbrm_status(): + """Reads DBRM status + + The DBRM (Block Resolution Manager) operates in two modes: + - master + - slave + + This method returns the mode of this DBRM node + by checking whether the controllernode process is running. + + :return: mode of this DBRM node + :rtype: string + """ + if Process.check_process_alive('controllernode'): + return 'master' + return 'slave' + + def _get_cluster_mode(self): + """Get DBRM cluster mode for internal usage. + + Returns the real DBRM cluster mode from the socket response. + """ + # state can be 1(readonly) or 0(readwrite) or exception raised + state = self._send_command('readonly') + if state == 1: + return 'readonly' + elif state == 0: + return 'readwrite' + + def get_cluster_mode(self): + """Get DBRM cluster mode for external usage. + + The logic here is somewhat odd; it was requested by management. + TODO: this can hide a logic error for callers; + the example below illustrates the trap.
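+ + (Illustrative sequence on a replica node: set_cluster_mode('readwrite') + sends set_readwrite to the master's DBRM socket and succeeds, yet this + getter still reports 'readonly' because get_dbrm_status() returns + 'slave' on this node.)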
+ + :return: DBRM cluster mode + :rtype: str + """ + real_mode = self._get_cluster_mode() + if self.get_dbrm_status() == 'master': + return real_mode + else: + return 'readonly' + + def set_cluster_mode(self, mode): + """Set the requested cluster mode. + + Connects to the DBRM master's socket and + sends a command to set the cluster mode. + + :return: error string or the cluster mode that was set + :rtype: str + """ + + if mode == 'readonly': + command = 'set_readonly' + elif mode == 'readwrite': + command = 'set_readwrite' + else: + return '' + + _ = self._send_command(command) + + return self.get_cluster_mode() + + +def set_cluster_mode( + mode: str, root=None, config_filename: str = DEFAULT_MCS_CONF_PATH +): + """Set the requested cluster mode. + + Connects to the DBRM master's socket and sends a command to + set the cluster mode. + + :return: error string or the cluster mode that was set + :rtype: str + """ + try: + with DBRM(root, config_filename) as dbrm: + return dbrm.set_cluster_mode(mode) + except (ConnectionRefusedError, RuntimeError, socket.error): + module_logger.warning( + 'Cannot establish DBRM connection.', exc_info=True + ) + return 'readonly' diff --git a/cmapi/mcs_node_control/models/dbrm_socket.py b/cmapi/mcs_node_control/models/dbrm_socket.py new file mode 100644 index 000000000..7e81392a2 --- /dev/null +++ b/cmapi/mcs_node_control/models/dbrm_socket.py @@ -0,0 +1,248 @@ +import logging +import socket + + +MAGIC_BYTES = 0x14fbc137.to_bytes(4, 'little') +# value is tuple(command_bytes, command_value_length) +DBRM_COMMAND_BYTES = { + 'readonly': ((20).to_bytes(1, 'little'), 0), + 'set_readonly': ((14).to_bytes(1, 'little'), 0), + 'set_readwrite': ((15).to_bytes(1, 'little'), 0), + 'set_system_state': ((55).to_bytes(1, 'little'), 4), + 'get_system_state': ((54).to_bytes(1, 'little'), 4), + 'clear_system_state': ((57).to_bytes(1, 'little'), 4), +} +DEFAULT_HOST = 'localhost' +DEFAULT_PORT = 8616 +SOCK_TIMEOUT = 5 + + +class DBRMSocketHandler(): + """Class for stream socket operations. + + Includes all logic for detecting the bytestream protocol version, reading + and parsing the magic inside, and getting command bytes and command value + length by command name. + + """ + long_strings = None + + def __init__( + self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0, + fileno=None + ) -> None: + self._socket = None + self._family = family + self._type = type + self._proto = proto + self._fileno = fileno + self._host = None + self._port = None + self._recreate_socket() + + @property + def _connect_called(self): + """Whether the connect method was called previously. + + This is instance state used to determine if the "connect" method was + called previously. It does not guarantee that the connection is still + alive. + :return: connected state + :rtype: bool + """ + if self._host and self._port: + return True + return False + + def _recreate_socket(self) -> None: + """Create a new internal _socket object. + + Create/recreate the _socket object and reconnect if a connection was + previously established. + """ + if self._socket is not None: + self._socket.close() + self._socket = socket.socket( + family=self._family, type=self._type, + proto=self._proto, fileno=self._fileno + ) + if self._connect_called: + self.connect(self._host, self._port) + + def _detect_protocol(self) -> None: + """Detect the DBRM socket bytestream version. + + This method is normally called only once per instance, + on the first "send" method call. + After that the header will be formed and parsed depending on the + "long_strings" class variable value.
+
+        Sends a "readonly" message with the "old" protocol version
+        (before MCS 6.2.1). If a timeout is raised, sends the message
+        with the "new" protocol version (after MCS 6.2.1), which carries
+        an extra 4 bytes in the header.
+        If both attempts fail, raises RuntimeError and resets the
+        "long_strings" variable to its initial state - None.
+
+        :raises RuntimeError: if neither protocol version gets a valid reply
+        """
+        success = False
+        # check the old protocol first because 5.x.x versions have an issue
+        # if we try to send new format packages.
+        for long_strings in (False, True):
+            DBRMSocketHandler.long_strings = long_strings
+            self.send('readonly')
+            try:
+                _ = self.receive()
+                success = True
+                break
+            except (socket.timeout, TimeoutError):
+                # a malformed packet sent here could cause errors on the
+                # MCS engine side
+                self._recreate_socket()
+                continue
+        if not success:
+            # something went wrong, so return to the unknown protocol state
+            DBRMSocketHandler.long_strings = None
+            raise RuntimeError(
+                'Can\'t detect DBRM bytestream protocol version.'
+            )
+        else:
+            dbrm_protocol_version = (
+                'new' if DBRMSocketHandler.long_strings else 'old'
+            )
+            logging.info(
+                f'Detected "{dbrm_protocol_version}" DBRM bytestream protocol'
+            )
+
+    def _make_msg(self, command_name: str, command_value: int) -> bytes:
+        """Make a bytes msg by command name and value.
+
+        :param command_name: name of a command
+        :type command_name: str
+        :param command_value: command value
+        :type command_value: int or None
+        :return: msg to send through the socket
+        :rtype: bytes
+        """
+        command_bytes, command_value_length = DBRM_COMMAND_BYTES[command_name]
+        data_length = (
+            command_value_length + len(command_bytes)
+        ).to_bytes(4, 'little')
+        # bytestream protocol before MCS 6.2.1 version
+        package_header = MAGIC_BYTES + data_length
+        if DBRMSocketHandler.long_strings:
+            # bytestream protocol after MCS 6.2.1 version
+            long_strings_count = (0).to_bytes(4, 'little')
+            package_header += long_strings_count
+
+        msg_bytes = package_header + command_bytes
+        if command_value is not None:
+            msg_bytes += command_value.to_bytes(
+                command_value_length, 'little'
+            )
+        return msg_bytes
+
+    def _receive_magic(self):
+        """Read the stream up to the uncompressed magic.
+
+        The magic is a constant delimiter that occurs at the beginning
+        of the stream.
+        """
+        data: bytes
+        recv_data: bytes = b''
+        while recv_data != MAGIC_BYTES:
+            data = self._socket.recv(1)
+            # TODO: advanced error handling
+            if data == b'':
+                raise RuntimeError(
+                    'Socket connection broken while receiving magic'
+                )
+            recv_data += data
+            # resync: if the accumulated bytes stop being a prefix of the
+            # magic, keep only the newest byte and continue scanning
+            if not MAGIC_BYTES.startswith(recv_data):
+                recv_data = data
+                continue
+
+    def _receive(self, length: int):
+        """Receive raw data from the socket by length.
+
+        :param length: length in bytes to receive
+        :type length: int
+        :raises RuntimeError: if socket connection is broken while receiving
+        :return: received bytes
+        :rtype: bytes
+        """
+        chunks = []
+        bytes_recd = 0
+        while bytes_recd < length:
+            chunk = self._socket.recv(min(length - bytes_recd, 2048))
+            if chunk == b'':
+                raise RuntimeError(
+                    'Socket connection broken while receiving data.'
+                )
+            chunks.append(chunk)
+            bytes_recd += len(chunk)
+        return b''.join(chunks)
+
+    def _send(self, msg: bytes):
+        """Send msg in bytes through the socket.
+
+        :param msg: string in bytes to send
+        :type msg: bytes
+        :raises RuntimeError: if connection is broken while sending
+        """
+        totalsent = 0
+        while totalsent < len(msg):
+            sent = self._socket.send(msg[totalsent:])
+            if sent == 0:
+                raise RuntimeError(
+                    'DBRM socket connection broken while sending.'
+                )
+            totalsent = totalsent + sent
+
+    def connect(self, host: str = DEFAULT_HOST, port: int = DEFAULT_PORT):
+        """Connect to the socket.
+
+        By default it connects to the DBRM master.
+        """
+        self._host = host
+        self._port = port
+        self._socket.settimeout(SOCK_TIMEOUT)
+        self._socket.connect((host, port))
+
+    def close(self):
+        """Close the socket.
+
+        Set the _host and _port instance variables to None to change state
+        to not connected. Then close the _socket.
+        """
+        self._host = None
+        self._port = None
+        self._socket.close()
+
+    def send(self, command_name: str, command_value: int = None):
+        """Top level send by command name and value.
+
+        :param command_name: name of a command
+        :type command_name: str
+        :param command_value: command value, defaults to None
+        :type command_value: int, optional
+        """
+        if DBRMSocketHandler.long_strings is None:
+            self._detect_protocol()
+        msg_bytes = self._make_msg(command_name, command_value)
+        self._send(msg_bytes)
+
+    def receive(self):
+        """Top level method to receive data from the socket.
+
+        Automatically reads the magic and data length from the data header.
+
+        :return: received bytes without the header
+        :rtype: bytes
+        """
+        self._receive_magic()
+        data_length = int.from_bytes(self._receive(4), 'little')
+        if DBRMSocketHandler.long_strings:
+            # receive the long strings count to meet new bytestream protocol
+            # requirements (after MCS 6.2.1 release)
+            long_strings_count_bytes = self._receive(4)
+        data_bytes = self._receive(data_length)
+        return data_bytes
diff --git a/cmapi/mcs_node_control/models/misc.py b/cmapi/mcs_node_control/models/misc.py
new file mode 100644
index 000000000..ca83fdc77
--- /dev/null
+++ b/cmapi/mcs_node_control/models/misc.py
@@ -0,0 +1,114 @@
+from __future__ import annotations
+import logging
+from pathlib import Path
+
+from lxml import etree
+
+from cmapi_server.constants import (
+    DEFAULT_MCS_CONF_PATH, MCS_DATA_PATH, MCS_MODULE_FILE_PATH,
+)
+
+
+module_logger = logging.getLogger()
+
+
+def read_module_id():
+    """Retrieve the module ID from MCS_MODULE_FILE_PATH.
+
+    :rtype: int
+    """
+    module_file = Path(MCS_MODULE_FILE_PATH)
+    return int(module_file.read_text()[2:])
+
+
+# TODO: Unused for now, never called in code.
+#       NodeConfig.apply_config does this.
+def set_module_id(module_id: int = 1):
+    """Write the module ID to MCS_MODULE_FILE_PATH.
+
+    :rtype: int
+    """
+    module_file = Path(MCS_MODULE_FILE_PATH)
+    return module_file.write_text(f'pm{module_id}\n')
+
+
+def get_dbroots_list(path: str = MCS_DATA_PATH):
+    """Search for dbroot directories on disk.
+
+    The method yields the numeric ids of the dbroots available.
+
+    :rtype: generator of ints
+    """
+    func_name = 'get_dbroots_list'
+    path = Path(path)
+    for child in path.glob('data[1-9]*'):
+        dir_list = str(child).split('/')  # presume Linux only
+        dbroot_id = int(''.join(list(filter(str.isdigit, dir_list[-1]))))
+        module_logger.debug(f'{func_name} The node has dbroot {dbroot_id}')
+        yield dbroot_id
+
+
+def get_workernodes() -> dict[str, dict[str, int]]:
+    """Get workernodes list.
+
+    Returns the network addresses of all workernodes.
+    This is an equivalent of all nodes.
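The glob in `get_dbroots_list` above is the whole id-extraction logic; a stripped-down equivalent (using `Path.name` instead of splitting on `/`, otherwise the same) is:

```python
from pathlib import Path

def dbroot_ids(path: str):
    # 'data12' -> 12: keep only the digits of the directory name
    for child in Path(path).glob('data[1-9]*'):
        yield int(''.join(ch for ch in child.name if ch.isdigit()))
```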
+
+    :return: workernodes dict
+    :rtype: dict[str, dict[str, int]]
+    """
+    # TODO: fix in MCOL-5147, get xml path from class that will handle xml
+    root = current_config_root()
+    workernodes = {}
+    # search for all tags starting with DBRM_Worker, e.g. DBRM_Worker1
+    workernodes_elements = root.xpath(
+        "//*[starts-with(local-name(), 'DBRM_Worker')]"
+    )
+    for workernode_el in workernodes_elements:
+        workernode_ip = workernode_el.find('./IPAddr').text
+        if workernode_ip == '0.0.0.0':
+            # skip unassigned entries bound to 0.0.0.0
+            continue
+        try:
+            workernode_port = int(workernode_el.find('./Port').text)
+        except (AttributeError, ValueError):
+            # AttributeError: no Port tag found, so we got None.text
+            # ValueError: non-numeric value in the tag text
+            module_logger.error(
+                'No Port tag found or wrong Port value for tag '
+                f'"{workernode_el.tag}".'
+            )
+            workernode_port = 8700
+        workernodes[workernode_el.tag] = {
+            'IPAddr': workernode_ip, 'Port': workernode_port
+        }
+    return workernodes
+
+
+def get_dbrm_master(config_filename: str = DEFAULT_MCS_CONF_PATH) -> dict:
+    """Get the DBRM master ip and port.
+
+    :param config_filename: path to xml conf, defaults to DEFAULT_MCS_CONF_PATH
+    :type config_filename: str, optional
+    :return: ip address and port of the DBRM master
+    :rtype: dict
+    """
+    # TODO: fix in MCOL-5147, get xml path from class that will handle xml
+    #       Use NodeConfig class as a template?
+    root = current_config_root(config_filename)
+    return {
+        'IPAddr': root.find("./DBRM_Controller/IPAddr").text,
+        'Port': root.find("./DBRM_Controller/Port").text
+    }
+
+
+def current_config_root(config_filename: str = DEFAULT_MCS_CONF_PATH):
+    """Retrieve the current configuration.
+
+    Reads the config and returns the root Element.
+
+    :rtype: lxml.Element
+    """
+    parser = etree.XMLParser(load_dtd=True)
+    tree = etree.parse(config_filename, parser=parser)
+    return tree.getroot()
diff --git a/cmapi/mcs_node_control/models/network_ifaces.py b/cmapi/mcs_node_control/models/network_ifaces.py
new file mode 100644
index 000000000..534df54d5
--- /dev/null
+++ b/cmapi/mcs_node_control/models/network_ifaces.py
@@ -0,0 +1,114 @@
+# Based on https://gist.github.com/provegard/1536682, which was
+# based on getifaddrs.py from pydlnadms [http://code.google.com/p/pydlnadms/].
+# Only tested on Linux!
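Before moving on to the interface-enumeration code below, here is a quick way to exercise the xpath used by `get_workernodes` in the misc.py helpers just above; the config path is illustrative.

```python
# Print every DBRM_WorkerN entry from a parsed Columnstore.xml:
root = current_config_root('/etc/columnstore/Columnstore.xml')
for el in root.xpath("//*[starts-with(local-name(), 'DBRM_Worker')]"):
    print(el.tag, el.find('./IPAddr').text, el.find('./Port').text)
```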
+# WARNING: Not working on Mac OS (tested on 10.12 Sierra) +# TODO: move to psutil lib + + +from socket import AF_INET, AF_INET6, inet_ntop +from ctypes import ( + Structure, Union, POINTER, + pointer, get_errno, cast, + c_ushort, c_byte, c_void_p, c_char_p, c_uint, c_int, c_uint16, c_uint32 +) +import ctypes.util +import ctypes + + +class struct_sockaddr(Structure): + _fields_ = [ + ('sa_family', c_ushort), + ('sa_data', c_byte * 14),] + + +class struct_sockaddr_in(Structure): + _fields_ = [ + ('sin_family', c_ushort), + ('sin_port', c_uint16), + ('sin_addr', c_byte * 4)] + + +class struct_sockaddr_in6(Structure): + _fields_ = [ + ('sin6_family', c_ushort), + ('sin6_port', c_uint16), + ('sin6_flowinfo', c_uint32), + ('sin6_addr', c_byte * 16), + ('sin6_scope_id', c_uint32)] + + +class union_ifa_ifu(Union): + _fields_ = [ + ('ifu_broadaddr', POINTER(struct_sockaddr)), + ('ifu_dstaddr', POINTER(struct_sockaddr)),] + + +class struct_ifaddrs(Structure): + pass +struct_ifaddrs._fields_ = [ + ('ifa_next', POINTER(struct_ifaddrs)), + ('ifa_name', c_char_p), + ('ifa_flags', c_uint), + ('ifa_addr', POINTER(struct_sockaddr)), + ('ifa_netmask', POINTER(struct_sockaddr)), + ('ifa_ifu', union_ifa_ifu), + ('ifa_data', c_void_p),] + +libc = ctypes.CDLL(ctypes.util.find_library('c')) + + +def ifap_iter(ifap): + ifa = ifap.contents + while True: + yield ifa + if not ifa.ifa_next: + break + ifa = ifa.ifa_next.contents + + +def getfamaddr(sa): + family = sa.sa_family + addr = None + if family == AF_INET: + sa = cast(pointer(sa), POINTER(struct_sockaddr_in)).contents + addr = inet_ntop(family, sa.sin_addr) + elif family == AF_INET6: + sa = cast(pointer(sa), POINTER(struct_sockaddr_in6)).contents + addr = inet_ntop(family, sa.sin6_addr) + return family, addr + + +class NetworkInterface(object): + def __init__(self, name): + self.name = name + self.index = libc.if_nametoindex(name) + self.addresses = {} + + def __str__(self): + return "%s [index=%d, IPv4=%s, IPv6=%s]" % ( + self.name, self.index, + self.addresses.get(AF_INET), + self.addresses.get(AF_INET6)) + + +def get_network_interfaces(): + ifap = POINTER(struct_ifaddrs)() + result = libc.getifaddrs(pointer(ifap)) + if result != 0: + raise OSError(get_errno()) + del result + try: + retval = {} + for ifa in ifap_iter(ifap): + name = ifa.ifa_name.decode("UTF-8") + i = retval.get(name) + if not i: + i = retval[name] = NetworkInterface(name) + family, addr = getfamaddr(ifa.ifa_addr.contents) + if addr: + if family not in i.addresses: + i.addresses[family] = list() + i.addresses[family].append(addr) + return retval.values() + finally: + libc.freeifaddrs(ifap) diff --git a/cmapi/mcs_node_control/models/node_config.py b/cmapi/mcs_node_control/models/node_config.py new file mode 100644 index 000000000..78a16d78b --- /dev/null +++ b/cmapi/mcs_node_control/models/node_config.py @@ -0,0 +1,574 @@ +import configparser +import grp +import logging +import pwd +import re +import socket +from os import mkdir, replace, chown +from pathlib import Path +from shutil import copyfile +from xml.dom import minidom # to pick up pretty printing functionality + +from lxml import etree + +from cmapi_server.constants import ( + DEFAULT_MCS_CONF_PATH, DEFAULT_SM_CONF_PATH, + MCS_MODULE_FILE_PATH, +) +# from cmapi_server.managers.process import MCSProcessManager +from mcs_node_control.models.misc import ( + read_module_id, get_dbroots_list +) +from mcs_node_control.models.network_ifaces import get_network_interfaces + + +module_logger = logging.getLogger() + + +class NodeConfig: + 
"""Class to operate with the configuration file. + + The class instance applies new config or retrives current. + + config_filename and output_filename allow tests to override + the input & output of this fcn + The output in this case may be a config file upgraded to version 1. + """ + def get_current_config_root( + self, config_filename: str = DEFAULT_MCS_CONF_PATH, upgrade=True + ): + """Retrievs current configuration. + + Read the config and returns Element. + TODO: pretty the same function in misc.py - review + + :rtype: lxml.Element + """ + parser = etree.XMLParser(load_dtd=True) + tree = etree.parse(config_filename, parser=parser) + self.upgrade_config(tree=tree, upgrade=upgrade) + return tree.getroot() + + def get_root_from_string(self, config_string: str): + root = etree.fromstring(config_string) + self.upgrade_config(root=root) + return root + + def upgrade_from_v0(self, root): + revision = etree.SubElement(root, 'ConfigRevision') + revision.text = '1' + cluster_manager = etree.SubElement(root, 'ClusterManager') + cluster_manager.text = str(self.get_module_net_address(root=root)) + cluster_name = etree.SubElement(root, 'ClusterName') + cluster_name.text = 'MyCluster' + + # Need to get the addresses/host names of all nodes. + # Should all be listed as DBRM_worker nodes + addrs = set() + num = 1 + max_node = 1 + while True: + node = root.find(f'./DBRM_Worker{num}/IPAddr') + if node is None: + break + if node.text != '0.0.0.0': + addrs.add(node.text) + if max_node < num: + max_node = num + num += 1 + + # NextNodeId can be derived from the max DBRM_worker entry with non-0 + # ip address + next_node_id = etree.SubElement(root, 'NextNodeId') + next_node_id.text = str(max_node + 1) + + # NextDBRootId is the max current dbroot in use + 1 + num = 1 + max_dbroot = 1 + while num < 100: + node = root.find(f'./SystemConfig/DBRoot{num}') + if node is not None: + max_dbroot = num + num += 1 + next_dbroot_id = etree.SubElement(root, 'NextDBRootId') + next_dbroot_id.text = str(max_dbroot + 1) + + # The current primary node is listed under DBRMControllerNode. + # Might as well start with that. + primary_node_addr = root.find('./DBRM_Controller/IPAddr').text + + # Put them all in the DesiredNodes and ActiveNodes sections + desired_nodes = etree.SubElement(root, 'DesiredNodes') + active_nodes = etree.SubElement(root, 'ActiveNodes') + for addr in addrs: + node = etree.SubElement(desired_nodes, 'Node') + node.text = addr + node = etree.SubElement(active_nodes, 'Node') + node.text = addr + + # Add an empty InactiveNodes section and set the primary node addr + inactive_nodes = etree.SubElement(root, 'InactiveNodes') + primary_node = etree.SubElement(root, 'PrimaryNode') + primary_node.text = primary_node_addr + + # Add Maintenance tag and set to False + maintenance = etree.SubElement(root, 'Maintenance') + maintenance.text = str(False).lower() + + + def upgrade_config(self, tree=None, root=None, upgrade=True): + """ + Add the parts that might be missing after an upgrade from an earlier + version. + + .. note:: one or the other optional parameter should be specified (?) 
+ """ + if root is None and tree is not None: + root = tree.getroot() + + rev_node = root.find('./ConfigRevision') + + if rev_node is None and upgrade: + self.upgrade_from_v0(root) + # as we add revisions, add add'l checks on rev_node.text here + + def write_config(self, tree, filename=DEFAULT_MCS_CONF_PATH): + tmp_filename = filename + ".cmapi.tmp" + with open(tmp_filename, "w") as f: + f.write(self.to_string(tree)) + replace(tmp_filename, filename) # atomic replacement + + def to_string(self, tree): + # TODO: try to use lxml to do this to avoid the add'l dependency + xmlstr = minidom.parseString(etree.tostring(tree)).toprettyxml( + indent=" " + ) + # fix annoying issue of extra newlines added by toprettyxml() + xmlstr = '\n'.join([ + line.rstrip() for line in xmlstr.split('\n') if line.strip() != "" + ]) + return xmlstr + + def get_dbrm_conn_info(self, root=None): + """Retrievs current DBRM master IP and port + + Read the config and returns a dict with the connection information. + + :rtype: dict + """ + if root is None: + return None + addr = '' + port = 0 + for el in root: + if el.tag == 'DBRM_Controller': + for subel in el: + if subel.tag == 'IPAddr': + addr = subel.text + elif subel.tag == 'Port': + port = subel.text + return {'IPAddr': addr, 'Port': port} + + return None + + def apply_config( + self, config_filename: str = DEFAULT_MCS_CONF_PATH, + xml_string: str = None, sm_config_filename: str = None, + sm_config_string: str = None + ): + """Applies the configuration WIP. + + Instance iterates over the xml nodes. + + : param config_filename: string 4 testing + : param xml_string: string + + :rtype: bool + """ + if xml_string is None: + return + + current_root = self.get_current_config_root(config_filename) + parser = etree.XMLParser(load_dtd=True) + new_root = etree.fromstring(xml_string, parser=parser) + + try: + # We don't change module ids for non-single nodes. + # if self.is_single_node(root=current_root): + # set_module_id(self.get_new_module_id(new_root)) + + # make sure all of the dbroot directories exist on this node + for dbroot in self.get_all_dbroots(new_root): + try: + node = new_root.find(f'./SystemConfig/DBRoot{dbroot}') + mkdir(node.text, mode=0o755) + + # if we are using the systemd dispatcher we need to change + # ownership on any created dirs to mysql:mysql + # TODO: remove conditional once container dispatcher will + # use non-root by default + # TODO: what happened if we change ownership in container? 
+ # check the container installations works as expected + # from cmapi_server.managers.process import MCSProcessManager + # if MCSProcessManager.dispatcher_name == 'systemd': + uid = pwd.getpwnam('mysql').pw_uid + gid = grp.getgrnam('mysql').gr_gid + chown(node.text, uid, gid) + except FileExistsError: + pass + # Save current config + config_file = Path(config_filename) + config_dir = config_file.resolve().parent + copyfile( + config_file, f'{config_dir}/{config_file.name}.cmapi.save' + ) + + # Save new config + self.write_config(tree=new_root, filename=config_filename) + + # Save current and new storagemanager config + if sm_config_string and sm_config_filename: + sm_config_file = Path(sm_config_filename) + sm_config_dir = sm_config_file.resolve().parent + copyfile( + sm_config_file, + f'{sm_config_dir}/{sm_config_file.name}.cmapi.save' + ) + with open(sm_config_filename, 'w') as sm_config_file: + sm_config_file.write(sm_config_string) + # TODO: review + # figure out what to put in the 'module' file to make + # the old oam library happy + module_file = None + try: + pm_num = self.get_current_pm_num(new_root) + with open(MCS_MODULE_FILE_PATH, 'w') as module_file: + module_file.write(f'pm{pm_num}\n') + module_logger.info( + f'Wrote "pm{pm_num}" to {MCS_MODULE_FILE_PATH}' + ) + except Exception: + module_logger.error( + 'Failed to get or set this node\'s pm number.\n' + 'You may observe add\'l errors as a result.\n', + exc_info=True + ) + except: + # Raise an appropriate exception + module_logger.error( + f'{self.apply_config.__name__} throws an exception.' + 'The original config must be restored by ' + 'explicit ROLLBACK command or timeout.', + exc_info=True + ) + raise + + def in_active_nodes(self, root): + my_names = set(self.get_network_addresses_and_names()) + active_nodes = [ + node.text for node in root.findall("./ActiveNodes/Node") + ] + for node in active_nodes: + if node in my_names: + return True + return False + + def get_current_pm_num(self, root): + # Find this node in the Module* tags, return the module number + + my_names = set(self.get_network_addresses_and_names()) + smc_node = root.find("./SystemModuleConfig") + pm_count = int(smc_node.find("./ModuleCount3").text) + for pm_num in range(1, pm_count + 1): + ip_addr = smc_node.find(f"./ModuleIPAddr{pm_num}-1-3").text + name = smc_node.find(f"./ModuleHostName{pm_num}-1-3").text + if ip_addr in my_names: + module_logger.info(f" -- Matching against ModuleIPAddr{pm_num}-1-3, which says {ip_addr}") + return pm_num + if name in my_names: + module_logger.info(f" -- Matching against ModuleHostName{pm_num}-1-3, which says {name}") + return pm_num + raise Exception("Did not find my IP addresses or names in the SystemModuleConfig section") + + + def rollback_config(self, config_filename: str = DEFAULT_MCS_CONF_PATH): + """Rollback the configuration. + + Copyback the copy of the configuration file. + + : param config_filename: Columnstore config file path + :rtype: dict + """ + # TODO: Rollback doesn't restart needed processes? + config_file = Path(config_filename) + config_dir = config_file.resolve().parent + backup_path = f"{config_dir}/{config_file.name}.cmapi.save" + config_file_copy = Path(backup_path) + if config_file_copy.exists(): + replace(backup_path, config_file) # atomic replacement + + + def get_current_config(self, config_filename: str = DEFAULT_MCS_CONF_PATH): + """Retrievs current configuration. + + Read the config and convert it into bytes string. 
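The save/replace dance that `apply_config`, `write_config` and `rollback_config` perform above condenses to the sketch below; `swap_in` is a hypothetical helper for illustration, not part of the patch.

```python
from os import replace
from shutil import copyfile

def swap_in(new_text: str, path: str = '/etc/columnstore/Columnstore.xml'):
    copyfile(path, path + '.cmapi.save')  # rollback point for rollback_config
    tmp = path + '.cmapi.tmp'
    with open(tmp, 'w') as f:
        f.write(new_text)
    replace(tmp, path)  # atomic on POSIX: readers never see a torn file
```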
+ + :rtype: string + + ..TODO: fix using self.get_current_config_root() + """ + parser = etree.XMLParser(load_dtd=True) + tree = etree.parse(config_filename, parser=parser) + self.upgrade_config(tree=tree) + # TODO: Unicode? UTF-8 may be? + return etree.tostring( + tree.getroot(), pretty_print=True, encoding='unicode' + ) + + + def get_current_sm_config( + self, config_filename: str = DEFAULT_SM_CONF_PATH + ) -> str: + """Retrievs current SM configuration + + Read the config and convert it into a string. + + :rtype: str + """ + func_name = 'get_current_sm_config' + sm_config_path = Path(config_filename) + try: + return sm_config_path.read_text(encoding='utf-8') + except FileNotFoundError: + module_logger.error(f"{func_name} SM config {config_filename} not found.") + return '' + + + def s3_enabled(self, config_filename: str = DEFAULT_SM_CONF_PATH) -> bool: + """Checks if SM is enabled + + Reads SM config and checks if storage set to S3. + It also checks for additional settings in the XML that must be set too. + + :rtype: bool + """ + func_name = 's3_enabled' + sm_config = configparser.ConfigParser() + if len(sm_config.read(config_filename)) > 0: + storage = sm_config.get('ObjectStorage', 'service') + if storage is None: + storage = 'LocalStorage' + + if storage.lower() == 's3': + config_root = self.get_current_config_root() + if not config_root.find('./Installation/DBRootStorageType').text.lower() == "storagemanager": + module_logger.error(f"{func_name} DBRootStorageType.lower() != storagemanager") + if not config_root.find('./StorageManager/Enabled').text.lower() == "y": + module_logger.error(f"{func_name} StorageManager/Enabled.lower() != y") + if not config_root.find('./SystemConfig/DataFilePlugin').text == "libcloudio.so": + module_logger.error(f"{func_name} SystemConfig/DataFilePlugin != libcloudio.so") + + return True + else: + module_logger.error(f"{func_name} SM config {config_filename} not found.") + + return False + + def get_network_addresses(self): + """Retrievs the list of the network addresses + + Generator that yields network interface addresses. + + :rtype: str + """ + for ni in get_network_interfaces(): + for fam in [socket.AF_INET, socket.AF_INET6]: + addrs = ni.addresses.get(fam) + if addrs is not None: + for addr in addrs: + yield(addr) + + def get_network_addresses_and_names(self): + """Retrievs the list of the network addresses, hostnames, and aliases + + Generator that yields network interface addresses, hostnames, and aliases + + :rtype: str + """ + for ni in get_network_interfaces(): + for fam in [socket.AF_INET, socket.AF_INET6]: + addrs = ni.addresses.get(fam) + if addrs is not None: + for addr in addrs: + yield(addr) + try: + (host, aliases, _) = socket.gethostbyaddr(addr) + except: + continue + yield host + for alias in aliases: + yield alias + + def is_primary_node(self, root=None): + """Checks if this node is the primary node. + + Reads the config and compares DBRM_Controller IP or + hostname with the this node's IP and hostname. + + :rtype: bool + """ + if root is None: + root = self.get_current_config_root() + + primary_address = self.get_dbrm_conn_info(root)['IPAddr'] + return primary_address in self.get_network_addresses_and_names() + + def is_single_node(self, + root=None): + """Checks if this node is the single node. + + Reads the config and compares DBRMMaster IP with the predefined localhost addresses. 
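The primary-node check above boils down to set membership; roughly, and assuming a readable config:

```python
# A node is primary when the DBRM_Controller address is one of its own
# addresses or hostnames.
nc = NodeConfig()
root = nc.get_current_config_root()
controller_addr = nc.get_dbrm_conn_info(root)['IPAddr']
i_am_primary = controller_addr in set(nc.get_network_addresses_and_names())
```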
+ + :rtype: bool + """ + if root is None: + root = self.get_current_config_root() + + master_address = self.get_dbrm_conn_info(root)['IPAddr'] + if master_address in ['127.0.0.1', 'localhost', '::1']: + return True + return False + + def get_new_module_id(self, new_root=None): + """Retrieves new module id. + + Reads new XML config and searches IP belongs to this host in SystemModuleConfig.ModuleIPAddrX-1-3. X is the new module id. + + :rtype: int + """ + func_name = 'get_new_module_id' + current_module_id = read_module_id() + + if new_root is None: + module_logger.error(f'{func_name} Empty new XML tree root.') + return current_module_id + + net_address = self.get_module_net_address(new_root, current_module_id) + # Use getaddrinfo in case of IPv6 + if net_address is None: + module_logger.error(f'{func_name} Columnstore.xml has unknown value in SystemModuleConfig.\ +ModuleIPAddr{current_module_id}-1-3.') + raise RuntimeError('net_address is None.') + if socket.gethostbyname(net_address) in self.get_network_addresses(): + return current_module_id + + # Use getaddrinfo in case of IPv6 + # This fires for a added node when node id changes from 1 to something + for module_entry in self.get_modules_addresses(new_root): + if module_entry['addr'] is not None: + net_addr = socket.gethostbyname(module_entry['addr']) + if net_addr in self.get_network_addresses(): + module_logger.debug(f'{func_name} New module id \ +{module_entry["id"]}') + return int(module_entry['id']) + + module_logger.error(f'{func_name} Cannot find new module id for \ +the node.') + raise RuntimeError('Fail to find module id.') + + def get_module_net_address(self, root=None, module_id: int = None): + """Retrieves the module network address. + + Reads new XML config and returns IP or + hostname from SystemModuleConfig.ModuleIPAddrX-1-3. + + :rtype: string + """ + func_name = 'get_module_net_address' + if module_id is None: + module_id = read_module_id() + + if root is None: + module_logger.error(f'{func_name} Empty XML root.') + return + + for el in root: + if el.tag == 'SystemModuleConfig': + for subel in el: + if subel.tag == f'ModuleIPAddr{module_id}-1-3': + module_logger.debug( + f'{func_name} Module {module_id} ' + f'network address {subel.text}' + ) + return subel.text + + module_logger.error(f'{func_name} Module {module_id} was not found.') + return + + def get_modules_addresses(self, root=None): + """Retrieves the modules network addresses. + + Reads new XML config and returns IP or hostname from + SystemModuleConfig.ModuleIPAddrX-1-3 with X being a node id. + + :rtype: dict + """ + func_name = 'get_module_addresses' + if root is None: + module_logger.error(f'{func_name} Empty XML root.') + return None + + regex_string = 'ModuleIPAddr[0-9]+-1-3' + for el in root: + if el.tag == 'SystemModuleConfig': + for subel in el: + module_ip_m = re.match(regex_string, subel.tag) + if module_ip_m is not None: + id_m = re.search('[0-9]+', module_ip_m.group(0)) + module_id = id_m.group(0) + module_logger.debug( + f'{func_name} Module {module_id} ' + f'network address {subel.text}' + ) + yield {'addr': subel.text, 'id': module_id} + + + module_logger.error(f'{func_name} Module {module_id} was not found.') + return None + + def dbroots_to_create(self, root=None, module_id:int=None): + """Generates dbroot ids if there are new dbroots to be created/renamed + + Reads new XML config and generates dbroot ids if on-disk dbroots differs from the config's set. 
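The matching step inside `get_new_module_id` above resolves the configured value first, since it may be a hostname rather than an address; a rough sketch:

```python
import socket

nc = NodeConfig()
root = nc.get_current_config_root()
configured = nc.get_module_net_address(root, module_id=read_module_id())
# gethostbyname() also accepts a plain IPv4 address and returns it unchanged
keeps_id = (configured is not None
            and socket.gethostbyname(configured) in set(nc.get_network_addresses()))
```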
+
+        :rtype: generator of ints
+        """
+        func_name = 'dbroots_to_create'
+        if module_id is None:
+            module_id = read_module_id()
+
+        if root is None:
+            module_logger.error(f'{func_name} Empty XML root.')
+            return
+
+        # materialize the generator: membership is tested more than once
+        current_dbroot_list = list(get_dbroots_list())
+
+        regex_string = f'ModuleDBRootID{module_id}-[0-9]+-3'
+        for el in root:
+            if el.tag == 'SystemModuleConfig':
+                for subel in el:
+                    if re.match(regex_string, subel.tag) is not None and \
+                            int(subel.text) not in current_dbroot_list:
+                        module_logger.debug(
+                            f'{func_name} Module {module_id} '
+                            f'has dbroot {subel.text}'
+                        )
+                        yield int(subel.text)
+        return
+
+    def get_all_dbroots(self, root):
+        dbroots = []
+        smc_node = root.find("./SystemModuleConfig")
+        mod_count = int(smc_node.find("./ModuleCount3").text)
+        for i in range(1, mod_count + 1):
+            for j in range(1, int(smc_node.find(f"./ModuleDBRootCount{i}-3").text) + 1):
+                dbroots.append(smc_node.find(f"./ModuleDBRootID{i}-{j}-3").text)
+        return dbroots
diff --git a/cmapi/mcs_node_control/models/node_status.py b/cmapi/mcs_node_control/models/node_status.py
new file mode 100644
index 000000000..c4b526055
--- /dev/null
+++ b/cmapi/mcs_node_control/models/node_status.py
@@ -0,0 +1,91 @@
+import logging
+import socket
+
+from cmapi_server.constants import MCS_DATA_PATH, MCS_MODULE_FILE_PATH
+from mcs_node_control.models.dbrm import DBRM
+from mcs_node_control.models.misc import get_dbroots_list, read_module_id
+from mcs_node_control.models.process import get_host_uptime
+
+
+PROC_NAMES = ['ExeMgr', 'PrimProc', 'WriteEngine', 'controllernode',
+              'workernode', 'cmagent', 'DMLProc', 'DDLProc']
+
+
+module_logger = logging.getLogger()
+
+
+class NodeStatus:
+    """Class to tell the status of the node.
+
+    Inspects the runtime of the cluster and OS and returns its observations.
+    """
+    def get_cluster_mode(self):
+        """Read the cluster mode.
+
+        The cluster can be in readwrite or readonly mode. It can also be
+        ready or not ready, but that is not important at this point. We
+        presume that if there is no connection with the DBRM master, the
+        cluster is readonly.
+
+        TODO:
+            - Is it ok to have this method here in NodeStatus?
+              Move to DBRM.
+            - pass 'root' and config_filename arguments
+              (likewise dbrm.set_cluster_mode)
+
+        :rtype: string
+        """
+        try:
+            with DBRM() as dbrm:
+                return dbrm.get_cluster_mode()
+        except (ConnectionRefusedError, RuntimeError, socket.error):
+            module_logger.error(
+                'Cannot establish or use DBRM connection.',
+                exc_info=True
+            )
+            return 'readonly'
+
+    def get_dbrm_status(self):
+        """Read the DBRM status.
+
+        The DBRM (Block Resolution Manager) operates in two modes:
+        master and slave. This method returns the mode of this node
+        by looking for a running controllernode process.
+
+        :rtype: string
+        """
+        return DBRM.get_dbrm_status()
+
+    def get_dbroots(self, path: str = MCS_DATA_PATH):
+        """Search for dbroots.
+
+        The method yields the numeric ids of the dbroots available.
+
+        :rtype: generator of ints
+        """
+        for id in get_dbroots_list(path):
+            yield id
+
+    def get_host_uptime(self):
+        """Retrieve uptime in seconds.
+
+        :rtype: int : seconds
+        """
+        return get_host_uptime()
+
+    def get_module_id(self):
+        """Retrieve the module ID from MCS_MODULE_FILE_PATH.
+
+        :rtype: int
+        """
+        func_name = 'get_module_id'
+        try:
+            module_id = read_module_id()
+        except FileNotFoundError:
+            module_id = 0
+            module_logger.error(
+                f'{func_name} {MCS_MODULE_FILE_PATH} file is absent.'
+ ) + return module_id diff --git a/cmapi/mcs_node_control/models/process.py b/cmapi/mcs_node_control/models/process.py new file mode 100644 index 000000000..d3a715075 --- /dev/null +++ b/cmapi/mcs_node_control/models/process.py @@ -0,0 +1,110 @@ +import os +import time + +import psutil + +PROCFS_PATH = '/proc/' # Linux only + + +def open_binary(fname, **kwargs): + return open(fname, "rb", **kwargs) + + +def get_host_uptime(): + """ + Return the system boot time expressed in seconds since the epoch. + + :rtype: int : diff b/w current epoch and boot epoch + """ + path = f'{PROCFS_PATH}stat' + boot_time = 0 + with open_binary(path) as f: + for line in f: + if line.startswith(b'btime'): + boot_time = float(line.strip().split()[1]) + return int(time.time() - int(boot_time)) + return 0 + + +class Process(): + """An interface to retrieve data from proc.""" + def get_proc_iterator(self): + for pid in self.pids(): + yield pid + + + def pids(self): + """Returns a list of PIDs currently running on the system.""" + return [int(x) for x in os.listdir(PROCFS_PATH) if x.isdigit()] + + + def name(self, pid: int): + """Method to retrive name associated with the pid.""" + return self.parse_stat_file(pid)['name'] + + + def parse_stat_file(self, pid: int): + """Parse /proc/{pid}/stat file and return a dict with various + process info. + + Using "man proc" as a reference: where "man proc" refers to + position N always substract 3 (e.g ppid position 4 in + 'man proc' == position 1 in here). + """ + ret = {} + try: + with open_binary(f"{PROCFS_PATH}{pid}/stat") as f: + data = f.read() + # Process name is between parentheses. It can contain spaces and + # other parentheses. This is taken into account by looking for + # the first occurrence of "(" and the last occurence of ")". + rpar = data.rfind(b')') + name = data[data.find(b'(') + 1:rpar] + fields = data[rpar + 2:].split() + + ret['name'] = name + ret['status'] = fields[0] + ret['ppid'] = fields[1] + ret['ttynr'] = fields[4] + ret['utime'] = fields[11] + ret['stime'] = fields[12] + ret['children_utime'] = fields[13] + ret['children_stime'] = fields[14] + ret['create_time'] = fields[19] + ret['cpu_num'] = fields[36] + ret['blkio_ticks'] = fields[39] # aka 'delayacct_blkio_ticks' + except (PermissionError, ProcessLookupError, FileNotFoundError): + ret['name'] = '' + ret['status'] = '' + ret['ppid'] = '' + ret['ttynr'] = '' + ret['utime'] = '' + ret['stime'] = '' + ret['children_utime'] = '' + ret['children_stime'] = '' + ret['create_time'] = '' + ret['cpu_num'] = '' + ret['blkio_ticks'] = '' # aka 'delayacct_blkio_ticks' + return ret + + @staticmethod + def check_process_alive(proc_name: str) -> bool: + """Check process running. + + :param proc_name: process name + :type proc_name: str + :return: True if process running, otherwise False + :rtype: bool + """ + # Iterate over the all the running process + for proc in psutil.process_iter(): + try: + # Check if process name equals to the given name string. 
+ if proc_name.lower() == proc.name().lower(): + return True + except ( + psutil.NoSuchProcess, psutil.AccessDenied, + psutil.ZombieProcess + ): + pass + return False diff --git a/cmapi/mcs_node_control/test/Columnstore_new.xml b/cmapi/mcs_node_control/test/Columnstore_new.xml new file mode 100644 index 000000000..43de5205f --- /dev/null +++ b/cmapi/mcs_node_control/test/Columnstore_new.xml @@ -0,0 +1,587 @@ + + + MaxScale IP + 42 + + 192.168.0.101 + 8601 + um1 + + + 0.0.0.0 + 8602 + + + 192.168.0.102 + 8603 + + + 192.168.0.102 + 8606 + + + 192.168.0.102 + 8604 + + + 192.168.0.103 + 8605 + + + + + 127.0.0.1 + 8800 + + + 0.0.0.0 + 8800 + + + 192.168.0.101 + 8800 + + + 192.168.0.102 + 8800 + + + 0.0.0.0 + 8622 + + + 192.168.0.101 + 8622 + + + 192.168.0.102 + 8622 + + + 192.168.0.102 + 8630 + + + 192.168.0.101 + 8612 + + + 192.168.0.101 + 8614 + + + 10000 + + + 4 + 2 + 128 + 10K + 0 + 512 + 512 + + 1 + 0 + y + + + + y + + + + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + 192.168.0.102 + 8620 + + + 192.168.0.103 + 8620 + + + 192.168.0.105 + 8620 + + + 192.168.0.106 + 8620 + + + en_US.utf8 + columnstore-1 + pm1 + um1 + + 4 + /usr/local/mariadb/columnstore/data1 + $INSTALLDIR/data1/systemFiles/dbrm/BRM_saves + $INSTALLDIR/data1/systemFiles/dbrm/tablelocks + 20 + 100000 + 10 + $INSTALLDIR + 95 + OFF + + /rdwrscratch + + /tmp/columnstore_tmp_files + /usr/local/mariadb/columnstore/data2 + /usr/local/mariadb/columnstore/data3 + /usr/local/mariadb/columnstore/data4 + + + dm + Director Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + um + User Module + 2 + 192.168.0.101 + nvm002314 + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + pm + Performance Module + 4 + 192.168.0.102 + nvm002315 + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + 1 + 1 + ENABLED + 192.168.0.104 + 192.168.0.104 + 1 + 2 + 1 + 3 + 1 + 4 + ENABLED + nvm002316 + 192.168.0.103 + ENABLED + nvm002980 + 192.168.0.105 + ENABLED + nvm002981 + 192.168.0.106 + unassigned + 0.0.0.0 + unassigned + 0.0.0.0 + unassigned + 0.0.0.0 + unassigned + 0.0.0.0 + unassigned + 0.0.0.0 + unassigned + 0.0.0.0 + + + 0 + unassigned + 0.0.0.0 + ENABLED + + + 1000 + $INSTALLDIR/data1/systemFiles/dbrm/SMTxnID + + + + 1GB + + + + $INSTALLDIR/data1/systemFiles/dbrm/oidbitmap + + 3000 + + + $INSTALLDIR/data/bulk + $INSTALLDIR/data1/systemFiles/bulkRollback + 98 + 1 + n + + + 6 + 192.168.0.102 + 8616 + + + + 192.168.0.102 + 8700 + pm1 + + + 192.168.0.101 + 8700 + um1 + + + 192.168.0.104 + 8700 + um2 + + + 192.168.0.103 + 8700 + pm2 + + + 192.168.0.105 + 8700 + pm3 + + + 192.168.0.106 + 8700 + pm4 + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 
0.0.0.0 + 8700 + unassigned + + + + + + + 1 + 0 + 0 + 65536 + 2K + 200 + 0 + 65 + + + 1 + n + y + external + internal + y + /etc/profile.d/columnstoreAlias.sh + + + + + 8 + 0x0 + + + 128 + 128K + 64M + 25% + 100 + N + Y + Snappy + + + 16K + 32 + 1 + + + 40 + 100 + + + + + + N + + + 127.0.0.1 + 3306 + root + + + + + + + Y + + + N + + + Y + Snappy + + + 127.0.0.1 + 0 + + + 192.168.0.104 + 8800 + + + 192.168.0.104 + 8622 + + + 192.168.0.104 + 8601 + um2 + + + 192.168.0.103 + 8800 + + + 192.168.0.103 + 8622 + + + 192.168.0.103 + 8630 + + + 192.168.0.105 + 8800 + + + 192.168.0.105 + 8622 + + + 192.168.0.105 + 8630 + + + 192.168.0.106 + 8800 + + + 192.168.0.106 + 8622 + + + 192.168.0.106 + 8630 + + + + + + + + + 192.168.0.102 + + diff --git a/cmapi/mcs_node_control/test/Columnstore_old.xml b/cmapi/mcs_node_control/test/Columnstore_old.xml new file mode 100644 index 000000000..60cf4c04e --- /dev/null +++ b/cmapi/mcs_node_control/test/Columnstore_old.xml @@ -0,0 +1,531 @@ + + + + 127.0.0.1 + 8601 + unassigned + + + 0.0.0.0 + 8602 + + + 127.0.0.1 + 8603 + + + 127.0.0.1 + 8606 + + + 127.0.0.1 + 8604 + + + 0.0.0.0 + 8605 + + + + + 127.0.0.1 + 8800 + + + 0.0.0.0 + 8800 + + + 0.0.0.0 + 8800 + + + 127.0.0.1 + 8800 + + + 0.0.0.0 + 8622 + + + 0.0.0.0 + 8622 + + + 127.0.0.1 + 8622 + + + 127.0.0.1 + 8630 + + + 127.0.0.1 + 8612 + + + 127.0.0.1 + 8614 + + + 10000 + + + 1 + 2 + 128 + 10K + + 0 + 512 + 512 + + + + 1 + 0 + n + + + + + y + + + + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + 127.0.0.1 + 8620 + + + C + columnstore-1 + pm1 + pm1 + + 1 + /var/lib/columnstore/data1 + /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves + /var/lib/columnstore/data1/systemFiles/dbrm/tablelocks + 20 + + 100000 + + 10 + + 95 + + OFF + + /rdwrscratch + + /tmp/columnstore_tmp_files + + + dm + Director Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + um + User Module + 0 + 0.0.0.0 + unassigned + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + unassigned + unassigned + pm + Performance Module + 1 + 127.0.0.1 + localhost + ENABLED + 0 + 0 + 0 + 0 + 90 + 80 + 70 + 90 + 0 + 0 + 90 + 80 + 70 + / + 1 + 1 + + + 0 + unassigned + 0.0.0.0 + ENABLED + + + 1000 + /var/lib/columnstore/data1/systemFiles/dbrm/SMTxnID + + + + 1GB + + + + /var/lib/columnstore/data1/systemFiles/dbrm/oidbitmap + + 3000 + + + /var/log/mariadb/columnstore/data/bulk + /var/lib/columnstore/data1/systemFiles/bulkRollback + 98 + 1 + n + + + 1 + 127.0.0.1 + 8616 + + + + 127.0.0.1 + 8700 + pm1 + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 0.0.0.0 + 8700 + unassigned + + + 
+ + + + + 1 + + 0 + 0 + 65536 + 2K + 200 + 0 + 50 + + + 2 + n + n + internal + internal + /etc/profile.d/columnstoreAlias.sh + + + + + 4 + + 0x0 + + + 128 + 128K + + 1G + 25% + 100 + N + Y + Snappy + + + 16K + 16 + 1 + + + + + + 100 + + + + + + + + + N + + + 127.0.0.1 + 3306 + root + + + + + + + N + + + N + + + Y + Snappy + + + 127.0.0.1 + 0 + + + 30 + N + + + + + 1 + 127.0.0.1 + MyCluster + 2 + 2 + + 127.0.0.1 + + + 127.0.0.1 + + + 127.0.0.1 + diff --git a/cmapi/mcs_node_control/test/__init__.py b/cmapi/mcs_node_control/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cmapi/mcs_node_control/test/settings.py b/cmapi/mcs_node_control/test/settings.py new file mode 100644 index 000000000..b45197792 --- /dev/null +++ b/cmapi/mcs_node_control/test/settings.py @@ -0,0 +1,7 @@ +import os + +from cmapi_server.constants import CMAPI_DEFAULT_CONF_PATH + + +CONFIG_PATH_NEW = './mcs_node_control/test/Columnstore_new.xml' +CONFIG_PATH_OLD = './mcs_node_control/test/Columnstore_old.xml' diff --git a/cmapi/mcs_node_control/test/test_dbrm_socket.py b/cmapi/mcs_node_control/test/test_dbrm_socket.py new file mode 100644 index 000000000..7fb936629 --- /dev/null +++ b/cmapi/mcs_node_control/test/test_dbrm_socket.py @@ -0,0 +1,29 @@ +import io +import logging +import unittest + +from mcs_node_control.models.dbrm_socket import MAGIC_BYTES, DBRMSocketHandler + + +logging.basicConfig(level='DEBUG') + + +class TestDBRMSocketHandler(unittest.TestCase): + + def test_myreceive_to_magic(self): + response_data = b'\x01\x00\x00\x00\x00' + valid_magic = b'%s%s' % (MAGIC_BYTES, response_data) + first_unknow = b'A%s%s' % (MAGIC_BYTES, response_data) + partial_first_magic = b'%s%s%s' % ( + MAGIC_BYTES[:3], MAGIC_BYTES, response_data + ) + sock_responses = [valid_magic, first_unknow, partial_first_magic] + for sock_response in sock_responses: + with self.subTest(sock_response=sock_response): + data_stream = io.BytesIO(sock_response) + data_stream.recv = data_stream.read + dbrm_socket = DBRMSocketHandler() + # pylint: disable=protected-access + dbrm_socket._socket = data_stream + dbrm_socket._receive_magic() + self.assertEqual(data_stream.read(), response_data) diff --git a/cmapi/mcs_node_control/test/test_misc.py b/cmapi/mcs_node_control/test/test_misc.py new file mode 100644 index 000000000..619d8ee40 --- /dev/null +++ b/cmapi/mcs_node_control/test/test_misc.py @@ -0,0 +1,13 @@ +import unittest + + +class MiscTest(unittest.TestCase): + def test_read_module_id(self): + pass + + def test_set_module_id(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/cmapi/mcs_node_control/test/test_node_config.py b/cmapi/mcs_node_control/test/test_node_config.py new file mode 100644 index 000000000..386061fac --- /dev/null +++ b/cmapi/mcs_node_control/test/test_node_config.py @@ -0,0 +1,288 @@ +import logging +import os +import subprocess +import unittest +import xml.etree.ElementTree as ET +from pathlib import Path +from shutil import copyfile +from tempfile import TemporaryDirectory +from unittest import TestCase, mock + +from lxml import etree + +from cmapi_server.constants import CMAPI_DEFAULT_CONF_PATH +from mcs_node_control.models.dbrm import ( + DBRM, set_cluster_mode +) +from mcs_node_control.models.node_config import NodeConfig +from mcs_node_control.models.misc import read_module_id +from mcs_node_control.models.node_status import NodeStatus +from mcs_node_control.test.settings import CONFIG_PATH_NEW, CONFIG_PATH_OLD + + +MCS_NODE_MODELS = 'mcs_node_control.models' 
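The resync rule exercised by `test_myreceive_to_magic` above can be stated as a standalone function: accumulate bytes one at a time and, whenever the accumulator stops being a prefix of the magic, keep only the newest byte. This mirrors the logic of `_receive_magic` without touching sockets.

```python
MAGIC = 0x14fbc137.to_bytes(4, 'little')

def scan_to_magic(stream: bytes) -> int:
    """Return the offset just past the first magic found in `stream`."""
    acc, i = b'', 0
    while acc != MAGIC:
        byte = stream[i:i + 1]
        if byte == b'':
            raise ValueError('magic not found')
        acc += byte
        if not MAGIC.startswith(acc):
            acc = byte  # keep only the newest byte, like _receive_magic
        i += 1
    return i

assert scan_to_magic(b'A' + MAGIC + b'\x01') == 5            # junk prefix
assert scan_to_magic(MAGIC[:3] + MAGIC + b'\x01') == 7       # partial false start
```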
+NODE_CONFIG_MODULE = f'{MCS_NODE_MODELS}.node_config' + + +logging.basicConfig(level='DEBUG') + + +# These tests needs working DBRM worker. +class NodeConfigTest(TestCase): + + @mock.patch(f'{NODE_CONFIG_MODULE}.mkdir') + @mock.patch(f'{NODE_CONFIG_MODULE}.chown') + @mock.patch(f'{NODE_CONFIG_MODULE}.read_module_id', return_value=1) + @mock.patch( + f'{NODE_CONFIG_MODULE}.NodeConfig.in_active_nodes', + return_value=False + ) + def test_apply_config(self, *_args): + """Test apply configuration file.""" + with TemporaryDirectory() as tmp_dirname: + config_filepath = os.path.join(tmp_dirname, 'Columnstore.xml') + + copyfile(CONFIG_PATH_OLD, config_filepath) + # change config + parser = etree.XMLParser(load_dtd=True) + # new_tree = etree.parse('/etc/columnstore/Columnstore.xml', parser=parser) + new_tree = etree.parse(CONFIG_PATH_NEW, parser=parser) + + node_config = NodeConfig() + xml_string = node_config.to_string(new_tree) + + node_config.apply_config(config_filepath, xml_string) + + # compare configurations + config_file = Path(config_filepath) + xml_string_written = config_file.read_text() + self.assertEqual(xml_string_written, xml_string) + # copy must exists + config_file_copy = Path(f"{config_filepath}.cmapi.save") + self.assertTrue(config_file_copy.exists()) + + @mock.patch(f'{NODE_CONFIG_MODULE}.mkdir') + @mock.patch(f'{NODE_CONFIG_MODULE}.chown') + @mock.patch(f'{NODE_CONFIG_MODULE}.read_module_id', return_value=1) + @mock.patch( + f'{NODE_CONFIG_MODULE}.NodeConfig.in_active_nodes', + return_value=False + ) + def test_rollback_config(self, *_args): + """"Test rollback applied configuration file.""" + with TemporaryDirectory() as tmp_dirname: + config_filepath = os.path.join(tmp_dirname, 'Columnstore.xml') + copyfile(CONFIG_PATH_OLD, config_filepath) + + old_config_file = Path(CONFIG_PATH_OLD) + old_xml_string = old_config_file.read_text() + new_config_file = Path(CONFIG_PATH_NEW) + new_xml_string = new_config_file.read_text() + + node_config = NodeConfig() + node_config.apply_config(config_filepath, new_xml_string) + node_config.rollback_config(config_filepath) + + config_file = Path(config_filepath) + xml_string_restored = config_file.read_text() + self.assertEqual(xml_string_restored, old_xml_string) + + def test_get_current_config(self): + """Test get current config from file.""" + config_file = Path(CONFIG_PATH_OLD) + node_config = NodeConfig() + self.assertEqual( + node_config.get_current_config(CONFIG_PATH_OLD), + config_file.read_text() + ) + + def test_set_cluster_mode(self): + """Test set cluster mode. + + TODO: + - move from here. 
There are no set_cluster_mode in NodeConfig + - split to unit and integrational tests + - make unittests for raising exception + """ + + for mode in ['readonly', 'readwrite']: + with self.subTest(mode=mode): + fake_mode = mode + set_cluster_mode(mode) + with DBRM() as dbrm: + if dbrm.get_dbrm_status() != 'master': + fake_mode = 'readonly' + self.assertEqual(dbrm.get_cluster_mode(), fake_mode) + self.assertEqual(dbrm._get_cluster_mode(), mode) + + def test_get_dbrm_conn_info(self): + node_config = NodeConfig() + root = node_config.get_current_config_root(CONFIG_PATH_OLD) + master_conn_info = node_config.get_dbrm_conn_info(root) + + tree = ET.parse(CONFIG_PATH_OLD) + master_ip = tree.find('./DBRM_Controller/IPAddr').text + master_port = tree.find('./DBRM_Controller/Port').text + + self.assertEqual(master_conn_info['IPAddr'], master_ip) + self.assertEqual(master_conn_info['Port'], master_port) + + def test_is_primary_node(self): + try: + current_master = None + node_config = NodeConfig() + root = node_config.get_current_config_root() + current_master = node_config.get_dbrm_conn_info(root)['IPAddr'] + list_ips = "ip -4 -o addr | awk '!/^[0-9]*: ?lo|link\/ether/ {print $4}'" + result = subprocess.run(list_ips, + shell=True, + stdout=subprocess.PIPE) + local_addresses = result.stdout.decode('ASCII').split('\n') + local_addresses = [addr.split('/')[0] for addr in local_addresses if len(addr)] + os.system(f"mcsSetConfig DBRM_Controller IPAddr {local_addresses[0]}") + self.assertTrue(node_config.is_primary_node()) + os.system(f"mcsSetConfig DBRM_Controller IPAddr 8.8.8.8") + self.assertFalse(node_config.is_primary_node()) + os.system(f"mcsSetConfig DBRM_Controller IPAddr {current_master}") + except AssertionError as e: + if current_master is not None: + os.system(f"mcsSetConfig DBRM_Controller IPAddr \ +{current_master}") + raise e + + def test_get_network_interfaces(self): + node_config = NodeConfig() + addresses = list(node_config.get_network_addresses()) + exemplar_addresses = [] + list_ips = "ip -4 -o addr | awk '!/^[0-9]*: ?lo|link\/ether/ {print $4}'" + result = subprocess.run(list_ips, + shell=True, + stdout=subprocess.PIPE) + exemplar_addresses += result.stdout.decode('ASCII').split('\n') + list_ips = "ip -6 -o addr | awk '!/^[0-9]*: ?lo|link\/ether/ {print $4}'" + result = subprocess.run(list_ips, + shell=True, + stdout=subprocess.PIPE) + exemplar_addresses += result.stdout.decode('ASCII').split('\n') + golden_addresses = [addr.split('/')[0] for addr in exemplar_addresses if len(addr) > 0] + for addr in golden_addresses: + self.assertTrue(addr in addresses) + + def test_is_single_node(self): + try: + current_master = None + node_config = NodeConfig() + root = node_config.get_current_config_root() + current_master = node_config.get_dbrm_conn_info(root)['IPAddr'] + os.system(f"mcsSetConfig DBRM_Controller IPAddr 127.0.0.1") + self.assertTrue(node_config.is_single_node()) + os.system(f"mcsSetConfig DBRM_Controller IPAddr 8.8.8.8") + self.assertFalse(node_config.is_single_node()) + os.system(f"mcsSetConfig DBRM_Controller IPAddr {current_master}") + except AssertionError as e: + if current_master is not None: + os.system(f"mcsSetConfig DBRM_Controller IPAddr \ +{current_master}") + raise e + + @mock.patch(f'{NODE_CONFIG_MODULE}.read_module_id', return_value=1) + def test_get_module_net_address(self, *args): + with TemporaryDirectory() as tmp_dirname: + config_filepath = os.path.join(tmp_dirname, 'Columnstore.xml') + copyfile(CONFIG_PATH_OLD, config_filepath) + + module_address = None + 
node_config = NodeConfig() + current_module_id = read_module_id() + module_address_sh = ( + f'mcsGetConfig -c {config_filepath} ' + f'SystemModuleConfig ModuleIPAddr{current_module_id}-1-3' + ) + result = subprocess.run( + module_address_sh, shell=True, stdout=subprocess.PIPE + ) + module_address = result.stdout.decode('ASCII').split('\n')[0] + dummy_address = '8.8.8.8' + os.system( + f'mcsSetConfig -c {config_filepath} ' + f'SystemModuleConfig ModuleIPAddr{current_module_id}-1-3 ' + f'{dummy_address}' + ) + root = node_config.get_current_config_root(config_filepath) + self.assertEqual( + dummy_address, node_config.get_module_net_address(root) + ) + self.assertNotEqual( + module_address, node_config.get_module_net_address(root) + ) + os.system( + f'mcsSetConfig -c {config_filepath} SystemModuleConfig ' + f'ModuleIPAddr{current_module_id}-1-3 {module_address}' + ) + root = node_config.get_current_config_root(config_filepath) + self.assertEqual( + module_address, node_config.get_module_net_address(root) + ) + + def test_get_new_module_id(self): + try: + current_module_id = None + current_module_address = None + node_config = NodeConfig() + current_module_id = read_module_id() + root = node_config.get_current_config_root() + current_module_address = node_config.get_module_net_address(root) + os.system(f"mcsSetConfig SystemModuleConfig \ + ModuleIPAddr{current_module_id}-1-3 8.8.8.8") + os.system(f"mcsSetConfig SystemModuleConfig \ + ModuleIPAddr{current_module_id+42}-1-3 {current_module_address}") + root = node_config.get_current_config_root() + self.assertEqual(current_module_id+42, + node_config.get_new_module_id(root)) + self.assertNotEqual(current_module_id, + node_config.get_new_module_id(root)) + os.system(f"mcsSetConfig SystemModuleConfig \ + ModuleIPAddr{current_module_id}-1-3 {current_module_address}") + os.system(f"mcsSetConfig -x SystemModuleConfig \ + ModuleIPAddr{current_module_id+42}-1-3 {current_module_address}") + root = node_config.get_current_config_root() + self.assertEqual(current_module_id, + node_config.get_new_module_id(root)) + except AssertionError as e: + if current_module_id is not None and current_module_address is not None: + os.system(f"mcsSetConfig SystemModuleConfig \ + ModuleIPAddr{current_module_id}-1-3 {current_module_address}") + os.system(f"mcsSetConfig -x SystemModuleConfig \ + ModuleIPAddr{current_module_id+42}-1-3 {current_module_address}") + + def test_dbroots_to_create(self): + try: + node_config = NodeConfig() + current_module_id = read_module_id() + dummy_dbroots = [42, 43] + dbroot_seq_id = 2 + for d in dummy_dbroots: + os.system(f"mcsSetConfig SystemModuleConfig \ + ModuleDBRootID{current_module_id}-{dbroot_seq_id}-3 {d}") + dbroot_seq_id += 1 + root = node_config.get_current_config_root() + dbroots_to_create = list(node_config.dbroots_to_create(root=root, module_id=current_module_id)) + for d in dbroots_to_create: + self.assertTrue(d in dummy_dbroots) + except AssertionError as e: + dbroot_seq_id = 2 + for d in dummy_dbroots: + os.system(f"mcsSetConfig -x SystemModuleConfig \ + ModuleDBRootID{current_module_id}-{dbroot_seq_id}-3 {d}") + dbroot_seq_id += 1 + raise e + + dbroot_seq_id = 2 + for d in dummy_dbroots: + os.system(f"mcsSetConfig -x SystemModuleConfig \ +ModuleDBRootID{current_module_id}-{dbroot_seq_id}-3 {d}") + dbroot_seq_id += 1 + + +if __name__ == '__main__': + unittest.main() diff --git a/cmapi/mcs_node_control/test/test_node_status.py b/cmapi/mcs_node_control/test/test_node_status.py new file mode 100644 index 000000000..236e3dd0e 
--- /dev/null +++ b/cmapi/mcs_node_control/test/test_node_status.py @@ -0,0 +1,50 @@ +import logging +import os +import unittest +from pathlib import Path +from shutil import rmtree + +from cmapi_server.constants import MCS_MODULE_FILE_PATH +from mcs_node_control.models.node_status import NodeStatus + + +logging.basicConfig(level='DEBUG') + + +class NodeStatusTest(unittest.TestCase): + def test_dbrm_cluster_mode(self): + node_status = NodeStatus() + # use subprocess.run to capture stdout + os.system('/usr/bin/dbrmctl readwrite') + self.assertEqual(node_status.get_cluster_mode(), 'readwrite') + os.system('/usr/bin/dbrmctl readonly') + self.assertEqual(node_status.get_cluster_mode(), 'readonly') + # kill controllernode and test it + + def test_dbrm_status(self): + node_status = NodeStatus() + self.assertEqual(node_status.get_dbrm_status(), 'master') + + def test_dbroots(self): + try: + node_status = NodeStatus() + dbroot_ids = [1, 2, 3] + path = '/tmp/dbroots/' + for e in dbroot_ids: + p = Path(path + 'data' + str(e)) + p.mkdir(parents = True, exist_ok = True) + for e in node_status.get_dbroots(path=path): + self.assertEqual(e in dbroot_ids, True) + except AssertionError as e: + rmtree(path) + raise e + + def test_module_id(self): + node_status = NodeStatus() + module_file = Path(MCS_MODULE_FILE_PATH) + examplar_id = int(module_file.read_text()[2:]) + self.assertEqual(examplar_id, node_status.get_module_id()) + + +if __name__ == '__main__': + unittest.main() diff --git a/cmapi/postinst.template b/cmapi/postinst.template new file mode 100755 index 000000000..c1a823fb3 --- /dev/null +++ b/cmapi/postinst.template @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# only for postinstall in CentOS +if [ -f ${CMAPI_CONF_FILEPATH}.rpmsave ]; then + echo "warning: found previously saved configuration file ${CMAPI_CONF_FILEPATH}.rpmsave" + mv ${CMAPI_CONF_FILEPATH} ${CMAPI_CONF_FILEPATH}.rpmnew + echo "warning: newly installed configuration file ${CMAPI_CONF_FILEPATH} saved as ${CMAPI_CONF_FILEPATH}.rpmnew" + mv ${CMAPI_CONF_FILEPATH}.rpmsave ${CMAPI_CONF_FILEPATH} + echo "warning: previously saved configuration file ${CMAPI_CONF_FILEPATH}.rpmsave applied as current config file ${CMAPI_CONF_FILEPATH}" +fi + +systemctl enable ${SYSTEMD_UNIT_NAME} + +systemctl start ${SYSTEMD_UNIT_NAME} + +systemctl mask ${SYSTEMD_ENGINE_UNIT_NAME} diff --git a/cmapi/prerm.template b/cmapi/prerm.template new file mode 100755 index 000000000..54204cb45 --- /dev/null +++ b/cmapi/prerm.template @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +systemctl stop ${SYSTEMD_UNIT_NAME} + +find ${CMAPI_DIR} -type d -name __pycache__ -print0 | xargs --null --no-run-if-empty rm -rf + +systemctl disable ${SYSTEMD_UNIT_NAME} + +systemctl unmask ${SYSTEMD_ENGINE_UNIT_NAME} diff --git a/cmapi/requirements.txt b/cmapi/requirements.txt new file mode 100644 index 000000000..db0e2e6df --- /dev/null +++ b/cmapi/requirements.txt @@ -0,0 +1,73 @@ +awscli==1.25.56 +CherryPy==18.6.1 +cryptography==36.0.1 +furl==2.1.3 +gsutil==5.12 +lxml==4.7.1 +psutil==5.9.1 +pyotp==2.6.0 +requests==2.27.1 +typer==0.4.1 + +# indirect dependencies +aiohttp==3.8.1 +aiosignal==1.2.0 +argcomplete==2.0.0 +async-timeout==4.0.2 +asynctest==0.13.0 +attrs==22.1.0 +boto==2.49.0 +boto3==1.24.55 +botocore==1.27.55 +cachetools==5.2.0 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.12 +cheroot==8.6.0 +click==8.1.3 +colorama==0.4.4 +crcmod==1.7 +docutils==0.16 +fasteners==0.17.3 +frozenlist==1.3.1 +gcs-oauth2-boto-plugin==3.0 +google-apitools==0.5.32 +google-auth==2.10.0 
+awscli==1.25.56
+CherryPy==18.6.1
+cryptography==36.0.1
+furl==2.1.3
+gsutil==5.12
+lxml==4.7.1
+psutil==5.9.1
+pyotp==2.6.0
+requests==2.27.1
+typer==0.4.1
+
+# indirect dependencies
+aiohttp==3.8.1
+aiosignal==1.2.0
+argcomplete==2.0.0
+async-timeout==4.0.2
+asynctest==0.13.0
+attrs==22.1.0
+boto==2.49.0
+boto3==1.24.55
+botocore==1.27.55
+cachetools==5.2.0
+certifi==2021.10.8
+cffi==1.15.0
+charset-normalizer==2.0.12
+cheroot==8.6.0
+click==8.1.3
+colorama==0.4.4
+crcmod==1.7
+docutils==0.16
+fasteners==0.17.3
+frozenlist==1.3.1
+gcs-oauth2-boto-plugin==3.0
+google-apitools==0.5.32
+google-auth==2.10.0
+google-reauth==0.1.1
+httplib2==0.20.4
+idna==3.3
+importlib-resources==5.4.0
+importlib-metadata==4.12.0
+jaraco.classes==3.2.1
+jaraco.collections==3.5.1
+jaraco.context==4.1.1
+jaraco.functools==3.5.0
+jaraco.text==3.7.0
+jmespath==1.0.1
+monotonic==1.6
+more-itertools==8.12.0
+multidict==6.0.2
+oauth2client==4.1.3
+orderedmultidict==1.0.1
+portend==3.1.0
+pyasn1-modules==0.2.8
+pyasn1==0.4.8
+pycparser==2.21
+pyOpenSSL==22.0.0
+pyparsing==3.0.9
+python-dateutil==2.8.2
+pytz==2021.3
+pyu2f==0.1.5
+PyYAML==5.4.1
+repoze.lru==0.7
+retry-decorator==1.1.1
+Routes==2.5.1
+rsa==4.7.2
+s3transfer==0.6.0
+six==1.16.0
+tempora==5.0.1
+typing-extensions==4.3.0
+urllib3==1.26.8
+yarl==1.8.1
+zc.lockfile==2.0
+zipp==3.7.0
diff --git a/cmapi/run_tests.py b/cmapi/run_tests.py
new file mode 100644
index 000000000..4f4b326fc
--- /dev/null
+++ b/cmapi/run_tests.py
@@ -0,0 +1,41 @@
+import datetime
+import logging
+import sys
+import unittest
+
+from cmapi_server.logging_management import add_logging_level
+
+
+class DatedTextTestResult(unittest.TextTestResult):
+    def startTest(self, test: unittest.case.TestCase):
+        # prefix each test with a timestamp so slow tests are easy to spot
+        self.stream.write('\n')
+        self.stream.write(
+            datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S]: ")
+        )
+        return super().startTest(test)
+
+
+def run_tests_from_package(p_name: str):
+    logging.info(f'Starting tests from package {p_name}')
+    loader = unittest.TestLoader()
+    testsuite = loader.discover(
+        pattern='test_*.py', start_dir=p_name, top_level_dir='./'
+    )
+    runner = unittest.runner.TextTestRunner(
+        verbosity=3, failfast=True, resultclass=DatedTextTestResult
+    )
+    result = runner.run(testsuite)
+    if not result.wasSuccessful():
+        # exit non-zero only on failure; an unconditional sys.exit here
+        # would stop the run after the first package even when it passed
+        sys.exit(1)
+    logging.info(f'Finished tests from package {p_name}')
+
+
+if __name__ == "__main__":
+    add_logging_level('TRACE', 5)
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='%(asctime)s [%(levelname)s] (%(name)s) %(message)s'
+    )
+    run_tests_from_package('cmapi_server')
+    run_tests_from_package('mcs_node_control')
diff --git a/cmapi/service.sh b/cmapi/service.sh
new file mode 100755
index 000000000..a6dc93f24
--- /dev/null
+++ b/cmapi/service.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
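+
+# Installs and manages cmapi as a systemd service for manual (non-package)
+# installs: renders service.template into a unit file and wraps systemctl.
+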
+CPACK_PACKAGE_DESCRIPTION_SUMMARY="MariaDB ColumnStore Cluster Manager API"
+SVC_NAME="mariadb-columnstore-cmapi"
+
+SVC_CMD=$1
+arg_2=${2}
+
+UNIT_PATH=/usr/lib/systemd/system/${SVC_NAME}.service
+TEMPLATE_PATH=./service.template
+TEMP_PATH=./service.temp
+SYSTEMD_ENV_FILE=/etc/columnstore/systemd.env
+
+CMAPI_DIR=$(pwd)
+CMAPI_USER=root
+CONFIG_FOLDER=/etc/columnstore
+CONFIG_FILENAME=cmapi_server.conf
+
+user_id=$(id -u)
+
+# systemctl must run as root
+if [ $user_id -ne 0 ]; then
+    echo "Must run as sudo"
+    exit 1
+fi
+
+function failed()
+{
+    local error=${1:-Undefined error}
+    echo "Failed: $error" >&2
+    exit 1
+}
+
+if [ ! -f "${TEMPLATE_PATH}" ]; then
+    failed "Must run from package folder or install is corrupt"
+fi
+
+function install()
+{
+    echo "Creating service in ${UNIT_PATH}"
+    if [ -f "${UNIT_PATH}" ]; then
+        failed "error: exists ${UNIT_PATH}"
+    fi
+
+    if [ -f "${TEMP_PATH}" ]; then
+        rm "${TEMP_PATH}" || failed "failed to delete ${TEMP_PATH}"
+    fi
+
+    # can optionally use username supplied
+    #run_as_user=${arg_2:-$SUDO_USER}
+    #echo "Run as user: ${run_as_user}"
+
+    #run_as_uid=$(id -u ${run_as_user}) || failed "User does not exist"
+    #echo "Run as uid: ${run_as_uid}"
+
+    #run_as_gid=$(id -g ${run_as_user}) || failed "Group not available"
+    #echo "gid: ${run_as_gid}"
+
+    sed "s/\${CPACK_PACKAGE_DESCRIPTION_SUMMARY}/${CPACK_PACKAGE_DESCRIPTION_SUMMARY}/g; s/\${CMAPI_USER}/${CMAPI_USER}/g; s/\${CMAPI_DIR}/$(echo ${CMAPI_DIR} | sed -e 's/[\/&]/\\&/g')/g;" "${TEMPLATE_PATH}" > "${TEMP_PATH}" || failed "failed to create replacement temp file"
+    mv "${TEMP_PATH}" "${UNIT_PATH}" || failed "failed to copy unit file"
+
+    if [ ! -d "${CONFIG_FOLDER}" ]; then
+        mkdir "${CONFIG_FOLDER}" || failed "failed to create configuration folder"
+    fi
+
+    if [ ! -f "${CONFIG_FOLDER}/${CONFIG_FILENAME}" ]; then
+        cp cmapi_server/cmapi_server.conf.default "${CONFIG_FOLDER}/${CONFIG_FILENAME}" || failed "failed to copy config file"
+    fi
+
+    # Unit file should not be executable or world-writable
+    chmod 664 "${UNIT_PATH}" || failed "failed to set permissions on ${UNIT_PATH}"
+
+    # Since we started with sudo, files will be owned by root. Change this to specific user
+    #chown -R ${run_as_uid}:${run_as_gid} $CMAPI_DIR || failed "failed to set owner for $CMAPI_DIR"
+
+    systemctl enable ${SVC_NAME} || failed "failed to enable ${SVC_NAME}"
+
+    # chown ${run_as_uid}:${run_as_gid} ${CONFIG_FOLDER}/${CONFIG_FILENAME} || failed "failed to set permission for ${CONFIG_FOLDER}/${CONFIG_FILENAME}"
+    echo PYTHONPATH=${CMAPI_DIR}/deps > "${SYSTEMD_ENV_FILE}"
+
+    systemctl daemon-reload || failed "failed to reload daemons"
+}
+
+function start()
+{
+    systemctl start ${SVC_NAME} || failed "failed to start ${SVC_NAME}"
+    status
+}
+
+function stop()
+{
+    systemctl stop ${SVC_NAME} || failed "failed to stop ${SVC_NAME}"
+    status
+}
+
+function uninstall()
+{
+    stop
+    systemctl disable ${SVC_NAME} || failed "failed to disable ${SVC_NAME}"
+    rm "${UNIT_PATH}" || failed "failed to delete ${UNIT_PATH}"
+    rm "${SYSTEMD_ENV_FILE}" || failed "failed to delete ${SYSTEMD_ENV_FILE}"
+    systemctl daemon-reload || failed "failed to reload daemons"
+}
+
+function status()
+{
+    if [ -f "${UNIT_PATH}" ]; then
+        echo
+        echo "${UNIT_PATH}"
+    else
+        echo
+        echo "not installed"
+        echo
+        return
+    fi
+
+    systemctl --no-pager status ${SVC_NAME}
+}
+
+function usage()
+{
+    echo
+    echo Usage:
+    echo "./service.sh [install, start, stop, status, uninstall]"
+    echo "Commands:"
+    #echo "  install [user]: Install as Root or specified user"
+    echo "  install: Install"
+    echo "  start: Manually start"
+    echo "  stop: Manually stop"
+    echo "  status: Display installation status"
+    echo "  uninstall: Uninstall"
+    echo
+}
+
+case $SVC_CMD in
+    "install") install;;
+    "status") status;;
+    "uninstall") uninstall;;
+    "start") start;;
+    "stop") stop;;
+    *) usage;;
+esac
+
+exit 0
diff --git a/cmapi/service.template b/cmapi/service.template
new file mode 100644
index 000000000..45fa9b4a1
--- /dev/null
+++ b/cmapi/service.template
@@ -0,0 +1,12 @@
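+# Systemd unit template for cmapi: the ${...} placeholders are filled in at
+# install time (see the sed call in service.sh).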
+[Unit]
+Description=${CPACK_PACKAGE_DESCRIPTION_SUMMARY}
+
+[Service]
+Environment=PYTHONPATH=${CMAPI_DIR}/deps
+ExecStart=${CMAPI_DIR}/python/bin/python3 -m cmapi_server
+ExecStartPost=/bin/sh -c ${CMAPI_DIR}/check_ready.sh
+User=${CMAPI_USER}
+WorkingDirectory=${CMAPI_DIR}
+
+[Install]
+WantedBy=multi-user.target
diff --git a/cmapi/systemd.env.template b/cmapi/systemd.env.template
new file mode 100644
index 000000000..23c839458
--- /dev/null
+++ b/cmapi/systemd.env.template
@@ -0,0 +1 @@
+PYTHONPATH=${CMAPI_DIR}/deps
diff --git a/core_dumps/core_dump_format.sh b/core_dumps/core_dump_format.sh
index e8126852a..4048c5d81 100755
--- a/core_dumps/core_dump_format.sh
+++ b/core_dumps/core_dump_format.sh
@@ -29,3 +29,5 @@ echo "
 Step: ${STEP_NAME}
 Binary name: ${BINARY}
" >> "${FILENAME save_ansi_to_html "Backtrace" save_ansi_to_html "Arguments" save_ansi_to_html "Locals" + +gzip -5 "${COREDUMP}" diff --git a/core_dumps/logs.sh b/core_dumps/logs.sh index cea6fa92d..e5a56a567 100755 --- a/core_dumps/logs.sh +++ b/core_dumps/logs.sh @@ -14,9 +14,11 @@ dump_log () journalctl -u "$name".service > "$DIR_NAME"/"${name}_${STEP_NAME}".log } +dump_log "mariadb" dump_log "mcs-ddlproc" dump_log "mcs-dmlproc" dump_log "mcs-loadbrm" dump_log "mcs-primproc" dump_log "mcs-workernode@1" dump_log "mcs-writeengineserver" +dump_log "mcs-controllernode" diff --git a/setup-repo.sh b/setup-repo.sh new file mode 100755 index 000000000..21397a627 --- /dev/null +++ b/setup-repo.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env sh + +set -e + +. /etc/os-release + +ARCH=$(expr "$(uname -m)" : "arm64\|aarch64" > /dev/null && echo "arm64" || echo "amd64") + +case "$ID" in +ubuntu|debian) + apt update -y + apt install -y ca-certificates + echo "deb [trusted=yes] ${PACKAGES_URL}/${ARCH}/ ${OS}/" > /etc/apt/sources.list.d/repo.list + cat /etc/apt/sources.list.d/repo.list + apt update -y + ;; +rocky|centos) + cat << EOF > /etc/yum.repos.d/repo.repo +[repo] +name=repo +baseurl=${PACKAGES_URL}/${ARCH}/${OS} +enabled=1 +gpgcheck=0 +module_hotfixes=1 +EOF + ;; +*) + echo "$ID is unknown!" + exit 1 + ;; +esac