From 2c0367ea2bb8ef3f5a7a0935741e1051ed8eea23 Mon Sep 17 00:00:00 2001 From: mariadb-AlanMologorsky Date: Tue, 22 Apr 2025 20:36:07 +0300 Subject: [PATCH] feat(mcs): MCOL-5300 review/finetune log collection tools. * chore(mcs, scripts): extra/columnstore_review.sh with scripts/columnstore_review.sh with 1.4.13 version * feat(mcs): add review command to the Tools section. It's the wrapper for columnstore_review.sh * feat(mcs): add review command implementation to tools.py file + constansts.py * chore(mcs): add separator argument to cook_sh_arg function * docs(mcs): updated README.md and mcs.1 man file --- cmapi/CMakeLists.txt | 2 +- cmapi/mcs_cluster_tool/README.md | 115 +++-- cmapi/mcs_cluster_tool/__main__.py | 6 + cmapi/mcs_cluster_tool/constants.py | 3 + cmapi/mcs_cluster_tool/helpers.py | 8 +- cmapi/mcs_cluster_tool/mcs.1 | 145 ++++-- cmapi/mcs_cluster_tool/tools_commands.py | 232 ++++++++- .../scripts}/columnstore_review.sh | 445 ++++++++++++++++-- 8 files changed, 851 insertions(+), 105 deletions(-) rename {extra => cmapi/scripts}/columnstore_review.sh (85%) diff --git a/cmapi/CMakeLists.txt b/cmapi/CMakeLists.txt index 6b06e8746..aa89742e4 100644 --- a/cmapi/CMakeLists.txt +++ b/cmapi/CMakeLists.txt @@ -86,7 +86,7 @@ INSTALL(FILES mcs_aws INSTALL(FILES mcs_gsutil PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ DESTINATION ${BIN_DIR}) -INSTALL(FILES scripts/mcs_backup_manager.sh scripts/cs_package_manager.sh +INSTALL(FILES scripts/mcs_backup_manager.sh scripts/cs_package_manager.sh scripts/columnstore_review.sh PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ DESTINATION ${BIN_DIR}) INSTALL(FILES mcs_cluster_tool/mcs.1 DESTINATION ${MAN_DIR}) diff --git a/cmapi/mcs_cluster_tool/README.md b/cmapi/mcs_cluster_tool/README.md index 73446aa9d..53a9ac8db 100644 --- a/cmapi/mcs_cluster_tool/README.md +++ b/cmapi/mcs_cluster_tool/README.md @@ -10,6 +10,7 @@ $ mcs [OPTIONS] COMMAND [ARGS]... 
**Options**: +* `-v, --verbose`: Enable verbose logging to console * `--help`: Show this message and exit. **Commands**: @@ -19,7 +20,9 @@ $ mcs [OPTIONS] COMMAND [ARGS]... * `restore`: Restore Columnstore (and/or MariaDB) data. * `dbrm_restore`: Restore Columnstore DBRM data. * `cskeys`: Generates a random AES encryption key and init vector and writes them to disk. -* `cspasswd`: Encrypt a Columnstore plaintext password using the encryption key in the key file. +* `cspasswd`: Encrypt a Columnstore plaintext password. +* `bootstrap-single-node`: Bootstrap a single node (localhost)... +* `review`: Provides useful functions to review and troubleshoot the MCS cluster. * `help-all`: Show help for all commands in man page style. * `status`: Get status information. * `stop`: Stop the Columnstore cluster. @@ -45,8 +48,8 @@ $ mcs backup [OPTIONS] * `-bl, --backup-location TEXT`: What directory to store the backups on this machine or the target machine. Consider write permissions of the scp user and the user running this script. Mariadb-backup will use this location as a tmp dir for S3 and remote backups temporarily. -Example: /mnt/backups/ [default: /tmp/backups/] -* `-bd, --backup-destination TEXT`: Are the backups going to be stored on the same machine this script is running on or another server - if Remote you need to setup scp=Options: "Local" or "Remote" [default: Local] +Example: /mnt/backups/ +* `-bd, --backup-destination TEXT`: Are the backups going to be stored on the same machine this script is running on or another server - if Remote you need to setup scp=Options: "Local" or "Remote" * `-scp TEXT`: Used only if --backup-destination="Remote". The user/credentials that will be used to scp the backup files Example: "centos@10.14.51.62" @@ -56,25 +59,25 @@ Example: "s3://my-cs-backups" * `-url, --endpoint-url TEXT`: Used by on premise S3 vendors. 
Example: "http://127.0.0.1:8000" * `-s, --storage TEXT`: What storage topogoly is being used by Columnstore - found in /etc/columnstore/storagemanager.cnf. -Options: "LocalStorage" or "S3" [default: LocalStorage] +Options: "LocalStorage" or "S3" * `-i, --incremental TEXT`: Adds columnstore deltas to an existing full backup. Backup folder to apply increment could be a value or "auto_most_recent" - the incremental backup applies to last full backup. -* `-P, --parallel INTEGER`: Determines if columnstore data directories will have multiple rsync running at the same time for different subfolders to parallelize writes. Ignored if "-c/--compress" argument not set. [default: 4] -* `-ha, --highavilability / -no-ha, --no-highavilability`: Hint wether shared storage is attached @ below on all nodes to see all data +* `-P, --parallel INTEGER`: Enables parallel rsync for faster backups, setting the number of simultaneous rsync processes. With -c/--compress, sets the number of compression threads. +* `-ha, --highavilability`: Hint wether shared storage is attached @ below on all nodes to see all data HA LocalStorage ( /var/lib/columnstore/dataX/ ) -HA S3 ( /var/lib/columnstore/storagemanager/ ) [default: no-ha] +HA S3 ( /var/lib/columnstore/storagemanager/ ) * `-f, --config-file TEXT`: Path to backup configuration file to load variables from - relative or full path accepted. -* `-sbrm, --skip-save-brm / -no-sbrm, --no-skip-save-brm`: Skip saving brm prior to running a backup - ideal for dirty backups. [default: no-sbrm] -* `-spoll, --skip-polls / -no-spoll, --no-skip-polls`: Skip sql checks confirming no write/cpimports running. [default: no-spoll] -* `-slock, --skip-locks / -no-slock, --no-skip-locks`: Skip issuing write locks - ideal for dirty backups. [default: no-slock] -* `-smdb, --skip-mariadb-backup / -no-smdb, --no-skip-mariadb-backup`: Skip running a mariadb-backup for innodb data - ideal for incremental dirty backups. 
[default: no-smdb] -* `-sb, --skip-bucket-data / -no-sb, --no-skip-bucket-data`: Skip taking a copy of the columnstore data in the bucket. [default: no-sb] -* `-nb, --name-backup TEXT`: Define the name of the backup - default: $(date +%m-%d-%Y) [default: 03-20-2025] +* `-sbrm, --skip-save-brm`: Skip saving brm prior to running a backup - ideal for dirty backups. +* `-spoll, --skip-polls`: Skip sql checks confirming no write/cpimports running. +* `-slock, --skip-locks`: Skip issuing write locks - ideal for dirty backups. +* `-smdb, --skip-mariadb-backup`: Skip running a mariadb-backup for innodb data - ideal for incremental dirty backups. +* `-sb, --skip-bucket-data`: Skip taking a copy of the columnstore data in the bucket. +* `-nb, --name-backup TEXT`: Define the name of the backup - default: $(date +%m-%d-%Y) * `-c, --compress TEXT`: Compress backup in X format - Options: [ pigz ]. -* `-q, --quiet / -no-q, --no-quiet`: Silence verbose copy command outputs. [default: no-q] -* `-nv-ssl, --no-verify-ssl / -v-ssl, --verify-ssl`: Skips verifying ssl certs, useful for onpremise s3 storage. [default: v-ssl] -* `-pi, --poll-interval INTEGER`: Number of seconds between poll checks for active writes & cpimports. [default: 5] -* `-pmw, --poll-max-wait INTEGER`: Max number of minutes for polling checks for writes to wait before exiting as a failed backup attempt. [default: 60] -* `-r, --retention-days INTEGER`: Retain backups created within the last X days, default 0 == keep all backups. [default: 0] +* `-q, --quiet`: Silence verbose copy command outputs. +* `-nv-ssl, --no-verify-ssl`: Skips verifying ssl certs, useful for onpremise s3 storage. +* `-pi, --poll-interval INTEGER`: Number of seconds between poll checks for active writes & cpimports. +* `-pmw, --poll-max-wait INTEGER`: Max number of minutes for polling checks for writes to wait before exiting as a failed backup attempt. 
+* `-r, --retention-days INTEGER`: Retain backups created within the last X days, default 0 == keep all backups. * `-aro, --apply-retention-only`: Only apply retention policy to existing backups, does not run a backup. * `-li, --list`: List backups. * `--help`: Show this message and exit. @@ -96,8 +99,8 @@ $ mcs dbrm_backup [OPTIONS] * `-bl, --backup-location TEXT`: Path of where to save the dbrm backups on disk. [default: /tmp/dbrm_backups] * `-m, --mode TEXT`: "loop" or "once" ; Determines if this script runs in a forever loop sleeping -i minutes or just once. [default: once] * `-nb, --name-backup TEXT`: Define the prefix of the backup - default: dbrm_backup+date +%Y%m%d_%H%M%S [default: dbrm_backup] -* `-ssm, --skip-storage-manager / -no-ssm, --no-skip-storage-manager`: Skip backing up storagemanager directory. [default: no-ssm] -* `-q, --quiet / -no-q, --no-quiet`: Silence verbose copy command outputs. [default: no-q] +* `-ssm, --skip-storage-manager`: Skip backing up storagemanager directory. +* `-q, --quiet`: Silence verbose copy command outputs. * `-li, --list`: List backups. * `--help`: Show this message and exit. @@ -133,14 +136,14 @@ Options: "LocalStorage" or "S3" [default: LocalStorage] * `-nk, --new-key TEXT`: Defines the aws key to connect to the new_bucket. * `-ns, --new-secret TEXT`: Defines the aws secret of the aws key to connect to the new_bucket. * `-P, --parallel INTEGER`: Determines number of decompression and mdbstream threads. Ignored if "-c/--compress" argument not set. [default: 4] -* `-ha, --highavilability / -no-ha, --no-highavilability`: Flag for high available systems (meaning shared storage exists supporting the topology so that each node sees all data) [default: no-ha] -* `-cont, --continue / -no-cont, --no-continue`: This acknowledges data in your --new_bucket is ok to delete when restoring S3. When set to true skips the enforcement that new_bucket should be empty prior to starting a restore. 
[default: no-cont] +* `-ha, --highavilability`: Flag for high available systems (meaning shared storage exists supporting the topology so that each node sees all data) +* `-cont, --continue`: This acknowledges data in your --new_bucket is ok to delete when restoring S3. When set to true skips the enforcement that new_bucket should be empty prior to starting a restore. * `-f, --config-file TEXT`: Path to backup configuration file to load variables from - relative or full path accepted. -* `-smdb, --skip-mariadb-backup / -no-smdb, --no-skip-mariadb-backup`: Skip restoring mariadb server via mariadb-backup - ideal for only restoring columnstore. [default: no-smdb] -* `-sb, --skip-bucket-data / -no-sb, --no-skip-bucket-data`: Skip restoring columnstore data in the bucket - ideal if looking to only restore mariadb server. [default: no-sb] +* `-smdb, --skip-mariadb-backup`: Skip restoring mariadb server via mariadb-backup - ideal for only restoring columnstore. +* `-sb, --skip-bucket-data`: Skip restoring columnstore data in the bucket - ideal if looking to only restore mariadb server. * `-c, --compress TEXT`: Hint that the backup is compressed in X format. Options: [ pigz ]. -* `-q, --quiet / -no-q, --no-quiet`: Silence verbose copy command outputs. [default: no-q] -* `-nv-ssl, --no-verify-ssl / -v-ssl, --verify-ssl`: Skips verifying ssl certs, useful for onpremise s3 storage. [default: v-ssl] +* `-q, --quiet`: Silence verbose copy command outputs. +* `-nv-ssl, --no-verify-ssl`: Skips verifying ssl certs, useful for onpremise s3 storage. * `-li, --list`: List backups. * `--help`: Show this message and exit. @@ -159,8 +162,8 @@ $ mcs dbrm_restore [OPTIONS] * `-bl, --backup-location TEXT`: Path of where dbrm backups exist on disk. [default: /tmp/dbrm_backups] * `-l, --load TEXT`: Name of the directory to restore from -bl * `-ns, --no-start`: Do not attempt columnstore startup post dbrm_restore. 
-* `-sdbk, --skip-dbrm-backup / -no-sdbk, --no-skip-dbrm-backup`: Skip backing up dbrms before restoring. [default: sdbk] -* `-ssm, --skip-storage-manager / -no-ssm, --no-skip-storage-manager`: Skip backing up storagemanager directory. [default: ssm] +* `-sdbk, --skip-dbrm-backup`: Skip backing up dbrms before restoring. +* `-ssm, --skip-storage-manager`: Skip backing up storagemanager directory. * `-li, --list`: List backups. * `--help`: Show this message and exit. @@ -208,6 +211,62 @@ $ mcs cspasswd [OPTIONS] * `--decrypt`: Decrypt an encrypted password instead. * `--help`: Show this message and exit. +## `mcs bootstrap-single-node` + +Bootstrap a single node (localhost) Columnstore instance. + +**Usage**: + +```console +$ mcs bootstrap-single-node [OPTIONS] +``` + +**Options**: + +* `--api-key TEXT`: API key to set. +* `--help`: Show this message and exit. + +## `mcs review` + +This script performs various maintenance and diagnostic tasks for +MariaDB ColumnStore, including log archiving, extent map backups, +schema and table testing, directory and ownership checks, extent map +validation, S3 storage comparison, process management, table +synchronization, port availability checks, stack dumps, cleanup of +rollback fragments, and graceful process termination. + +If database is up, this script will connect as root@localhost via socket. + +**Usage**: + +```console +$ mcs review [OPTIONS] +``` + +**Options**: + +* `--version`: Only show the header with version information. +* `--logs`: Create a compressed archive of logs for MariaDB Support Ticket. +* `--path TEXT`: Define the path for where to save files/tarballs and outputs of this script. +* `--backupdbrm`: Takes a compressed backup of extent map files in dbrm directory. +* `--testschema`: Creates a test schema, tables, imports, queries, drops schema. +* `--testschemakeep`: Creates a test schema, tables, imports, queries, does not drop. 
+* `--ldlischema`: Using ldli, creates test schema, tables, imports, queries, drops schema. +* `--ldlischemakeep`: Using ldli, creates test schema, tables, imports, queries, does not drop. +* `--emptydirs`: Searches /var/lib/columnstore for empty directories. +* `--notmysqldirs`: Searches /var/lib/columnstore for directories not owned by mysql. +* `--emcheck`: Checks the extent map for orphaned and missing files. +* `--s3check`: Checks the extent map against S3 storage. +* `--pscs`: Adds the pscs command. pscs lists running columnstore processes. +* `--schemasync`: Fix out-of-sync columnstore tables (CAL0009). +* `--tmpdir`: Ensure owner of temporary dir after reboot (MCOL-4866 & MCOL-5242). +* `--checkports`: Checks if ports needed by Columnstore are opened. +* `--eustack`: Dumps the stack of Columnstore processes. +* `--clearrollback`: Clear any rollback fragments from dbrm files. +* `--killcolumnstore`: Stop columnstore processes gracefully, then kill remaining processes. +* `--color TEXT`: Print headers in color. Options: [none, red, blue, green, yellow, magenta, cyan]; prefix color with l for light. +* `--help`: Show this message and exit. + ## `mcs help-all` Show help for all commands in man page style. diff --git a/cmapi/mcs_cluster_tool/__main__.py b/cmapi/mcs_cluster_tool/__main__.py index 4c15284b6..a2260d0c6 100644 --- a/cmapi/mcs_cluster_tool/__main__.py +++ b/cmapi/mcs_cluster_tool/__main__.py @@ -50,6 +50,12 @@ app.command( app.command( 'bootstrap-single-node', rich_help_panel='Tools commands', )(tools_commands.bootstrap_single_node) +app.command( + 'review', rich_help_panel='Tools commands', + short_help=( + 'Provides useful functions to review and troubleshoot the MCS cluster.' 
+ ) +)(tools_commands.review) @app.command( diff --git a/cmapi/mcs_cluster_tool/constants.py b/cmapi/mcs_cluster_tool/constants.py index ec988175c..ef7e61b4b 100644 --- a/cmapi/mcs_cluster_tool/constants.py +++ b/cmapi/mcs_cluster_tool/constants.py @@ -7,3 +7,6 @@ MCS_CLI_ROOT_PATH = os.path.dirname(__file__) MCS_CLI_LOG_CONF_PATH = os.path.join(MCS_CLI_ROOT_PATH, 'mcs_cli_log.conf') MCS_BACKUP_MANAGER_SH = os.path.join(MCS_INSTALL_BIN, 'mcs_backup_manager.sh') +MCS_COLUMNSTORE_REVIEW_SH = os.path.join( + MCS_INSTALL_BIN, 'columnstore_review.sh' +) \ No newline at end of file diff --git a/cmapi/mcs_cluster_tool/helpers.py b/cmapi/mcs_cluster_tool/helpers.py index 442cdb19c..a39678238 100644 --- a/cmapi/mcs_cluster_tool/helpers.py +++ b/cmapi/mcs_cluster_tool/helpers.py @@ -2,13 +2,17 @@ from typing import Optional, Union -def cook_sh_arg(arg_name: str, value: Union[str, int, bool]) -> Optional[str]: +def cook_sh_arg( + arg_name: str, value: Union[str, int, bool], separator: str = ' ' +) -> Optional[str]: """Convert argument and and value from function locals to bash argument. :param arg_name: function argument name :type arg_name: str :param value: function argument value :type value: Union[str, int, bool] + :param separator: separator between argument and value + :type separator: str :return: bash argument string or None :rtype: Optional[str] """ @@ -31,4 +35,4 @@ def cook_sh_arg(arg_name: str, value: Union[str, int, bool]) -> Optional[str]: return None # if True value presented just pass only arg name without value value = '' - return f'-{arg_name} {value}' if value else f'-{arg_name}' + return f'-{arg_name}{separator}{value}' if value else f'-{arg_name}' diff --git a/cmapi/mcs_cluster_tool/mcs.1 b/cmapi/mcs_cluster_tool/mcs.1 index 724161f7c..172102335 100644 --- a/cmapi/mcs_cluster_tool/mcs.1 +++ b/cmapi/mcs_cluster_tool/mcs.1 @@ -13,6 +13,8 @@ $ mcs [OPTIONS] COMMAND [ARGS]... 
\fBOptions\fP: .RS .IP \(bu 2 +\fB\fC\-v, \-\-verbose\fR: Enable verbose logging to console +.IP \(bu 2 \fB\fC\-\-help\fR: Show this message and exit. .RE .PP @@ -29,7 +31,11 @@ $ mcs [OPTIONS] COMMAND [ARGS]... .IP \(bu 2 \fB\fCcskeys\fR: Generates a random AES encryption key and init vector and writes them to disk. .IP \(bu 2 -\fB\fCcspasswd\fR: Encrypt a Columnstore plaintext password using the encryption key in the key file. +\fB\fCcspasswd\fR: Encrypt a Columnstore plaintext password. +.IP \(bu 2 +\fB\fCbootstrap\-single\-node\fR: Bootstrap a single node (localhost)... +.IP \(bu 2 +\fB\fCreview\fR: Provides useful functions to review and troubleshoot the MCS cluster. .IP \(bu 2 \fB\fChelp\-all\fR: Show help for all commands in man page style. .IP \(bu 2 @@ -67,9 +73,9 @@ $ mcs backup [OPTIONS] \fB\fC\-bl, \-\-backup\-location TEXT\fR: What directory to store the backups on this machine or the target machine. Consider write permissions of the scp user and the user running this script. Mariadb\-backup will use this location as a tmp dir for S3 and remote backups temporarily. -Example: /mnt/backups/ [default: /tmp/backups/] +Example: /mnt/backups/ .IP \(bu 2 -\fB\fC\-bd, \-\-backup\-destination TEXT\fR: Are the backups going to be stored on the same machine this script is running on or another server \- if Remote you need to setup scp=Options: \[dq]Local\[dq] or \[dq]Remote\[dq] [default: Local] +\fB\fC\-bd, \-\-backup\-destination TEXT\fR: Are the backups going to be stored on the same machine this script is running on or another server \- if Remote you need to setup scp=Options: \[dq]Local\[dq] or \[dq]Remote\[dq] .IP \(bu 2 \fB\fC\-scp TEXT\fR: Used only if \-\-backup\-destination=\[dq]Remote\[dq]\&. 
The user/credentials that will be used to scp the backup files @@ -83,41 +89,41 @@ Example: \[dq]s3://my\-cs\-backups\[dq] Example: \[dq]\[la]http://127.0.0.1:8000\[ra]\[dq] .IP \(bu 2 \fB\fC\-s, \-\-storage TEXT\fR: What storage topogoly is being used by Columnstore \- found in /etc/columnstore/storagemanager.cnf. -Options: \[dq]LocalStorage\[dq] or \[dq]S3\[dq] [default: LocalStorage] +Options: \[dq]LocalStorage\[dq] or \[dq]S3\[dq] .IP \(bu 2 \fB\fC\-i, \-\-incremental TEXT\fR: Adds columnstore deltas to an existing full backup. Backup folder to apply increment could be a value or \[dq]auto\fImost\fPrecent\[dq] \- the incremental backup applies to last full backup. .IP \(bu 2 -\fB\fC\-P, \-\-parallel INTEGER\fR: Determines if columnstore data directories will have multiple rsync running at the same time for different subfolders to parallelize writes. Ignored if \[dq]\-c/\-\-compress\[dq] argument not set. [default: 4] +\fB\fC\-P, \-\-parallel INTEGER\fR: Enables parallel rsync for faster backups, setting the number of simultaneous rsync processes. With \-c/\-\-compress, sets the number of compression threads. .IP \(bu 2 -\fB\fC\-ha, \-\-highavilability / \-no\-ha, \-\-no\-highavilability\fR: Hint wether shared storage is attached @ below on all nodes to see all data +\fB\fC\-ha, \-\-highavilability\fR: Hint wether shared storage is attached @ below on all nodes to see all data HA LocalStorage ( /var/lib/columnstore/dataX/ ) -HA S3 ( /var/lib/columnstore/storagemanager/ ) [default: no\-ha] +HA S3 ( /var/lib/columnstore/storagemanager/ ) .IP \(bu 2 \fB\fC\-f, \-\-config\-file TEXT\fR: Path to backup configuration file to load variables from \- relative or full path accepted. .IP \(bu 2 -\fB\fC\-sbrm, \-\-skip\-save\-brm / \-no\-sbrm, \-\-no\-skip\-save\-brm\fR: Skip saving brm prior to running a backup \- ideal for dirty backups. [default: no\-sbrm] +\fB\fC\-sbrm, \-\-skip\-save\-brm\fR: Skip saving brm prior to running a backup \- ideal for dirty backups. 
.IP \(bu 2 -\fB\fC\-spoll, \-\-skip\-polls / \-no\-spoll, \-\-no\-skip\-polls\fR: Skip sql checks confirming no write/cpimports running. [default: no\-spoll] +\fB\fC\-spoll, \-\-skip\-polls\fR: Skip sql checks confirming no write/cpimports running. .IP \(bu 2 -\fB\fC\-slock, \-\-skip\-locks / \-no\-slock, \-\-no\-skip\-locks\fR: Skip issuing write locks \- ideal for dirty backups. [default: no\-slock] +\fB\fC\-slock, \-\-skip\-locks\fR: Skip issuing write locks \- ideal for dirty backups. .IP \(bu 2 -\fB\fC\-smdb, \-\-skip\-mariadb\-backup / \-no\-smdb, \-\-no\-skip\-mariadb\-backup\fR: Skip running a mariadb\-backup for innodb data \- ideal for incremental dirty backups. [default: no\-smdb] +\fB\fC\-smdb, \-\-skip\-mariadb\-backup\fR: Skip running a mariadb\-backup for innodb data \- ideal for incremental dirty backups. .IP \(bu 2 -\fB\fC\-sb, \-\-skip\-bucket\-data / \-no\-sb, \-\-no\-skip\-bucket\-data\fR: Skip taking a copy of the columnstore data in the bucket. [default: no\-sb] +\fB\fC\-sb, \-\-skip\-bucket\-data\fR: Skip taking a copy of the columnstore data in the bucket. .IP \(bu 2 -\fB\fC\-nb, \-\-name\-backup TEXT\fR: Define the name of the backup \- default: $(date +%m\-%d\-%Y) [default: 03\-20\-2025] +\fB\fC\-nb, \-\-name\-backup TEXT\fR: Define the name of the backup \- default: $(date +%m\-%d\-%Y) .IP \(bu 2 \fB\fC\-c, \-\-compress TEXT\fR: Compress backup in X format \- Options: [ pigz ]. .IP \(bu 2 -\fB\fC\-q, \-\-quiet / \-no\-q, \-\-no\-quiet\fR: Silence verbose copy command outputs. [default: no\-q] +\fB\fC\-q, \-\-quiet\fR: Silence verbose copy command outputs. .IP \(bu 2 -\fB\fC\-nv\-ssl, \-\-no\-verify\-ssl / \-v\-ssl, \-\-verify\-ssl\fR: Skips verifying ssl certs, useful for onpremise s3 storage. [default: v\-ssl] +\fB\fC\-nv\-ssl, \-\-no\-verify\-ssl\fR: Skips verifying ssl certs, useful for onpremise s3 storage. .IP \(bu 2 -\fB\fC\-pi, \-\-poll\-interval INTEGER\fR: Number of seconds between poll checks for active writes & cpimports. 
[default: 5] +\fB\fC\-pi, \-\-poll\-interval INTEGER\fR: Number of seconds between poll checks for active writes & cpimports. .IP \(bu 2 -\fB\fC\-pmw, \-\-poll\-max\-wait INTEGER\fR: Max number of minutes for polling checks for writes to wait before exiting as a failed backup attempt. [default: 60] +\fB\fC\-pmw, \-\-poll\-max\-wait INTEGER\fR: Max number of minutes for polling checks for writes to wait before exiting as a failed backup attempt. .IP \(bu 2 -\fB\fC\-r, \-\-retention\-days INTEGER\fR: Retain backups created within the last X days, default 0 == keep all backups. [default: 0] +\fB\fC\-r, \-\-retention\-days INTEGER\fR: Retain backups created within the last X days, default 0 == keep all backups. .IP \(bu 2 \fB\fC\-aro, \-\-apply\-retention\-only\fR: Only apply retention policy to existing backups, does not run a backup. .IP \(bu 2 @@ -150,9 +156,9 @@ $ mcs dbrm_backup [OPTIONS] .IP \(bu 2 \fB\fC\-nb, \-\-name\-backup TEXT\fR: Define the prefix of the backup \- default: dbrm\fIbackup+date +%Y%m%d\fP%H%M%S [default: dbrm_backup] .IP \(bu 2 -\fB\fC\-ssm, \-\-skip\-storage\-manager / \-no\-ssm, \-\-no\-skip\-storage\-manager\fR: Skip backing up storagemanager directory. [default: no\-ssm] +\fB\fC\-ssm, \-\-skip\-storage\-manager\fR: Skip backing up storagemanager directory. .IP \(bu 2 -\fB\fC\-q, \-\-quiet / \-no\-q, \-\-no\-quiet\fR: Silence verbose copy command outputs. [default: no\-q] +\fB\fC\-q, \-\-quiet\fR: Silence verbose copy command outputs. .IP \(bu 2 \fB\fC\-li, \-\-list\fR: List backups. .IP \(bu 2 @@ -207,21 +213,21 @@ Options: \[dq]LocalStorage\[dq] or \[dq]S3\[dq] [default: LocalStorage] .IP \(bu 2 \fB\fC\-P, \-\-parallel INTEGER\fR: Determines number of decompression and mdbstream threads. Ignored if \[dq]\-c/\-\-compress\[dq] argument not set. 
[default: 4] .IP \(bu 2 -\fB\fC\-ha, \-\-highavilability / \-no\-ha, \-\-no\-highavilability\fR: Flag for high available systems (meaning shared storage exists supporting the topology so that each node sees all data) [default: no\-ha] +\fB\fC\-ha, \-\-highavilability\fR: Flag for high available systems (meaning shared storage exists supporting the topology so that each node sees all data) .IP \(bu 2 -\fB\fC\-cont, \-\-continue / \-no\-cont, \-\-no\-continue\fR: This acknowledges data in your \-\-new\fIbucket is ok to delete when restoring S3. When set to true skips the enforcement that new\fPbucket should be empty prior to starting a restore. [default: no\-cont] +\fB\fC\-cont, \-\-continue\fR: This acknowledges data in your \-\-new\fIbucket is ok to delete when restoring S3. When set to true skips the enforcement that new\fPbucket should be empty prior to starting a restore. .IP \(bu 2 \fB\fC\-f, \-\-config\-file TEXT\fR: Path to backup configuration file to load variables from \- relative or full path accepted. .IP \(bu 2 -\fB\fC\-smdb, \-\-skip\-mariadb\-backup / \-no\-smdb, \-\-no\-skip\-mariadb\-backup\fR: Skip restoring mariadb server via mariadb\-backup \- ideal for only restoring columnstore. [default: no\-smdb] +\fB\fC\-smdb, \-\-skip\-mariadb\-backup\fR: Skip restoring mariadb server via mariadb\-backup \- ideal for only restoring columnstore. .IP \(bu 2 -\fB\fC\-sb, \-\-skip\-bucket\-data / \-no\-sb, \-\-no\-skip\-bucket\-data\fR: Skip restoring columnstore data in the bucket \- ideal if looking to only restore mariadb server. [default: no\-sb] +\fB\fC\-sb, \-\-skip\-bucket\-data\fR: Skip restoring columnstore data in the bucket \- ideal if looking to only restore mariadb server. .IP \(bu 2 \fB\fC\-c, \-\-compress TEXT\fR: Hint that the backup is compressed in X format. Options: [ pigz ]. .IP \(bu 2 -\fB\fC\-q, \-\-quiet / \-no\-q, \-\-no\-quiet\fR: Silence verbose copy command outputs. 
[default: no\-q] +\fB\fC\-q, \-\-quiet\fR: Silence verbose copy command outputs. .IP \(bu 2 -\fB\fC\-nv\-ssl, \-\-no\-verify\-ssl / \-v\-ssl, \-\-verify\-ssl\fR: Skips verifying ssl certs, useful for onpremise s3 storage. [default: v\-ssl] +\fB\fC\-nv\-ssl, \-\-no\-verify\-ssl\fR: Skips verifying ssl certs, useful for onpremise s3 storage. .IP \(bu 2 \fB\fC\-li, \-\-list\fR: List backups. .IP \(bu 2 @@ -248,9 +254,9 @@ $ mcs dbrm_restore [OPTIONS] .IP \(bu 2 \fB\fC\-ns, \-\-no\-start\fR: Do not attempt columnstore startup post dbrm_restore. .IP \(bu 2 -\fB\fC\-sdbk, \-\-skip\-dbrm\-backup / \-no\-sdbk, \-\-no\-skip\-dbrm\-backup\fR: Skip backing up dbrms before restoring. [default: sdbk] +\fB\fC\-sdbk, \-\-skip\-dbrm\-backup\fR: Skip backing up dbrms before restoring. .IP \(bu 2 -\fB\fC\-ssm, \-\-skip\-storage\-manager / \-no\-ssm, \-\-no\-skip\-storage\-manager\fR: Skip backing up storagemanager directory. [default: ssm] +\fB\fC\-ssm, \-\-skip\-storage\-manager\fR: Skip backing up storagemanager directory. .IP \(bu 2 \fB\fC\-li, \-\-list\fR: List backups. .IP \(bu 2 @@ -311,6 +317,89 @@ $ mcs cspasswd [OPTIONS] .IP \(bu 2 \fB\fC\-\-help\fR: Show this message and exit. .RE +.SH \fB\fCmcs bootstrap\-single\-node\fR +.PP +Bootstrap a single node (localhost) Columnstore instance. +.PP +\fBUsage\fP: +.PP +.RS +.nf +$ mcs bootstrap\-single\-node [OPTIONS] +.fi +.RE +.PP +\fBOptions\fP: +.RS +.IP \(bu 2 +\fB\fC\-\-api\-key TEXT\fR: API key to set. +.IP \(bu 2 +\fB\fC\-\-help\fR: Show this message and exit. +.RE +.SH \fB\fCmcs review\fR +.PP +This script performs various maintenance and diagnostic tasks for +MariaDB ColumnStore, including log archiving, extent map backups, +schema and table testing, directory and ownership checks, extent map +validation, S3 storage comparison, process management, table +synchronization, port availability checks, stack dumps, cleanup of +rollback fragments, and graceful process termination. 
+.PP +If database is up, this script will connect as root@localhost via socket. +.PP +\fBUsage\fP: +.PP +.RS +.nf +$ mcs review [OPTIONS] +.fi +.RE +.PP +\fBOptions\fP: +.RS +.IP \(bu 2 +\fB\fC\-\-version\fR: Only show the header with version information. +.IP \(bu 2 +\fB\fC\-\-logs\fR: Create a compressed archive of logs for MariaDB Support Ticket. +.IP \(bu 2 +\fB\fC\-\-path TEXT\fR: Define the path for where to save files/tarballs and outputs of this script. +.IP \(bu 2 +\fB\fC\-\-backupdbrm\fR: Takes a compressed backup of extent map files in dbrm directory. +.IP \(bu 2 +\fB\fC\-\-testschema\fR: Creates a test schema, tables, imports, queries, drops schema. +.IP \(bu 2 +\fB\fC\-\-testschemakeep\fR: Creates a test schema, tables, imports, queries, does not drop. +.IP \(bu 2 +\fB\fC\-\-ldlischema\fR: Using ldli, creates test schema, tables, imports, queries, drops schema. +.IP \(bu 2 +\fB\fC\-\-ldlischemakeep\fR: Using ldli, creates test schema, tables, imports, queries, does not drop. +.IP \(bu 2 +\fB\fC\-\-emptydirs\fR: Searches /var/lib/columnstore for empty directories. +.IP \(bu 2 +\fB\fC\-\-notmysqldirs\fR: Searches /var/lib/columnstore for directories not owned by mysql. +.IP \(bu 2 +\fB\fC\-\-emcheck\fR: Checks the extent map for orphaned and missing files. +.IP \(bu 2 +\fB\fC\-\-s3check\fR: Checks the extent map against S3 storage. +.IP \(bu 2 +\fB\fC\-\-pscs\fR: Adds the pscs command. pscs lists running columnstore processes. +.IP \(bu 2 +\fB\fC\-\-schemasync\fR: Fix out\-of\-sync columnstore tables (CAL0009). +.IP \(bu 2 +\fB\fC\-\-tmpdir\fR: Ensure owner of temporary dir after reboot (MCOL\-4866 & MCOL\-5242). +.IP \(bu 2 +\fB\fC\-\-checkports\fR: Checks if ports needed by Columnstore are opened. +.IP \(bu 2 +\fB\fC\-\-eustack\fR: Dumps the stack of Columnstore processes. +.IP \(bu 2 +\fB\fC\-\-clearrollback\fR: Clear any rollback fragments from dbrm files. 
@handle_output
def review(
    _version: Annotated[
        Optional[bool],
        typer.Option(
            '--version',
            help='Only show the header with version information.',
            show_default=False
        )
    ] = None,
    _logs: Annotated[
        Optional[bool],
        typer.Option(
            '--logs',
            help=(
                'Create a compressed archive of logs for MariaDB Support '
                'Ticket'
            ),
            show_default=False
        )
    ] = None,
    _path: Annotated[
        Optional[str],
        typer.Option(
            '--path',
            help=(
                'Define the path for where to save files/tarballs and outputs '
                'of this script.'
            ),
            show_default=False
        )
    ] = None,
    _backupdbrm: Annotated[
        Optional[bool],
        typer.Option(
            '--backupdbrm',
            help=(
                'Takes a compressed backup of extent map files in dbrm '
                'directory.'
            ),
            show_default=False
        )
    ] = None,
    _testschema: Annotated[
        Optional[bool],
        typer.Option(
            '--testschema',
            help=(
                'Creates a test schema, tables, imports, queries, drops '
                'schema.'
            ),
            show_default=False
        )
    ] = None,
    _testschemakeep: Annotated[
        Optional[bool],
        typer.Option(
            '--testschemakeep',
            help=(
                'Creates a test schema, tables, imports, queries, does not '
                'drop.'
            ),
            show_default=False
        )
    ] = None,
    _ldlischema: Annotated[
        Optional[bool],
        typer.Option(
            '--ldlischema',
            help=(
                'Using ldli, creates test schema, tables, imports, queries, '
                'drops schema.'
            ),
            show_default=False
        )
    ] = None,
    _ldlischemakeep: Annotated[
        Optional[bool],
        typer.Option(
            '--ldlischemakeep',
            help=(
                'Using ldli, creates test schema, tables, imports, queries, '
                'does not drop.'
            ),
            show_default=False
        )
    ] = None,
    _emptydirs: Annotated[
        Optional[bool],
        typer.Option(
            '--emptydirs',
            help='Searches /var/lib/columnstore for empty directories.',
            show_default=False
        )
    ] = None,
    _notmysqldirs: Annotated[
        Optional[bool],
        typer.Option(
            '--notmysqldirs',
            help=(
                'Searches /var/lib/columnstore for directories not owned by '
                'mysql.'
            ),
            show_default=False
        )
    ] = None,
    _emcheck: Annotated[
        Optional[bool],
        typer.Option(
            '--emcheck',
            help='Checks the extent map for orphaned and missing files.',
            show_default=False
        )
    ] = None,
    _s3check: Annotated[
        Optional[bool],
        typer.Option(
            '--s3check',
            help='Checks the extent map against S3 storage.',
            show_default=False
        )
    ] = None,
    _pscs: Annotated[
        Optional[bool],
        typer.Option(
            '--pscs',
            help=(
                'Adds the pscs command. pscs lists running columnstore '
                'processes.'
            ),
            show_default=False
        )
    ] = None,
    _schemasync: Annotated[
        Optional[bool],
        typer.Option(
            '--schemasync',
            help='Fix out-of-sync columnstore tables (CAL0009).',
            show_default=False
        )
    ] = None,
    _tmpdir: Annotated[
        Optional[bool],
        typer.Option(
            '--tmpdir',
            help=(
                'Ensure owner of temporary dir after reboot (MCOL-4866 & '
                'MCOL-5242).'
            ),
            show_default=False
        )
    ] = None,
    _checkports: Annotated[
        Optional[bool],
        typer.Option(
            '--checkports',
            help='Checks if ports needed by Columnstore are opened.',
            show_default=False
        )
    ] = None,
    _eustack: Annotated[
        Optional[bool],
        typer.Option(
            '--eustack',
            help='Dumps the stack of Columnstore processes.',
            show_default=False
        )
    ] = None,
    _clearrollback: Annotated[
        Optional[bool],
        typer.Option(
            '--clearrollback',
            help='Clear any rollback fragments from dbrm files.',
            show_default=False
        )
    ] = None,
    _killcolumnstore: Annotated[
        Optional[bool],
        typer.Option(
            '--killcolumnstore',
            help=(
                'Stop columnstore processes gracefully, then kill remaining '
                'processes.'
            ),
            show_default=False
        )
    ] = None,
    _color: Annotated[
        Optional[str],
        typer.Option(
            '--color',
            # The option list is written without square brackets: the
            # bracketed form is missing from the generated man page,
            # presumably because it was consumed as markup. The previous
            # text also listed "none" twice.
            help=(
                'Print headers in color. Options: none, red, blue, green, '
                'yellow, magenta, cyan; prefix a color with "l" for light '
                '(e.g. lred).'
            ),
            show_default=False
        )
    ] = None,
):
    """
    This script performs various maintenance and diagnostic tasks for
    MariaDB ColumnStore, including log archiving, extent map backups,
    schema and table testing, directory and ownership checks, extent map
    validation, S3 storage comparison, process management, table
    synchronization, port availability checks, stack dumps, cleanup of
    rollback fragments, and graceful process termination.

    If database is up, this script will connect as root@localhost via socket.
    """
    # NOTE: the docstring above is kept verbatim because it is the
    # user-facing command description (README.md / mcs.1 carry the same text).

    # Snapshot the CLI parameters before any other local is created, so only
    # real options are offered to cook_sh_arg (parameter names start with
    # "_", which becomes the second dash of the long option).
    cli_options = dict(locals())

    arguments = []
    for arg_name, value in cli_options.items():
        # columnstore_review.sh accepts only --arg=value format
        sh_arg = cook_sh_arg(arg_name, value, separator='=')
        if sh_arg is not None:
            arguments.append(sh_arg)

    # NOTE(review): option values are interpolated as-is; a --path containing
    # whitespace or shell metacharacters may be split or misparsed. Confirm
    # how cook_sh_arg / BaseDispatcher.exec_command quote values before
    # adding quoting here.
    cmd = f'{MCS_COLUMNSTORE_REVIEW_SH} {" ".join(arguments)}'
    # Stream the script output straight to the console.
    success, _ = BaseDispatcher.exec_command(cmd, stdout=sys.stdout)
    # Mirror the script result in the CLI exit status.
    raise typer.Exit(code=0 if success else 1)
>> $WARNFILE; else @@ -942,7 +956,15 @@ function dump_log () { } function collect_logs() { - LOGSOUTDIR=/tmp/columnstore_review/logs_$(date +"%m-%d-%H-%M-%S")/$(hostname) + + if [ -n "$USER_PROVIDED_OUTPUT_PATH" ]; then + TARPATH="$USER_PROVIDED_OUTPUT_PATH" + LOGSOUTDIR="$USER_PROVIDED_OUTPUT_PATH/columnstore_review/logs_$(date +"%m-%d-%H-%M-%S")/$(hostname)" + else + TARPATH=/tmp + LOGSOUTDIR=/tmp/columnstore_review/logs_$(date +"%m-%d-%H-%M-%S")/$(hostname) + fi + mkdir -p $LOGSOUTDIR || ech0 'Cannot create temporary directory for logs.'; mkdir -p $LOGSOUTDIR/system mkdir -p $LOGSOUTDIR/mariadb @@ -968,25 +990,46 @@ function collect_logs() { dump_log "mcs-loadbrm" $LOGSOUTDIR/columnstore/ dump_log "mcs-primproc" $LOGSOUTDIR/columnstore/ dump_log "mcs-workernode@1" $LOGSOUTDIR/columnstore/ + dump_log "mcs-workernode@2" $LOGSOUTDIR/columnstore/ dump_log "mcs-writeengineserver" $LOGSOUTDIR/columnstore/ dump_log "mcs-controllernode" $LOGSOUTDIR/columnstore/ set_data1dir ls -lrt $DATA1DIR/systemFiles/dbrm > $LOGSOUTDIR/columnstore/ls_lrt_dbrm.txt + if [ ! 
-z "$STORAGE_TYPE" ] && [ "$STORAGE_TYPE" == "S3" ]; then + dump_log "mcs-storagemanager" $LOGSOUTDIR/columnstore/ + smls /data1/systemFiles/dbrm/ > $LOGSOUTDIR/columnstore/s3_dbrms.txt ; + smcat /data1/systemFiles/dbrm/BRM_saves_current 2>/dev/null > $LOGSOUTDIR/columnstore/s3_BRM_saves_current ; + fi - # find /var/log \( -name "messages" -o -name "messages.1" \) -type f -exec cp {} $LOGSOUTDIR/system \; - cp /var/log/messages* $LOGSOUTDIR/system - find /var/log/syslog -name syslog -type f -exec tail -10000 {} > $LOGSOUTDIR/system/syslog \; - find /var/log/daemon.log -name daemon.log -type f -exec tail -10000 {} > $LOGSOUTDIR/system/daemon.log \; + + # System Logs + if [ -f "/proc/sys/kernel/threads-max" ]; then cp /proc/sys/kernel/threads-max $LOGSOUTDIR/system/kernal-threads-max; fi; + if [ -f "/proc/sys/kernel/pid_max" ]; then cp /proc/sys/kernel/pid_max $LOGSOUTDIR/system/kernal-pid_max; fi; + if [ -f "/proc/sys/vm/max_map_count" ]; then cp /proc/sys/vm/max_map_count $LOGSOUTDIR/system/kernal-max_map_count; fi; + # if [ -f "/var/log/messages" ]; then cp /var/log/messages* $LOGSOUTDIR/system; fi; # TOO MUCH COLLECTED... 
+ find /var/log -name "messages*" -mtime -5 -type f -exec cp {} $LOGSOUTDIR/system \; 2>/dev/null + + if [ -f "/var/log/syslog" ]; then find /var/log/syslog -name syslog -type f -exec tail -10000 {} > $LOGSOUTDIR/system/syslog \;; fi; + if [ -f "/var/log/daemon.log" ]; then find /var/log/daemon.log -name daemon.log -type f -exec tail -10000 {} > $LOGSOUTDIR/system/daemon.log \;; fi; + if command -v ulimit >/dev/null 2>&1; then + ulimit -a > $LOGSOUTDIR/system/kernal-ulimits.txt + fi + cd /var/log/mariadb - find /usr/lib -name "mcs*service" -exec cp {} $LOGSOUTDIR/systemd \; find /usr/lib -name "mariadb*service" -exec cp {} $LOGSOUTDIR/systemd \; ls -1 columnstore/*.log 2>/dev/null | cpio -pd $LOGSOUTDIR/ 2>/dev/null ls -1 columnstore/*z 2>/dev/null | cpio -pd $LOGSOUTDIR/ 2>/dev/null find columnstore/archive columnstore/install columnstore/trace -mtime -30 | cpio -pd $LOGSOUTDIR/ 2>/dev/null - find columnstore/cpimport -mtime -1 | cpio -pd $LOGSOUTDIR/ 2>/dev/null + # find columnstore/cpimport -mtime -1 | cpio -pd $LOGSOUTDIR/ 2>/dev/null # COLLECTS TOO MUCH + find columnstore/cpimport -name "*.err" -size +0 -mtime -2 | cpio -pd $LOGSOUTDIR/ 2>/dev/null + + #collect ports Status + unset SUPPRESS_CLOSED_PORTS + check_ports > $LOGSOUTDIR/columnstore/$(hostname)_ports_check.txt 2>/dev/null + if [ $CAN_CONNECT ]; then mariadb -ABNe "show global variables" > $LOGSOUTDIR/mariadb/$(hostname)_global_variables.txt 2>/dev/null @@ -995,18 +1038,18 @@ function collect_logs() { my_print_defaults --mysqld > $LOGSOUTDIR/mariadb/$(hostname)_my_print_defaults.txt 2>/dev/null if [ -f $OUTPUTFILE ]; then cp $OUTPUTFILE $LOGSOUTDIR/; fi cd $LOGSOUTDIR/.. - tar -czf /tmp/$COMPRESSFILE ./* + tar -czf $TARPATH/$COMPRESSFILE ./* cd - 1>/dev/null print_color "### COLLECTED LOGS FOR SUPPORT TICKET ###\n" ech0 "Attach the following tar file to your support ticket." if [ $THISISCLUSTER ]; then ech0 "Please collect logs with this script from each node in your cluster." 
# CHECK PORTS FUNCTIONS BY Patrizio Tamorri
#
# check_ports
#   Probes the Columnstore ports on every WriteEngineServer address found in
#   /etc/columnstore/Columnstore.xml with nmap and prints one status line per
#   ip:port, followed by an overall pass/fail line.
#   A port listed as an IPAddr/Port pair in Columnstore.xml must be open; any
#   other port only fails the test when it is filtered or in an unknown state.
#   Reads:  SUPPRESS_CLOSED_PORTS (when set, closed optional ports are not printed)
#   Needs:  nmap
function check_ports(){
  local cs_xml=/etc/columnstore/Columnstore.xml

  # Check if nmap is installed
  if ! command -v nmap &> /dev/null; then
    printf "nmap is not installed.\n\n"
    return
  fi

  # Define the ports to check
  ports="8600,8601,8602,8603,8604,8605,8606,8607,8608,8609,8610,8611,8612,8613,8614,8615,8616,8617,8618,8619,8620,8630,8700,8800,3306,8999"

  # Get the local node from the file
  my_node=$(cat /var/lib/columnstore/local/module)

  # Get the hostname and local IP address of the machine
  hostname=$(hostname)
  local_ip=$(hostname -I | awk '{print $1}')

  # Extract IPs from Columnstore.xml, one per line.
  # Only \r and \t are stripped: newlines must survive, otherwise all
  # addresses collapse into a single record and sort -u cannot separate them.
  ips=$(grep -A 1 "_WriteEngineServer" "$cs_xml" \
    | sed "/${my_node}_WriteEngineServer/,+0d" \
    | grep "<IPAddr>" \
    | tr -d '\r\t' \
    | sed -e 's/<IPAddr>//g' -e 's/<\/IPAddr>//g' -e 's/^[ \t]*//' -e 's/[ \t]*$//' \
    | sort -u)

  # Extract IPAddr:Port pairs from the XML file, one "ip:port" per line.
  # Whitespace inside a line is removed, but the line structure is kept so
  # that awk can join each IPAddr line with the Port line that follows it.
  local_ports=$(grep -E "<IPAddr>|<Port>" "$cs_xml" \
    | tr -d '\r\t ' \
    | sed -e 's/<\/\?IPAddr>//g' -e 's/<\/\?Port>//g' \
    | awk 'NR%2{printf "%s:", $0; next;} 1')

  pass=true

  # Function to check if a port is available to use
  check_port_nmap_available_to_use() {
    ip=$1
    port=$2

    # Use nmap to check the port status; anchor on "<port>/" so only the
    # port table row is read, not other lines that happen to contain the number.
    result=$(nmap -T4 -p "$port" "$ip" | grep "^${port}/" | awk '{print $2}')

    if [ "$result" = "open" ]; then
      echo "$ip:$port - Port is open: SUCCESS"
    elif [ "$result" = "closed" ]; then
      if [ ! ${SUPPRESS_CLOSED_PORTS} ]; then echo "$ip:$port - Port is closed and not firewalled"; fi
    elif [ "$result" = "filtered" ]; then
      echo "$ip:$port - Port is filtered (firewalled or blocked): ERROR"
      pass=false
    else
      echo "$ip:$port - Unknown port status: ERROR"
      pass=false
    fi
  }

  # Function to check if a port must be open
  check_port_nmap_must_be_opened() {
    ip=$1
    port=$2

    # Use nmap to check the port status
    result=$(nmap -T4 -p "$port" "$ip" | grep "^${port}/" | awk '{print $2}')

    if [ "$result" = "open" ]; then
      echo "$ip:$port - Port is open: SUCCESS"
    elif [ "$result" = "closed" ]; then
      echo "$ip:$port - Port is closed and not firewalled: ERROR"
      pass=false
    elif [ "$result" = "filtered" ]; then
      echo "$ip:$port - Port is filtered (firewalled or blocked): ERROR"
      pass=false
    else
      echo "$ip:$port - Unknown port status: ERROR"
      pass=false
    fi
  }

  # Loop through each IP and check the ports
  for ipadd in $ips; do
    echo "Checking ports on $ipadd..."

    # Replace ipadd with 127.0.0.1 if it matches the local IP or hostname
    if [[ "$ipadd" == "$local_ip" || "$ipadd" == "$hostname" ]]; then
      ipadd="127.0.0.1"
    fi

    for port in ${ports//,/ }; do
      ip_port="$ipadd:$port"
      # Exact whole-line match against the newline-separated pair list.
      if grep -qxF -- "$ip_port" <<< "$local_ports"; then
        check_port_nmap_must_be_opened "$ipadd" "$port"
      else
        check_port_nmap_available_to_use "$ipadd" "$port"
      fi
    done
  done

  # Final status report
  if [ "$pass" = true ]; then
    printf "All nodes passed the port test.\n\n"
  else
    printf "One or more nodes failed the port test. Please investigate.\n\n"
  fi
}

# clear_rollback
#   Interactively clears rollback fragments (the *_vss / *_vbbm files) from
#   the dbrm directory. Refuses to run while Columnstore processes are up.
#   A backup is taken first and nothing is truncated or removed unless that
#   backup succeeded. Backups go to $TARDIR when it is set (it is derived
#   from --path by prepare_for_run), otherwise to /tmp.
function clear_rollback() {
  unset ERR
  local proc_regex="(PrimProc|ExeMgr|DMLProc|DDLProc|WriteEngineServer|StorageManager|controllernode|workernode)"
  local backup_dir="${TARDIR:-/tmp}"
  local s3_meta_dir=/var/lib/columnstore/storagemanager/metadata/data1/systemFiles/dbrm/

  CLEAR_ROLLBACK_MESSAGE="It is recommended that you clear rollback files only when instructed to do so by Mariadb Support.\nType c to clear rollback files.\nType any other key to exit.\n"
  STORAGE_TYPE=$(grep service /etc/columnstore/storagemanager.cnf | grep -v "^\#" | grep "\=" | awk -F= '{print $2}' | awk '{print tolower($0)}' | xargs)
  DATA1DIR=$(mcsGetConfig SystemConfig DBRoot1 2>/dev/null) || DATA1DIR=/var/lib/columnstore/data1

  if [[ "$(ps -ef | grep -E "$proc_regex" | grep -v "grep" | wc -l)" != "0" ]]; then
    TEMP_COLOR=lred; print_color "Columnstore processes are running.\nYou may clear rollback fragments only when Columnstore processes are stopped."; unset TEMP_COLOR
    print0 "\nExiting.\n\n"; exit 0
  fi

  if [ "$STORAGE_TYPE" == "localstorage" ]; then
    # BRM_saves_current names the active BRM save set; without it the find
    # patterns below would silently match nothing.
    BRMSAV=$(cat "$DATA1DIR/systemFiles/dbrm/BRM_saves_current" 2>/dev/null | xargs)
    if [ -z "$BRMSAV" ]; then
      print0 "Cannot read $DATA1DIR/systemFiles/dbrm/BRM_saves_current.\nExiting.\n\n"; exit 1
    fi

    COUNTFILES=$(find $DATA1DIR/systemFiles \( -name "${BRMSAV}_vss" -o -name "${BRMSAV}_vbbm" \) -size +0 | wc -l)
    if [ "$COUNTFILES" == "0" ]; then
      TEMP_COLOR=lred; print_color "Rollback files are empty."; unset TEMP_COLOR
      print0 "\nExiting.\n\n"; exit 0
    fi

    print0 "$CLEAR_ROLLBACK_MESSAGE"

    read -s -n 1 RESPONSE
    if [ "$RESPONSE" == "c" ]; then
      ech0; ech0
      BRM_SAVES_BACKUP_FILE=$(hostname)_$(date +"%Y-%m-%d-%H-%M-%S")_BRM_saves.tar
      cd "$DATA1DIR/systemFiles/dbrm/" || { ech0 "Cannot enter $DATA1DIR/systemFiles/dbrm/. Nothing done."; exit 1; }
      print0 "BRM_saves_current: ${BRMSAV}\n\nBacking up these files:\n"
      find . \( -name "${BRMSAV}_vss" -o -name "${BRMSAV}_vbbm" \) -exec tar -rvf "$backup_dir/$BRM_SAVES_BACKUP_FILE" {} \;
      ech0
      # find does not report -exec failures, so verify the archive itself
      # before destroying the originals.
      if [ ! -s "$backup_dir/$BRM_SAVES_BACKUP_FILE" ]; then
        ech0 "Backup to $backup_dir/$BRM_SAVES_BACKUP_FILE failed. Nothing cleared."
        cd - 1>/dev/null
        exit 1
      fi
      find . \( -name "${BRMSAV}_vss" -o -name "${BRMSAV}_vbbm" \) -size +0 -exec truncate -s0 {} \; || ERR=true
      cd - 1>/dev/null
      COUNTFILES=$(find $DATA1DIR/systemFiles \( -name "${BRMSAV}_vss" -o -name "${BRMSAV}_vbbm" \) -size +0 | wc -l)
      if [ $ERR ] || [ "$COUNTFILES" != "0" ]; then
        ech0 "Something went wrong. Check the size of files ${BRMSAV}_vss and ${BRMSAV}_vbbm. Each file should be zero bytes in size."
        ls -lrt $DATA1DIR/systemFiles/dbrm
      else
        TEMP_COLOR=lcyan; print_color "BRM_saves files backed up to $backup_dir/$BRM_SAVES_BACKUP_FILE.\nFiles cleared successfully.\n\n"; unset TEMP_COLOR
      fi
    else
      print0 "\nNothing done.\n\n"
    fi
  fi

  if [ "$STORAGE_TYPE" == "s3" ]; then
    DBRM_TMP_DIR=$backup_dir/dbrm-before-clearing-$(date +"%Y-%m-%d-%H-%M-%S")
    print0 "$CLEAR_ROLLBACK_MESSAGE"
    read -s -n 1 RESPONSE
    if [ "$RESPONSE" == "c" ]; then
      ## REF: https://mariadbcorp.atlassian.net/wiki/spaces/Support/pages/1600094249/Stuck+load_brm+failed+rollback+of+a+transaction
      # Back up the metadata first. The destructive steps run only when we
      # are in the right directory and the copy succeeded.
      if cd "$s3_meta_dir" && mkdir -p "$DBRM_TMP_DIR" && find . | cpio -pd "$DBRM_TMP_DIR"; then
        # Clear vss and vbbm files
        rm -f BRM_saves_vss.meta BRM_saves_vbbm.meta || ERR=true
        touch BRM_saves_vss.meta || ERR=true; chown mysql:mysql BRM_saves_vss.meta || ERR=true
        touch BRM_saves_vbbm.meta || ERR=true; chown mysql:mysql BRM_saves_vbbm.meta || ERR=true
        rm -rf /var/lib/columnstore/storagemanager/cache/data1/* || ERR=true
        mkdir /var/lib/columnstore/storagemanager/cache/data1/downloading || ERR=true
        chown mysql:mysql -R /var/lib/columnstore/storagemanager/cache || ERR=true
        cd - 1>/dev/null
      else
        ERR=true
      fi

      if [ $ERR ]; then
        ech0 "Something went wrong."
      else
        TEMP_COLOR=lcyan; print_color "BRM_saves files backed up to $DBRM_TMP_DIR.\nFiles cleared successfully.\n\n"; unset TEMP_COLOR
      fi
    else
      print0 "\nNothing done.\n\n"
    fi
  fi
}

# kill_columnstore
#   Interactively stops all Columnstore services on this node with systemctl,
#   kills (-9) whatever is still running afterwards, and clears shared memory
#   with clearShm once nothing is left. On a cluster it only warns that cmapi
#   should be preferred; it does not refuse.
function kill_columnstore(){
  local proc_regex='(PrimProc|ExeMgr|DMLProc|DDLProc|WriteEngineServer|StorageManager|controllernode|workernode|load_brm)'

  COUNT_ANY_STRAGGLERS=$(ps -ef | grep -E "$proc_regex" | grep -v "grep" | wc -l)
  PM1=$(mcsGetConfig pm1_WriteEngineServer IPAddr)
  PM2=$(mcsGetConfig pm2_WriteEngineServer IPAddr)
  # A node with a non-loopback pm1 and a configured pm2 is treated as a cluster member.
  if [ "$PM1" != "127.0.0.1" ] && [ -n "$PM2" ]; then
    THISISCLUSTER=true
  fi

  if [ "$COUNT_ANY_STRAGGLERS" == "0" ]; then
    TEMP_COLOR=lred; print_color "Columnstore processes are not running.\n"; unset TEMP_COLOR
    clearShm
    TEMP_COLOR=lcyan; print_color "Columnstore shared memory cleared.\n"; unset TEMP_COLOR
    print0 "\nExiting.\n\n"; exit 0
  fi

  if [ $THISISCLUSTER ] && [ "$COUNT_ANY_STRAGGLERS" != "0" ]; then
    TEMP_COLOR=lred; print_color "WARNING: This is a columnstore cluster and it is best to use cmapi commands to stop columnstore processes.\n"; unset TEMP_COLOR
  fi

  TEMP_COLOR=lcyan; print_color "Press c to stop all columnstore processes on this node.\n"; unset TEMP_COLOR

  read -s -n 1 RESPONSE
  if [ "$RESPONSE" == "c" ]; then
    if [ "$COUNT_ANY_STRAGGLERS" != "0" ]; then
      ech0 "Attempting to gracefully stop mcs-ddlproc."
      systemctl stop mcs-ddlproc;
      ech0 "Attempting to gracefully stop mcs-dmlproc."
      systemctl stop mcs-dmlproc;
      systemctl stop mcs-exemgr 2>/dev/null; # if cs 6.4 and prior
      ech0 "Attempting to gracefully stop mcs-controllernode."
      systemctl stop mcs-controllernode;
      ech0 "Attempting to gracefully stop mcs-storagemanager."
      systemctl stop mcs-storagemanager;
      ech0 "Attempting to gracefully stop mcs-primproc."
      systemctl stop mcs-primproc;
      ech0 "Attempting to gracefully stop mcs-writeengineserver."
      systemctl stop mcs-writeengineserver;
      ech0 "Attempting to gracefully stop mcs-workernode@1."
      systemctl stop mcs-workernode@1;
      ech0 "Attempting to gracefully stop mcs-workernode@2."
      systemctl stop mcs-workernode@2;
    fi

    COUNT_ANY_STRAGGLERS=$(ps -ef | grep -E "$proc_regex" | grep -v "grep" | wc -l)
    if [ "$COUNT_ANY_STRAGGLERS" != "0" ]; then
      ech0 "Remaining processes:"
      ps -ef | grep -E "$proc_regex" | grep -v "grep"
      ech0 "Killing them..."
      # -r: do not run kill at all if the processes exited in the meantime.
      ps -ef | grep -E "$proc_regex" | grep -v "grep" | awk '{print $2}' | xargs -r kill -9
    fi

    COUNT_ANY_STRAGGLERS=$(ps -ef | grep -E "$proc_regex" | grep -v "grep" | wc -l)

    if [ "$COUNT_ANY_STRAGGLERS" == "0" ]; then
      ech0 "No columnstore processes running."
      clearShm
      TEMP_COLOR=lcyan; print_color "Columnstore shared memory cleared.\n"; unset TEMP_COLOR
    else
      ech0 "After two attempts to kill all Columnstore processes, this is still running:"
      ps -ef | grep -E "$proc_regex" | grep -v "grep"
    fi
  else
    print0 "\nNothing done.\n\n"
  fi
}
pscs lists running columnstore processes - --schemasync # Fix out-of-sync columnstore tables (CAL0009) - --tmpdir # Ensure owner of temporary dir after reboot (MCOL-4866 & MCOL-5242) + --help # display this message + --version # only show the header with version information + --logs # create a compressed archive of logs for MariaDB Support Ticket + --path # define the path for where to save files/tarballs and outputs of this script + --backupdbrm # takes a compressed backup of extent map files in dbrm directory + --testschema # creates a test schema, tables, imports, queries, drops schema + --testschemakeep # creates a test schema, tables, imports, queries, does not drop + --ldlischema # using ldli, creates test schema, tables, imports, queries, drops schema + --ldlischemakeep # using ldli, creates test schema, tables, imports, queries, does not drop + --emptydirs # searches $COLUMNSTOREDIR for empty directories + --notmysqldirs # searches $COLUMNSTOREDIR for directories not owned by mysql + --emcheck # Checks the extent map for orphaned and missing files + --s3check # Checks the extent map against S3 storage + --pscs # Adds the pscs command. 
# get_eu_stack
#   Dumps the stack of each running Columnstore process (and mariadbd) with
#   eu-stack into $OUTDIR/<host>_<timestamp>_eu_stack/, packs that folder into
#   a .tar.gz and moves the archive to $TARDIR (falls back to /tmp when unset).
#   Exits 1 when eu-stack is missing, when no Columnstore process is running,
#   or when the archive could not be produced.
#   Reads: OUTDIR, TARDIR (both set by prepare_for_run).
function get_eu_stack() {
  if ! command -v eu-stack &> /dev/null; then
    printf "\n[!] eu-stack not found. Please install eu-stack\n\n"
    ech0 "example: "
    ech0 " yum install elfutils -y"
    ech0 " apt-get install elfutils"
    ech0
    exit 1;
  fi

  # Confirm CS online
  if [[ "$(ps -ef | grep -E "(PrimProc|ExeMgr|DMLProc|DDLProc|WriteEngineServer|StorageManager|controllernode|workernode)" | grep -v "grep"|wc -l)" == "0" ]]; then
    printf "Columnstore processes are not running. EU Stack will not be collected.\n\n"
    exit 1;
  fi

  local eu tar_dir proc pids pid
  eu=$(command -v eu-stack)
  tar_dir="${TARDIR:-/tmp}"
  EU_FOLDER="$(hostname)_$(date +"%Y-%m-%d-%H-%M-%S")_eu_stack"
  mkdir -p "$OUTDIR/$EU_FOLDER"

  for proc in PrimProc DMLProc DDLProc mariadbd WriteEngineServer controllernode workernode; do
    # Skip processes that are not running instead of calling eu-stack
    # without a PID.
    pids=$(pidof "$proc") || continue
    # pidof may return several PIDs; eu-stack -p takes exactly one.
    for pid in $pids; do
      "$eu" -p "$pid" >> "$OUTDIR/$EU_FOLDER/eu-$proc.txt"
    done
  done

  # -C keeps archive member paths relative without changing the caller's cwd.
  tar -C "$OUTDIR" -czf "$OUTDIR/$EU_FOLDER.tar.gz" "$EU_FOLDER"

  if [ -f "$OUTDIR/$EU_FOLDER.tar.gz" ]; then
    print_color "### EU STACK COMPLETE ###\n"
  else
    print0 "EU Stack files not found.\n"
    exit 1;
  fi

  # cleanup
  mv "$OUTDIR/$EU_FOLDER.tar.gz" "$tar_dir"
  if [ -f "$tar_dir/$EU_FOLDER.tar.gz" ]; then
    print0 "Created: $tar_dir/$EU_FOLDER.tar.gz \n\n"
  else
    print0 "EU Stack files not found.\n"
    exit 1;
  fi
}
$SKIP_REPORT ]; then COLLECT_LOGS=true; fi; VALID=true; fi + if [[ "$params" == "--path"* ]]; then USER_PROVIDED_OUTPUT_PATH=$(echo "$params" | awk -F= '{print $2}'); VALID=true; fi if [ "$params" == '--backupdbrm' ]; then BACKUP_DBRM=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi if [ "$params" == '--testschema' ]; then TEST_SCHEMA=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi if [ "$params" == '--testschemakeep' ]; then TEST_SCHEMA_KEEP=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi @@ -2175,13 +2511,17 @@ for params in "$@"; do if [ "$params" == '--pscs' ]; then PSCS_ALIAS=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi if [ "$params" == '--schemasync' ]; then SCHEMA_SYNC=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi if [ "$params" == '--tmpdir' ]; then FIX_TMP_DIR=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi + if [ "$params" == '--clearrollback' ]; then CLEARROLLBACK=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi + if [ "$params" == '--checkports' ]; then SKIP_REPORT=true; CHECKPORTS=true;VALID=true; fi + if [ "$params" == '--killcolumnstore' ]; then KILLCS=true; SKIP_REPORT=true; unset COLLECT_LOGS; VALID=true; fi + if [ "$params" == '--eustack' ]; then SKIP_REPORT=true; COLLECT_EU_STACK=true;VALID=true; fi if [ ! 
$VALID ]; then INVALID_INPUT=$params; fi done prepare_for_run exists_client_able_to_connect_with_socket if [ $DISPLAY_VERSION ]; then exit 0; fi -if [ $INVALID_INPUT ]; then TEMP_COLOR=lred; print_color "Invalid parameter: ";ech0 $INVALID_INPUT; ech0; unset TEMP_COLOR; fi +if [ $INVALID_INPUT ]; then TEMP_COLOR=lred; print_color "Invalid parameter: ";ech0 $INVALID_INPUT; ech0; unset TEMP_COLOR; exit 1; fi if [ $HELP ]||[ $INVALID_INPUT ]; then display_help_message exit 0 @@ -2265,6 +2605,7 @@ report_cs_table_locks report_columnstore_query_count report_calpontsys_exists report_columnstore_tables +SUPPRESS_CLOSED_PORTS=true; check_ports TEMP_COLOR=lblue; print_color "===================== LOGS =====================\n"; unset TEMP_COLOR report_host_datetime report_last_10_error_log_error @@ -2327,4 +2668,18 @@ if [ $FIX_TMP_DIR ]; then ensure_owner_privs_of_tmp_dir fi +if [ $CLEARROLLBACK ]; then + clear_rollback +fi +if [ $CHECKPORTS ]; then + check_ports +fi + +if [ $KILLCS ]; then + kill_columnstore +fi + +if [ $COLLECT_EU_STACK ]; then + get_eu_stack +fi \ No newline at end of file