diff --git a/DEVELOPING.md b/DEVELOPING.md index 81f9beb5d..7a9b18d9f 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -1,3 +1,11 @@ +## DBRM unresponsive timeout + +The master DBRM node previously used a fixed 300s (5 minutes) timeout before forcing read-only mode when a worker didn't respond. This is now configurable via Columnstore.xml only: + +- SystemConfig/DBRMUnresponsiveTimeout (seconds, default 300) + +The value controls how long the master waits for workers to reconfigure/respond after a network error is detected before switching to read-only. A value that does not parse to a positive number of seconds leaves the 300-second default in place. See `versioning/BRM/masterdbrmnode.cpp` for details. + This file documents helpful knowledge, commands and setup instructions for better developing flow. ## Logging @@ -18,7 +26,7 @@ std::cout << row.toString() << std::endl; ## Restarting services after a crash -Sometimes, e.g. during debugging, single processes can crash. +Sometimes, e.g. during debugging, single processes can crash. To restart a MCS specific unit process run: @@ -48,7 +56,7 @@ For interaction with storage engines, MariaDB has a template that is basically a Especially during debugging you might end up killing a process, which leads to error messages like: -`ERROR 1815 (HY000): Internal error: MCS-2033: Error occurred when calling system catalog.` +`ERROR 1815 (HY000): Internal error: MCS-2033: Error occurred when calling system catalog.` This error message occurs when the `PrimProc` process is killed, but all other processes continue running and cannot access the system catalog which is served by `PrimProc`. @@ -58,7 +66,7 @@ You can verified that this happened by having a look at all running processes fo ps -axwffu | grep mysql ``` -And restart any service via +And restart any service via ```bash systemctl restart mcs- @@ -85,6 +93,6 @@ Using the provided Vagrantfile the setup of develop VM is as easy as: 1. `MARIA_DB_SERVER_REPOSITORY` and `MCS_REPOSITORY` . These options expect the HTTPS GitHub URL of the referenced repositories. 
If a build with a fork of the official repos is wanted, this is where the fork URLs should be provided. (For any questions regarding a general build, please refer to the `BUILD.md`). 2. `PROVIDER` . Vagrant allows to configure the underlying VM software used (the so called provider). The current version of the Vagrantfile uses VMWare as a VM provider. VMware provides free licenses for personal use, students and open-source development, otherwise it is a paid service. If you don’t have a license or want to use another provider either way, you can either use the out of the box provided VirtualBox provider or install another provider. Read more about Vagrant VM providers [here](https://developer.hashicorp.com/vagrant/docs/providers). Read more about how to install VMWare as a provider [here](https://developer.hashicorp.com/vagrant/docs/providers/vmware/installation). 3. `BOX` . Vagrant uses boxes to package Vagrant environments. The box needs to match your system and architecture. The easiest way to obtain a a box is to select one from the publicly available, pre-defined boxes at [VagrantCloud](https://app.vagrantup.com/boxes/search). - 4. `MEMSIZE/NUMVCPUS`: Adapt the number of cores and the amount of RAM you want to give your VM. -2. Run `vagrant up` to create and/or start the virtual machine as specified in the `Vagrantfile`. + 4. `MEMSIZE/NUMVCPUS`: Adapt the number of cores and the amount of RAM you want to give your VM. +2. Run `vagrant up` to create and/or start the virtual machine as specified in the `Vagrantfile`. 3. Run `vagrant ssh` to obtain a terminal directly in your VM - or to develop on the virtual machine in your preferred IDE, obtain the ssh config data of the machine with `vagrant ssh-config` and use it to connect. (For even easier connection add the ssh connection data to your `~/.ssh/config` .) 
\ No newline at end of file diff --git a/cmapi/cmapi_server/SingleNode.xml b/cmapi/cmapi_server/SingleNode.xml index 67c8637bd..cd8c2ce24 100644 --- a/cmapi/cmapi_server/SingleNode.xml +++ b/cmapi/cmapi_server/SingleNode.xml @@ -58,6 +58,7 @@ /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves /var/lib/columnstore/data1/systemFiles/dbrm/tablelocks 15 + 300 100000 10 95 diff --git a/cmapi/cmapi_server/test/CS-config-test.xml b/cmapi/cmapi_server/test/CS-config-test.xml index ac4995629..5f851bb18 100644 --- a/cmapi/cmapi_server/test/CS-config-test.xml +++ b/cmapi/cmapi_server/test/CS-config-test.xml @@ -237,6 +237,7 @@ /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves /var/lib/columnstore/data1/systemFiles/dbrm/tablelocks 15 + 300 100000 10 95 diff --git a/cmapi/cmapi_server/test/Columnstore_apply_config.xml b/cmapi/cmapi_server/test/Columnstore_apply_config.xml index 580a829ae..4eb618d24 100644 --- a/cmapi/cmapi_server/test/Columnstore_apply_config.xml +++ b/cmapi/cmapi_server/test/Columnstore_apply_config.xml @@ -239,6 +239,7 @@ /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves /var/lib/columnstore/data1/systemFiles/dbrm/tablelocks 20 + 300 100000 10 95 diff --git a/versioning/BRM/brmtypes.h b/versioning/BRM/brmtypes.h index 336db709f..b17f96028 100644 --- a/versioning/BRM/brmtypes.h +++ b/versioning/BRM/brmtypes.h @@ -429,7 +429,7 @@ EXPORT void log(const std::string& msg, logging::LOG_TYPE = logging::LOG_TYPE_CR EXPORT void log_errno(const std::string& msg, logging::LOG_TYPE = logging::LOG_TYPE_CRITICAL); EXPORT void errString(int rc, std::string& errMsg); -const struct timespec FIVE_MIN_TIMEOUT = {300, 0}; +// Note: the unresponsive-node timeout (formerly FIVE_MIN_TIMEOUT) is now the configurable MasterDBRMNode::haltTimeout (SystemConfig/DBRMUnresponsiveTimeout) and is no longer defined here. /* Function identifiers used for master-slave communication. 
diff --git a/versioning/BRM/masterdbrmnode.cpp b/versioning/BRM/masterdbrmnode.cpp index 4f2a32bbd..a48b6277a 100644 --- a/versioning/BRM/masterdbrmnode.cpp +++ b/versioning/BRM/masterdbrmnode.cpp @@ -107,6 +107,16 @@ MasterDBRMNode::MasterDBRMNode() MSG_TIMEOUT.tv_sec = secondsToWait; else MSG_TIMEOUT.tv_sec = 20; + + // Configurable unresponsive timeout (default 300 seconds) + haltTimeout = {300, 0}; + std::string unrespStr = config->getConfig("SystemConfig", "DBRMUnresponsiveTimeout"); + int unrespSecs = config->fromText(unrespStr); + if (unrespSecs > 0) + { + haltTimeout.tv_sec = unrespSecs; + haltTimeout.tv_nsec = 0; + } } MasterDBRMNode::~MasterDBRMNode() @@ -534,16 +544,16 @@ void MasterDBRMNode::msgProcessor() retrycmd: uint32_t haltloops = 0; - while (halting && ++haltloops < static_cast(FIVE_MIN_TIMEOUT.tv_sec)) + while (halting && ++haltloops < static_cast(haltTimeout.tv_sec)) sleep(1); slaveLock.lock(); - if (haltloops == FIVE_MIN_TIMEOUT.tv_sec) + if (haltloops == static_cast(haltTimeout.tv_sec)) { ostringstream os; os << "A node is unresponsive for cmd = " << (uint32_t)cmd << ", no reconfigure in at least " - << FIVE_MIN_TIMEOUT.tv_sec << " seconds. Setting read-only mode."; + << haltTimeout.tv_sec << " seconds. Setting read-only mode."; log(os.str()); readOnly = true; halting = false; @@ -832,7 +842,9 @@ int MasterDBRMNode::gatherResponses(uint8_t cmd, uint32_t cmdMsgLength, vector 0) ? 
(haltTimeout.tv_sec / newtimeout.tv_sec) : 0; + if (ntRetries == 0) + ntRetries = 1; uint32_t retries = 0; while (++retries < ntRetries && tmp->length() == 0 && !halting) diff --git a/versioning/BRM/masterdbrmnode.h b/versioning/BRM/masterdbrmnode.h index a29e60c56..407f0b3dc 100644 --- a/versioning/BRM/masterdbrmnode.h +++ b/versioning/BRM/masterdbrmnode.h @@ -258,6 +258,9 @@ class MasterDBRMNode volatile bool die, halting; bool reloadCmd; mutable bool readOnly; + // Maximum time to wait for worker responses/reconfigure before forcing read-only + // Loaded from Columnstore.xml: SystemConfig/DBRMUnresponsiveTimeout (default: 300 seconds) + struct timespec haltTimeout; mutable bool waitToFinishJobs{false}; struct timespec MSG_TIMEOUT; };