From a5a30922023f0492d34632e781ec7cc949b76e94 Mon Sep 17 00:00:00 2001 From: Roman Nozdrin Date: Wed, 30 Mar 2022 08:57:05 +0000 Subject: [PATCH] MCOL-4912 This patch introduces Extent Map index to improve EM scalability The EM scalability project has two parts: phase1 and phase2. This is phase1, which brings an EM index to speed up (from O(n) down to the speed of boost::unordered_map) EM lookups that turn a (dbroot, OID, partition) tuple into an LBID, e.g. most bulk insertion meta info operations. The basis is boost::shared_managed_object where EMIndex is stored. Whilst it is not debug-friendly, it allows nested structs to be put into shmem. EMIndex has 3 tiers. Top-down description: vector of dbroots, map of oids to partition vectors, partition vectors that have EM indices. Separate EM methods now query the index before they do an EM run. EMIndex has a separate shmem file with the fixed id MCS-shm-00060001. --- dbcon/mysql/ha_mcs_client_udfs.cpp | 64 +- tests/CMakeLists.txt | 5 + tests/brm-em-standalone.cpp | 1914 +++++++++++++++++++++++++ tools/clearShm/main.cpp | 36 +- utils/rwlock/rwlock.h | 7 + versioning/BRM/brmshmimpl.cpp | 172 ++- versioning/BRM/brmshmimpl.h | 82 +- versioning/BRM/dbrm.cpp | 23 +- versioning/BRM/dbrm.h | 9 +- versioning/BRM/extentmap.cpp | 1246 +++++++++++----- versioning/BRM/extentmap.h | 220 ++- versioning/BRM/lock_grabber.cpp | 61 +- versioning/BRM/lock_state.cpp | 47 +- versioning/BRM/mastersegmenttable.cpp | 1 + versioning/BRM/mastersegmenttable.h | 4 +- versioning/BRM/shmkeys.cpp | 3 +- versioning/BRM/shmkeys.h | 1 + versioning/BRM/slavedbrmnode.cpp | 5 + versioning/BRM/slavedbrmnode.h | 1 + versioning/BRM/slavenode.cpp | 2 + writeengine/bulk/cpimport.cpp | 22 +- writeengine/xml/we_xmlgenproc.cpp | 21 +- writeengine/xml/we_xmlgenproc.h | 3 + writeengine/xml/we_xmljob.cpp | 11 +- writeengine/xml/we_xmljob.h | 2 +- 25 files changed, 3498 insertions(+), 464 deletions(-) create mode 100644 tests/brm-em-standalone.cpp diff --git a/dbcon/mysql/ha_mcs_client_udfs.cpp 
b/dbcon/mysql/ha_mcs_client_udfs.cpp index 2ea1a68a2..580c2627f 100644 --- a/dbcon/mysql/ha_mcs_client_udfs.cpp +++ b/dbcon/mysql/ha_mcs_client_udfs.cpp @@ -1,5 +1,5 @@ /* Copyright (C) 2014 InfiniDB, Inc. - Copyright (C) 2016 MariaDB Corporation + Copyright (C) 2016-2022 MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -52,6 +52,8 @@ extern "C" const char* SetParmsPrelude = "Updated "; const char* SetParmsError = "Invalid parameter: "; const char* InvalidParmSize = "Invalid parameter size: Input value cannot be larger than "; + const char* MsgEMIndexSizeInitErrMsg = "mcs_emindex_size() takes no arguments"; + const char* MsgEMIndexFreeInitErrMsg = "mcs_emindex_free() takes no arguments"; const size_t Plen = strlen(SetParmsPrelude); const size_t Elen = strlen(SetParmsError); @@ -864,4 +866,64 @@ extern "C" { } +#ifdef _MSC_VER + __declspec(dllexport) +#endif + long long mcs_emindex_size(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* error) + { + DBRM dbrm; + return dbrm.EMIndexShmemSize(); + } + +#ifdef _MSC_VER + __declspec(dllexport) +#endif + my_bool mcs_emindex_size_init(UDF_INIT* initid, UDF_ARGS* args, char* message) + { + if (args->arg_count != 0) + { + strcpy(message, MsgEMIndexSizeInitErrMsg); + return 1; + } + + return 0; + } + +#ifdef _MSC_VER + __declspec(dllexport) +#endif + void mcs_emindex_size_deinit(UDF_INIT* initid) + { + } + +#ifdef _MSC_VER + __declspec(dllexport) +#endif + long long mcs_emindex_free(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* error) + { + DBRM dbrm; + return dbrm.EMIndexShmemFree(); + } + +#ifdef _MSC_VER + __declspec(dllexport) +#endif + my_bool mcs_emindex_free_init(UDF_INIT* initid, UDF_ARGS* args, char* message) + { + if (args->arg_count != 0) + { + strcpy(message, MsgEMIndexFreeInitErrMsg); + return 1; + } + + return 0; + } + +#ifdef _MSC_VER + __declspec(dllexport) +#endif + void 
mcs_emindex_free_deinit(UDF_INIT* initid) + { + } + } // extern "C" diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 36c9161cd..2b45e83c3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -49,6 +49,11 @@ if (WITH_UNITTESTS) target_link_libraries(comparators_tests ${ENGINE_LDFLAGS} ${MARIADB_CLIENT_LIBS} ${ENGINE_WRITE_LIBS} ${CPPUNIT_LIBRARIES} cppunit) add_test(NAME columnstore:comparators_tests, COMMAND comparators_tests) + # standalone EM routines test + # add_executable(brm_em_standalone brm-em-standalone.cpp) + # target_link_libraries(brm_em_standalone ${ENGINE_LDFLAGS} ${MARIADB_CLIENT_LIBS} ${ENGINE_WRITE_LIBS} ${CPPUNIT_LIBRARIES} cppunit) + # install(TARGETS brm_em_standalone DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine) + endif() # Saving this as the example of the microbench diff --git a/tests/brm-em-standalone.cpp b/tests/brm-em-standalone.cpp new file mode 100644 index 000000000..92b410107 --- /dev/null +++ b/tests/brm-em-standalone.cpp @@ -0,0 +1,1914 @@ +/* Copyright (C) 2014 InfiniDB, Inc. + Copyright (C) 2016-2021 MariaDB Corporation + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; version 2 of + the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + MA 02110-1301, USA. 
*/ + +/***************************************************************************** + * $Id: tdriver-dbrm2.cpp 1823 2013-01-21 14:13:09Z rdempsey $ + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "brm.h" +#include "extentmap.h" +#include "IDBPolicy.h" + +//#define BRM_VERBOSE 1 + +using namespace BRM; +using namespace std; + +int threadStop; +int oid = 1; +uint64_t opCount = 0; +LBID_t lbidCounter = 0; +VER_t nextTxnID = 1; +u_int64_t vbOffset = 0; +pthread_mutex_t pthreadMutex; +const std::vector colWidthsAvailable = {1, 2, 4, 8, 16}; +const DBRootT dbroot = 1; +const uint32_t KibiBlocks = 1024; + +struct Range +{ + LBID_t start, end, nextBlock; + VER_t txnID; + Range() + { + start = end = nextBlock = 0; + txnID = 0; + } +}; + +struct EMEntries +{ + LBID_t LBIDstart; + uint32_t size; + OID_t OID; + uint32_t FBO; + uint32_t HWM; + uint32_t secondHWM; + int32_t txnID; + DBRootT dbroot; + PartitionNumberT partNum; + SegmentT segNum; + struct EMEntries* next; + EMEntries() : HWM(0), secondHWM(0), txnID(0), next(nullptr) + { } + EMEntries(const uint32_t aSize, const OID_t aOid, const uint32_t aFbo, + const LBID_t aLBIDStart, EMEntries* aNext) + : LBIDstart(aLBIDStart), size(aSize), OID(aOid), FBO(aFbo), HWM(0), + secondHWM(0), txnID(0), next(aNext) + { } + EMEntries(const uint32_t aSize, const OID_t aOid, const uint32_t aFbo, + const LBID_t aLBIDStart, EMEntries* aNext, const DBRootT aDbroot, + const PartitionNumberT aPartNum, const SegmentT aSegNum) + : LBIDstart(aLBIDStart), size(aSize), OID(aOid), FBO(aFbo), HWM(0), + secondHWM(0), txnID(0), dbroot(aDbroot), partNum(aPartNum), + segNum(aSegNum), next(aNext) + { } +}; +/* +static void* BRMRunner_2(void* arg) +{ + + vector copyList, copiedList, committedList; + vector::iterator rit; + vector writtenList; + vector::iterator lit; + + 
pthread_mutex_t listMutex; + int op; + uint32_t randstate; + DBRM* brm; + struct timeval tv; + VER_t txnID; + + pthread_mutex_init(&listMutex, NULL); + gettimeofday(&tv, NULL); + randstate = static_cast(tv.tv_usec); + brm = new DBRM(); + + while (!threadStop) + { + op = rand_r(&randstate) % 9; + + switch (op) + { + case 0: // beginVBCopy + { + int blockCount, size, err; + Range newEntry; + VBRange_v vbRanges; + VBRange_v::iterator vit; + LBIDRange_v ranges; + LBIDRange range; + + size = rand_r(&randstate) % 10000; + + pthread_mutex_lock(&pthreadMutex); + newEntry.start = lbidCounter; + lbidCounter += size; + txnID = nextTxnID++; + pthread_mutex_unlock(&pthreadMutex); + + newEntry.nextBlock = newEntry.start; + newEntry.end = newEntry.start + size; + range.start = newEntry.start; + range.size = size; + + err = brm->beginVBCopy(txnID, dbroot, ranges, vbRanges); + CPPUNIT_ASSERT(err == 0); + + for (blockCount = 0, vit = vbRanges.begin(); vit != vbRanges.end(); vit++) + blockCount += (*vit).size; + + CPPUNIT_ASSERT(blockCount == size); + + pthread_mutex_lock(&listMutex); + copyList.push_back(newEntry); + pthread_mutex_unlock(&listMutex); + + err = brm->beginVBCopy(txnID, dbroot, ranges, vbRanges); + CPPUNIT_ASSERT(err == -1); + break; + } + + case 1: // writeVBEntry + { + int randIndex; + Range* entry; + + pthread_mutex_lock(&listMutex); + + if (copyList.size() == 0) + break; + + randIndex = rand_r(&randstate) % copyList.size(); + entry = &(copyList[randIndex]); + entry->nextBlock++; + txnID = entry->txnID; + break; + } + + default: + cerr << "not finished yet" << endl; + } + } + + return NULL; +} +*/ + +/* +static void* BRMRunner_1(void* arg) +{ + + // keep track of LBID ranges allocated here and + // randomly allocate, lookup, delete, get/set HWM, and + // destroy the EM object. 
+ +#ifdef BRM_VERBOSE + int threadNum = reinterpret_cast(arg); +#endif + int op, listSize = 0, i; + uint32_t randstate; + struct EMEntries* head = NULL, *tmp; + struct timeval tv; + DBRM* brm; + ExtentMap em; + vector lbids; + LBID_t lbid; + uint32_t colWidth; + PartitionNumberT partNum; + SegmentT segmentNum; + execplan::CalpontSystemCatalog::ColDataType colDataType; + +#ifdef BRM_VERBOSE + cerr << "thread number " << threadNum << " started." << endl; +#endif + + gettimeofday(&tv, NULL); + randstate = static_cast(tv.tv_usec); + brm = new DBRM(); + + + while (!threadStop) + { + auto randNumber = rand_r(&randstate); + op = randNumber % 10; + colWidth = colWidthsAvailable[randNumber % colWidthsAvailable.size()]; + partNum = randNumber % std::numeric_limits::max(); + segmentNum = randNumber % std::numeric_limits::max(); + colDataType = (execplan::CalpontSystemCatalog::ColDataType) (randNumber % (int)execplan::CalpontSystemCatalog::ColDataType::TIMESTAMP); +#ifdef BRM_VERBOSE + cerr << "next op is " << op << endl; +#endif + + switch (op) + { + case 0: //allocate space for a new file + { + struct EMEntries* newEm; + size_t size = rand_r(&randstate) % 102399 + 1; + int entries, OID, allocdSize, err; + uint32_t startBlockOffset; + + + pthread_mutex_lock(&pthreadMutex); + OID = oid++; + opCount++; + pthread_mutex_unlock(&pthreadMutex); + lbids.clear(); + for (size_t i = 0; i < size; ++i) + { + err = brm->createColumnExtent_DBroot(OID, colWidth, dbroot, + partNum, segmentNum, colDataType, lbid, allocdSize, startBlockOffset); + CPPUNIT_ASSERT(err == 0); + lbids.push_back(lbid); + } + + entries = size / brm->getExtentSize(); + + if ((size % brm->getExtentSize()) != 0) + entries++; + + if ((uint32_t)entries != lbids.size()) + cerr << "entries = " << entries << " lbids.size = " << lbids.size() << endl; + + CPPUNIT_ASSERT((uint32_t)entries == lbids.size()); + + for (i = 0 ; i < entries; i++) + { + + newEm = new EMEntries(brm->getExtentSize(), OID, brm->getExtentSize(), lbids[i], 
+ head, dbroot, partNum, segmentNum); + head = newEm; + listSize++; + } + +#ifdef BRM_VERBOSE + cerr << "created new space for OID " << newEm->OID << endl; +#endif + em.checkConsistency(); + break; + } + + case 1: //allocate space for an existing file + { + if (listSize == 0) + break; + + struct EMEntries* newEm, *tmp; + int size = rand_r(&randstate) % 102399 + 1; + int fileRand = rand_r(&randstate) % listSize; + int i, lastExtent, blockEnd, oid; + int tmpHWM, entries, allocdSize, err; + uint32_t startBlockOffset; + vector lbids; + LBID_t lbid; + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + for (lastExtent = 0, tmp = head; tmp != NULL; tmp = tmp->next) + { + if (tmp->OID != oid) + continue; + + tmpHWM = tmp->HWM; + blockEnd = tmp->FBO + tmp->size; + + if (lastExtent < blockEnd) + lastExtent = blockEnd; + } + + err = brm->createColumnExtentExactFile(oid, colWidth, dbroot, + partNum, segmentNum, colDataType, lbid, allocdSize, startBlockOffset); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + + entries = size / brm->getExtentSize(); + + if ((size % brm->getExtentSize()) != 0) + entries++; + + CPPUNIT_ASSERT((uint32_t)entries == lbids.size()); + + for (i = 0; i < entries; i++) + { + + newEm = new EMEntries((i != entries) ? 
brm->getExtentSize() : size % brm->getExtentSize(), + oid, lastExtent + (i * brm->getExtentSize()), + lbids[i], head, dbroot, partNum, segmentNum); + newEm->HWM = tmpHWM; + head = newEm; + listSize++; + } + +#ifdef BRM_VERBOSE + cerr << "created another extent for OID " << newEm->OID << endl; +#endif + em.checkConsistency(); + break; + } + + case 2: //delete an OID + { + if (listSize == 0) + break; + + struct EMEntries* tmp, *prev; + int fileRand = rand_r(&randstate) % listSize; + int i, oid, err; + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + err = brm->deleteOID(oid); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + + for (tmp = head; tmp != NULL;) + { + if (tmp->OID == oid) + { + if (tmp == head) + { + head = head->next; + delete tmp; + tmp = head; + } + else + { + prev->next = tmp->next; + delete tmp; + tmp = prev->next; + } + + listSize--; + } + else + { + prev = tmp; + tmp = tmp->next; + } + } + +#ifdef BRM_VERBOSE + cerr << "deleted OID " << oid << endl; +#endif + em.checkConsistency(); + break; + } + + case 3: //lookup by LBID + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset, oid; + struct EMEntries* tmp; + LBID_t target; + uint32_t fbo; + DBRootT localDbroot; + PartitionNumberT localPartNum; + SegmentT localSegmentNum; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + + target = tmp->LBIDstart + offset; + err = brm->lookupLocal(target, 0, false, oid, localDbroot, localPartNum, + localSegmentNum, fbo); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); +#ifdef BRM_VERBOSE + cerr << "looked up LBID " << target << " got oid " << oid << " fbo " << fbo << endl; + cerr << " oid should be " << tmp->OID << " fbo should be " << offset + tmp->FBO << endl; +#endif + CPPUNIT_ASSERT(err == 0); + 
CPPUNIT_ASSERT(oid == tmp->OID); + CPPUNIT_ASSERT(fbo == offset + tmp->FBO); + em.checkConsistency(); + break; + } + + case 4: //lookup by OID, FBO + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, oid, err, offset; + struct EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + oid = tmp->OID; + + err = brm->lookupLocal(oid, partNum, segmentNum, offset + tmp->FBO, lbid); + + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); +#ifdef BRM_VERBOSE + cerr << "looked up OID " << oid << " fbo " << offset + tmp->FBO << + " got lbid " << lbid << endl; + cerr << " lbid should be " << tmp->LBIDstart + offset << endl; +#endif + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(lbid == static_cast(tmp->LBIDstart + offset)); + em.checkConsistency(); + break; + } + + case 5: //getHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, status; + struct EMEntries* tmp; + uint32_t hwm; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + err = brm->getLocalHWM(tmp->OID, partNum, segmentNum, hwm, status); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); +#ifdef BRM_VERBOSE + cerr << "stored HWM for OID " << tmp->OID << " is " << tmp->HWM + << " BRM says it's " << hwm << endl; +#endif + CPPUNIT_ASSERT(hwm == tmp->HWM); + em.checkConsistency(); + break; + } + + case 6: //setHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, hwm, oid, err; + struct EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + hwm = rand_r(&randstate) % (tmp->FBO + brm->getExtentSize()); + err = brm->setLocalHWM(oid, tmp->partNum, tmp->segNum, hwm); + pthread_mutex_lock(&pthreadMutex); + opCount++; + 
pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + + for (tmp = head; tmp != NULL; tmp = tmp->next) + if (tmp->OID == oid) + tmp->HWM = hwm; + +#ifdef BRM_VERBOSE + cerr << "setHWM of OID " << oid << " to " << hwm << endl; +#endif + em.checkConsistency(); + break; + } + + case 7: // renew this EM object + { + delete brm; + brm = new DBRM(); +#ifdef BRM_VERBOSE + cerr << "got a new BRM instance" << endl; +#endif + em.checkConsistency(); + break; + } +#if 0 + case 8: //getBulkInsertVars + { + if (listSize == 0) + break; + + HWM_t hwm; + VER_t txnID; + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + lbid = tmp->LBIDstart + offset; + err = brm->getBulkInsertVars(lbid, hwm, txnID); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(hwm == tmp->secondHWM); + CPPUNIT_ASSERT(txnID == tmp->txnID); + break; + } + + case 9: //setBulkInsertVars + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + tmp->secondHWM = rand_r(&randstate) % MAXINT; + tmp->txnID = rand_r(&randstate) % MAXINT; + err = brm->setBulkInsertVars(tmp->LBIDstart + offset, + tmp->secondHWM, tmp->txnID); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + break; + } +#endif + default: + break; + } + } + + delete brm; + + while (head != NULL) + { + tmp = head->next; + delete head; + head = tmp; + } + +#ifdef BRM_VERBOSE + cerr << "thread " << threadNum << " exiting" << endl; +#endif + return NULL; +} +*/ +DBRM brm_si; +/* +static void* BRMRunner_si(void* arg) +{ + + // keep track of LBID ranges 
allocated here and + // randomly allocate, lookup, delete, get/set HWM, and + // destroy the EM object. + +#ifdef BRM_VERBOSE + int threadNum = reinterpret_cast(arg); +#endif + int op, listSize = 0, i; + uint32_t randstate; + struct EMEntries* head = NULL, *tmp; + struct timeval tv; + ExtentMap em; + vector lbids; + +#ifdef BRM_VERBOSE + cerr << "thread number " << threadNum << " started." << endl; +#endif + + gettimeofday(&tv, NULL); + randstate = static_cast(tv.tv_usec); + + while (!threadStop) + { + op = rand_r(&randstate) % 10; +#ifdef BRM_VERBOSE + cerr << "next op is " << op << endl; +#endif + + switch (op) + { + case 0: //allocate space for a new file + { + struct EMEntries* newEm; + int size = rand_r(&randstate) % 102399 + 1; + int entries, OID, allocdSize, err; + + pthread_mutex_lock(&pthreadMutex); + OID = oid++; + opCount++; + pthread_mutex_unlock(&pthreadMutex); + + err = brm_si.createExtent(size, OID, lbids, allocdSize); + CPPUNIT_ASSERT(err == 0); + + entries = size / brm_si.getExtentSize(); + + if ((size % brm_si.getExtentSize()) != 0) + entries++; + + CPPUNIT_ASSERT((uint32_t)entries == lbids.size()); + + for (i = 0 ; i < entries; i++) + { + + newEm = new EMEntries(); + newEm->size = brm_si.getExtentSize(); + newEm->OID = OID; + newEm->FBO = i * brm_si.getExtentSize(); + newEm->LBIDstart = lbids[i]; + + newEm->next = head; + head = newEm; + listSize++; + } + +#ifdef BRM_VERBOSE + cerr << "created new space for OID " << newEm->OID << endl; +#endif + em.checkConsistency(); + break; + } + + case 1: //allocate space for an existing file + { + if (listSize == 0) + break; + + struct EMEntries* newEm, *tmp; + int size = rand_r(&randstate) % 102399 + 1; + int fileRand = rand_r(&randstate) % listSize; + int i, lastExtent, blockEnd, oid; + int tmpHWM, entries, allocdSize, err; + vector lbids; + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + for (lastExtent = 0, tmp = head; tmp != NULL; tmp = tmp->next) + { + if 
(tmp->OID != oid) + continue; + + tmpHWM = tmp->HWM; + blockEnd = tmp->FBO + tmp->size; + + if (lastExtent < blockEnd) + lastExtent = blockEnd; + } + + err = brm_si.createExtent(size, oid, lbids, allocdSize); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + + entries = size / brm_si.getExtentSize(); + + if ((size % brm_si.getExtentSize()) != 0) + entries++; + + CPPUNIT_ASSERT((uint32_t)entries == lbids.size()); + + for (i = 0; i < entries; i++) + { + + newEm = new EMEntries(); + + if (i != entries) + newEm->size = brm_si.getExtentSize(); + else + newEm->size = size % brm_si.getExtentSize(); + + newEm->OID = oid; + newEm->FBO = lastExtent + (i * brm_si.getExtentSize()); + newEm->LBIDstart = lbids[i]; + newEm->HWM = tmpHWM; + + newEm->next = head; + head = newEm; + listSize++; + } + +#ifdef BRM_VERBOSE + cerr << "created another extent for OID " << newEm->OID << endl; +#endif + em.checkConsistency(); + break; + } + + case 2: //delete an OID + { + if (listSize == 0) + break; + + struct EMEntries* tmp, *prev; + int fileRand = rand_r(&randstate) % listSize; + int i, oid, err; + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + err = brm_si.deleteOID(oid); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + + for (tmp = head; tmp != NULL;) + { + if (tmp->OID == oid) + { + if (tmp == head) + { + head = head->next; + delete tmp; + tmp = head; + } + else + { + prev->next = tmp->next; + delete tmp; + tmp = prev->next; + } + + listSize--; + } + else + { + prev = tmp; + tmp = tmp->next; + } + } + +#ifdef BRM_VERBOSE + cerr << "deleted OID " << oid << endl; +#endif + em.checkConsistency(); + break; + } + + case 3: //lookup by LBID + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset, oid; + struct EMEntries* tmp; + LBID_t target; + uint32_t fbo; + + for 
(i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + + target = tmp->LBIDstart + offset; + err = brm_si.lookup(target, 0, false, oid, fbo); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); +#ifdef BRM_VERBOSE + cerr << "looked up LBID " << target << " got oid " << oid << " fbo " << fbo << endl; + cerr << " oid should be " << tmp->OID << " fbo should be " << offset + tmp->FBO << endl; +#endif + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(oid == tmp->OID); + CPPUNIT_ASSERT(fbo == offset + tmp->FBO); + em.checkConsistency(); + break; + } + + case 4: //lookup by OID, FBO + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, oid, err, offset; + struct EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + oid = tmp->OID; + + err = brm_si.lookup(oid, offset + tmp->FBO, lbid); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); +#ifdef BRM_VERBOSE + cerr << "looked up OID " << oid << " fbo " << offset + tmp->FBO << + " got lbid " << lbid << endl; + cerr << " lbid should be " << tmp->LBIDstart + offset << endl; +#endif + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(lbid == static_cast(tmp->LBIDstart + offset)); + em.checkConsistency(); + break; + } + + case 5: //getHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err; + struct EMEntries* tmp; + uint32_t hwm; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + err = brm_si.getHWM(tmp->OID, hwm); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); +#ifdef BRM_VERBOSE + cerr << "stored HWM for OID " << tmp->OID << " is " << tmp->HWM + << " BRM says it's " << hwm << endl; +#endif + CPPUNIT_ASSERT(hwm == tmp->HWM); + em.checkConsistency(); + 
break; + } + + case 6: //setHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, hwm, oid, err; + struct EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + hwm = rand_r(&randstate) % (tmp->FBO + brm_si.getExtentSize()); + err = brm_si.setHWM(oid, hwm); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + + for (tmp = head; tmp != NULL; tmp = tmp->next) + if (tmp->OID == oid) + tmp->HWM = hwm; + +#ifdef BRM_VERBOSE + cerr << "setHWM of OID " << oid << " to " << hwm << endl; +#endif + em.checkConsistency(); + break; + } + + case 7: //getBulkInsertVars + { + if (listSize == 0) + break; + + HWM_t hwm; + VER_t txnID; + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + lbid = tmp->LBIDstart + offset; + err = brm_si.getBulkInsertVars(lbid, hwm, txnID); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(hwm == tmp->secondHWM); + CPPUNIT_ASSERT(txnID == tmp->txnID); + break; + } + + case 8: //setBulkInsertVars + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + tmp->secondHWM = rand_r(&randstate) % MAXINT; + tmp->txnID = rand_r(&randstate) % MAXINT; + err = brm_si.setBulkInsertVars(tmp->LBIDstart + offset, + tmp->secondHWM, tmp->txnID); + pthread_mutex_lock(&pthreadMutex); + opCount++; + pthread_mutex_unlock(&pthreadMutex); + CPPUNIT_ASSERT(err == 0); + break; + } + + default: + break; + } + } + + while (head != NULL) + { + tmp = head->next; + delete head; + head = tmp; + } + +#ifdef 
BRM_VERBOSE + cerr << "thread " << threadNum << " exiting" << endl; +#endif + return NULL; +} +*/ + +static void* EMRunner(void* arg) +{ + // keep track of LBID ranges allocated here and + // randomly allocate, lookup, delete, get/set HWM, and + // destroy the EM object. + +#ifdef BRM_VERBOSE + uint64_t threadNum = (uint64_t)arg; +#endif + int op, listSize = 0; + uint32_t randstate; + struct EMEntries* head = NULL, *tmp; + struct timeval tv; + ExtentMap* em; + LBID_t lbid; + uint32_t colWidth; + PartitionNumberT partNum; + SegmentT segmentNum; + execplan::CalpontSystemCatalog::ColDataType colDataType; + +#ifdef BRM_VERBOSE + cerr << "thread number " << threadNum << " started." << endl; +#endif + + gettimeofday(&tv, NULL); + randstate = static_cast(tv.tv_usec); + //pthread_mutex_lock(&pthreadMutex); + em = new ExtentMap(); + //pthread_mutex_unlock(&pthreadMutex); + + while (!threadStop) + { + auto randNumber = rand_r(&randstate); + op = randNumber % 10; + + colWidth = colWidthsAvailable[randNumber % colWidthsAvailable.size()]; + partNum = randNumber % std::numeric_limits::max(); + segmentNum = randNumber % std::numeric_limits::max(); + colDataType = (execplan::CalpontSystemCatalog::ColDataType) (randNumber % (int)execplan::CalpontSystemCatalog::ColDataType::TIMESTAMP); +#ifdef BRM_VERBOSE + cerr << "next op is " << op << endl; +#endif + + switch (op) + { + case 0: //allocate space for a new file + { + vector emEntriesVec; + struct EMEntries* newEm; + size_t numberOfExtents = randNumber % 4 + 1; + int OID; + uint32_t startBlockOffset; + + pthread_mutex_lock(&pthreadMutex); + OID = oid++; + pthread_mutex_unlock(&pthreadMutex); + + em->getExtents(OID, emEntriesVec, false, false, true); + size_t extentsNumberBefore = emEntriesVec.size(); + int allocdsize; + for (size_t i = 0; i < numberOfExtents; ++i) + { + em->createColumnExtent_DBroot(OID, colWidth, dbroot, colDataType, + partNum, segmentNum, lbid, allocdsize, startBlockOffset); + em->confirmChanges(); + + newEm = new 
EMEntries(allocdsize, OID, startBlockOffset, lbid, + head, dbroot, partNum, segmentNum); + head = newEm; + listSize++; + } + + emEntriesVec.clear(); + em->getExtents(OID, emEntriesVec, false, false, true); + size_t extentsNumberAfter = emEntriesVec.size(); + + CPPUNIT_ASSERT(extentsNumberBefore + numberOfExtents == extentsNumberAfter); + +#ifdef BRM_VERBOSE + cerr << "created new space for OID " << newEm->OID << endl; +#endif + //em->checkConsistency(); + break; + } +/* + case 1: //allocate space for an existing file + { + if (listSize == 0) + break; + + struct EMEntries* newEm, *tmp; + size_t size = rand_r(&randstate) % 10; + int fileRand = rand_r(&randstate) % listSize; + int i, lastExtent, blockEnd, oid; + int tmpHWM, entries, allocdSize; + uint32_t startBlockOffset; + lbids.clear(); + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + for (lastExtent = 0, tmp = head; tmp != NULL; tmp = tmp->next) + { + if (tmp->OID != oid) + continue; + + tmpHWM = tmp->HWM; + blockEnd = tmp->FBO + tmp->size; + + if (lastExtent < blockEnd) + lastExtent = blockEnd; + } + + for (size_t i = 0; i < size; ++i) + { + em->createColumnExtent_DBroot(oid, colWidth, dbroot, colDataType, + partNum, segmentNum, lbid, allocdSize, startBlockOffset); + em->confirmChanges(); + lbids.push_back(lbid); + } + + //em->createExtent(size, oid, lbids, allocdSize); + //em->confirmChanges(); + + entries = size / em->getExtentSize(); + + if ((size % em->getExtentSize()) != 0) + entries++; + + CPPUNIT_ASSERT((uint32_t)entries == lbids.size()); + + for (i = 0; i < entries; i++) + { + newEm = new EMEntries((i != entries) ? 
em->getExtentSize() : size % em->getExtentSize(), + oid, lastExtent + (i * em->getExtentSize()), + lbids[i], head, dbroot, partNum, segmentNum); + newEm->HWM = tmpHWM; + head = newEm; + listSize++; + } + +#ifdef BRM_VERBOSE + cerr << "created another extent for OID " << newEm->OID << endl; +#endif + em->checkConsistency(); + break; + } +*/ + + case 2: //delete an OID + { + if (listSize == 0) + break; + + struct EMEntries* tmp, *prev; + int fileRand = rand_r(&randstate) % listSize; + int i, oid; + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + em->deleteOID(oid); + em->confirmChanges(); + + vector emEntriesVec; + em->getExtents(oid, emEntriesVec, false, false, true); + CPPUNIT_ASSERT(emEntriesVec.empty()); + + for (tmp = head; tmp != NULL;) + { + if (tmp->OID == oid) + { + if (tmp == head) + { + head = head->next; + delete tmp; + tmp = head; + } + else + { + prev->next = tmp->next; + delete tmp; + tmp = prev->next; + } + + listSize--; + } + else + { + prev = tmp; + tmp = tmp->next; + } + } + +#ifdef BRM_VERBOSE + cerr << "deleted OID " << oid << endl; +#endif + //em->checkConsistency(); + break; + } + + case 3: //lookup by LBID + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset, oid; + struct EMEntries* tmp; + LBID_t target; + uint32_t fbo; + DBRootT localDbroot; + PartitionNumberT localPartNum; + SegmentT localSegmentNum; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % (tmp->size - 1); + + target = tmp->LBIDstart + offset; + err = em->lookupLocal(target, oid, localDbroot, localPartNum, localSegmentNum, fbo); +#ifdef BRM_VERBOSE + cerr << "looked up LBID " << target << " got oid " << oid << " fbo " << fbo << endl; + cerr << " oid should be " << tmp->OID << " fbo should be " << offset + tmp->FBO << endl; + cerr << "op 3 fbo " << fbo << " offset + tmp->FBO " << offset + tmp->FBO << endl; +#endif + CPPUNIT_ASSERT(err == 
0); + CPPUNIT_ASSERT(oid == tmp->OID); + CPPUNIT_ASSERT(fbo == offset + tmp->FBO); + //em->checkConsistency(); + break; + } + + case 4: //lookup by OID, FBO + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, oid, err, offset; + struct EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % (tmp->size - 1); + oid = tmp->OID; + + err = em->lookupLocal(oid, tmp->partNum, tmp->segNum, offset + tmp->FBO, lbid); +#ifdef BRM_VERBOSE + cerr << "looked up OID " << oid << " fbo " << offset + tmp->FBO << + " got lbid " << lbid << endl; + cerr << " lbid should be " << tmp->LBIDstart + offset << endl; +#endif + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(lbid == tmp->LBIDstart + offset); + //em->checkConsistency(); + break; + } + + case 5: //getHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, status; + struct EMEntries* tmp; + uint32_t hwm; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + hwm = em->getLocalHWM(tmp->OID, tmp->partNum, tmp->segNum, status); +#ifdef BRM_VERBOSE_I + cerr << "stored HWM for OID " << tmp->OID << " is " << tmp->HWM + << " BRM says it's " << hwm << endl; +#endif + CPPUNIT_ASSERT(hwm == tmp->HWM); + //em->checkConsistency(); + break; + } + + case 6: //setHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, hwm, oid; + struct EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + hwm = rand_r(&randstate) % (tmp->size - 1); + bool firstNode = true; + em->setLocalHWM(oid, tmp->partNum, tmp->segNum, hwm, firstNode); + + em->confirmChanges(); + + tmp->HWM = hwm; + +#ifdef BRM_VERBOSE + cerr << "setHWM of OID " << oid << " to " << hwm << endl; +#endif + //em->checkConsistency(); + break; + } + +/* + case 7: // renew this EM object + { + delete em; + em = new ExtentMap(); +#ifdef 
BRM_VERBOSE + cerr << "got a new EM instance" << endl; +#endif + em->checkConsistency(); + break; + } + case 8: //getBulkInsertVars + { + if (listSize == 0) + break; + + HWM_t hwm; + VER_t txnID; + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + lbid = tmp->LBIDstart + offset; + err = em->getBulkInsertVars(lbid, hwm, txnID); + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(hwm == tmp->secondHWM); + CPPUNIT_ASSERT(txnID == tmp->txnID); + break; + } + + case 9: //setBulkInsertVars + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + tmp->secondHWM = rand_r(&randstate) % MAXINT; + tmp->txnID = rand_r(&randstate) % MAXINT; + err = em->setBulkInsertVars(tmp->LBIDstart + offset, + tmp->secondHWM, tmp->txnID); + em->confirmChanges(); + CPPUNIT_ASSERT(err == 0); + break; + } +*/ + default: + break; + } + } + + delete em; + + while (head != NULL) + { + tmp = head->next; + delete head; + head = tmp; + } + +#ifdef BRM_VERBOSE + cerr << "thread " << threadNum << " exiting" << endl; +#endif + return NULL; +} + +/* +ExtentMap em_si; +static void* EMRunner_si(void* arg) +{ + + // keep track of LBID ranges allocated here and + // randomly allocate, lookup, delete, get/set HWM, and + // destroy the EM object. + +#ifdef BRM_VERBOSE + int threadNum = reinterpret_cast(arg); +#endif + int op, listSize = 0, i; + uint32_t randstate; + struct EMEntries* head = NULL, *tmp; + struct timeval tv; + vector lbids; + +#ifdef BRM_VERBOSE + cerr << "thread number " << threadNum << " started." 
<< endl; +#endif + + gettimeofday(&tv, NULL); + randstate = static_cast(tv.tv_usec); + + while (!threadStop) + { + op = rand_r(&randstate) % 9; +#ifdef BRM_VERBOSE + cerr << "next op is " << op << endl; +#endif + + switch (op) + { + case 0: //allocate space for a new file + { + struct EMEntries* newEm; + int size = rand_r(&randstate) % 102399 + 1; + int entries, OID, allocdSize; + + pthread_mutex_lock(&pthreadMutex); + OID = oid++; + pthread_mutex_unlock(&pthreadMutex); + + em_si.createExtent(size, OID, lbids, allocdSize); + em_si.confirmChanges(); + + entries = size / em_si.getExtentSize(); + + if ((size % em_si.getExtentSize()) != 0) + entries++; + + CPPUNIT_ASSERT((uint32_t)entries == lbids.size()); + + for (i = 0 ; i < entries; i++) + { + + newEm = new EMEntries(); + newEm->size = em_si.getExtentSize(); + newEm->OID = OID; + newEm->FBO = i * em_si.getExtentSize(); + newEm->LBIDstart = lbids[i]; + + newEm->next = head; + head = newEm; + listSize++; + } + +#ifdef BRM_VERBOSE + cerr << "created new space for OID " << newEm->OID << endl; +#endif + em_si.checkConsistency(); + break; + } + + case 1: //allocate space for an existing file + { + if (listSize == 0) + break; + + struct EMEntries* newEm, *tmp; + int size = rand_r(&randstate) % 102399 + 1; + int fileRand = rand_r(&randstate) % listSize; + int i, lastExtent, blockEnd, oid; + int tmpHWM, entries, allocdSize; + vector lbids; + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + for (lastExtent = 0, tmp = head; tmp != NULL; tmp = tmp->next) + { + if (tmp->OID != oid) + continue; + + tmpHWM = tmp->HWM; + blockEnd = tmp->FBO + tmp->size; + + if (lastExtent < blockEnd) + lastExtent = blockEnd; + } + + em_si.createExtent(size, oid, lbids, allocdSize); + em_si.confirmChanges(); + + entries = size / em_si.getExtentSize(); + + if ((size % em_si.getExtentSize()) != 0) + entries++; + + CPPUNIT_ASSERT((uint32_t)entries == lbids.size()); + + for (i = 0; i < entries; i++) + { + + newEm = 
new EMEntries(); + + if (i != entries) + newEm->size = em_si.getExtentSize(); + else + newEm->size = size % em_si.getExtentSize(); + + newEm->OID = oid; + newEm->FBO = lastExtent + (i * em_si.getExtentSize()); + newEm->LBIDstart = lbids[i]; + newEm->HWM = tmpHWM; + + newEm->next = head; + head = newEm; + listSize++; + } + +#ifdef BRM_VERBOSE + cerr << "created another extent for OID " << newEm->OID << endl; +#endif + em_si.checkConsistency(); + break; + } + + case 2: //delete an OID + { + if (listSize == 0) + break; + + struct EMEntries* tmp, *prev; + int fileRand = rand_r(&randstate) % listSize; + int i, oid; + + for (i = 0, tmp = head; i < fileRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + + em_si.deleteOID(oid); + em_si.confirmChanges(); + + for (tmp = head; tmp != NULL;) + { + if (tmp->OID == oid) + { + if (tmp == head) + { + head = head->next; + delete tmp; + tmp = head; + } + else + { + prev->next = tmp->next; + delete tmp; + tmp = prev->next; + } + + listSize--; + } + else + { + prev = tmp; + tmp = tmp->next; + } + } + +#ifdef BRM_VERBOSE + cerr << "deleted OID " << oid << endl; +#endif + em_si.checkConsistency(); + break; + } + + case 3: //lookup by LBID + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset, oid; + struct EMEntries* tmp; + LBID_t target; + uint32_t fbo; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + + target = tmp->LBIDstart + offset; + err = em_si.lookup(target, oid, fbo); +#ifdef BRM_VERBOSE + cerr << "looked up LBID " << target << " got oid " << oid << " fbo " << fbo << endl; + cerr << " oid should be " << tmp->OID << " fbo should be " << offset + tmp->FBO << endl; +#endif + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(oid == tmp->OID); + CPPUNIT_ASSERT(fbo == offset + tmp->FBO); + em_si.checkConsistency(); + break; + } + + case 4: //lookup by OID, FBO + { + if (listSize == 0) + break; + + int entryRand = 
rand_r(&randstate) % listSize; + int i, oid, err, offset; + struct EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + oid = tmp->OID; + + err = em_si.lookup(oid, offset + tmp->FBO, lbid); +#ifdef BRM_VERBOSE + cerr << "looked up OID " << oid << " fbo " << offset + tmp->FBO << + " got lbid " << lbid << endl; + cerr << " lbid should be " << tmp->LBIDstart + offset << endl; +#endif + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(lbid == tmp->LBIDstart + offset); + em_si.checkConsistency(); + break; + } + + case 5: //getHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i; + struct EMEntries* tmp; + uint32_t hwm; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + hwm = em_si.getHWM(tmp->OID); +#ifdef BRM_VERBOSE + cerr << "stored HWM for OID " << tmp->OID << " is " << tmp->HWM + << " BRM says it's " << hwm << endl; +#endif + CPPUNIT_ASSERT(hwm == tmp->HWM); + em_si.checkConsistency(); + break; + } + + case 6: //setHWM + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, hwm, oid; + struct EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + oid = tmp->OID; + hwm = rand_r(&randstate) % (tmp->FBO + em_si.getExtentSize()); + em_si.setHWM(oid, hwm); + em_si.confirmChanges(); + + for (tmp = head; tmp != NULL; tmp = tmp->next) + if (tmp->OID == oid) + tmp->HWM = hwm; + +#ifdef BRM_VERBOSE + cerr << "setHWM of OID " << oid << " to " << hwm << endl; +#endif + em_si.checkConsistency(); + break; + } + + case 7: //getBulkInsertVars + { + if (listSize == 0) + break; + + HWM_t hwm; + VER_t txnID; + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + LBID_t lbid; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + lbid = tmp->LBIDstart + offset; + err = 
em_si.getBulkInsertVars(lbid, hwm, txnID); + CPPUNIT_ASSERT(err == 0); + CPPUNIT_ASSERT(hwm == tmp->secondHWM); + CPPUNIT_ASSERT(txnID == tmp->txnID); + break; + } + + case 8: //setBulkInsertVars + { + if (listSize == 0) + break; + + int entryRand = rand_r(&randstate) % listSize; + int i, err, offset; + EMEntries* tmp; + + for (i = 0, tmp = head; i < entryRand; i++) + tmp = tmp->next; + + offset = rand_r(&randstate) % tmp->size; + tmp->secondHWM = rand_r(&randstate) % MAXINT; + tmp->txnID = rand_r(&randstate) % MAXINT; + err = em_si.setBulkInsertVars(tmp->LBIDstart + offset, + tmp->secondHWM, tmp->txnID); + em_si.confirmChanges(); + CPPUNIT_ASSERT(err == 0); + break; + } + + default: + break; + } + } + + while (head != NULL) + { + tmp = head->next; + delete head; + head = tmp; + } + +#ifdef BRM_VERBOSE + cerr << "thread " << threadNum << " exiting" << endl; +#endif + return NULL; +} +*/ + + +class LongBRMTests : public CppUnit::TestFixture +{ + + CPPUNIT_TEST_SUITE(LongBRMTests); + + CPPUNIT_TEST(longEMTest_1); +// CPPUNIT_TEST(longEMTest_2); +// CPPUNIT_TEST(longBRMTest_1); +// CPPUNIT_TEST(longBRMTest_2); + + CPPUNIT_TEST_SUITE_END(); + +private: +public: + void longEMTest_1() + { + const int threadCount = 10; + int i; + pthread_t threads[threadCount]; + + cerr << endl << "Multithreaded, multiple instance ExtentMap test. " + "This runs for 5 minutes." 
<< endl; + + threadStop = 0; + pthread_mutex_init(&pthreadMutex, nullptr); + + for (i = 0; i < threadCount; i++) + { + if (pthread_create(&threads[i], NULL, EMRunner, + reinterpret_cast(i + 1)) != 0) // pthread_create() returns 0 or a positive error number, never a negative value + throw logic_error("Error creating threads for the ExtentMap test"); + + usleep(1000); + } + + sleep(300); + threadStop = 1; + + for (i = 0; i < threadCount; i++) + { + cerr << "Waiting for thread #" << i << endl; + pthread_join(threads[i], nullptr); + } + } + +/* + void longEMTest_2() + { + const int threadCount = 10; + int i; + pthread_t threads[threadCount]; + + cerr << endl << "Multithreaded, single instance ExtentMap test. " + "This runs for 5 minutes." << endl; + + threadStop = 0; + pthread_mutex_init(&pthreadMutex, NULL); + + for (i = 0; i < threadCount; i++) + { + if (pthread_create(&threads[i], NULL, EMRunner_si, + reinterpret_cast(i + 1)) < 0) + throw logic_error("Error creating threads for the ExtentMap test"); + + usleep(1000); + } + + sleep(60); + threadStop = 1; + + for (i = 0; i < threadCount; i++) + { + cerr << "Waiting for thread #" << i << endl; + pthread_join(threads[i], NULL); + } + } + void longBRMTest_1() + { + const int threadCount = 10; + int i; + pthread_t threads[threadCount]; + + cerr << endl << "Multithreaded, multiple instance DBRM test. " + "This runs for 5 minutes." << endl; + + threadStop = 0; + pthread_mutex_init(&pthreadMutex, NULL); + opCount = 0; + + for (i = 0; i < threadCount; i++) + { + if (pthread_create(&threads[i], NULL, BRMRunner_1, + reinterpret_cast(i + 1)) < 0) + throw logic_error("Error creating threads for the DBRM test"); + + usleep(1000); + } + + sleep(300); + threadStop = 1; + + for (i = 0; i < threadCount; i++) + { + cerr << "Waiting for thread #" << i << endl; + pthread_join(threads[i], NULL); + } + + cerr << "opCount = " << opCount << endl; + } + void longBRMTest_2() + { + const int threadCount = 10; + int i; + pthread_t threads[threadCount]; + + cerr << endl << "Multithreaded, single instance DBRM test. 
" + "This runs for 5 minutes." << endl; + + threadStop = 0; + pthread_mutex_init(&pthreadMutex, NULL); + opCount = 0; + + for (i = 0; i < threadCount; i++) + { + if (pthread_create(&threads[i], NULL, BRMRunner_si, + reinterpret_cast(i + 1)) < 0) + throw logic_error("Error creating threads for the DBRM test"); + + usleep(1000); + } + + sleep(300); + threadStop = 1; + + for (i = 0; i < threadCount; i++) + { + cerr << "Waiting for thread #" << i << endl; + pthread_join(threads[i], NULL); + } + + cerr << "opCount = " << opCount << endl; + } +*/ + +}; +CPPUNIT_TEST_SUITE_REGISTRATION( LongBRMTests ); + +#include +#include + +int main( int argc, char** argv) +{ + CppUnit::TextUi::TestRunner runner; + CppUnit::TestFactoryRegistry& registry = CppUnit::TestFactoryRegistry::getRegistry(); + runner.addTest( registry.makeTest() ); + idbdatafile::IDBPolicy::configIDBPolicy(); + bool wasSuccessful = runner.run( "", false ); + return (wasSuccessful ? 0 : 1); +} diff --git a/tools/clearShm/main.cpp b/tools/clearShm/main.cpp index 31e07b9a8..e37e27db9 100644 --- a/tools/clearShm/main.cpp +++ b/tools/clearShm/main.cpp @@ -60,7 +60,7 @@ void shmDoit(key_t shm_key, const string& label) bi::offset_t memSize = 0; memObj.get_size(memSize); std::lock_guard lk(coutMutex); - cout << label << ": shm_key: " << shm_key << "; key_name: " << key_name << "; size: " << memSize + cout << label << ": shm|sem_key: " << shm_key << "; key_name: " << key_name << "; size: " << memSize << endl; } catch (...) 
@@ -74,6 +74,11 @@ void shmDoit(key_t shm_key, const string& label) } } +void semDoit(key_t sem_key, const string& label) +{ + shmDoit(sem_key, label); +} + void shmDoitRange(key_t shm_key, const string& label) { if (shm_key == 0) @@ -87,32 +92,6 @@ void shmDoitRange(key_t shm_key, const string& label) } } -void semDoit(key_t sem_key, const string& label) -{ - string key_name = ShmKeys::keyToName(sem_key); - - if (vFlg) - { - try - { - bi::shared_memory_object memObj(bi::open_only, key_name.c_str(), bi::read_only); - bi::offset_t memSize = 0; - memObj.get_size(memSize); - std::lock_guard lk(coutMutex); - cout << label << ": sem_key: " << sem_key << "; key_name: " << key_name << "; size: " << memSize - << endl; - } - catch (...) - { - } - } - - if (!nFlg) - { - bi::shared_memory_object::remove(key_name.c_str()); - } -} - void usage() { cout << "usage: clearShm [-cvnh]" << endl; @@ -206,6 +185,8 @@ int main(int argc, char** argv) tg.add_thread(tp); tp = new boost::thread(ThdFunc(BrmKeys.KEYRANGE_VSS_BASE, "VSS ")); tg.add_thread(tp); + tp = new boost::thread(ThdFunc(BrmKeys.KEYRANGE_EXTENTMAP_INDEX_BASE, "EXTMAP_INDX")); + tg.add_thread(tp); tg.join_all(); shmDoit(BrmKeys.MST_SYSVKEY, "MST "); @@ -226,6 +207,7 @@ int main(int argc, char** argv) semDoit(BrmKeys.KEYRANGE_EMFREELIST_BASE, "EXTMAP_FREE"); semDoit(BrmKeys.KEYRANGE_VBBM_BASE, "VBBM "); semDoit(BrmKeys.KEYRANGE_VSS_BASE, "VSS "); + semDoit(BrmKeys.KEYRANGE_EXTENTMAP_INDEX_BASE, "EXTMAP_INDX"); semDoit(BrmKeys.MST_SYSVKEY, "MST "); if (!cFlg) diff --git a/utils/rwlock/rwlock.h b/utils/rwlock/rwlock.h index e40305701..77ecc7631 100644 --- a/utils/rwlock/rwlock.h +++ b/utils/rwlock/rwlock.h @@ -1,4 +1,5 @@ /* Copyright (C) 2014 InfiniDB, Inc. 
+ Copyright (C) 2016-2022 MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -27,6 +28,8 @@ #ifndef RWLOCK_H_ #define RWLOCK_H_ +#include + #include #include @@ -43,6 +46,10 @@ namespace rwlock { +const std::array RWLockNames = { + "all", "VSS", "ExtentMap", "FreeList", "VBBM", "CopyLocks", "ExtentMapIndex", +}; + /// the layout of the shmseg struct State { diff --git a/versioning/BRM/brmshmimpl.cpp b/versioning/BRM/brmshmimpl.cpp index 2c25caf8b..740282410 100644 --- a/versioning/BRM/brmshmimpl.cpp +++ b/versioning/BRM/brmshmimpl.cpp @@ -40,7 +40,15 @@ namespace bi = boost::interprocess; namespace BRM { -BRMShmImpl::BRMShmImpl(unsigned key, off_t size, bool readOnly) : fKey(key), fSize(size), fReadOnly(readOnly) +const constexpr uint32_t ShmCreateMaxRetries = 10; +const constexpr unsigned int NapTimer = 500000; + +BRMShmImplParent::BRMShmImplParent(unsigned key, off_t size, bool readOnly) + : fKey(key), fSize(size), fReadOnly(readOnly){}; + +BRMShmImplParent::~BRMShmImplParent(){}; + +BRMShmImpl::BRMShmImpl(unsigned key, off_t size, bool readOnly) : BRMShmImplParent(key, size, readOnly) { string keyName = ShmKeys::keyToName(fKey); @@ -238,6 +246,168 @@ void BRMShmImpl::destroy() bi::shared_memory_object::remove(oldName.c_str()); } +BRMManagedShmImpl::BRMManagedShmImpl(unsigned key, off_t size, bool readOnly) + : BRMShmImplParent(key, size, readOnly) +{ + string keyName = ShmKeys::keyToName(fKey); + off_t curSize = 0; + + for (uint32_t tries = 0; fSize == 0 && tries <= ShmCreateMaxRetries; ++tries) + { + try + { + auto* shmSegment = new boost::interprocess::managed_shared_memory(bi::open_only, keyName.c_str()); + curSize = shmSegment->get_size(); + + if (curSize == 0) + { + delete shmSegment; + throw bi::interprocess_exception("shared memory segment size is 0."); + } + else + { + fShmSegment = shmSegment; + fSize = curSize; + return; + } + } + catch 
(bi::interprocess_exception&) + { + if (tries == ShmCreateMaxRetries) + { + log("BRMManagedShmImpl::BRMManagedShmImpl(): re-creating shared memory segment\ + b/c of its size == 0. Re-throw."); + throw; + } + + cerr << "BRMManagedShmImpl::BRMManagedShmImpl(): re-creating shared memory segment\ + b/c of its size == 0" + << endl; + usleep(NapTimer); + } + } + + try + { + bi::permissions perms; + perms.set_unrestricted(); + fShmSegment = new bi::managed_shared_memory(bi::create_only, keyName.c_str(), fSize, + 0, // use a default address to map the segment + perms); + // fSize == 0 on any process startup but managed_shared_memory ctor throws + // so control flow doesn't get here. + idbassert(fSize > 0); + } + catch (bi::interprocess_exception& b) + { + if (b.get_error_code() != bi::already_exists_error) + { + ostringstream o; + o << "BRM caught an exception creating a shared memory segment: " << b.what(); + log(o.str()); + throw; + } + bi::managed_shared_memory* shmSegment = nullptr; + try + { + if (fReadOnly) + shmSegment = new bi::managed_shared_memory(bi::open_read_only, keyName.c_str()); + else + shmSegment = new bi::managed_shared_memory(bi::open_only, keyName.c_str()); + } + catch (exception& e) + { + ostringstream o; + o << "BRM caught an exception attaching to a shared memory segment (" << keyName << "): " << e.what(); // report the attach failure (e), not the earlier already-exists error (b) + log(o.str()); + throw; + } + off_t curSize = shmSegment->get_size(); + + idbassert(curSize > 0); + idbassert(curSize >= fSize); + fShmSegment = shmSegment; + fSize = curSize; + } +} + +int BRMManagedShmImpl::grow(off_t newSize) +{ + auto keyName = ShmKeys::keyToName(fKey); + + if (newSize > fSize) + { + const auto incSize = newSize - fSize; + if (fShmSegment) + { + // Call destructor to unmap the segment. + delete fShmSegment; fShmSegment = nullptr; // keep the pointer null so the destructor cannot double-delete if grow() or the re-open throws + // Grow the segment. + bi::managed_shared_memory::grow(keyName.c_str(), incSize); + // Open only with the assumption ::grow() can be called on read-write shmem. 
+ fShmSegment = new bi::managed_shared_memory(bi::open_only, keyName.c_str()); + // Update size. + fSize = newSize; + } + } + + return 0; +} + +// Dummy method that has no references in the code. +int BRMManagedShmImpl::clear(unsigned newKey, off_t newSize) +{ + return 0; +} + +// This method calls for all related shmem pointers to be refreshed. +void BRMManagedShmImpl::setReadOnly() +{ + if (fReadOnly) + return; + const bool readOnly = true; + remap(readOnly); + fReadOnly = true; +} + +void BRMManagedShmImpl::swap(BRMManagedShmImpl& rhs) +{ + fShmSegment->swap(*rhs.fShmSegment); + std::swap(fKey, rhs.fKey); + std::swap(fSize, rhs.fSize); + std::swap(fReadOnly, rhs.fReadOnly); +} + +// The method was copied from non-managed shmem impl class +// and it has no references in MCS 6.x code. +void BRMManagedShmImpl::destroy() +{ + string keyName = ShmKeys::keyToName(fKey); + try + { + bi::shared_memory_object::remove(keyName.c_str()); + } + catch (bi::interprocess_exception& b) + { + std::ostringstream o; + o << "BRMManagedShmImpl::destroy caught an exception removing a managed shared memory segment: " + << b.what(); + log(o.str()); + throw; + } +} + +void BRMManagedShmImpl::remap(const bool readOnly) +{ + delete fShmSegment; + fShmSegment = nullptr; + string keyName = ShmKeys::keyToName(fKey); + if (readOnly) + fShmSegment = new bi::managed_shared_memory(bi::open_read_only, keyName.c_str()); + else + fShmSegment = new bi::managed_shared_memory(bi::open_only, keyName.c_str()); +} + } // namespace BRM // vim:ts=4 sw=4: diff --git a/versioning/BRM/brmshmimpl.h b/versioning/BRM/brmshmimpl.h index d2177afec..fa1f0cfee 100644 --- a/versioning/BRM/brmshmimpl.h +++ b/versioning/BRM/brmshmimpl.h @@ -24,24 +24,24 @@ * class BRMShmImpl */ -#ifndef IDBSHMIMPL_H_ -#define IDBSHMIMPL_H_ +#pragma once #include //#define NDEBUG #include #include +#include #include +namespace bi = boost::interprocess; + namespace BRM { -class BRMShmImpl +class BRMShmImplParent { public: - 
BRMShmImpl(unsigned key, off_t size, bool readOnly = false); - ~BRMShmImpl() - { - } + BRMShmImplParent(unsigned key, off_t size, bool readOnly = false); + virtual ~BRMShmImplParent(); inline unsigned key() const { @@ -56,25 +56,63 @@ class BRMShmImpl return fReadOnly; } - void setReadOnly(); - int grow(unsigned newKey, off_t newSize); - int clear(unsigned newKey, off_t newSize); - - void swap(BRMShmImpl& rhs); - void destroy(); - - boost::interprocess::shared_memory_object fShmobj; - boost::interprocess::mapped_region fMapreg; - - private: - BRMShmImpl(const BRMShmImpl& rhs); - BRMShmImpl& operator=(const BRMShmImpl& rhs); + virtual void setReadOnly() = 0; + virtual int clear(unsigned newKey, off_t newSize) = 0; + virtual void destroy() = 0; + protected: unsigned fKey; off_t fSize; bool fReadOnly; }; -} // namespace BRM +class BRMShmImpl : public BRMShmImplParent +{ + public: + BRMShmImpl(unsigned key, off_t size, bool readOnly = false); + BRMShmImpl(const BRMShmImpl& rhs) = delete; + BRMShmImpl& operator=(const BRMShmImpl& rhs) = delete; + ~BRMShmImpl() + { + } -#endif + int clear(unsigned newKey, off_t newSize) override; + void destroy() override; + void setReadOnly() override; + + int grow(unsigned newKey, off_t newSize); + void swap(BRMShmImpl& rhs); + + bi::shared_memory_object fShmobj; + bi::mapped_region fMapreg; +}; + +class BRMManagedShmImpl : public BRMShmImplParent +{ + public: + BRMManagedShmImpl(unsigned key, off_t size, bool readOnly = false); + BRMManagedShmImpl(const BRMManagedShmImpl& rhs) = delete; + BRMManagedShmImpl& operator=(const BRMManagedShmImpl& rhs) = delete; + ~BRMManagedShmImpl() + { + delete fShmSegment; + } + + int clear(unsigned newKey, off_t newSize) override; + void destroy() override; + void setReadOnly() override; + + int grow(off_t newSize); + void remap(const bool readOnly = false); + void swap(BRMManagedShmImpl& rhs); + bi::managed_shared_memory* getManagedSegment() + { + assert(fShmSegment); + return fShmSegment; + } + + 
private: + bi::managed_shared_memory* fShmSegment; +}; + +} // namespace BRM diff --git a/versioning/BRM/dbrm.cpp b/versioning/BRM/dbrm.cpp index 1cc3fb983..2ab47d8ca 100644 --- a/versioning/BRM/dbrm.cpp +++ b/versioning/BRM/dbrm.cpp @@ -98,7 +98,7 @@ DBRM::DBRM(const DBRM& brm) throw logic_error("DBRM: Don't use the copy constructor."); } -DBRM::~DBRM() throw() +DBRM::~DBRM() { if (msgClient != NULL) MessageQueueClientPool::releaseInstance(msgClient); @@ -461,7 +461,7 @@ int DBRM::markExtentsInvalid(const vector& lbids, } template -int DBRM::getExtentMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) throw() +int DBRM::getExtentMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) { #ifdef BRM_INFO @@ -489,7 +489,7 @@ int DBRM::getExtentMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) th } } -int DBRM::getExtentCPMaxMin(const LBID_t lbid, CPMaxMin& cpMaxMin) throw() +int DBRM::getExtentCPMaxMin(const LBID_t lbid, CPMaxMin& cpMaxMin) { try { @@ -4555,10 +4555,19 @@ void DBRM::invalidateUncommittedExtentLBIDs(execplan::CalpontSystemCatalog::SCN setExtentsMaxMin(cpInfos); } -template int DBRM::getExtentMaxMin(const LBID_t lbid, int128_t& max, int128_t& min, - int32_t& seqNum) throw(); +size_t DBRM::EMIndexShmemSize() +{ + return em->EMIndexShmemSize(); +} -template int DBRM::getExtentMaxMin(const LBID_t lbid, int64_t& max, int64_t& min, - int32_t& seqNum) throw(); +size_t DBRM::EMIndexShmemFree() +{ + return em->EMIndexShmemFree(); +} + +template int DBRM::getExtentMaxMin(const LBID_t lbid, int128_t& max, int128_t& min, + int32_t& seqNum); + +template int DBRM::getExtentMaxMin(const LBID_t lbid, int64_t& max, int64_t& min, int32_t& seqNum); } // namespace BRM diff --git a/versioning/BRM/dbrm.h b/versioning/BRM/dbrm.h index e4ddcff27..abbe8bc8f 100644 --- a/versioning/BRM/dbrm.h +++ b/versioning/BRM/dbrm.h @@ -104,7 +104,7 @@ class DBRM // The param noBRMFcns suppresses init of the ExtentMap, VSS, VBBM, and CopyLocks. 
// It can speed up init if the caller only needs the other structures. EXPORT DBRM(bool noBRMFcns = false); - EXPORT ~DBRM() throw(); + EXPORT ~DBRM(); EXPORT static void refreshShm() { @@ -781,12 +781,12 @@ class DBRM const std::vector& colDataTypes) DBRM_THROW; template - EXPORT int getExtentMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) throw(); + EXPORT int getExtentMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum); EXPORT int setExtentMaxMin(const LBID_t lbid, const int64_t max, const int64_t min, const int32_t seqNum) DBRM_THROW; - EXPORT int getExtentCPMaxMin(const LBID_t lbid, CPMaxMin& cpMaxMin) throw(); + EXPORT int getExtentCPMaxMin(const LBID_t lbid, CPMaxMin& cpMaxMin); /** @brief Updates the max and min casual partitioning info for the passed extents. * @@ -985,6 +985,9 @@ class DBRM EXPORT void invalidateUncommittedExtentLBIDs(execplan::CalpontSystemCatalog::SCN txnid, bool allExtents, std::vector* plbidList = NULL); + size_t EMIndexShmemSize(); + size_t EMIndexShmemFree(); + private: DBRM(const DBRM& brm); DBRM& operator=(const DBRM& brm); diff --git a/versioning/BRM/extentmap.cpp b/versioning/BRM/extentmap.cpp index 6235f3017..3f64ad0e5 100644 --- a/versioning/BRM/extentmap.cpp +++ b/versioning/BRM/extentmap.cpp @@ -1,19 +1,20 @@ /* Copyright (C) 2014 InfiniDB, Inc. + Copyright (C) 2016-2022 MariaDB Corporation - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - as published by the Free Software Foundation; version 2 of - the License. + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; version 2 of + the License. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - MA 02110-1301, USA. */ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + MA 02110-1301, USA. */ /***************************************************************************** * $Id: extentmap.cpp 1936 2013-07-09 22:10:29Z dhall $ @@ -45,7 +46,6 @@ #include #include -namespace bi = boost::interprocess; #include "liboamcpp.h" #include "brmtypes.h" @@ -113,6 +113,7 @@ inline void incSeqNum(int32_t& seqNum) namespace BRM { +static const char* EmIndexObjectName = "i"; //------------------------------------------------------------------------------ // EMCasualPartition_struct methods //------------------------------------------------------------------------------ @@ -217,9 +218,10 @@ bool EMEntry::operator<(const EMEntry& e) const /*static*/ boost::mutex ExtentMapImpl::fInstanceMutex; boost::mutex ExtentMap::mutex; +boost::mutex ExtentMap::emIndexMutex; /*static*/ -ExtentMapImpl* ExtentMapImpl::fInstance = 0; +ExtentMapImpl* ExtentMapImpl::fInstance = nullptr; /*static*/ ExtentMapImpl* ExtentMapImpl::makeExtentMapImpl(unsigned key, off_t size, bool readOnly) @@ -279,19 +281,305 @@ FreeListImpl::FreeListImpl(unsigned key, off_t size, bool readOnly) : fFreeList( { } +/*static*/ +std::mutex ExtentMapIndexImpl::fInstanceMutex_; + +/*static*/ +ExtentMapIndexImpl* ExtentMapIndexImpl::fInstance_ = nullptr; + +/*static*/ +ExtentMapIndexImpl* 
ExtentMapIndexImpl::makeExtentMapIndexImpl(unsigned key, off_t size, bool readOnly) +{ + std::lock_guard lock(fInstanceMutex_); + + if (fInstance_) + { + if (size != fInstance_->getShmemSize()) + { + fInstance_->fBRMManagedShmMemImpl_.remap(); + } + + return fInstance_; + } + + fInstance_ = new ExtentMapIndexImpl(key, size, readOnly); + fInstance_->createExtentMapIndexIfNeeded(); + + return fInstance_; +} + +ExtentMapIndexImpl::ExtentMapIndexImpl(unsigned key, off_t size, bool readOnly) + : fBRMManagedShmMemImpl_(key, size, readOnly) +{ +} + +void ExtentMapIndexImpl::createExtentMapIndexIfNeeded() +{ + // pair + auto managedShmemSearchPair = + fBRMManagedShmMemImpl_.getManagedSegment()->find(EmIndexObjectName); + if (!managedShmemSearchPair.first || managedShmemSearchPair.second == 0) + { + ShmVoidAllocator alloc(fBRMManagedShmMemImpl_.getManagedSegment()->get_segment_manager()); + fBRMManagedShmMemImpl_.getManagedSegment()->construct(EmIndexObjectName)(alloc); + } +} + +ExtentMapIndex* ExtentMapIndexImpl::get() +{ + // pair + auto managedShmemSearchPair = + fBRMManagedShmMemImpl_.getManagedSegment()->find(EmIndexObjectName); + assert(managedShmemSearchPair.first && managedShmemSearchPair.second > 0); + return managedShmemSearchPair.first; +} + +bool ExtentMapIndexImpl::growIfNeeded(const size_t memoryNeeded) +{ + auto freeShmem = getShmemFree(); + // Worst case managed segment can't get contiguous buffer with len = memoryNeeded + if (freeShmem < memoryNeeded) + { + const size_t currentShmemSize = getShmemSize(); + constexpr static const size_t minAllowance = 16 * 1024 * 1024; + const size_t newShmemSize = std::max(minAllowance, memoryNeeded) + currentShmemSize; + grow(newShmemSize); + return true; + } + return false; +} + +InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert(const EMEntry& emEntry, const size_t emIdx) +{ + auto dbRoot = emEntry.dbRoot; + auto* extentMapIndexPtr = get(); + bool shmemHasGrown = false; + + while (dbRoot >= extentMapIndexPtr->size()) +
{ + const size_t memNeeded = (extentMapIndexPtr->capacity() + extraUnits_) * dbRootContainerUnitSize_; + shmemHasGrown = growIfNeeded(memNeeded); + // Need to refresh all refs and iterators b/c the local address range changed. + extentMapIndexPtr = get(); + assert(extentMapIndexPtr); + ShmVoidAllocator alloc(fBRMManagedShmMemImpl_.getManagedSegment()->get_segment_manager()); + OIDIndexContainerT oidIndices(alloc); + extentMapIndexPtr->push_back(std::move(oidIndices)); + } + auto& extentMapIndex = *extentMapIndexPtr; + return insert2ndLayerWrapper(extentMapIndex[dbRoot], emEntry, emIdx, shmemHasGrown); +} + +InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert2ndLayer(OIDIndexContainerT& oids, const EMEntry& emEntry, + const size_t emIdx, const bool aShmemHasGrown) +{ + OID_t oid = emEntry.fileID; + ShmVoidAllocator alloc(fBRMManagedShmMemImpl_.getManagedSegment()->get_segment_manager()); + + PartitionIndexContainerT partitionIndex(alloc); + auto iterAndResult = oids.insert({oid, std::move(partitionIndex)}); + + if (iterAndResult.second) + { + PartitionIndexContainerT& partitionsContainer = (*iterAndResult.first).second; + return insert3dLayerWrapper(partitionsContainer, emEntry, emIdx, aShmemHasGrown); + } + else + return {false, aShmemHasGrown}; +} + +InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert2ndLayerWrapper(OIDIndexContainerT& oids, + const EMEntry& emEntry, const size_t emIdx, + const bool aShmemHasGrown) +{ + OID_t oid = emEntry.fileID; + auto oidsIter = oids.find(oid); + bool shmemHasGrown = aShmemHasGrown; + if (oidsIter == oids.end()) + { + const size_t freeShmem = fBRMManagedShmMemImpl_.getManagedSegment()->get_free_memory(); + const size_t memNeeded = (oids.size() + extraUnits_) * oidContainerUnitSize_; + if (oids.load_factor() >= oids.max_load_factor() || freeShmem <= freeSpaceThreshold_) + { + // Need to refresh all refs and iterators b/c the local address range changed. 
+ shmemHasGrown = growIfNeeded(memNeeded); + auto* extMapIndexPtr = get(); + assert(extMapIndexPtr); + auto& extMapIndex = *extMapIndexPtr; + shmemHasGrown = shmemHasGrown || aShmemHasGrown; + // The dbroot must be here b/c it was already found once in the upper insert(). + OIDIndexContainerT& refreshedOidsRef = extMapIndex[emEntry.dbRoot]; + return insert2ndLayer(refreshedOidsRef, emEntry, emIdx, shmemHasGrown); + } + return insert2ndLayer(oids, emEntry, emIdx, shmemHasGrown); + } + PartitionIndexContainerT& partitions = (*oidsIter).second; + return insert3dLayerWrapper(partitions, emEntry, emIdx, shmemHasGrown); +} + +InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert3dLayer(PartitionIndexContainerT& partitions, + const EMEntry& emEntry, const size_t emIdx, + const bool aShmemHasGrown) +{ + auto partitionNumber = emEntry.partitionNum; + ShmVoidAllocator alloc(fBRMManagedShmMemImpl_.getManagedSegment()->get_segment_manager()); + ExtentMapIndicesT emIndices(alloc); + emIndices.push_back(emIdx); + auto iterAndResult = partitions.insert({partitionNumber, std::move(emIndices)}); + return {iterAndResult.second, aShmemHasGrown}; +} + +InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert3dLayerWrapper(PartitionIndexContainerT& partitions, + const EMEntry& emEntry, const size_t emIdx, + const bool aShmemHasGrown) +{ + auto partitionNumber = emEntry.partitionNum; + auto partitionsIter = partitions.find(partitionNumber); + bool shmemHasGrown = aShmemHasGrown; + if (partitionsIter == partitions.end()) + { + const size_t freeShmem = fBRMManagedShmMemImpl_.getManagedSegment()->get_free_memory(); + const size_t memNeeded = + (partitions.size() + extraUnits_) * partitionContainerUnitSize_ + emIdentUnitSize_; + if (partitions.load_factor() >= partitions.max_load_factor() || freeShmem <= freeSpaceThreshold_) + { + // Need to refresh all refs and iterators b/c the local address range changed. 
+ shmemHasGrown = growIfNeeded(memNeeded); + auto* extMapIndexPtr = get(); + assert(extMapIndexPtr); + auto& extMapIndex = *extMapIndexPtr; + shmemHasGrown = shmemHasGrown || aShmemHasGrown; + // The dbroot must be here b/c we found it once in insert(). + OIDIndexContainerT& refreshedOidsRef = extMapIndex[emEntry.dbRoot]; + auto oidsIter = refreshedOidsRef.find(emEntry.fileID); + PartitionIndexContainerT& refreshedPartitionsRef = (*oidsIter).second; + return insert3dLayer(refreshedPartitionsRef, emEntry, emIdx, shmemHasGrown); + } + return insert3dLayer(partitions, emEntry, emIdx, shmemHasGrown); + } + + ExtentMapIndicesT& emIndices = (*partitionsIter).second; + emIndices.push_back(emIdx); + return {true, shmemHasGrown}; +} + +ExtentMapIndexFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OID_t oid, + const PartitionNumberT partitionNumber) +{ + ExtentMapIndex& emIndex = *get(); + if (dbroot >= emIndex.size()) + return {}; + return search2ndLayer(emIndex[dbroot], oid, partitionNumber); +} + +ExtentMapIndexFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OID_t oid) +{ + ExtentMapIndex& emIndex = *get(); + if (dbroot >= emIndex.size()) + return {}; + return search2ndLayer(emIndex[dbroot], oid); +} + +ExtentMapIndexFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& oids, const OID_t oid, + const PartitionNumberT partitionNumber) +{ + auto oidsIter = oids.find(oid); + if (oidsIter == oids.end()) + return {}; + + PartitionIndexContainerT& partitions = (*oidsIter).second; + return search3dLayer(partitions, partitionNumber); +} + +ExtentMapIndexFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& oids, const OID_t oid) +{ + auto oidsIter = oids.find(oid); + if (oidsIter == oids.end()) + return {}; + + ExtentMapIndexFindResult result; + PartitionIndexContainerT& partitions = (*oidsIter).second; + for (auto& partKeyValue : partitions) + { + ExtentMapIndicesT& emIdentifiers = partKeyValue.second; + for (auto& emIdent 
: emIdentifiers) + result.push_back(emIdent); + } + + return result; +} + +ExtentMapIndexFindResult ExtentMapIndexImpl::search3dLayer(PartitionIndexContainerT& partitions, + const PartitionNumberT partitionNumber) +{ + auto partitionsIter = partitions.find(partitionNumber); + if (partitionsIter == partitions.end()) + return {}; + + ExtentMapIndexFindResult result; + ExtentMapIndicesT& emIndicesVec = (*partitionsIter).second; + for (auto& emIndex : emIndicesVec) + result.push_back(emIndex); + return result; +} + +void ExtentMapIndexImpl::deleteDbRoot(const DBRootT dbroot) +{ + auto& extMapIndex = *get(); + extMapIndex[dbroot].clear(); +} + +void ExtentMapIndexImpl::deleteOID(const DBRootT dbroot, const OID_t oid) +{ + auto& extMapIndex = *get(); + auto oidsIter = extMapIndex[dbroot].find(oid); + // Nothing to delete. Might be a sign of a problem. + if (oidsIter == extMapIndex[dbroot].end()) + return; + extMapIndex[dbroot].erase(oidsIter); +} + +void ExtentMapIndexImpl::deleteEMEntry(const EMEntry& emEntry, const ExtentMapIdxT emIdent) +{ + // find partition + auto& extMapIndex = *get(); + auto oidsIter = extMapIndex[emEntry.dbRoot].find(emEntry.fileID); + if (oidsIter == extMapIndex[emEntry.dbRoot].end()) + return; + PartitionIndexContainerT& partitions = (*oidsIter).second; + auto partitionsIter = partitions.find(emEntry.partitionNum); + if (partitionsIter == partitions.end()) + return; + ExtentMapIndicesT& emIdentifiers = (*partitionsIter).second; + // pop the identifier + if (emIdentifiers.size() > 1) + { + auto emIdentifiersTargetIter = std::find(emIdentifiers.begin(), emIdentifiers.end(), emIdent); + std::swap(*emIdentifiersTargetIter, emIdentifiers.back()); + emIdentifiers.pop_back(); + } + else // only 1 ident in this partition + { + partitions.erase(partitionsIter); + } +} + ExtentMap::ExtentMap() { - fExtentMap = NULL; - fFreeList = NULL; + fExtentMap = nullptr; + fFreeList = nullptr; + fPExtMapImpl = nullptr; fCurrentEMShmkey = -1; fCurrentFLShmkey = -1; - 
fEMShminfo = NULL; - fFLShminfo = NULL; + fEMShminfo = nullptr; + fFLShminfo = nullptr; + fEMIndexShminfo = nullptr; r_only = false; flLocked = false; emLocked = false; - fPExtMapImpl = 0; - fPFreeListImpl = 0; + emIndexLocked = false; + fPFreeListImpl = nullptr; + fPExtMapIndexImpl_ = nullptr; #ifdef BRM_INFO fDebug = ("Y" == config::Config::makeConfig()->getConfig("DBRM", "Debug")); @@ -416,6 +704,7 @@ int ExtentMap::markInvalid(const LBID_t lbid, const execplan::CalpontSystemCatal #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); return _markInvalid(lbid, colDataType); } @@ -448,6 +737,7 @@ int ExtentMap::markInvalid(const vector& lbids, #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); // XXXPAT: what's the proper return code when one and only one fails? for (i = 0; i < size; ++i) @@ -514,6 +804,7 @@ int ExtentMap::setMaxMin(const LBID_t lbid, const int64_t max, const int64_t min #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (i = 0; i < entries; i++) @@ -571,8 +862,10 @@ int ExtentMap::setMaxMin(const LBID_t lbid, const int64_t max, const int64_t min if (emLocked) releaseEMEntryTable(WRITE); + if (emIndexLocked) + releaseEMIndex(WRITE); + throw logic_error("ExtentMap::setMaxMin(): lbid isn't allocated"); - // return -1; } // @bug 1970. Added updateExtentsMaxMin function. @@ -625,7 +918,10 @@ void ExtentMap::setExtentsMaxMin(const CPMaxMinMap_t& cpMap, bool firstNode, boo #endif if (useLock) + { grabEMEntryTable(WRITE); + grabEMIndex(WRITE); + } entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -759,7 +1055,7 @@ void ExtentMap::setExtentsMaxMin(const CPMaxMinMap_t& cpMap, bool firstNode, boo // @bug 1970. Added mergeExtentsMaxMin to merge CP info for list of extents. // @note - The key passed in the map must the starting LBID in the extent. // Used by cpimport to update extentmap casual partition min/max. -// NULL or empty values should not be passed in as min/max values. 
+// nullptr or empty values should not be passed in as min/max values. // seqNum in the input struct is not currently used. // // Note that DML calls markInvalid() to flag an extent as CP_UPDATING and incre- @@ -823,7 +1119,10 @@ void ExtentMap::mergeExtentsMaxMin(CPMaxMinMergeMap_t& cpMap, bool useLock) #endif if (useLock) + { grabEMEntryTable(WRITE); + grabEMIndex(WRITE); + } int entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -1127,6 +1426,7 @@ int ExtentMap::getMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) #endif grabEMEntryTable(READ); + grabEMIndex(READ); entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (i = 0; i < entries; i++) @@ -1149,12 +1449,14 @@ int ExtentMap::getMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) } seqNum = fExtentMap[i].partition.cprange.sequenceNum; isValid = fExtentMap[i].partition.cprange.isValid; + releaseEMIndex(READ); releaseEMEntryTable(READ); return isValid; } } } + releaseEMIndex(READ); releaseEMEntryTable(READ); throw logic_error("ExtentMap::getMaxMin(): that lbid isn't allocated"); // return -1; @@ -1323,7 +1625,6 @@ void ExtentMap::loadVersion4or5(T* in, bool upgradeV4ToV5) void* fExtentMapPtr = static_cast(fExtentMap); memset(fExtentMapPtr, 0, fEMShminfo->allocdSize); fEMShminfo->currentSize = 0; - // init the free list memset(fFreeList, 0, fFLShminfo->allocdSize); fFreeList[0].size = (1 << 26); // 2^36 LBIDs @@ -1344,15 +1645,15 @@ void ExtentMap::loadVersion4or5(T* in, bool upgradeV4ToV5) } growEMShmseg(nrows); + growEMIndexShmseg(ExtentMapIndexImpl::estimateEMIndexSize(emNumElements)); } + size_t progress = 0, writeSize = emNumElements * sizeof(EMEntry); int err; char* writePos; - size_t progress, writeSize; if (!upgradeV4ToV5) { - progress = 0; writeSize = emNumElements * sizeof(EMEntry); writePos = (char*)fExtentMap; @@ -1414,6 +1715,14 @@ void ExtentMap::loadVersion4or5(T* in, bool upgradeV4ToV5) //@bug 1911 - verify status value is valid if (fExtentMap[i].status < 
EXTENTSTATUSMIN || fExtentMap[i].status > EXTENTSTATUSMAX) fExtentMap[i].status = EXTENTAVAILABLE; + + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[i], i); + + if (resShmemHasGrownPair.second) + fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); + + if (!resShmemHasGrownPair.first) + logAndSetEMIndexReadOnly("loadVersion4or5"); } fEMShminfo->currentSize = emNumElements * sizeof(EMEntry); @@ -1454,6 +1763,7 @@ void ExtentMap::load(const string& filename, bool fixFL) #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); try { @@ -1461,6 +1771,7 @@ void ExtentMap::load(const string& filename, bool fixFL) } catch (...) { + releaseEMIndex(WRITE); releaseEMEntryTable(WRITE); throw; } @@ -1473,6 +1784,7 @@ void ExtentMap::load(const string& filename, bool fixFL) { log_errno("ExtentMap::load(): open"); releaseFreeList(WRITE); + releaseEMIndex(WRITE); releaseEMEntryTable(WRITE); throw ios_base::failure("ExtentMap::load(): open failed. Check the error log."); } @@ -1485,13 +1797,14 @@ void ExtentMap::load(const string& filename, bool fixFL) catch (...) { releaseFreeList(WRITE); + releaseEMIndex(WRITE); releaseEMEntryTable(WRITE); throw; } releaseFreeList(WRITE); + releaseEMIndex(WRITE); releaseEMEntryTable(WRITE); - // checkConsistency(); } // This is a quick workaround, to be able to initialize initial system tables @@ -1535,11 +1848,13 @@ void ExtentMap::loadFromBinaryBlob(const char* blob) catch (...) { releaseFreeList(WRITE); + releaseEMIndex(WRITE); releaseEMEntryTable(WRITE); throw; } releaseFreeList(WRITE); + releaseEMIndex(WRITE); releaseEMEntryTable(WRITE); } @@ -1586,6 +1901,7 @@ void ExtentMap::save(const string& filename) int allocdSize, loadSize[3], i; grabEMEntryTable(READ); + grabEMIndex(READ); try { @@ -1593,6 +1909,7 @@ void ExtentMap::save(const string& filename) } catch (...) 
{ + releaseEMIndex(READ); releaseEMEntryTable(READ); throw; } @@ -1601,6 +1918,7 @@ void ExtentMap::save(const string& filename) { log("ExtentMap::save(): got request to save an empty BRM"); releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); throw runtime_error("ExtentMap::save(): got request to save an empty BRM"); } @@ -1613,6 +1931,7 @@ void ExtentMap::save(const string& filename) { log_errno("ExtentMap::save(): open"); releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); throw ios_base::failure("ExtentMap::save(): open failed. Check the error log."); } @@ -1634,6 +1953,7 @@ void ExtentMap::save(const string& filename) catch (...) { releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); throw; } @@ -1659,6 +1979,7 @@ void ExtentMap::save(const string& filename) if (err < 0) { releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); throw ios_base::failure("ExtentMap::save(): write failed. Check the error log."); } @@ -1678,6 +1999,7 @@ void ExtentMap::save(const string& filename) if (err < 0) { releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); throw ios_base::failure("ExtentMap::save(): write failed. Check the error log."); } @@ -1685,9 +2007,6 @@ void ExtentMap::save(const string& filename) } } - // allocdSize = fFLShminfo->allocdSize / sizeof(InlineLBIDRange); - // const int inlineLbidRangeSize = sizeof(InlineLBIDRange); - progress = 0; writeSize = fFLShminfo->allocdSize; char* writePos = (char*)fFreeList; @@ -1697,6 +2016,7 @@ void ExtentMap::save(const string& filename) if (err < 0) { releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); throw ios_base::failure("ExtentMap::save(): write failed. 
Check the error log."); } @@ -1705,6 +2025,7 @@ void ExtentMap::save(const string& filename) } releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); } @@ -1714,7 +2035,9 @@ void ExtentMap::grabEMEntryTable(OPS op) boost::mutex::scoped_lock lk(mutex); if (op == READ) + { fEMShminfo = fMST.getTable_read(MasterSegmentTable::EMTable); + } else { fEMShminfo = fMST.getTable_write(MasterSegmentTable::EMTable); @@ -1723,9 +2046,9 @@ void ExtentMap::grabEMEntryTable(OPS op) if (!fPExtMapImpl || fPExtMapImpl->key() != (unsigned)fEMShminfo->tableShmkey) { - if (fExtentMap != NULL) + if (fExtentMap != nullptr) { - fExtentMap = NULL; + fExtentMap = nullptr; } if (fEMShminfo->allocdSize == 0) @@ -1736,17 +2059,22 @@ void ExtentMap::grabEMEntryTable(OPS op) emLocked = true; if (fEMShminfo->allocdSize == 0) + { growEMShmseg(); + } emLocked = false; // has to be done holding the write lock fMST.getTable_downgrade(MasterSegmentTable::EMTable); } else + { growEMShmseg(); + } } else { fPExtMapImpl = ExtentMapImpl::makeExtentMapImpl(fEMShminfo->tableShmkey, 0); + ASSERT(fPExtMapImpl); if (r_only) @@ -1754,7 +2082,7 @@ void ExtentMap::grabEMEntryTable(OPS op) fExtentMap = fPExtMapImpl->get(); - if (fExtentMap == NULL) + if (fExtentMap == nullptr) { log_errno("ExtentMap::grabEMEntryTable(): shmat"); throw runtime_error("ExtentMap::grabEMEntryTable(): shmat failed. 
Check the error log."); @@ -1762,7 +2090,9 @@ void ExtentMap::grabEMEntryTable(OPS op) } } else + { fExtentMap = fPExtMapImpl->get(); + } } /* always returns holding the FL lock */ @@ -1783,9 +2113,9 @@ void ExtentMap::grabFreeList(OPS op) if (!fPFreeListImpl || fPFreeListImpl->key() != (unsigned)fFLShminfo->tableShmkey) { - if (fFreeList != NULL) + if (fFreeList != nullptr) { - fFreeList = NULL; + fFreeList = nullptr; } if (fFLShminfo->allocdSize == 0) @@ -1815,7 +2145,7 @@ void ExtentMap::grabFreeList(OPS op) fFreeList = fPFreeListImpl->get(); - if (fFreeList == NULL) + if (fFreeList == nullptr) { log_errno("ExtentMap::grabFreeList(): shmat"); throw runtime_error("ExtentMap::grabFreeList(): shmat failed. Check the error log."); @@ -1834,10 +2164,66 @@ void ExtentMap::grabFreeList(OPS op) } } +void ExtentMap::grabEMIndex(OPS op) +{ + boost::mutex::scoped_lock lk(emIndexMutex); + + if (op == READ) + { + fEMIndexShminfo = fMST.getTable_read(MasterSegmentTable::EMIndex); + } + else + { + fEMIndexShminfo = fMST.getTable_write(MasterSegmentTable::EMIndex); + emIndexLocked = true; + } + + if (!fPExtMapIndexImpl_) + { + if (fEMIndexShminfo->allocdSize == 0) + { + if (op == READ) + { + fMST.getTable_upgrade(MasterSegmentTable::EMIndex); + emIndexLocked = true; + + // Checking race conditions + if (fEMIndexShminfo->allocdSize == 0) + growEMIndexShmseg(); + + emIndexLocked = false; + fMST.getTable_downgrade(MasterSegmentTable::EMIndex); + } + else + { + growEMIndexShmseg(); + } + } + else + { + // Sending down current Managed Shmem size. If EMIndexImpl instance size doesn't match + // fEMIndexShminfo->allocdSize makeExtentMapIndexImpl will remap managed shmem segment. 
+ fPExtMapIndexImpl_ = + ExtentMapIndexImpl::makeExtentMapIndexImpl(getInitialEMIndexShmkey(), fEMIndexShminfo->allocdSize); + + if (r_only) + fPExtMapIndexImpl_->makeReadOnly(); + } + } + else if (fPExtMapIndexImpl_->getShmemImplSize() != (unsigned)fEMIndexShminfo->allocdSize) + { + fPExtMapIndexImpl_->refreshShm(); + fPExtMapIndexImpl_ = + ExtentMapIndexImpl::makeExtentMapIndexImpl(getInitialEMIndexShmkey(), fEMIndexShminfo->allocdSize); + } +} + void ExtentMap::releaseEMEntryTable(OPS op) { if (op == READ) + { fMST.releaseTable_read(MasterSegmentTable::EMTable); + } else { /* @@ -1863,32 +2249,50 @@ void ExtentMap::releaseFreeList(OPS op) } } +void ExtentMap::releaseEMIndex(OPS op) +{ + if (op == READ) + { + fMST.releaseTable_read(MasterSegmentTable::EMIndex); + } + else + { + emIndexLocked = false; + fMST.releaseTable_write(MasterSegmentTable::EMIndex); + } +} + key_t ExtentMap::chooseEMShmkey() { - int fixedKeys = 1; - key_t ret; - - if (fEMShminfo->tableShmkey + 1 == (key_t)(fShmKeys.KEYRANGE_EXTENTMAP_BASE + fShmKeys.KEYRANGE_SIZE - 1) || - (unsigned)fEMShminfo->tableShmkey < fShmKeys.KEYRANGE_EXTENTMAP_BASE) - ret = fShmKeys.KEYRANGE_EXTENTMAP_BASE + fixedKeys; - else - ret = fEMShminfo->tableShmkey + 1; - - return ret; + return chooseShmkey(fEMShminfo, fShmKeys.KEYRANGE_EXTENTMAP_BASE); } key_t ExtentMap::chooseFLShmkey() { - int fixedKeys = 1, ret; + return chooseShmkey(fFLShminfo, fShmKeys.KEYRANGE_EMFREELIST_BASE); +} - if (fFLShminfo->tableShmkey + 1 == - (key_t)(fShmKeys.KEYRANGE_EMFREELIST_BASE + fShmKeys.KEYRANGE_SIZE - 1) || - (unsigned)fFLShminfo->tableShmkey < fShmKeys.KEYRANGE_EMFREELIST_BASE) - ret = fShmKeys.KEYRANGE_EMFREELIST_BASE + fixedKeys; - else - ret = fFLShminfo->tableShmkey + 1; +// The key value is fixed b/c MCS doesn't need to increase a segment id number +key_t ExtentMap::chooseEMIndexShmkey() +{ + return chooseShmkey(fEMIndexShminfo, fShmKeys.KEYRANGE_EXTENTMAP_INDEX_BASE); +} - return ret; +key_t
ExtentMap::getInitialEMIndexShmkey() const +{ + return fShmKeys.KEYRANGE_EXTENTMAP_INDEX_BASE + 1; +} + +key_t ExtentMap::chooseShmkey(const MSTEntry* masterTableEntry, const uint32_t keyRangeBase) const +{ + int fixedKeys = 1; + + if (masterTableEntry->tableShmkey + 1 == (key_t)(keyRangeBase + fShmKeys.KEYRANGE_SIZE - 1) || + (unsigned)masterTableEntry->tableShmkey < keyRangeBase) + { + return keyRangeBase + fixedKeys; + } + return masterTableEntry->tableShmkey + 1; } /* Must be called holding the EM write lock @@ -1927,6 +2331,31 @@ void ExtentMap::growEMShmseg(size_t nrows) fExtentMap = fPExtMapImpl->get(); } +void ExtentMap::growEMIndexShmseg(const size_t suggestedSize) +{ + static const constexpr int InitEMIndexSize_ = 16 * 1024 * 1024; + size_t allocSize = std::max(InitEMIndexSize_, fEMIndexShminfo->allocdSize); + key_t newshmkey = chooseEMIndexShmkey(); + key_t fixedManagedSegmentKey = getInitialEMIndexShmkey(); + + allocSize = std::max(allocSize, suggestedSize); + if (!fPExtMapIndexImpl_) + { + fPExtMapIndexImpl_ = + ExtentMapIndexImpl::makeExtentMapIndexImpl(fixedManagedSegmentKey, allocSize, r_only); + } + else + { + fPExtMapIndexImpl_->growIfNeeded(allocSize); + } + + if (r_only) + fPExtMapIndexImpl_->makeReadOnly(); + + fEMIndexShminfo->tableShmkey = newshmkey; + fEMIndexShminfo->allocdSize = allocSize; +} + /* Must be called holding the FL lock Returns with the new shmseg mapped */ void ExtentMap::growFLShmseg() @@ -1986,7 +2415,6 @@ int ExtentMap::lookup(LBID_t lbid, LBID_t& firstLbid, LBID_t& lastLbid) #ifdef BRM_DEBUG - // printEM(); if (lbid < 0) { log("ExtentMap::lookup(): lbid must be >= 0", logging::LOG_TYPE_DEBUG); @@ -1997,6 +2425,7 @@ int ExtentMap::lookup(LBID_t lbid, LBID_t& firstLbid, LBID_t& lastLbid) #endif grabEMEntryTable(READ); + grabEMIndex(READ); entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (i = 0; i < entries; i++) @@ -2009,12 +2438,13 @@ int ExtentMap::lookup(LBID_t lbid, LBID_t& firstLbid, LBID_t& lastLbid) { 
firstLbid = fExtentMap[i].range.start; lastLbid = lastBlock; + releaseEMIndex(READ); releaseEMEntryTable(READ); return 0; } } } - + releaseEMIndex(READ); releaseEMEntryTable(READ); return -1; } @@ -2063,6 +2493,7 @@ int ExtentMap::lookupLocal(LBID_t lbid, int& OID, uint16_t& dbRoot, uint32_t& pa } grabEMEntryTable(READ); + grabEMIndex(READ); entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -2083,12 +2514,13 @@ int ExtentMap::lookupLocal(LBID_t lbid, int& OID, uint16_t& dbRoot, uint32_t& pa offset = lbid - fExtentMap[i].range.start; fileBlockOffset = fExtentMap[i].blockOffset + offset; + releaseEMIndex(READ); releaseEMEntryTable(READ); return 0; } } } - + releaseEMIndex(READ); releaseEMEntryTable(READ); return -1; } @@ -2110,7 +2542,7 @@ int ExtentMap::lookupLocal(int OID, uint32_t partitionNum, uint16_t segmentNum, } #endif - int entries, i, offset; + int offset; if (OID < 0) { @@ -2119,25 +2551,31 @@ int ExtentMap::lookupLocal(int OID, uint32_t partitionNum, uint16_t segmentNum, } grabEMEntryTable(READ); + grabEMIndex(READ); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); + DBRootVec dbRootVec(getAllDbRoots()); - for (i = 0; i < entries; i++) + for (auto dbRoot : dbRootVec) { - // TODO: Blockoffset logic. - if (fExtentMap[i].range.size != 0 && fExtentMap[i].fileID == OID && - fExtentMap[i].partitionNum == partitionNum && fExtentMap[i].segmentNum == segmentNum && - fExtentMap[i].blockOffset <= fileBlockOffset && - fileBlockOffset <= - (fExtentMap[i].blockOffset + (static_cast(fExtentMap[i].range.size) * 1024) - 1)) + auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + for (auto i : emIdents) { - offset = fileBlockOffset - fExtentMap[i].blockOffset; - LBID = fExtentMap[i].range.start + offset; - releaseEMEntryTable(READ); - return 0; + // TODO: Blockoffset logic. 
+ if (fExtentMap[i].range.size != 0 && fExtentMap[i].segmentNum == segmentNum && + fExtentMap[i].blockOffset <= fileBlockOffset && + fileBlockOffset <= + (fExtentMap[i].blockOffset + (static_cast(fExtentMap[i].range.size) * 1024) - 1)) + { + offset = fileBlockOffset - fExtentMap[i].blockOffset; + LBID = fExtentMap[i].range.start + offset; + releaseEMIndex(READ); + releaseEMEntryTable(READ); + return 0; + } } } + releaseEMIndex(READ); releaseEMEntryTable(READ); return -1; } @@ -2168,6 +2606,7 @@ int ExtentMap::lookupLocal_DBroot(int OID, uint16_t dbroot, uint32_t partitionNu } grabEMEntryTable(READ); + grabEMIndex(READ); entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -2182,11 +2621,13 @@ int ExtentMap::lookupLocal_DBroot(int OID, uint16_t dbroot, uint32_t partitionNu { offset = fileBlockOffset - fExtentMap[i].blockOffset; LBID = fExtentMap[i].range.start + offset; + releaseEMIndex(READ); releaseEMEntryTable(READ); return 0; } } + releaseEMIndex(READ); releaseEMEntryTable(READ); return -1; } @@ -2214,7 +2655,6 @@ int ExtentMap::lookupLocalStartLbid(int OID, uint32_t partitionNum, uint16_t seg } #endif - int entries, i; if (OID < 0) { @@ -2225,22 +2665,29 @@ int ExtentMap::lookupLocalStartLbid(int OID, uint32_t partitionNum, uint16_t seg } grabEMEntryTable(READ); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); + grabEMIndex(READ); - for (i = 0; i < entries; i++) + DBRootVec dbRootVec(getAllDbRoots()); + + for (auto dbRoot : dbRootVec) { - if (fExtentMap[i].range.size != 0 && fExtentMap[i].fileID == OID && - fExtentMap[i].partitionNum == partitionNum && fExtentMap[i].segmentNum == segmentNum && - fExtentMap[i].blockOffset <= fileBlockOffset && - fileBlockOffset <= - (fExtentMap[i].blockOffset + (static_cast(fExtentMap[i].range.size) * 1024) - 1)) + auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + for (auto i : emIdents) { - LBID = fExtentMap[i].range.start; - releaseEMEntryTable(READ); - return 0; + if 
(fExtentMap[i].range.size != 0 && fExtentMap[i].segmentNum == segmentNum && + fExtentMap[i].blockOffset <= fileBlockOffset && + fileBlockOffset <= + (fExtentMap[i].blockOffset + (static_cast(fExtentMap[i].range.size) * 1024) - 1)) + { + LBID = fExtentMap[i].range.start; + releaseEMIndex(READ); + releaseEMEntryTable(READ); + return 0; + } } } + releaseEMIndex(READ); releaseEMEntryTable(READ); return -1; @@ -2270,6 +2717,7 @@ void ExtentMap::createStripeColumnExtents(const vectorcurrentSize == fEMShminfo->allocdSize) + { growEMShmseg(); + } // size is the number of multiples of 1024 blocks. // ex: size=1 --> 1024 blocks @@ -2414,11 +2865,11 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co uint32_t& partitionNum, uint16_t& segmentNum, uint32_t& startBlockOffset) { - int emptyEMEntry = -1; - int lastExtentIndex = -1; - uint32_t highestOffset = 0; - uint32_t highestPartNum = 0; - uint16_t highestSegNum = 0; + EmptyEMEntry emptyEMEntry = -1; + LastExtentIndexT lastExtentIndex = -1; + HighestOffset highestOffset = 0; + PartitionNumberT highestPartNum = 0; + SegmentT highestSegNum = 0; const unsigned FILES_PER_COL_PART = getFilesPerColumnPartition(); const unsigned EXTENT_ROWS = getExtentRows(); const unsigned EXTENTS_PER_SEGFILE = getExtentsPerSegmentFile(); @@ -2426,9 +2877,9 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co // Variables that track list of segfiles in target (HWM) DBRoot & partition. 
// Map segment number to the highest fbo extent in each file - typedef tr1::unordered_map TargetDbRootSegsMap; - typedef TargetDbRootSegsMap::iterator TargetDbRootSegsMapIter; - typedef TargetDbRootSegsMap::const_iterator TargetDbRootSegsMapConstIter; + using TargetDbRootSegsMap = tr1::unordered_map; + using TargetDbRootSegsMapIter = TargetDbRootSegsMap::iterator; + using TargetDbRootSegsMapConstIter = TargetDbRootSegsMap::const_iterator; TargetDbRootSegsMap targetDbRootSegs; uint32_t highEmptySegNum = 0; // high seg num for user specified partition; @@ -2441,50 +2892,74 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co // 2. if DBRoot is empty, track highest seg num in user specified partition // 3. Find first unused extent map entry //-------------------------------------------------------------------------- - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - LBID_t startLBID = getLBIDsFromFreeList(size); + auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID); // Find the first empty Entry; and find last extent for this OID and dbRoot - for (int i = 0; i < emEntries; i++) + + for (auto i : emIdents) { if (fExtentMap[i].range.size != 0) { - if (fExtentMap[i].fileID == OID) + // 1. Find HWM extent in relevant DBRoot + if ((fExtentMap[i].partitionNum > highestPartNum) || + ((fExtentMap[i].partitionNum == highestPartNum) && (fExtentMap[i].blockOffset > highestOffset)) || + ((fExtentMap[i].partitionNum == highestPartNum) && (fExtentMap[i].blockOffset == highestOffset) && + (fExtentMap[i].segmentNum >= highestSegNum))) { - // 1. 
Find HWM extent in relevant DBRoot - if (fExtentMap[i].dbRoot == dbRoot) - { - if ((fExtentMap[i].partitionNum > highestPartNum) || - ((fExtentMap[i].partitionNum == highestPartNum) && - (fExtentMap[i].blockOffset > highestOffset)) || - ((fExtentMap[i].partitionNum == highestPartNum) && - (fExtentMap[i].blockOffset == highestOffset) && (fExtentMap[i].segmentNum >= highestSegNum))) - { - lastExtentIndex = i; - highestPartNum = fExtentMap[i].partitionNum; - highestSegNum = fExtentMap[i].segmentNum; - highestOffset = fExtentMap[i].blockOffset; - } - } - - // 2. for empty DBRoot track hi seg# in user specified part# - if ((lastExtentIndex == -1) && (fExtentMap[i].partitionNum == partitionNum)) - { - if ((fExtentMap[i].segmentNum > highEmptySegNum) || (!bHighEmptySegNumSet)) - { - highEmptySegNum = fExtentMap[i].segmentNum; - bHighEmptySegNumSet = true; - } - } - } // found extentmap entry for specified OID - } // found valid extentmap entry + lastExtentIndex = i; + highestPartNum = fExtentMap[i].partitionNum; + highestSegNum = fExtentMap[i].segmentNum; + highestOffset = fExtentMap[i].blockOffset; + } + } // found valid extentmap entry // 3. Find first available extent map entry that can be reused else if (emptyEMEntry < 0) emptyEMEntry = i; } // Loop through extent map entries + DBRootVec dbRootVec(getAllDbRoots()); + // 2. 
for empty DBRoot track hi seg# in user specified part# + if (lastExtentIndex == -1) + { + // loop over all extents that doesn't belong to the target dbroot + for (auto dbRootFromList : dbRootVec) + { + if (dbRootFromList == dbRoot) + continue; + + auto emIdentsLocal = fPExtMapIndexImpl_->find(dbRootFromList, OID, partitionNum); + for (auto i : emIdentsLocal) + { + if ((fExtentMap[i].range.size != 0) && + ((fExtentMap[i].segmentNum > highEmptySegNum) || (!bHighEmptySegNumSet))) + { + highEmptySegNum = fExtentMap[i].segmentNum; + bHighEmptySegNumSet = true; + } + + // Search for the first empty Entry + if (fExtentMap[i].range.size == 0) + { + emptyEMEntry = i; + break; + } + } + } + } + + size_t emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); + // Search for the first empty Entry + for (size_t i = 0; emptyEMEntry < 0 && i < emEntries; ++i) + { + if (fExtentMap[i].range.size == 0) + { + emptyEMEntry = i; + break; + } + } + if (emptyEMEntry == -1) { ostringstream oss; @@ -2516,64 +2991,73 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co int partHighSeg = -1; // hi seg num for last partition int partHighSegNext = -1; // hi seg num for next partition + // Target dbroot has extents for the OID if (lastExtentIndex >= 0) { - uint32_t targetDbRootPart = fExtentMap[lastExtentIndex].partitionNum; - uint32_t targetDbRootPartNext = targetDbRootPart + 1; + PartitionNumberT targetDbRootPart = fExtentMap[lastExtentIndex].partitionNum; + PartitionNumberT targetDbRootPartNext = targetDbRootPart + 1; partHighSeg = fExtentMap[lastExtentIndex].segmentNum; targetDbRootSegs.insert(TargetDbRootSegsMap::value_type(fExtentMap[lastExtentIndex].segmentNum, fExtentMap[lastExtentIndex].blockOffset)); - for (int i = 0; i < emEntries; i++) + for (auto dbRootFromList : dbRootVec) { - if (fExtentMap[i].range.size != 0) + if (dbRootFromList == dbRoot) { - if (fExtentMap[i].fileID == OID) + auto emIdents = fPExtMapIndexImpl_->find(dbRootFromList, OID, 
targetDbRootPart); + for (auto i : emIdents) { - // 4. Track hi seg for hwm+1 partition - if (fExtentMap[i].partitionNum == targetDbRootPartNext) - { - if (fExtentMap[i].segmentNum > partHighSegNext) - { - partHighSegNext = fExtentMap[i].segmentNum; - } - } - // 5. Track hi seg for hwm partition - else if (fExtentMap[i].partitionNum == targetDbRootPart) + if (fExtentMap[i].segmentNum > partHighSeg) { - if (fExtentMap[i].segmentNum > partHighSeg) + partHighSeg = fExtentMap[i].segmentNum; + } + + // 6. Save list of seg files in target DBRoot/Partition, + // along with the highest fbo for each seg file + if (fExtentMap[i].status == EXTENTOUTOFSERVICE) + bSegsOutOfService = true; + + TargetDbRootSegsMapIter iter = targetDbRootSegs.find(fExtentMap[i].segmentNum); + + if (iter == targetDbRootSegs.end()) + { + targetDbRootSegs.insert( + TargetDbRootSegsMap::value_type(fExtentMap[i].segmentNum, fExtentMap[i].blockOffset)); + } + else + { + if (fExtentMap[i].blockOffset > iter->second) { - partHighSeg = fExtentMap[i].segmentNum; - } - - // 6. Save list of seg files in target DBRoot/Partition, - // along with the highest fbo for each seg file - if (fExtentMap[i].dbRoot == dbRoot) - { - if (fExtentMap[i].status == EXTENTOUTOFSERVICE) - bSegsOutOfService = true; - - TargetDbRootSegsMapIter iter = targetDbRootSegs.find(fExtentMap[i].segmentNum); - - if (iter == targetDbRootSegs.end()) - { - targetDbRootSegs.insert( - TargetDbRootSegsMap::value_type(fExtentMap[i].segmentNum, fExtentMap[i].blockOffset)); - } - else - { - if (fExtentMap[i].blockOffset > iter->second) - { - iter->second = fExtentMap[i].blockOffset; - } - } + iter->second = fExtentMap[i].blockOffset; } } - } // found extentmap entry for specified OID - } // found valid extentmap entry - } // loop through extent map entries - } // (lastExtentIndex >= 0) + } // loop over em idents + } // current dbroot == target dbroot + else + { + // 4. 
Track hi seg for hwm+1 partition + auto emIdentsNext = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPartNext); + for (auto i : emIdentsNext) + { + if (fExtentMap[i].segmentNum > partHighSegNext) + { + partHighSegNext = fExtentMap[i].segmentNum; + } + } + + // 5. Track hi seg for hwm partition + auto emIdents = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPart); + for (auto i : emIdents) + { + if (fExtentMap[i].segmentNum > partHighSeg) + { + partHighSeg = fExtentMap[i].segmentNum; + } + } + } // current dbroot != target dbroot + } // loop over dbroots + } // (lastExtentIndex >= 0) //-------------------------------------------------------------------------- // Third Step: Select partition and segment number for new extent @@ -2837,6 +3321,13 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co makeUndoRecord(fEMShminfo, sizeof(MSTEntry)); fEMShminfo->currentSize += sizeof(struct EMEntry); + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[emptyEMEntry], emptyEMEntry); + + if (resShmemHasGrownPair.second) + fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); + + if (!resShmemHasGrownPair.first) + logAndSetEMIndexReadOnly("_createColumnExtent_DBroot"); return startLBID; } @@ -2893,10 +3384,13 @@ void ExtentMap::createColumnExtentExactFile(int OID, uint32_t colWidth, uint16_t // extentRows should be multiple of blocksize (8192). const unsigned EXTENT_SIZE = (getExtentRows() * colWidth) / BLOCK_SIZE; grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); if (fEMShminfo->currentSize == fEMShminfo->allocdSize) + { growEMShmseg(); + } // size is the number of multiples of 1024 blocks. 
// ex: size=1 --> 1024 blocks @@ -2910,6 +3404,59 @@ void ExtentMap::createColumnExtentExactFile(int OID, uint32_t colWidth, uint16_t allocdsize = EXTENT_SIZE; } +LastIndEmptyIndEmptyInd ExtentMap::_createExtentCommonSearch(const OID_t OID, const DBRootT dbRoot, + const PartitionNumberT partitionNum, + const SegmentT segmentNum) +{ + EmptyEMEntry emptyEMEntry = -1; + LastExtentIndexT lastExtentIndex = -1; + HighestOffset highestOffset = 0; + + size_t emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); + + auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + // DRRTUY we might need to use cache preload here. + // Search of the last extent idx and the highest offset + for (auto i : emIdents) + { + if (fExtentMap[i].range.size != 0) + { + if ((fExtentMap[i].segmentNum == segmentNum) && (fExtentMap[i].blockOffset >= highestOffset)) + { + lastExtentIndex = i; + highestOffset = fExtentMap[i].blockOffset; + } + } + // Search for the first empty Entry + else if (emptyEMEntry < 0) + emptyEMEntry = i; + } + + // Search for the first empty Entry + // DRRTUY We might need to support empty EM ids vector + for (size_t i = 0; emptyEMEntry < 0 && i < emEntries; ++i) + { + if (fExtentMap[i].range.size == 0) + { + emptyEMEntry = i; + break; + } + } + return {lastExtentIndex, emptyEMEntry}; +} + +void ExtentMap::logAndSetEMIndexReadOnly(const std::string& funcName) +{ + fPExtMapIndexImpl_->makeReadOnly(); + ostringstream os; + os << "ExtentMap::" << funcName << ": " + << "Can not update EM Index. EM Index shmem segment is set to" + << " readonly. Please restart Columnstore."; + log(os.str(), logging::LOG_TYPE_CRITICAL); + + throw logic_error(os.str()); +} + //------------------------------------------------------------------------------ // Creates an extent for the exact segment file specified by the requested // OID, DBRoot, partition, and segment. 
This is the internal implementation @@ -2933,32 +3480,9 @@ LBID_t ExtentMap::_createColumnExtentExactFile(uint32_t size, int OID, uint32_t execplan::CalpontSystemCatalog::ColDataType colDataType, uint32_t& startBlockOffset) { - int emptyEMEntry = -1; - int lastExtentIndex = -1; - uint32_t highestOffset = 0; - - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - LBID_t startLBID = getLBIDsFromFreeList(size); - - // Find the first empty Entry; and find the last extent for this - // combination of OID, partition, and segment. - for (int i = 0; i < emEntries; i++) - { - if (fExtentMap[i].range.size != 0) - { - if (fExtentMap[i].fileID == OID) - { - if ((fExtentMap[i].dbRoot == dbRoot) && (fExtentMap[i].partitionNum == partitionNum) && - (fExtentMap[i].segmentNum == segmentNum) && (fExtentMap[i].blockOffset >= highestOffset)) - { - lastExtentIndex = i; - highestOffset = fExtentMap[i].blockOffset; - } - } - } - else if (emptyEMEntry < 0) - emptyEMEntry = i; - } // Loop through extent map entries + auto lastIndEmptyIndEmptyInd = _createExtentCommonSearch(OID, dbRoot, partitionNum, segmentNum); + LastExtentIndexT lastExtentIndex = lastIndEmptyIndEmptyInd.first; + EmptyEMEntry emptyEMEntry = lastIndEmptyIndEmptyInd.second; if (emptyEMEntry == -1) { @@ -2973,6 +3497,7 @@ LBID_t ExtentMap::_createColumnExtentExactFile(uint32_t size, int OID, uint32_t makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry)); EMEntry* e = &fExtentMap[emptyEMEntry]; + LBID_t startLBID = getLBIDsFromFreeList(size); e->range.start = startLBID; e->range.size = size; e->fileID = OID; @@ -3044,6 +3569,13 @@ LBID_t ExtentMap::_createColumnExtentExactFile(uint32_t size, int OID, uint32_t makeUndoRecord(fEMShminfo, sizeof(MSTEntry)); fEMShminfo->currentSize += sizeof(struct EMEntry); + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[emptyEMEntry], emptyEMEntry); + + if (resShmemHasGrownPair.second) + fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); + 
+ if (!resShmemHasGrownPair.first) + logAndSetEMIndexReadOnly("_createColumnExtentExactFile"); return startLBID; } @@ -3094,10 +3626,13 @@ void ExtentMap::createDictStoreExtent(int OID, uint16_t dbRoot, uint32_t partiti const unsigned EXTENT_SIZE = (getExtentRows() * DICT_COL_WIDTH) / BLOCK_SIZE; grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); if (fEMShminfo->currentSize == fEMShminfo->allocdSize) + { growEMShmseg(); + } // size is the number of multiples of 1024 blocks. // ex: size=1 --> 1024 blocks @@ -3127,30 +3662,9 @@ void ExtentMap::createDictStoreExtent(int OID, uint16_t dbRoot, uint32_t partiti LBID_t ExtentMap::_createDictStoreExtent(uint32_t size, int OID, uint16_t dbRoot, uint32_t partitionNum, uint16_t segmentNum) { - int emptyEMEntry = -1; - int lastExtentIndex = -1; - uint32_t highestOffset = 0; - - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - LBID_t startLBID = getLBIDsFromFreeList(size); - - // Find the first empty Entry; and find the last extent for this - // combination of OID, partition, and segment. 
- for (int i = 0; i < emEntries; i++) - { - if (fExtentMap[i].range.size != 0) - { - if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].partitionNum == partitionNum) && - (fExtentMap[i].segmentNum == segmentNum) && (fExtentMap[i].blockOffset >= highestOffset)) - { - lastExtentIndex = i; - highestOffset = fExtentMap[i].blockOffset; - } - } - else if (emptyEMEntry < 0) - emptyEMEntry = i; - } // Loop through extent map entries + auto lastIndEmptyIndEmptyInd = _createExtentCommonSearch(OID, dbRoot, partitionNum, segmentNum); + LastExtentIndexT lastExtentIndex = lastIndEmptyIndEmptyInd.first; + EmptyEMEntry emptyEMEntry = lastIndEmptyIndEmptyInd.second; if (emptyEMEntry == -1) { @@ -3165,6 +3679,7 @@ LBID_t ExtentMap::_createDictStoreExtent(uint32_t size, int OID, uint16_t dbRoot makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry)); EMEntry* e = &fExtentMap[emptyEMEntry]; + LBID_t startLBID = getLBIDsFromFreeList(size); e->range.start = startLBID; e->range.size = size; e->fileID = OID; @@ -3201,6 +3716,13 @@ LBID_t ExtentMap::_createDictStoreExtent(uint32_t size, int OID, uint16_t dbRoot makeUndoRecord(fEMShminfo, sizeof(MSTEntry)); fEMShminfo->currentSize += sizeof(struct EMEntry); + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[emptyEMEntry], emptyEMEntry); + + if (resShmemHasGrownPair.second) + fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); + + if (!resShmemHasGrownPair.first) + logAndSetEMIndexReadOnly("_createDictStoreExtent"); return startLBID; } @@ -3364,6 +3886,7 @@ void ExtentMap::rollbackColumnExtents_DBroot(int oid, bool bDeleteAll, uint16_t uint32_t fboLoPreviousStripe = 0; grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -3562,6 +4085,7 @@ void ExtentMap::rollbackDictStoreExtents_DBroot(int oid, uint16_t dbRoot, uint32 tr1::unordered_map >::const_iterator segToHwmMapIter; grabEMEntryTable(WRITE); + grabEMIndex(WRITE); 
grabFreeList(WRITE); int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -3688,6 +4212,7 @@ void ExtentMap::deleteEmptyColExtents(const ExtentsInfoMap_t& extentsInfo) #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); uint32_t fboLo = 0; @@ -3823,6 +4348,7 @@ void ExtentMap::deleteEmptyDictStoreExtents(const ExtentsInfoMap_t& extentsInfo) #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); ExtentsInfoMap_t::const_iterator it; @@ -3953,8 +4479,15 @@ void ExtentMap::deleteOID(int OID) #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); + // Clean up the index and tell deleteExtent to skip the clean-up. + DBRootVec dbRootVec(getAllDbRoots()); + for (auto dbRoot : dbRootVec) + fPExtMapIndexImpl_->deleteOID(dbRoot, OID); + const bool clearEMIndex = false; + int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (int emIndex = 0; emIndex < emEntries; emIndex++) @@ -3962,8 +4495,7 @@ void ExtentMap::deleteOID(int OID) if (fExtentMap[emIndex].range.size > 0 && fExtentMap[emIndex].fileID == OID) { OIDExists = true; - - deleteExtent(emIndex); + deleteExtent(emIndex, clearEMIndex); } } @@ -3991,10 +4523,20 @@ void ExtentMap::deleteOIDs(const OidsMap_t& OIDs) #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); + OidsMap_t::const_iterator it; int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); + const bool clearEMIndex = false; + DBRootVec dbRootVec(getAllDbRoots()); + for (auto dbRoot : dbRootVec) + { + for (auto& oidOidPair : OIDs) + fPExtMapIndexImpl_->deleteOID(dbRoot, oidOidPair.first); + } + for (int emIndex = 0; emIndex < emEntries; emIndex++) { if (fExtentMap[emIndex].range.size > 0) @@ -4002,7 +4544,7 @@ void ExtentMap::deleteOIDs(const OidsMap_t& OIDs) it = OIDs.find(fExtentMap[emIndex].fileID); if (it != OIDs.end()) - deleteExtent(emIndex); + deleteExtent(emIndex, clearEMIndex); } } } @@ -4011,7 +4553,7 @@ void 
ExtentMap::deleteOIDs(const OidsMap_t& OIDs) // Delete the specified extent from the extentmap and return to the free list. // emIndex - the index (from the extent map) of the extent to be deleted //------------------------------------------------------------------------------ -void ExtentMap::deleteExtent(int emIndex) +void ExtentMap::deleteExtent(const int emIndex, const bool clearEMIndex) { int flIndex, freeFLIndex, flEntries, preceedingExtent, succeedingExtent; LBID_t flBlockEnd, emBlockEnd; @@ -4134,6 +4676,8 @@ void ExtentMap::deleteExtent(int emIndex) // invalidate the entry in the Extent Map makeUndoRecord(&fExtentMap[emIndex], sizeof(EMEntry)); fExtentMap[emIndex].range.size = 0; + if (clearEMIndex) + fPExtMapIndexImpl_->deleteEMEntry(fExtentMap[emIndex], emIndex); makeUndoRecord(&fEMShminfo, sizeof(MSTEntry)); fEMShminfo->currentSize -= sizeof(struct EMEntry); } @@ -4183,6 +4727,7 @@ HWM_t ExtentMap::getLastHWM_DBroot(int OID, uint16_t dbRoot, uint32_t& partition } grabEMEntryTable(READ); + grabEMIndex(READ); // Searching the array in reverse order should be faster since the last // extent is usually at the bottom. We still have to search the entire @@ -4217,6 +4762,7 @@ HWM_t ExtentMap::getLastHWM_DBroot(int OID, uint16_t dbRoot, uint32_t& partition bFound = true; } + releaseEMIndex(READ); releaseEMEntryTable(READ); return hwm; @@ -4252,14 +4798,14 @@ void ExtentMap::getDbRootHWMInfo(int OID, uint16_t pmNumber, EmDbRootHWMInfo_v& // Determine List of DBRoots for specified PM, and construct map of // EmDbRootHWMInfo objects. 
tr1::unordered_map emDbRootMap; - vector dbRootList; - getPmDbRoots(pmNumber, dbRootList); + vector dbRootVec; + getPmDbRoots(pmNumber, dbRootVec); - if (dbRootList.size() > 0) + if (dbRootVec.size() > 0) { - for (unsigned int iroot = 0; iroot < dbRootList.size(); iroot++) + for (unsigned int iroot = 0; iroot < dbRootVec.size(); iroot++) { - uint16_t rootID = dbRootList[iroot]; + uint16_t rootID = dbRootVec[iroot]; EmDbRootHWMInfo emDbRootInfo(rootID); emDbRootMap[rootID] = emDbRootInfo; } @@ -4275,47 +4821,52 @@ void ExtentMap::getDbRootHWMInfo(int OID, uint16_t pmNumber, EmDbRootHWMInfo_v& } grabEMEntryTable(READ); + grabEMIndex(READ); tr1::unordered_map::iterator emIter; // Searching the array in reverse order should be faster since the last // extent is usually at the bottom. We still have to search the entire // array (just in case), but the number of operations per loop iteration // will be less. - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (int i = emEntries - 1; i >= 0; i--) + for (auto dbRoot : dbRootVec) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == OID)) + auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID); + for (auto i : emIdents) { - // Include this extent in the search, only if the extent's - // DBRoot falls in the list of DBRoots for this PM. 
- emIter = emDbRootMap.find(fExtentMap[i].dbRoot); - - if (emIter == emDbRootMap.end()) - continue; - - EmDbRootHWMInfo& emDbRoot = emIter->second; - - if ((fExtentMap[i].status != EXTENTOUTOFSERVICE) && (fExtentMap[i].HWM != 0)) - emDbRoot.totalBlocks += (fExtentMap[i].HWM + 1); - - if ((fExtentMap[i].partitionNum > emDbRoot.partitionNum) || - ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) && - (fExtentMap[i].blockOffset > emDbRoot.fbo)) || - ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) && - (fExtentMap[i].blockOffset == emDbRoot.fbo) && (fExtentMap[i].segmentNum >= emDbRoot.segmentNum))) + if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == OID)) { - emDbRoot.fbo = fExtentMap[i].blockOffset; - emDbRoot.partitionNum = fExtentMap[i].partitionNum; - emDbRoot.segmentNum = fExtentMap[i].segmentNum; - emDbRoot.localHWM = fExtentMap[i].HWM; - emDbRoot.startLbid = fExtentMap[i].range.start; - emDbRoot.status = fExtentMap[i].status; - emDbRoot.hwmExtentIndex = i; + // Include this extent in the search, only if the extent's + // DBRoot falls in the list of DBRoots for this PM. 
+ emIter = emDbRootMap.find(fExtentMap[i].dbRoot); + + if (emIter == emDbRootMap.end()) + continue; + + EmDbRootHWMInfo& emDbRoot = emIter->second; + + if ((fExtentMap[i].status != EXTENTOUTOFSERVICE) && (fExtentMap[i].HWM != 0)) + emDbRoot.totalBlocks += (fExtentMap[i].HWM + 1); + + if ((fExtentMap[i].partitionNum > emDbRoot.partitionNum) || + ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) && + (fExtentMap[i].blockOffset > emDbRoot.fbo)) || + ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) && + (fExtentMap[i].blockOffset == emDbRoot.fbo) && + (fExtentMap[i].segmentNum >= emDbRoot.segmentNum))) + { + emDbRoot.fbo = fExtentMap[i].blockOffset; + emDbRoot.partitionNum = fExtentMap[i].partitionNum; + emDbRoot.segmentNum = fExtentMap[i].segmentNum; + emDbRoot.localHWM = fExtentMap[i].HWM; + emDbRoot.startLbid = fExtentMap[i].range.start; + emDbRoot.status = fExtentMap[i].status; + emDbRoot.hwmExtentIndex = i; + } } } } + releaseEMIndex(READ); releaseEMEntryTable(READ); for (tr1::unordered_map::iterator iter = emDbRootMap.begin(); @@ -4395,6 +4946,7 @@ void ExtentMap::getExtentState(int OID, uint32_t partitionNum, uint16_t segmentN } grabEMEntryTable(READ); + grabEMIndex(READ); emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -4409,6 +4961,7 @@ void ExtentMap::getExtentState(int OID, uint32_t partitionNum, uint16_t segmentN } } + releaseEMIndex(READ); releaseEMEntryTable(READ); } @@ -4441,7 +4994,6 @@ HWM_t ExtentMap::getLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum #endif - int i, emEntries; HWM_t ret = 0; bool OIDPartSegExists = false; @@ -4454,26 +5006,31 @@ HWM_t ExtentMap::getLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum } grabEMEntryTable(READ); + grabEMIndex(READ); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (i = 0; i < emEntries; i++) + DBRootVec dbRootVec(getAllDbRoots()); + for (auto dbRoot : dbRootVec) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == OID) 
&& - (fExtentMap[i].partitionNum == partitionNum) && (fExtentMap[i].segmentNum == segmentNum)) + auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + for (auto i : emIdents) { - OIDPartSegExists = true; - status = fExtentMap[i].status; - - if (fExtentMap[i].HWM != 0) + if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].segmentNum == segmentNum)) { - ret = fExtentMap[i].HWM; - releaseEMEntryTable(READ); - return ret; + OIDPartSegExists = true; + status = fExtentMap[i].status; + + if (fExtentMap[i].HWM != 0) + { + ret = fExtentMap[i].HWM; + releaseEMIndex(READ); + releaseEMEntryTable(READ); + return ret; + } } } } + releaseEMIndex(READ); releaseEMEntryTable(READ); if (OIDPartSegExists) @@ -4520,31 +5077,37 @@ void ExtentMap::setLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum, #endif - int lastExtentIndex = -1; + LastExtentIndexT lastExtentIndex = -1; int oldHWMExtentIndex = -1; - uint32_t highestOffset = 0; + HighestOffset highestOffset = 0; if (uselock) - grabEMEntryTable(WRITE); - - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (int i = 0; i < emEntries; i++) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == OID) && - (fExtentMap[i].partitionNum == partitionNum) && (fExtentMap[i].segmentNum == segmentNum)) - { - // Find current HWM extent - if (fExtentMap[i].blockOffset >= highestOffset) - { - highestOffset = fExtentMap[i].blockOffset; - lastExtentIndex = i; - } + grabEMEntryTable(WRITE); + grabEMIndex(WRITE); + } - // Find previous HWM extent - if (fExtentMap[i].HWM != 0) + DBRootVec dbRootVec(getAllDbRoots()); + + for (auto dbRoot : dbRootVec) + { + auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + for (auto i : emIdents) + { + if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].segmentNum == segmentNum)) { - oldHWMExtentIndex = i; + // Find current HWM extent + if (fExtentMap[i].blockOffset >= highestOffset) + { + highestOffset = fExtentMap[i].blockOffset; + 
lastExtentIndex = i; + } + + // Find previous HWM extent + if (fExtentMap[i].HWM != 0) + { + oldHWMExtentIndex = i; + } } } } @@ -4619,6 +5182,7 @@ void ExtentMap::setLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum, void ExtentMap::bulkSetHWM(const vector& v, bool firstNode) { grabEMEntryTable(WRITE); + grabEMIndex(WRITE); for (uint32_t i = 0; i < v.size(); i++) setLocalHWM(v[i].oid, v[i].partNum, v[i].segNum, v[i].hwm, firstNode, false); @@ -4653,6 +5217,7 @@ void ExtentMap::bulkUpdateDBRoot(const vector& args) sArgs.insert(args[i]); grabEMEntryTable(WRITE); + grabEMIndex(WRITE); emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -4692,10 +5257,10 @@ void ExtentMap::getExtents(int OID, vector& entries, bool sorted } grabEMEntryTable(READ); + grabEMIndex(READ); emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); // Pre-expand entries to stop lots of small allocs entries.reserve(emEntries); - if (incOutOfService) { for (i = 0; i < emEntries; i++) @@ -4710,6 +5275,7 @@ void ExtentMap::getExtents(int OID, vector& entries, bool sorted entries.push_back(fExtentMap[i]); } + releaseEMIndex(READ); releaseEMEntryTable(READ); if (sorted) @@ -4767,12 +5333,14 @@ void ExtentMap::getExtents_dbroot(int OID, vector& entries, cons } grabEMEntryTable(READ); + grabEMIndex(READ); emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (i = 0; i < emEntries; i++) if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].range.size != 0) && (fExtentMap[i].dbRoot == dbroot)) entries.push_back(fExtentMap[i]); + releaseEMIndex(READ); releaseEMEntryTable(READ); } @@ -4794,6 +5362,7 @@ void ExtentMap::getExtentCount_dbroot(int OID, uint16_t dbroot, bool incOutOfSer } grabEMEntryTable(READ); + grabEMIndex(READ); emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); numExtents = 0; @@ -4817,6 +5386,7 @@ void ExtentMap::getExtentCount_dbroot(int OID, uint16_t dbroot, bool incOutOfSer } } + releaseEMIndex(READ); releaseEMEntryTable(READ); } @@ 
-4842,6 +5412,7 @@ void ExtentMap::getSysCatDBRoot(OID_t oid, uint16_t& dbRoot) bool bFound = false; grabEMEntryTable(READ); + grabEMIndex(READ); int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (int i = 0; i < emEntries; i++) @@ -4854,6 +5425,7 @@ void ExtentMap::getSysCatDBRoot(OID_t oid, uint16_t& dbRoot) } } + releaseEMIndex(READ); releaseEMEntryTable(READ); if (!bFound) @@ -4904,6 +5476,7 @@ void ExtentMap::deletePartition(const set& oids, const set foundPartitions; int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -5005,6 +5578,7 @@ void ExtentMap::markPartitionForDeletion(const set& oids, const setallocdSize / sizeof(struct EMEntry); set foundPartitions; vector extents; @@ -5112,6 +5686,7 @@ void ExtentMap::markAllPartitionForDeletion(const set& oids) set::const_iterator it; grabEMEntryTable(WRITE); + grabEMIndex(WRITE); int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (int i = 0; i < emEntries; i++) @@ -5165,6 +5740,7 @@ void ExtentMap::restorePartition(const set& oids, const set::const_iterator it; grabEMEntryTable(WRITE); + grabEMIndex(WRITE); int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); vector extents; @@ -5257,6 +5833,7 @@ void ExtentMap::getOutOfServicePartitions(OID_t oid, set& part } grabEMEntryTable(READ); + grabEMIndex(READ); int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (int i = 0; i < emEntries; i++) @@ -5270,6 +5847,7 @@ void ExtentMap::getOutOfServicePartitions(OID_t oid, set& part } } + releaseEMIndex(READ); releaseEMEntryTable(READ); } @@ -5291,11 +5869,13 @@ void ExtentMap::deleteDBRoot(uint16_t dbroot) #endif grabEMEntryTable(WRITE); + grabEMIndex(WRITE); grabFreeList(WRITE); for (unsigned i = 0; i < fEMShminfo->allocdSize / sizeof(struct EMEntry); i++) if (fExtentMap[i].range.size != 0 && fExtentMap[i].dbRoot == dbroot) deleteExtent(i); + fPExtMapIndexImpl_->deleteDbRoot(dbroot); } 
//------------------------------------------------------------------------------ @@ -5318,6 +5898,7 @@ bool ExtentMap::isDBRootEmpty(uint16_t dbroot) bool bEmpty = true; int i, emEntries; grabEMEntryTable(READ); + grabEMIndex(READ); emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); if (fEMShminfo->currentSize == 0) @@ -5334,6 +5915,7 @@ bool ExtentMap::isDBRootEmpty(uint16_t dbroot) } } + releaseEMIndex(READ); releaseEMEntryTable(READ); return bEmpty; @@ -5393,6 +5975,7 @@ void ExtentMap::lookup(OID_t OID, LBIDRange_v& ranges) } grabEMEntryTable(READ); + grabEMIndex(READ); emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (i = 0; i < emEntries; i++) @@ -5404,6 +5987,7 @@ void ExtentMap::lookup(OID_t OID, LBIDRange_v& ranges) ranges.push_back(tmp); } + releaseEMIndex(READ); releaseEMEntryTable(READ); } @@ -5435,6 +6019,7 @@ int ExtentMap::checkConsistency() uint32_t usedEntries; grabEMEntryTable(READ); + grabEMIndex(READ); try { @@ -5442,6 +6027,7 @@ int ExtentMap::checkConsistency() } catch (...) { + releaseEMIndex(READ); releaseEMEntryTable(READ); throw; } @@ -5655,6 +6241,7 @@ int ExtentMap::checkConsistency() cout << "test 5a passed\n"; releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); return 0; } @@ -5693,6 +6280,9 @@ void ExtentMap::finishChanges() if (flLocked) releaseFreeList(WRITE); + if (emIndexLocked) + releaseEMIndex(WRITE); + if (emLocked) releaseEMEntryTable(WRITE); } @@ -5707,6 +6297,11 @@ const bool* ExtentMap::getEMLockStatus() return &emLocked; } +const bool* ExtentMap::getEMIndexLockStatus() +{ + return &emIndexLocked; +} + //------------------------------------------------------------------------------ // Reload Config cache if config file time stamp has changed //------------------------------------------------------------------------------ @@ -5854,19 +6449,36 @@ unsigned ExtentMap::getDbRootCount() // Get list of DBRoots that map to the specified PM. 
DBRoot list is cached // internally in fPmDbRootMap after getting from Columnstore.xml via OAM. //------------------------------------------------------------------------------ -void ExtentMap::getPmDbRoots(int pm, vector& dbRootList) +void ExtentMap::getPmDbRoots(int pm, vector& dbRootVec) { oam::OamCache* oamcache = oam::OamCache::makeOamCache(); oam::OamCache::PMDbrootsMap_t pmDbroots = oamcache->getPMToDbrootsMap(); - dbRootList.clear(); - dbRootList = (*pmDbroots)[pm]; + dbRootVec.clear(); + dbRootVec = (*pmDbroots)[pm]; +} + +DBRootVec ExtentMap::getAllDbRoots() +{ + DBRootVec dbRootResultVec; + oam::OamCache* oamcache = oam::OamCache::makeOamCache(); + // NB The routine uses int for dbroot id that contradicts with the type used here, namely uint16_t + oam::OamCache::PMDbrootsMap_t pmDbroots = oamcache->getPMToDbrootsMap(); + auto& pmDbrootsRef = *pmDbroots; + + for (auto& pmDBRootPair : pmDbrootsRef) + { + for (auto dbRootId : pmDBRootPair.second) + dbRootResultVec.push_back(dbRootId); + } + return dbRootResultVec; } vector ExtentMap::getFreeListEntries() { vector v; grabEMEntryTable(READ); + grabEMIndex(READ); grabFreeList(READ); int allocdSize = fFLShminfo->allocdSize / sizeof(InlineLBIDRange); @@ -5875,6 +6487,7 @@ vector ExtentMap::getFreeListEntries() v.push_back(fFreeList[i]); releaseFreeList(READ); + releaseEMIndex(READ); releaseEMEntryTable(READ); return v; } @@ -5882,6 +6495,7 @@ vector ExtentMap::getFreeListEntries() void ExtentMap::dumpTo(ostream& os) { grabEMEntryTable(READ); + grabEMIndex(READ); unsigned emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); for (unsigned i = 0; i < emEntries; i++) @@ -5897,54 +6511,25 @@ void ExtentMap::dumpTo(ostream& os) } } + releaseEMIndex(READ); releaseEMEntryTable(READ); } -/*int ExtentMap::physicalPartitionNum(const set& oids, - const set& partitionNums, - vector& partitionInfos) +size_t ExtentMap::EMIndexShmemSize() { -#ifdef BRM_INFO - if (fDebug) - { - TRACER_WRITENOW("physicalPartitionNum"); - 
ostringstream oss; - set::const_iterator partIt; - oss << "partitionNums: " - for (partIt=partitionNums.begin(); it!=partitionNums.end(); ++it) - oss << (*it) << " "; - oss << endl; - TRACER_WRITEDIRECT(oss.str()); - } -#endif - - set::const_iterator it; - grabEMEntryTable(READ); - - int emEntries = fEMShminfo->allocdSize/sizeof(struct EMEntry); - PartitionInfo partInfo; - vector extents; - set foundPartitions; - for (int i = 0; i < emEntries; i++) - { - if ((fExtentMap[i].range.size != 0 ) && - partitionNums.find(logicalPartitionNum(fExtentMap[i])) != partitionNums.end()) - { - it = oids.find( fExtentMap[i].fileID ); - if (it != oids.end()) - { - partInfo.oid = fExtentMap[i].fileID; - partInfo.lp.dbroot = fExtentMap[i].dbRoot; - partInfo.lp.pp = fExtentMap[i].partitionNum; - partInfo.lp.seg = fExtentMap[i].segmentNum; - partitionInfos.push_back(partInfo); - } - } - } - releaseEMEntryTable(READ); - return 0; + grabEMIndex(READ); + size_t EMIndexShmemSize = fPExtMapIndexImpl_->getShmemSize(); + releaseEMIndex(READ); + return EMIndexShmemSize; +} + +size_t ExtentMap::EMIndexShmemFree() +{ + grabEMIndex(READ); + size_t EMIndexShmemFree = fPExtMapIndexImpl_->getShmemFree(); + releaseEMIndex(READ); + return EMIndexShmemFree; } -*/ template int ExtentMap::getMaxMin(const LBID_t lbidRange, int128_t& max, int128_t& min, int32_t& seqNum); @@ -5952,5 +6537,4 @@ template int ExtentMap::getMaxMin(const LBID_t lbidRange, int128_t& ma template int ExtentMap::getMaxMin(const LBID_t lbidRange, int64_t& max, int64_t& min, int32_t& seqNum); -} // namespace BRM -// vim:ts=4 sw=4: +} // namespace BRM \ No newline at end of file diff --git a/versioning/BRM/extentmap.h b/versioning/BRM/extentmap.h index e70f61492..9ffbe07fe 100644 --- a/versioning/BRM/extentmap.h +++ b/versioning/BRM/extentmap.h @@ -1,4 +1,5 @@ /* Copyright (C) 2014 InfiniDB, Inc. 
+ Copyright (C) 2016-2022 MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -30,15 +31,18 @@ #include #include #include -#ifdef _MSC_VER #include -#else #include -#endif +#include + //#define NDEBUG #include #include #include +#include +#include +#include +#include //boost::hash #include "shmkeys.h" #include "brmtypes.h" @@ -64,6 +68,8 @@ #define EXPORT #endif +namespace bi = boost::interprocess; + namespace oam { typedef std::vector DBRootConfigList; @@ -76,6 +82,15 @@ class IDBDataFile; namespace BRM { +using PartitionNumberT = uint32_t; +using DBRootT = uint16_t; +using SegmentT = uint16_t; +using LastExtentIndexT = int; +using EmptyEMEntry = int; +using HighestOffset = uint32_t; +using LastIndEmptyIndEmptyInd = std::pair; +using DBRootVec = std::vector; + // assumed column width when calculating dictionary store extent size #define DICT_COL_WIDTH 8 @@ -99,8 +114,6 @@ const char CP_INVALID = 0; const char CP_UPDATING = 1; const char CP_VALID = 2; -// The _v4 structs are defined below for upgrading extent map -// from v4 to v5; see ExtentMap::loadVersion4or5 for details. struct EMCasualPartition_struct_v4 { RangePartitionData_t hi_val; // This needs to be reinterpreted as unsigned for uint64_t column types. 
@@ -114,16 +127,15 @@ struct EMPartition_struct_v4 { EMCasualPartition_struct_v4 cprange; }; - struct EMEntry_v4 { InlineLBIDRange range; int fileID; uint32_t blockOffset; HWM_t HWM; - uint32_t partitionNum; // starts at 0 - uint16_t segmentNum; // starts at 0 - uint16_t dbRoot; // starts at 1 to match Columnstore.xml + PartitionNumberT partitionNum; // starts at 0 + uint16_t segmentNum; // starts at 0 + DBRootT dbRoot; // starts at 1 to match Columnstore.xml uint16_t colWid; int16_t status; // extent avail for query or not, or out of service EMPartition_struct_v4 partition; @@ -152,7 +164,7 @@ struct EMCasualPartition_struct EXPORT EMCasualPartition_struct(const EMCasualPartition_struct& em); EXPORT EMCasualPartition_struct& operator=(const EMCasualPartition_struct& em); }; -typedef EMCasualPartition_struct EMCasualPartition_t; +using EMCasualPartition_t = EMCasualPartition_struct; struct EMPartition_struct { @@ -166,9 +178,9 @@ struct EMEntry int fileID; uint32_t blockOffset; HWM_t HWM; - uint32_t partitionNum; // starts at 0 - uint16_t segmentNum; // starts at 0 - uint16_t dbRoot; // starts at 1 to match Columnstore.xml + PartitionNumberT partitionNum; // starts at 0 + uint16_t segmentNum; // starts at 0 + DBRootT dbRoot; // starts at 1 to match Columnstore.xml uint16_t colWid; int16_t status; // extent avail for query or not, or out of service EMPartition_t partition; @@ -320,6 +332,146 @@ class FreeListImpl static FreeListImpl* fInstance; }; +using ShmSegmentManagerT = bi::managed_shared_memory::segment_manager; +using ShmVoidAllocator = bi::allocator; + +using ExtentMapIdxT = size_t; +using ExtentMapIdxTAlloc = bi::allocator; +using PartitionNumberTAlloc = bi::allocator; +using ExtentMapIndicesT = std::vector; + +using PartitionIndexContainerKeyT = PartitionNumberT; +using PartitionIndexContainerValT = std::pair; +using PartitionIndexContainerValTAlloc = bi::allocator; +// Can't use std::unordered_map presumably b/c the map's pointer type doesn't use 
offset_type as boost::u_map +// does +using PartitionIndexContainerT = + boost::unordered_map, std::equal_to, + PartitionIndexContainerValTAlloc>; + +using OIDIndexContainerKeyT = OID_t; +using OIDIndexContainerValT = std::pair; +using OIDIndexContainerValTAlloc = bi::allocator; +using OIDIndexContainerT = + boost::unordered_map, + std::equal_to, OIDIndexContainerValTAlloc>; + +using DBRootIndexTAlloc = bi::allocator; +using DBRootIndexContainerT = std::vector; +using ExtentMapIndex = DBRootIndexContainerT; +using ExtentMapIndexFindResult = std::vector; +using InsertUpdateShmemKeyPair = std::pair; + +class ExtentMapIndexImpl +{ + public: + ~ExtentMapIndexImpl(){}; + + static ExtentMapIndexImpl* makeExtentMapIndexImpl(unsigned key, off_t size, bool readOnly = false); + static void refreshShm() + { + if (fInstance_) + { + delete fInstance_; + fInstance_ = nullptr; + } + } + + // The multipliers and constants here are pure theoretical + // tested using customer's data. + static size_t estimateEMIndexSize(uint32_t numberOfExtents) + { + // These are just educated guess values to calculate initial + // managed shmem size. + constexpr const size_t tablesNumber_ = 100ULL; + constexpr const size_t columnsNumber_ = 200ULL; + constexpr const size_t dbRootsNumber_ = 3ULL; + constexpr const size_t filesInPartition_ = 4ULL; + constexpr const size_t extentsInPartition_ = filesInPartition_ * 2; + return numberOfExtents * emIdentUnitSize_ + + numberOfExtents / extentsInPartition_ * partitionContainerUnitSize_ + + dbRootsNumber_ * tablesNumber_ * columnsNumber_; + } + + bool growIfNeeded(const size_t memoryNeeded); + + inline void grow(off_t size) + { + int rc = fBRMManagedShmMemImpl_.grow(size); + idbassert(rc == 0); + } + // After this call one needs to refresh any refs or ptrs sourced + // from this shmem. 
+ inline void makeReadOnly() + { + fBRMManagedShmMemImpl_.setReadOnly(); + } + + inline void swapout(BRMManagedShmImpl& rhs) + { + fBRMManagedShmMemImpl_.swap(rhs); + } + + inline unsigned key() const + { + return fBRMManagedShmMemImpl_.key(); + } + + size_t getShmemSize() /* size_t like getShmemFree(): an unsigned return could truncate a large segment size */ + { + return fBRMManagedShmMemImpl_.getManagedSegment()->get_size(); + } + + size_t getShmemFree() + { + return fBRMManagedShmMemImpl_.getManagedSegment()->get_free_memory(); + } + + unsigned getShmemImplSize() + { + return fBRMManagedShmMemImpl_.size(); + } + + void createExtentMapIndexIfNeeded(); + ExtentMapIndex* get(); + InsertUpdateShmemKeyPair insert(const EMEntry& emEntry, const size_t emIdx); + InsertUpdateShmemKeyPair insert2ndLayerWrapper(OIDIndexContainerT& oids, const EMEntry& emEntry, + const size_t emIdx, const bool aShmemHasGrown); + InsertUpdateShmemKeyPair insert2ndLayer(OIDIndexContainerT& oids, const EMEntry& emEntry, + const size_t emIdx, const bool aShmemHasGrown); + InsertUpdateShmemKeyPair insert3dLayerWrapper(PartitionIndexContainerT& partitions, const EMEntry& emEntry, + const size_t emIdx, const bool aShmemHasGrown); + InsertUpdateShmemKeyPair insert3dLayer(PartitionIndexContainerT& partitions, const EMEntry& emEntry, + const size_t emIdx, const bool aShmemHasGrown); + ExtentMapIndexFindResult find(const DBRootT dbroot, const OID_t oid, + const PartitionNumberT partitionNumber); + ExtentMapIndexFindResult find(const DBRootT dbroot, const OID_t oid); + ExtentMapIndexFindResult search2ndLayer(OIDIndexContainerT& oids, const OID_t oid, + const PartitionNumberT partitionNumber); + ExtentMapIndexFindResult search2ndLayer(OIDIndexContainerT& oids, const OID_t oid); + ExtentMapIndexFindResult search3dLayer(PartitionIndexContainerT& partitions, + const PartitionNumberT partitionNumber); + void deleteDbRoot(const DBRootT dbroot); + void deleteOID(const DBRootT dbroot, const OID_t oid); + void deleteEMEntry(const EMEntry& emEntry, const ExtentMapIdxT emIdent); + + private: +
BRMManagedShmImpl fBRMManagedShmMemImpl_; + ExtentMapIndexImpl(unsigned key, off_t size, bool readOnly = false); + ExtentMapIndexImpl(const ExtentMapIndexImpl& rhs); + ExtentMapIndexImpl& operator=(const ExtentMapIndexImpl& rhs); + + static std::mutex fInstanceMutex_; + static ExtentMapIndexImpl* fInstance_; + static const constexpr uint32_t dbRootContainerUnitSize_ = 64ULL; + static const constexpr uint32_t oidContainerUnitSize_ = 352ULL; // 2 * map overhead + static const constexpr uint32_t partitionContainerUnitSize_ = 368ULL; // single map overhead + static const constexpr uint32_t emIdentUnitSize_ = sizeof(uint64_t); + static const constexpr uint32_t extraUnits_ = 2; + static const constexpr size_t freeSpaceThreshold_ = 256 * 1024; +}; + /** @brief This class encapsulates the extent map functionality of the system * * This class encapsulates the extent map functionality of the system. It @@ -346,7 +498,7 @@ class ExtentMap : public Undoable */ EXPORT void load(const std::string& filename, bool fixFL = false); - /** @brief Loads the ExtentMap entries from a binayr blob. + /** @brief Loads the ExtentMap entries from a binary blob. * * Loads the ExtentMap entries from a file. This will * clear out any existing entries. 
The intention is that before @@ -887,6 +1039,9 @@ class ExtentMap : public Undoable EXPORT void dumpTo(std::ostream& os); EXPORT const bool* getEMLockStatus(); EXPORT const bool* getEMFLLockStatus(); + EXPORT const bool* getEMIndexLockStatus(); + size_t EMIndexShmemSize(); + size_t EMIndexShmemFree(); #ifdef BRM_DEBUG EXPORT void printEM() const; @@ -896,11 +1051,11 @@ class ExtentMap : public Undoable #endif private: - static const size_t EM_INCREMENT_ROWS = 100; - static const size_t EM_INITIAL_SIZE = EM_INCREMENT_ROWS * 10 * sizeof(EMEntry); - static const size_t EM_INCREMENT = EM_INCREMENT_ROWS * sizeof(EMEntry); - static const size_t EM_FREELIST_INITIAL_SIZE = 50 * sizeof(InlineLBIDRange); - static const size_t EM_FREELIST_INCREMENT = 50 * sizeof(InlineLBIDRange); + static const constexpr size_t EM_INCREMENT_ROWS = 100; + static const constexpr size_t EM_INITIAL_SIZE = EM_INCREMENT_ROWS * 10 * sizeof(EMEntry); + static const constexpr size_t EM_INCREMENT = EM_INCREMENT_ROWS * sizeof(EMEntry); + static const constexpr size_t EM_FREELIST_INITIAL_SIZE = 50 * sizeof(InlineLBIDRange); + static const constexpr size_t EM_FREELIST_INCREMENT = 50 * sizeof(InlineLBIDRange); ExtentMap(const ExtentMap& em); ExtentMap& operator=(const ExtentMap& em); @@ -911,6 +1066,7 @@ class ExtentMap : public Undoable key_t fCurrentFLShmkey; MSTEntry* fEMShminfo; MSTEntry* fFLShminfo; + MSTEntry* fEMIndexShminfo; const MasterSegmentTable fMST; bool r_only; typedef std::tr1::unordered_map PmDbRootMap_t; @@ -918,8 +1074,9 @@ class ExtentMap : public Undoable time_t fCacheTime; // timestamp associated with config cache int numUndoRecords; - bool flLocked, emLocked; - static boost::mutex mutex; // @bug5355 - made mutex static + bool flLocked, emLocked, emIndexLocked; + static boost::mutex mutex; // @bug5355 - made mutex static + static boost::mutex emIndexMutex; boost::mutex fConfigCacheMutex; // protect access to Config Cache enum OPS @@ -931,6 +1088,12 @@ class ExtentMap : public Undoable 
OPS EMLock, FLLock; + LastIndEmptyIndEmptyInd _createExtentCommonSearch(const OID_t OID, const DBRootT dbRoot, + const PartitionNumberT partitionNum, + const SegmentT segmentNum); + + void logAndSetEMIndexReadOnly(const std::string& funcName); + LBID_t _createColumnExtent_DBroot(uint32_t size, int OID, uint32_t colWidth, uint16_t dbRoot, execplan::CalpontSystemCatalog::ColDataType colDataType, uint32_t& partitionNum, uint16_t& segmentNum, uint32_t& startBlockOffset); @@ -942,24 +1105,32 @@ class ExtentMap : public Undoable uint16_t segmentNum); template bool isValidCPRange(const T& max, const T& min, execplan::CalpontSystemCatalog::ColDataType type) const; - void deleteExtent(int emIndex); + void deleteExtent(const int emIndex, const bool clearEMIndex = true); LBID_t getLBIDsFromFreeList(uint32_t size); void reserveLBIDRange(LBID_t start, uint8_t size); // used by load() to allocate pre-existing LBIDs - key_t chooseEMShmkey(); // see the code for how keys are segmented - key_t chooseFLShmkey(); // see the code for how keys are segmented + key_t chooseEMShmkey(); + key_t chooseFLShmkey(); + key_t chooseEMIndexShmkey(); + key_t getInitialEMIndexShmkey() const; + // see the code for how keys are segmented + key_t chooseShmkey(const MSTEntry* masterTableEntry, const uint32_t keyRangeBase) const; void grabEMEntryTable(OPS op); void grabFreeList(OPS op); + void grabEMIndex(OPS op); void releaseEMEntryTable(OPS op); void releaseFreeList(OPS op); + void releaseEMIndex(OPS op); void growEMShmseg(size_t nrows = 0); void growFLShmseg(); + void growEMIndexShmseg(const size_t suggestedSize = 0); void finishChanges(); EXPORT unsigned getFilesPerColumnPartition(); unsigned getExtentsPerSegmentFile(); unsigned getDbRootCount(); void getPmDbRoots(int pm, std::vector& dbRootList); + DBRootVec getAllDbRoots(); void checkReloadConfig(); ShmKeys fShmKeys; @@ -980,6 +1151,7 @@ class ExtentMap : public Undoable ExtentMapImpl* fPExtMapImpl; FreeListImpl* fPFreeListImpl; + 
ExtentMapIndexImpl* fPExtMapIndexImpl_; }; inline std::ostream& operator<<(std::ostream& os, ExtentMap& rhs) diff --git a/versioning/BRM/lock_grabber.cpp b/versioning/BRM/lock_grabber.cpp index 6932b7708..7c6884bb8 100644 --- a/versioning/BRM/lock_grabber.cpp +++ b/versioning/BRM/lock_grabber.cpp @@ -1,4 +1,5 @@ /* Copyright (C) 2014 InfiniDB, Inc. + Copyright (C) 2016-2022 MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -21,6 +22,7 @@ * third, lock or unlock it */ +#include #include #include #include @@ -32,10 +34,15 @@ char* name; void usage() { - cout << "Usage " << name << " which_lock_to_use which_side_to_use lock_or_unlock\n" - << " which_lock_to_use: 1=VSS 2=ExtentMap 3=FreeList 4=VBBM 5=CopyLocks\n" - << " which_side_to_use: r|w (read or write)\n" - << " lock_or_unlock: l|u (lock or unlock)\n"; + std::cout << "Usage " << name << " which_lock_to_use which_side_to_use lock_or_unlock" << std::endl; + size_t lockId = 0; + for (auto& lockName : RWLockNames) + { + std::cout << " " << lockId++ << "=" << lockName << " "; + } + std::cout << std::endl + << " which_side_to_use: r|w (read or write)" << std::endl + << " lock_or_unlock: l|u (lock or unlock)" << std::endl; exit(1); } @@ -54,10 +61,21 @@ int main(int argc, char** argv) if (strlen(argv[1]) != 1 || strlen(argv[2]) != 1 || strlen(argv[3]) != 1) usage(); - which_lock = atoi(argv[1]); - - if (which_lock < 1 || which_lock > 5) + try + { + which_lock = std::stoi(argv[1]); + } + catch (std::exception const& e) + { + std::cerr << "Cannot convert the lock id: " << e.what() << std::endl; usage(); + } + + if (which_lock >= RWLockNames.size()) + usage(); + + size_t minLockId = (which_lock > 0) ? which_lock : 1; + size_t maxLockId = (which_lock > 0) ? 
which_lock : RWLockNames.size() - 1; if (argv[2][0] == 'r') which_side = 0; @@ -73,17 +91,28 @@ int main(int argc, char** argv) else usage(); - rwlock = new RWLock(0x10000 * which_lock); + for (size_t i = minLockId; i <= maxLockId; ++i) + { + rwlock = new RWLock(0x10000 * i); /* key of the lock being visited; which_lock is 0 when the whole range is requested */ - if (which_side == 0) - if (lock_unlock == 0) - rwlock->read_lock(); + if (which_side == 0) + { + if (lock_unlock == 0) + rwlock->read_lock(); + else + rwlock->read_unlock(); + } + else if (lock_unlock == 0) + { + rwlock->write_lock(); + } else - rwlock->read_unlock(); - else if (lock_unlock == 0) - rwlock->write_lock(); - else - rwlock->write_unlock(); + { + rwlock->write_unlock(); + } + + delete rwlock; + } return 0; } diff --git a/versioning/BRM/lock_state.cpp b/versioning/BRM/lock_state.cpp index 812b59c08..4b5cef63e 100644 --- a/versioning/BRM/lock_state.cpp +++ b/versioning/BRM/lock_state.cpp @@ -1,4 +1,5 @@ /* Copyright (C) 2014 InfiniDB, Inc. + Copyright (C) 2016-2022 MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -22,6 +23,7 @@ */ #include +#include #include #include @@ -32,14 +34,18 @@ char* name; void usage() { - cout << "Usage " << name << " which_lock_to_use which_side_to_use lock_or_unlock\n" - << " which_lock_to_use: 1=VSS 2=ExtentMap 3=FreeList 4=VBBM 5=CopyLocks\n"; + std::cout << "Usage " << name << " which_lock_to_use:" << std::endl; + size_t lockId = 0; + for (auto& lockName : RWLockNames) + { + std::cout << " " << lockId++ << "=" << lockName << std::endl; + } exit(1); } int main(int argc, char** argv) { - uint32_t which_lock; // 1-5 + uint32_t which_lock; // 0-6 RWLock* rwlock; LockState state; @@ -51,18 +57,35 @@ int main(int argc, char** argv) if (strlen(argv[1]) != 1) usage(); - which_lock = atoi(argv[1]); + try + { + which_lock = std::stoi(argv[1]); + } + catch (std::exception const& e) + { + std::cerr << "Cannot convert the lock id: " << e.what() << std::endl; +
usage(); + } - if (which_lock < 1 || which_lock > 5) + if (which_lock >= RWLockNames.size()) usage(); - rwlock = new RWLock(0x10000 * which_lock); - state = rwlock->getLockState(); - cout << "readers = " << state.reading << endl - << "writers = " << state.writing << endl - << "readers waiting = " << state.readerswaiting << endl - << "writers waiting = " << state.writerswaiting << endl - << "mutex locked = " << (int)state.mutexLocked << endl; + size_t minLockId = (which_lock > 0) ? which_lock : 1; + size_t maxLockId = (which_lock > 0) ? which_lock : RWLockNames.size() - 1; + + for (size_t i = minLockId; i <= maxLockId; ++i) + { + rwlock = new RWLock(0x10000 * i); + state = rwlock->getLockState(); + + cout << RWLockNames[i] << " RWLock" << std::endl + << " readers = " << state.reading << std::endl + << " writers = " << state.writing << std::endl + << " readers waiting = " << state.readerswaiting << std::endl + << " writers waiting = " << state.writerswaiting << std::endl + << " mutex locked = " << (int)state.mutexLocked << std::endl; + delete rwlock; + } return 0; } diff --git a/versioning/BRM/mastersegmenttable.cpp b/versioning/BRM/mastersegmenttable.cpp index c88dc13ba..31b2da6f7 100644 --- a/versioning/BRM/mastersegmenttable.cpp +++ b/versioning/BRM/mastersegmenttable.cpp @@ -138,6 +138,7 @@ MasterSegmentTable::MasterSegmentTable() RWLockKeys[2] = fShmKeys.KEYRANGE_VBBM_BASE; RWLockKeys[3] = fShmKeys.KEYRANGE_VSS_BASE; RWLockKeys[4] = fShmKeys.KEYRANGE_CL_BASE; + RWLockKeys[5] = fShmKeys.KEYRANGE_EXTENTMAP_INDEX_BASE; try { diff --git a/versioning/BRM/mastersegmenttable.h b/versioning/BRM/mastersegmenttable.h index beab0fa49..9d2ea111e 100644 --- a/versioning/BRM/mastersegmenttable.h +++ b/versioning/BRM/mastersegmenttable.h @@ -110,8 +110,10 @@ class MasterSegmentTable static const int VSSSegment = 3; /// specifies the copy lock segment static const int CLSegment = 4; + /// specifies the EM Index segment + static const int EMIndex = 5; /// the number of tables 
currently defined - static const int nTables = 5; + static const int nTables = 6; /** @brief This function gets the specified table. * diff --git a/versioning/BRM/shmkeys.cpp b/versioning/BRM/shmkeys.cpp index f7642ff9f..943d81d1b 100644 --- a/versioning/BRM/shmkeys.cpp +++ b/versioning/BRM/shmkeys.cpp @@ -50,6 +50,7 @@ ShmKeys::ShmKeys() KEYRANGE_EMFREELIST_BASE = 0x30000 | (BRM_UID << 20); KEYRANGE_VBBM_BASE = 0x40000 | (BRM_UID << 20); KEYRANGE_CL_BASE = 0x50000 | (BRM_UID << 20); + KEYRANGE_EXTENTMAP_INDEX_BASE = 0x60000 | (BRM_UID << 20); MST_SYSVKEY = 0xff000000 | BRM_UID; PROCESSSTATUS_SYSVKEY = 0xfd000000 | BRM_UID; SYSTEMSTATUS_SYSVKEY = 0xfc000000 | BRM_UID; @@ -62,7 +63,7 @@ ShmKeys::ShmKeys() string ShmKeys::keyToName(unsigned key) { ostringstream oss; - oss << "InfiniDB-shm-"; + oss << "MCS-shm-"; oss << setw(8) << setfill('0') << hex << key; return oss.str(); } diff --git a/versioning/BRM/shmkeys.h b/versioning/BRM/shmkeys.h index 29df2a2b4..60462edd6 100644 --- a/versioning/BRM/shmkeys.h +++ b/versioning/BRM/shmkeys.h @@ -56,6 +56,7 @@ struct ShmKeys uint32_t KEYRANGE_EMFREELIST_BASE; uint32_t KEYRANGE_VBBM_BASE; uint32_t KEYRANGE_VSS_BASE; + uint32_t KEYRANGE_EXTENTMAP_INDEX_BASE; /****** Fixed location assignments *******/ uint32_t MST_SYSVKEY; diff --git a/versioning/BRM/slavedbrmnode.cpp b/versioning/BRM/slavedbrmnode.cpp index ae7bb1d11..2331169a9 100644 --- a/versioning/BRM/slavedbrmnode.cpp +++ b/versioning/BRM/slavedbrmnode.cpp @@ -1492,6 +1492,11 @@ const bool* SlaveDBRMNode::getEMLockStatus() return em.getEMLockStatus(); } +const bool* SlaveDBRMNode::getEMIndexLockStatus() +{ + return em.getEMIndexLockStatus(); +} + const bool* SlaveDBRMNode::getVBBMLockStatus() { return &locked[0]; diff --git a/versioning/BRM/slavedbrmnode.h b/versioning/BRM/slavedbrmnode.h index 8512c0090..b5c7d2f08 100644 --- a/versioning/BRM/slavedbrmnode.h +++ b/versioning/BRM/slavedbrmnode.h @@ -462,6 +462,7 @@ class SlaveDBRMNode EXPORT const bool* 
getEMFLLockStatus(); EXPORT const bool* getEMLockStatus(); + EXPORT const bool* getEMIndexLockStatus(); EXPORT const bool* getVBBMLockStatus(); EXPORT const bool* getVSSLockStatus(); diff --git a/versioning/BRM/slavenode.cpp b/versioning/BRM/slavenode.cpp index 8651dc823..667feaa08 100644 --- a/versioning/BRM/slavenode.cpp +++ b/versioning/BRM/slavenode.cpp @@ -147,6 +147,8 @@ int ServiceWorkerNode::Child() monitorThreads.create_thread(RWLockMonitor(&die, slave.getEMLockStatus(), keys.KEYRANGE_EXTENTMAP_BASE)); monitorThreads.create_thread(RWLockMonitor(&die, slave.getVBBMLockStatus(), keys.KEYRANGE_VBBM_BASE)); monitorThreads.create_thread(RWLockMonitor(&die, slave.getVSSLockStatus(), keys.KEYRANGE_VSS_BASE)); + monitorThreads.create_thread( + RWLockMonitor(&die, slave.getEMIndexLockStatus(), keys.KEYRANGE_EXTENTMAP_INDEX_BASE)); try { diff --git a/writeengine/bulk/cpimport.cpp b/writeengine/bulk/cpimport.cpp index 4b1fbac8d..cf22e9590 100644 --- a/writeengine/bulk/cpimport.cpp +++ b/writeengine/bulk/cpimport.cpp @@ -917,16 +917,24 @@ void getTableOID(const std::string& xmlGenSchema, const std::string& xmlGenTable void constructTempXmlFile(const std::string& tempJobDir, const std::string& sJobIdStr, const std::string& xmlGenSchema, const std::string& xmlGenTable, const std::string& alternateImportDir, const std::string& S3Bucket, - boost::filesystem::path& sFileName) + const std::string& tableOIDStr, boost::filesystem::path& sFileName) { // Construct the job description file name std::string xmlErrMsg; int rc = 0; - std::string tableOIDStr; - getTableOID(xmlGenSchema, xmlGenTable, tableOIDStr); + std::string localTableOIDStr; + if (tableOIDStr.empty()) + { + getTableOID(xmlGenSchema, xmlGenTable, localTableOIDStr); + } + else + { + localTableOIDStr = tableOIDStr; + } + rc = XMLJob::genJobXMLFileName(std::string(), tempJobDir, sJobIdStr, true, // using temp job xml file - xmlGenSchema, xmlGenTable, sFileName, xmlErrMsg, tableOIDStr); + xmlGenSchema, xmlGenTable, 
sFileName, xmlErrMsg, localTableOIDStr); if (rc != NO_ERROR) { @@ -946,7 +954,7 @@ void constructTempXmlFile(const std::string& tempJobDir, const std::string& sJob { genProc.startXMLFile(); execplan::CalpontSystemCatalog::TableName tbl(xmlGenSchema, xmlGenTable); - genProc.makeTableData(tbl); + genProc.makeTableData(tbl, localTableOIDStr); if (!genProc.makeColumnData(tbl)) { @@ -1223,9 +1231,9 @@ int main(int argc, char** argv) if (!xmlGenSchema.empty()) // create temporary job file name { // If JobID is not provided, then default to the table OID + std::string tableOIDStr{""}; if (sJobIdStr.empty()) { - std::string tableOIDStr; getTableOID(xmlGenSchema, xmlGenTable, tableOIDStr); if (!(BulkLoad::disableConsoleOutput())) @@ -1240,7 +1248,7 @@ int main(int argc, char** argv) bUseTempJobFile = true; constructTempXmlFile(curJob.getTempJobDir(), sJobIdStr, xmlGenSchema, xmlGenTable, - curJob.getAlternateImportDir(), curJob.getS3Bucket(), sFileName); + curJob.getAlternateImportDir(), curJob.getS3Bucket(), tableOIDStr, sFileName); } else // create user's persistent job file name { diff --git a/writeengine/xml/we_xmlgenproc.cpp b/writeengine/xml/we_xmlgenproc.cpp index 3fca53e50..afd6d2b6c 100644 --- a/writeengine/xml/we_xmlgenproc.cpp +++ b/writeengine/xml/we_xmlgenproc.cpp @@ -168,7 +168,21 @@ void XMLGenProc::startXMLFile() // makeTableData // Create XML tag for a table. //------------------------------------------------------------------------------ +// This method is used by colxml only. Its first call can be relatively slow because of the +// tableRID() call. All subsequent calls will re-use data from the CalpontSystemCatalog cache. void XMLGenProc::makeTableData(const CalpontSystemCatalog::TableName& table) +{ + boost::shared_ptr cat = + CalpontSystemCatalog::makeCalpontSystemCatalog(BULK_SYSCAT_SESSION_ID); + cat->identity(CalpontSystemCatalog::EC); + std::ostringstream oss; + // The tableRID method might take a long time with a large EM.
+ oss << cat->tableRID(table).objnum; + + makeTableData(table, oss.str()); +} + +void XMLGenProc::makeTableData(const CalpontSystemCatalog::TableName& table, const std::string& tableOIDStr) { static unsigned kount; @@ -180,11 +194,8 @@ void XMLGenProc::makeTableData(const CalpontSystemCatalog::TableName& table) { try { - boost::shared_ptr cat = - CalpontSystemCatalog::makeCalpontSystemCatalog(BULK_SYSCAT_SESSION_ID); - cat->identity(CalpontSystemCatalog::EC); - xmlTextWriterWriteFormatAttribute(fWriter, BAD_CAST xmlTagTable[TAG_TBL_OID], "%d", - cat->tableRID(table).objnum); + xmlTextWriterWriteFormatAttribute(fWriter, BAD_CAST xmlTagTable[TAG_TBL_OID], "%s", + tableOIDStr.c_str()); } catch (std::exception& ex) { diff --git a/writeengine/xml/we_xmlgenproc.h b/writeengine/xml/we_xmlgenproc.h index 241ca763a..edfdfff33 100644 --- a/writeengine/xml/we_xmlgenproc.h +++ b/writeengine/xml/we_xmlgenproc.h @@ -76,6 +76,9 @@ class XMLGenProc * * @param table Name of table for which the table tag is to be generated. */ + EXPORT void makeTableData(const execplan::CalpontSystemCatalog::TableName& table, + const std::string& tableOIDStr); + EXPORT void makeTableData(const execplan::CalpontSystemCatalog::TableName& table); /** @brief Creates column tags for the specified table. 
diff --git a/writeengine/xml/we_xmljob.cpp b/writeengine/xml/we_xmljob.cpp index f4f5662d7..2a95203df 100644 --- a/writeengine/xml/we_xmljob.cpp +++ b/writeengine/xml/we_xmljob.cpp @@ -381,11 +381,11 @@ void XMLJob::setJobData(xmlNode* pNode, const xmlTag tag, bool bExpectContent, X if (tagType == TYPE_INT) bSuccess = getNodeContent(pNode, &intVal, TYPE_INT); else // longlong - if (tagType == TYPE_LONGLONG) - bSuccess = getNodeContent(pNode, &llVal, TYPE_LONGLONG); - else // char + if (tagType == TYPE_LONGLONG) + bSuccess = getNodeContent(pNode, &llVal, TYPE_LONGLONG); + else // char if (tagType == TYPE_CHAR) - bSuccess = getNodeContentStr(pNode, bufString); + bSuccess = getNodeContentStr(pNode, bufString); if (!bSuccess) return; @@ -1194,7 +1194,8 @@ void XMLJob::validateAllColumnsHaveTags(const execplan::CalpontSystemCatalog::RI /* static */ int XMLJob::genJobXMLFileName(const string& sXMLJobDir, const string& jobDir, const string& jobId, bool bTempFile, const string& schemaName, const string& tableName, - boost::filesystem::path& xmlFilePath, string& errMsg, std::string& tableOIDStr) + boost::filesystem::path& xmlFilePath, string& errMsg, + const std::string& tableOIDStr) { // get full file directory path for XML job description file if (sXMLJobDir.empty()) diff --git a/writeengine/xml/we_xmljob.h b/writeengine/xml/we_xmljob.h index 4265ff2c8..52c856f16 100644 --- a/writeengine/xml/we_xmljob.h +++ b/writeengine/xml/we_xmljob.h @@ -75,7 +75,7 @@ class XMLJob : public XMLOp EXPORT static int genJobXMLFileName(const std::string& sXMLJobDir, const std::string& jobDir, const std::string& jobId, bool bTempFile, const std::string& schemaName, const std::string& tableName, boost::filesystem::path& xmlDirPath, - std::string& errMsg, std::string& tableOIDStr); + std::string& errMsg, const std::string& tableOIDStr); /** * @brief Get job structure