From 2f5f9cdbe140952d2ce5692bd70f81d7d34dd542 Mon Sep 17 00:00:00 2001 From: Denis Khalikov Date: Thu, 18 Aug 2022 22:51:26 +0300 Subject: [PATCH] MCOL-5091/MCOL-4917 ExtentMap as RBTree. This patch replaces a `linear array` to `Red black tree` for ExtentMap. --- versioning/BRM/brmshmimpl.cpp | 114 +- versioning/BRM/brmshmimpl.h | 19 + versioning/BRM/dbrm.h | 2 +- versioning/BRM/extentmap.cpp | 3266 +++++++++++++++------------------ versioning/BRM/extentmap.h | 234 ++- 5 files changed, 1781 insertions(+), 1854 deletions(-) diff --git a/versioning/BRM/brmshmimpl.cpp b/versioning/BRM/brmshmimpl.cpp index 44750c630..681a7b0f7 100644 --- a/versioning/BRM/brmshmimpl.cpp +++ b/versioning/BRM/brmshmimpl.cpp @@ -408,5 +408,117 @@ void BRMManagedShmImpl::remap(const bool readOnly) fShmSegment = new bi::managed_shared_memory(bi::open_only, keyName.c_str()); } -} // namespace BRM +BRMManagedShmImplRBTree::BRMManagedShmImplRBTree(unsigned key, off_t size, bool readOnly) + : BRMShmImplParent(key, size, readOnly) +{ + // FIXME: Take a right size. + // Actually this means to open a segment at the default code. + if (fSize == 0) + fSize = 64000; + try + { + bi::permissions perms; + perms.set_unrestricted(); + + fShmSegment = new boost::interprocess::managed_shared_memory(boost::interprocess::open_or_create, + segmentName, fSize, 0, perms); + + fSize = fShmSegment->get_size(); + } + catch (exception& e) + { + std::cout << "Cannot create boost::interprocess::managed_shared_memory object: " << e.what() << std::endl; + } +} + +BRMManagedShmImplRBTree::~BRMManagedShmImplRBTree() +{ +} + +void BRMManagedShmImplRBTree::setReadOnly() +{ + try + { + if (fShmSegment) + { + delete fShmSegment; + fShmSegment = new bi::managed_shared_memory(bi::open_read_only, segmentName); + fReadOnly = true; + } + } + catch (exception& e) + { + std::cout << "BRMManagedShmImpl::setReadOnly() error " << e.what() << std::endl; + throw; + } +} + +int32_t BRMManagedShmImplRBTree::grow(unsigned key, off_t incSize) +{ + try + { + if (fShmSegment) + { + // Update key for this process. + fKey = key; + // Call destructor to unmap the segment. + delete fShmSegment; + // Grow the segment. + bi::managed_shared_memory::grow(segmentName, incSize); + // Open only. + fShmSegment = new bi::managed_shared_memory(bi::open_only, segmentName); + // Update size. + fSize = fShmSegment->get_size(); + } + } + catch (exception& e) + { + std::cout << "BRMManagedShmImpl::grow() error " << e.what() << std::endl; + throw; + } + + return 0; +} + +// Dummy method that has no references in the code. +int BRMManagedShmImplRBTree::clear(unsigned newKey, off_t newSize) +{ + return 0; +} + +void BRMManagedShmImplRBTree::reMapSegment() +{ + try + { + if (fShmSegment) + { + // Call destructor to unmap the segment. + delete fShmSegment; + // Map. + fShmSegment = new bi::managed_shared_memory(bi::open_only, segmentName); + // Update size. + fSize = fShmSegment->get_size(); + } + } + catch (exception& e) + { + std::cout << "BRMManagedShmImpl::remap() error " << e.what() << std::endl; + throw; + } +} + +void BRMManagedShmImplRBTree::destroy() +{ + try + { + bi::shared_memory_object::remove(segmentName); + } + catch (exception& e) + { + std::cout << "BRMManagedShmImplRBTree::destroy() " << std::endl; + throw; + } +} + +} // namespace BRM diff --git a/versioning/BRM/brmshmimpl.h b/versioning/BRM/brmshmimpl.h index fa1f0cfee..11b9db6e3 100644 --- a/versioning/BRM/brmshmimpl.h +++ b/versioning/BRM/brmshmimpl.h @@ -115,4 +115,23 @@ class BRMManagedShmImpl : public BRMShmImplParent bi::managed_shared_memory* fShmSegment; }; +class BRMManagedShmImplRBTree : public BRMShmImplParent +{ + public: + BRMManagedShmImplRBTree(unsigned key, off_t size, bool readOnly = false); + ~BRMManagedShmImplRBTree(); + + void setReadOnly() override; + int32_t grow(unsigned key, off_t incSize); + void destroy() override; + void reMapSegment(); + int clear(unsigned newKey, off_t newSize) override; + + boost::interprocess::managed_shared_memory* fShmSegment; + + private: + // The `segment` name is fixed. + const char* segmentName = "MCS-shm-00020001"; +}; + } // namespace BRM diff --git a/versioning/BRM/dbrm.h b/versioning/BRM/dbrm.h index 9f9c0b43b..9b04711e3 100644 --- a/versioning/BRM/dbrm.h +++ b/versioning/BRM/dbrm.h @@ -108,7 +108,7 @@ class DBRM EXPORT static void refreshShm() { MasterSegmentTableImpl::refreshShm(); - ExtentMapImpl::refreshShm(); + ExtentMapRBTreeImpl::refreshShm(); FreeListImpl::refreshShm(); } diff --git a/versioning/BRM/extentmap.cpp b/versioning/BRM/extentmap.cpp index 0069df668..f02623dfe 100644 --- a/versioning/BRM/extentmap.cpp +++ b/versioning/BRM/extentmap.cpp @@ -1,4 +1,3 @@ - /* Copyright (C) 2014 InfiniDB, Inc. Copyright (C) 2016-2022 MariaDB Corporation @@ -217,36 +216,33 @@ bool EMEntry::operator<(const EMEntry& e) const } /*static*/ -boost::mutex ExtentMapImpl::fInstanceMutex; boost::mutex ExtentMap::mutex; boost::mutex ExtentMap::emIndexMutex; -/*static*/ -ExtentMapImpl* ExtentMapImpl::fInstance = nullptr; +boost::mutex ExtentMapRBTreeImpl::fInstanceMutex; +ExtentMapRBTreeImpl* ExtentMapRBTreeImpl::fInstance = nullptr; /*static*/ -ExtentMapImpl* ExtentMapImpl::makeExtentMapImpl(unsigned key, off_t size, bool readOnly) +ExtentMapRBTreeImpl* ExtentMapRBTreeImpl::makeExtentMapRBTreeImpl(unsigned key, off_t size, bool readOnly) { boost::mutex::scoped_lock lk(fInstanceMutex); if (fInstance) { - if (key != fInstance->fExtMap.key()) + if (key != fInstance->fManagedShm.key()) { - BRMShmImpl newShm(key, 0); - fInstance->swapout(newShm); + fInstance->fManagedShm.reMapSegment(); } - ASSERT(key == fInstance->fExtMap.key()); return fInstance; } - fInstance = new ExtentMapImpl(key, size, readOnly); - + fInstance = new ExtentMapRBTreeImpl(key, size, readOnly); return fInstance; } -ExtentMapImpl::ExtentMapImpl(unsigned key, off_t size, bool readOnly) : fExtMap(key, size, readOnly) +ExtentMapRBTreeImpl::ExtentMapRBTreeImpl(unsigned key, off_t size, bool readOnly) + : fManagedShm(key, size, readOnly) { } @@ -350,7 +346,7 @@ bool ExtentMapIndexImpl::growIfNeeded(const size_t memoryNeeded) return false; } -InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert(const EMEntry& emEntry, const size_t emIdx) +InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert(const EMEntry& emEntry, const LBID_t lbid) { auto dbRoot = emEntry.dbRoot; auto* extentMapIndexPtr = get(); @@ -365,14 +361,14 @@ InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert(const EMEntry& emEntry, cons assert(extentMapIndexPtr); ShmVoidAllocator alloc(fBRMManagedShmMemImpl_.getManagedSegment()->get_segment_manager()); OIDIndexContainerT oidIndices(alloc); - extentMapIndexPtr->push_back(std::move(oidIndices)); + extentMapIndexPtr->push_back(oidIndices); } auto& extentMapIndex = *extentMapIndexPtr; - return insert2ndLayerWrapper(extentMapIndex[dbRoot], emEntry, emIdx, shmemHasGrown); + return insert2ndLayerWrapper(extentMapIndex[dbRoot], emEntry, lbid, shmemHasGrown); } InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert2ndLayer(OIDIndexContainerT& oids, const EMEntry& emEntry, - const size_t emIdx, const bool aShmemHasGrown) + const LBID_t lbid, const bool aShmemHasGrown) { OID_t oid = emEntry.fileID; ShmVoidAllocator alloc(fBRMManagedShmMemImpl_.getManagedSegment()->get_segment_manager()); @@ -383,14 +379,14 @@ InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert2ndLayer(OIDIndexContainerT& if (iterAndResult.second) { PartitionIndexContainerT& partitionsContainer = (*iterAndResult.first).second; - return insert3dLayerWrapper(partitionsContainer, emEntry, emIdx, aShmemHasGrown); + return insert3dLayerWrapper(partitionsContainer, emEntry, lbid, aShmemHasGrown); } else return {false, aShmemHasGrown}; } InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert2ndLayerWrapper(OIDIndexContainerT& oids, - const EMEntry& emEntry, const size_t emIdx, + const EMEntry& emEntry, const LBID_t lbid, const bool aShmemHasGrown) { OID_t oid = emEntry.fileID; @@ -410,28 +406,28 @@ InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert2ndLayerWrapper(OIDIndexConta shmemHasGrown = shmemHasGrown || aShmemHasGrown; // The dbroot must be here b/c it was already found once in the upper insert(). OIDIndexContainerT& refreshedOidsRef = extMapIndex[emEntry.dbRoot]; - return insert2ndLayer(refreshedOidsRef, emEntry, emIdx, shmemHasGrown); + return insert2ndLayer(refreshedOidsRef, emEntry, lbid, shmemHasGrown); } - return insert2ndLayer(oids, emEntry, emIdx, shmemHasGrown); + return insert2ndLayer(oids, emEntry, lbid, shmemHasGrown); } PartitionIndexContainerT& partitions = (*oidsIter).second; - return insert3dLayerWrapper(partitions, emEntry, emIdx, shmemHasGrown); + return insert3dLayerWrapper(partitions, emEntry, lbid, shmemHasGrown); } InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert3dLayer(PartitionIndexContainerT& partitions, - const EMEntry& emEntry, const size_t emIdx, + const EMEntry& emEntry, const LBID_t lbid, const bool aShmemHasGrown) { auto partitionNumber = emEntry.partitionNum; ShmVoidAllocator alloc(fBRMManagedShmMemImpl_.getManagedSegment()->get_segment_manager()); - ExtentMapIndicesT emIndices(alloc); - emIndices.push_back(emIdx); - auto iterAndResult = partitions.insert({partitionNumber, std::move(emIndices)}); + LBID_tVectorT lbids(alloc); + lbids.push_back(lbid); + auto iterAndResult = partitions.insert({partitionNumber, std::move(lbids)}); return {iterAndResult.second, aShmemHasGrown}; } InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert3dLayerWrapper(PartitionIndexContainerT& partitions, - const EMEntry& emEntry, const size_t emIdx, + const EMEntry& emEntry, const LBID_t lbid, const bool aShmemHasGrown) { auto partitionNumber = emEntry.partitionNum; @@ -454,18 +450,18 @@ InsertUpdateShmemKeyPair ExtentMapIndexImpl::insert3dLayerWrapper(PartitionIndex OIDIndexContainerT& refreshedOidsRef = extMapIndex[emEntry.dbRoot]; auto oidsIter = refreshedOidsRef.find(emEntry.fileID); PartitionIndexContainerT& refreshedPartitionsRef = (*oidsIter).second; - return insert3dLayer(refreshedPartitionsRef, emEntry, emIdx, shmemHasGrown); + return insert3dLayer(refreshedPartitionsRef, emEntry, lbid, shmemHasGrown); } - return insert3dLayer(partitions, emEntry, emIdx, shmemHasGrown); + return insert3dLayer(partitions, emEntry, lbid, shmemHasGrown); } - ExtentMapIndicesT& emIndices = (*partitionsIter).second; - emIndices.push_back(emIdx); + LBID_tVectorT& lbids = (*partitionsIter).second; + lbids.push_back(lbid); return {true, shmemHasGrown}; } -ExtentMapIndexFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OID_t oid, - const PartitionNumberT partitionNumber) +LBID_tFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OID_t oid, + const PartitionNumberT partitionNumber) { ExtentMapIndex& emIndex = *get(); if (dbroot >= emIndex.size()) @@ -473,7 +469,7 @@ ExtentMapIndexFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OI return search2ndLayer(emIndex[dbroot], oid, partitionNumber); } -ExtentMapIndexFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OID_t oid) +LBID_tFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OID_t oid) { ExtentMapIndex& emIndex = *get(); if (dbroot >= emIndex.size()) @@ -481,8 +477,8 @@ ExtentMapIndexFindResult ExtentMapIndexImpl::find(const DBRootT dbroot, const OI return search2ndLayer(emIndex[dbroot], oid); } -ExtentMapIndexFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& oids, const OID_t oid, - const PartitionNumberT partitionNumber) +LBID_tFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& oids, const OID_t oid, + const PartitionNumberT partitionNumber) { auto oidsIter = oids.find(oid); if (oidsIter == oids.end()) @@ -492,17 +488,17 @@ ExtentMapIndexFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& return search3dLayer(partitions, partitionNumber); } -ExtentMapIndexFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& oids, const OID_t oid) +LBID_tFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& oids, const OID_t oid) { auto oidsIter = oids.find(oid); if (oidsIter == oids.end()) return {}; - ExtentMapIndexFindResult result; + LBID_tFindResult result; PartitionIndexContainerT& partitions = (*oidsIter).second; for (auto& partKeyValue : partitions) { - ExtentMapIndicesT& emIdentifiers = partKeyValue.second; + LBID_tVectorT& emIdentifiers = partKeyValue.second; for (auto& emIdent : emIdentifiers) result.push_back(emIdent); } @@ -510,20 +506,29 @@ ExtentMapIndexFindResult ExtentMapIndexImpl::search2ndLayer(OIDIndexContainerT& return result; } -ExtentMapIndexFindResult ExtentMapIndexImpl::search3dLayer(PartitionIndexContainerT& partitions, - const PartitionNumberT partitionNumber) +LBID_tFindResult ExtentMapIndexImpl::search3dLayer(PartitionIndexContainerT& partitions, + const PartitionNumberT partitionNumber) { auto partitionsIter = partitions.find(partitionNumber); if (partitionsIter == partitions.end()) return {}; - ExtentMapIndexFindResult result; - ExtentMapIndicesT& emIndicesVec = (*partitionsIter).second; + LBID_tFindResult result; + LBID_tVectorT& emIndicesVec = (*partitionsIter).second; for (auto& emIndex : emIndicesVec) result.push_back(emIndex); + return result; } +bool ExtentMapIndexImpl::isDBRootEmpty(const DBRootT dbroot) +{ + ExtentMapIndex& extMapIndex = *get(); + if (dbroot >= extMapIndex.size()) + return true; + return extMapIndex[dbroot].empty(); +} + void ExtentMapIndexImpl::deleteDbRoot(const DBRootT dbroot) { auto& extMapIndex = *get(); @@ -547,7 +552,7 @@ void ExtentMapIndexImpl::deleteOID(const DBRootT dbroot, const OID_t oid) extMapIndex[dbroot].erase(oidsIter); } -void ExtentMapIndexImpl::deleteEMEntry(const EMEntry& emEntry, const ExtentMapIdxT emIdent) +void ExtentMapIndexImpl::deleteEMEntry(const EMEntry& emEntry, const LBID_t lbid) { // find partition auto& extMapIndex = *get(); @@ -562,11 +567,11 @@ void ExtentMapIndexImpl::deleteEMEntry(const EMEntry& emEntry, const ExtentMapId auto partitionsIter = partitions.find(emEntry.partitionNum); if (partitionsIter == partitions.end()) return; - ExtentMapIndicesT& emIdentifiers = (*partitionsIter).second; + LBID_tVectorT& emIdentifiers = (*partitionsIter).second; // pop the identifier if (emIdentifiers.size() > 1) { - auto emIdentifiersTargetIter = std::find(emIdentifiers.begin(), emIdentifiers.end(), emIdent); + auto emIdentifiersTargetIter = std::find(emIdentifiers.begin(), emIdentifiers.end(), lbid); std::swap(*emIdentifiersTargetIter, emIdentifiers.back()); emIdentifiers.pop_back(); } @@ -578,18 +583,18 @@ void ExtentMapIndexImpl::deleteEMEntry(const EMEntry& emEntry, const ExtentMapId ExtentMap::ExtentMap() { - fExtentMap = nullptr; - fFreeList = nullptr; - fPExtMapImpl = nullptr; fCurrentEMShmkey = -1; fCurrentFLShmkey = -1; - fEMShminfo = nullptr; - fFLShminfo = nullptr; + fEMRBTreeShminfo = nullptr; fEMIndexShminfo = nullptr; + fFLShminfo = nullptr; + r_only = false; flLocked = false; emLocked = false; emIndexLocked = false; + + fPExtMapRBTreeImpl = nullptr; fPFreeListImpl = nullptr; fPExtMapIndexImpl_ = nullptr; @@ -613,6 +618,38 @@ ExtentMap::~ExtentMap() fPmDbRootMap.clear(); } +ExtentMapRBTree::iterator ExtentMap::findByLBID(const LBID_t lbid) +{ + auto emIt = fExtentMapRBTree->lower_bound(lbid); + auto end = fExtentMapRBTree->end(); + if (emIt == end) + { + if (fExtentMapRBTree->size() == 0) + return end; + + // Check the last one. + auto last = std::prev(end); + const auto lastBlock = (last->second.range.size * 1024); + if ((last->first <= lbid) && (lbid < (last->first + lastBlock))) + { + return last; + } + return end; + } + + // Lower bound returns the first element not less than the given key. + if (emIt->first != lbid) + { + if (emIt == fExtentMapRBTree->begin()) + { + return end; + } + emIt = std::prev(emIt); + } + + return emIt; +} + // Casual Partioning support // @@ -627,67 +664,44 @@ ExtentMap::~ExtentMap() int ExtentMap::_markInvalid(const LBID_t lbid, const execplan::CalpontSystemCatalog::ColDataType colDataType) { - int entries; - int i; - LBID_t lastBlock; + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::markInvalid(): lbid isn't allocated"); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); + auto& emEntry = emIt->second; + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.partition.cprange.isValid = CP_UPDATING; - for (i = 0; i < entries; i++) + if (isUnsigned(colDataType)) { - lastBlock = fExtentMap[i].range.start + (static_cast(fExtentMap[i].range.size) * 1024) - 1; - - if (fExtentMap[i].range.size != 0) + if (emEntry.colWid != datatypes::MAXDECIMALWIDTH) { - if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock) - { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - fExtentMap[i].partition.cprange.isValid = CP_UPDATING; - - if (isUnsigned(colDataType)) - { - if (fExtentMap[i].colWid != datatypes::MAXDECIMALWIDTH) - { - fExtentMap[i].partition.cprange.loVal = numeric_limits::max(); - fExtentMap[i].partition.cprange.hiVal = numeric_limits::min(); - } - else - { - fExtentMap[i].partition.cprange.bigLoVal = - -1; // XXX: unsigned wide decimals do not exceed rang of signed wide decimals. - fExtentMap[i].partition.cprange.bigHiVal = 0; - } - } - else - { - if (fExtentMap[i].colWid != datatypes::MAXDECIMALWIDTH) - { - fExtentMap[i].partition.cprange.loVal = numeric_limits::max(); - fExtentMap[i].partition.cprange.hiVal = numeric_limits::min(); - } - else - { - utils::int128Max(fExtentMap[i].partition.cprange.bigLoVal); - utils::int128Min(fExtentMap[i].partition.cprange.bigHiVal); - } - } - - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); -#ifdef BRM_DEBUG - ostringstream os; - os << "ExtentMap::_markInvalid(): casual partitioning update: firstLBID=" << fExtentMap[i].range.start - << " lastLBID=" << fExtentMap[i].range.start + fExtentMap[i].range.size * 1024 - 1 - << " OID=" << fExtentMap[i].fileID << " min=" << fExtentMap[i].partition.cprange.loVal - << " max=" << fExtentMap[i].partition.cprange.hiVal - << "seq=" << fExtentMap[i].partition.cprange.sequenceNum; - log(os.str(), logging::LOG_TYPE_DEBUG); -#endif - return 0; - } + emEntry.partition.cprange.loVal = numeric_limits::max(); + emEntry.partition.cprange.hiVal = numeric_limits::min(); + } + else + { + emEntry.partition.cprange.bigLoVal = + -1; // XXX: unsigned wide decimals do not exceed rang of signed wide decimals. + emEntry.partition.cprange.bigHiVal = 0; + } + } + else + { + if (emEntry.colWid != datatypes::MAXDECIMALWIDTH) + { + emEntry.partition.cprange.loVal = numeric_limits::max(); + emEntry.partition.cprange.hiVal = numeric_limits::min(); + } + else + { + utils::int128Max(emEntry.partition.cprange.bigLoVal); + utils::int128Min(emEntry.partition.cprange.bigHiVal); } } - throw logic_error("ExtentMap::markInvalid(): lbid isn't allocated"); + incSeqNum(emEntry.partition.cprange.sequenceNum); + return 0; } int ExtentMap::markInvalid(const LBID_t lbid, const execplan::CalpontSystemCatalog::ColDataType colDataType) @@ -803,9 +817,6 @@ int ExtentMap::setMaxMin(const LBID_t lbid, const int64_t max, const int64_t min } #endif - int entries; - int i; - LBID_t lastBlock; int32_t curSequence; #ifdef BRM_DEBUG @@ -817,66 +828,47 @@ int ExtentMap::setMaxMin(const LBID_t lbid, const int64_t max, const int64_t min grabEMEntryTable(WRITE); grabEMIndex(WRITE); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (i = 0; i < entries; i++) + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::getMaxMin(): that lbid isn't allocated"); + + auto& emEntry = emIt->second; + curSequence = emEntry.partition.cprange.sequenceNum; + + if (curSequence == seqNum) { - if (fExtentMap[i].range.size != 0) - { - lastBlock = fExtentMap[i].range.start + (static_cast(fExtentMap[i].range.size) * 1024) - 1; - curSequence = fExtentMap[i].partition.cprange.sequenceNum; - - if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock) - { -#ifdef BRM_DEBUG - - if (firstNode) - { - ostringstream os; - os << "ExtentMap::setMaxMin(): casual partitioning update: firstLBID=" << fExtentMap[i].range.start - << " lastLBID=" << fExtentMap[i].range.start + fExtentMap[i].range.size * 1024 - 1 - << " OID=" << fExtentMap[i].fileID << " min=" << min << " max=" << max << "seq=" << seqNum; - log(os.str(), logging::LOG_TYPE_DEBUG); - } - -#endif - - if (curSequence == seqNum) - { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - fExtentMap[i].partition.cprange.hiVal = max; - fExtentMap[i].partition.cprange.loVal = min; - fExtentMap[i].partition.cprange.isValid = CP_VALID; - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - return 0; - } - // special val to indicate a reset--used by editem -c. - // Also used by COMMIT and ROLLBACK to invalidate CP. - else if (seqNum == SEQNUM_MARK_INVALID) - { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - // We set hiVal and loVal to correct values for signed or unsigned - // during the markinvalid step, which sets the invalid variable to CP_UPDATING. - // During this step (seqNum == SEQNUM_MARK_INVALID), the min and max passed in are not reliable - // and should not be used. - fExtentMap[i].partition.cprange.isValid = CP_INVALID; - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - return 0; - } - else - { - return 0; - } - } - } + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.partition.cprange.hiVal = max; + emEntry.partition.cprange.loVal = min; + emEntry.partition.cprange.isValid = CP_VALID; + incSeqNum(emEntry.partition.cprange.sequenceNum); + return 0; + } + // special val to indicate a reset--used by editem -c. + // Also used by COMMIT and ROLLBACK to invalidate CP. + else if (seqNum == SEQNUM_MARK_INVALID) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + // We set hi_val and lo_val to correct values for signed or unsigned + // during the markinvalid step, which sets the invalid variable to CP_UPDATING. + // During this step (seqNum == -1), the min and max passed in are not reliable + // and should not be used. + emEntry.partition.cprange.isValid = CP_INVALID; + incSeqNum(emEntry.partition.cprange.sequenceNum); + return 0; + } + else + { + return 0; } - - if (emLocked) - releaseEMEntryTable(WRITE); if (emIndexLocked) releaseEMIndex(WRITE); + if (emLocked) + releaseEMEntryTable(WRITE); + throw logic_error("ExtentMap::setMaxMin(): lbid isn't allocated"); } @@ -916,8 +908,6 @@ void ExtentMap::setExtentsMaxMin(const CPMaxMinMap_t& cpMap, bool firstNode, boo } #endif - int entries; - int i; int32_t curSequence; const int32_t extentsToUpdate = cpMap.size(); int32_t extentsUpdated = 0; @@ -935,132 +925,95 @@ void ExtentMap::setExtentsMaxMin(const CPMaxMinMap_t& cpMap, bool firstNode, boo grabEMIndex(WRITE); } - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (i = 0; i < entries; i++) + for (it = cpMap.begin(); it != cpMap.end(); ++it) { - if (fExtentMap[i].range.size != 0) + auto emIt = findByLBID(it->first); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::setExtentsMaxMin(): lbid not found"); + + auto& emEntry = emIt->second; + curSequence = emEntry.partition.cprange.sequenceNum; + + if (curSequence == it->second.seqNum && emEntry.partition.cprange.isValid == CP_INVALID) { - it = cpMap.find(fExtentMap[i].range.start); - - if (it != cpMap.end()) + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + if (it->second.isBinaryColumn) { - curSequence = fExtentMap[i].partition.cprange.sequenceNum; - - if (curSequence == it->second.seqNum && fExtentMap[i].partition.cprange.isValid == CP_INVALID) + emEntry.partition.cprange.bigHiVal = it->second.bigMax; + emEntry.partition.cprange.bigLoVal = it->second.bigMin; + } + else + { + emEntry.partition.cprange.hiVal = it->second.max; + emEntry.partition.cprange.loVal = it->second.min; + } + emEntry.partition.cprange.isValid = CP_VALID; + incSeqNum(emEntry.partition.cprange.sequenceNum); + extentsUpdated++; + } + // special val to indicate a reset -- ignore the min/max + else if (it->second.seqNum == SEQNUM_MARK_INVALID) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + // We set hiVal and loVal to correct values for signed or unsigned + // during the markinvalid step, which sets the invalid variable to CP_UPDATING. + // During this step (seqNum == -1), the min and max passed in are not reliable + // and should not be used. + emEntry.partition.cprange.isValid = CP_INVALID; + incSeqNum(emEntry.partition.cprange.sequenceNum); + extentsUpdated++; + } + // special val to indicate a reset -- assign the min/max + else if (it->second.seqNum == SEQNUM_MARK_INVALID_SET_RANGE) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + if (it->second.isBinaryColumn) + { + emEntry.partition.cprange.bigHiVal = it->second.bigMax; + emEntry.partition.cprange.bigLoVal = it->second.bigMin; + } + else + { + emEntry.partition.cprange.hiVal = it->second.max; + emEntry.partition.cprange.loVal = it->second.min; + } + emEntry.partition.cprange.isValid = CP_INVALID; + incSeqNum(emEntry.partition.cprange.sequenceNum); + extentsUpdated++; + } + else if (it->second.seqNum == SEQNUM_MARK_UPDATING_INVALID_SET_RANGE) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + if (emEntry.partition.cprange.isValid == CP_UPDATING) + { + if (it->second.isBinaryColumn) { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - if (it->second.isBinaryColumn) - { - fExtentMap[i].partition.cprange.bigHiVal = it->second.bigMax; - fExtentMap[i].partition.cprange.bigLoVal = it->second.bigMin; - } - else - { - fExtentMap[i].partition.cprange.hiVal = it->second.max; - fExtentMap[i].partition.cprange.loVal = it->second.min; - } - fExtentMap[i].partition.cprange.isValid = CP_VALID; - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - extentsUpdated++; -#ifdef BRM_DEBUG - - if (firstNode) - { - ostringstream os; - os << "ExtentMap::setExtentsMaxMin(): casual partitioning update: firstLBID=" - << fExtentMap[i].range.start - << " lastLBID=" << fExtentMap[i].range.start + fExtentMap[i].range.size * 1024 - 1 - << " OID=" << fExtentMap[i].fileID << " min=" << it->second.min << " max=" << it->second.max - << " seq=" << it->second.seqNum; - log(os.str(), logging::LOG_TYPE_DEBUG); - } - -#endif + emEntry.partition.cprange.bigHiVal = it->second.bigMax; + emEntry.partition.cprange.bigLoVal = it->second.bigMin; } - // special val to indicate a reset -- ignore the min/max - else if (it->second.seqNum == SEQNUM_MARK_INVALID) - { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - // We set hiVal and loVal to correct values for signed or unsigned - // during the markinvalid step, which sets the invalid variable to CP_UPDATING. - // During this step (seqNum == SEQNUM_MARK_INVALID), the min and max passed in are not reliable - // and should not be used. - fExtentMap[i].partition.cprange.isValid = CP_INVALID; - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - extentsUpdated++; - } - // special val to indicate a reset -- assign the min/max - else if (it->second.seqNum == SEQNUM_MARK_INVALID_SET_RANGE) - { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - if (it->second.isBinaryColumn) - { - fExtentMap[i].partition.cprange.bigHiVal = it->second.bigMax; - fExtentMap[i].partition.cprange.bigLoVal = it->second.bigMin; - } - else - { - fExtentMap[i].partition.cprange.hiVal = it->second.max; - fExtentMap[i].partition.cprange.loVal = it->second.min; - } - fExtentMap[i].partition.cprange.isValid = CP_INVALID; - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - extentsUpdated++; - } - else if (it->second.seqNum == SEQNUM_MARK_UPDATING_INVALID_SET_RANGE) - { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - if (fExtentMap[i].partition.cprange.isValid == CP_UPDATING) - { - if (it->second.isBinaryColumn) - { - fExtentMap[i].partition.cprange.bigHiVal = it->second.bigMax; - fExtentMap[i].partition.cprange.bigLoVal = it->second.bigMin; - } - else - { - fExtentMap[i].partition.cprange.hiVal = it->second.max; - fExtentMap[i].partition.cprange.loVal = it->second.min; - } - fExtentMap[i].partition.cprange.isValid = CP_INVALID; - } - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - extentsUpdated++; - } - // else sequence has changed since start of the query. Don't update the EM entry. else { - extentsUpdated++; - } - - if (extentsUpdated == extentsToUpdate) - { - return; + emEntry.partition.cprange.hiVal = it->second.max; + emEntry.partition.cprange.loVal = it->second.min; } + emEntry.partition.cprange.isValid = CP_INVALID; } + incSeqNum(emEntry.partition.cprange.sequenceNum); + extentsUpdated++; + } + // else sequence has changed since start of the query. Don't update the EM entry. + else + { + extentsUpdated++; + } + + if (extentsUpdated == extentsToUpdate) + { + return; } } - ostringstream oss; - oss << "ExtentMap::setExtentsMaxMin(): LBIDs not allocated:"; - for (it = cpMap.begin(); it != cpMap.end(); it++) - { - for (i = 0; i < entries; i++) - { - if (fExtentMap[i].range.start == it->first) - { - break; - } - } - if (i < entries) - { - continue; - } - oss << " " << it->first; - } - - throw logic_error(oss.str()); + throw logic_error("ExtentMap::setExtentsMaxMin(): lbid isn't allocated"); } //------------------------------------------------------------------------------ @@ -1136,199 +1089,184 @@ void ExtentMap::mergeExtentsMaxMin(CPMaxMinMergeMap_t& cpMap, bool useLock) grabEMIndex(WRITE); } - int entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (int i = 0; i < entries; i++) // loop through all extents + for (it = cpMap.begin(); it != cpMap.end(); ++it) { - if (fExtentMap[i].range.size != 0) // find eligible extents - { - it = cpMap.find(fExtentMap[i].range.start); + auto emIt = findByLBID(it->first); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::mergeExtentsMaxMin(): lbid not found"); - if (it != cpMap.end()) - { + auto& emEntry = emIt->second; #ifdef BRM_DEBUG - ostringstream os; - os << "ExtentMap::mergeExtentsMaxMin(): casual partitioning update: firstLBID=" - << fExtentMap[i].range.start - << " lastLBID=" << fExtentMap[i].range.start + fExtentMap[i].range.size * 1024 - 1 - << " OID=" << fExtentMap[i].fileID << " hiVal=" << fExtentMap[i].partition.cprange.hiVal - << " loVal=" << fExtentMap[i].partition.cprange.loVal << " min=" << it->second.min - << " max=" << it->second.max << " seq=" << it->second.seqNum; - log(os.str(), logging::LOG_TYPE_DEBUG); + ostringstream os; + os << "ExtentMap::mergeExtentsMaxMin(): casual partitioning update: firstLBID=" << emEntry.range.start + << " lastLBID=" << emEntry.range.start + emEntry.range.size * 1024 - 1 << " OID=" << emEntry.fileID + << " hi_val=" << emEntry.partition.cprange.hiVal << " lo_val=" << emEntry.partition.cprange.loVal + << " min=" << it->second.min << " max=" << it->second.max << " seq=" << it->second.seqNum; + log(os.str(), logging::LOG_TYPE_DEBUG); + #endif - bool isBinaryColumn = it->second.colWidth > 8; + bool isBinaryColumn = it->second.colWidth > 8; - switch (fExtentMap[i].partition.cprange.isValid) + switch (emEntry.partition.cprange.isValid) + { + // Merge input min/max with current min/max + case CP_VALID: + { + if ((!isBinaryColumn && !isValidCPRange(it->second.max, it->second.min, it->second.type)) || + (isBinaryColumn && !isValidCPRange(it->second.bigMax, it->second.bigMin, it->second.type))) { - // Merge input min/max with current min/max - case CP_VALID: + break; + } + + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + + // We check the validity of the current min/max, + // because isValid could be CP_VALID for an extent + // having all NULL values, in which case the current + // min/max needs to be set instead of merged. + if ((!isBinaryColumn && isValidCPRange(emEntry.partition.cprange.hiVal, + emEntry.partition.cprange.loVal, it->second.type)) || + (isBinaryColumn && isValidCPRange(emEntry.partition.cprange.bigHiVal, + emEntry.partition.cprange.bigLoVal, it->second.type))) + { + // Swap byte order to do binary string comparison + if (isCharType(it->second.type)) { - if ((!isBinaryColumn && !isValidCPRange(it->second.max, it->second.min, it->second.type)) || - (isBinaryColumn && !isValidCPRange(it->second.bigMax, it->second.bigMin, it->second.type))) + uint64_t newMinVal = static_cast(uint64ToStr(static_cast(it->second.min))); + uint64_t newMaxVal = static_cast(uint64ToStr(static_cast(it->second.max))); + uint64_t oldMinVal = + static_cast(uint64ToStr(static_cast(emEntry.partition.cprange.loVal))); + uint64_t oldMaxVal = + static_cast(uint64ToStr(static_cast(emEntry.partition.cprange.hiVal))); + + if (newMinVal < oldMinVal) + emEntry.partition.cprange.loVal = it->second.min; + + if (newMaxVal > oldMaxVal) + emEntry.partition.cprange.hiVal = it->second.max; + } + else if (isUnsigned(it->second.type)) + { + if (!isBinaryColumn) { - break; - } - - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - - // We check the validity of the current min/max, - // because isValid could be CP_VALID for an extent - // having all NULL values, in which case the current - // min/max needs to be set instead of merged. - - if ((!isBinaryColumn && isValidCPRange(fExtentMap[i].partition.cprange.hiVal, - fExtentMap[i].partition.cprange.loVal, it->second.type)) || - (isBinaryColumn && isValidCPRange(fExtentMap[i].partition.cprange.bigHiVal, - fExtentMap[i].partition.cprange.bigLoVal, it->second.type))) - { - // Swap byte order to do binary string comparison - if (isCharType(it->second.type)) + if (static_cast(it->second.min) < + static_cast(emEntry.partition.cprange.loVal)) { - uint64_t newMinVal = - static_cast(uint64ToStr(static_cast(it->second.min))); - uint64_t newMaxVal = - static_cast(uint64ToStr(static_cast(it->second.max))); - uint64_t oldMinVal = static_cast( - uint64ToStr(static_cast(fExtentMap[i].partition.cprange.loVal))); - uint64_t oldMaxVal = static_cast( - uint64ToStr(static_cast(fExtentMap[i].partition.cprange.hiVal))); - - if (newMinVal < oldMinVal) - fExtentMap[i].partition.cprange.loVal = it->second.min; - - if (newMaxVal > oldMaxVal) - fExtentMap[i].partition.cprange.hiVal = it->second.max; + emEntry.partition.cprange.loVal = it->second.min; } - else if (isUnsigned(it->second.type)) + + if (static_cast(it->second.max) > + static_cast(emEntry.partition.cprange.hiVal)) { - if (!isBinaryColumn) - { - if (static_cast(it->second.min) < - static_cast(fExtentMap[i].partition.cprange.loVal)) - { - fExtentMap[i].partition.cprange.loVal = it->second.min; - } - - if (static_cast(it->second.max) > - static_cast(fExtentMap[i].partition.cprange.hiVal)) - { - fExtentMap[i].partition.cprange.hiVal = it->second.max; - } - } - else - { - if (static_cast(it->second.bigMin) < - static_cast(fExtentMap[i].partition.cprange.bigLoVal)) - { - fExtentMap[i].partition.cprange.bigLoVal = it->second.bigMin; - } - - if (static_cast(it->second.bigMax) > - static_cast(fExtentMap[i].partition.cprange.bigHiVal)) - { - fExtentMap[i].partition.cprange.bigHiVal = it->second.bigMax; - } - } - } - else - { - if (!isBinaryColumn) - { - if (it->second.min < fExtentMap[i].partition.cprange.loVal) - fExtentMap[i].partition.cprange.loVal = it->second.min; - - if (it->second.max > fExtentMap[i].partition.cprange.hiVal) - fExtentMap[i].partition.cprange.hiVal = it->second.max; - } - else - { - if (it->second.bigMin < fExtentMap[i].partition.cprange.bigLoVal) - fExtentMap[i].partition.cprange.bigLoVal = it->second.bigMin; - - if (it->second.bigMax > fExtentMap[i].partition.cprange.bigHiVal) - fExtentMap[i].partition.cprange.bigHiVal = it->second.bigMax; - } + emEntry.partition.cprange.hiVal = it->second.max; } } else { - if (!isBinaryColumn) + if (static_cast(it->second.bigMin) < + static_cast(emEntry.partition.cprange.bigLoVal)) { - fExtentMap[i].partition.cprange.loVal = it->second.min; - fExtentMap[i].partition.cprange.hiVal = it->second.max; + emEntry.partition.cprange.bigLoVal = it->second.bigMin; } - else + + if (static_cast(it->second.bigMax) > + static_cast(emEntry.partition.cprange.bigHiVal)) { - fExtentMap[i].partition.cprange.bigLoVal = it->second.bigMin; - fExtentMap[i].partition.cprange.bigHiVal = it->second.bigMax; + emEntry.partition.cprange.bigHiVal = it->second.bigMax; } } - - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - - break; } - - // DML is updating; just increment seqnum. - // This case is here for completeness. Table lock should - // prevent this state from occurring (see notes at top of - // this function) - case CP_UPDATING: + else { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - - break; - } - - // Reset min/max to new min/max only "if" we can treat this - // as a new extent, else leave the extent marked as INVALID - case CP_INVALID: - default: - { - makeUndoRecord(&fExtentMap[i], sizeof(struct EMEntry)); - - if (it->second.newExtent) + if (!isBinaryColumn) { - if ((!isBinaryColumn && isValidCPRange(it->second.max, it->second.min, it->second.type)) || - (isBinaryColumn && isValidCPRange(it->second.bigMax, it->second.bigMin, it->second.type))) - { - if (!isBinaryColumn) - { - fExtentMap[i].partition.cprange.loVal = it->second.min; - fExtentMap[i].partition.cprange.hiVal = it->second.max; - } - else - { - fExtentMap[i].partition.cprange.bigLoVal = it->second.bigMin; - fExtentMap[i].partition.cprange.bigHiVal = it->second.bigMax; - } - } + if (it->second.min < emEntry.partition.cprange.loVal) + emEntry.partition.cprange.loVal = it->second.min; - // Even if invalid range; we set state to CP_VALID, - // because the extent is valid, it is just empty. - fExtentMap[i].partition.cprange.isValid = CP_VALID; + if (it->second.max > emEntry.partition.cprange.hiVal) + emEntry.partition.cprange.hiVal = it->second.max; } + else + { + if (it->second.bigMin < emEntry.partition.cprange.bigLoVal) + emEntry.partition.cprange.bigLoVal = it->second.bigMin; - incSeqNum(fExtentMap[i].partition.cprange.sequenceNum); - break; + if (it->second.bigMax > emEntry.partition.cprange.bigHiVal) + emEntry.partition.cprange.bigHiVal = it->second.bigMax; + } } - } // switch on isValid state - - extentsMerged++; - - if (extentsMerged == extentsToMerge) + } + else { - return; // Leave when all extents in map are matched + if (!isBinaryColumn) + { + emEntry.partition.cprange.loVal = it->second.min; + emEntry.partition.cprange.hiVal = it->second.max; + } + else + { + emEntry.partition.cprange.bigLoVal = it->second.bigMin; + emEntry.partition.cprange.bigHiVal = it->second.bigMax; + } } - // Deleting objects from map, may speed up successive searches - cpMap.erase(it); + incSeqNum(emEntry.partition.cprange.sequenceNum); + break; + } - } // found a matching extent in the Map - } // extent map range size != 0 - } // end of loop through extent map + // DML is updating; just increment seqnum. + // This case is here for completeness. Table lock should + // prevent this state from occurring (see notes at top of + // this function) + case CP_UPDATING: + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + incSeqNum(emEntry.partition.cprange.sequenceNum); + break; + } + + // Reset min/max to new min/max only "if" we can treat this + // as a new extent, else leave the extent marked as INVALID + case CP_INVALID: + default: + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + if (it->second.newExtent) + { + if ((!isBinaryColumn && isValidCPRange(it->second.max, it->second.min, it->second.type)) || + (isBinaryColumn && isValidCPRange(it->second.bigMax, it->second.bigMin, it->second.type))) + { + if (!isBinaryColumn) + { + emEntry.partition.cprange.loVal = it->second.min; + emEntry.partition.cprange.hiVal = it->second.max; + } + else + { + emEntry.partition.cprange.bigLoVal = it->second.bigMin; + emEntry.partition.cprange.bigHiVal = it->second.bigMax; + } + } + + // Even if invalid range; we set state to CP_VALID, + // because the extent is valid, it is just empty. + emEntry.partition.cprange.isValid = CP_VALID; + } + + incSeqNum(emEntry.partition.cprange.sequenceNum); + break; + } + } // switch on isValid state + + ++extentsMerged; + + if (extentsMerged == extentsToMerge) + return; // Leave when all extents in map are matched + + // Deleting objects from map, may speed up successive searches + cpMap.erase(it); + } throw logic_error("ExtentMap::mergeExtentsMaxMin(): lbid not found"); } @@ -1425,9 +1363,6 @@ int ExtentMap::getMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) min = numeric_limits::max(); } seqNum *= (-1); - int entries; - int i; - LBID_t lastBlock; int isValid = CP_INVALID; #ifdef BRM_DEBUG @@ -1439,79 +1374,83 @@ int ExtentMap::getMaxMin(const LBID_t lbid, T& max, T& min, int32_t& seqNum) grabEMEntryTable(READ); grabEMIndex(READ); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (i = 0; i < entries; i++) + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::getMaxMin(): that lbid isn't allocated"); + + auto& emEntry = emIt->second; + if (typeid(T) == typeid(int128_t)) { - if (fExtentMap[i].range.size != 0) - { - lastBlock = fExtentMap[i].range.start + (static_cast(fExtentMap[i].range.size) * 1024) - 1; - - if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock) - { - if (typeid(T) == typeid(int128_t)) - { - max = fExtentMap[i].partition.cprange.bigHiVal; - min = fExtentMap[i].partition.cprange.bigLoVal; - } - else - { - max = fExtentMap[i].partition.cprange.hiVal; - min = fExtentMap[i].partition.cprange.loVal; - } - seqNum = fExtentMap[i].partition.cprange.sequenceNum; - isValid = fExtentMap[i].partition.cprange.isValid; - releaseEMIndex(READ); - releaseEMEntryTable(READ); - return isValid; - } - } + max = emEntry.partition.cprange.bigHiVal; + min = emEntry.partition.cprange.bigLoVal; } + else + { + max = emEntry.partition.cprange.hiVal; + min = emEntry.partition.cprange.loVal; + } + seqNum = emEntry.partition.cprange.sequenceNum; + isValid = emEntry.partition.cprange.isValid; releaseEMIndex(READ); releaseEMEntryTable(READ); - throw logic_error("ExtentMap::getMaxMin(): that lbid isn't allocated"); - // return -1; + + return isValid; +} + +std::vector ExtentMap::getEmIdentsByLbids(const bi::vector& lbids) +{ + std::vector emEntries; + for (auto lbid : lbids) + { + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::markInvalid(): lbid isn't allocated"); + // Materialize. + emEntries.push_back(emIt->second); + } + return emEntries; +} + +std::vector ExtentMap::getEmIteratorsByLbids(const bi::vector& lbids) +{ + // ExtentMapRBTree::iterator + std::vector emEntries; + for (auto lbid : lbids) + { + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::markInvalid(): lbid isn't allocated"); + // Materialize. + emEntries.push_back(emIt); + } + return emEntries; } void ExtentMap::getCPMaxMin(const BRM::LBID_t lbid, BRM::CPMaxMin& cpMaxMin) { - int entries; - int i; - LBID_t lastBlock; - #ifdef BRM_DEBUG - if (lbid < 0) throw invalid_argument("ExtentMap::getMaxMin(): lbid must be >= 0"); - #endif grabEMEntryTable(READ); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (i = 0; i < entries; i++) - { - if (fExtentMap[i].range.size != 0) - { - lastBlock = fExtentMap[i].range.start + (static_cast(fExtentMap[i].range.size) * 1024) - 1; - - if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock) - { - cpMaxMin.bigMax = fExtentMap[i].partition.cprange.bigHiVal; - cpMaxMin.bigMin = fExtentMap[i].partition.cprange.bigLoVal; - cpMaxMin.max = fExtentMap[i].partition.cprange.hiVal; - cpMaxMin.min = fExtentMap[i].partition.cprange.loVal; - cpMaxMin.seqNum = fExtentMap[i].partition.cprange.sequenceNum; - - releaseEMEntryTable(READ); - return; - } - } - } + grabEMIndex(READ); + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) + throw logic_error("ExtentMap::getMaxMin(): that lbid isn't allocated"); + + auto& emEntry = emIt->second; + cpMaxMin.bigMax = emEntry.partition.cprange.bigHiVal; + cpMaxMin.bigMin = emEntry.partition.cprange.bigLoVal; + cpMaxMin.max = emEntry.partition.cprange.hiVal; + cpMaxMin.min = emEntry.partition.cprange.loVal; + cpMaxMin.seqNum = emEntry.partition.cprange.sequenceNum; + releaseEMIndex(READ); releaseEMEntryTable(READ); - throw logic_error("ExtentMap::getMaxMin(): that lbid isn't allocated"); + return; } /* Removes a range from the freelist. Used by load() */ @@ -1621,43 +1560,33 @@ void ExtentMap::reserveLBIDRange(LBID_t start, uint8_t size) template void ExtentMap::loadVersion4or5(T* in, bool upgradeV4ToV5) { - int emNumElements = 0, flNumElements = 0; + uint32_t emNumElements = 0; + uint32_t flNumElements = 0; + uint32_t nbytes = 0; - int nbytes = 0; - nbytes += in->read((char*)&emNumElements, sizeof(int)); - nbytes += in->read((char*)&flNumElements, sizeof(int)); + nbytes += in->read((char*)&emNumElements, sizeof(uint32_t)); + nbytes += in->read((char*)&flNumElements, sizeof(uint32_t)); idbassert(emNumElements > 0); - if ((size_t)nbytes != sizeof(int) + sizeof(int)) + if (nbytes != (2 * sizeof(uint32_t))) { log_errno("ExtentMap::loadVersion4or5(): read "); throw runtime_error("ExtentMap::loadVersion4or5(): read failed. Check the error log."); } - void* fExtentMapPtr = static_cast(fExtentMap); - memset(fExtentMapPtr, 0, fEMShminfo->allocdSize); - fEMShminfo->currentSize = 0; - // init the free list + // Clear the extent map. + fExtentMapRBTree->clear(); + fEMRBTreeShminfo->currentSize = 0; + + // Init the free list. memset(fFreeList, 0, fFLShminfo->allocdSize); fFreeList[0].size = (1 << 26); // 2^36 LBIDs fFLShminfo->currentSize = sizeof(InlineLBIDRange); - // @Bug 3498 - // Calculate how big an extent map we're going to need and allocate it in one call - if ((fEMShminfo->allocdSize / sizeof(EMEntry)) < (unsigned)emNumElements) - { - size_t nrows = emNumElements; - - // Round up to the nearest EM_INCREMENT_ROWS - if ((nrows % EM_INCREMENT_ROWS) != 0) - { - nrows /= EM_INCREMENT_ROWS; - nrows++; - nrows *= EM_INCREMENT_ROWS; - } - - growEMShmseg(nrows); - } + // Calculate how much memory we need. + const uint32_t memorySizeNeeded = (emNumElements * EM_RB_TREE_NODE_SIZE) + EM_RB_TREE_EMPTY_SIZE; + constexpr const uint32_t freeShmemThreshold = EM_RB_TREE_INITIAL_SIZE >> 4; + growEMShmseg(memorySizeNeeded); size_t progress = 0, writeSize = emNumElements * sizeof(EMEntry); int err; @@ -1665,29 +1594,40 @@ void ExtentMap::loadVersion4or5(T* in, bool upgradeV4ToV5) if (!upgradeV4ToV5) { - writeSize = emNumElements * sizeof(EMEntry); - writePos = (char*)fExtentMap; - - while (progress < writeSize) + for (uint32_t i = 0; i < emNumElements; ++i) { - err = in->read(writePos + progress, writeSize - progress); - if (err <= 0) + progress = 0; + EMEntry emEntry; + writeSize = sizeof(EMEntry); + writePos = reinterpret_cast(&emEntry); + + while (progress < writeSize) { - log_errno("ExtentMap::loadVersion4or5(): read "); - throw runtime_error("ExtentMap::loadVersion4or5(): read failed. Check the error log."); + err = in->read(writePos + progress, writeSize - progress); + if (err <= 0) + { + log_errno("ExtentMap::loadVersion4(): read "); + throw runtime_error("ExtentMap::loadVersion4(): read failed. Check the error log."); + } + progress += (uint)err; } - progress += (uint)err; + if (fPExtMapRBTreeImpl->getFreeMemory() < freeShmemThreshold) + growEMShmseg(EM_RB_TREE_INCREMENT); + + std::pair lbidEMEntryPair = make_pair(emEntry.range.start, emEntry); + fExtentMapRBTree->insert(lbidEMEntryPair); } } else { // We are upgrading extent map from v4 to v5. - for (int i = 0; i < emNumElements; i++) + for (uint32_t i = 0; i < emNumElements; i++) { EMEntry_v4 emEntryV4; progress = 0; writeSize = sizeof(EMEntry_v4); - writePos = (char*)&(emEntryV4); + writePos = reinterpret_cast(&emEntryV4); + while (progress < writeSize) { err = in->read(writePos + progress, writeSize - progress); @@ -1700,62 +1640,70 @@ void ExtentMap::loadVersion4or5(T* in, bool upgradeV4ToV5) progress += (uint)err; } - fExtentMap[i].range.start = emEntryV4.range.start; - fExtentMap[i].range.size = emEntryV4.range.size; - fExtentMap[i].fileID = emEntryV4.fileID; - fExtentMap[i].blockOffset = emEntryV4.blockOffset; - fExtentMap[i].HWM = emEntryV4.HWM; - fExtentMap[i].partitionNum = emEntryV4.partitionNum; - fExtentMap[i].segmentNum = emEntryV4.segmentNum; - fExtentMap[i].dbRoot = emEntryV4.dbRoot; - fExtentMap[i].colWid = emEntryV4.colWid; - fExtentMap[i].status = emEntryV4.status; - fExtentMap[i].partition.cprange.hiVal = emEntryV4.partition.cprange.hi_val; - fExtentMap[i].partition.cprange.loVal = emEntryV4.partition.cprange.lo_val; - fExtentMap[i].partition.cprange.sequenceNum = emEntryV4.partition.cprange.sequenceNum; - fExtentMap[i].partition.cprange.isValid = emEntryV4.partition.cprange.isValid; + if (fPExtMapRBTreeImpl->getFreeMemory() < freeShmemThreshold) + growEMShmseg(EM_RB_TREE_INCREMENT); + + EMEntry emEntry; + emEntry.range.start = emEntryV4.range.start; + emEntry.range.size = emEntryV4.range.size; + emEntry.fileID = emEntryV4.fileID; + emEntry.blockOffset = emEntryV4.blockOffset; + emEntry.HWM = emEntryV4.HWM; + emEntry.partitionNum = emEntryV4.partitionNum; + emEntry.segmentNum = emEntryV4.segmentNum; + emEntry.dbRoot = emEntryV4.dbRoot; + emEntry.colWid = emEntryV4.colWid; + emEntry.status = emEntryV4.status; + emEntry.partition.cprange.hiVal = emEntryV4.partition.cprange.hi_val; + emEntry.partition.cprange.loVal = emEntryV4.partition.cprange.lo_val; + emEntry.partition.cprange.sequenceNum = emEntryV4.partition.cprange.sequenceNum; + emEntry.partition.cprange.isValid = emEntryV4.partition.cprange.isValid; + + std::pair lbidEMEntryPair = make_pair(emEntry.range.start, emEntry); + fExtentMapRBTree->insert(lbidEMEntryPair); } std::cout << emNumElements << " extents successfully upgraded" << std::endl; } - for (int i = 0; i < emNumElements; i++) + for (auto& lbidEMEntryPair : *fExtentMapRBTree) { - reserveLBIDRange(fExtentMap[i].range.start, fExtentMap[i].range.size); + EMEntry& emEntry = lbidEMEntryPair.second; + reserveLBIDRange(emEntry.range.start, emEntry.range.size); //@bug 1911 - verify status value is valid - if (fExtentMap[i].status < EXTENTSTATUSMIN || fExtentMap[i].status > EXTENTSTATUSMAX) - fExtentMap[i].status = EXTENTAVAILABLE; + if (emEntry.status < EXTENTSTATUSMIN || emEntry.status > EXTENTSTATUSMAX) + emEntry.status = EXTENTAVAILABLE; - auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[i], i); + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(emEntry, emEntry.range.start); if (resShmemHasGrownPair.second) fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); if (!resShmemHasGrownPair.first) - logAndSetEMIndexReadOnly("loadVersion4or5"); + logAndSetEMIndexReadOnly("loadVersion4"); } - fEMShminfo->currentSize = emNumElements * sizeof(EMEntry); + fEMRBTreeShminfo->currentSize = (emNumElements * EM_RB_TREE_NODE_SIZE) + EM_RB_TREE_EMPTY_SIZE; #ifdef DUMP_EXTENT_MAP - EMEntry* emSrc = fExtentMap; cout << "lbid\tsz\toid\tfbo\thwm\tpart#\tseg#\tDBRoot\twid\tst\thi\tlo\tsq\tv" << endl; - for (int i = 0; i < emNumElements; i++) + for (const auto& lbidEMEntryPair : *fExtentMapRBTRee) { - cout << emSrc[i].start << '\t' << emSrc[i].size << '\t' << emSrc[i].fileID << '\t' << emSrc[i].blockOffset - << '\t' << emSrc[i].HWM << '\t' << emSrc[i].partitionNum << '\t' << emSrc[i].segmentNum << '\t' - << emSrc[i].dbRoot << '\t' << emSrc[i].status << '\t' << emSrc[i].partition.cprange.hiVal << '\t' - << emSrc[i].partition.cprange.loVal << '\t' << emSrc[i].partition.cprange.sequenceNum << '\t' - << (int)(emSrc[i].partition.cprange.isValid) << endl; - } + const EMEntry& emEntry = lbidEMEntryPair.second; + cout << emEntry.start << '\t' << emEntry.size << '\t' << emEntry.fileID << '\t' << emEntry.blockOffset + << '\t' << emEntry.HWM << '\t' << emEntry.partitionNum << '\t' << emEntry.segmentNum << '\t' + << emEntry.dbRoot << '\t' << emEntry.status << '\t' << emEntry.partition.cprange.hiVal << '\t' + << emEntry.partition.cprange.loVal << '\t' << emEntry.partition.cprange.sequenceNum << '\t' + << (int)(emEntry.partition.cprange.isValid) << endl; + } - cout << "Free list entries:" << endl; - cout << "start\tsize" << endl; + cout << "Free list entries:" << endl; + cout << "start\tsize" << endl; - for (int i = 0; i < flNumElements; i++) - cout << fFreeList[i].start << '\t' << fFreeList[i].size << endl; + for (uint32_t i = 0; i < flNumElements; i++) + cout << fFreeList[i].start << '\t' << fFreeList[i].size << endl; #endif } @@ -1818,59 +1766,6 @@ void ExtentMap::load(const string& filename, bool fixFL) releaseEMEntryTable(WRITE); } -// This is a quick workaround, to be able to initialize initial system tables -// from binary blob. -// This should be updated, probably we need inherit from `IDBDataFile`. -struct EMBinaryReader -{ - EMBinaryReader(const char* data) : src(data) - { - } - - ssize_t read(char* dst, size_t size) - { - memcpy(dst, src, size); - src += size; - return size; - } - - const char* src; -}; - -void ExtentMap::loadFromBinaryBlob(const char* blob) -{ - grabEMEntryTable(WRITE); - grabEMIndex(WRITE); - - try - { - grabFreeList(WRITE); - } - catch (...) - { - releaseEMIndex(WRITE); - releaseEMEntryTable(WRITE); - throw; - } - - try - { - EMBinaryReader emBinReader(blob); - load(&emBinReader); - } - catch (...) - { - releaseFreeList(WRITE); - releaseEMIndex(WRITE); - releaseEMEntryTable(WRITE); - throw; - } - - releaseFreeList(WRITE); - releaseEMIndex(WRITE); - releaseEMEntryTable(WRITE); -} - template void ExtentMap::load(T* in) { @@ -1911,8 +1806,6 @@ void ExtentMap::save(const string& filename) #endif - int allocdSize, loadSize[3], i; - grabEMEntryTable(READ); grabEMIndex(READ); @@ -1927,7 +1820,7 @@ void ExtentMap::save(const string& filename) throw; } - if (fEMShminfo->currentSize == 0) + if (fEMRBTreeShminfo->currentSize == 0) { log("ExtentMap::save(): got request to save an empty BRM"); releaseFreeList(READ); @@ -1949,16 +1842,15 @@ void ExtentMap::save(const string& filename) throw ios_base::failure("ExtentMap::save(): open failed. Check the error log."); } + int loadSize[3]; loadSize[0] = EM_MAGIC_V5; - loadSize[1] = fEMShminfo->currentSize / sizeof(EMEntry); + loadSize[1] = fExtentMapRBTree->size(); loadSize[2] = fFLShminfo->allocdSize / sizeof(InlineLBIDRange); // needs to send all entries - int bytes = 0; - try { - const int wsize = 3 * sizeof(int); - bytes = out->write((char*)loadSize, wsize); + const int32_t wsize = 3 * sizeof(uint32_t); + const int32_t bytes = out->write((char*)loadSize, wsize); if (bytes != wsize) throw ios_base::failure("ExtentMap::save(): write failed. Check the error log."); @@ -1971,44 +1863,16 @@ void ExtentMap::save(const string& filename) throw; } - allocdSize = fEMShminfo->allocdSize / sizeof(EMEntry); - // const int emEntrySize = sizeof(EMEntry); + for (auto& lbidEMEntryPair : *fExtentMapRBTree) + { + EMEntry& emEntry = lbidEMEntryPair.second; + const uint32_t writeSize = sizeof(EMEntry); + char* writePos = reinterpret_cast(&emEntry); + uint32_t progress = 0; - int first = -1, last = -1, err; - size_t progress, writeSize; - for (i = 0; i < allocdSize; i++) - { - if (fExtentMap[i].range.size > 0 && first == -1) - first = i; - else if (fExtentMap[i].range.size <= 0 && first != -1) - { - last = i; - writeSize = (last - first) * sizeof(EMEntry); - progress = 0; - char* writePos = (char*)&fExtentMap[first]; - while (progress < writeSize) - { - err = out->write(writePos + progress, writeSize - progress); - if (err < 0) - { - releaseFreeList(READ); - releaseEMIndex(READ); - releaseEMEntryTable(READ); - throw ios_base::failure("ExtentMap::save(): write failed. Check the error log."); - } - progress += err; - } - first = -1; - } - } - if (first != -1) - { - writeSize = (allocdSize - first) * sizeof(EMEntry); - progress = 0; - char* writePos = (char*)&fExtentMap[first]; while (progress < writeSize) { - err = out->write(writePos + progress, writeSize - progress); + auto err = out->write(writePos + progress, writeSize - progress); if (err < 0) { releaseFreeList(READ); @@ -2020,12 +1884,12 @@ void ExtentMap::save(const string& filename) } } - progress = 0; - writeSize = fFLShminfo->allocdSize; - char* writePos = (char*)fFreeList; + uint32_t progress = 0; + const uint32_t writeSize = fFLShminfo->allocdSize; + char* writePos = reinterpret_cast(fFreeList); while (progress < writeSize) { - err = out->write(writePos + progress, writeSize - progress); + auto err = out->write(writePos + progress, writeSize - progress); if (err < 0) { releaseFreeList(READ); @@ -2033,7 +1897,6 @@ void ExtentMap::save(const string& filename) releaseEMEntryTable(READ); throw ios_base::failure("ExtentMap::save(): write failed. Check the error log."); } - progress += err; } @@ -2049,34 +1912,30 @@ void ExtentMap::grabEMEntryTable(OPS op) if (op == READ) { - fEMShminfo = fMST.getTable_read(MasterSegmentTable::EMTable); + fEMRBTreeShminfo = fMST.getTable_read(MasterSegmentTable::EMTable); } else { - fEMShminfo = fMST.getTable_write(MasterSegmentTable::EMTable); + fEMRBTreeShminfo = fMST.getTable_write(MasterSegmentTable::EMTable); emLocked = true; } - if (!fPExtMapImpl || fPExtMapImpl->key() != (unsigned)fEMShminfo->tableShmkey) + if (!fPExtMapRBTreeImpl || fPExtMapRBTreeImpl->key() != (uint32_t)fEMRBTreeShminfo->tableShmkey) { - if (fExtentMap != nullptr) - { - fExtentMap = nullptr; - } - - if (fEMShminfo->allocdSize == 0) + if (fEMRBTreeShminfo->allocdSize == 0) { if (op == READ) { fMST.getTable_upgrade(MasterSegmentTable::EMTable); emLocked = true; - if (fEMShminfo->allocdSize == 0) + if (fEMRBTreeShminfo->allocdSize == 0) { growEMShmseg(); } - emLocked = false; // has to be done holding the write lock + // Has to be done holding the write lock. + emLocked = false; fMST.getTable_downgrade(MasterSegmentTable::EMTable); } else @@ -2086,25 +1945,20 @@ void ExtentMap::grabEMEntryTable(OPS op) } else { - fPExtMapImpl = ExtentMapImpl::makeExtentMapImpl(fEMShminfo->tableShmkey, 0); + fPExtMapRBTreeImpl = ExtentMapRBTreeImpl::makeExtentMapRBTreeImpl(fEMRBTreeShminfo->tableShmkey, 0); + ASSERT(fPExtMapRBTreeImpl); - ASSERT(fPExtMapImpl); - - if (r_only) - fPExtMapImpl->makeReadOnly(); - - fExtentMap = fPExtMapImpl->get(); - - if (fExtentMap == nullptr) + fExtentMapRBTree = fPExtMapRBTreeImpl->get(); + if (fExtentMapRBTree == nullptr) { - log_errno("ExtentMap::grabEMEntryTable(): shmat"); - throw runtime_error("ExtentMap::grabEMEntryTable(): shmat failed. Check the error log."); + log_errno("ExtentMap cannot create RBTree in shared memory segment"); + throw runtime_error("ExtentMap cannot create RBTree in shared memory segment"); } } } else { - fExtentMap = fPExtMapImpl->get(); + fExtentMapRBTree = fPExtMapRBTreeImpl->get(); } } @@ -2239,13 +2093,11 @@ void ExtentMap::releaseEMEntryTable(OPS op) } else { - /* - Note: Technically we should mark it unlocked after it's unlocked, - however, that's a race condition. The only reason the up operation - here will fail is if the underlying semaphore doesn't exist anymore - or there is a locking logic error somewhere else. Either way, - declaring the EM unlocked here is OK. Same with all similar assignments. - */ + // Note: Technically we should mark it unlocked after it's unlocked, + // however, that's a race condition. The only reason the up operation + // here will fail is if the underlying semaphore doesn't exist anymore + // or there is a locking logic error somewhere else. Either way, + // declaring the EM unlocked here is OK. Same with all similar assignments. emLocked = false; fMST.releaseTable_write(MasterSegmentTable::EMTable); } @@ -2277,7 +2129,7 @@ void ExtentMap::releaseEMIndex(OPS op) key_t ExtentMap::chooseEMShmkey() { - return chooseShmkey(fEMShminfo, fShmKeys.KEYRANGE_EXTENTMAP_BASE); + return chooseShmkey(fEMRBTreeShminfo, fShmKeys.KEYRANGE_EXTENTMAP_BASE); } key_t ExtentMap::chooseFLShmkey() @@ -2310,38 +2162,50 @@ key_t ExtentMap::chooseShmkey(const MSTEntry* masterTableEntry, const uint32_t k /* Must be called holding the EM write lock Returns with the new shmseg mapped */ -void ExtentMap::growEMShmseg(size_t nrows) +void ExtentMap::growEMShmseg(size_t size) { size_t allocSize; - key_t newshmkey; + auto newShmKey = chooseEMShmkey(); - if (fEMShminfo->allocdSize == 0) - allocSize = EM_INITIAL_SIZE; + if (fEMRBTreeShminfo->allocdSize == 0) + allocSize = EM_RB_TREE_INITIAL_SIZE; else - allocSize = fEMShminfo->allocdSize + EM_INCREMENT; + allocSize = EM_RB_TREE_INCREMENT; - newshmkey = chooseEMShmkey(); - ASSERT((allocSize == EM_INITIAL_SIZE && !fPExtMapImpl) || fPExtMapImpl); + allocSize = std::max(size, allocSize); + ASSERT((allocSize == EM_RB_TREE_INITIAL_SIZE && !fPExtMapRBTreeImpl) || fPExtMapRBTreeImpl); - // Use the larger of the calculated value or the specified value - allocSize = max(allocSize, nrows * sizeof(EMEntry)); - - if (!fPExtMapImpl) + if (!fPExtMapRBTreeImpl) { - fPExtMapImpl = ExtentMapImpl::makeExtentMapImpl(newshmkey, allocSize, r_only); + if (fEMRBTreeShminfo->tableShmkey == 0) + fEMRBTreeShminfo->tableShmkey = newShmKey; + + fPExtMapRBTreeImpl = + ExtentMapRBTreeImpl::makeExtentMapRBTreeImpl(fEMRBTreeShminfo->tableShmkey, allocSize, r_only); } else { - fPExtMapImpl->grow(newshmkey, allocSize); + fEMRBTreeShminfo->tableShmkey = newShmKey; + fPExtMapRBTreeImpl->grow(fEMRBTreeShminfo->tableShmkey, allocSize); } - fEMShminfo->tableShmkey = newshmkey; - fEMShminfo->allocdSize = allocSize; + fEMRBTreeShminfo->allocdSize += allocSize; + fExtentMapRBTree = fPExtMapRBTreeImpl->get(); - if (r_only) - fPExtMapImpl->makeReadOnly(); + // That's mean we have a initial size. + if (fEMRBTreeShminfo->currentSize == 0) + fEMRBTreeShminfo->currentSize = EM_RB_TREE_EMPTY_SIZE; +} - fExtentMap = fPExtMapImpl->get(); +void ExtentMap::growIfNeededOnExtentCreate() +{ + const auto freeMem = fPExtMapRBTreeImpl->getFreeMemory(); + const auto memNeeded = EM_RB_TREE_NODE_SIZE * 10; + if (freeMem <= memNeeded) + { + // The accurate size is memNeeded - freeMem. + growEMShmseg(memNeeded); + } } void ExtentMap::growEMIndexShmseg(const size_t suggestedSize) @@ -2423,8 +2287,6 @@ int ExtentMap::lookup(LBID_t lbid, LBID_t& firstLbid, LBID_t& lastLbid) } #endif - int entries, i; - LBID_t lastBlock; #ifdef BRM_DEBUG @@ -2439,27 +2301,24 @@ int ExtentMap::lookup(LBID_t lbid, LBID_t& firstLbid, LBID_t& lastLbid) grabEMEntryTable(READ); grabEMIndex(READ); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (i = 0; i < entries; i++) + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) { - if (fExtentMap[i].range.size != 0) - { - lastBlock = fExtentMap[i].range.start + (static_cast(fExtentMap[i].range.size) * 1024) - 1; - - if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock) - { - firstLbid = fExtentMap[i].range.start; - lastLbid = lastBlock; - releaseEMIndex(READ); - releaseEMEntryTable(READ); - return 0; - } - } + releaseEMIndex(READ); + releaseEMEntryTable(READ); + return -1; } + + auto& emEntry = emIt->second; + LBID_t lastBlock = emEntry.range.start + (static_cast(emEntry.range.size) * 1024) - 1; + firstLbid = emEntry.range.start; + lastLbid = lastBlock; + releaseEMIndex(READ); releaseEMEntryTable(READ); - return -1; + + return 0; } // @bug 1055+. New functions added for multiple files per OID enhancement. @@ -2494,8 +2353,6 @@ int ExtentMap::lookupLocal(LBID_t lbid, int& OID, uint16_t& dbRoot, uint32_t& pa } #endif - int entries, i, offset; - LBID_t lastBlock; if (lbid < 0) { @@ -2508,34 +2365,27 @@ int ExtentMap::lookupLocal(LBID_t lbid, int& OID, uint16_t& dbRoot, uint32_t& pa grabEMEntryTable(READ); grabEMIndex(READ); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (i = 0; i < entries; i++) + auto emIt = findByLBID(lbid); + if (emIt == fExtentMapRBTree->end()) { - if (fExtentMap[i].range.size != 0) - { - lastBlock = fExtentMap[i].range.start + (static_cast(fExtentMap[i].range.size) * 1024) - 1; - - if (lbid >= fExtentMap[i].range.start && lbid <= lastBlock) - { - OID = fExtentMap[i].fileID; - dbRoot = fExtentMap[i].dbRoot; - segmentNum = fExtentMap[i].segmentNum; - partitionNum = fExtentMap[i].partitionNum; - - // TODO: Offset logic. - offset = lbid - fExtentMap[i].range.start; - fileBlockOffset = fExtentMap[i].blockOffset + offset; - - releaseEMIndex(READ); - releaseEMEntryTable(READ); - return 0; - } - } + releaseEMIndex(READ); + releaseEMEntryTable(READ); + return -1; } + + auto& emEntry = emIt->second; + OID = emEntry.fileID; + dbRoot = emEntry.dbRoot; + segmentNum = emEntry.segmentNum; + partitionNum = emEntry.partitionNum; + + // TODO: Offset logic. + auto offset = lbid - emEntry.range.start; + fileBlockOffset = emEntry.blockOffset + offset; + releaseEMIndex(READ); releaseEMEntryTable(READ); - return -1; + return 0; } int ExtentMap::lookupLocal(int OID, uint32_t partitionNum, uint16_t segmentNum, uint32_t fileBlockOffset, @@ -2570,17 +2420,18 @@ int ExtentMap::lookupLocal(int OID, uint32_t partitionNum, uint16_t segmentNum, for (auto dbRoot : dbRootVec) { - auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); - for (auto i : emIdents) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + const auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { // TODO: Blockoffset logic. - if (fExtentMap[i].range.size != 0 && fExtentMap[i].segmentNum == segmentNum && - fExtentMap[i].blockOffset <= fileBlockOffset && - fileBlockOffset <= - (fExtentMap[i].blockOffset + (static_cast(fExtentMap[i].range.size) * 1024) - 1)) + if (emEntry.range.size != 0 && emEntry.segmentNum == segmentNum && + emEntry.blockOffset <= fileBlockOffset && + fileBlockOffset <= (emEntry.blockOffset + (static_cast(emEntry.range.size) * 1024) - 1)) { - offset = fileBlockOffset - fExtentMap[i].blockOffset; - LBID = fExtentMap[i].range.start + offset; + offset = fileBlockOffset - emEntry.blockOffset; + LBID = emEntry.range.start + offset; + releaseEMIndex(READ); releaseEMEntryTable(READ); return 0; @@ -2610,7 +2461,6 @@ int ExtentMap::lookupLocal_DBroot(int OID, uint16_t dbroot, uint32_t partitionNu } #endif - int entries, i, offset; if (OID < 0) { @@ -2621,19 +2471,17 @@ int ExtentMap::lookupLocal_DBroot(int OID, uint16_t dbroot, uint32_t partitionNu grabEMEntryTable(READ); grabEMIndex(READ); - entries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (i = 0; i < entries; i++) + for (auto emIt = fExtentMapRBTree->begin(), end = fExtentMapRBTree->end(); emIt != end; ++emIt) { + const auto& emEntry = emIt->second; // TODO: Blockoffset logic. - if (fExtentMap[i].range.size != 0 && fExtentMap[i].fileID == OID && fExtentMap[i].dbRoot == dbroot && - fExtentMap[i].partitionNum == partitionNum && fExtentMap[i].segmentNum == segmentNum && - fExtentMap[i].blockOffset <= fileBlockOffset && - fileBlockOffset <= - (fExtentMap[i].blockOffset + (static_cast(fExtentMap[i].range.size) * 1024) - 1)) + if (emEntry.fileID == OID && emEntry.dbRoot == dbroot && emEntry.partitionNum == partitionNum && + emEntry.segmentNum == segmentNum && emEntry.blockOffset <= fileBlockOffset && + fileBlockOffset <= (emEntry.blockOffset + (static_cast(emEntry.range.size) * 1024) - 1)) { - offset = fileBlockOffset - fExtentMap[i].blockOffset; - LBID = fExtentMap[i].range.start + offset; + auto offset = fileBlockOffset - emEntry.blockOffset; + LBID = emEntry.range.start + offset; + releaseEMIndex(READ); releaseEMEntryTable(READ); return 0; @@ -2684,17 +2532,19 @@ int ExtentMap::lookupLocalStartLbid(int OID, uint32_t partitionNum, uint16_t seg for (auto dbRoot : dbRootVec) { - auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); - for (auto i : emIdents) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + const auto emIdents = getEmIdentsByLbids(lbids); + + for (auto& emEntry : emIdents) { - if (fExtentMap[i].range.size != 0 && fExtentMap[i].segmentNum == segmentNum && - fExtentMap[i].blockOffset <= fileBlockOffset && - fileBlockOffset <= - (fExtentMap[i].blockOffset + (static_cast(fExtentMap[i].range.size) * 1024) - 1)) + if (emEntry.range.size != 0 && emEntry.segmentNum == segmentNum && + emEntry.blockOffset <= fileBlockOffset && + fileBlockOffset <= (emEntry.blockOffset + (static_cast(emEntry.range.size) * 1024) - 1)) { - LBID = fExtentMap[i].range.start; + LBID = emEntry.range.start; releaseEMIndex(READ); releaseEMEntryTable(READ); + return 0; } } @@ -2836,10 +2686,7 @@ void ExtentMap::createColumnExtent_DBroot(int OID, uint32_t colWidth, uint16_t d grabFreeList(WRITE); } - if (fEMShminfo->currentSize == fEMShminfo->allocdSize) - { - growEMShmseg(); - } + growIfNeededOnExtentCreate(); // size is the number of multiples of 1024 blocks. // ex: size=1 --> 1024 blocks @@ -2878,8 +2725,6 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co uint32_t& partitionNum, uint16_t& segmentNum, uint32_t& startBlockOffset) { - EmptyEMEntry emptyEMEntry = -1; - LastExtentIndexT lastExtentIndex = -1; HighestOffset highestOffset = 0; PartitionNumberT highestPartNum = 0; SegmentT highestSegNum = 0; @@ -2906,35 +2751,28 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co // 3. Find first unused extent map entry //-------------------------------------------------------------------------- LBID_t startLBID = getLBIDsFromFreeList(size); - - auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID); // Find the first empty Entry; and find last extent for this OID and dbRoot - - for (auto i : emIdents) + EMEntry* lastExtent = nullptr; + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID); + auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { - if (fExtentMap[i].range.size != 0) + // 1. Find HWM extent in relevant DBRoot + if ((emEntry.partitionNum > highestPartNum) || + ((emEntry.partitionNum == highestPartNum) && (emEntry.blockOffset > highestOffset)) || + ((emEntry.partitionNum == highestPartNum) && (emEntry.blockOffset == highestOffset) && + (emEntry.segmentNum >= highestSegNum))) { - // 1. Find HWM extent in relevant DBRoot - if ((fExtentMap[i].partitionNum > highestPartNum) || - ((fExtentMap[i].partitionNum == highestPartNum) && (fExtentMap[i].blockOffset > highestOffset)) || - ((fExtentMap[i].partitionNum == highestPartNum) && (fExtentMap[i].blockOffset == highestOffset) && - (fExtentMap[i].segmentNum >= highestSegNum))) - { - lastExtentIndex = i; - highestPartNum = fExtentMap[i].partitionNum; - highestSegNum = fExtentMap[i].segmentNum; - highestOffset = fExtentMap[i].blockOffset; - } - } // found valid extentmap entry - - // 3. Find first available extent map entry that can be reused - else if (emptyEMEntry < 0) - emptyEMEntry = i; - } // Loop through extent map entries + lastExtent = &emEntry; + highestPartNum = emEntry.partitionNum; + highestSegNum = emEntry.segmentNum; + highestOffset = emEntry.blockOffset; + } // found extentmap entry for specified OID + } // Loop through extent map entries DBRootVec dbRootVec(getAllDbRoots()); // 2. for empty DBRoot track hi seg# in user specified part# - if (lastExtentIndex == -1) + if (lastExtent == nullptr) { // loop over all extents that doesn't belong to the target dbroot for (auto dbRootFromList : dbRootVec) @@ -2942,47 +2780,19 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co if (dbRootFromList == dbRoot) continue; - auto emIdentsLocal = fPExtMapIndexImpl_->find(dbRootFromList, OID, partitionNum); - for (auto i : emIdentsLocal) + const auto lbidsLocal = fPExtMapIndexImpl_->find(dbRootFromList, OID, partitionNum); + auto emIdentsLocal = getEmIdentsByLbids(lbidsLocal); + for (auto& emEntry : emIdentsLocal) { - if ((fExtentMap[i].range.size != 0) && - ((fExtentMap[i].segmentNum > highEmptySegNum) || (!bHighEmptySegNumSet))) + if ((emEntry.segmentNum > highEmptySegNum) || (!bHighEmptySegNumSet)) { - highEmptySegNum = fExtentMap[i].segmentNum; + highEmptySegNum = emEntry.segmentNum; bHighEmptySegNumSet = true; } - - // Search for the first empty Entry - if (fExtentMap[i].range.size == 0) - { - emptyEMEntry = i; - break; - } } } } - size_t emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - // Search for the first empty Entry - for (size_t i = 0; emptyEMEntry < 0 && i < emEntries; ++i) - { - if (fExtentMap[i].range.size == 0) - { - emptyEMEntry = i; - break; - } - } - - if (emptyEMEntry == -1) - { - ostringstream oss; - oss << "ExtentMap::_createColumnExtent_DBroot(): " - "could not find an empty EMEntry for OID " - << OID << "; Extent Map is full", - log(oss.str(), logging::LOG_TYPE_CRITICAL); - throw logic_error(oss.str()); - } - //-------------------------------------------------------------------------- // If DBRoot is not empty, then... // Second Step: Scan ExtentMap again after I know the last partition @@ -3005,68 +2815,65 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co int partHighSegNext = -1; // hi seg num for next partition // Target dbroot has extents for the OID - if (lastExtentIndex >= 0) + if (lastExtent) { - PartitionNumberT targetDbRootPart = fExtentMap[lastExtentIndex].partitionNum; - PartitionNumberT targetDbRootPartNext = targetDbRootPart + 1; - partHighSeg = fExtentMap[lastExtentIndex].segmentNum; - targetDbRootSegs.insert(TargetDbRootSegsMap::value_type(fExtentMap[lastExtentIndex].segmentNum, - fExtentMap[lastExtentIndex].blockOffset)); - + uint32_t targetDbRootPart = lastExtent->partitionNum; + uint32_t targetDbRootPartNext = targetDbRootPart + 1; + partHighSeg = lastExtent->segmentNum; + targetDbRootSegs.insert(TargetDbRootSegsMap::value_type(lastExtent->segmentNum, lastExtent->blockOffset)); + for (auto dbRootFromList : dbRootVec) { if (dbRootFromList == dbRoot) { - auto emIdents = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPart); - for (auto i : emIdents) + const auto lbids = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPart); + const auto emIdents = getEmIdentsByLbids(lbids); + for (const auto& emEntry : emIdents) { // 5. Track hi seg for hwm partition - if (fExtentMap[i].segmentNum > partHighSeg) + if (emEntry.segmentNum > partHighSeg) { - partHighSeg = fExtentMap[i].segmentNum; + partHighSeg = emEntry.segmentNum; } // 6. Save list of seg files in target DBRoot/Partition, // along with the highest fbo for each seg file - if (fExtentMap[i].status == EXTENTOUTOFSERVICE) + if (emEntry.status == EXTENTOUTOFSERVICE) bSegsOutOfService = true; - TargetDbRootSegsMapIter iter = targetDbRootSegs.find(fExtentMap[i].segmentNum); + TargetDbRootSegsMapIter iter = targetDbRootSegs.find(emEntry.segmentNum); if (iter == targetDbRootSegs.end()) { - targetDbRootSegs.insert( - TargetDbRootSegsMap::value_type(fExtentMap[i].segmentNum, fExtentMap[i].blockOffset)); + targetDbRootSegs.insert(TargetDbRootSegsMap::value_type(emEntry.segmentNum, emEntry.blockOffset)); } else { - if (fExtentMap[i].blockOffset > iter->second) + if (emEntry.blockOffset > iter->second) { - iter->second = fExtentMap[i].blockOffset; + iter->second = emEntry.blockOffset; } } - } // loop over em idents - } // current dbroot == target dbroot + } // loop over em idents + } // current dbroot == target dbroot else { // 4. Track hi seg for hwm+1 partition - auto emIdentsNext = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPartNext); - for (auto i : emIdentsNext) + const auto lbidsNext = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPartNext); + const auto emIdentsNext = getEmIdentsByLbids(lbidsNext); + for (const auto& emEntry : emIdentsNext) { - if (fExtentMap[i].segmentNum > partHighSegNext) - { - partHighSegNext = fExtentMap[i].segmentNum; - } + if (emEntry.segmentNum > partHighSegNext) + partHighSegNext = emEntry.segmentNum; } // 5. Track hi seg for hwm partition - auto emIdents = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPart); - for (auto i : emIdents) + const auto lbids = fPExtMapIndexImpl_->find(dbRootFromList, OID, targetDbRootPart); + const auto emIdents = getEmIdentsByLbids(lbids); + for (const auto& emEntry : emIdents) { - if (fExtentMap[i].segmentNum > partHighSeg) - { - partHighSeg = fExtentMap[i].segmentNum; - } + if (emEntry.segmentNum > partHighSeg) + partHighSeg = emEntry.segmentNum; } } // current dbroot != target dbroot } // loop over dbroots @@ -3097,18 +2904,16 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co // partition number as needed. // else // use part# that the user specifies - if (lastExtentIndex >= 0) + if (lastExtent) { bool startNewPartition = false; bool startNewStripeInSegFile = false; const unsigned int filesPerDBRootPerPartition = FILES_PER_COL_PART / DBROOT_COUNT; - int& lastExtIdx = lastExtentIndex; - // Find first, last, next seg files in target partition and DBRoot - uint16_t firstTargetSeg = fExtentMap[lastExtIdx].segmentNum; - uint16_t lastTargetSeg = fExtentMap[lastExtIdx].segmentNum; - uint16_t nextTargetSeg = fExtentMap[lastExtIdx].segmentNum; + uint16_t firstTargetSeg = lastExtent->segmentNum; + uint16_t lastTargetSeg = lastExtent->segmentNum; + uint16_t nextTargetSeg = lastExtent->segmentNum; // 1. Loop thru targetDbRootSegs[] to find next segment after // lastExtIdx in target list. @@ -3127,7 +2932,7 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co else if (targetSeg > lastTargetSeg) lastTargetSeg = targetSeg; - if (targetSeg > fExtentMap[lastExtIdx].segmentNum) + if (targetSeg > lastExtent->segmentNum) { if ((targetSeg < nextTargetSeg) || (!bNextSegSet)) { @@ -3138,7 +2943,7 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co } } - newPartitionNum = fExtentMap[lastExtIdx].partitionNum; + newPartitionNum = lastExtent->partitionNum; // 2a. Skip to next physical partition if any extents in HWM partition/ // DBRoot are marked as outOfService @@ -3188,12 +2993,11 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co // find out if we need to add a new partition for next extent. else if (targetDbRootSegs.size() >= filesPerDBRootPerPartition) { - if (fExtentMap[lastExtIdx].segmentNum == lastTargetSeg) + if (lastExtent->segmentNum == lastTargetSeg) { // Use blockOffset of lastExtIdx to see if we need to add // the next extent to a new partition. - if (fExtentMap[lastExtIdx].blockOffset == - ((EXTENTS_PER_SEGFILE - 1) * (EXTENT_ROWS * colWidth / BLOCK_SIZE))) + if (lastExtent->blockOffset == ((EXTENTS_PER_SEGFILE - 1) * (EXTENT_ROWS * colWidth / BLOCK_SIZE))) { startNewPartition = true; } @@ -3238,7 +3042,7 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co } // case2: leave newBlockOffset set to 0 - else if ((fExtentMap[lastExtIdx].blockOffset == 0) && (newSegmentNum > firstTargetSeg)) + else if ((lastExtent->blockOffset == 0) && (newSegmentNum > firstTargetSeg)) { //...no action necessary } @@ -3250,12 +3054,11 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co { if (startNewStripeInSegFile) // start next stripe { - newBlockOffset = static_cast(fExtentMap[lastExtIdx].range.size) * 1024 + - fExtentMap[lastExtIdx].blockOffset; + newBlockOffset = static_cast(lastExtent->range.size) * 1024 + lastExtent->blockOffset; } else // next extent, same stripe { - newBlockOffset = fExtentMap[lastExtIdx].blockOffset; + newBlockOffset = lastExtent->blockOffset; } } } // lastExtentIndex >= 0 @@ -3270,53 +3073,46 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co //-------------------------------------------------------------------------- // Fourth Step: Construct the new extentmap entry //-------------------------------------------------------------------------- - - makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry)); - EMEntry* e = &fExtentMap[emptyEMEntry]; - - e->range.start = startLBID; - e->range.size = size; - e->fileID = OID; + EMEntry e; + e.range.start = startLBID; + e.range.size = size; + e.fileID = OID; + e.partition.cprange.sequenceNum = 0; + e.colWid = colWidth; + e.dbRoot = newDbRoot; + e.partitionNum = newPartitionNum; + e.segmentNum = newSegmentNum; + e.blockOffset = newBlockOffset; + e.HWM = 0; + e.status = EXTENTUNAVAILABLE; // mark extent as in process if (isUnsigned(colDataType)) { if (colWidth != datatypes::MAXDECIMALWIDTH) { - e->partition.cprange.loVal = numeric_limits::max(); - e->partition.cprange.hiVal = 0; + e.partition.cprange.loVal = numeric_limits::max(); + e.partition.cprange.hiVal = 0; } else { - e->partition.cprange.bigLoVal = -1; - e->partition.cprange.bigHiVal = 0; + e.partition.cprange.bigLoVal = -1; + e.partition.cprange.bigHiVal = 0; } } else { if (colWidth != datatypes::MAXDECIMALWIDTH) { - e->partition.cprange.loVal = numeric_limits::max(); - e->partition.cprange.hiVal = numeric_limits::min(); + e.partition.cprange.loVal = numeric_limits::max(); + e.partition.cprange.hiVal = numeric_limits::min(); } else { - utils::int128Max(e->partition.cprange.bigLoVal); - utils::int128Min(e->partition.cprange.bigHiVal); + utils::int128Max(e.partition.cprange.bigLoVal); + utils::int128Min(e.partition.cprange.bigHiVal); } } - e->partition.cprange.sequenceNum = 0; - - e->colWid = colWidth; - - e->dbRoot = newDbRoot; - e->partitionNum = newPartitionNum; - e->segmentNum = newSegmentNum; - - e->blockOffset = newBlockOffset; - e->HWM = 0; - e->status = EXTENTUNAVAILABLE; // mark extent as in process - #if 0 // XXX: sergueyz: I'll leave these under conditional flag for a while because it appears a huge change. // Partition, segment, and blockOffset 0 represents new table or column. // When DDL creates a table, we can mark the first extent as VALID, since @@ -3333,24 +3129,29 @@ LBID_t ExtentMap::_createColumnExtent_DBroot(uint32_t size, int OID, uint32_t co else e->partition.cprange.isValid = CP_INVALID; #else - e->partition.cprange.isValid = CP_INVALID; + e.partition.cprange.isValid = CP_INVALID; #endif - partitionNum = e->partitionNum; - segmentNum = e->segmentNum; - startBlockOffset = e->blockOffset; + partitionNum = e.partitionNum; + segmentNum = e.segmentNum; + startBlockOffset = e.blockOffset; - makeUndoRecord(fEMShminfo, sizeof(MSTEntry)); - fEMShminfo->currentSize += sizeof(struct EMEntry); - auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[emptyEMEntry], emptyEMEntry); + makeUndoRecordRBTree(UndoRecordType::INSERT, e); + makeUndoRecord(fEMRBTreeShminfo, sizeof(MSTEntry)); - if (resShmemHasGrownPair.second) - fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); + // Insert into RBTree. + std::pair lbidEmEntryPair = make_pair(startLBID, e); + fExtentMapRBTree->insert(lbidEmEntryPair); + fEMRBTreeShminfo->currentSize += EM_RB_TREE_NODE_SIZE; - if (!resShmemHasGrownPair.first) - logAndSetEMIndexReadOnly("_createColumnExtent_DBroot"); + // Insert into Index. + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(e, startLBID); + if (resShmemHasGrownPair.second) + fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); + if (!resShmemHasGrownPair.first) + logAndSetEMIndexReadOnly("_createColumnExtent_DBroot"); - return startLBID; + return startLBID; } //------------------------------------------------------------------------------ @@ -3408,11 +3209,7 @@ void ExtentMap::createColumnExtentExactFile(int OID, uint32_t colWidth, uint16_t grabEMIndex(WRITE); grabFreeList(WRITE); - if (fEMShminfo->currentSize == fEMShminfo->allocdSize) - { - growEMShmseg(); - } - + growIfNeededOnExtentCreate(); // size is the number of multiples of 1024 blocks. // ex: size=1 --> 1024 blocks // size=2 --> 2048 blocks @@ -3425,12 +3222,12 @@ void ExtentMap::createColumnExtentExactFile(int OID, uint32_t colWidth, uint16_t allocdsize = EXTENT_SIZE; } +/* LastIndEmptyIndEmptyInd ExtentMap::_createExtentCommonSearch(const OID_t OID, const DBRootT dbRoot, const PartitionNumberT partitionNum, const SegmentT segmentNum) { EmptyEMEntry emptyEMEntry = -1; - LastExtentIndexT lastExtentIndex = -1; HighestOffset highestOffset = 0; size_t emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); @@ -3465,6 +3262,7 @@ LastIndEmptyIndEmptyInd ExtentMap::_createExtentCommonSearch(const OID_t OID, co } return {lastExtentIndex, emptyEMEntry}; } +*/ void ExtentMap::logAndSetEMIndexReadOnly(const std::string& funcName) { @@ -3501,78 +3299,72 @@ LBID_t ExtentMap::_createColumnExtentExactFile(uint32_t size, int OID, uint32_t execplan::CalpontSystemCatalog::ColDataType colDataType, uint32_t& startBlockOffset) { - auto lastIndEmptyIndEmptyInd = _createExtentCommonSearch(OID, dbRoot, partitionNum, segmentNum); - LastExtentIndexT lastExtentIndex = lastIndEmptyIndEmptyInd.first; - EmptyEMEntry emptyEMEntry = lastIndEmptyIndEmptyInd.second; + uint32_t highestOffset = 0; + LBID_t startLBID = getLBIDsFromFreeList(size); + EMEntry* lastEmEntry = nullptr; - if (emptyEMEntry == -1) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { - ostringstream oss; - oss << "ExtentMap::_createColumnExtentExactFile(): " - "could not find an empty EMEntry for OID " - << OID << "; Extent Map is full", - log(oss.str(), logging::LOG_TYPE_CRITICAL); - throw logic_error(oss.str()); + if ((emEntry.segmentNum) == segmentNum && (emEntry.blockOffset >= highestOffset)) + { + lastEmEntry = &emEntry; + highestOffset = emEntry.blockOffset; + } } - makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry)); - EMEntry* e = &fExtentMap[emptyEMEntry]; + EMEntry newEmEntry; + newEmEntry.range.start = startLBID; + newEmEntry.range.size = size; + newEmEntry.fileID = OID; + newEmEntry.partition.cprange.sequenceNum = 0; + newEmEntry.colWid = colWidth; + newEmEntry.dbRoot = dbRoot; + newEmEntry.partitionNum = partitionNum; + newEmEntry.segmentNum = segmentNum; + newEmEntry.status = EXTENTUNAVAILABLE; // mark extent as in process - LBID_t startLBID = getLBIDsFromFreeList(size); - e->range.start = startLBID; - e->range.size = size; - e->fileID = OID; if (isUnsigned(colDataType)) { if (colWidth != datatypes::MAXDECIMALWIDTH) { - e->partition.cprange.loVal = numeric_limits::max(); - e->partition.cprange.hiVal = 0; + newEmEntry.partition.cprange.loVal = numeric_limits::max(); + newEmEntry.partition.cprange.hiVal = 0; } else { - e->partition.cprange.bigLoVal = -1; - e->partition.cprange.bigHiVal = 0; + newEmEntry.partition.cprange.bigLoVal = -1; + newEmEntry.partition.cprange.bigHiVal = 0; } } else { if (colWidth != datatypes::MAXDECIMALWIDTH) { - e->partition.cprange.loVal = numeric_limits::max(); - e->partition.cprange.hiVal = numeric_limits::min(); + newEmEntry.partition.cprange.loVal = numeric_limits::max(); + newEmEntry.partition.cprange.hiVal = numeric_limits::min(); } else { - utils::int128Max(e->partition.cprange.bigLoVal); - utils::int128Min(e->partition.cprange.bigHiVal); + utils::int128Max(newEmEntry.partition.cprange.bigLoVal); + utils::int128Min(newEmEntry.partition.cprange.bigHiVal); } } - e->partition.cprange.sequenceNum = 0; - - e->colWid = colWidth; - - e->dbRoot = dbRoot; - e->partitionNum = partitionNum; - e->segmentNum = segmentNum; - e->status = EXTENTUNAVAILABLE; // mark extent as in process - // If first extent for this OID, partition, dbroot, and segment then // blockOffset is set to 0 // else // blockOffset is extrapolated from the last extent - if (lastExtentIndex == -1) + newEmEntry.HWM = 0; + if (!lastEmEntry) { - e->blockOffset = 0; - e->HWM = 0; + newEmEntry.blockOffset = 0; } else { - e->blockOffset = static_cast(fExtentMap[lastExtentIndex].range.size) * 1024 + - fExtentMap[lastExtentIndex].blockOffset; - e->HWM = 0; + newEmEntry.blockOffset = static_cast(lastEmEntry->range.size) * 1024 + lastEmEntry->blockOffset; } #if 0 // XXX: sergueyz: I'll leave these under conditional flag for a while because it appears a huge change. @@ -3589,18 +3381,21 @@ LBID_t ExtentMap::_createColumnExtentExactFile(uint32_t size, int OID, uint32_t else e->partition.cprange.isValid = CP_INVALID; #else - e->partition.cprange.isValid = CP_INVALID; + newEmEntry.partition.cprange.isValid = CP_INVALID; #endif - startBlockOffset = e->blockOffset; - - makeUndoRecord(fEMShminfo, sizeof(MSTEntry)); - fEMShminfo->currentSize += sizeof(struct EMEntry); - auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[emptyEMEntry], emptyEMEntry); + // Insert into RBTree. + makeUndoRecordRBTree(UndoRecordType::INSERT, newEmEntry); + std::pair lbidEmEntryPair = make_pair(startLBID, newEmEntry); + fExtentMapRBTree->insert(lbidEmEntryPair); + startBlockOffset = newEmEntry.blockOffset; + makeUndoRecord(fEMRBTreeShminfo, sizeof(MSTEntry)); + fEMRBTreeShminfo->currentSize += EM_RB_TREE_NODE_SIZE; + // Insert into Index. + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(newEmEntry, startLBID); if (resShmemHasGrownPair.second) fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); - if (!resShmemHasGrownPair.first) logAndSetEMIndexReadOnly("_createColumnExtentExactFile"); @@ -3656,10 +3451,7 @@ void ExtentMap::createDictStoreExtent(int OID, uint16_t dbRoot, uint32_t partiti grabEMIndex(WRITE); grabFreeList(WRITE); - if (fEMShminfo->currentSize == fEMShminfo->allocdSize) - { - growEMShmseg(); - } + growIfNeededOnExtentCreate(); // size is the number of multiples of 1024 blocks. // ex: size=1 --> 1024 blocks @@ -3689,65 +3481,61 @@ void ExtentMap::createDictStoreExtent(int OID, uint16_t dbRoot, uint32_t partiti LBID_t ExtentMap::_createDictStoreExtent(uint32_t size, int OID, uint16_t dbRoot, uint32_t partitionNum, uint16_t segmentNum) { - auto lastIndEmptyIndEmptyInd = _createExtentCommonSearch(OID, dbRoot, partitionNum, segmentNum); - LastExtentIndexT lastExtentIndex = lastIndEmptyIndEmptyInd.first; - EmptyEMEntry emptyEMEntry = lastIndEmptyIndEmptyInd.second; + uint32_t highestOffset = 0; + LBID_t startLBID = getLBIDsFromFreeList(size); + EMEntry* lastEmEntry = nullptr; - if (emptyEMEntry == -1) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { - ostringstream oss; - oss << "ExtentMap::_createDictStoreExtent(): " - "could not find an empty EMEntry for OID " - << OID << "; Extent Map is full", - log(oss.str(), logging::LOG_TYPE_CRITICAL); - throw logic_error(oss.str()); + if ((emEntry.segmentNum == segmentNum) && (emEntry.blockOffset >= highestOffset)) + { + lastEmEntry = &emEntry; + highestOffset = emEntry.blockOffset; + } } - makeUndoRecord(&fExtentMap[emptyEMEntry], sizeof(EMEntry)); - EMEntry* e = &fExtentMap[emptyEMEntry]; + EMEntry newEmEntry; + newEmEntry.range.start = startLBID; + newEmEntry.range.size = size; + newEmEntry.fileID = OID; + newEmEntry.status = EXTENTUNAVAILABLE; // @bug 1911 mark extent as in process + utils::int128Max(newEmEntry.partition.cprange.bigLoVal); + utils::int128Min(newEmEntry.partition.cprange.bigHiVal); + newEmEntry.partition.cprange.sequenceNum = 0; + newEmEntry.partition.cprange.isValid = CP_INVALID; + newEmEntry.colWid = 0; // we don't store col width for dictionaries; + newEmEntry.HWM = 0; - LBID_t startLBID = getLBIDsFromFreeList(size); - e->range.start = startLBID; - e->range.size = size; - e->fileID = OID; - e->status = EXTENTUNAVAILABLE; // @bug 1911 mark extent as in process - utils::int128Max(e->partition.cprange.bigLoVal); - utils::int128Min(e->partition.cprange.bigHiVal); - e->partition.cprange.sequenceNum = 0; - e->partition.cprange.isValid = CP_INVALID; - - // If this is first extent for this OID, partition, segment then - // everything is set to 0 or taken from user input - // else - // everything is extrapolated from the last extent - if (lastExtentIndex == -1) + if (!lastEmEntry) { - e->blockOffset = 0; - e->HWM = 0; - e->segmentNum = segmentNum; - e->partitionNum = partitionNum; - e->dbRoot = dbRoot; - e->colWid = 0; // we don't store col width for dictionaries; - // this helps to flag this as a dictionary extent + newEmEntry.blockOffset = 0; + newEmEntry.segmentNum = segmentNum; + newEmEntry.partitionNum = partitionNum; + newEmEntry.dbRoot = dbRoot; } else + // TODO: Why is this different comparing to `column extent creation`. { - e->blockOffset = static_cast(fExtentMap[lastExtentIndex].range.size) * 1024 + - fExtentMap[lastExtentIndex].blockOffset; - e->HWM = 0; - e->segmentNum = fExtentMap[lastExtentIndex].segmentNum; - e->partitionNum = fExtentMap[lastExtentIndex].partitionNum; - e->dbRoot = fExtentMap[lastExtentIndex].dbRoot; - e->colWid = fExtentMap[lastExtentIndex].colWid; + newEmEntry.blockOffset = static_cast(lastEmEntry->range.size) * 1024 + lastEmEntry->blockOffset; + newEmEntry.segmentNum = lastEmEntry->segmentNum; + newEmEntry.partitionNum = lastEmEntry->partitionNum; + newEmEntry.dbRoot = lastEmEntry->dbRoot; + newEmEntry.colWid = lastEmEntry->colWid; } - makeUndoRecord(fEMShminfo, sizeof(MSTEntry)); - fEMShminfo->currentSize += sizeof(struct EMEntry); - auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(fExtentMap[emptyEMEntry], emptyEMEntry); + // Insert into RBTree. + makeUndoRecordRBTree(UndoRecordType::INSERT, newEmEntry); + std::pair lbidEmEntryPair = make_pair(startLBID, newEmEntry); + fExtentMapRBTree->insert(lbidEmEntryPair); + makeUndoRecord(fEMRBTreeShminfo, sizeof(MSTEntry)); + fEMRBTreeShminfo->currentSize += EM_RB_TREE_NODE_SIZE; + // Insert into Index. + auto resShmemHasGrownPair = fPExtMapIndexImpl_->insert(newEmEntry, startLBID); if (resShmemHasGrownPair.second) fEMIndexShminfo->allocdSize = fPExtMapIndexImpl_->getShmemSize(); - if (!resShmemHasGrownPair.first) logAndSetEMIndexReadOnly("_createDictStoreExtent"); @@ -3807,6 +3595,7 @@ void ExtentMap::printEM(const EMEntry& em) const cout << endl; } +// TODO: Add support for this RBTREE. void ExtentMap::printEM(const OID_t& oid) const { int emEntries = 0; @@ -3916,123 +3705,111 @@ void ExtentMap::rollbackColumnExtents_DBroot(int oid, bool bDeleteAll, uint16_t grabEMIndex(WRITE); grabFreeList(WRITE); - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (int i = 0; i < emEntries; i++) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, oid); + auto emIdents = getEmIteratorsByLbids(lbids); + for (auto& emIt : emIdents) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == oid) && (fExtentMap[i].dbRoot == dbRoot)) + auto& emEntry = emIt->second; + // Don't rollback extents that are out of service + if (emEntry.status == EXTENTOUTOFSERVICE) { - // oidExists = true; + continue; + } - // Don't rollback extents that are out of service - if (fExtentMap[i].status == EXTENTOUTOFSERVICE) - continue; + // If bDeleteAll is true, then we delete extent w/o regards to + // partition number, segment number, or HWM + if (bDeleteAll) + { + emIt = deleteExtent(emIt); // case 0 + continue; + } - // If bDeleteAll is true, then we delete extent w/o regards to - // partition number, segment number, or HWM - if (bDeleteAll) + // Calculate fbo range for the stripe containing the given hwm + if (fboHi == 0) + { + uint32_t range = emEntry.range.size * 1024; + fboLo = hwm - (hwm % range); + fboHi = fboLo + range - 1; + + if (fboLo > 0) + fboLoPreviousStripe = fboLo - range; + } + + // Delete, update, or ignore this extent: + // Later partition: + // case 1: extent in later partition than last extent, so delete + // Same partition: + // case 2: extent is in later stripe than last extent, so delete + // case 3: extent is in earlier stripe in the same partition. + // No action necessary for case3B and case3C. + // case 3A: extent is in trailing segment in previous stripe. + // This extent is now the last extent in that segment + // file, so reset the local HWM if it was altered. + // case 3B: extent in previous stripe but not a trailing segment + // case 3C: extent is in stripe that precedes previous stripe + // case 4: extent is in the same partition and stripe as the + // last logical extent we are to keep. + // case 4A: extent is in later segment so can be deleted + // case 4B: extent is in earlier segment, reset HWM if changed + // case 4C: this is last logical extent, reset HWM if changed + // Earlier partition: + // case 5: extent is in earlier parition, no action necessary + + if (emEntry.partitionNum > partitionNum) + { + emIt = deleteExtent(emIt); // case 1 + continue; + } + else if (emEntry.partitionNum == partitionNum) + { + if (emEntry.blockOffset > fboHi) { - deleteExtent(i); // case 0 + emIt = deleteExtent(emIt); // case 2 continue; } - - // Calculate fbo range for the stripe containing the given hwm - if (fboHi == 0) + else if (emEntry.blockOffset < fboLo) { - uint32_t range = fExtentMap[i].range.size * 1024; - fboLo = hwm - (hwm % range); - fboHi = fboLo + range - 1; - - if (fboLo > 0) - fboLoPreviousStripe = fboLo - range; - } - - // Delete, update, or ignore this extent: - // Later partition: - // case 1: extent in later partition than last extent, so delete - // Same partition: - // case 2: extent is in later stripe than last extent, so delete - // case 3: extent is in earlier stripe in the same partition. - // No action necessary for case3B and case3C. - // case 3A: extent is in trailing segment in previous stripe. - // This extent is now the last extent in that segment - // file, so reset the local HWM if it was altered. - // case 3B: extent in previous stripe but not a trailing segment - // case 3C: extent is in stripe that precedes previous stripe - // case 4: extent is in the same partition and stripe as the - // last logical extent we are to keep. - // case 4A: extent is in later segment so can be deleted - // case 4B: extent is in earlier segment, reset HWM if changed - // case 4C: this is last logical extent, reset HWM if changed - // Earlier partition: - // case 5: extent is in earlier parition, no action necessary - - if (fExtentMap[i].partitionNum > partitionNum) - { - deleteExtent(i); // case 1 - } - else if (fExtentMap[i].partitionNum == partitionNum) - { - if (fExtentMap[i].blockOffset > fboHi) + if (emEntry.blockOffset >= fboLoPreviousStripe) { - deleteExtent(i); // case 2 - } - else if (fExtentMap[i].blockOffset < fboLo) - { - if (fExtentMap[i].blockOffset >= fboLoPreviousStripe) + if (emEntry.segmentNum > segmentNum) { - if (fExtentMap[i].segmentNum > segmentNum) + if (emEntry.HWM != (fboLo - 1)) { - if (fExtentMap[i].HWM != (fboLo - 1)) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = fboLo - 1; // case 3A - fExtentMap[i].status = EXTENTAVAILABLE; - } - } - else - { - // not a trailing segment in prev stripe case 3B - } - } - else - { - // extent precedes previous stripe case 3C - } - } - else // extent is in same stripe - { - if (fExtentMap[i].segmentNum > segmentNum) - { - deleteExtent(i); // case 4A - } - else if (fExtentMap[i].segmentNum < segmentNum) - { - if (fExtentMap[i].HWM != fboHi) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = fboHi; // case 4B - fExtentMap[i].status = EXTENTAVAILABLE; - } - } - else // fExtentMap[i].segmentNum == segmentNum - { - if (fExtentMap[i].HWM != hwm) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = hwm; // case 4C - fExtentMap[i].status = EXTENTAVAILABLE; + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = fboLo - 1; // case 3A + emEntry.status = EXTENTAVAILABLE; } } } } - else + else // extent is in same stripe { - // extent in earlier partition; no action necessary case 5 + if (emEntry.segmentNum > segmentNum) + { + emIt = deleteExtent(emIt); // case 4A + continue; + } + else if (emEntry.segmentNum < segmentNum) + { + if (emEntry.HWM != fboHi) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = fboHi; // case 4B + emEntry.status = EXTENTAVAILABLE; + } + } + else // fExtentMap[i].segmentNum == segmentNum + { + if (emEntry.HWM != hwm) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = hwm; // case 4C + emEntry.status = EXTENTAVAILABLE; + } + } } - } // extent map entry with matching oid - } // loop through the extent map - + } + } // If this function is called, we are already in error recovery mode; so // don't worry about reporting an error if the OID is not found, because // we don't want/need the extents for that OID anyway. @@ -4108,119 +3885,116 @@ void ExtentMap::rollbackDictStoreExtents_DBroot(int oid, uint16_t dbRoot, uint32 bDeleteAll = true; // segToHwmMap maps segment file number to corresponding pair - tr1::unordered_map > segToHwmMap; - tr1::unordered_map >::const_iterator segToHwmMapIter; + tr1::unordered_map> segToHwmMap; + tr1::unordered_map>::const_iterator segToHwmMapIter; grabEMEntryTable(WRITE); grabEMIndex(WRITE); grabFreeList(WRITE); - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (int i = 0; i < emEntries; i++) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, oid); + auto emIdents = getEmIteratorsByLbids(lbids); + for (auto& emIt : emIdents) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == oid) && (fExtentMap[i].dbRoot == dbRoot)) + auto& emEntry = emIt->second; + // Don't rollback extents that are out of service + if (emEntry.status == EXTENTOUTOFSERVICE) { - // oidExists = true; + continue; + } - // Don't rollback extents that are out of service - if (fExtentMap[i].status == EXTENTOUTOFSERVICE) - continue; + // If bDeleteAll is true, then we delete extent w/o regards to + // partition number, segment number, or HWM + if (bDeleteAll) + { + emIt = deleteExtent(emIt); // case 0 + continue; + } - // If bDeleteAll is true, then we delete extent w/o regards to - // partition number, segment number, or HWM - if (bDeleteAll) + // Calculate fbo's for the list of hwms we are given; and store + // the fbo and hwm in a map, using the segment file number as a key. + if (segToHwmMap.size() == 0) + { + uint32_t range = emEntry.range.size * 1024; + pair segToHwmMapEntry; + + for (unsigned int k = 0; k < hwms.size(); k++) { - deleteExtent(i); // case 0 + uint32_t fboLo = hwms[k] - (hwms[k] % range); + segToHwmMapEntry.first = hwms[k]; + segToHwmMapEntry.second = fboLo; + segToHwmMap[segNums[k]] = segToHwmMapEntry; + } + } + + // Delete, update, or ignore this extent: + // Later partition: + // case 1: extent is in later partition, so delete the extent + // Same partition: + // case 2: extent is in trailing seg file we don't need; so delete + // case 3: extent is in partition and segment file of interest + // case 3A: earlier extent in segment file; no action necessary + // case 3B: specified HWM falls in this extent, so reset HWM + // case 3C: later extent in segment file; so delete the extent + // Earlier partition: + // case 4: extent is in earlier parition, no action necessary + + if (emEntry.partitionNum > partitionNum) + { + emIt = deleteExtent(emIt); // case 1 + continue; + } + else if (emEntry.partitionNum == partitionNum) + { + unsigned segNum = emEntry.segmentNum; + segToHwmMapIter = segToHwmMap.find(segNum); + + if (segToHwmMapIter == segToHwmMap.end()) + { + emIt = deleteExtent(emIt); // case 2 continue; } - - // Calculate fbo's for the list of hwms we are given; and store - // the fbo and hwm in a map, using the segment file number as a key. - if (segToHwmMap.size() == 0) + else // segment number in the map of files to keep { - uint32_t range = fExtentMap[i].range.size * 1024; - pair segToHwmMapEntry; + uint32_t fboLo = segToHwmMapIter->second.second; - for (unsigned int k = 0; k < hwms.size(); k++) + if (emEntry.blockOffset < fboLo) { - uint32_t fboLo = hwms[k] - (hwms[k] % range); - segToHwmMapEntry.first = hwms[k]; - segToHwmMapEntry.second = fboLo; - segToHwmMap[segNums[k]] = segToHwmMapEntry; + // no action necessary case 3A } - } - - // Delete, update, or ignore this extent: - // Later partition: - // case 1: extent is in later partition, so delete the extent - // Same partition: - // case 2: extent is in trailing seg file we don't need; so delete - // case 3: extent is in partition and segment file of interest - // case 3A: earlier extent in segment file; no action necessary - // case 3B: specified HWM falls in this extent, so reset HWM - // case 3C: later extent in segment file; so delete the extent - // Earlier partition: - // case 4: extent is in earlier parition, no action necessary - - if (fExtentMap[i].partitionNum > partitionNum) - { - deleteExtent(i); // case 1 - } - else if (fExtentMap[i].partitionNum == partitionNum) - { - unsigned segNum = fExtentMap[i].segmentNum; - segToHwmMapIter = segToHwmMap.find(segNum); - - if (segToHwmMapIter == segToHwmMap.end()) + else if (emEntry.blockOffset == fboLo) { - deleteExtent(i); // case 2 - } - else // segment number in the map of files to keep - { - uint32_t fboLo = segToHwmMapIter->second.second; + uint32_t hwm = segToHwmMapIter->second.first; - if (fExtentMap[i].blockOffset < fboLo) + if (emEntry.HWM != hwm) { - // no action necessary case 3A - } - else if (fExtentMap[i].blockOffset == fboLo) - { - uint32_t hwm = segToHwmMapIter->second.first; - - if (fExtentMap[i].HWM != hwm) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = hwm; - fExtentMap[i].status = EXTENTAVAILABLE; // case 3B - } - } - else - { - deleteExtent(i); // case 3C + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = hwm; + emEntry.status = EXTENTAVAILABLE; // case 3B } } + else + { + emIt = deleteExtent(emIt); // case 3C + continue; + } } - else - { - // extent in earlier partition; no action necessary case 4 - } - } // extent map entry with matching oid - } // loop through the extent map + } - // If this function is called, we are already in error recovery mode; so - // don't worry about reporting an error if the OID is not found, because - // we don't want/need the extents for that OID anyway. - // if (!oidExists) - //{ - // ostringstream oss; - // oss << "ExtentMap::rollbackDictStoreExtents_DBroot(): " - // "Rollback failed: no extents exist for: OID-" << oid << - // "; dbRoot-" << dbRoot << - // "; partition-" << partitionNum; - // log(oss.str(), logging::LOG_TYPE_CRITICAL); - // throw invalid_argument(oss.str()); - //} + // If this function is called, we are already in error recovery mode; so + // don't worry about reporting an error if the OID is not found, because + // we don't want/need the extents for that OID anyway. + // if (!oidExists) + //{ + // ostringstream oss; + // oss << "ExtentMap::rollbackDictStoreExtents_DBroot(): " + // "Rollback failed: no extents exist for: OID-" << oid << + // "; dbRoot-" << dbRoot << + // "; partition-" << partitionNum; + // log(oss.str(), logging::LOG_TYPE_CRITICAL); + // throw invalid_argument(oss.str()); + //} + } } //------------------------------------------------------------------------------ @@ -4246,120 +4020,111 @@ void ExtentMap::deleteEmptyColExtents(const ExtentsInfoMap_t& extentsInfo) uint32_t fboHi = 0; uint32_t fboLoPreviousStripe = 0; - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - ExtentsInfoMap_t::const_iterator it; - - for (int i = 0; i < emEntries; i++) + ExtentsInfoMap_t::const_iterator emInfoIter = extentsInfo.begin(); + for (; emInfoIter != extentsInfo.end(); ++emInfoIter) { - if (fExtentMap[i].range.size != 0) + const OID_t oid = emInfoIter->first; + const DBRootT dbroot = emInfoIter->second.dbRoot; + const PartitionNumberT partNum = emInfoIter->second.partitionNum; + const auto lbids = fPExtMapIndexImpl_->find(dbroot, oid, partNum); + auto emIters = getEmIteratorsByLbids(lbids); + for (auto& emIt : emIters) { - it = extentsInfo.find(fExtentMap[i].fileID); - - if (it != extentsInfo.end()) + auto emEntry = emIt->second; + // Don't rollback extents that are out of service. + if (emEntry.status == EXTENTOUTOFSERVICE) { - // Don't rollback extents that are out of service - if (fExtentMap[i].status == EXTENTOUTOFSERVICE) + continue; + } + + // Calculate fbo range for the stripe containing the given hwm. + if (fboHi == 0) + { + uint32_t range = emEntry.range.size * 1024; + fboLo = emInfoIter->second.hwm - (emInfoIter->second.hwm % range); + fboHi = fboLo + range - 1; + + if (fboLo > 0) + fboLoPreviousStripe = fboLo - range; + } + + // Delete, update, or ignore this extent: + // Later partition: + // case 1: extent in later partition than last extent, so delete + // Same partition: + // case 2: extent is in later stripe than last extent, so delete + // case 3: extent is in earlier stripe in the same partition. + // No action necessary for case3B and case3C. + // case 3A: extent is in trailing segment in previous stripe. + // This extent is now the last extent in that segment + // file, so reset the local HWM if it was altered. + // case 3B: extent in previous stripe but not a trailing segment + // case 3C: extent is in stripe that precedes previous stripe + // case 4: extent is in the same partition and stripe as the + // last logical extent we are to keep. + // case 4A: extent is in later segment so can be deleted + // case 4B: extent is in earlier segment, reset HWM if changed + // case 4C: this is last logical extent, reset HWM if changed + // Earlier partition: + // case 5: extent is in earlier parition, no action necessary + + if (emEntry.partitionNum > emInfoIter->second.partitionNum) + { + emIt = deleteExtent(emIt); // case 1 + continue; + } + else if (emEntry.partitionNum == emInfoIter->second.partitionNum) + { + if (emEntry.blockOffset > fboHi) + { + emIt = deleteExtent(emIt); // case 2 continue; - - // Calculate fbo range for the stripe containing the given hwm - if (fboHi == 0) - { - uint32_t range = fExtentMap[i].range.size * 1024; - fboLo = it->second.hwm - (it->second.hwm % range); - fboHi = fboLo + range - 1; - - if (fboLo > 0) - fboLoPreviousStripe = fboLo - range; } - - // Delete, update, or ignore this extent: - // Later partition: - // case 1: extent in later partition than last extent, so delete - // Same partition: - // case 2: extent is in later stripe than last extent, so delete - // case 3: extent is in earlier stripe in the same partition. - // No action necessary for case3B and case3C. - // case 3A: extent is in trailing segment in previous stripe. - // This extent is now the last extent in that segment - // file, so reset the local HWM if it was altered. - // case 3B: extent in previous stripe but not a trailing segment - // case 3C: extent is in stripe that precedes previous stripe - // case 4: extent is in the same partition and stripe as the - // last logical extent we are to keep. - // case 4A: extent is in later segment so can be deleted - // case 4B: extent is in earlier segment, reset HWM if changed - // case 4C: this is last logical extent, reset HWM if changed - // Earlier partition: - // case 5: extent is in earlier parition, no action necessary - - if (fExtentMap[i].partitionNum > it->second.partitionNum) + else if (emEntry.blockOffset < fboLo) { - deleteExtent(i); // case 1 - } - else if (fExtentMap[i].partitionNum == it->second.partitionNum) - { - if (fExtentMap[i].blockOffset > fboHi) + if (emEntry.blockOffset >= fboLoPreviousStripe) { - deleteExtent(i); // case 2 - } - else if (fExtentMap[i].blockOffset < fboLo) - { - if (fExtentMap[i].blockOffset >= fboLoPreviousStripe) + if (emEntry.segmentNum > emInfoIter->second.segmentNum) { - if (fExtentMap[i].segmentNum > it->second.segmentNum) + if (emEntry.HWM != (fboLo - 1)) { - if (fExtentMap[i].HWM != (fboLo - 1)) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = fboLo - 1; // case 3A - fExtentMap[i].status = EXTENTAVAILABLE; - } - } - else - { - // not a trailing segment in prev stripe case 3B - } - } - else - { - // extent precedes previous stripe case 3C - } - } - else - { - // extent is in same stripe - if (fExtentMap[i].segmentNum > it->second.segmentNum) - { - deleteExtent(i); // case 4A - } - else if (fExtentMap[i].segmentNum < it->second.segmentNum) - { - if (fExtentMap[i].HWM != fboHi) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = fboHi; // case 4B - fExtentMap[i].status = EXTENTAVAILABLE; - } - } - else - { - // fExtentMap[i].segmentNum == segmentNum - if (fExtentMap[i].HWM != it->second.hwm) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = it->second.hwm; // case 4C - fExtentMap[i].status = EXTENTAVAILABLE; + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = fboLo - 1; // case 3A + emEntry.status = EXTENTAVAILABLE; } } } } else { - // extent in earlier partition; no action necessary case 5 + // extent is in same stripe + if (emEntry.segmentNum > emInfoIter->second.segmentNum) + { + emIt = deleteExtent(emIt); // case 4A + continue; + } + else if (emEntry.segmentNum < emInfoIter->second.segmentNum) + { + if (emEntry.HWM != fboHi) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = fboHi; // case 4B + emEntry.status = EXTENTAVAILABLE; + } + } + else + { + if (emEntry.HWM != emInfoIter->second.hwm) + { + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = emInfoIter->second.hwm; // case 4C + emEntry.status = EXTENTAVAILABLE; + } + } } - } // extent map entry with matching oid + } } - } // loop through the extent map + } } void ExtentMap::deleteEmptyDictStoreExtents(const ExtentsInfoMap_t& extentsInfo) @@ -4378,103 +4143,98 @@ void ExtentMap::deleteEmptyDictStoreExtents(const ExtentsInfoMap_t& extentsInfo) grabEMIndex(WRITE); grabFreeList(WRITE); - ExtentsInfoMap_t::const_iterator it; - uint32_t fboLo = 0; uint32_t fboHi = 0; - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - it = extentsInfo.begin(); - - if (it->second.newFile) // The extent is the new extent + ExtentsInfoMap_t::const_iterator emInfoIter = extentsInfo.begin(); + if (emInfoIter->second.newFile) // The extent is the new extent { - for (int i = 0; i < emEntries; i++) + for (; emInfoIter != extentsInfo.end(); ++emInfoIter) { - if (fExtentMap[i].range.size != 0) + const OID_t oid = emInfoIter->first; + const DBRootT dbroot = emInfoIter->second.dbRoot; + const PartitionNumberT partNum = emInfoIter->second.partitionNum; + const auto lbids = fPExtMapIndexImpl_->find(dbroot, oid, partNum); + auto emIters = getEmIteratorsByLbids(lbids); + for (auto& emIt : emIters) { - it = extentsInfo.find(fExtentMap[i].fileID); - - if (it != extentsInfo.end()) + auto emEntry = emIt->second; + if (emEntry.segmentNum == emInfoIter->second.segmentNum) { - if ((fExtentMap[i].partitionNum == it->second.partitionNum) && - (fExtentMap[i].segmentNum == it->second.segmentNum) && - (fExtentMap[i].dbRoot == it->second.dbRoot)) - deleteExtent(i); + emIt = deleteExtent(emIt); } - } - } + } // em iterarors loop + } // em info map loop } - else // The extent is the old one + else { - for (int i = 0; i < emEntries; i++) + for (; emInfoIter != extentsInfo.end(); ++emInfoIter) { - if (fExtentMap[i].range.size != 0) + const OID_t oid = emInfoIter->first; + const DBRootT dbroot = emInfoIter->second.dbRoot; + const PartitionNumberT partNum = emInfoIter->second.partitionNum; + const auto lbids = fPExtMapIndexImpl_->find(dbroot, oid, partNum); + auto emIters = getEmIteratorsByLbids(lbids); + for (auto& emIt : emIters) { - it = extentsInfo.find(fExtentMap[i].fileID); + auto emEntry = emIt->second; - if (it != extentsInfo.end()) + // Don't rollback extents that are out of service + if (emEntry.status == EXTENTOUTOFSERVICE) { - // Don't rollback extents that are out of service - if (fExtentMap[i].status == EXTENTOUTOFSERVICE) - continue; + continue; + } - // Calculate fbo - if (fboHi == 0) - { - uint32_t range = fExtentMap[i].range.size * 1024; - fboLo = it->second.hwm - (it->second.hwm % range); - fboHi = fboLo + range - 1; - } + // Calculate fbo + if (fboHi == 0) + { + uint32_t range = emEntry.range.size * 1024; + fboLo = emInfoIter->second.hwm - (emInfoIter->second.hwm % range); + fboHi = fboLo + range - 1; + } - // Delete, update, or ignore this extent: - // Later partition: - // case 1: extent is in later partition, so delete the extent - // Same partition: - // case 2: extent is in partition and segment file of interest - // case 2A: earlier extent in segment file; no action necessary - // case 2B: specified HWM falls in this extent, so reset HWM - // case 2C: later extent in segment file; so delete the extent - // Earlier partition: - // case 3: extent is in earlier parition, no action necessary + // Delete, update, or ignore this extent: + // Later partition: + // case 1: extent is in later partition, so delete the extent + // Same partition: + // case 2: extent is in partition and segment file of interest + // case 2A: earlier extent in segment file; no action necessary + // case 2B: specified HWM falls in this extent, so reset HWM + // case 2C: later extent in segment file; so delete the extent + // Earlier partition: + // case 3: extent is in earlier parition, no action necessary - if (fExtentMap[i].partitionNum > it->second.partitionNum) + if (emEntry.partitionNum > emInfoIter->second.partitionNum) + { + emIt = deleteExtent(emIt); // case 1 + continue; + } + else if (emEntry.partitionNum == emInfoIter->second.partitionNum) + { + if (emEntry.segmentNum == emInfoIter->second.segmentNum) { - deleteExtent(i); // case 1 - } - else if (fExtentMap[i].partitionNum == it->second.partitionNum) - { - if (fExtentMap[i].segmentNum == it->second.segmentNum) + if (emEntry.blockOffset < fboLo) { - if (fExtentMap[i].blockOffset < fboLo) + // no action necessary case 2A + } + else if (emEntry.blockOffset == fboLo) + { + if (emEntry.HWM != emInfoIter->second.hwm) { - // no action necessary case 2A - } - else if (fExtentMap[i].blockOffset == fboLo) - { - if (fExtentMap[i].HWM != it->second.hwm) - { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].HWM = it->second.hwm; - fExtentMap[i].status = EXTENTAVAILABLE; // case 2B - } - } - else - { - deleteExtent(i); // case 3C + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.HWM = emInfoIter->second.hwm; + emEntry.status = EXTENTAVAILABLE; // case 2B } } else { - // no action necessary + emIt = deleteExtent(emIt); // case 3C + continue; } } - else - { - // extent in earlier partition; no action necessary case 4 - } - } // extent map entry with matching oid - } - } // loop through the extent map + } + } // em iterarors loop + } // em info map loop } } //------------------------------------------------------------------------------ @@ -4512,18 +4272,15 @@ void ExtentMap::deleteOID(int OID) // Clean up the index and tell deleteExtent to skip the clean-up. DBRootVec dbRootVec(getAllDbRoots()); for (auto dbRoot : dbRootVec) - fPExtMapIndexImpl_->deleteOID(dbRoot, OID); - const bool clearEMIndex = false; - - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (int emIndex = 0; emIndex < emEntries; emIndex++) { - if (fExtentMap[emIndex].range.size > 0 && fExtentMap[emIndex].fileID == OID) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID); + auto emIdents = getEmIteratorsByLbids(lbids); + OIDExists = (!emIdents.empty()); + for (auto& emIt : emIdents) { - OIDExists = true; - deleteExtent(emIndex, clearEMIndex); + emIt = deleteExtent(emIt, false); } + fPExtMapIndexImpl_->deleteOID(dbRoot, OID); } if (!OIDExists) @@ -4553,25 +4310,19 @@ void ExtentMap::deleteOIDs(const OidsMap_t& OIDs) grabEMIndex(WRITE); grabFreeList(WRITE); - OidsMap_t::const_iterator it; - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - const bool clearEMIndex = false; DBRootVec dbRootVec(getAllDbRoots()); for (auto dbRoot : dbRootVec) { for (auto& oidOidPair : OIDs) - fPExtMapIndexImpl_->deleteOID(dbRoot, oidOidPair.first); - } - - for (int emIndex = 0; emIndex < emEntries; emIndex++) - { - if (fExtentMap[emIndex].range.size > 0) { - it = OIDs.find(fExtentMap[emIndex].fileID); - - if (it != OIDs.end()) - deleteExtent(emIndex, clearEMIndex); + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, oidOidPair.first); + auto emIdents = getEmIteratorsByLbids(lbids); + for (auto& emIt : emIdents) + { + emIt = deleteExtent(emIt, false); // Don't clean up the index + } + // Clean-up the index at the dbroot/OID at once + fPExtMapIndexImpl_->deleteOID(dbRoot, oidOidPair.first); } } } @@ -4580,16 +4331,17 @@ void ExtentMap::deleteOIDs(const OidsMap_t& OIDs) // Delete the specified extent from the extentmap and return to the free list. // emIndex - the index (from the extent map) of the extent to be deleted //------------------------------------------------------------------------------ -void ExtentMap::deleteExtent(const int emIndex, const bool clearEMIndex) +ExtentMapRBTree::iterator ExtentMap::deleteExtent(ExtentMapRBTree::iterator it, const bool clearEMIndex) { int flIndex, freeFLIndex, flEntries, preceedingExtent, succeedingExtent; LBID_t flBlockEnd, emBlockEnd; flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange); + auto& emEntry = it->second; - emBlockEnd = fExtentMap[emIndex].range.start + (static_cast(fExtentMap[emIndex].range.size) * 1024); + emBlockEnd = emEntry.range.start + (static_cast(emEntry.range.size) * 1024); - // scan the freelist to see where this entry fits in + // Scan the freelist to see where this entry fits in. for (flIndex = 0, preceedingExtent = -1, succeedingExtent = -1, freeFLIndex = -1; flIndex < flEntries; flIndex++) { @@ -4601,19 +4353,18 @@ void ExtentMap::deleteExtent(const int emIndex, const bool clearEMIndex) if (emBlockEnd == fFreeList[flIndex].start) succeedingExtent = flIndex; - else if (flBlockEnd == fExtentMap[emIndex].range.start) + else if (flBlockEnd == emEntry.range.start) preceedingExtent = flIndex; } } - // update the freelist - - // this space is in between 2 blocks in the FL + // Update the freelist. + // This space is in between 2 blocks in the FL. if (preceedingExtent != -1 && succeedingExtent != -1) { makeUndoRecord(&fFreeList[preceedingExtent], sizeof(InlineLBIDRange)); - // migrate the entry upward if there's a space + // Migrate the entry upward if there's a space. if (freeFLIndex < preceedingExtent && freeFLIndex != -1) { makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange)); @@ -4622,19 +4373,19 @@ void ExtentMap::deleteExtent(const int emIndex, const bool clearEMIndex) preceedingExtent = freeFLIndex; } - fFreeList[preceedingExtent].size += fFreeList[succeedingExtent].size + fExtentMap[emIndex].range.size; + fFreeList[preceedingExtent].size += fFreeList[succeedingExtent].size + emEntry.range.size; makeUndoRecord(&fFreeList[succeedingExtent], sizeof(InlineLBIDRange)); fFreeList[succeedingExtent].size = 0; makeUndoRecord(fFLShminfo, sizeof(MSTEntry)); fFLShminfo->currentSize -= sizeof(InlineLBIDRange); } - // this space has a free block at the end + // This space has a free block at the end. else if (succeedingExtent != -1) { makeUndoRecord(&fFreeList[succeedingExtent], sizeof(InlineLBIDRange)); - // migrate the entry upward if there's a space + // Migrate the entry upward if there's a space. if (freeFLIndex < succeedingExtent && freeFLIndex != -1) { makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange)); @@ -4643,16 +4394,16 @@ void ExtentMap::deleteExtent(const int emIndex, const bool clearEMIndex) succeedingExtent = freeFLIndex; } - fFreeList[succeedingExtent].start = fExtentMap[emIndex].range.start; - fFreeList[succeedingExtent].size += fExtentMap[emIndex].range.size; + fFreeList[succeedingExtent].start = emEntry.range.start; + fFreeList[succeedingExtent].size += emEntry.range.size; } - // this space has a free block at the beginning + // This space has a free block at the beginning. else if (preceedingExtent != -1) { makeUndoRecord(&fFreeList[preceedingExtent], sizeof(InlineLBIDRange)); - // migrate the entry upward if there's a space + // Migrate the entry upward if there's a space. if (freeFLIndex < preceedingExtent && freeFLIndex != -1) { makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange)); @@ -4661,10 +4412,10 @@ void ExtentMap::deleteExtent(const int emIndex, const bool clearEMIndex) preceedingExtent = freeFLIndex; } - fFreeList[preceedingExtent].size += fExtentMap[emIndex].range.size; + fFreeList[preceedingExtent].size += emEntry.range.size; } - // the freelist has no adjacent blocks, so make a new entry + // The freelist has no adjacent blocks, so make a new entry. else { if (fFLShminfo->currentSize == fFLShminfo->allocdSize) @@ -4694,19 +4445,21 @@ void ExtentMap::deleteExtent(const int emIndex, const bool clearEMIndex) #endif makeUndoRecord(&fFreeList[freeFLIndex], sizeof(InlineLBIDRange)); - fFreeList[freeFLIndex].start = fExtentMap[emIndex].range.start; - fFreeList[freeFLIndex].size = fExtentMap[emIndex].range.size; + fFreeList[freeFLIndex].start = emEntry.range.start; + fFreeList[freeFLIndex].size = emEntry.range.size; makeUndoRecord(&fFLShminfo, sizeof(MSTEntry)); fFLShminfo->currentSize += sizeof(InlineLBIDRange); } - // invalidate the entry in the Extent Map - makeUndoRecord(&fExtentMap[emIndex], sizeof(EMEntry)); - fExtentMap[emIndex].range.size = 0; + // Clear index if needed. if (clearEMIndex) - fPExtMapIndexImpl_->deleteEMEntry(fExtentMap[emIndex], emIndex); - makeUndoRecord(&fEMShminfo, sizeof(MSTEntry)); - fEMShminfo->currentSize -= sizeof(struct EMEntry); + fPExtMapIndexImpl_->deleteEMEntry(it->second, it->first); + + makeUndoRecordRBTree(UndoRecordType::DELETE, it->second); + // Erase a node for the given iterator. + makeUndoRecord(&fEMRBTreeShminfo, sizeof(MSTEntry)); + fEMRBTreeShminfo->currentSize -= EM_RB_TREE_NODE_SIZE; + return fExtentMapRBTree->erase(it); } //------------------------------------------------------------------------------ @@ -4739,7 +4492,6 @@ HWM_t ExtentMap::getLastHWM_DBroot(int OID, uint16_t dbRoot, uint32_t& partition #endif uint32_t lastExtent = 0; - int lastExtentIndex = -1; partitionNum = 0; segmentNum = 0; HWM_t hwm = 0; @@ -4756,37 +4508,30 @@ HWM_t ExtentMap::getLastHWM_DBroot(int OID, uint16_t dbRoot, uint32_t& partition grabEMEntryTable(READ); grabEMIndex(READ); - // Searching the array in reverse order should be faster since the last - // extent is usually at the bottom. We still have to search the entire - // array (just in case), but the number of operations per loop iteration - // will be less. - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID); + const auto emIdents = getEmIdentsByLbids(lbids); + auto lastEmEntry = emIdents.begin(); - for (int i = emEntries - 1; i >= 0; i--) + for (auto emEntry = emIdents.begin(); emEntry < emIdents.end(); ++emEntry) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == OID) && - (fExtentMap[i].dbRoot == dbRoot) && - ((fExtentMap[i].status == EXTENTAVAILABLE) || (fExtentMap[i].status == EXTENTOUTOFSERVICE))) + if (((emEntry->status == EXTENTAVAILABLE) || (emEntry->status == EXTENTOUTOFSERVICE)) && + ((emEntry->partitionNum > partitionNum) || + ((emEntry->partitionNum == partitionNum) && (emEntry->blockOffset > lastExtent)) || + ((emEntry->partitionNum == partitionNum) && (emEntry->blockOffset == lastExtent) && + (emEntry->segmentNum >= segmentNum)))) { - if ((fExtentMap[i].partitionNum > partitionNum) || - ((fExtentMap[i].partitionNum == partitionNum) && (fExtentMap[i].blockOffset > lastExtent)) || - ((fExtentMap[i].partitionNum == partitionNum) && (fExtentMap[i].blockOffset == lastExtent) && - (fExtentMap[i].segmentNum >= segmentNum))) - { - lastExtent = fExtentMap[i].blockOffset; - partitionNum = fExtentMap[i].partitionNum; - segmentNum = fExtentMap[i].segmentNum; - lastExtentIndex = i; - } + bFound = true; + lastExtent = emEntry->blockOffset; + partitionNum = emEntry->partitionNum; + segmentNum = emEntry->segmentNum; + lastEmEntry = emEntry; } } - // save additional information before we release the read-lock - if (lastExtentIndex != -1) + if (bFound) { - hwm = fExtentMap[lastExtentIndex].HWM; - status = fExtentMap[lastExtentIndex].status; - bFound = true; + hwm = lastEmEntry->HWM; + status = lastEmEntry->status; } releaseEMIndex(READ); @@ -4851,45 +4596,49 @@ void ExtentMap::getDbRootHWMInfo(int OID, uint16_t pmNumber, EmDbRootHWMInfo_v& grabEMIndex(READ); tr1::unordered_map::iterator emIter; + // Indicates that we found an extent, the index itself means nothing. + // TODO: Update to flag. + uint32_t foundIndex = 0; + // Searching the array in reverse order should be faster since the last // extent is usually at the bottom. We still have to search the entire // array (just in case), but the number of operations per loop iteration // will be less. for (auto dbRoot : dbRootVec) { - auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID); - for (auto i : emIdents) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID); + const auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == OID)) + if (emEntry.fileID == OID) { // Include this extent in the search, only if the extent's // DBRoot falls in the list of DBRoots for this PM. - emIter = emDbRootMap.find(fExtentMap[i].dbRoot); + emIter = emDbRootMap.find(emEntry.dbRoot); if (emIter == emDbRootMap.end()) continue; EmDbRootHWMInfo& emDbRoot = emIter->second; - if ((fExtentMap[i].status != EXTENTOUTOFSERVICE) && (fExtentMap[i].HWM != 0)) - emDbRoot.totalBlocks += (fExtentMap[i].HWM + 1); + if ((emEntry.status != EXTENTOUTOFSERVICE) && (emEntry.HWM != 0)) + emDbRoot.totalBlocks += (emEntry.HWM + 1); - if ((fExtentMap[i].partitionNum > emDbRoot.partitionNum) || - ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) && - (fExtentMap[i].blockOffset > emDbRoot.fbo)) || - ((fExtentMap[i].partitionNum == emDbRoot.partitionNum) && - (fExtentMap[i].blockOffset == emDbRoot.fbo) && - (fExtentMap[i].segmentNum >= emDbRoot.segmentNum))) + if ((emEntry.partitionNum > emDbRoot.partitionNum) || + ((emEntry.partitionNum == emDbRoot.partitionNum) && (emEntry.blockOffset > emDbRoot.fbo)) || + ((emEntry.partitionNum == emDbRoot.partitionNum) && (emEntry.blockOffset == emDbRoot.fbo) && + (emEntry.segmentNum >= emDbRoot.segmentNum))) { - emDbRoot.fbo = fExtentMap[i].blockOffset; - emDbRoot.partitionNum = fExtentMap[i].partitionNum; - emDbRoot.segmentNum = fExtentMap[i].segmentNum; - emDbRoot.localHWM = fExtentMap[i].HWM; - emDbRoot.startLbid = fExtentMap[i].range.start; - emDbRoot.status = fExtentMap[i].status; - emDbRoot.hwmExtentIndex = i; + emDbRoot.fbo = emEntry.blockOffset; + emDbRoot.partitionNum = emEntry.partitionNum; + emDbRoot.segmentNum = emEntry.segmentNum; + emDbRoot.localHWM = emEntry.HWM; + emDbRoot.startLbid = emEntry.range.start; + emDbRoot.status = emEntry.status; + emDbRoot.hwmExtentIndex = foundIndex; } } + ++foundIndex; } } @@ -4960,7 +4709,6 @@ void ExtentMap::getExtentState(int OID, uint32_t partitionNum, uint16_t segmentN } #endif - int i, emEntries; bFound = false; status = EXTENTAVAILABLE; @@ -4975,16 +4723,19 @@ void ExtentMap::getExtentState(int OID, uint32_t partitionNum, uint16_t segmentN grabEMEntryTable(READ); grabEMIndex(READ); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (i = 0; i < emEntries; i++) + DBRootVec dbRootVec(getAllDbRoots()); + for (auto dbRoot : dbRootVec) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == OID) && - (fExtentMap[i].partitionNum == partitionNum) && (fExtentMap[i].segmentNum == segmentNum)) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + const auto emIdents = getEmIdentsByLbids(lbids); + for (const auto& emEntry : emIdents) { - bFound = true; - status = fExtentMap[i].status; - break; + if (emEntry.segmentNum == segmentNum) + { + bFound = true; + status = emEntry.status; + break; + } } } @@ -5038,17 +4789,17 @@ HWM_t ExtentMap::getLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum DBRootVec dbRootVec(getAllDbRoots()); for (auto dbRoot : dbRootVec) { - auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); - for (auto i : emIdents) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + const auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].segmentNum == segmentNum)) + if ((emEntry.segmentNum == segmentNum)) { OIDPartSegExists = true; - status = fExtentMap[i].status; - - if (fExtentMap[i].HWM != 0) + status = emEntry.status; + if (emEntry.HWM != 0) { - ret = fExtentMap[i].HWM; + ret = emEntry.HWM; releaseEMIndex(READ); releaseEMEntryTable(READ); return ret; @@ -5104,9 +4855,9 @@ void ExtentMap::setLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum, #endif - LastExtentIndexT lastExtentIndex = -1; - int oldHWMExtentIndex = -1; - HighestOffset highestOffset = 0; + EMEntry* lastEm = nullptr; + EMEntry* prevEm = nullptr; + uint32_t highestOffset = 0; if (uselock) { @@ -5118,28 +4869,28 @@ void ExtentMap::setLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum, for (auto dbRoot : dbRootVec) { - auto emIdents = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); - for (auto i : emIdents) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID, partitionNum); + auto emIdents = getEmIteratorsByLbids(lbids); + for (auto emIt : emIdents) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].segmentNum == segmentNum)) + auto& emEntry = emIt->second; + if (emEntry.segmentNum == segmentNum) { - // Find current HWM extent - if (fExtentMap[i].blockOffset >= highestOffset) + // Find current HWM extent in case of multiple extents per segment file. + if (emEntry.blockOffset >= highestOffset) { - highestOffset = fExtentMap[i].blockOffset; - lastExtentIndex = i; + highestOffset = emEntry.blockOffset; + lastEm = &emEntry; } - // Find previous HWM extent - if (fExtentMap[i].HWM != 0) - { - oldHWMExtentIndex = i; - } + // Find previous HWM extent. + if (emEntry.HWM != 0) + prevEm = &emEntry; } } } - if (lastExtentIndex == -1) + if (lastEm == nullptr) { ostringstream oss; oss << "ExtentMap::setLocalHWM(): Bad OID/partition/segment argument; " @@ -5149,7 +4900,7 @@ void ExtentMap::setLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum, throw invalid_argument(oss.str()); } - if (newHWM >= (fExtentMap[lastExtentIndex].blockOffset + fExtentMap[lastExtentIndex].range.size * 1024)) + if (newHWM >= (lastEm->blockOffset + lastEm->range.size * 1024)) { ostringstream oss; oss << "ExtentMap::setLocalHWM(): " @@ -5160,18 +4911,15 @@ void ExtentMap::setLocalHWM(int OID, uint32_t partitionNum, uint16_t segmentNum, } // Save HWM in last extent for this segment file; and mark as AVAILABLE - makeUndoRecord(&fExtentMap[lastExtentIndex], sizeof(EMEntry)); - fExtentMap[lastExtentIndex].HWM = newHWM; - fExtentMap[lastExtentIndex].status = EXTENTAVAILABLE; + makeUndoRecordRBTree(UndoRecordType::DEFAULT, *lastEm); + lastEm->HWM = newHWM; + lastEm->status = EXTENTAVAILABLE; // Reset HWM in old HWM extent to 0 - if ((oldHWMExtentIndex != -1) && (oldHWMExtentIndex != lastExtentIndex)) + if ((prevEm != nullptr) && (prevEm != lastEm)) { - makeUndoRecord(&fExtentMap[oldHWMExtentIndex], sizeof(EMEntry)); - fExtentMap[oldHWMExtentIndex].HWM = 0; -#ifdef BRM_INFO - addedAnExtent = true; -#endif + makeUndoRecordRBTree(UndoRecordType::DEFAULT, *prevEm); + prevEm->HWM = 0; } #ifdef BRM_INFO @@ -5235,26 +4983,13 @@ class BUEqual void ExtentMap::bulkUpdateDBRoot(const vector& args) { - tr1::unordered_set sArgs; - tr1::unordered_set::iterator sit; - BulkUpdateDBRootArg key; - int emEntries; - - for (uint32_t i = 0; i < args.size(); i++) - sArgs.insert(args[i]); - grabEMEntryTable(WRITE); grabEMIndex(WRITE); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - for (int i = 0; i < emEntries; i++) + for (auto& lbidDbroot : args) { - key.startLBID = fExtentMap[i].range.start; - sit = sArgs.find(key); - - if (sit != sArgs.end()) - fExtentMap[i].dbRoot = sit->dbRoot; + auto emIter = findByLBID(lbidDbroot.startLBID); + emIter->second.dbRoot = lbidDbroot.dbRoot; } } @@ -5271,7 +5006,6 @@ void ExtentMap::getExtents(int OID, vector& entries, bool sorted } #endif - int i, emEntries; entries.clear(); @@ -5285,21 +5019,28 @@ void ExtentMap::getExtents(int OID, vector& entries, bool sorted grabEMEntryTable(READ); grabEMIndex(READ); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - // Pre-expand entries to stop lots of small allocs - entries.reserve(emEntries); - if (incOutOfService) + // Artibtrary sized constant + entries.reserve(100); + + DBRootVec dbRootVec(getAllDbRoots()); + + for (auto dbRoot : dbRootVec) { - for (i = 0; i < emEntries; i++) - if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].range.size != 0)) - entries.push_back(fExtentMap[i]); - } - else - { - for (i = 0; i < emEntries; i++) - if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].range.size != 0) && - (fExtentMap[i].status != EXTENTOUTOFSERVICE)) - entries.push_back(fExtentMap[i]); + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID); + entries.reserve(entries.size() + lbids.size()); + const auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) + { + if (incOutOfService) + { + entries.push_back(emEntry); + } + else + { + if (emEntry.status != EXTENTOUTOFSERVICE) + entries.push_back(emEntry); + } + } } releaseEMIndex(READ); @@ -5347,8 +5088,6 @@ void ExtentMap::getExtents_dbroot(int OID, vector& entries, cons #endif - int i, emEntries; - entries.clear(); if (OID < 0) @@ -5361,11 +5100,10 @@ void ExtentMap::getExtents_dbroot(int OID, vector& entries, cons grabEMEntryTable(READ); grabEMIndex(READ); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (i = 0; i < emEntries; i++) - if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].range.size != 0) && (fExtentMap[i].dbRoot == dbroot)) - entries.push_back(fExtentMap[i]); + const auto lbids = fPExtMapIndexImpl_->find(dbroot, OID); + auto emIdents = getEmIdentsByLbids(lbids); + entries.swap(emIdents); releaseEMIndex(READ); releaseEMEntryTable(READ); @@ -5378,8 +5116,6 @@ void ExtentMap::getExtents_dbroot(int OID, vector& entries, cons //------------------------------------------------------------------------------ void ExtentMap::getExtentCount_dbroot(int OID, uint16_t dbroot, bool incOutOfService, uint64_t& numExtents) { - int i, emEntries; - if (OID < 0) { ostringstream oss; @@ -5390,29 +5126,20 @@ void ExtentMap::getExtentCount_dbroot(int OID, uint16_t dbroot, bool incOutOfSer grabEMEntryTable(READ); grabEMIndex(READ); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - - numExtents = 0; - - if (incOutOfService) + const auto lbids = fPExtMapIndexImpl_->find(dbroot, OID); + if (!incOutOfService) { - for (i = 0; i < emEntries; i++) + const auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { - if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].range.size != 0) && - (fExtentMap[i].dbRoot == dbroot)) - numExtents++; - } - } - else - { - for (i = 0; i < emEntries; i++) - { - if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].range.size != 0) && - (fExtentMap[i].dbRoot == dbroot) && (fExtentMap[i].status != EXTENTOUTOFSERVICE)) - numExtents++; + if (emEntry.status != EXTENTOUTOFSERVICE) + { + ++numExtents; + } } } + numExtents = (incOutOfService) ? lbids.size() : numExtents; releaseEMIndex(READ); releaseEMEntryTable(READ); } @@ -5440,13 +5167,15 @@ void ExtentMap::getSysCatDBRoot(OID_t oid, uint16_t& dbRoot) bool bFound = false; grabEMEntryTable(READ); grabEMIndex(READ); - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (int i = 0; i < emEntries; i++) + DBRootVec dbRootVec(getAllDbRoots()); + for (auto localDbRoot : dbRootVec) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == oid)) + const auto lbids = fPExtMapIndexImpl_->find(localDbRoot, oid); + if (!lbids.empty()) { - dbRoot = fExtentMap[i].dbRoot; + auto emIt = findByLBID(lbids[0]); + dbRoot = emIt->second.dbRoot; bFound = true; break; } @@ -5505,42 +5234,41 @@ void ExtentMap::deletePartition(const set& oids, const set foundPartitions; - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - vector extents; - // First: validate against referencing non-existent logical partitions - std::set::const_iterator it; + std::set foundPartitions; + std::vector extents; - for (int i = 0; i < emEntries; i++) + for (auto oid : oids) { - LogicalPartition lp(fExtentMap[i].dbRoot, fExtentMap[i].partitionNum, fExtentMap[i].segmentNum); - - if ((fExtentMap[i].range.size != 0) && (partitionNums.find(lp) != partitionNums.end())) + for (auto& partition : partitionNums) { - it = oids.find(fExtentMap[i].fileID); - - if (it != oids.end()) + const auto lbids = fPExtMapIndexImpl_->find(partition.dbroot, oid, partition.pp); + auto emIters = getEmIteratorsByLbids(lbids); + for (auto& emIter : emIters) { + const auto& emEntry = emIter->second; + if (emEntry.segmentNum != partition.seg) + { + continue; + } + LogicalPartition lp(emEntry.dbRoot, emEntry.partitionNum, emEntry.segmentNum); foundPartitions.insert(lp); - extents.push_back(i); + emIter = deleteExtent(emIter); } } } if (foundPartitions.size() != partitionNums.size()) { - set::const_iterator partIt; Message::Args args; ostringstream oss; - for (partIt = partitionNums.begin(); partIt != partitionNums.end(); ++partIt) + for (auto partIt = partitionNums.begin(), end = partitionNums.end(); partIt != end; ++partIt) { if (foundPartitions.find((*partIt)) == foundPartitions.end()) { if (!oss.str().empty()) oss << ", "; - oss << (*partIt).toString(); } } @@ -5550,16 +5278,10 @@ void ExtentMap::deletePartition(const set& oids, const set& oids, const setallocdSize / sizeof(struct EMEntry); - set foundPartitions; - vector extents; + std::set foundPartitions; + std::vector extents; bool partitionAlreadyDisabled = false; - // Identify not exists partition first. Then mark disable. - std::set::const_iterator it; - - for (int i = 0; i < emEntries; i++) + for (auto oid : oids) { - LogicalPartition lp(fExtentMap[i].dbRoot, fExtentMap[i].partitionNum, fExtentMap[i].segmentNum); - - if ((fExtentMap[i].range.size != 0) && (partitionNums.find(lp) != partitionNums.end())) + for (auto& partition : partitionNums) { - it = oids.find(fExtentMap[i].fileID); - - if (it != oids.end()) + const auto lbids = fPExtMapIndexImpl_->find(partition.dbroot, oid, partition.pp); + auto emIters = getEmIteratorsByLbids(lbids); + for (auto& emIter : emIters) { - if (fExtentMap[i].status == EXTENTOUTOFSERVICE) + const auto& emEntry = emIter->second; + if ((emEntry.segmentNum != partition.seg) || emEntry.range.size == 0) + { + continue; + } + if (emEntry.status == EXTENTOUTOFSERVICE) { partitionAlreadyDisabled = true; } - + LogicalPartition lp(emEntry.dbRoot, emEntry.partitionNum, emEntry.segmentNum); foundPartitions.insert(lp); - extents.push_back(i); + extents.push_back(emIter); } } } @@ -5638,18 +5359,17 @@ void ExtentMap::markPartitionForDeletion(const set& oids, const setsecond); + extents[i]->second.status = EXTENTOUTOFSERVICE; } // validate against referencing non-existent logical partitions if (foundPartitions.size() != partitionNums.size()) { - set::const_iterator partIt; Message::Args args; ostringstream oss; - for (partIt = partitionNums.begin(); partIt != partitionNums.end(); ++partIt) + for (auto partIt = partitionNums.begin(); partIt != partitionNums.end(); ++partIt) { if (foundPartitions.find((*partIt)) == foundPartitions.end()) { @@ -5714,18 +5434,20 @@ void ExtentMap::markAllPartitionForDeletion(const set& oids) grabEMEntryTable(WRITE); grabEMIndex(WRITE); - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (int i = 0; i < emEntries; i++) + DBRootVec dbRootVec(getAllDbRoots()); + // It would be interesting to pick the smallest set(either oids or partitions) for the second loop. + for (auto dbRoot : dbRootVec) { - if (fExtentMap[i].range.size != 0) + for (auto oid : oids) { - it = oids.find(fExtentMap[i].fileID); - - if (it != oids.end()) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, oid); + auto emIters = getEmIteratorsByLbids(lbids); + for (auto& emIter : emIters) { - makeUndoRecord(&fExtentMap[i], sizeof(EMEntry)); - fExtentMap[i].status = EXTENTOUTOFSERVICE; + auto& emEntry = emIter->second; + makeUndoRecordRBTree(UndoRecordType::DEFAULT, emEntry); + emEntry.status = EXTENTOUTOFSERVICE; } } } @@ -5769,27 +5491,29 @@ void ExtentMap::restorePartition(const set& oids, const setallocdSize / sizeof(struct EMEntry); - vector extents; + vector extents; set foundPartitions; bool partitionAlreadyEnabled = false; - for (int i = 0; i < emEntries; i++) + for (auto oid : oids) { - LogicalPartition lp(fExtentMap[i].dbRoot, fExtentMap[i].partitionNum, fExtentMap[i].segmentNum); - - if ((fExtentMap[i].range.size != 0) && partitionNums.find(lp) != partitionNums.end()) + for (auto& partition : partitionNums) { - it = oids.find(fExtentMap[i].fileID); - - if (it != oids.end()) + const auto lbids = fPExtMapIndexImpl_->find(partition.dbroot, oid, partition.pp); + auto emIters = getEmIteratorsByLbids(lbids); + for (auto& emIter : emIters) { - if (fExtentMap[i].status == EXTENTAVAILABLE) + const auto& emEntry = emIter->second; + if (emEntry.segmentNum != partition.seg) + { + continue; + } + if (emEntry.status == EXTENTAVAILABLE) { partitionAlreadyEnabled = true; } - - extents.push_back(i); + LogicalPartition lp(emEntry.dbRoot, emEntry.partitionNum, emEntry.segmentNum); + extents.push_back(emIter); foundPartitions.insert(lp); } } @@ -5797,11 +5521,10 @@ void ExtentMap::restorePartition(const set& oids, const set::const_iterator partIt; Message::Args args; ostringstream oss; - for (partIt = partitionNums.begin(); partIt != partitionNums.end(); ++partIt) + for (auto partIt = partitionNums.begin(); partIt != partitionNums.end(); ++partIt) { if (foundPartitions.empty() || foundPartitions.find((*partIt)) == foundPartitions.end()) { @@ -5820,8 +5543,8 @@ void ExtentMap::restorePartition(const set& oids, const setsecond); + extents[i]->second.status = EXTENTAVAILABLE; } if (partitionAlreadyEnabled) @@ -5861,16 +5584,20 @@ void ExtentMap::getOutOfServicePartitions(OID_t oid, set& part grabEMEntryTable(READ); grabEMIndex(READ); - int emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (int i = 0; i < emEntries; i++) + DBRootVec dbRootVec(getAllDbRoots()); + // It would be interesting to pick the smallest set(either oids or partitions) for the second loop. + for (auto dbRoot : dbRootVec) { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].fileID == oid) && - (fExtentMap[i].status == EXTENTOUTOFSERVICE)) + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, oid); + const auto emEntries = getEmIdentsByLbids(lbids); + for (const auto& emEntry : emEntries) { - // need to be logical partition number - LogicalPartition lp(fExtentMap[i].dbRoot, fExtentMap[i].partitionNum, fExtentMap[i].segmentNum); - partitionNums.insert(lp); + if (emEntry.status == EXTENTOUTOFSERVICE) + { + LogicalPartition lp(emEntry.dbRoot, emEntry.partitionNum, emEntry.segmentNum); + partitionNums.insert(lp); + } } } @@ -5899,9 +5626,17 @@ void ExtentMap::deleteDBRoot(uint16_t dbroot) grabEMIndex(WRITE); grabFreeList(WRITE); - for (unsigned i = 0; i < fEMShminfo->allocdSize / sizeof(struct EMEntry); i++) - if (fExtentMap[i].range.size != 0 && fExtentMap[i].dbRoot == dbroot) - deleteExtent(i); + auto it = fExtentMapRBTree->begin(); + auto end = fExtentMapRBTree->end(); + + while (it != end) + { + if (it->second.dbRoot == dbroot) + it = deleteExtent(it); + else + ++it; + } + fPExtMapIndexImpl_->deleteDbRoot(dbroot); } @@ -5922,25 +5657,15 @@ bool ExtentMap::isDBRootEmpty(uint16_t dbroot) #endif - bool bEmpty = true; - int i, emEntries; grabEMEntryTable(READ); grabEMIndex(READ); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - if (fEMShminfo->currentSize == 0) + if (fEMRBTreeShminfo->currentSize == 0) { throw runtime_error("ExtentMap::isDBRootEmpty() shared memory not loaded"); } - for (i = 0; i < emEntries; i++) - { - if ((fExtentMap[i].range.size != 0) && (fExtentMap[i].dbRoot == dbroot)) - { - bEmpty = false; - break; - } - } + bool bEmpty = fPExtMapIndexImpl_->isDBRootEmpty(dbroot); releaseEMIndex(READ); releaseEMEntryTable(READ); @@ -5987,10 +5712,6 @@ void ExtentMap::lookup(OID_t OID, LBIDRange_v& ranges) } #endif - - int i, emEntries; - LBIDRange tmp; - ranges.clear(); if (OID < 0) @@ -6003,16 +5724,24 @@ void ExtentMap::lookup(OID_t OID, LBIDRange_v& ranges) grabEMEntryTable(READ); grabEMIndex(READ); - emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (i = 0; i < emEntries; i++) - if ((fExtentMap[i].fileID == OID) && (fExtentMap[i].range.size != 0) && - (fExtentMap[i].status != EXTENTOUTOFSERVICE)) + DBRootVec dbRootVec(getAllDbRoots()); + + for (auto dbRoot : dbRootVec) + { + const auto lbids = fPExtMapIndexImpl_->find(dbRoot, OID); + const auto emIdents = getEmIdentsByLbids(lbids); + for (auto& emEntry : emIdents) { - tmp.start = fExtentMap[i].range.start; - tmp.size = fExtentMap[i].range.size * 1024; - ranges.push_back(tmp); + if (emEntry.status != EXTENTOUTOFSERVICE) + { + LBIDRange tmp; + tmp.start = emEntry.range.start; + tmp.size = emEntry.range.size * 1024; + ranges.emplace_back(tmp); + } } + } releaseEMIndex(READ); releaseEMEntryTable(READ); @@ -6021,7 +5750,6 @@ void ExtentMap::lookup(OID_t OID, LBIDRange_v& ranges) int ExtentMap::checkConsistency() { #ifdef BRM_INFO - if (fDebug) TRACER_WRITENOW("checkConsistency"); @@ -6039,10 +5767,10 @@ int ExtentMap::checkConsistency() Struct integrity 5. verify that the number of entries in each table is consistent with the recorded current size - */ + */ LBID_t emBegin, emEnd, flBegin, flEnd; - int i, j, flEntries, emEntries; + int i, j, flEntries; uint32_t usedEntries; grabEMEntryTable(READ); @@ -6060,15 +5788,14 @@ int ExtentMap::checkConsistency() } flEntries = fFLShminfo->allocdSize / sizeof(InlineLBIDRange); - emEntries = fEMShminfo->allocdSize / sizeof(EMEntry); // test 1a - make sure every entry in the EM is not overlapped by an entry in the FL - for (i = 0; i < emEntries; i++) + for (auto emIt = fExtentMapRBTree->begin(), end = fExtentMapRBTree->end(); emIt != end; ++emIt) { - if (fExtentMap[i].range.size != 0) + auto& emEntry = emIt->second; { - emBegin = fExtentMap[i].range.start; - emEnd = emBegin + (fExtentMap[i].range.size * 1024) - 1; + emBegin = emEntry.range.start; + emEnd = emBegin + (emEntry.range.size * 1024) - 1; for (j = 0; j < flEntries; j++) { @@ -6115,17 +5842,18 @@ int ExtentMap::checkConsistency() lbid += fFreeList[i].size; } - for (i = 0; i < emEntries; i++) + for (auto emIt = fExtentMapRBTree->begin(), end = fExtentMapRBTree->end(); emIt != end; ++emIt) { - if (fExtentMap[i].range.start % 1024 != 0) + auto& emEntry = emIt->second; + if (emEntry.range.start % 1024 != 0) { cerr << "EM::checkConsistency(): An extent map entry is not 1024-block aligned " << i << " " - << fExtentMap[i].range.start << endl; + << emEntry.range.start << endl; throw logic_error("EM checkConsistency test 1b (data structures are read-locked)"); } - if (fExtentMap[i].range.start / 1024 == lbid) - lbid += fExtentMap[i].range.size; + if (emEntry.range.start / 1024 == lbid) + lbid += emEntry.range.size; } if (oldlbid == lbid) @@ -6141,12 +5869,13 @@ int ExtentMap::checkConsistency() // test 1c - verify that no dbroot is < 1 bool errorOut = false; - for (i = 0; i < emEntries; i++) + for (auto emIt = fExtentMapRBTree->begin(), end = fExtentMapRBTree->end(); emIt != end; ++emIt) { - if (fExtentMap[i].range.size != 0) + auto& emEntry = emIt->second; + if (emEntry.range.size != 0) { // cout << "EM[" << i << "]: dbRoot=" << fExtentMap[i].dbRoot(listMan) << endl; - if (fExtentMap[i].dbRoot == 0) + if (emEntry.dbRoot == 0) { errorOut = true; cerr << "EM::checkConsistency(): index " << i << " has a 0 dbroot\n"; @@ -6180,7 +5909,7 @@ int ExtentMap::checkConsistency() cout << "Test 1d passed\n"; #endif - // test 2 - verify that the freelist is consolidated + // test 2 - verify that the freelist is consolidated for (i = 0; i < flEntries; i++) { if (fFreeList[i].size != 0) @@ -6195,65 +5924,51 @@ int ExtentMap::checkConsistency() cout << "test 2 passed\n"; -// needs to be updated + // test 3 - scan the extent map to make sure files have no LBID gaps #if 0 - // test 3 - scan the extent map to make sure files have no LBID gaps - vector oids; - vector< vector > fbos; + vector oids; + vector> fbos; - for (i = 0; i < emEntries; i++) + for (i = 0; i < emEntries; i++) + { + if (fExtentMap[i].size != 0) { - if (fExtentMap[i].size != 0) - { - for (j = 0; j < (int)oids.size(); j++) - if (oids[j] == fExtentMap[i].fileID) - break; + for (j = 0; j < (int)oids.size(); j++) + if (oids[j] == fExtentMap[i].fileID) + break; - if (j == (int)oids.size()) - { - oids.push_back(fExtentMap[i].fileID); - fbos.push_back(vector()); - } + if (j == (int)oids.size()) + { + oids.push_back(fExtentMap[i].fileID); + fbos.push_back(vector()); + } - fbos[j].push_back(fExtentMap[i].blockOffset); - } + fbos[j].push_back(fExtentMap[i].blockOffset); } + } - for (i = 0; i < (int)fbos.size(); i++) - sort::iterator>(fbos[i].begin(), fbos[i].end()); + for (i = 0; i < (int)fbos.size(); i++) + sort::iterator>(fbos[i].begin(), fbos[i].end()); - const unsigned EXTENT_SIZE = getExtentSize(); + const unsigned EXTENT_SIZE = getExtentSize(); - for (i = 0; i < (int)fbos.size(); i++) + for (i = 0; i < (int)fbos.size(); i++) + { + for (j = 0; j < (int)fbos[i].size(); j++) { - for (j = 0; j < (int)fbos[i].size(); j++) - { - if (fbos[i][j] != static_cast(j * EXTENT_SIZE)) - { - cerr << "EM: OID " << oids[i] << " has no extent at FBO " << - j* EXTENT_SIZE << endl; - throw logic_error("EM checkConsistency test 3 (data structures are read-locked)"); - } - } + if (fbos[i][j] != static_cast(j * EXTENT_SIZE)) + { + cerr << "EM: OID " << oids[i] << " has no extent at FBO " << j * EXTENT_SIZE << endl; + throw logic_error("EM checkConsistency test 3 (data structures are read-locked)"); + } } + } - fbos.clear(); - oids.clear(); + fbos.clear(); + oids.clear(); #endif // test 5a - scan freelist to make sure the current size is accurate - - for (i = 0, usedEntries = 0; i < emEntries; i++) - if (fExtentMap[i].range.size != 0) - usedEntries++; - - if (usedEntries != fEMShminfo->currentSize / sizeof(EMEntry)) - { - cerr << "checkConsistency: used extent map entries = " << usedEntries << " metadata says " - << fEMShminfo->currentSize / sizeof(EMEntry) << endl; - throw logic_error("EM checkConsistency test 5a (data structures are read-locked)"); - } - for (i = 0, usedEntries = 0; i < flEntries; i++) if (fFreeList[i].size != 0) usedEntries++; @@ -6287,6 +6002,7 @@ void ExtentMap::undoChanges() #endif Undoable::undoChanges(); + undoChangesRBTree(); finishChanges(); } @@ -6299,6 +6015,7 @@ void ExtentMap::confirmChanges() #endif Undoable::confirmChanges(); + confirmChangesRBTree(); finishChanges(); } @@ -6314,6 +6031,46 @@ void ExtentMap::finishChanges() releaseEMEntryTable(WRITE); } +void ExtentMap::makeUndoRecordRBTree(UndoRecordType type, const EMEntry& emEntry) +{ + undoRecordsRBTree.push_back(make_pair(type, emEntry)); +} + +void ExtentMap::undoChangesRBTree() +{ + for (const auto& undoPair : undoRecordsRBTree) + { + if (undoPair.first == UndoRecordType::INSERT) + { + const auto key = undoPair.second.range.start; + auto emIt = findByLBID(key); + if (emIt != fExtentMapRBTree->end()) + { + fExtentMapRBTree->erase(emIt); + } + } + else if (undoPair.first == UndoRecordType::DELETE) + { + const auto& emEntry = undoPair.second; + fExtentMapRBTree->insert(make_pair(emEntry.range.start, emEntry)); + } + else + { + const auto key = undoPair.second.range.start; + auto emIt = findByLBID(key); + if (emIt != fExtentMapRBTree->end()) + { + emIt->second = undoPair.second; + } + } + } +} + +void ExtentMap::confirmChangesRBTree() +{ + undoRecordsRBTree.clear(); +} + const bool* ExtentMap::getEMFLLockStatus() { return &flLocked; @@ -6523,18 +6280,17 @@ void ExtentMap::dumpTo(ostream& os) { grabEMEntryTable(READ); grabEMIndex(READ); - unsigned emEntries = fEMShminfo->allocdSize / sizeof(struct EMEntry); - for (unsigned i = 0; i < emEntries; i++) + for (auto emIt = fExtentMapRBTree->begin(), end = fExtentMapRBTree->end(); emIt != end; ++emIt) { - if (fExtentMap[i].range.size != 0) + const auto& emEntry = emIt->second; { - os << fExtentMap[i].range.start << '|' << fExtentMap[i].range.size << '|' << fExtentMap[i].fileID << '|' - << fExtentMap[i].blockOffset << '|' << fExtentMap[i].HWM << '|' << fExtentMap[i].partitionNum << '|' - << fExtentMap[i].segmentNum << '|' << fExtentMap[i].dbRoot << '|' << fExtentMap[i].colWid << '|' - << fExtentMap[i].status << '|' << fExtentMap[i].partition.cprange.hiVal << '|' - << fExtentMap[i].partition.cprange.loVal << '|' << fExtentMap[i].partition.cprange.sequenceNum << '|' - << (int)fExtentMap[i].partition.cprange.isValid << '|' << endl; + os << emEntry.range.start << '|' << emEntry.range.size << '|' << emEntry.fileID << '|' + << emEntry.blockOffset << '|' << emEntry.HWM << '|' << emEntry.partitionNum << '|' + << emEntry.segmentNum << '|' << emEntry.dbRoot << '|' << emEntry.colWid << '|' << emEntry.status + << '|' << emEntry.partition.cprange.hiVal << '|' << emEntry.partition.cprange.loVal << '|' + << emEntry.partition.cprange.sequenceNum << '|' << (int)emEntry.partition.cprange.isValid << '|' + << endl; } } diff --git a/versioning/BRM/extentmap.h b/versioning/BRM/extentmap.h index 8452d822e..6ece3f096 100644 --- a/versioning/BRM/extentmap.h +++ b/versioning/BRM/extentmap.h @@ -39,6 +39,7 @@ #include //boost::hash #include #include +#include #include #include #include @@ -212,12 +213,52 @@ struct ExtentSorter } }; -class ExtentMapImpl +using ShmSegmentManagerT = bi::managed_shared_memory::segment_manager; +using ShmVoidAllocator = bi::allocator; + +using LBID_tAlloc = bi::allocator; +using PartitionNumberTAlloc = bi::allocator; +using LBID_tVectorT = bi::vector; + +using PartitionIndexContainerKeyT = PartitionNumberT; +using PartitionIndexContainerValT = std::pair; +using PartitionIndexContainerValTAlloc = bi::allocator; +// Can't use std::unordered_map presumably b/c the map's pointer type doesn't use offset_type as boost::u_map +// does +using PartitionIndexContainerT = + boost::unordered_map, + std::equal_to, PartitionIndexContainerValTAlloc>; + +using OIDIndexContainerKeyT = OID_t; +using OIDIndexContainerValT = std::pair; +using OIDIndexContainerValTAlloc = bi::allocator; +using OIDIndexContainerT = + boost::unordered_map, + std::equal_to, OIDIndexContainerValTAlloc>; + +using DBRootIndexTAlloc = bi::allocator; +using DBRootIndexContainerT = bi::vector; +using ExtentMapIndex = DBRootIndexContainerT; +using LBID_tFindResult = bi::vector; +using InsertUpdateShmemKeyPair = std::pair; + +// RBTREE. +using EMEntryKeyValueType = std::pair; +using VoidAllocator = + boost::interprocess::allocator; +using EMEntryKeyValueTypeAllocator = + boost::interprocess::allocator; +using ExtentMapRBTree = + boost::interprocess::map, EMEntryKeyValueTypeAllocator>; + +class ExtentMapRBTreeImpl { public: - ~ExtentMapImpl(){}; + ~ExtentMapRBTreeImpl() = default; + + static ExtentMapRBTreeImpl* makeExtentMapRBTreeImpl(unsigned key, off_t size, bool readOnly = false); - static ExtentMapImpl* makeExtentMapImpl(unsigned key, off_t size, bool readOnly = false); static void refreshShm() { if (fInstance) @@ -228,48 +269,40 @@ class ExtentMapImpl } inline void grow(unsigned key, off_t size) -#ifdef NDEBUG { - fExtMap.grow(key, size); - } -#else - { - int rc = fExtMap.grow(key, size); - idbassert(rc == 0); - } -#endif - inline void makeReadOnly() - { - fExtMap.setReadOnly(); - } - inline void clear(unsigned key, off_t size) - { - fExtMap.clear(key, size); - } - inline void swapout(BRMShmImpl& rhs) - { - fExtMap.swap(rhs); - rhs.destroy(); - } - inline unsigned key() const - { - return fExtMap.key(); + fManagedShm.grow(key, size); } - inline EMEntry* get() const + inline unsigned key() const { - return reinterpret_cast(fExtMap.fMapreg.get_address()); + return fManagedShm.key(); + } + + inline ExtentMapRBTree* get() const + { + VoidAllocator allocator(fManagedShm.fShmSegment->get_segment_manager()); + return fManagedShm.fShmSegment->find_or_construct("EmMapRBTree")(std::less(), + allocator); + } + + inline uint64_t getFreeMemory() const + { + return fManagedShm.fShmSegment->get_free_memory(); + } + inline uint64_t getSize() const + { + return fManagedShm.fShmSegment->get_size(); } private: - ExtentMapImpl(unsigned key, off_t size, bool readOnly = false); - ExtentMapImpl(const ExtentMapImpl& rhs); - ExtentMapImpl& operator=(const ExtentMapImpl& rhs); + ExtentMapRBTreeImpl(unsigned key, off_t size, bool readOnly = false); + ExtentMapRBTreeImpl(const ExtentMapRBTreeImpl& rhs); + ExtentMapRBTreeImpl& operator=(const ExtentMapRBTreeImpl& rhs); - BRMShmImpl fExtMap; + BRMManagedShmImplRBTree fManagedShm; static boost::mutex fInstanceMutex; - static ExtentMapImpl* fInstance; + static ExtentMapRBTreeImpl* fInstance; }; class FreeListImpl @@ -332,37 +365,6 @@ class FreeListImpl static FreeListImpl* fInstance; }; -using ShmSegmentManagerT = bi::managed_shared_memory::segment_manager; -using ShmVoidAllocator = bi::allocator; - -using ExtentMapIdxT = size_t; -using ExtentMapIdxTAlloc = bi::allocator; -using PartitionNumberTAlloc = bi::allocator; -using ExtentMapIndicesT = bi::vector; - -using PartitionIndexContainerKeyT = PartitionNumberT; -using PartitionIndexContainerValT = std::pair; -using PartitionIndexContainerValTAlloc = bi::allocator; -// Can't use std::unordered_map presumably b/c the map's pointer type doesn't use offset_type as boost::u_map -// does -using PartitionIndexContainerT = - boost::unordered_map, std::equal_to, - PartitionIndexContainerValTAlloc>; - -using OIDIndexContainerKeyT = OID_t; -using OIDIndexContainerValT = std::pair; -using OIDIndexContainerValTAlloc = bi::allocator; -using OIDIndexContainerT = - boost::unordered_map, - std::equal_to, OIDIndexContainerValTAlloc>; - -using DBRootIndexTAlloc = bi::allocator; -using DBRootIndexContainerT = bi::vector; -using ExtentMapIndex = DBRootIndexContainerT; -using ExtentMapIndexFindResult = bi::vector; -using InsertUpdateShmemKeyPair = std::pair; - class ExtentMapIndexImpl { public: @@ -435,26 +437,26 @@ class ExtentMapIndexImpl void createExtentMapIndexIfNeeded(); ExtentMapIndex* get(); - InsertUpdateShmemKeyPair insert(const EMEntry& emEntry, const size_t emIdx); + InsertUpdateShmemKeyPair insert(const EMEntry& emEntry, const LBID_t lbid); InsertUpdateShmemKeyPair insert2ndLayerWrapper(OIDIndexContainerT& oids, const EMEntry& emEntry, - const size_t emIdx, const bool aShmemHasGrown); - InsertUpdateShmemKeyPair insert2ndLayer(OIDIndexContainerT& oids, const EMEntry& emEntry, - const size_t emIdx, const bool aShmemHasGrown); + const LBID_t lbid, const bool aShmemHasGrown); + InsertUpdateShmemKeyPair insert2ndLayer(OIDIndexContainerT& oids, const EMEntry& emEntry, const LBID_t lbid, + const bool aShmemHasGrown); InsertUpdateShmemKeyPair insert3dLayerWrapper(PartitionIndexContainerT& partitions, const EMEntry& emEntry, - const size_t emIdx, const bool aShmemHasGrown); + const LBID_t lbid, const bool aShmemHasGrown); InsertUpdateShmemKeyPair insert3dLayer(PartitionIndexContainerT& partitions, const EMEntry& emEntry, - const size_t emIdx, const bool aShmemHasGrown); - ExtentMapIndexFindResult find(const DBRootT dbroot, const OID_t oid, - const PartitionNumberT partitionNumber); - ExtentMapIndexFindResult find(const DBRootT dbroot, const OID_t oid); - ExtentMapIndexFindResult search2ndLayer(OIDIndexContainerT& oids, const OID_t oid, - const PartitionNumberT partitionNumber); - ExtentMapIndexFindResult search2ndLayer(OIDIndexContainerT& oids, const OID_t oid); - ExtentMapIndexFindResult search3dLayer(PartitionIndexContainerT& partitions, - const PartitionNumberT partitionNumber); + const LBID_t lbid, const bool aShmemHasGrown); + LBID_tFindResult find(const DBRootT dbroot, const OID_t oid, const PartitionNumberT partitionNumber); + LBID_tFindResult find(const DBRootT dbroot, const OID_t oid); + LBID_tFindResult search2ndLayer(OIDIndexContainerT& oids, const OID_t oid, + const PartitionNumberT partitionNumber); + LBID_tFindResult search2ndLayer(OIDIndexContainerT& oids, const OID_t oid); + LBID_tFindResult search3dLayer(PartitionIndexContainerT& partitions, + const PartitionNumberT partitionNumber); + bool isDBRootEmpty(const DBRootT dbroot); void deleteDbRoot(const DBRootT dbroot); void deleteOID(const DBRootT dbroot, const OID_t oid); - void deleteEMEntry(const EMEntry& emEntry, const ExtentMapIdxT emIdent); + void deleteEMEntry(const EMEntry& emEntry, const LBID_t lbid); private: BRMManagedShmImpl fBRMManagedShmMemImpl_; @@ -1025,13 +1027,13 @@ class ExtentMap : public Undoable inline bool empty() { - if (fEMShminfo == 0) + if (fEMRBTreeShminfo == nullptr) { grabEMEntryTable(BRM::ExtentMap::READ); releaseEMEntryTable(BRM::ExtentMap::READ); } - - return (fEMShminfo->currentSize == 0); + // Initial size. + return (fEMRBTreeShminfo->currentSize == EM_RB_TREE_EMPTY_SIZE); } EXPORT std::vector getFreeListEntries(); @@ -1051,22 +1053,38 @@ class ExtentMap : public Undoable #endif private: - static const constexpr size_t EM_INCREMENT_ROWS = 100; + enum class UndoRecordType + { + DEFAULT, + INSERT, + DELETE + }; + + static const constexpr size_t EM_INCREMENT_ROWS = 1000; static const constexpr size_t EM_INITIAL_SIZE = EM_INCREMENT_ROWS * 10 * sizeof(EMEntry); static const constexpr size_t EM_INCREMENT = EM_INCREMENT_ROWS * sizeof(EMEntry); static const constexpr size_t EM_FREELIST_INITIAL_SIZE = 50 * sizeof(InlineLBIDRange); static const constexpr size_t EM_FREELIST_INCREMENT = 50 * sizeof(InlineLBIDRange); + // RBTree constants. + static const size_t EM_RB_TREE_NODE_SIZE = 10 * (sizeof(EMEntry) + 8 * sizeof(uint64_t)); + static const size_t EM_RB_TREE_EMPTY_SIZE = 1024; + static const size_t EM_RB_TREE_INITIAL_SIZE = + EM_INCREMENT_ROWS * 10 * EM_RB_TREE_NODE_SIZE + EM_RB_TREE_EMPTY_SIZE; + static const size_t EM_RB_TREE_INCREMENT = EM_INCREMENT_ROWS * EM_RB_TREE_NODE_SIZE; ExtentMap(const ExtentMap& em); ExtentMap& operator=(const ExtentMap& em); - EMEntry* fExtentMap; + ExtentMapRBTree* fExtentMapRBTree; InlineLBIDRange* fFreeList; + key_t fCurrentEMShmkey; key_t fCurrentFLShmkey; - MSTEntry* fEMShminfo; + + MSTEntry* fEMRBTreeShminfo; MSTEntry* fFLShminfo; MSTEntry* fEMIndexShminfo; + const MasterSegmentTable fMST; bool r_only; typedef std::tr1::unordered_map PmDbRootMap_t; @@ -1075,6 +1093,7 @@ class ExtentMap : public Undoable int numUndoRecords; bool flLocked, emLocked, emIndexLocked; + static boost::mutex mutex; // @bug5355 - made mutex static static boost::mutex emIndexMutex; boost::mutex fConfigCacheMutex; // protect access to Config Cache @@ -1094,6 +1113,7 @@ class ExtentMap : public Undoable void logAndSetEMIndexReadOnly(const std::string& funcName); + // Create extents. LBID_t _createColumnExtent_DBroot(uint32_t size, int OID, uint32_t colWidth, uint16_t dbRoot, execplan::CalpontSystemCatalog::ColDataType colDataType, uint32_t& partitionNum, uint16_t& segmentNum, uint32_t& startBlockOffset); @@ -1103,27 +1123,43 @@ class ExtentMap : public Undoable uint32_t& startBlockOffset); LBID_t _createDictStoreExtent(uint32_t size, int OID, uint16_t dbRoot, uint32_t partitionNum, uint16_t segmentNum); + + // Delete extent. + ExtentMapRBTree::iterator deleteExtent(ExtentMapRBTree::iterator it, const bool clearEMIndex = true); + template bool isValidCPRange(const T& max, const T& min, execplan::CalpontSystemCatalog::ColDataType type) const; - void deleteExtent(const int emIndex, const bool clearEMIndex = true); LBID_t getLBIDsFromFreeList(uint32_t size); void reserveLBIDRange(LBID_t start, uint8_t size); // used by load() to allocate pre-existing LBIDs + std::vector getEmIdentsByLbids(const bi::vector& lbids); + std::vector getEmIteratorsByLbids(const bi::vector& lbids); + // Choose keys. key_t chooseEMShmkey(); key_t chooseFLShmkey(); key_t chooseEMIndexShmkey(); + key_t getInitialEMIndexShmkey() const; // see the code for how keys are segmented key_t chooseShmkey(const MSTEntry* masterTableEntry, const uint32_t keyRangeBase) const; + + // Grab table. void grabEMEntryTable(OPS op); void grabFreeList(OPS op); void grabEMIndex(OPS op); + + // Release table. void releaseEMEntryTable(OPS op); void releaseFreeList(OPS op); void releaseEMIndex(OPS op); + + // Grow memory. void growEMShmseg(size_t nrows = 0); void growFLShmseg(); void growEMIndexShmseg(const size_t suggestedSize = 0); + void growIfNeededOnExtentCreate(); + + // Finish. void finishChanges(); EXPORT unsigned getFilesPerColumnPartition(); @@ -1134,22 +1170,26 @@ class ExtentMap : public Undoable void checkReloadConfig(); ShmKeys fShmKeys; + void makeUndoRecordRBTree(UndoRecordType type, const EMEntry& emEntry); + void undoChangesRBTree(); + void confirmChangesRBTree(); + bool fDebug; int _markInvalid(const LBID_t lbid, const execplan::CalpontSystemCatalog::ColDataType colDataType); - template + template void load(T* in); - /** @brief Loads the extent map from a file into memory. - * - * @param in (in) the file to load the extent map from. - * @param upgradeV4ToV5 (in) flag indicating whether we are upgrading - * extent map from v4 to v5. - */ - template + + template void loadVersion4or5(T* in, bool upgradeV4ToV5); - ExtentMapImpl* fPExtMapImpl; + ExtentMapRBTree::iterator findByLBID(const LBID_t lbid); + + using UndoRecordPair = std::pair; + std::vector undoRecordsRBTree; + + ExtentMapRBTreeImpl* fPExtMapRBTreeImpl; FreeListImpl* fPFreeListImpl; ExtentMapIndexImpl* fPExtMapIndexImpl_; };