From aeec468814d6fec720cdb2f0349a1585dfd625f9 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Wed, 20 May 2020 18:37:17 -0400 Subject: [PATCH 01/24] Fixed a number of bugs in storage manager, and added code to detect and recover from being killed while writing new objects. Conflicts: storage-manager/src/Synchronizer.cpp --- storage-manager/src/Cache.cpp | 14 ++++++++ storage-manager/src/Cache.h | 5 +++ storage-manager/src/IOCoordinator.cpp | 35 +++++++++++++------ storage-manager/src/PrefixCache.cpp | 50 ++++++++++++++++++++++----- storage-manager/src/PrefixCache.h | 9 ++++- storage-manager/src/Replicator.cpp | 2 +- storage-manager/src/Synchronizer.cpp | 15 ++++++-- 7 files changed, 107 insertions(+), 23 deletions(-) diff --git a/storage-manager/src/Cache.cpp b/storage-manager/src/Cache.cpp index 05a70adf3..de7b22c16 100644 --- a/storage-manager/src/Cache.cpp +++ b/storage-manager/src/Cache.cpp @@ -355,6 +355,20 @@ void Cache::configListener() logger->log(LOG_CRIT, "Cache/cache_size is not a number. Using current value = %zi",maxCacheSize); } } + +void Cache::repopulate() +{ + boost::unique_lock sl(lru_mutex); + + for (auto &pcache : prefixCaches) + pcache.second->repopulate(); +} + +void Cache::repopulate(const boost::filesystem::path &p) +{ + getPCache(p).repopulate(); +} + } diff --git a/storage-manager/src/Cache.h b/storage-manager/src/Cache.h index 12396550a..3ae57a342 100644 --- a/storage-manager/src/Cache.h +++ b/storage-manager/src/Cache.h @@ -92,6 +92,11 @@ class Cache : public boost::noncopyable , public ConfigListener void shutdown(); void printKPIs() const; + // Used to update accounting variables in the PrefixCaches when a potential error + // is detected. 
+ void repopulate(); + void repopulate(const boost::filesystem::path &prefix); + // test helpers const boost::filesystem::path &getCachePath() const; const boost::filesystem::path &getJournalPath() const; diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index 0b79d864a..e86ab2c83 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -505,6 +505,8 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin dataRemaining -= err; count += err; iocBytesWritten += err; + // get a new name for the object + newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + objectOffset); metadata.updateEntryLength(newObject.offset, (err + objectOffset)); cache->newObject(firstDir, newObject.key,err + objectOffset); newObjectKeys.push_back(newObject.key); @@ -634,14 +636,17 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t count += err; dataRemaining -= err; iocBytesWritten += err; + if (err < (int64_t) writeLength) + { + newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + newObject.offset); + metadata.updateEntry(newObject.offset, newObject.key, err + newObject.offset); + } cache->newObject(firstDir, newObject.key,err); newObjectKeys.push_back(newObject.key); if (err < (int64_t) writeLength) { //logger->log(LOG_ERR,"IOCoordinator::append(): newObject failed to complete write, %u of %u bytes written.",count,length); - // make the object reflect length actually written - metadata.updateEntryLength(newObject.offset, err); goto out; } } @@ -1135,7 +1140,10 @@ boost::shared_array IOCoordinator::mergeJournal(const char *object, con objFD = ::open(object, O_RDONLY); if (objFD < 0) + { + *_bytesReadOut = 0; return ret; + } ScopedCloser s1(objFD); ret.reset(new uint8_t[len]); @@ -1148,11 +1156,12 @@ boost::shared_array IOCoordinator::mergeJournal(const char *object, con int err = ::read(objFD, &ret[count], len - count); if 
(err < 0) { - char buf[80]; - logger->log(LOG_CRIT, "IOC::mergeJournal(): failed to read %s, got '%s'", object, strerror_r(errno, buf, 80)); int l_errno = errno; + char buf[80]; + logger->log(LOG_CRIT, "IOC::mergeJournal(): failed to read %s, got '%s'", object, strerror_r(l_errno, buf, 80)); ret.reset(); errno = l_errno; + *_bytesReadOut = count; return ret; } else if (err == 0) @@ -1171,17 +1180,18 @@ boost::shared_array IOCoordinator::mergeJournal(const char *object, con size_t mjimBytesRead = 0; int mjimerr = mergeJournalInMem(ret, len, journal, &mjimBytesRead); if (mjimerr) - { ret.reset(); - return ret; - } l_bytesRead += mjimBytesRead; + *_bytesReadOut = l_bytesRead; return ret; } journalFD = ::open(journal, O_RDONLY); if (journalFD < 0) + { + *_bytesReadOut = l_bytesRead; return ret; + } ScopedCloser s2(journalFD); boost::shared_array headertxt = seekToEndOfHeader1(journalFD, &l_bytesRead); @@ -1219,17 +1229,21 @@ boost::shared_array IOCoordinator::mergeJournal(const char *object, con err = ::read(journalFD, &ret[startReadingAt - offset + count], lengthOfRead - count); if (err < 0) { + int l_errno = errno; char buf[80]; - logger->log(LOG_ERR, "mergeJournal: got %s", strerror_r(errno, buf, 80)); + logger->log(LOG_ERR, "mergeJournal: got %s", strerror_r(l_errno, buf, 80)); ret.reset(); - return ret; + errno = l_errno; + l_bytesRead += count; + goto out; } else if (err == 0) { logger->log(LOG_ERR, "mergeJournal: got early EOF. 
offset=%ld, len=%ld, jOffset=%ld, jLen=%ld," " startReadingAt=%ld, lengthOfRead=%ld", offset, len, offlen[0], offlen[1], startReadingAt, lengthOfRead); ret.reset(); - return ret; + l_bytesRead += count; + goto out; } count += err; } @@ -1243,6 +1257,7 @@ boost::shared_array IOCoordinator::mergeJournal(const char *object, con // skip over this journal entry ::lseek(journalFD, offlen[1], SEEK_CUR); } +out: *_bytesReadOut = l_bytesRead; return ret; } diff --git a/storage-manager/src/PrefixCache.cpp b/storage-manager/src/PrefixCache.cpp index e29845b03..99a65dbba 100644 --- a/storage-manager/src/PrefixCache.cpp +++ b/storage-manager/src/PrefixCache.cpp @@ -127,12 +127,20 @@ PrefixCache::~PrefixCache() */ } -void PrefixCache::populate() +void PrefixCache::repopulate() +{ + lru_mutex.lock(); + populate(false); +} + +void PrefixCache::populate(bool useSync) { Synchronizer *sync = Synchronizer::get(); bf::directory_iterator dir(cachePrefix); bf::directory_iterator dend; vector newObjects; + lru.clear(); + m_lru.clear(); while (dir != dend) { // put everything in lru & m_lru @@ -143,13 +151,15 @@ void PrefixCache::populate() auto last = lru.end(); m_lru.insert(--last); currentCacheSize += bf::file_size(*dir); - newObjects.push_back(p.filename().string()); + if (useSync) + newObjects.push_back(p.filename().string()); } else if (p != cachePrefix/downloader->getTmpPath()) logger->log(LOG_WARNING, "Cache: found something in the cache that does not belong '%s'", p.string().c_str()); ++dir; } - sync->newObjects(firstDir, newObjects); + if (useSync) + sync->newObjects(firstDir, newObjects); newObjects.clear(); // account for what's in the journal dir @@ -164,7 +174,8 @@ void PrefixCache::populate() { size_t s = bf::file_size(*dir); currentCacheSize += s; - newJournals.push_back(pair(p.stem().string(), s)); + if (useSync) + newJournals.push_back(pair(p.stem().string(), s)); } else logger->log(LOG_WARNING, "Cache: found a file in the journal dir that does not belong '%s'", 
p.string().c_str()); @@ -174,7 +185,8 @@ void PrefixCache::populate() ++dir; } lru_mutex.unlock(); - sync->newJournalEntries(firstDir, newJournals); + if (useSync) + sync->newJournalEntries(firstDir, newJournals); } // be careful using this! SM should be idle. No ongoing reads or writes. @@ -380,14 +392,25 @@ void PrefixCache::newJournalEntry(size_t size) void PrefixCache::deletedJournal(size_t size) { boost::unique_lock s(lru_mutex); - assert(currentCacheSize >= size); - currentCacheSize -= size; + + //assert(currentCacheSize >= size); + if (currentCacheSize >= size) + currentCacheSize -= size; + else + { + ostringstream oss; + oss << "PrefixCache::deletedJournal(): Detected an accounting error." << + " Reloading cache metadata, this will pause IO activity briefly."; + logger->log(LOG_WARNING, oss.str().c_str()); + populate(false); + } } void PrefixCache::deletedObject(const string &key, size_t size) { boost::unique_lock s(lru_mutex); - assert(currentCacheSize >= size); + + //assert(currentCacheSize >= size); M_LRU_t::iterator mit = m_lru.find(key); assert(mit != m_lru.end()); @@ -397,7 +420,16 @@ void PrefixCache::deletedObject(const string &key, size_t size) doNotEvict.erase(mit->lit); lru.erase(mit->lit); m_lru.erase(mit); - currentCacheSize -= size; + if (currentCacheSize >= size) + currentCacheSize -= size; + else + { + ostringstream oss; + oss << "PrefixCache::deletedObject(): Detected an accounting error." 
<< + " Reloading cache metadata, this will pause IO activity briefly."; + logger->log(LOG_WARNING, oss.str().c_str()); + populate(false); + } } } diff --git a/storage-manager/src/PrefixCache.h b/storage-manager/src/PrefixCache.h index 1121275c7..da6b7fb66 100644 --- a/storage-manager/src/PrefixCache.h +++ b/storage-manager/src/PrefixCache.h @@ -77,6 +77,11 @@ class PrefixCache : public boost::noncopyable size_t getMaxCacheSize() const; void shutdown(); + // clears out cache structures and reloads them from cache/journal dir contents + // needed to potentially repair the cache's accounting error after detecting + // an error. + void repopulate(); + // test helpers const boost::filesystem::path &getCachePath(); const boost::filesystem::path &getJournalPath(); @@ -97,7 +102,9 @@ class PrefixCache : public boost::noncopyable SMLogging *logger; Downloader *downloader; - void populate(); + // useSync makes populate() tell Synchronizer about what it finds. + // set it to false when the system is already fully up. 
+ void populate(bool useSync = true); void _makeSpace(size_t size); /* The main PrefixCache structures */ diff --git a/storage-manager/src/Replicator.cpp b/storage-manager/src/Replicator.cpp index 6e387726c..82492736e 100644 --- a/storage-manager/src/Replicator.cpp +++ b/storage-manager/src/Replicator.cpp @@ -128,7 +128,7 @@ int Replicator::newObject(const boost::filesystem::path &filename, const uint8_t OPEN(objectFilename.c_str(), O_WRONLY | O_CREAT); size_t count = 0; while (count < length) { - err = ::pwrite(fd, &data[count], length - count, offset); + err = ::pwrite(fd, &data[count], length - count, offset + count); if (err <= 0) { if (count > 0) // return what was successfully written diff --git a/storage-manager/src/Synchronizer.cpp b/storage-manager/src/Synchronizer.cpp index ee47b9dfa..7276227cf 100644 --- a/storage-manager/src/Synchronizer.cpp +++ b/storage-manager/src/Synchronizer.cpp @@ -683,7 +683,7 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list while (count < size) { - err = ::write(newFD, data.get(), size - count); + err = ::write(newFD, &data[count], size - count); if (err < 0) { ::unlink(newCachePath.string().c_str()); @@ -693,8 +693,19 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list count += err; } numBytesWritten += size; - assert(bf::file_size(oldCachePath) == MetadataFile::getLengthFromKey(cloudKey)); + + //assert(bf::file_size(oldCachePath) == MetadataFile::getLengthFromKey(cloudKey)); cache->rename(prefix, cloudKey, newCloudKey, size - MetadataFile::getLengthFromKey(cloudKey)); + if (bf::file_size(oldCachePath) != MetadataFile::getLengthFromKey(cloudKey)) + { + ostringstream oss; + oss << "Synchronizer::synchronizeWithJournal(): detected a mismatch between file size and " << + "length stored in the object name. object name = " << cloudKey << " length-in-name = " << + MetadataFile::getLengthFromKey(cloudKey) << " real-length = " << bf::file_size(oldCachePath) + << ". 
Reloading cache metadata, this will pause IO activity briefly."; + logger->log(LOG_WARNING, oss.str().c_str()); + cache->repopulate(prefix); + } replicator->remove(oldCachePath); } From 4b9c1d9169f1991db847087c3e59b07c3a72ffce Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Thu, 21 May 2020 16:28:32 -0400 Subject: [PATCH 02/24] Cleaned up a little code of previous commit, added retry loops and a little better error handling to the code that writes journal entries. --- storage-manager/src/Replicator.cpp | 100 +++++++++++++++++++++++---- storage-manager/src/Replicator.h | 5 ++ storage-manager/src/Synchronizer.cpp | 12 ++-- 3 files changed, 100 insertions(+), 17 deletions(-) diff --git a/storage-manager/src/Replicator.cpp b/storage-manager/src/Replicator.cpp index 82492736e..b14f1107d 100644 --- a/storage-manager/src/Replicator.cpp +++ b/storage-manager/src/Replicator.cpp @@ -154,6 +154,67 @@ int Replicator::newNullObject(const boost::filesystem::path &filename,size_t len return err; } +ssize_t Replicator::_pwrite(int fd, const void *data, size_t length, off_t offset) +{ + ssize_t err; + size_t count = 0; + uint8_t *bData = (uint8_t *) data; + + do + { + err = ::pwrite(fd, &bData[count], length - count, offset + count); + if (err < 0 || (err == 0 && errno != EINTR)) + { + if (count > 0) + return count; + else + return err; + } + err += count; + } while (count < length); + + return count; +} + +ssize_t Replicator::_write(int fd, const void *data, size_t length) +{ + ssize_t err; + size_t count = 0; + uint8_t *bData = (uint8_t *) data; + + do + { + err = ::write(fd, &bData[count], length - count); + if (err < 0 || (err == 0 && errno != EINTR)) + { + if (count > 0) + return count; + else + return err; + } + err += count; + } while (count < length); + + return count; +} + +/* XXXPAT: I think we'll have to rewrite this function some; we'll have to at least clearly define + what happens in the various error scenarios. 
 + + To be more resilient in the face of hard errors, we may also want to redefine what a journal file is. + If/when we cannot fix the journal file in the face of an error, there are scenarios that the read code + will not be able to cope with. Ex, a journal entry that says it's 200 bytes long, but there are only + really 100 bytes. The read code has no way to tell the difference if there is an entry that follows + the bad entry, and that will cause an unrecoverable error. + + Initial thought on a sol'n. Make each journal entry its own file in a tmp dir, ordered by a sequence + number in the filename. Then, one entry cannot affect the others, and the end of the file is unambiguously + the end of the data. On successful write, move the file to where it should be. This would also prevent + the readers from ever seeing bad data, and possibly reduce the size of some critical sections. + + Benefits would be data integrity, and possibly add'l parallelism. The downside is of course, a higher + number of IO ops for the same operation. 
+*/ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const uint8_t *data, off_t offset, size_t length) { int fd, err; @@ -177,7 +238,7 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u bHeaderChanged = true; // create new journal file with header string header = (boost::format("{ \"version\" : \"%03i\", \"max_offset\" : \"%011u\" }") % version % thisEntryMaxOffset).str(); - err = ::write(fd, header.c_str(), header.length() + 1); + err = _write(fd, header.c_str(), header.length() + 1); l_errno = errno; repHeaderDataWritten += (header.length() + 1); if ((uint)err != (header.length() + 1)) @@ -238,26 +299,32 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u { bHeaderChanged = true; string header = (boost::format("{ \"version\" : \"%03i\", \"max_offset\" : \"%011u\" }") % version % thisEntryMaxOffset).str(); - err = ::pwrite(fd, header.c_str(), header.length() + 1,0); + err = _pwrite(fd, header.c_str(), header.length() + 1,0); + l_errno = errno; repHeaderDataWritten += (header.length() + 1); if ((uint)err != (header.length() + 1)) { // only the header was possibly changed rollback attempt mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Updating journal header failed. 
" "Attempting to rollback and continue."); - int rollbackErr = ::pwrite(fd, headerRollback.c_str(), headerRollback.length() + 1,0); + int rollbackErr = _pwrite(fd, headerRollback.c_str(), headerRollback.length() + 1,0); if ((uint)rollbackErr == (headerRollback.length() + 1)) mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Rollback of journal header success."); else mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Rollback of journal header failed!"); - return err; + errno = l_errno; + if (err < 0) + return err; + else + return 0; } } } off_t entryHeaderOffset = ::lseek(fd, 0, SEEK_END); - err = ::write(fd, offlen, JOURNAL_ENTRY_HEADER_SIZE); + err = _write(fd, offlen, JOURNAL_ENTRY_HEADER_SIZE); + l_errno = errno; repHeaderDataWritten += JOURNAL_ENTRY_HEADER_SIZE; if (err != JOURNAL_ENTRY_HEADER_SIZE) { @@ -266,12 +333,16 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u { mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: write journal entry header failed. Attempting to rollback and continue."); //attempt to rollback top level header - int rollbackErr = ::pwrite(fd, headerRollback.c_str(), headerRollback.length() + 1,0); + int rollbackErr = _pwrite(fd, headerRollback.c_str(), headerRollback.length() + 1,0); if ((uint)rollbackErr != (headerRollback.length() + 1)) { mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Rollback of journal header failed! (%s)", strerror_r(errno, errbuf, 80)); - return err; + errno = l_errno; + if (err < 0) + return err; + else + return 0; } } int rollbackErr = ::ftruncate(fd,entryHeaderOffset); @@ -279,13 +350,16 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u { mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Truncate to previous EOF failed! 
(%s)", strerror_r(errno, errbuf, 80)); - return err; + if (err < 0) + return err; + else + return 0; } + l_errno = errno; return err; - } while (count < length) { - err = ::write(fd, &data[count], length - count); + err = _write(fd, &data[count], length - count); if (err < 0 ) { l_errno = errno; @@ -301,7 +375,7 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u if (thisEntryMaxOffset > currentMaxOffset) { string header = (boost::format("{ \"version\" : \"%03i\", \"max_offset\" : \"%011u\" }") % version % thisEntryMaxOffset).str(); - int rollbackErr = ::pwrite(fd, header.c_str(), header.length() + 1,0); + int rollbackErr = _pwrite(fd, header.c_str(), header.length() + 1,0); if ((uint)rollbackErr != (header.length() + 1)) { mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Update of journal header failed! (%s)", @@ -312,7 +386,7 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u } // Update the journal entry header offlen[1] = count; - int rollbackErr = ::pwrite(fd, offlen, JOURNAL_ENTRY_HEADER_SIZE,entryHeaderOffset); + int rollbackErr = _pwrite(fd, offlen, JOURNAL_ENTRY_HEADER_SIZE,entryHeaderOffset); if ((uint)rollbackErr != JOURNAL_ENTRY_HEADER_SIZE) { mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Update of journal entry header failed! 
(%s)", @@ -337,7 +411,7 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u "Attempting to rollback and continue.", strerror_r(l_errno, errbuf, 80)); //attempt to rollback top level header string header = (boost::format("{ \"version\" : \"%03i\", \"max_offset\" : \"%011u\" }") % version % 0).str(); - int rollbackErr = ::pwrite(fd, header.c_str(), header.length() + 1,0); + int rollbackErr = _pwrite(fd, header.c_str(), header.length() + 1,0); if ((uint)rollbackErr != (header.length() + 1)) { mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Rollback of journal header failed (%s)!", diff --git a/storage-manager/src/Replicator.h b/storage-manager/src/Replicator.h index d4a6942f2..4cf96ac92 100644 --- a/storage-manager/src/Replicator.h +++ b/storage-manager/src/Replicator.h @@ -58,6 +58,11 @@ class Replicator private: Replicator(); + + // a couple helpers + ssize_t _write(int fd, const void *data, size_t len); + ssize_t _pwrite(int fd, const void *data, size_t len, off_t offset); + Config *mpConfig; SMLogging *mpLogger; std::string msJournalPath; diff --git a/storage-manager/src/Synchronizer.cpp b/storage-manager/src/Synchronizer.cpp index 7276227cf..b8e07ae39 100644 --- a/storage-manager/src/Synchronizer.cpp +++ b/storage-manager/src/Synchronizer.cpp @@ -694,9 +694,14 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list } numBytesWritten += size; - //assert(bf::file_size(oldCachePath) == MetadataFile::getLengthFromKey(cloudKey)); - cache->rename(prefix, cloudKey, newCloudKey, size - MetadataFile::getLengthFromKey(cloudKey)); - if (bf::file_size(oldCachePath) != MetadataFile::getLengthFromKey(cloudKey)) + size_t oldSize = bf::file_size(oldCachePath); + + cache->rename(prefix, cloudKey, newCloudKey, size - oldSize); + replicator->remove(oldCachePath); + + // This condition is probably irrelevant for correct functioning now, + // but it should be very rare so what the hell. 
+ if (oldSize != MetadataFile::getLengthFromKey(cloudKey)) { ostringstream oss; oss << "Synchronizer::synchronizeWithJournal(): detected a mismatch between file size and " << @@ -706,7 +711,6 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list logger->log(LOG_WARNING, oss.str().c_str()); cache->repopulate(prefix); } - replicator->remove(oldCachePath); } mergeDiff += size - originalSize; From 31e06e77a206e603bf52bd4a5d4ffaf0e5fb6f98 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Thu, 21 May 2020 17:34:36 -0400 Subject: [PATCH 03/24] Fixed the fix. Actually need to rename the file after you pick a new name for it LOL! --- storage-manager/src/IOCoordinator.cpp | 9 ++++++++- storage-manager/src/MetadataFile.h | 1 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index e86ab2c83..86e0a9989 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -506,8 +506,13 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin count += err; iocBytesWritten += err; // get a new name for the object + + bf::path oldPath = firstDir/newObject.key; newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + objectOffset); - metadata.updateEntryLength(newObject.offset, (err + objectOffset)); + ::rename(oldPath.string().c_str(), (firstDir/newObject.key).string().c_str()); + + // rename and resize the object in metadata + metadata.updateEntry(newObject.offset, newObject.key, (err + objectOffset)); cache->newObject(firstDir, newObject.key,err + objectOffset); newObjectKeys.push_back(newObject.key); goto out; @@ -638,7 +643,9 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t iocBytesWritten += err; if (err < (int64_t) writeLength) { + bf::path oldPath = firstDir/newObject.key; newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + newObject.offset); + 
::rename(oldPath.string().c_str(), (firstDir/newObject.key).string().c_str()); metadata.updateEntry(newObject.offset, newObject.key, err + newObject.offset); } cache->newObject(firstDir, newObject.key,err); diff --git a/storage-manager/src/MetadataFile.h b/storage-manager/src/MetadataFile.h index bd70a581c..38a36c006 100644 --- a/storage-manager/src/MetadataFile.h +++ b/storage-manager/src/MetadataFile.h @@ -77,7 +77,6 @@ class MetadataFile // removes p from the json cache. p should be a fully qualified metadata file static void deletedMeta(const boost::filesystem::path &p); - // TBD: this may have to go; there may be no use case where only the uuid needs to change. static std::string getNewKeyFromOldKey(const std::string &oldKey, size_t length=0); static std::string getNewKey(std::string sourceName, size_t offset, size_t length); static off_t getOffsetFromKey(const std::string &key); From 3497191d0cb9b27d08aef6c6e6c7133eba7e5714 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Thu, 21 May 2020 21:43:05 -0400 Subject: [PATCH 04/24] Fixed a silly error. --- storage-manager/src/Replicator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage-manager/src/Replicator.cpp b/storage-manager/src/Replicator.cpp index b14f1107d..c2a69f870 100644 --- a/storage-manager/src/Replicator.cpp +++ b/storage-manager/src/Replicator.cpp @@ -170,7 +170,7 @@ ssize_t Replicator::_pwrite(int fd, const void *data, size_t length, off_t offse else return err; } - err += count; + count += err; } while (count < length); return count; @@ -192,7 +192,7 @@ ssize_t Replicator::_write(int fd, const void *data, size_t length) else return err; } - err += count; + count += err; } while (count < length); return count; From c29050899d4977da2e66e67013a3876363281799 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Fri, 22 May 2020 08:06:04 -0400 Subject: [PATCH 05/24] Took out the cache repopulate idea. Now we will only have warnings of problems. 
I realized we can't reliably tell how big the cache is while the system is running. There's a window where write/append has added / is adding a journal file but hasn't told Cache about it yet. This capability will have to wait for now. This shouldn't be a problem because in theory, we will no longer have data whose size is not consistent with metadata stored outside of the file. If we do, it means there was either a hard failure, or SM was killed. Either way, SM will be restarted and the cache will populate its meta fresh then. --- storage-manager/src/Cache.cpp | 13 ------------ storage-manager/src/Cache.h | 5 ----- storage-manager/src/PrefixCache.cpp | 30 +++++++++------------------- storage-manager/src/PrefixCache.h | 9 +-------- storage-manager/src/Synchronizer.cpp | 4 +--- 5 files changed, 11 insertions(+), 50 deletions(-) diff --git a/storage-manager/src/Cache.cpp b/storage-manager/src/Cache.cpp index de7b22c16..3f9db3460 100644 --- a/storage-manager/src/Cache.cpp +++ b/storage-manager/src/Cache.cpp @@ -356,19 +356,6 @@ void Cache::configListener() } } -void Cache::repopulate() -{ - boost::unique_lock sl(lru_mutex); - - for (auto &pcache : prefixCaches) - pcache.second->repopulate(); -} - -void Cache::repopulate(const boost::filesystem::path &p) -{ - getPCache(p).repopulate(); -} - } diff --git a/storage-manager/src/Cache.h b/storage-manager/src/Cache.h index 3ae57a342..12396550a 100644 --- a/storage-manager/src/Cache.h +++ b/storage-manager/src/Cache.h @@ -92,11 +92,6 @@ class Cache : public boost::noncopyable , public ConfigListener void shutdown(); void printKPIs() const; - // Used to update accounting variables in the PrefixCaches when a potential error - // is detected. 
- void repopulate(); - void repopulate(const boost::filesystem::path &prefix); - // test helpers const boost::filesystem::path &getCachePath() const; const boost::filesystem::path &getJournalPath() const; diff --git a/storage-manager/src/PrefixCache.cpp b/storage-manager/src/PrefixCache.cpp index 99a65dbba..85e5c68c7 100644 --- a/storage-manager/src/PrefixCache.cpp +++ b/storage-manager/src/PrefixCache.cpp @@ -127,13 +127,7 @@ PrefixCache::~PrefixCache() */ } -void PrefixCache::repopulate() -{ - lru_mutex.lock(); - populate(false); -} - -void PrefixCache::populate(bool useSync) +void PrefixCache::populate() { Synchronizer *sync = Synchronizer::get(); bf::directory_iterator dir(cachePrefix); @@ -151,15 +145,13 @@ void PrefixCache::populate(bool useSync) auto last = lru.end(); m_lru.insert(--last); currentCacheSize += bf::file_size(*dir); - if (useSync) - newObjects.push_back(p.filename().string()); + newObjects.push_back(p.filename().string()); } else if (p != cachePrefix/downloader->getTmpPath()) logger->log(LOG_WARNING, "Cache: found something in the cache that does not belong '%s'", p.string().c_str()); ++dir; } - if (useSync) - sync->newObjects(firstDir, newObjects); + sync->newObjects(firstDir, newObjects); newObjects.clear(); // account for what's in the journal dir @@ -174,8 +166,7 @@ void PrefixCache::populate(bool useSync) { size_t s = bf::file_size(*dir); currentCacheSize += s; - if (useSync) - newJournals.push_back(pair(p.stem().string(), s)); + newJournals.push_back(pair(p.stem().string(), s)); } else logger->log(LOG_WARNING, "Cache: found a file in the journal dir that does not belong '%s'", p.string().c_str()); @@ -185,8 +176,7 @@ void PrefixCache::populate(bool useSync) ++dir; } lru_mutex.unlock(); - if (useSync) - sync->newJournalEntries(firstDir, newJournals); + sync->newJournalEntries(firstDir, newJournals); } // be careful using this! SM should be idle. No ongoing reads or writes. 
@@ -399,10 +389,9 @@ void PrefixCache::deletedJournal(size_t size) else { ostringstream oss; - oss << "PrefixCache::deletedJournal(): Detected an accounting error." << - " Reloading cache metadata, this will pause IO activity briefly."; + oss << "PrefixCache::deletedJournal(): Detected an accounting error."; logger->log(LOG_WARNING, oss.str().c_str()); - populate(false); + currentCacheSize = 0; } } @@ -425,10 +414,9 @@ void PrefixCache::deletedObject(const string &key, size_t size) else { ostringstream oss; - oss << "PrefixCache::deletedObject(): Detected an accounting error." << - " Reloading cache metadata, this will pause IO activity briefly."; + oss << "PrefixCache::deletedObject(): Detected an accounting error."; logger->log(LOG_WARNING, oss.str().c_str()); - populate(false); + currentCacheSize = 0; } } } diff --git a/storage-manager/src/PrefixCache.h b/storage-manager/src/PrefixCache.h index da6b7fb66..1121275c7 100644 --- a/storage-manager/src/PrefixCache.h +++ b/storage-manager/src/PrefixCache.h @@ -77,11 +77,6 @@ class PrefixCache : public boost::noncopyable size_t getMaxCacheSize() const; void shutdown(); - // clears out cache structures and reloads them from cache/journal dir contents - // needed to potentially repair the cache's accounting error after detecting - // an error. - void repopulate(); - // test helpers const boost::filesystem::path &getCachePath(); const boost::filesystem::path &getJournalPath(); @@ -102,9 +97,7 @@ class PrefixCache : public boost::noncopyable SMLogging *logger; Downloader *downloader; - // useSync makes populate() tell Synchronizer about what it finds. - // set it to false when the system is already fully up. 
- void populate(bool useSync = true); + void populate(); void _makeSpace(size_t size); /* The main PrefixCache structures */ diff --git a/storage-manager/src/Synchronizer.cpp b/storage-manager/src/Synchronizer.cpp index b8e07ae39..20bd39213 100644 --- a/storage-manager/src/Synchronizer.cpp +++ b/storage-manager/src/Synchronizer.cpp @@ -706,10 +706,8 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list ostringstream oss; oss << "Synchronizer::synchronizeWithJournal(): detected a mismatch between file size and " << "length stored in the object name. object name = " << cloudKey << " length-in-name = " << - MetadataFile::getLengthFromKey(cloudKey) << " real-length = " << bf::file_size(oldCachePath) - << ". Reloading cache metadata, this will pause IO activity briefly."; + MetadataFile::getLengthFromKey(cloudKey) << " real-length = " << bf::file_size(oldCachePath); logger->log(LOG_WARNING, oss.str().c_str()); - cache->repopulate(prefix); } } From 43585d55cd417f1424363dd731b3c95d60af1652 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Fri, 22 May 2020 08:45:28 -0400 Subject: [PATCH 06/24] Added error detection to the new ::rename calls. 
--- storage-manager/src/IOCoordinator.cpp | 31 ++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index 86e0a9989..d37dcf707 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -505,11 +505,21 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin dataRemaining -= err; count += err; iocBytesWritten += err; - // get a new name for the object - bf::path oldPath = firstDir/newObject.key; + // get a new name for the object + string oldKey = newObject.key; newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + objectOffset); - ::rename(oldPath.string().c_str(), (firstDir/newObject.key).string().c_str()); + int renameErr = ::rename((firstDir/oldKey).string().c_str(), (firstDir/newObject.key).string().c_str()); + int renameErrno = errno; + if (renameErr < 0) + { + ostringstream oss; + char buf[80]; + oss << "IOCoordinator::write(): Failed to rename " << (firstDir/oldKey).string() << " to " << + (firstDir/newObject.key).string() << "! 
Got " << strerror_r(renameErrno, buf, 80); + logger->log(LOG_ERR, oss.str().c_str()); + newObject.key = oldKey; + } // rename and resize the object in metadata metadata.updateEntry(newObject.offset, newObject.key, (err + objectOffset)); @@ -643,9 +653,20 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t iocBytesWritten += err; if (err < (int64_t) writeLength) { - bf::path oldPath = firstDir/newObject.key; + string oldKey = newObject.key; newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + newObject.offset); - ::rename(oldPath.string().c_str(), (firstDir/newObject.key).string().c_str()); + int renameErr = ::rename((firstDir/oldKey).string().c_str(), (firstDir/newObject.key).string().c_str()); + int renameErrno = errno; + if (renameErr < 0) + { + ostringstream oss; + char buf[80]; + oss << "IOCoordinator::write(): Failed to rename " << (firstDir/oldKey).string() << " to " << + (firstDir/newObject.key).string() << "! Got " << strerror_r(renameErrno, buf, 80); + logger->log(LOG_ERR, oss.str().c_str()); + newObject.key = oldKey; + } + metadata.updateEntry(newObject.offset, newObject.key, err + newObject.offset); } cache->newObject(firstDir, newObject.key,err); From 5d8470b91f0b8bcd7d294adc41ecd623a9e75ef4 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Fri, 22 May 2020 08:50:17 -0400 Subject: [PATCH 07/24] Fixed a length parameter after a partial write. 
--- storage-manager/src/IOCoordinator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index d37dcf707..2dc3c3630 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -667,7 +667,7 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t newObject.key = oldKey; } - metadata.updateEntry(newObject.offset, newObject.key, err + newObject.offset); + metadata.updateEntry(newObject.offset, newObject.key, err); } cache->newObject(firstDir, newObject.key,err); newObjectKeys.push_back(newObject.key); From 359beb9c96cd0651b38eb35bf6333c36e9e46c5c Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Fri, 22 May 2020 13:10:21 -0400 Subject: [PATCH 08/24] Suppressed logging self-correcting problems. It will start logging as an err if it does not self-correct after 10 attempts, and will escalate to crit after 20 attempts. Also fixed a silly error where it was checking the file size after it deleted the file. --- storage-manager/src/Synchronizer.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/storage-manager/src/Synchronizer.cpp b/storage-manager/src/Synchronizer.cpp index 20bd39213..100a265ef 100644 --- a/storage-manager/src/Synchronizer.cpp +++ b/storage-manager/src/Synchronizer.cpp @@ -409,6 +409,7 @@ void Synchronizer::process(list::iterator name) s.unlock(); bool success = false; + int retryCount = 0; while (!success) { assert(!s.owns_lock()); @@ -434,8 +435,11 @@ void Synchronizer::process(list::iterator name) success = true; } catch(exception &e) { - logger->log(LOG_CRIT, "Synchronizer::process(): error sync'ing %s opFlags=%d, got '%s'. 
Retrying...", key.c_str(), - pending->opFlags, e.what()); + // these are often self-resolving, so we will suppress logging it for 10 iterations, then escalate + // to error, then to crit + if (++retryCount >= 10) + logger->log((retryCount < 20 ? LOG_ERR : LOG_CRIT), "Synchronizer::process(): error sync'ing %s opFlags=%d, got '%s'. Retrying...", key.c_str(), + pending->opFlags, e.what()); success = false; sleep(1); continue; @@ -706,7 +710,7 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list ostringstream oss; oss << "Synchronizer::synchronizeWithJournal(): detected a mismatch between file size and " << "length stored in the object name. object name = " << cloudKey << " length-in-name = " << - MetadataFile::getLengthFromKey(cloudKey) << " real-length = " << bf::file_size(oldCachePath); + MetadataFile::getLengthFromKey(cloudKey) << " real-length = " << oldSize; logger->log(LOG_WARNING, oss.str().c_str()); } } From faa35ebeebd53af28febad76d887fd1c62aa23c0 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Fri, 22 May 2020 17:55:19 -0400 Subject: [PATCH 09/24] Tentative commit, hunting down a source of misbehavior. 
Conflicts: storage-manager/src/IOCoordinator.cpp --- storage-manager/src/Downloader.cpp | 8 ++++ storage-manager/src/IOCoordinator.cpp | 57 +++++++++++++++++++++++---- storage-manager/src/LocalStorage.cpp | 6 +++ storage-manager/src/Synchronizer.cpp | 27 +++++++++++-- 4 files changed, 88 insertions(+), 10 deletions(-) diff --git a/storage-manager/src/Downloader.cpp b/storage-manager/src/Downloader.cpp index 0c230f566..c6ea05198 100644 --- a/storage-manager/src/Downloader.cpp +++ b/storage-manager/src/Downloader.cpp @@ -18,6 +18,7 @@ #include "Downloader.h" #include "Config.h" #include "SMLogging.h" +#include "MetadataFile.h" #include #include #include @@ -166,6 +167,13 @@ void Downloader::Download::operator()() bf::remove(tmpFile); size = 0; } + if (size != MetadataFile::getLengthFromKey(key)) + { + ostringstream oss; + SMLogging *logr = SMLogging::get(); + oss << "Downloader: got a file with a bad length field. key = " << key << " actual size = " << size; + logr->log(LOG_ERR, oss.str().c_str()); + } // move it to its proper place boost::system::error_code berr; diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index 2dc3c3630..687eb434e 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -488,6 +488,8 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin //log error and abort l_errno = errno; logger->log(LOG_ERR,"IOCoordinator::write(): Failed newObject."); + metadata.removeEntry(newObject.offset); + replicator->remove(firstDir/newObject.key); errno = l_errno; if (count == 0) // if no data has been written yet, it's safe to return -1 here. 
return -1; @@ -509,14 +511,17 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin // get a new name for the object string oldKey = newObject.key; newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + objectOffset); - int renameErr = ::rename((firstDir/oldKey).string().c_str(), (firstDir/newObject.key).string().c_str()); + ostringstream os; + os << "IOCoordinator::write(): renaming " << oldKey << " to " << newObject.key; + logger->log(LOG_DEBUG, os.str().c_str()); + int renameErr = ::rename((cachePath/firstDir/oldKey).string().c_str(), (cachePath/firstDir/newObject.key).string().c_str()); int renameErrno = errno; if (renameErr < 0) { ostringstream oss; char buf[80]; - oss << "IOCoordinator::write(): Failed to rename " << (firstDir/oldKey).string() << " to " << - (firstDir/newObject.key).string() << "! Got " << strerror_r(renameErrno, buf, 80); + oss << "IOCoordinator::write(): Failed to rename " << (cachePath/firstDir/oldKey).string() << " to " << + (cachePath/firstDir/newObject.key).string() << "! 
Got " << strerror_r(renameErrno, buf, 80); logger->log(LOG_ERR, oss.str().c_str()); newObject.key = oldKey; } @@ -527,6 +532,14 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin newObjectKeys.push_back(newObject.key); goto out; } + + if (bf::file_size(cachePath/firstDir/newObject.key) != MetadataFile::getLengthFromKey(newObject.key)) + { + ostringstream oss; + oss << "IOCoordinator::write(): detected bad length field in " << newObject.key + << " real size = " << bf::file_size(cachePath/firstDir/newObject.key); + logger->log(LOG_ERR, oss.str().c_str()); + } cache->newObject(firstDir, newObject.key,writeLength + objectOffset); newObjectKeys.push_back(newObject.key); @@ -635,6 +648,8 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t l_errno = errno; //log error and abort logger->log(LOG_ERR,"IOCoordinator::append(): Failed newObject."); + metadata.removeEntry(newObject.offset); + replicator->remove(firstDir/newObject.key); errno = l_errno; // if no data was written successfully yet, it's safe to return -1 here. if (count == 0) @@ -655,20 +670,32 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t { string oldKey = newObject.key; newObject.key = metadata.getNewKeyFromOldKey(newObject.key, err + newObject.offset); - int renameErr = ::rename((firstDir/oldKey).string().c_str(), (firstDir/newObject.key).string().c_str()); + ostringstream os; + os << "IOCoordinator::append(): renaming " << oldKey << " to " << newObject.key; + logger->log(LOG_DEBUG, os.str().c_str()); + int renameErr = ::rename((cachePath/firstDir/oldKey).string().c_str(), (cachePath/firstDir/newObject.key).string().c_str()); int renameErrno = errno; if (renameErr < 0) { ostringstream oss; char buf[80]; - oss << "IOCoordinator::write(): Failed to rename " << (firstDir/oldKey).string() << " to " << - (firstDir/newObject.key).string() << "! 
Got " << strerror_r(renameErrno, buf, 80); + oss << "IOCoordinator::write(): Failed to rename " << (cachePath/firstDir/oldKey).string() << " to " << + (cachePath/firstDir/newObject.key).string() << "! Got " << strerror_r(renameErrno, buf, 80); logger->log(LOG_ERR, oss.str().c_str()); newObject.key = oldKey; } metadata.updateEntry(newObject.offset, newObject.key, err); } + + if (bf::file_size(cachePath/firstDir/newObject.key) != MetadataFile::getLengthFromKey(newObject.key)) + { + ostringstream oss; + oss << "IOCoordinator::write(): detected bad length field in " << newObject.key + << " real size = " << bf::file_size(cachePath/firstDir/newObject.key); + logger->log(LOG_ERR, oss.str().c_str()); + } + cache->newObject(firstDir, newObject.key,err); newObjectKeys.push_back(newObject.key); @@ -1034,7 +1061,7 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) for (const auto &object : objects) { bf::path journalFile = journalPath/firstDir1/(object.key + ".journal"); - metadataObject newObj = meta2.addMetadataObject(filename2, object.length); + metadataObject newObj = meta2.addMetadataObject(filename2, MetadataFile::getLengthFromKey(object.key)); assert(newObj.offset == object.offset); err = cs->copyObject(object.key, newObj.key); if (err) @@ -1049,6 +1076,22 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) ", dest = " + filename2 + ". 
Object " + object.key + " does not exist in either " "cloud storage or the cache!"); + if (bf::file_size(cachedObjPath) != MetadataFile::getLengthFromKey(object.key)) + { + ostringstream oss; + oss << "CopyFile: found a size mismatch in " << cachedObjPath << + " real size = " << bf::file_size(cachedObjPath); + logger->log(LOG_ERR, oss.str().c_str()); + } + + if (MetadataFile::getLengthFromKey(object.key) != MetadataFile::getLengthFromKey(newObj.key)) + { + ostringstream oss; + oss << "CopyFile: found a size mismatch in src and dest keys src = " << object.key << + " dest = " << newObj.key; + logger->log(LOG_ERR, oss.str().c_str()); + } + // put the copy in cloudstorage err = cs->putObject(cachedObjPath.string(), newObj.key); if (err) diff --git a/storage-manager/src/LocalStorage.cpp b/storage-manager/src/LocalStorage.cpp index 4f6c31958..c80e571fc 100644 --- a/storage-manager/src/LocalStorage.cpp +++ b/storage-manager/src/LocalStorage.cpp @@ -101,8 +101,11 @@ int LocalStorage::copy(const bf::path &source, const bf::path &dest) if (err) { errno = err.value(); + ::unlink(dest.string().c_str()); return -1; } + if (bf::file_size(source) != bf::file_size(dest)) + logger->log(LOG_ERR, "LocalStorage::copy: partially copied a file somehow"); return 0; } @@ -216,6 +219,7 @@ int LocalStorage::putObject(boost::shared_array data, size_t len, const l_errno = errno; //logger->log(LOG_CRIT, "LocalStorage::putObject(): Failed to write to %s, got '%s'", c_dest, strerror_r(errno, buf, 80)); close(fd); + ::unlink(c_dest); errno = l_errno; bytesWritten += count; return err; @@ -240,6 +244,8 @@ int LocalStorage::copyObject(const string &source, const string &dest) size_t _size = bf::file_size(prefix/source); bytesRead += _size; bytesWritten += _size; + if (bf::file_size(prefix/source) != bf::file_size(prefix/dest)) + logger->log(LOG_ERR, "LocalStorage::copyObject(): partially copied a file somehow"); } return ret; } diff --git a/storage-manager/src/Synchronizer.cpp 
b/storage-manager/src/Synchronizer.cpp index 100a265ef..da2820065 100644 --- a/storage-manager/src/Synchronizer.cpp +++ b/storage-manager/src/Synchronizer.cpp @@ -437,7 +437,7 @@ void Synchronizer::process(list::iterator name) catch(exception &e) { // these are often self-resolving, so we will suppress logging it for 10 iterations, then escalate // to error, then to crit - if (++retryCount >= 10) + //if (++retryCount >= 10) logger->log((retryCount < 20 ? LOG_ERR : LOG_CRIT), "Synchronizer::process(): error sync'ing %s opFlags=%d, got '%s'. Retrying...", key.c_str(), pending->opFlags, e.what()); success = false; @@ -467,7 +467,7 @@ void Synchronizer::synchronize(const string &sourceFile, list::iterator { ScopedReadLock s(ioc, sourceFile); - string &key = *it; + string key = *it; size_t pos = key.find_first_of('/'); bf::path prefix = key.substr(0, pos); string cloudKey = key.substr(pos + 1); @@ -499,7 +499,6 @@ void Synchronizer::synchronize(const string &sourceFile, list::iterator if (exists) return; - // TODO: should be safe to check with Cache instead of a file existence check exists = cache->exists(prefix, cloudKey); if (!exists) { @@ -507,9 +506,17 @@ void Synchronizer::synchronize(const string &sourceFile, list::iterator return; } + if (bf::file_size(cachePath/key) != MetadataFile::getLengthFromKey(cloudKey)) + { + ostringstream oss; + oss << "Synchronizer::synchronize(): found a size mismatch in key = " << cloudKey << + " real size = " << bf::file_size(cachePath/key); + logger->log(LOG_ERR, oss.str().c_str()); + } err = cs->putObject((cachePath / key).string(), cloudKey); if (err) throw runtime_error(string("synchronize(): uploading ") + key + ", got " + strerror_r(errno, buf, 80)); + numBytesRead += mdEntry.length; bytesReadBySync += mdEntry.length; numBytesUploaded += mdEntry.length; @@ -658,6 +665,20 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list // get a new key for the resolved version & upload it string newCloudKey = 
MetadataFile::getNewKeyFromOldKey(cloudKey, size); string newKey = (prefix/newCloudKey).string(); + +try { + if (size != MetadataFile::getLengthFromKey(newCloudKey)) + { + ostringstream oss; + oss << "SyncWithJournal: detected the file size mismatch on the merged object somehow. " << + "key = " << newCloudKey << "real size = " << bf::file_size(prefix/newCloudKey); + logger->log(LOG_ERR, oss.str().c_str()); + } +} catch(exception &e) +{ + logger->log(LOG_ERR, "DEB4"); +} + err = cs->putObject(data, size, newCloudKey); if (err) { From c2c23b8098230a9c9e48826bf389e6374d9be7b7 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Fri, 22 May 2020 18:39:30 -0400 Subject: [PATCH 10/24] Snapshotting the changes so far, we'll have to come back to it later. --- storage-manager/src/IOCoordinator.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index 687eb434e..ee69cc6de 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -1061,6 +1061,10 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) for (const auto &object : objects) { bf::path journalFile = journalPath/firstDir1/(object.key + ".journal"); + // XXXPAT: There is a risk from using the length in the key here. If SM got killed + // in the middle of a write, it will have the _intended_ length of the object, not the + // actual length. 
+ // see MCOL-3459 metadataObject newObj = meta2.addMetadataObject(filename2, MetadataFile::getLengthFromKey(object.key)); assert(newObj.offset == object.offset); err = cs->copyObject(object.key, newObj.key); @@ -1081,6 +1085,7 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) ostringstream oss; oss << "CopyFile: found a size mismatch in " << cachedObjPath << " real size = " << bf::file_size(cachedObjPath); + // XXXPAT: get a new key here logger->log(LOG_ERR, oss.str().c_str()); } From 33558881ab2fa1aaccfd184aea0b2b6a96d460d2 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Tue, 26 May 2020 16:48:01 -0400 Subject: [PATCH 11/24] Fixed the problem with using the correct length for new objects in copyFile. The metadata should contain the merged length, the object name should contain the pre-merged length. --- storage-manager/src/IOCoordinator.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index ee69cc6de..2f693925e 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -1024,9 +1024,13 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) errno = ENOENT; return -1; } - if (bf::exists(metaFile2)) + { + cout << "copyFile: deleting previous version of " << metaFile2 << endl; deleteMetaFile(metaFile2); + ++filesDeleted; + } + // since we don't implement mkdir(), assume the caller did that and // create any necessary parent dirs for filename2 try @@ -1051,6 +1055,7 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) if (meta2.exists()) { + cout << "copyFile: this shouldn't happen" << endl; meta2.removeAllEntries(); ++filesDeleted; } @@ -1061,12 +1066,13 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) for (const auto &object : objects) { bf::path journalFile = journalPath/firstDir1/(object.key + 
".journal"); - // XXXPAT: There is a risk from using the length in the key here. If SM got killed - // in the middle of a write, it will have the _intended_ length of the object, not the - // actual length. - // see MCOL-3459 - metadataObject newObj = meta2.addMetadataObject(filename2, MetadataFile::getLengthFromKey(object.key)); - assert(newObj.offset == object.offset); + + // originalLength = the length of the object before journal entries. + // the length in the metadata is the length after journal entries + size_t originalLength = MetadataFile::getLengthFromKey(object.key); + metadataObject newObj = meta2.addMetadataObject(filename2, originalLength); + if (originalLength != object.length) + meta2.updateEntryLength(newObj.offset, object.length); err = cs->copyObject(object.key, newObj.key); if (err) { From 6fd24d2d067fd540b35f965cd83c9499e27ef23d Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Wed, 27 May 2020 13:43:47 -0400 Subject: [PATCH 12/24] Added code to delete orphaned objects from the cache & from cloud storage. --- storage-manager/src/Cache.cpp | 1 - storage-manager/src/Replicator.cpp | 1 + storage-manager/src/Synchronizer.cpp | 48 +++++++++++++++++++++++++--- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/storage-manager/src/Cache.cpp b/storage-manager/src/Cache.cpp index 3f9db3460..05a70adf3 100644 --- a/storage-manager/src/Cache.cpp +++ b/storage-manager/src/Cache.cpp @@ -355,7 +355,6 @@ void Cache::configListener() logger->log(LOG_CRIT, "Cache/cache_size is not a number. Using current value = %zi",maxCacheSize); } } - } diff --git a/storage-manager/src/Replicator.cpp b/storage-manager/src/Replicator.cpp index c2a69f870..542f00060 100644 --- a/storage-manager/src/Replicator.cpp +++ b/storage-manager/src/Replicator.cpp @@ -350,6 +350,7 @@ int Replicator::addJournalEntry(const boost::filesystem::path &filename, const u { mpLogger->log(LOG_CRIT, "Replicator::addJournalEntry: Truncate to previous EOF failed! 
(%s)", strerror_r(errno, errbuf, 80)); + errno = l_errno; if (err < 0) return err; else diff --git a/storage-manager/src/Synchronizer.cpp b/storage-manager/src/Synchronizer.cpp index da2820065..2474dc52a 100644 --- a/storage-manager/src/Synchronizer.cpp +++ b/storage-manager/src/Synchronizer.cpp @@ -474,11 +474,26 @@ void Synchronizer::synchronize(const string &sourceFile, list::iterator char buf[80]; bool exists = false; int err; + bf::path objectPath = cachePath/key; MetadataFile md(sourceFile, MetadataFile::no_create_t(),true); if (!md.exists()) { logger->log(LOG_DEBUG, "synchronize(): no metadata found for %s. It must have been deleted.", sourceFile.c_str()); + try + { + if (!bf::exists(objectPath)) + return; + size_t size = bf::file_size(objectPath); + replicator->remove(objectPath); + cache->deletedObject(prefix, cloudKey, size); + cs->deleteObject(cloudKey); + } + catch (exception &e) + { + logger->log(LOG_DEBUG, "synchronize(): failed to remove orphaned object '%s' from the cache, got %s", + objectPath.string().c_str(), e.what()); + } return; } @@ -506,14 +521,14 @@ void Synchronizer::synchronize(const string &sourceFile, list::iterator return; } - if (bf::file_size(cachePath/key) != MetadataFile::getLengthFromKey(cloudKey)) + if (bf::file_size(objectPath) != MetadataFile::getLengthFromKey(cloudKey)) { ostringstream oss; oss << "Synchronizer::synchronize(): found a size mismatch in key = " << cloudKey << - " real size = " << bf::file_size(cachePath/key); + " real size = " << bf::file_size(objectPath); logger->log(LOG_ERR, oss.str().c_str()); } - err = cs->putObject((cachePath / key).string(), cloudKey); + err = cs->putObject(objectPath.string(), cloudKey); if (err) throw runtime_error(string("synchronize(): uploading ") + key + ", got " + strerror_r(errno, buf, 80)); @@ -521,7 +536,7 @@ void Synchronizer::synchronize(const string &sourceFile, list::iterator bytesReadBySync += mdEntry.length; numBytesUploaded += mdEntry.length; ++objectsSyncedWithNoJournal; 
- replicator->remove((cachePath/key), Replicator::NO_LOCAL); + replicator->remove(objectPath, Replicator::NO_LOCAL); } void Synchronizer::synchronizeDelete(const string &sourceFile, list::iterator &it) @@ -546,6 +561,29 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list if (!md.exists()) { logger->log(LOG_DEBUG, "synchronizeWithJournal(): no metadata found for %s. It must have been deleted.", sourceFile.c_str()); + try + { + bf::path objectPath = cachePath/key; + if (bf::exists(objectPath)) + { + size_t objSize = bf::file_size(objectPath); + replicator->remove(objectPath); + cache->deletedObject(prefix, cloudKey, objSize); + cs->deleteObject(cloudKey); + } + bf::path jPath = journalPath/(key + ".journal"); + if (bf::exists(jPath)) + { + size_t jSize = bf::file_size(jPath); + replicator->remove(jPath); + cache->deletedJournal(prefix, jSize); + } + } + catch(exception &e) + { + logger->log(LOG_DEBUG, "synchronizeWithJournal(): failed to remove orphaned object '%s' from the cache, got %s", + (cachePath/key).string().c_str(), e.what()); + } return; } @@ -559,7 +597,7 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list //assert(key == mdEntry->key); <--- I suspect this can happen in a truncate + write situation + a deep sync queue bf::path oldCachePath = cachePath / key; - string journalName = (journalPath/ (key + ".journal")).string(); + string journalName = (journalPath/(key + ".journal")).string(); if (!bf::exists(journalName)) { From cd5e87210421e324e1341cd8dc12cc9cc37c330a Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Wed, 27 May 2020 14:57:10 -0400 Subject: [PATCH 13/24] Removed unnecessary debugging printouts/logging, fixed a couple add'l bugs. 
--- storage-manager/src/Downloader.cpp | 8 ----- storage-manager/src/IOCoordinator.cpp | 43 +++------------------------ storage-manager/src/LocalStorage.cpp | 4 --- storage-manager/src/PrefixCache.cpp | 8 ++--- storage-manager/src/Synchronizer.cpp | 21 ------------- 5 files changed, 6 insertions(+), 78 deletions(-) diff --git a/storage-manager/src/Downloader.cpp b/storage-manager/src/Downloader.cpp index c6ea05198..0c230f566 100644 --- a/storage-manager/src/Downloader.cpp +++ b/storage-manager/src/Downloader.cpp @@ -18,7 +18,6 @@ #include "Downloader.h" #include "Config.h" #include "SMLogging.h" -#include "MetadataFile.h" #include #include #include @@ -167,13 +166,6 @@ void Downloader::Download::operator()() bf::remove(tmpFile); size = 0; } - if (size != MetadataFile::getLengthFromKey(key)) - { - ostringstream oss; - SMLogging *logr = SMLogging::get(); - oss << "Downloader: got a file with a bad length field. key = " << key << " actual size = " << size; - logr->log(LOG_ERR, oss.str().c_str()); - } // move it to its proper place boost::system::error_code berr; diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index 2f693925e..c3de7da88 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -489,7 +489,7 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin l_errno = errno; logger->log(LOG_ERR,"IOCoordinator::write(): Failed newObject."); metadata.removeEntry(newObject.offset); - replicator->remove(firstDir/newObject.key); + replicator->remove(cachePath/firstDir/newObject.key); errno = l_errno; if (count == 0) // if no data has been written yet, it's safe to return -1 here. 
return -1; @@ -499,7 +499,7 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin { // remove the object created above; can't have 0-length objects metadata.removeEntry(newObject.offset); - replicator->remove(firstDir/newObject.key); + replicator->remove(cachePath/firstDir/newObject.key); goto out; } else if ((uint)err < writeLength) @@ -532,14 +532,6 @@ ssize_t IOCoordinator::_write(const boost::filesystem::path &filename, const uin newObjectKeys.push_back(newObject.key); goto out; } - - if (bf::file_size(cachePath/firstDir/newObject.key) != MetadataFile::getLengthFromKey(newObject.key)) - { - ostringstream oss; - oss << "IOCoordinator::write(): detected bad length field in " << newObject.key - << " real size = " << bf::file_size(cachePath/firstDir/newObject.key); - logger->log(LOG_ERR, oss.str().c_str()); - } cache->newObject(firstDir, newObject.key,writeLength + objectOffset); newObjectKeys.push_back(newObject.key); @@ -649,7 +641,7 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t //log error and abort logger->log(LOG_ERR,"IOCoordinator::append(): Failed newObject."); metadata.removeEntry(newObject.offset); - replicator->remove(firstDir/newObject.key); + replicator->remove(cachePath/firstDir/newObject.key); errno = l_errno; // if no data was written successfully yet, it's safe to return -1 here. 
if (count == 0) @@ -659,7 +651,7 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t else if (err == 0) { metadata.removeEntry(newObject.offset); - replicator->remove(firstDir/newObject.key); + replicator->remove(cachePath/firstDir/newObject.key); goto out; } @@ -687,14 +679,6 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t metadata.updateEntry(newObject.offset, newObject.key, err); } - - if (bf::file_size(cachePath/firstDir/newObject.key) != MetadataFile::getLengthFromKey(newObject.key)) - { - ostringstream oss; - oss << "IOCoordinator::write(): detected bad length field in " << newObject.key - << " real size = " << bf::file_size(cachePath/firstDir/newObject.key); - logger->log(LOG_ERR, oss.str().c_str()); - } cache->newObject(firstDir, newObject.key,err); newObjectKeys.push_back(newObject.key); @@ -1026,7 +1010,6 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) } if (bf::exists(metaFile2)) { - cout << "copyFile: deleting previous version of " << metaFile2 << endl; deleteMetaFile(metaFile2); ++filesDeleted; } @@ -1055,7 +1038,6 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) if (meta2.exists()) { - cout << "copyFile: this shouldn't happen" << endl; meta2.removeAllEntries(); ++filesDeleted; } @@ -1086,23 +1068,6 @@ int IOCoordinator::copyFile(const char *_filename1, const char *_filename2) ", dest = " + filename2 + ". 
Object " + object.key + " does not exist in either " "cloud storage or the cache!"); - if (bf::file_size(cachedObjPath) != MetadataFile::getLengthFromKey(object.key)) - { - ostringstream oss; - oss << "CopyFile: found a size mismatch in " << cachedObjPath << - " real size = " << bf::file_size(cachedObjPath); - // XXXPAT: get a new key here - logger->log(LOG_ERR, oss.str().c_str()); - } - - if (MetadataFile::getLengthFromKey(object.key) != MetadataFile::getLengthFromKey(newObj.key)) - { - ostringstream oss; - oss << "CopyFile: found a size mismatch in src and dest keys src = " << object.key << - " dest = " << newObj.key; - logger->log(LOG_ERR, oss.str().c_str()); - } - // put the copy in cloudstorage err = cs->putObject(cachedObjPath.string(), newObj.key); if (err) diff --git a/storage-manager/src/LocalStorage.cpp b/storage-manager/src/LocalStorage.cpp index c80e571fc..0c2cd9d18 100644 --- a/storage-manager/src/LocalStorage.cpp +++ b/storage-manager/src/LocalStorage.cpp @@ -104,8 +104,6 @@ int LocalStorage::copy(const bf::path &source, const bf::path &dest) ::unlink(dest.string().c_str()); return -1; } - if (bf::file_size(source) != bf::file_size(dest)) - logger->log(LOG_ERR, "LocalStorage::copy: partially copied a file somehow"); return 0; } @@ -244,8 +242,6 @@ int LocalStorage::copyObject(const string &source, const string &dest) size_t _size = bf::file_size(prefix/source); bytesRead += _size; bytesWritten += _size; - if (bf::file_size(prefix/source) != bf::file_size(prefix/dest)) - logger->log(LOG_ERR, "LocalStorage::copyObject(): partially copied a file somehow"); } return ret; } diff --git a/storage-manager/src/PrefixCache.cpp b/storage-manager/src/PrefixCache.cpp index 85e5c68c7..86192faae 100644 --- a/storage-manager/src/PrefixCache.cpp +++ b/storage-manager/src/PrefixCache.cpp @@ -133,8 +133,6 @@ void PrefixCache::populate() bf::directory_iterator dir(cachePrefix); bf::directory_iterator dend; vector newObjects; - lru.clear(); - m_lru.clear(); while (dir != 
dend) { // put everything in lru & m_lru @@ -382,8 +380,7 @@ void PrefixCache::newJournalEntry(size_t size) void PrefixCache::deletedJournal(size_t size) { boost::unique_lock s(lru_mutex); - - //assert(currentCacheSize >= size); + if (currentCacheSize >= size) currentCacheSize -= size; else @@ -398,8 +395,7 @@ void PrefixCache::deletedJournal(size_t size) void PrefixCache::deletedObject(const string &key, size_t size) { boost::unique_lock s(lru_mutex); - - //assert(currentCacheSize >= size); + M_LRU_t::iterator mit = m_lru.find(key); assert(mit != m_lru.end()); diff --git a/storage-manager/src/Synchronizer.cpp b/storage-manager/src/Synchronizer.cpp index 2474dc52a..5a510f7cc 100644 --- a/storage-manager/src/Synchronizer.cpp +++ b/storage-manager/src/Synchronizer.cpp @@ -521,13 +521,6 @@ void Synchronizer::synchronize(const string &sourceFile, list::iterator return; } - if (bf::file_size(objectPath) != MetadataFile::getLengthFromKey(cloudKey)) - { - ostringstream oss; - oss << "Synchronizer::synchronize(): found a size mismatch in key = " << cloudKey << - " real size = " << bf::file_size(objectPath); - logger->log(LOG_ERR, oss.str().c_str()); - } err = cs->putObject(objectPath.string(), cloudKey); if (err) throw runtime_error(string("synchronize(): uploading ") + key + ", got " + strerror_r(errno, buf, 80)); @@ -703,20 +696,6 @@ void Synchronizer::synchronizeWithJournal(const string &sourceFile, list // get a new key for the resolved version & upload it string newCloudKey = MetadataFile::getNewKeyFromOldKey(cloudKey, size); string newKey = (prefix/newCloudKey).string(); - -try { - if (size != MetadataFile::getLengthFromKey(newCloudKey)) - { - ostringstream oss; - oss << "SyncWithJournal: detected the file size mismatch on the merged object somehow. 
" << - "key = " << newCloudKey << "real size = " << bf::file_size(prefix/newCloudKey); - logger->log(LOG_ERR, oss.str().c_str()); - } -} catch(exception &e) -{ - logger->log(LOG_ERR, "DEB4"); -} - err = cs->putObject(data, size, newCloudKey); if (err) { From 2e65619cb1268825775c726c3b6cd317c7a0aef8 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Wed, 27 May 2020 15:35:27 -0400 Subject: [PATCH 14/24] Fixed an error msg; the function name was wrong. --- storage-manager/src/IOCoordinator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage-manager/src/IOCoordinator.cpp b/storage-manager/src/IOCoordinator.cpp index c3de7da88..5aa31dca4 100644 --- a/storage-manager/src/IOCoordinator.cpp +++ b/storage-manager/src/IOCoordinator.cpp @@ -671,7 +671,7 @@ ssize_t IOCoordinator::append(const char *_filename, const uint8_t *data, size_t { ostringstream oss; char buf[80]; - oss << "IOCoordinator::write(): Failed to rename " << (cachePath/firstDir/oldKey).string() << " to " << + oss << "IOCoordinator::append(): Failed to rename " << (cachePath/firstDir/oldKey).string() << " to " << (cachePath/firstDir/newObject.key).string() << "! 
Got " << strerror_r(renameErrno, buf, 80); logger->log(LOG_ERR, oss.str().c_str()); newObject.key = oldKey; From 358a7a0020656ec5646cc2553bf0d7edd53fdd46 Mon Sep 17 00:00:00 2001 From: mariadb-RomanNavrotskiy Date: Thu, 28 May 2020 21:54:29 +0200 Subject: [PATCH 15/24] enable drone pipelines for develop branch (1.5) --- .drone.jsonnet | 211 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 .drone.jsonnet diff --git a/.drone.jsonnet b/.drone.jsonnet new file mode 100644 index 000000000..ec4ebacd3 --- /dev/null +++ b/.drone.jsonnet @@ -0,0 +1,211 @@ +local platforms = { + "develop": ["centos:7", "centos:8", "debian:9", "debian:10", "ubuntu:18.04", "ubuntu:20.04"], + 'develop-1.4': ["centos:7", "centos:8", "debian:9", "debian:10", "ubuntu:16.04", "ubuntu:18.04", "ubuntu:20.04"] +}; + +local codebase_map = { + // "develop": "git clone --recurse-submodules --branch mariadb-10.5.3 --depth 1 https://github.com/MariaDB/server .", + develop: 'git clone --recurse-submodules --branch bb-10.5-cs --depth 1 https://github.com/MariaDB/server .', + "develop-1.4": "git clone --recurse-submodules --branch 10.4-enterprise --depth 1 https://github.com/mariadb-corporation/MariaDBEnterprise .", +}; + +local builddir = 'verylongdirnameforverystrangecpackbehavior'; +local cmakeflags = '-DCMAKE_BUILD_TYPE=Release -DPLUGIN_COLUMNSTORE=YES -DPLUGIN_MROONGA=NO -DPLUGIN_ROCKSDB=NO -DPLUGIN_TOKUDB=NO -DPLUGIN_CONNECT=NO -DPLUGIN_SPIDER=NO -DPLUGIN_OQGRAPH=NO -DPLUGIN_PERFSCHEMA=NO -DPLUGIN_SPHINX=NO'; + +local rpm_build_deps = 'yum install -y git cmake make gcc gcc-c++ libaio-devel openssl-devel boost-devel bison snappy-devel flex libcurl-devel libxml2-devel ncurses-devel automake libtool policycoreutils-devel rpm-build lsof iproute pam-devel perl-DBI cracklib-devel expect readline-devel'; + +local deb_build_deps = 'apt update && apt install --yes --no-install-recommends git ca-certificates devscripts equivs build-essential libboost-all-dev 
libdistro-info-perl flex pkg-config automake libtool lsb-release bison chrpath cmake dh-apparmor dh-systemd gdb libaio-dev libcrack2-dev libjemalloc-dev libjudy-dev libkrb5-dev libncurses5-dev libpam0g-dev libpcre3-dev libreadline-gplv2-dev libsnappy-dev libssl-dev libsystemd-dev libxml2-dev unixodbc-dev uuid-dev zlib1g-dev libcurl4-openssl-dev dh-exec libpcre2-dev libzstd-dev psmisc socat expect net-tools rsync lsof libdbi-perl iproute2 gawk && mk-build-deps debian/control && dpkg -i mariadb-10*.deb || true && apt install -fy --no-install-recommends'; + +local platformMap(branch, platform) = + local branch_cmakeflags_map = { + develop: ' -DBUILD_CONFIG=mysql_release -DWITH_WSREP=OFF', + 'develop-1.4': ' -DBUILD_CONFIG=enterprise', + }; + + local platform_map = { + 'opensuse/leap:15': 'zypper install -y ' + rpm_build_deps + ' && cmake ' + cmakeflags + branch_cmakeflags_map[branch] + ' -DRPM=sles15 && make -j$(nproc) package', + 'centos:7': rpm_build_deps + ' && cmake ' + cmakeflags + branch_cmakeflags_map[branch] + ' -DRPM=centos7 && make -j$(nproc) package', + 'centos:8': "sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/CentOS-PowerTools.repo && " + rpm_build_deps + ' && cmake ' + cmakeflags + branch_cmakeflags_map[branch] + ' -DRPM=centos8 && make -j$(nproc) package', + 'debian:9': deb_build_deps + " && CMAKEFLAGS='" + cmakeflags + branch_cmakeflags_map[branch] + " -DDEB=stretch' debian/autobake-deb.sh", + 'debian:10': deb_build_deps + " && CMAKEFLAGS='" + cmakeflags + branch_cmakeflags_map[branch] + " -DDEB=buster' debian/autobake-deb.sh", + 'ubuntu:16.04': deb_build_deps + " && CMAKEFLAGS='" + cmakeflags + branch_cmakeflags_map[branch] + " -DDEB=xenial' debian/autobake-deb.sh", + 'ubuntu:18.04': deb_build_deps + " && CMAKEFLAGS='" + cmakeflags + branch_cmakeflags_map[branch] + " -DDEB=bionic' debian/autobake-deb.sh", + 'ubuntu:20.04': deb_build_deps + " && CMAKEFLAGS='" + cmakeflags + branch_cmakeflags_map[branch] + " -DDEB=focal' debian/autobake-deb.sh", + }; 
+ + platform_map[platform]; + +local Pipeline(branch, platform, event) = { + local pipeline = self, + _volumes:: { + mdb: { + name: 'mdb', + path: '/mdb', + }, + }, + tests:: { + name: 'tests', + image: platform, + commands: [ + (if platform == 'centos:7' then 'yum install -y sysvinit-tools' else '' ), + (if platform == 'centos:8' then 'yum install -y diffutils' else '' ), + 'yum install -y lz4 wget git rsyslog', + "sed -i '/OmitLocalLogging/d' /etc/rsyslog.conf", + "sed -i 's/off/on/' /etc/rsyslog.conf", + "rm -f /etc/rsyslog.d/listen.conf", + 'rsyslogd', + 'yum install -y result/*.rpm', + 'kill $(pidof rsyslogd) && while pidof rsyslogd; do sleep 2; done', + 'rsyslogd', + 'bash -o pipefail ./build/columnstore_startup.sh', + 'git clone --recurse-submodules --branch ' + branch + ' --depth 1 https://github.com/mariadb-corporation/mariadb-columnstore-regression-test', + 'wget -qO- https://cspkg.s3.amazonaws.com/testData.tar.lz4 | lz4 -dc - | tar xf - -C mariadb-columnstore-regression-test/', + 'cd mariadb-columnstore-regression-test/mysql/queries/nightly/alltest', + "./go.sh --sm_unit_test_dir=/drone/src/storage-manager" + (if event == 'pull_request' then ' --tests=test000.sh' else '' ), + 'cat go.log', + 'test -f testErrorLogs.tgz && mv testErrorLogs.tgz /drone/src/result/ || echo no-errors-archive', + ], + }, + kind: 'pipeline', + type: 'docker', + name: std.join(" ", [branch, platform, event]), + clone: { + depth: 10, + }, + steps: [ + { + name: 'submodules', + image: 'alpine/git', + commands: [ + 'git submodule update --recursive --remote', + 'git config cmake.update-submodules no', + 'ls -la /drone/src/storage-manager', + ], + }, + { + name: 'clone-mdb', + image: 'alpine/git', + volumes: [pipeline._volumes.mdb], + commands: [ + 'mkdir -p /mdb/' + builddir + ' && cd /mdb/' + builddir, + codebase_map[branch], + 'git config cmake.update-submodules no', + 'rm -rf storage/columnstore', + 'cp -r /drone/src /mdb/' + builddir + '/storage/columnstore', + ], + }, + { + 
name: 'build', + image: platform, + volumes: [pipeline._volumes.mdb], + environment: { + DEBIAN_FRONTEND: 'noninteractive', + TRAVIS: 'true', + }, + commands: [ + 'cd /mdb/' + builddir, + "sed -i -e '/-DBUILD_CONFIG=mysql_release/d' debian/rules", + "sed -i -e '/Package: libmariadbd19/,/^$/d' debian/control", + "sed -i -e '/Package: libmariadbd-dev/,/^$/d' debian/control", + "sed -i -e '/Package: mariadb-backup/,/^$/d' debian/control", + "sed -i -e '/Package: mariadb-plugin-connect/,/^$/d' debian/control", + "sed -i -e '/Package: mariadb-plugin-cracklib-password-check/,/^$/d' debian/control", + "sed -i -e '/Package: mariadb-plugin-gssapi-*/,/^$/d' debian/control", + "sed -i -e '/wsrep/d' debian/mariadb-server-*.install", + "sed -i -e 's/Depends: galera.*/Depends:/' debian/control", + "sed -i -e 's/\"galera-enterprise-4\"//' cmake/cpack_rpm.cmake", + platformMap(branch, platform), + ], + }, + { + name: 'list pkgs', + image: 'centos:7', + volumes: [pipeline._volumes.mdb], + commands: [ + 'cd /mdb/' + builddir, + 'mkdir /drone/src/result', + 'cp *.rpm /drone/src/result 2>/dev/null || true', + 'cp ../*.deb /drone/src/result 2>/dev/null || true', + '! 
test -n "$(find /drone/src/result -prune -empty)" && ls /drone/src/result', + ], + }, + ] + + (if branch=='develop-1.4' && std.split(platform, ":")[0]=="centos" then [pipeline.tests] else []) + + [ + { + name: 'publish', + image: 'plugins/s3', + when: { + status: ['success', 'failure'], + // event: ['cron'], + }, + settings: { + bucket: 'cspkg', + access_key: { + from_secret: 'aws_access_key_id', + }, + secret_key: { + from_secret: 'aws_secret_access_key', + }, + source: 'result/*', + target: branch + '/${DRONE_BUILD_NUMBER}/' + std.strReplace(platform, ':', ''), + strip_prefix: 'result/', + }, + }, + ], + + volumes: [pipeline._volumes.mdb + {"temp": {}}], + trigger: { + event: [event], + branch: [branch], + } + (if event == 'cron' then { + cron: ['nightly-'+ std.strReplace(branch, '.', '-')] + } else {}) +}; + +local FinalPipeline(branch, event) = { + kind: "pipeline", + name: std.join(" ", ["after", branch, event]), + steps: [ + { + name: "notify", + image: "plugins/slack", + settings: { + room: "#drone_test", + webhook: { + from_secret: "slack_webhook" + }, + template: (if event == 'cron' then "*Nightly" else "*Pull Request " ) + + " build <{{build.link}}|{{build.number}}> {{#success build.status}}succeeded{{else}}failed{{/success}}*. 
+ +*Branch*: +*Commit*: {{truncate build.message.title 100 }} +*Author*: {{ build.author }} +*Duration*: {{since build.started}} +*Artifacts*: https://cspkg.s3.amazonaws.com/index.html?prefix={{build.branch}}/{{build.number}}" + }, + }, + ], + trigger: { + event: [event], + branch: [branch], + status: [ + "success", + "failure" + ], + } + (if event == 'cron' then { + cron: ['nightly-'+ std.strReplace(branch, '.', '-')] + } else {}), + depends_on: std.map(function(p) std.join(" ", [branch, p, event]), platforms[branch]) +}; + +[ +Pipeline(b, p, e) +for b in ['develop', 'develop-1.4'] +for p in platforms[b] +for e in ['pull_request', 'cron'] +] + [ +FinalPipeline(b, e) +for b in ['develop', 'develop-1.4'] +for e in ['pull_request', 'cron'] +] From 9280b7ece1d27a9e3e1eb7f01834181655fd772e Mon Sep 17 00:00:00 2001 From: Jose Date: Thu, 28 May 2020 20:53:56 +0000 Subject: [PATCH 16/24] MCOL-4025 systemd units now preload libjemalloc --- CMakeLists.txt | 4 ++-- cmake/cpackEngineRPM.cmake | 6 +++--- oam/install_scripts/columnstore-post-install.in | 16 ++++++++++------ .../mcs-controllernode.service.in | 1 - oam/install_scripts/mcs-ddlproc.service.in | 1 - oam/install_scripts/mcs-dmlproc.service.in | 1 - oam/install_scripts/mcs-exemgr.service.in | 2 +- oam/install_scripts/mcs-loadbrm.service.in | 1 - oam/install_scripts/mcs-primproc.service.in | 2 +- oam/install_scripts/mcs-workernode.service.in | 1 - .../mcs-writeengineserver.service.in | 2 +- oam/install_scripts/post-mysql-install | 5 +---- 12 files changed, 19 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a11483145..c42857cf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -392,9 +392,9 @@ IF (INSTALL_LAYOUT) set(SUSE_VERSION_NUMBER "${CMAKE_MATCH_1}") ENDIF () if (${SUSE_VERSION_NUMBER} EQUAL 12) - SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "expect" "boost-devel >= 1.54.0" "snappy" "jemalloc" "net-tools" PARENT_SCOPE) + SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES 
"expect" "boost-devel >= 1.54.0" "snappy" "jemalloc" "net-tools MariaDB-server" PARENT_SCOPE) else () - SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "expect" "boost >= 1.53.0" "snappy" "jemalloc" "net-tools" PARENT_SCOPE) + SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "expect" "boost >= 1.53.0" "snappy" "jemalloc" "net-tools MariaDB-server" PARENT_SCOPE) endif() SET(CPACK_RPM_columnstore-engine_PRE_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/build/preInstall_storage_engine.sh PARENT_SCOPE) diff --git a/cmake/cpackEngineRPM.cmake b/cmake/cpackEngineRPM.cmake index 211362102..c624143a3 100644 --- a/cmake/cpackEngineRPM.cmake +++ b/cmake/cpackEngineRPM.cmake @@ -77,13 +77,13 @@ IF (EXISTS "/etc/SuSE-release") set(SUSE_VERSION_NUMBER "${CMAKE_MATCH_1}") ENDIF () if (${REDHAT_VERSION_NUMBER} EQUAL 6) - SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "MariaDB-columnstore-shared" "snappy" "net-tools") + SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "MariaDB-columnstore-shared" "snappy" "net-tools" "MariaDB-server") # Disable auto require as this will also try to pull Boost via RPM SET(CPACK_RPM_PACKAGE_AUTOREQPROV " no") elseif (${SUSE_VERSION_NUMBER} EQUAL 12) - SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "boost-devel >= 1.54.0" "libsnappy1" "jemalloc" "net-tools") + SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "boost-devel >= 1.54.0" "libsnappy1" "jemalloc" "net-tools" "MariaDB-server") else () - SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "boost >= 1.53.0" "snappy" "jemalloc" "net-tools") + SETA(CPACK_RPM_columnstore-engine_PACKAGE_REQUIRES "boost >= 1.53.0" "snappy" "jemalloc" "net-tools" "MariaDB-server") endif() SET(CPACK_RPM_columnstore-engine_PRE_INSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/build/preInstall_storage_engine.sh) diff --git a/oam/install_scripts/columnstore-post-install.in b/oam/install_scripts/columnstore-post-install.in index 72969eaff..4545ac120 100755 --- a/oam/install_scripts/columnstore-post-install.in +++ 
b/oam/install_scripts/columnstore-post-install.in @@ -177,13 +177,17 @@ if [ -z "aws" ]; then fi postConfigure -systemctl start mariadb-columnstore -# Wait for all columnstore to be ready, DDLProc is final process in startup order -while [ -z "$(pgrep -x DDLProc)" ]; -do - sleep 1 -done +systemctl cat mariadb-columnstore.service > /dev/null 2>&1 +if [ $? -eq 0 ] && [ $(running_systemd) -eq 0 ]; then + systemctl start mariadb-columnstore + + # Wait for all columnstore to be ready, DDLProc is final process in startup order + while [ -z "$(pgrep -x DDLProc)" ]; + do + sleep 1 + done +fi dbbuilder 7 > $tmpDir/dbbuilder.log diff --git a/oam/install_scripts/mcs-controllernode.service.in b/oam/install_scripts/mcs-controllernode.service.in index e51b01c4c..928393fde 100644 --- a/oam/install_scripts/mcs-controllernode.service.in +++ b/oam/install_scripts/mcs-controllernode.service.in @@ -5,7 +5,6 @@ After=mcs-workernode.service [Service] Type=forking -Environment="SKIP_OAM_INIT=1" ExecStart=@ENGINE_BINDIR@/controllernode Restart=on-failure ExecStop=@ENGINE_BINDIR@/mcs-stop-controllernode.sh $MAINPID diff --git a/oam/install_scripts/mcs-ddlproc.service.in b/oam/install_scripts/mcs-ddlproc.service.in index 44f3c5494..837e04b04 100644 --- a/oam/install_scripts/mcs-ddlproc.service.in +++ b/oam/install_scripts/mcs-ddlproc.service.in @@ -5,7 +5,6 @@ After=mcs-dmlproc.service [Service] Type=simple -Environment="SKIP_OAM_INIT=1" ExecStart=@ENGINE_BINDIR@/DDLProc Restart=on-failure TimeoutStopSec=2 diff --git a/oam/install_scripts/mcs-dmlproc.service.in b/oam/install_scripts/mcs-dmlproc.service.in index a25841a99..8e1aa4280 100644 --- a/oam/install_scripts/mcs-dmlproc.service.in +++ b/oam/install_scripts/mcs-dmlproc.service.in @@ -5,7 +5,6 @@ After=mcs-exemgr.service [Service] Type=simple -Environment="SKIP_OAM_INIT=1" ExecStart=@ENGINE_BINDIR@/DMLProc Restart=on-failure TimeoutStopSec=2 diff --git a/oam/install_scripts/mcs-exemgr.service.in 
b/oam/install_scripts/mcs-exemgr.service.in index 2130f7cf6..c528b184b 100644 --- a/oam/install_scripts/mcs-exemgr.service.in +++ b/oam/install_scripts/mcs-exemgr.service.in @@ -5,7 +5,7 @@ After=mcs-writeengineserver.service [Service] Type=simple -Environment="SKIP_OAM_INIT=1" +Environment="LD_PRELOAD=libjemalloc.so.1 libjemalloc.so.2" ExecStart=@ENGINE_BINDIR@/ExeMgr Restart=on-failure TimeoutStopSec=2 diff --git a/oam/install_scripts/mcs-loadbrm.service.in b/oam/install_scripts/mcs-loadbrm.service.in index d9a634287..47cd00f42 100644 --- a/oam/install_scripts/mcs-loadbrm.service.in +++ b/oam/install_scripts/mcs-loadbrm.service.in @@ -6,7 +6,6 @@ ConditionPathExists=/var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves_curren [Service] Type=simple -Environment="SKIP_OAM_INIT=1" ExecStart=/usr/bin/env bash -c "/usr/bin/load_brm /var/lib/columnstore/data1/systemFiles/dbrm/$(cat /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves_current)" [Install] diff --git a/oam/install_scripts/mcs-primproc.service.in b/oam/install_scripts/mcs-primproc.service.in index 84c08d436..64c421710 100644 --- a/oam/install_scripts/mcs-primproc.service.in +++ b/oam/install_scripts/mcs-primproc.service.in @@ -6,7 +6,7 @@ After=mcs-controllernode.service [Service] Type=simple -Environment="SKIP_OAM_INIT=1" +Environment="LD_PRELOAD=libjemalloc.so.1 libjemalloc.so.2" ExecStart=@ENGINE_BINDIR@/PrimProc Restart=on-failure TimeoutStopSec=2 diff --git a/oam/install_scripts/mcs-workernode.service.in b/oam/install_scripts/mcs-workernode.service.in index 714d85058..65862a360 100644 --- a/oam/install_scripts/mcs-workernode.service.in +++ b/oam/install_scripts/mcs-workernode.service.in @@ -5,7 +5,6 @@ After=mcs-loadbrm.service [Service] Type=forking -Environment="SKIP_OAM_INIT=1" ExecStart=@ENGINE_BINDIR@/workernode DBRM_Worker1 Restart=on-failure ExecStop=-@ENGINE_BINDIR@/save_brm diff --git a/oam/install_scripts/mcs-writeengineserver.service.in 
b/oam/install_scripts/mcs-writeengineserver.service.in index c603efe18..2b5099039 100644 --- a/oam/install_scripts/mcs-writeengineserver.service.in +++ b/oam/install_scripts/mcs-writeengineserver.service.in @@ -5,7 +5,7 @@ After=mcs-primproc.service [Service] Type=simple -Environment="SKIP_OAM_INIT=1" +Environment="LD_PRELOAD=libjemalloc.so.1 libjemalloc.so.2" ExecStart=@ENGINE_BINDIR@/WriteEngineServer Restart=on-failure TimeoutStopSec=2 diff --git a/oam/install_scripts/post-mysql-install b/oam/install_scripts/post-mysql-install index d4cd68339..88ddd040f 100755 --- a/oam/install_scripts/post-mysql-install +++ b/oam/install_scripts/post-mysql-install @@ -22,10 +22,7 @@ checkForError() { # See if engine columnstore exist #--------------------------------------------------------------------------- echo "checking for engine columnstore..." - mysql \ - --user=root \ - --execute='show engines;' \ - | grep -i columnstore + su -s /bin/sh -c 'mysql --execute="show engines"' mysql 2> ${tmpdir}/post-mysql-install.log | grep -i columnstore # # Add compressiontype column to SYSCOLUMN if applicable From d056090d23fef26a4da232e6b183ba14d84ae859 Mon Sep 17 00:00:00 2001 From: Jose Date: Thu, 28 May 2020 23:10:21 +0000 Subject: [PATCH 17/24] MCOL-4025 libjemalloc preloading update --- oam/install_scripts/mcs-controllernode.service.in | 1 + oam/install_scripts/mcs-ddlproc.service.in | 2 +- oam/install_scripts/mcs-dmlproc.service.in | 4 ++-- oam/install_scripts/mcs-exemgr.service.in | 7 +++---- oam/install_scripts/mcs-primproc.service.in | 4 ++-- oam/install_scripts/mcs-writeengineserver.service.in | 7 +++---- 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/oam/install_scripts/mcs-controllernode.service.in b/oam/install_scripts/mcs-controllernode.service.in index 928393fde..1ac114c68 100644 --- a/oam/install_scripts/mcs-controllernode.service.in +++ b/oam/install_scripts/mcs-controllernode.service.in @@ -5,6 +5,7 @@ After=mcs-workernode.service [Service] 
Type=forking +ExecStartPre=/usr/bin/env bash -c "systemctl start mcs-workernode" ExecStart=@ENGINE_BINDIR@/controllernode Restart=on-failure ExecStop=@ENGINE_BINDIR@/mcs-stop-controllernode.sh $MAINPID diff --git a/oam/install_scripts/mcs-ddlproc.service.in b/oam/install_scripts/mcs-ddlproc.service.in index 837e04b04..c40703792 100644 --- a/oam/install_scripts/mcs-ddlproc.service.in +++ b/oam/install_scripts/mcs-ddlproc.service.in @@ -1,6 +1,6 @@ [Unit] Description=mcs-ddlproc -PartOf=mcs-exemgr.service +PartOf=mcs-writeengineserver.service After=mcs-dmlproc.service [Service] diff --git a/oam/install_scripts/mcs-dmlproc.service.in b/oam/install_scripts/mcs-dmlproc.service.in index 8e1aa4280..576fdce42 100644 --- a/oam/install_scripts/mcs-dmlproc.service.in +++ b/oam/install_scripts/mcs-dmlproc.service.in @@ -1,7 +1,7 @@ [Unit] Description=mcs-dmlproc -PartOf=mcs-exemgr.service -After=mcs-exemgr.service +PartOf=mcs-writeengineserver.service +After=mcs-writeengineserver.service [Service] Type=simple diff --git a/oam/install_scripts/mcs-exemgr.service.in b/oam/install_scripts/mcs-exemgr.service.in index c528b184b..439d3a6a0 100644 --- a/oam/install_scripts/mcs-exemgr.service.in +++ b/oam/install_scripts/mcs-exemgr.service.in @@ -1,12 +1,11 @@ [Unit] Description=mcs-exemgr -PartOf=mcs-writeengineserver.service -After=mcs-writeengineserver.service +PartOf=mcs-primproc.service +After=mcs-primproc.service [Service] Type=simple -Environment="LD_PRELOAD=libjemalloc.so.1 libjemalloc.so.2" -ExecStart=@ENGINE_BINDIR@/ExeMgr +ExecStart=/usr/bin/env bash -c "LD_PRELOAD=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') exec @ENGINE_BINDIR@/ExeMgr" Restart=on-failure TimeoutStopSec=2 diff --git a/oam/install_scripts/mcs-primproc.service.in b/oam/install_scripts/mcs-primproc.service.in index 64c421710..baee874d4 100644 --- a/oam/install_scripts/mcs-primproc.service.in +++ b/oam/install_scripts/mcs-primproc.service.in @@ -6,8 +6,8 @@ After=mcs-controllernode.service [Service] 
Type=simple -Environment="LD_PRELOAD=libjemalloc.so.1 libjemalloc.so.2" -ExecStart=@ENGINE_BINDIR@/PrimProc +ExecStart=/usr/bin/env bash -c "LD_PRELOAD=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') exec @ENGINE_BINDIR@/PrimProc" +ExecStartPost=sleep 2 Restart=on-failure TimeoutStopSec=2 diff --git a/oam/install_scripts/mcs-writeengineserver.service.in b/oam/install_scripts/mcs-writeengineserver.service.in index 2b5099039..3764c80ab 100644 --- a/oam/install_scripts/mcs-writeengineserver.service.in +++ b/oam/install_scripts/mcs-writeengineserver.service.in @@ -1,12 +1,11 @@ [Unit] Description=WriteEngineServer -PartOf=mcs-primproc.service -After=mcs-primproc.service +PartOf=mcs-exemgr.service +After=mcs-exemgr.service [Service] Type=simple -Environment="LD_PRELOAD=libjemalloc.so.1 libjemalloc.so.2" -ExecStart=@ENGINE_BINDIR@/WriteEngineServer +ExecStart=/usr/bin/env bash -c "LD_PRELOAD=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') exec @ENGINE_BINDIR@/WriteEngineServer" Restart=on-failure TimeoutStopSec=2 From 6675b8ae961130d71fc81efde94a5051cf3d5eba Mon Sep 17 00:00:00 2001 From: Jose Date: Fri, 29 May 2020 04:22:33 +0000 Subject: [PATCH 18/24] MCOL-4011 Add support for SM into systemd units. 
--- oam/install_scripts/CMakeLists.txt | 4 ++ .../columnstore-post-install.in | 4 ++ .../columnstore-pre-uninstall.in | 6 +- oam/install_scripts/mcs-loadbrm.py | 60 +++++++++++++++++++ oam/install_scripts/mcs-loadbrm.service.in | 3 +- .../mcs-start-storagemanager.py | 15 +++++ .../mcs-storagemanager.service.in | 16 +++++ 7 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 oam/install_scripts/mcs-loadbrm.py create mode 100644 oam/install_scripts/mcs-start-storagemanager.py create mode 100644 oam/install_scripts/mcs-storagemanager.service.in diff --git a/oam/install_scripts/CMakeLists.txt b/oam/install_scripts/CMakeLists.txt index 361cf6f7b..3e6b7762a 100644 --- a/oam/install_scripts/CMakeLists.txt +++ b/oam/install_scripts/CMakeLists.txt @@ -20,6 +20,7 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-writeengineserver.service.in" "$ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-dmlproc.service.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-dmlproc.service" @ONLY) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-ddlproc.service.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-ddlproc.service" @ONLY) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.service.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-loadbrm.service" @ONLY) +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-storagemanager.service.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-storagemanager.service" @ONLY) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/mcs-stop-controllernode.sh.in" "${CMAKE_CURRENT_SOURCE_DIR}/mcs-stop-controllernode.sh" @ONLY) install(PROGRAMS columnstore-post-install @@ -44,6 +45,8 @@ install(PROGRAMS columnstore-post-install mariadb-command-line.sh mcs_module_installer.sh mcs-stop-controllernode.sh + mcs-loadbrm.py + mcs-start-storagemanager.py DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine) install(FILES mariadb-columnstore.service @@ -62,6 +65,7 @@ install(FILES mariadb-columnstore.service mcs-dmlproc.service mcs-ddlproc.service mcs-loadbrm.service + mcs-storagemanager.service 
DESTINATION ${ENGINE_SUPPORTDIR} COMPONENT columnstore-engine) install(FILES module DESTINATION ${ENGINE_DATADIR}/local COMPONENT columnstore-engine) diff --git a/oam/install_scripts/columnstore-post-install.in b/oam/install_scripts/columnstore-post-install.in index 72969eaff..7d522d1f1 100755 --- a/oam/install_scripts/columnstore-post-install.in +++ b/oam/install_scripts/columnstore-post-install.in @@ -105,6 +105,9 @@ if [ $user = "root" ]; then cp @ENGINE_SUPPORTDIR@/mcs-writeengineserver.service /lib/systemd/system/. >/dev/null 2>&1 cp @ENGINE_SUPPORTDIR@/mcs-loadbrm.service /usr/lib/systemd/system/. >/dev/null 2>&1 cp @ENGINE_SUPPORTDIR@/mcs-loadbrm.service /lib/systemd/system/. >/dev/null 2>&1 + cp @ENGINE_SUPPORTDIR@/mcs-storagemanager.service /usr/lib/systemd/system/. >/dev/null 2>&1 + cp @ENGINE_SUPPORTDIR@/mcs-storagemanager.service /lib/systemd/system/. >/dev/null 2>&1 + systemctl enable mariadb-columnstore >/dev/null 2>&1 systemctl enable mcs-controllernode > /dev/null 2>&1 @@ -115,6 +118,7 @@ if [ $user = "root" ]; then systemctl enable mcs-workernode > /dev/null 2>&1 systemctl enable mcs-writeengineserver > /dev/null 2>&1 systemctl enable mcs-loadbrm > /dev/null 2>&1 + systemctl enable mcs-storagemanager > /dev/null 2>&1 else chkconfig=`which chkconfig 2>/dev/null` if [ -n "$chkconfig" ]; then diff --git a/oam/install_scripts/columnstore-pre-uninstall.in b/oam/install_scripts/columnstore-pre-uninstall.in index 4b8aa2e6b..ae6b88690 100755 --- a/oam/install_scripts/columnstore-pre-uninstall.in +++ b/oam/install_scripts/columnstore-pre-uninstall.in @@ -17,7 +17,7 @@ systemctl cat mariadb-columnstore.service > /dev/null 2>&1 if [ $? 
-eq 0 ] && [ $(running_systemd) -eq 0 ]; then systemctl stop mariadb-columnstore >/dev/null 2>&1 else - PROGS='load_brm workernode controllernode PrimProc ExeMgr DMLProc DDLProc WriteEngineServer' + PROGS='StorageManager workernode controllernode PrimProc ExeMgr DMLProc DDLProc WriteEngineServer' kill $(pidof $PROGS) > /dev/null sleep 3 kill -9 $(pidof $PROGS) > /dev/null @@ -70,6 +70,7 @@ if [ -n "$systemctl" ] && [ $(running_systemd) -eq 0 ]; then systemctl disable mcs-workernode > /dev/null 2>&1 systemctl disable mcs-writeengineserver > /dev/null 2>&1 systemctl disable mcs-loadbrm > /dev/null 2>&1 + systemctl disable mcs-storagemanager > /dev/null 2>&1 rm -f /usr/lib/systemd/system/mariadb-columnstore.service rm -f /lib/systemd/system/mariadb-columnstore.service @@ -89,6 +90,9 @@ if [ -n "$systemctl" ] && [ $(running_systemd) -eq 0 ]; then rm -f /lib/systemd/system/mcs-writeengineserver.service rm -f /usr/lib/systemd/system/mcs-loadbrm.service rm -f /lib/systemd/system/mcs-loadbrm.service + rm -f /usr/lib/systemd/system/mcs-storagemanager.service + rm -f /lib/systemd/system/mcs-storagemanager.service + systemctl daemon-reload else chkconfig=`which chkconfig 2>/dev/null` diff --git a/oam/install_scripts/mcs-loadbrm.py b/oam/install_scripts/mcs-loadbrm.py new file mode 100644 index 000000000..e8b138802 --- /dev/null +++ b/oam/install_scripts/mcs-loadbrm.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import configparser +import subprocess + +config = configparser.ConfigParser() +config.read('/etc/columnstore/storagemanager.cnf') + +storage = config['ObjectStorage']['service'] +region = config['S3']['region'] +bucket = config['S3']['bucket'] +loadbrm = '/usr/bin/load_brm /var/lib/columnstore/data1/systemFiles/dbrm/{0}' +brm_saves_current = '' + +if storage.lower() == 's3' and not region.lower() == 'some_region' and not bucket.lower == 'some_bucket': + # load s3 + brm = 'data1/systemFiles/dbrm/BRM_saves_current' + + try: + brm_saves_current = 
subprocess.check_output(['smcat', brm]) + except subprocess.CalledProcessError as e: + # will happen when brm file does not exist + pass +else: + import xml.etree.ElementTree as ET + tree = ET.parse('/etc/columnstore/Columnstore.xml') + root = tree.getroot() + pmCount = int(root.find("./SystemModuleConfig/ModuleCount3").text) + brm = '/var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves_current' + + if pmCount > 1: + # load multinode dbrm + try: + brm_saves_current = subprocess.check_output(['cat', brm]) + + if not brm_saves_current: + # local dbrm empty, need to pull from main node + pass + except subprocess.CalledProcessError as e: + # will happen when brm file does not exist + pass + else: + # load local dbrm + try: + brm_saves_current = subprocess.check_output(['cat', brm]) + except subprocess.CalledProcessError as e: + # will happen when brm file does not exist + pass + +if brm_saves_current: + cmd = loadbrm.format(brm_saves_current.decode('utf-8')) + try: + retcode = subprocess.call(cmd, shell=True) + if retcode < 0: + #print("Child was terminated by signal", -retcode, file=sys.stderr) + pass + + except OSError as e: + #print("Execution failed:", e, file=sys.stderr) + pass diff --git a/oam/install_scripts/mcs-loadbrm.service.in b/oam/install_scripts/mcs-loadbrm.service.in index d9a634287..da58c9d78 100644 --- a/oam/install_scripts/mcs-loadbrm.service.in +++ b/oam/install_scripts/mcs-loadbrm.service.in @@ -2,12 +2,11 @@ Description=loadbrm PartOf=mcs-workernode.service Before=mcs-workernode.service -ConditionPathExists=/var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves_current [Service] Type=simple Environment="SKIP_OAM_INIT=1" -ExecStart=/usr/bin/env bash -c "/usr/bin/load_brm /var/lib/columnstore/data1/systemFiles/dbrm/$(cat /var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves_current)" +ExecStart=@ENGINE_BINDIR@/mcs-loadbrm.py [Install] WantedBy=mariadb-columnstore.service diff --git a/oam/install_scripts/mcs-start-storagemanager.py 
b/oam/install_scripts/mcs-start-storagemanager.py new file mode 100644 index 000000000..689a5922e --- /dev/null +++ b/oam/install_scripts/mcs-start-storagemanager.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +import configparser +import sys + +config = configparser.ConfigParser() +config.read('/etc/columnstore/storagemanager.cnf') + +storage = config['ObjectStorage']['service'] +region = config['S3']['region'] +bucket = config['S3']['bucket'] + +if storage.lower() == 's3' and not region.lower() == 'some_region' and not bucket.lower == 'some_bucket': + sys.exit(0) +sys.exit(1) diff --git a/oam/install_scripts/mcs-storagemanager.service.in b/oam/install_scripts/mcs-storagemanager.service.in new file mode 100644 index 000000000..297d89687 --- /dev/null +++ b/oam/install_scripts/mcs-storagemanager.service.in @@ -0,0 +1,16 @@ +[Unit] +Description=storagemanager +PartOf=mcs-workernode.service +Before=mcs-workernode.service +ConditionPathExists=/etc/columnstore/storagemanager.cnf +# FailureAction="exit" +# FailureActionExitStatus=0 + +[Service] +Type=simple +ExecStartPre=@ENGINE_BINDIR@/mcs-start-storagemanager.py +ExecStart=/usr/bin/env bash -c "LD_PRELOAD=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') exec @ENGINE_BINDIR@/StorageManager" + +[Install] +WantedBy=mariadb-columnstore.service +WantedBy=mcs-workernode.service From 4c489769233ad1f546491a04311aa5878d620eff Mon Sep 17 00:00:00 2001 From: Jose Date: Fri, 29 May 2020 17:30:52 +0000 Subject: [PATCH 19/24] MCOL-4011 SM Support Update --- oam/install_scripts/mcs-loadbrm.py | 28 ++++++++++++------- oam/install_scripts/mcs-loadbrm.service.in | 2 +- .../mcs-start-storagemanager.py | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/oam/install_scripts/mcs-loadbrm.py b/oam/install_scripts/mcs-loadbrm.py index e8b138802..21e182bc5 100644 --- a/oam/install_scripts/mcs-loadbrm.py +++ b/oam/install_scripts/mcs-loadbrm.py @@ -2,19 +2,30 @@ import configparser import subprocess +import 
xml.etree.ElementTree as ET -config = configparser.ConfigParser() -config.read('/etc/columnstore/storagemanager.cnf') +sm_config = configparser.ConfigParser() +sm_config.read('/etc/columnstore/storagemanager.cnf') +cs_config = ET.parse('/etc/columnstore/Columnstore.xml') +config_root = cs_config.getroot() -storage = config['ObjectStorage']['service'] -region = config['S3']['region'] -bucket = config['S3']['bucket'] +storage = sm_config['ObjectStorage']['service'] +region = sm_config['S3']['region'] +bucket = sm_config['S3']['bucket'] loadbrm = '/usr/bin/load_brm /var/lib/columnstore/data1/systemFiles/dbrm/{0}' brm_saves_current = '' -if storage.lower() == 's3' and not region.lower() == 'some_region' and not bucket.lower == 'some_bucket': +if storage.lower() == 's3' and not region.lower() == 'some_region' and not bucket.lower() == 'some_bucket': # load s3 brm = 'data1/systemFiles/dbrm/BRM_saves_current' + config_root.find('./Installation/DBRootStorageType').text = "StorageManager" + config_root.find('./StorageManager/Enabled').text = "Y" + + if config_root.find('./SystemConfig/DataFilePlugin') is None: + config_root.find('./SystemConfig').append(ET.Element("DataFilePlugin")) + + config_root.find('./SystemConfig/DataFilePlugin').text = "libcloudio.so" + cs_config.write('/etc/columnstore/Columnstore.xml') try: brm_saves_current = subprocess.check_output(['smcat', brm]) @@ -22,10 +33,7 @@ if storage.lower() == 's3' and not region.lower() == 'some_region' and not bucke # will happen when brm file does not exist pass else: - import xml.etree.ElementTree as ET - tree = ET.parse('/etc/columnstore/Columnstore.xml') - root = tree.getroot() - pmCount = int(root.find("./SystemModuleConfig/ModuleCount3").text) + pmCount = int(config_root.find('./SystemModuleConfig/ModuleCount3').text) brm = '/var/lib/columnstore/data1/systemFiles/dbrm/BRM_saves_current' if pmCount > 1: diff --git a/oam/install_scripts/mcs-loadbrm.service.in b/oam/install_scripts/mcs-loadbrm.service.in index 
da58c9d78..cc62d8966 100644 --- a/oam/install_scripts/mcs-loadbrm.service.in +++ b/oam/install_scripts/mcs-loadbrm.service.in @@ -1,7 +1,7 @@ [Unit] Description=loadbrm PartOf=mcs-workernode.service -Before=mcs-workernode.service +After=mcs-storagemanager.service [Service] Type=simple diff --git a/oam/install_scripts/mcs-start-storagemanager.py b/oam/install_scripts/mcs-start-storagemanager.py index 689a5922e..3eb3030f1 100644 --- a/oam/install_scripts/mcs-start-storagemanager.py +++ b/oam/install_scripts/mcs-start-storagemanager.py @@ -10,6 +10,6 @@ storage = config['ObjectStorage']['service'] region = config['S3']['region'] bucket = config['S3']['bucket'] -if storage.lower() == 's3' and not region.lower() == 'some_region' and not bucket.lower == 'some_bucket': +if storage.lower() == 's3' and not region.lower() == 'some_region' and not bucket.lower() == 'some_bucket': sys.exit(0) sys.exit(1) From 8fcce90d511e8546432fb75f304522251473daf1 Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Sun, 31 May 2020 10:21:47 +0200 Subject: [PATCH 20/24] cmake: don't FATAL_ERROR if prerequisites are not found skip building the plugin instead --- CMakeLists.txt | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a11483145..794dd8771 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,8 +123,17 @@ ENDIF () SET_PROPERTY(DIRECTORY PROPERTY EP_BASE ${CMAKE_CURRENT_BINARY_DIR}/external) LIST(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) -FIND_PACKAGE(Boost 1.53.0 REQUIRED COMPONENTS system filesystem thread regex date_time chrono atomic) -FIND_PACKAGE(BISON REQUIRED) +FIND_PACKAGE(Boost 1.53.0 COMPONENTS system filesystem thread regex date_time chrono atomic) +IF (NOT Boost_FOUND) + message(WARNING "Required Boost libraries not found!") + return() +ENDIF() + +FIND_PACKAGE(BISON) +IF (NOT BISON_FOUND) + message(WARNING "bison not found!") + return() +ENDIF() 
check_cxx_source_compiles("#include \n void main(){}" HAS_STD_FILESYSTEM) check_cxx_source_compiles("#include \n void main(){}" HAS_STD_EXPERIMENTAL_FILESYSTEM) @@ -149,24 +158,28 @@ FIND_PROGRAM(LEX_EXECUTABLE flex DOC "path to the flex executable") if(NOT LEX_EXECUTABLE) FIND_PROGRAM(LEX_EXECUTABLE lex DOC "path to the lex executable") if(NOT LEX_EXECUTABLE) - message(FATAL_ERROR "flex/lex not found!") + message(WARNING "flex/lex not found!") + return() endif() endif() FIND_PACKAGE(LibXml2) if (NOT LIBXML2_FOUND) - MESSAGE(FATAL_ERROR "Could not find a usable libxml2 development environment!") + MESSAGE(WARNING "Could not find a usable libxml2 development environment!") + return() endif() INCLUDE (FindSnappy) if (NOT SNAPPY_FOUND) - MESSAGE(FATAL_ERROR "Snappy not found please install snappy-devel for CentOS/RedHat or libsnappy-dev for Ubuntu/Debian") + MESSAGE(WARNING "Snappy not found please install snappy-devel for CentOS/RedHat or libsnappy-dev for Ubuntu/Debian") + return() endif() FIND_PROGRAM(AWK_EXECUTABLE awk DOC "path to the awk executable") if(NOT AWK_EXECUTABLE) - message(FATAL_ERROR "awk not found!") + message(WARNING "awk not found!") + return() endif() IF (NOT INSTALL_LAYOUT) From 2384328d4c87bbdb1952d2d379c5f5dc3ccf34ea Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Sun, 31 May 2020 10:23:47 +0200 Subject: [PATCH 21/24] cmake: output cleanup 1. reduce the number of cmake warnings 2. remove unused OLD policies 3. only warn about missing prerequisites once 4. 
start cmake output from columnstore version --- CMakeLists.txt | 75 +++++++++++---------------------- cmake/FindSnappy.cmake | 4 ++ cmake/columnstore_version.cmake | 2 +- utils/libmarias3/CMakeLists.txt | 16 ------- 4 files changed, 30 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 794dd8771..6f1976254 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,41 +1,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12) -# Avoid warnings in higher versions -if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" GREATER 2.6) - CMAKE_POLICY(VERSION 2.8) -endif() - -# explicitly set the policy to OLD -# (cannot use NEW, not everyone is on cmake-2.8.12 yet) -IF(POLICY CMP0022) - CMAKE_POLICY(SET CMP0022 OLD) -ENDIF() - -# We use the LOCATION target property (CMP0026) -# and get_target_property() for non-existent targets (CMP0045) -# and INSTALL_NAME_DIR (CMP0042) -IF(CMAKE_VERSION VERSION_EQUAL "3.0.0" OR - CMAKE_VERSION VERSION_GREATER "3.0.0") - CMAKE_POLICY(SET CMP0026 OLD) - CMAKE_POLICY(SET CMP0045 OLD) - CMAKE_POLICY(SET CMP0042 OLD) -ENDIF() - -MESSAGE(STATUS "Running cmake version ${CMAKE_VERSION}") - -OPTION(USE_CCACHE "reduce compile time with ccache." FALSE) -if(NOT USE_CCACHE) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "") -else() - find_program(CCACHE_FOUND ccache) - if(CCACHE_FOUND) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) - endif(CCACHE_FOUND) -endif() - IF(NOT INSTALL_LAYOUT) IF(NOT CMAKE_BUILD_TYPE) SET(CMAKE_BUILD_TYPE RELWITHDEBINFO CACHE STRING @@ -123,25 +88,37 @@ ENDIF () SET_PROPERTY(DIRECTORY PROPERTY EP_BASE ${CMAKE_CURRENT_BINARY_DIR}/external) LIST(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) +SET (ENGINE_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + +INCLUDE(columnstore_version) + +OPTION(USE_CCACHE "reduce compile time with ccache." 
FALSE) +if(NOT USE_CCACHE) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "") +else() + find_program(CCACHE_FOUND ccache) + if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) + endif(CCACHE_FOUND) +endif() + FIND_PACKAGE(Boost 1.53.0 COMPONENTS system filesystem thread regex date_time chrono atomic) IF (NOT Boost_FOUND) - message(WARNING "Required Boost libraries not found!") + MESSAGE_ONCE(CS_NO_BOOST "Required Boost libraries not found!") return() ENDIF() FIND_PACKAGE(BISON) IF (NOT BISON_FOUND) - message(WARNING "bison not found!") + MESSAGE_ONCE(CS_NO_BISON "bison not found!") return() ENDIF() check_cxx_source_compiles("#include \n void main(){}" HAS_STD_FILESYSTEM) check_cxx_source_compiles("#include \n void main(){}" HAS_STD_EXPERIMENTAL_FILESYSTEM) -SET (ENGINE_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - -INCLUDE(columnstore_version) - SET (PACKAGE columnstore) SET (PACKAGE_NAME columnstore) SET (PACKAGE_TARNAME columnstore) @@ -152,13 +129,11 @@ SET (PACKAGE_STRING columnstore-${PACKAGE_VERSION}) INCLUDE (configureEngine) - - FIND_PROGRAM(LEX_EXECUTABLE flex DOC "path to the flex executable") if(NOT LEX_EXECUTABLE) FIND_PROGRAM(LEX_EXECUTABLE lex DOC "path to the lex executable") if(NOT LEX_EXECUTABLE) - message(WARNING "flex/lex not found!") + MESSAGE_ONCE(CS_NO_LEX "flex/lex not found!") return() endif() endif() @@ -166,19 +141,19 @@ endif() FIND_PACKAGE(LibXml2) if (NOT LIBXML2_FOUND) - MESSAGE(WARNING "Could not find a usable libxml2 development environment!") + MESSAGE_ONCE(CS_NO_LIBXML "Could not find a usable libxml2 development environment!") return() endif() -INCLUDE (FindSnappy) +find_package(Snappy) if (NOT SNAPPY_FOUND) - MESSAGE(WARNING "Snappy not found please install snappy-devel for CentOS/RedHat or libsnappy-dev for Ubuntu/Debian") + MESSAGE_ONCE(CS_NO_SNAPPY "Snappy not found please install snappy-devel for 
CentOS/RedHat or libsnappy-dev for Ubuntu/Debian") return() endif() FIND_PROGRAM(AWK_EXECUTABLE awk DOC "path to the awk executable") if(NOT AWK_EXECUTABLE) - message(WARNING "awk not found!") + MESSAGE_ONCE(CS_NO_AWK "awk not found!") return() endif() @@ -253,8 +228,8 @@ IF (NOT SERVER_BUILD_DIR) SET (SERVER_BUILD_DIR ${SERVER_SOURCE_ROOT_DIR}) ENDIF() -MESSAGE("SERVER_BUILD_INCLUDE_DIR = ${SERVER_BUILD_INCLUDE_DIR}") -MESSAGE("SERVER_SOURCE_ROOT_DIR = ${SERVER_SOURCE_ROOT_DIR}") +MESSAGE_ONCE(SERVER_BUILD_INCLUDE_DIR "SERVER_BUILD_INCLUDE_DIR = ${SERVER_BUILD_INCLUDE_DIR}") +MESSAGE_ONCE(SERVER_SOURCE_ROOT_DIR "SERVER_SOURCE_ROOT_DIR = ${SERVER_SOURCE_ROOT_DIR}") IF (INSTALL_LAYOUT) SET (MARIADB_CLIENT_LIBS libmariadb) diff --git a/cmake/FindSnappy.cmake b/cmake/FindSnappy.cmake index 6aaf92229..7f8f551ec 100644 --- a/cmake/FindSnappy.cmake +++ b/cmake/FindSnappy.cmake @@ -17,6 +17,10 @@ # SNAPPY_LIBRARIES The snappy library/libraries # SNAPPY_INCLUDE_DIR The location of snappy headers +if(DEFINED SNAPPY_ROOT_DIR) + set(Snappy_FIND_QUIET) +endif() + find_path(SNAPPY_ROOT_DIR NAMES include/snappy.h ) diff --git a/cmake/columnstore_version.cmake b/cmake/columnstore_version.cmake index 5acdcc8a6..fe373a82d 100644 --- a/cmake/columnstore_version.cmake +++ b/cmake/columnstore_version.cmake @@ -34,7 +34,7 @@ IF(NOT "${CS_MAJOR_VERSION}" MATCHES "[0-9]+" OR ENDIF() SET(VERSION "${CS_MAJOR_VERSION}.${CS_MINOR_VERSION}.${CS_PATCH_VERSION}${CS_EXTRA_VERSION}") - MESSAGE(STATUS "MariaDB-Columnstore ${VERSION}") + MESSAGE("== MariaDB-Columnstore ${VERSION}") IF (NOT INSTALL_LAYOUT) SET(CPACK_PACKAGE_VERSION_MAJOR ${CS_MAJOR_VERSION}) SET(CPACK_PACKAGE_VERSION_MINOR ${CS_MINOR_VERSION}) diff --git a/utils/libmarias3/CMakeLists.txt b/utils/libmarias3/CMakeLists.txt index 1be7f11e6..5de4533a5 100644 --- a/utils/libmarias3/CMakeLists.txt +++ b/utils/libmarias3/CMakeLists.txt @@ -1,21 +1,5 @@ set(S3API_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libmarias3 CACHE INTERNAL "S3API_DIR") 
-find_package(Git QUIET) - -if(GIT_FOUND AND EXISTS ${ENGINE_SRC_DIR}/.git) -# Update submodules as needed - option(GIT_SUBMODULE "Check submodules during build" ON) - if(GIT_SUBMODULE) - message(STATUS "Submodule update") - execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive - WORKING_DIRECTORY ${ENGINE_SRC_DIR} - RESULT_VARIABLE GIT_SUBMOD_RESULT) - if(NOT GIT_SUBMOD_RESULT EQUAL "0") - message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules") - endif() - endif() -endif() - SET(S3_SOURCES ${S3API_DIR}/src/debug.c ${S3API_DIR}/src/error.c ${S3API_DIR}/src/marias3.c From 87efbea4b92eb8d731107ccbc6717c9ed6cdefa8 Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Sun, 31 May 2020 11:19:32 +0200 Subject: [PATCH 22/24] .gitignore --- .gitignore | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.gitignore b/.gitignore index 7328f65e5..02fe345cb 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ CMakeCache.txt CMakeFiles CMakeScripts Makefile +VERSION.dep cmake_install.cmake install_manifest.txt CTestTestfile.cmake @@ -133,8 +134,18 @@ oam/install_scripts/columnstore.service oam/install_scripts/columnstoreSyslogSetup.sh oam/install_scripts/columnstore_module_installer.sh oam/install_scripts/disable-rep-columnstore.sh +oam/install_scripts/mariadb-columnstore.service oam/install_scripts/mariadb-command-line.sh oam/install_scripts/master-rep-columnstore.sh +oam/install_scripts/mcs-controllernode.service +oam/install_scripts/mcs-ddlproc.service +oam/install_scripts/mcs-dmlproc.service +oam/install_scripts/mcs-exemgr.service +oam/install_scripts/mcs-loadbrm.service +oam/install_scripts/mcs-primproc.service +oam/install_scripts/mcs-stop-controllernode.sh +oam/install_scripts/mcs-workernode.service +oam/install_scripts/mcs-writeengineserver.service oam/install_scripts/mcs_module_installer.sh oam/install_scripts/slave-rep-columnstore.sh oam/install_scripts/startupTests.sh @@ 
-147,4 +158,5 @@ bin external gitversionEngine mcsconfig.h +storage-manager/testS3Connection storage-manager/unit_tests From 5b857db7e8089c1724eb392d98dff9e2c55a6872 Mon Sep 17 00:00:00 2001 From: Roman Nozdrin Date: Sun, 31 May 2020 18:41:53 +0300 Subject: [PATCH 23/24] Systemd doesn't complain about relative path anymore. --- oam/install_scripts/mcs-primproc.service.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oam/install_scripts/mcs-primproc.service.in b/oam/install_scripts/mcs-primproc.service.in index baee874d4..bb5b843c8 100644 --- a/oam/install_scripts/mcs-primproc.service.in +++ b/oam/install_scripts/mcs-primproc.service.in @@ -7,7 +7,7 @@ After=mcs-controllernode.service [Service] Type=simple ExecStart=/usr/bin/env bash -c "LD_PRELOAD=$(ldconfig -p | grep -m1 libjemalloc | awk '{print $1}') exec @ENGINE_BINDIR@/PrimProc" -ExecStartPost=sleep 2 +ExecStartPost=/bin/sleep 2 Restart=on-failure TimeoutStopSec=2 From 4bddc92092f85b8eef361ef5753497d6a21780b3 Mon Sep 17 00:00:00 2001 From: Patrick LeBlanc Date: Mon, 1 Jun 2020 12:52:43 -0400 Subject: [PATCH 24/24] MCOL-4010 - fixes compilation errors on x64 w/-Werror Merged in Sergei's patch. 
--- dbcon/mysql/ha_mcs_impl.cpp | 2 +- dbcon/mysql/ha_window_function.cpp | 2 +- storage-manager/src/AppendTask.cpp | 2 +- storage-manager/src/Config.cpp | 2 +- storage-manager/src/MetadataFile.cpp | 4 ++-- storage-manager/src/WriteTask.cpp | 2 +- storage-manager/src/unit_tests.cpp | 6 +++--- utils/funcexp/func_conv.cpp | 25 ++++--------------------- utils/funcexp/functor_str.h | 4 ++-- 9 files changed, 16 insertions(+), 33 deletions(-) diff --git a/dbcon/mysql/ha_mcs_impl.cpp b/dbcon/mysql/ha_mcs_impl.cpp index c920c8657..127abb078 100644 --- a/dbcon/mysql/ha_mcs_impl.cpp +++ b/dbcon/mysql/ha_mcs_impl.cpp @@ -337,7 +337,7 @@ void storeNumericField(Field** f, int64_t value, CalpontSystemCatalog::ColType& if (ct.colDataType == CalpontSystemCatalog::DECIMAL) dataconvert::DataConvert::decimalToString(value, (unsigned)ct.scale, tmp, 25, ct.colDataType); else - snprintf(tmp, 25, "%ld", value); + snprintf(tmp, 25, "%lld", (long long)value); f2->store(tmp, strlen(tmp), f2->charset()); break; diff --git a/dbcon/mysql/ha_window_function.cpp b/dbcon/mysql/ha_window_function.cpp index f70715118..7c039cc75 100644 --- a/dbcon/mysql/ha_window_function.cpp +++ b/dbcon/mysql/ha_window_function.cpp @@ -442,7 +442,7 @@ ReturnedColumn* buildWindowFunctionColumn(Item* item, gp_walk_info& gwi, bool& n { case Item_sum::UDF_SUM_FUNC: { - uint64_t bRespectNulls = (ac->getUDAFContext().getRunFlag(mcsv1sdk::UDAF_IGNORE_NULLS)) ? 0 : 1; + unsigned long bRespectNulls = (ac->getUDAFContext().getRunFlag(mcsv1sdk::UDAF_IGNORE_NULLS)) ? 0 : 1; char sRespectNulls[18]; sprintf(sRespectNulls, "%lu", bRespectNulls); srcp.reset(new ConstantColumn(sRespectNulls, (uint64_t)bRespectNulls, ConstantColumn::NUM)); // IGNORE/RESPECT NULLS. 
1 => RESPECT diff --git a/storage-manager/src/AppendTask.cpp b/storage-manager/src/AppendTask.cpp index f6d5f35cd..25a2da94d 100644 --- a/storage-manager/src/AppendTask.cpp +++ b/storage-manager/src/AppendTask.cpp @@ -74,7 +74,7 @@ bool AppendTask::run() while (readCount < cmd->count) { - uint toRead = min(cmd->count - readCount, bufsize); + uint toRead = min(static_cast(cmd->count - readCount), bufsize); success = read(&databuf[0], toRead); check_error("AppendTask read data", false); if (success==0) diff --git a/storage-manager/src/Config.cpp b/storage-manager/src/Config.cpp index 2ba96e3e7..2238e0a2d 100644 --- a/storage-manager/src/Config.cpp +++ b/storage-manager/src/Config.cpp @@ -174,7 +174,7 @@ string use_envvar(const boost::smatch &envvar) string expand_numbers(const boost::smatch &match) { - long num = stol(match[1].str()); + long long num = stol(match[1].str()); char suffix = (char) ::tolower(match[2].str()[0]); if (suffix == 't') diff --git a/storage-manager/src/MetadataFile.cpp b/storage-manager/src/MetadataFile.cpp index 028b1d22c..cde04925d 100644 --- a/storage-manager/src/MetadataFile.cpp +++ b/storage-manager/src/MetadataFile.cpp @@ -461,8 +461,8 @@ void MetadataFile::printObjects() const { BOOST_FOREACH(const boost::property_tree::ptree::value_type &v, jsontree->get_child("objects")) { - printf("Name: %s Length: %lu Offset: %lu\n", v.second.get("key").c_str(), - v.second.get("length"), v.second.get("offset")); + printf("Name: %s Length: %zu Offset: %lld\n", v.second.get("key").c_str(), + v.second.get("length"), (long long)v.second.get("offset")); } } diff --git a/storage-manager/src/WriteTask.cpp b/storage-manager/src/WriteTask.cpp index fc7c3fce5..70cc7d7b0 100644 --- a/storage-manager/src/WriteTask.cpp +++ b/storage-manager/src/WriteTask.cpp @@ -74,7 +74,7 @@ bool WriteTask::run() while (readCount < cmd->count) { - uint toRead = min(cmd->count - readCount, bufsize); + uint toRead = min(static_cast(cmd->count - readCount), bufsize); success = 
read(&databuf[0], toRead); check_error("WriteTask read data", false); if (success==0) diff --git a/storage-manager/src/unit_tests.cpp b/storage-manager/src/unit_tests.cpp index c77f37d11..b3460f447 100644 --- a/storage-manager/src/unit_tests.cpp +++ b/storage-manager/src/unit_tests.cpp @@ -433,7 +433,7 @@ bool writetask() WriteTask w(clientSock, hdr->payloadLen); ssize_t result = ::write(sessionSock, cmd, hdr->payloadLen); - assert(result==(hdr->payloadLen)); + assert(result == static_cast(hdr->payloadLen)); w.run(); @@ -1065,7 +1065,7 @@ bool copytask(bool connectionTest=false) len -= 2; ssize_t result = ::write(sessionSock, buf, len); - assert(result==len); + assert(result==static_cast(len)); int err=0; @@ -1805,7 +1805,7 @@ void shortMsg() WriteTask w(clientSock, hdrWrite->payloadLen); ssize_t result = ::write(sessionSock, cmdWrite, hdrWrite->payloadLen); - assert(result==(hdrWrite->payloadLen)); + assert(result==static_cast(hdrWrite->payloadLen)); w.run(); diff --git a/utils/funcexp/func_conv.cpp b/utils/funcexp/func_conv.cpp index 4e80eed25..3afc13790 100644 --- a/utils/funcexp/func_conv.cpp +++ b/utils/funcexp/func_conv.cpp @@ -132,35 +132,18 @@ namespace helpers const char* convNumToStr(int64_t val, char* dst, int radix) { if (radix == 16 || radix == -16) -#ifdef _MSC_VER - sprintf(dst, "%llX", val); + sprintf(dst, "%llX", (long long)val); -#else - sprintf(dst, "%lX", val); -#endif else if (radix == 8 || radix == -8) -#ifdef _MSC_VER - sprintf(dst, "%llo", val); + sprintf(dst, "%llo", (long long)val); -#else - sprintf(dst, "%lo", val); -#endif else if (radix == 10) { - uint64_t uval = static_cast(val); -#ifdef _MSC_VER - sprintf(dst, "%llu", uval); -#else - sprintf(dst, "%lu", uval); -#endif + sprintf(dst, "%llu", (unsigned long long)val); } else if (radix == -10) -#ifdef _MSC_VER - sprintf(dst, "%lld", val); + sprintf(dst, "%lld", (long long)val); -#else - sprintf(dst, "%ld", val); -#endif else if (radix == 2 || radix == -2) { char tmp[65]; diff --git 
a/utils/funcexp/functor_str.h b/utils/funcexp/functor_str.h index 2144c44ae..aa17fde97 100644 --- a/utils/funcexp/functor_str.h +++ b/utils/funcexp/functor_str.h @@ -122,7 +122,7 @@ protected: // [ the default format in treenode.h is fixed-point notation ] char buf[20]; long double floatVal; - int64_t exponent; + int exponent; long double base; switch (fp->data()->resultType().colDataType) @@ -157,7 +157,7 @@ protected: { snprintf(buf, 20, "%.5Lf", base); fFloatStr = execplan::removeTrailing0(buf, 20); - snprintf(buf, 20, "e%02ld", exponent); + snprintf(buf, 20, "e%02d", exponent); fFloatStr += buf; }