1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-04-21 19:45:56 +03:00
mariadb-columnstore-engine/writeengine/redistribute/we_redistributeworkerthread.cpp
Roman Nozdrin 4fe9cd64a3
Revert "No boost condition (#2822)" (#2828)
This reverts commit f916e64927cd81569327014f20c4cc0b8aca40ff.
2023-04-22 15:49:50 +03:00

1525 lines
44 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/*
* $Id: we_redistributeworkerthread.cpp 4646 2013-05-23 20:58:08Z xlou $
*/
#include <iostream>
#include <set>
#include <vector>
#include <cassert>
#include <stdexcept>
#include <sstream>
#include <string>
#include <unistd.h>
using namespace std;
#include "boost/scoped_ptr.hpp"
#include "boost/scoped_array.hpp"
#include "boost/thread/mutex.hpp"
#include "boost/filesystem/path.hpp"
#include "boost/filesystem/operations.hpp"
using namespace boost;
#include "installdir.h"
#include "configcpp.h"
using namespace config;
#include "liboamcpp.h"
#include "oamcache.h"
using namespace oam;
#include "dbrm.h"
using namespace BRM;
#include "messagequeue.h"
#include "bytestream.h"
using namespace messageqcpp;
#include "calpontsystemcatalog.h"
using namespace execplan;
#include "exceptclasses.h"
using namespace logging;
#include "IDBFileSystem.h"
#include "IDBPolicy.h"
using namespace idbdatafile;
#include "we_fileop.h"
#include "we_messages.h"
#include "we_convertor.h"
#include "we_redistributedef.h"
#include "we_redistributecontrol.h"
#include "we_redistributeworkerthread.h"
namespace redistribute
{
// static variables
boost::mutex RedistributeWorkerThread::fActionMutex;
volatile bool RedistributeWorkerThread::fStopAction = false;
volatile bool RedistributeWorkerThread::fCommitted = false;
string RedistributeWorkerThread::fWesInUse;
RedistributeWorkerThread::RedistributeWorkerThread(ByteStream& bs, IOSocket& ios)
: fBs(bs), fIOSocket(ios), fTableLockId(0), fErrorCode(RED_EC_OK), fNewFilePtr(NULL), fOldFilePtr(NULL)
{
fWriteBuffer.reset(new char[CHUNK_SIZE]);
}
RedistributeWorkerThread::~RedistributeWorkerThread()
{
boost::mutex::scoped_lock lock(fActionMutex);
if (fNewFilePtr)
closeFile(fNewFilePtr);
if (fOldFilePtr)
closeFile(fOldFilePtr);
// make sure releasing the table lock.
if (fTableLockId > 0)
{
fDbrm->releaseTableLock(fTableLockId);
// use the interface, line# replaced with lock id.
logMessage("Releasing table lock in destructor. ", fTableLockId);
}
}
void RedistributeWorkerThread::operator()()
{
memcpy(&fMsgHeader, fBs.buf(), sizeof(RedistributeMsgHeader));
fBs.advance(sizeof(RedistributeMsgHeader));
if (fMsgHeader.messageId == RED_ACTN_REQUEST)
handleRequest();
else if (fMsgHeader.messageId == RED_ACTN_STOP)
handleStop();
else if (fMsgHeader.messageId == RED_DATA_INIT)
handleData();
else
handleUnknowJobMsg();
}
void RedistributeWorkerThread::handleRequest()
{
try
{
// clear stop flag if ever set.
{
boost::mutex::scoped_lock lock(fActionMutex);
fStopAction = false;
fCommitted = false;
}
if (setup() == 0)
{
if (fBs.length() >= sizeof(RedistributePlanEntry))
{
memcpy(&fPlanEntry, fBs.buf(), sizeof(RedistributePlanEntry));
fBs.advance(sizeof(RedistributePlanEntry));
OamCache::dbRootPMMap_t dbrootToPM = fOamCache->getDBRootToPMMap();
fMyId.first = fPlanEntry.source;
fMyId.second = (*dbrootToPM)[fMyId.first];
fPeerId.first = fPlanEntry.destination;
fPeerId.second = (*dbrootToPM)[fPeerId.first];
if (grabTableLock() == 0)
{
// workaround extentmap slow update
sleep(1);
// build segment & entry list after grabbing the table lock.
if (buildEntryList() == 0)
{
if (sendData() == 0)
{
// do bulk update
updateDbrm();
}
}
// conversation to peer after got table lock
// confirm commit or abort
confirmToPeer();
}
}
}
}
catch (const std::exception&)
{
}
catch (...)
{
}
sendResponse(RED_ACTN_REQUEST);
boost::mutex::scoped_lock lock(fActionMutex);
fWesInUse.clear();
fMsgQueueClient.reset();
fStopAction = false;
fCommitted = false;
}
int RedistributeWorkerThread::setup()
{
int ret = 0;
try
{
fConfig = Config::makeConfig();
fOamCache = oam::OamCache::makeOamCache();
fDbrm = RedistributeControl::instance()->fDbrm;
// for segment file # workaround
// string tmp = fConfig->getConfig("ExtentMap", "FilesPerColumnPartition");
// int filesPerPartition = fConfig->fromText(tmp);
// if (filesPerPartition == 0)
// filesPerPartition = DEFAULT_FILES_PER_COLUMN_PARTITION;
// int dbrootNum = fOamCache->getDBRootNums().size();
// if (dbrootNum == 0)
// {
// fErrorMsg = "OamCache->getDBRootNums() failed.";
// logMessage(fErrorMsg, __LINE__);
// return 1;
// }
// if ((filesPerPartition % dbrootNum) != 0)
// {
// fErrorMsg = "ExtentMap::FilesPerColumnPartition is not a multiple of db root number.";
// logMessage(fErrorMsg, __LINE__);
// return 1;
// }
// fSegPerRoot = filesPerPartition / dbrootNum;
}
catch (const std::exception&)
{
ret = 2;
}
catch (...)
{
ret = 2;
}
return ret;
}
int RedistributeWorkerThread::grabTableLock()
{
fTableLockId = 0;
try
{
vector<uint32_t> pms;
pms.push_back(fMyId.second);
if (fMyId.second != fPeerId.second)
pms.push_back(fPeerId.second);
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 100 * 1000000;
while (fTableLockId == 0 && !fStopAction)
{
// make sure it's not stopped.
if (fStopAction)
return RED_EC_USER_STOP;
// always wait long enough for ddl/dml/cpimport to get table lock
// for now, triple the ddl/dml/cpimport retry interval: 3 * 100ms
struct timespec tmp = ts;
while (nanosleep(&tmp, &ts) < 0)
;
tmp = ts;
try
{
uint32_t processID = ::getpid();
int32_t txnId = 0;
int32_t sessionId = 0;
string processName = "WriteEngineServer";
fTableLockId = fDbrm->getTableLock(pms, fPlanEntry.table, &processName, &processID, &sessionId,
&txnId, BRM::LOADING);
}
catch (const std::exception& ex)
{
fErrorCode = RED_EC_IDB_HARD_FAIL;
logMessage(string("getTableLock exception") + ex.what(), __LINE__);
}
catch (...)
{
fErrorCode = RED_EC_IDB_HARD_FAIL;
logMessage("getTableLock exception", __LINE__);
// no need to throw
// throw IDBExcept(ERR_HARD_FAILURE);
}
}
}
catch (const std::exception& ex)
{
// use the interface, line# replaced with lock id.
logMessage(string(ex.what()) + " when try to get table lock: ", fTableLockId);
}
catch (...)
{
// use the interface, line# replaced with lock id.
logMessage("Unknown exception when try to get table lock: ", fTableLockId);
}
// use the interface, line# replaced with lock id.
logMessage("Got table lock: ", fTableLockId);
return ((fTableLockId > 0) ? 0 : -1);
}
int RedistributeWorkerThread::buildEntryList()
{
int ret = 0;
try
{
// get all column oids
boost::shared_ptr<CalpontSystemCatalog> csc = CalpontSystemCatalog::makeCalpontSystemCatalog(0);
const CalpontSystemCatalog::TableName table = csc->tableName(fPlanEntry.table);
CalpontSystemCatalog::RIDList cols = csc->columnRIDs(table, true);
CalpontSystemCatalog::OID tableAuxColOid = csc->tableAUXColumnOID(table);
for (CalpontSystemCatalog::RIDList::iterator i = cols.begin(); i != cols.end(); i++)
fOids.push_back(i->objnum);
CalpontSystemCatalog::DictOIDList dicts = csc->dictOIDs(table);
for (CalpontSystemCatalog::DictOIDList::iterator i = dicts.begin(); i != dicts.end(); i++)
fOids.push_back(i->dictOID);
if (tableAuxColOid > 3000)
{
fOids.push_back(tableAuxColOid);
}
bool firstOid = true; // for adding segments, all columns have the same lay out.
uint16_t source = fPlanEntry.source;
uint16_t target = fPlanEntry.destination;
uint16_t partition = fPlanEntry.partition;
uint32_t minWidth = 8; // column width greater than 8 will be dictionary.
for (vector<int64_t>::iterator i = fOids.begin(); i != fOids.end(); i++)
{
vector<EMEntry> entries;
int rc = fDbrm->getExtents(*i, entries, false, false, true);
if (rc != 0 || entries.size() == 0)
{
ostringstream oss;
oss << "Error in DBRM getExtents; oid:" << *i << "; returnCode: " << rc;
throw runtime_error(oss.str());
}
// same oid has the same column width
uint32_t colWid = entries.front().colWid;
vector<EMEntry>::iterator targetHwmEntry = entries.end(); // for HWM_0 workaround
if (colWid > 0 && colWid < minWidth)
minWidth = colWid;
for (vector<EMEntry>::iterator j = entries.begin(); j != entries.end(); j++)
{
if (j->dbRoot == source && j->partitionNum == partition)
{
fUpdateRtEntries.push_back(BulkUpdateDBRootArg(j->range.start, target));
if (firstOid)
fSegments.insert(j->segmentNum);
}
#if 0
else if (j->dbRoot == target && j->partitionNum == partition)
{
// the partition already exists on the target dbroot
fErrorCode = RED_EC_PART_EXIST_ON_TARGET;
ostringstream oss;
oss << "oid:" << *i << ", partition:" << partition << " exists, source:"
<< source << ", destination:" << target;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
#endif
// workaround for HWM_0 of highest extents of the oid on target dbroot.
if (j->dbRoot == target)
{
if (targetHwmEntry == entries.end())
{
targetHwmEntry = j;
}
else
{
if (j->partitionNum > targetHwmEntry->partitionNum)
{
targetHwmEntry = j;
}
else if (j->partitionNum == targetHwmEntry->partitionNum &&
j->blockOffset > targetHwmEntry->blockOffset)
{
targetHwmEntry = j;
}
else if (j->partitionNum == targetHwmEntry->partitionNum &&
j->blockOffset == targetHwmEntry->blockOffset &&
j->segmentNum > targetHwmEntry->segmentNum)
{
targetHwmEntry = j;
}
}
}
} // for em entries
// HWM_0 workaround
// HWM 0 has two possibilities:
// 1. segment file has one extent, and the first block is not full yet.
// 2. segment file has more than one extents, the HWM of the extents other than
// the last extent is set to 0, that is only last extent has none-zero HWM.
// In tuple-bps::makeJob, there is a check to handle last extent has 0 hwm:
// (scannedExtents[i].HWM == 0 && (int) i < lastExtent[scannedExtents[i].dbRoot-1])
// lbidsToScan = scannedExtents[i].range.size * 1024;
// Based on this check, the number of block to scan is caculated.
// After redistributing the partitions, the original case 1 extent on destination
// may not be the highest extent in the dbroot, and result in a full extent scan.
// This scan will fail because there is no enough blocks if this is an abbreviated
// extent or not enough chunks if the column is compressed.
// The workaround is to bump up the HWM to 1 if moved in partitions are greater.
if (targetHwmEntry != entries.end() && // exclude no extent case
targetHwmEntry->colWid > 0 && // exclude dictionary
targetHwmEntry->HWM == 0 && targetHwmEntry->partitionNum < partition)
{
BulkSetHWMArg arg;
arg.oid = *i;
arg.partNum = targetHwmEntry->partitionNum;
arg.segNum = targetHwmEntry->segmentNum;
arg.hwm = targetHwmEntry->colWid; // will correct later based on minWidth
fUpdateHwmEntries.push_back(arg);
}
} // for oids
// HWM_0 workaround
// Caculate the min(column width), the HWM(bump up to) for each column.
if (fUpdateHwmEntries.size() > 0)
{
// update the HWM based in column width, not include dictionary extents
for (vector<BRM::BulkSetHWMArg>::iterator j = fUpdateHwmEntries.begin(); j != fUpdateHwmEntries.end();
j++)
{
if (j->hwm <= 8)
j->hwm /= minWidth;
else
j->hwm = 1; // not needed, but in case
}
}
}
catch (const std::exception& ex)
{
fErrorCode = RED_EC_EXTENT_ERROR;
fErrorMsg = ex.what();
logMessage(fErrorMsg, __LINE__);
ret = fErrorCode;
}
catch (...)
{
fErrorCode = RED_EC_EXTENT_ERROR;
fErrorMsg = "get extent error.";
logMessage(fErrorMsg, __LINE__);
ret = fErrorCode;
}
return ret;
}
int RedistributeWorkerThread::sendData()
{
WriteEngine::FileOp fileOp; // just to get filename, not for file operations
bool remotePM = (fMyId.second != fPeerId.second);
uint32_t dbroot = fPlanEntry.source;
uint32_t partition = fPlanEntry.partition;
int16_t source = fPlanEntry.source;
int16_t dest = fPlanEntry.destination;
IDBDataFile::Types fileType = (IDBPolicy::useHdfs() ? IDBDataFile::HDFS
: IDBPolicy::useCloud() ? IDBDataFile::CLOUD
: IDBDataFile::UNBUFFERED);
IDBFileSystem& fs = IDBFileSystem::getFs(fileType);
if ((remotePM) && (fileType != IDBDataFile::HDFS))
{
if (connectToWes(fPeerId.second) != 0)
{
fErrorCode = RED_EC_CONNECT_FAIL;
ostringstream oss;
oss << "Failed to connect to PM" << fPeerId.second << " from PM" << fMyId.second;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
// start to send each segment file
uint32_t seq = 0;
ByteStream bs;
// start conversion with peer, hand shaking.
RedistributeMsgHeader header(dest, source, seq++, RED_DATA_INIT);
bs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE;
bs.append((const ByteStream::byte*)&header, sizeof(header));
fMsgQueueClient->write(bs);
SBS sbs = fMsgQueueClient->read();
if (!checkDataTransferAck(sbs, 0))
return fErrorCode;
for (vector<int64_t>::iterator i = fOids.begin(); i != fOids.end(); i++)
{
for (set<int16_t>::iterator j = fSegments.begin(); j != fSegments.end(); ++j)
{
char fileName[WriteEngine::FILE_NAME_SIZE];
int rc = fileOp.oid2FileName(*i, fileName, false, dbroot, partition, *j);
if (rc == WriteEngine::NO_ERROR)
{
ostringstream oss;
oss << "<=redistributing: " << fileName << ", oid=" << *i << ", db=" << source
<< ", part=" << partition << ", seg=" << *j << " to db=" << dest;
logMessage(oss.str(), __LINE__);
}
else
{
fErrorCode = RED_EC_OID_TO_FILENAME;
ostringstream oss;
oss << "Failed to get file name: oid=" << *i << ", dbroot=" << dbroot << ", partition=" << partition
<< ", segment=" << *j;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
if (fOldFilePtr != NULL)
closeFile(fOldFilePtr);
errno = 0;
FILE* fOldFilePtr = fopen(fileName, "rb");
if (fOldFilePtr != NULL)
{
ostringstream oss;
oss << "open " << fileName << ", oid=" << *i << ", dbroot=" << dbroot << ", partition=" << partition
<< ", segment=" << *j << ". " << fOldFilePtr;
logMessage(oss.str(), __LINE__);
}
else
{
int e = errno;
fErrorCode = RED_EC_OPEN_FILE_FAIL;
ostringstream oss;
oss << "Failed to open " << fileName << ", oid=" << *i << ", dbroot=" << dbroot
<< ", partition=" << partition << ", segment=" << *j << ". " << strerror(e) << " (" << e << ")";
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
// add to set for remove after commit
addToDirSet(fileName, true);
char chunk[CHUNK_SIZE];
errno = 0;
fseek(fOldFilePtr, 0, SEEK_END); // go to end of file
long fileSize = ftell(fOldFilePtr); // get current file size
if (fileSize < 0)
{
int e = errno;
ostringstream oss;
oss << "Fail to tell file size: " << strerror(e) << " (" << e << ")";
fErrorMsg = oss.str();
fErrorCode = RED_EC_FSEEK_FAIL;
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
// send start message to have the file of fileSize created at target dbroot.
bs.restart();
RedistributeMsgHeader header(dest, source, seq++, RED_DATA_START);
RedistributeDataControl dataControl(*i, dest, partition, *j, fileSize);
bs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE;
bs.append((const ByteStream::byte*)&header, sizeof(header));
bs.append((const ByteStream::byte*)&dataControl, sizeof(dataControl));
fMsgQueueClient->write(bs);
sbs = fMsgQueueClient->read();
if (!checkDataTransferAck(sbs, fileSize))
return fErrorCode;
// now send the file chunk by chunk.
rewind(fOldFilePtr);
int64_t bytesLeft = fileSize;
size_t bytesSend = CHUNK_SIZE;
header.messageId = RED_DATA_CONT;
while (bytesLeft > 0)
{
if (fStopAction)
{
closeFile(fOldFilePtr);
fOldFilePtr = NULL;
return RED_EC_USER_STOP;
}
if (bytesLeft < (long)CHUNK_SIZE)
bytesSend = bytesLeft;
errno = 0;
size_t n = fread(chunk, 1, bytesSend, fOldFilePtr);
if (n != bytesSend)
{
int e = errno;
ostringstream oss;
oss << "Fail to read: " << strerror(e) << " (" << e << ")";
fErrorMsg = oss.str();
fErrorCode = RED_EC_FREAD_FAIL;
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
header.sequenceNum = seq++;
bs.restart();
bs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE;
bs.append((const ByteStream::byte*)&header, sizeof(header));
bs << (size_t)bytesSend;
bs.append((const ByteStream::byte*)chunk, bytesSend);
fMsgQueueClient->write(bs);
sbs = fMsgQueueClient->read();
if (!checkDataTransferAck(sbs, bytesSend))
return fErrorCode;
bytesLeft -= bytesSend;
}
closeFile(fOldFilePtr);
fOldFilePtr = NULL;
header.messageId = RED_DATA_FINISH;
header.sequenceNum = seq++;
bs.restart();
bs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE;
bs.append((const ByteStream::byte*)&header, sizeof(header));
bs << (uint64_t)fileSize;
fMsgQueueClient->write(bs);
sbs = fMsgQueueClient->read();
if (!checkDataTransferAck(sbs, fileSize))
return fErrorCode;
} // segments
} // for oids
} // remote peer non-hdfs
else // local or HDFS file copy
{
std::map<int, std::string> rootToPathMap;
// use cp, in case failed in middle. May consider to use rename if possible.
for (vector<int64_t>::iterator i = fOids.begin(); i != fOids.end(); i++)
{
for (set<int16_t>::iterator j = fSegments.begin(); j != fSegments.end(); ++j)
{
if (fStopAction)
return RED_EC_USER_STOP;
if (fileType == IDBDataFile::HDFS) // HDFS file copy
{
string sourceName;
int rc = buildFullHdfsPath(rootToPathMap, // map of root to path
*i, // OID
source, // dbroot
partition, // partition
*j, // segment
sourceName); // full path name
if (rc != 0)
{
fErrorCode = RED_EC_OID_TO_FILENAME;
ostringstream oss;
oss << "Failed to get src file name: oid=" << *i << ", dbroot=" << source
<< ", partition=" << partition << ", segment=" << *j;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
string destName;
rc = buildFullHdfsPath(rootToPathMap, // map of root to path
*i, // OID
dest, // dbroot
partition, // partition
*j, // segment
destName); // full path name
if (rc != 0)
{
fErrorCode = RED_EC_OID_TO_FILENAME;
ostringstream oss;
oss << "Failed to get dest file name: oid=" << *i << ", dbroot=" << dest
<< ", partition=" << partition << ", segment=" << *j;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
ostringstream oss;
oss << "<=redistributing(hdfs): " << sourceName << ", oid=" << *i << ", db=" << source
<< ", part=" << partition << ", seg=" << *j << " to db=" << dest;
logMessage(oss.str(), __LINE__);
// add to set for remove after commit/abort
addToDirSet(sourceName.c_str(), true);
addToDirSet(destName.c_str(), false);
int ret = fs.copyFile(sourceName.c_str(), destName.c_str());
if (ret != 0)
{
fErrorCode = RED_EC_COPY_FILE_FAIL;
ostringstream oss;
oss << "Failed to copy " << sourceName << " to " << destName << "; error is: " << strerror(errno);
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
}
else // local file copy
{
char sourceName[WriteEngine::FILE_NAME_SIZE];
int rc = fileOp.oid2FileName(*i, sourceName, false, source, partition, *j);
if (rc != WriteEngine::NO_ERROR)
{
fErrorCode = RED_EC_OID_TO_FILENAME;
ostringstream oss;
oss << "Failed to get file name: oid=" << *i << ", dbroot=" << source
<< ", partition=" << partition << ", segment=" << *j;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
char destName[WriteEngine::FILE_NAME_SIZE];
rc = fileOp.oid2FileName(*i, destName, true, dest, partition, *j);
if (rc != WriteEngine::NO_ERROR)
{
fErrorCode = RED_EC_OID_TO_FILENAME;
ostringstream oss;
oss << "Failed to get file name: oid=" << *i << ", dbroot=" << dest << ", partition=" << partition
<< ", segment=" << *j;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
ostringstream oss;
oss << "<=redistributing(copy): " << sourceName << ", oid=" << *i << ", db=" << source
<< ", part=" << partition << ", seg=" << *j << " to db=" << dest;
logMessage(oss.str(), __LINE__);
// add to set for remove after commit/abort
addToDirSet(sourceName, true);
addToDirSet(destName, false);
// Using boost::copy_file() instead of IDBFileSystem::copy-
// File() so we can capture/report any boost exception error
// msg that IDBFileSystem::copyFile() currently swallows.
try
{
boost::filesystem::copy_file(sourceName, destName);
}
#if BOOST_VERSION >= 105200
catch (boost::filesystem::filesystem_error& e)
#else
catch (boost::filesystem::basic_filesystem_error<filesystem::path>& e)
#endif
{
fErrorCode = RED_EC_COPY_FILE_FAIL;
ostringstream oss;
oss << "Failed to copy " << sourceName << " to " << destName << "; error is: " << e.what();
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
return fErrorCode;
}
}
} // segment
} // oid
} // !remote
return 0;
}
//------------------------------------------------------------------------------
// Construct a full path name based on the given oid, root, partition, and seg.
// The rootToPathMap is the map of dbroot to dbrootPath that we are using. We
// are using this function instead of the usual FileOp::oid2FileName() function,
// because that function only works with "local" DBRoots. In the case of
// an HDFS copy, we will be copying files from/to DBRoots that are not on the
// local PM.
//------------------------------------------------------------------------------
int RedistributeWorkerThread::buildFullHdfsPath(std::map<int, std::string>& rootToPathMap, int64_t colOid,
int16_t dbRoot, uint32_t partition, int16_t segment,
std::string& fullFileName)
{
std::map<int, std::string>::const_iterator iter = rootToPathMap.find(dbRoot);
if (iter == rootToPathMap.end())
{
ostringstream oss;
oss << "DBRoot" << dbRoot;
std::string dbRootPath = fConfig->getConfig("SystemConfig", oss.str());
if (dbRootPath.empty())
{
return 1;
}
rootToPathMap[dbRoot] = dbRootPath;
iter = rootToPathMap.find(dbRoot);
}
char tempFileName[WriteEngine::FILE_NAME_SIZE];
char dbDir[WriteEngine::MAX_DB_DIR_LEVEL][WriteEngine::MAX_DB_DIR_NAME_SIZE];
int rc = WriteEngine::Convertor::oid2FileName(colOid, tempFileName, dbDir, partition, segment);
if (rc != WriteEngine::NO_ERROR)
{
return 2;
}
ostringstream fullFileNameOss;
fullFileNameOss << iter->second << '/' << tempFileName;
fullFileName = fullFileNameOss.str();
return 0;
}
int RedistributeWorkerThread::connectToWes(int pmId)
{
int ret = 0;
ostringstream oss;
oss << "pm" << pmId << "_WriteEngineServer";
try
{
fMsgQueueClient.reset(new MessageQueueClient(oss.str(), fConfig));
}
catch (const std::exception& ex)
{
fErrorMsg = "Caught exception when connecting to " + oss.str() + " -- " + ex.what();
ret = 1;
}
catch (...)
{
fErrorMsg = "Caught exception when connecting to " + oss.str() + " -- unknown";
ret = 2;
}
return ret;
}
int RedistributeWorkerThread::updateDbrm()
{
int rc1 = BRM::ERR_OK;
int rc2 = BRM::ERR_OK;
boost::mutex::scoped_lock lock(fActionMutex);
// cannot stop after extent map is updated.
if (!fStopAction)
{
if (fUpdateHwmEntries.size() > 0)
rc1 = fDbrm->bulkSetHWM(fUpdateHwmEntries, 0);
if (rc1 == BRM::ERR_OK)
{
int rc2 = fDbrm->bulkUpdateDBRoot(fUpdateRtEntries);
if (rc2 == 0)
fCommitted = true;
else
fErrorCode = RED_EC_UPDATE_DBRM_FAIL;
}
// logging for debug
{
if (fUpdateHwmEntries.size() > 0)
{
ostringstream oss;
oss << "HWM_0 workaround, updateHWM(oid,part,seg,hwm)";
vector<BRM::BulkSetHWMArg>::iterator i = fUpdateHwmEntries.begin();
for (; i != fUpdateHwmEntries.end(); i++)
{
oss << ":(" << i->oid << "," << i->partNum << "," << i->segNum << "," << i->hwm << ")";
}
oss << ((rc1 == BRM::ERR_OK) ? " success" : " failed");
logMessage(oss.str(), __LINE__);
}
if (rc1 == BRM::ERR_OK)
{
ostringstream oss;
oss << "updateDBRoot(startLBID,dbRoot)";
vector<BRM::BulkUpdateDBRootArg>::iterator i = fUpdateRtEntries.begin();
for (; i != fUpdateRtEntries.end(); i++)
oss << ":(" << i->startLBID << "," << i->dbRoot << ")";
oss << ((rc2 == BRM::ERR_OK) ? " success" : " failed");
logMessage(oss.str(), __LINE__);
}
}
}
return ((rc1 == BRM::ERR_OK && rc2 == BRM::ERR_OK) ? 0 : -1);
}
bool RedistributeWorkerThread::checkDataTransferAck(SBS& sbs, size_t size)
{
if (sbs->length() == 0)
{
ostringstream oss;
oss << "Zero byte read, Network error.";
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
fErrorCode = RED_EC_NETWORK_FAIL;
}
else if (sbs->length() < (sizeof(RedistributeMsgHeader) + 1))
{
ostringstream oss;
oss << "Short message, length=" << sbs->length();
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
fErrorCode = RED_EC_WKR_MSG_SHORT;
}
else
{
// Need check header info
ByteStream::byte wesMsgId;
*sbs >> wesMsgId;
// const RedistributeMsgHeader* h = (const RedistributeMsgHeader*) sbs->buf();
sbs->advance(sizeof(RedistributeMsgHeader));
size_t ack;
*sbs >> ack;
if (ack != size)
{
ostringstream oss;
oss << "Acked size does not match request: " << ack << "/" << size;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
fErrorCode = RED_EC_SIZE_NACK;
}
}
sbs.reset();
return (fErrorCode == RED_EC_OK);
}
void RedistributeWorkerThread::confirmToPeer()
{
if (fTableLockId > 0)
{
bool rc = false;
try
{
rc = fDbrm->releaseTableLock(fTableLockId);
// use the interface, line# replaced with lock id.
logMessage("Releasing table lock... ", fTableLockId);
}
catch (const std::exception& ex)
{
// too bad, the talbe lock is messed up.
fErrorMsg = ex.what();
// use the interface, line# replaced with lock id.
logMessage("Release table exception: " + fErrorMsg, fTableLockId);
}
catch (...)
{
// use the interface, line# replaced with lock id.
logMessage("Release table lock unknown exception. ", fTableLockId);
}
if (rc == true)
{
// use the interface, line# replaced with lock id.
logMessage("Release table lock return true. ", fTableLockId);
fTableLockId = 0;
}
else
{
// let destructor try again.
// use the interface, line# replaced with lock id.
logMessage("Release table lock return false. ", fTableLockId);
}
}
IDBFileSystem& fs = (IDBPolicy::useHdfs() ? IDBFileSystem::getFs(IDBDataFile::HDFS)
: IDBPolicy::useCloud() ? IDBFileSystem::getFs(IDBDataFile::CLOUD)
: IDBFileSystem::getFs(IDBDataFile::BUFFERED));
uint32_t confirmCode = RED_DATA_COMMIT;
if (fErrorCode != RED_EC_OK || fStopAction == true) // fCommitted must be false
confirmCode = RED_DATA_ABORT;
if (fMyId.second != fPeerId.second)
{
if (fMsgQueueClient.get() != NULL)
{
ByteStream bs;
RedistributeMsgHeader header(fPeerId.first, fMyId.first, -1, confirmCode);
bs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE;
bs.append((const ByteStream::byte*)&header, sizeof(header));
fMsgQueueClient->write(bs);
// not going to retry for now, ignore the ack and close the connection.
fMsgQueueClient->read();
fMsgQueueClient.reset();
}
}
else if (confirmCode != RED_DATA_COMMIT)
{
for (set<string>::iterator i = fNewDirSet.begin(); i != fNewDirSet.end(); i++)
{
fs.remove(i->c_str()); // ignoring return code
}
}
// new files committed, remove old ones.
if (confirmCode == RED_DATA_COMMIT)
{
for (set<string>::iterator i = fOldDirSet.begin(); i != fOldDirSet.end(); i++)
{
fs.remove(i->c_str()); // ignoring return code
}
}
fNewDirSet.clear();
fOldDirSet.clear();
}
void RedistributeWorkerThread::addToDirSet(const char* fileName, bool isSource)
{
string path(fileName);
size_t found = path.find_last_of("/\\");
path = path.substr(0, found);
if (isSource)
fOldDirSet.insert(path);
else
fNewDirSet.insert(path);
}
void RedistributeWorkerThread::handleStop()
{
boost::mutex::scoped_lock lock(fActionMutex);
// cannot stop after extent map is updated.
if (!fCommitted)
fStopAction = true;
lock.unlock();
logMessage("User stop", __LINE__);
sendResponse(RED_ACTN_STOP);
}
void RedistributeWorkerThread::sendResponse(uint32_t type)
{
uint32_t tmp = fMsgHeader.destination;
fMsgHeader.destination = fMsgHeader.source;
fMsgHeader.source = tmp;
fMsgHeader.messageId = RED_ACTN_RESP;
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
if (type == RED_ACTN_REQUEST)
{
if (fErrorCode == RED_EC_OK && fStopAction == false)
fPlanEntry.status = RED_TRANS_SUCCESS;
else if (fErrorCode == RED_EC_PART_EXIST_ON_TARGET)
fPlanEntry.status = RED_TRANS_SKIPPED;
else if (fErrorCode != RED_EC_OK)
fPlanEntry.status = RED_TRANS_FAILED;
// else -- stopped, may try again if support resume
fBs.append((const ByteStream::byte*)&fPlanEntry, sizeof(fPlanEntry));
}
fIOSocket.write(fBs);
}
void RedistributeWorkerThread::handleData()
{
bool done = false;
bool noExcept = true;
SBS sbs;
size_t size = 0;
try
{
do
{
switch (fMsgHeader.messageId)
{
case RED_DATA_INIT: handleDataInit(); break;
case RED_DATA_START: handleDataStart(sbs, size); break;
case RED_DATA_CONT: handleDataCont(sbs, size); break;
case RED_DATA_FINISH: handleDataFinish(sbs, size); break;
case RED_DATA_COMMIT:
handleDataCommit(sbs, size);
done = true;
break;
case RED_DATA_ABORT:
handleDataAbort(sbs, size);
done = true;
break;
default:
handleUnknowDataMsg();
done = true;
break;
}
if (!done)
{
// get next message
sbs = fIOSocket.read();
ByteStream::byte wesMsgId;
*sbs >> wesMsgId;
memcpy(&fMsgHeader, sbs->buf(), sizeof(RedistributeMsgHeader));
sbs->advance(sizeof(RedistributeMsgHeader));
}
} while (!done); // will break after commit/abort or catch an exception
}
catch (const std::exception& ex)
{
noExcept = false;
logMessage(ex.what(), __LINE__);
}
catch (...)
{
noExcept = false;
}
if (noExcept == false)
{
// send NACK to peer
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
fBs << ((size_t)-1);
fIOSocket.write(fBs);
}
fBs.reset();
fIOSocket.close();
}
void RedistributeWorkerThread::handleDataInit()
{
uint32_t tmp = fMsgHeader.destination;
fMsgHeader.destination = fMsgHeader.source;
fMsgHeader.source = tmp;
fMsgHeader.messageId = RED_DATA_ACK;
size_t size = 0;
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
fBs << size;
// finish the hand shaking
fIOSocket.write(fBs);
}
void RedistributeWorkerThread::handleDataStart(SBS& sbs, size_t& size)
{
char fileName[WriteEngine::FILE_NAME_SIZE];
try
{
// extract the control data for the segment file
RedistributeDataControl dc;
if (sbs->length() >= sizeof(RedistributeDataControl))
{
memcpy(&dc, sbs->buf(), sizeof(RedistributeDataControl));
sbs->advance(sizeof(RedistributeDataControl));
size = dc.size;
}
else
{
ostringstream oss;
oss << "Short message, length=" << sbs->length();
fErrorMsg = oss.str();
fErrorCode = RED_EC_WKR_MSG_SHORT;
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
// create and open the file for writing.
WriteEngine::FileOp fileOp; // just to get filename, not for file operations
int rc = fileOp.oid2FileName(dc.oid, fileName, true, dc.dbroot, dc.partition, dc.segment);
if (rc == WriteEngine::NO_ERROR)
{
ostringstream oss;
oss << "=>redistributing: " << fileName << ", oid=" << dc.oid << ", db=" << dc.dbroot
<< ", part=" << dc.partition << ", seg=" << dc.segment << " from db=" << fMsgHeader.source;
logMessage(oss.str(), __LINE__);
}
else
{
fErrorCode = RED_EC_OID_TO_FILENAME;
ostringstream oss;
oss << "Failed to get file name: oid=" << dc.oid << ", dbroot=" << dc.dbroot
<< ", partition=" << dc.partition << ", segment=" << dc.segment;
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
if (fNewFilePtr != NULL)
closeFile(fNewFilePtr);
errno = 0;
fNewFilePtr = fopen(fileName, "wb");
if (fNewFilePtr != NULL)
{
ostringstream oss;
oss << "open " << fileName << ", oid=" << dc.oid << ", dbroot=" << dc.dbroot
<< ", partition=" << dc.partition << ", segment=" << dc.segment << ". " << fNewFilePtr;
logMessage(oss.str(), __LINE__);
}
else
{
int e = errno;
fErrorCode = RED_EC_OPEN_FILE_FAIL;
ostringstream oss;
oss << "Failed to open " << fileName << ", oid=" << dc.oid << ", dbroot=" << dc.dbroot
<< ", partition=" << dc.partition << ", segment=" << dc.segment << ". " << strerror(e) << " (" << e
<< ")";
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
// set output buffering
errno = 0;
if (setvbuf(fNewFilePtr, fWriteBuffer.get(), _IOFBF, CHUNK_SIZE))
{
int e = errno;
ostringstream oss;
oss << "Failed to set i/o buffer: " << strerror(e) << " (" << e << ")";
fErrorMsg = oss.str();
logMessage(fErrorMsg, __LINE__);
// not throwing an exception now.
}
// add to set for remove after abort
addToDirSet(fileName, false);
// do a fseek will show the right size, but will not actually allocate the continuous block.
// do write 4k block till file size.
char buf[PRE_ALLOC_SIZE] = {1};
size_t nmemb = size / PRE_ALLOC_SIZE;
while (nmemb-- > 0)
{
errno = 0;
size_t n = fwrite(buf, PRE_ALLOC_SIZE, 1, fNewFilePtr);
if (n != 1)
{
int e = errno;
ostringstream oss;
oss << "Fail to preallocate file: " << strerror(e) << " (" << e << ")";
fErrorMsg = oss.str();
fErrorCode = RED_EC_FWRITE_FAIL;
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
}
// move back to beging to write real data
fflush(fNewFilePtr);
rewind(fNewFilePtr);
}
catch (const std::exception& ex)
{
// NACK
size = -1;
logMessage(ex.what(), __LINE__);
}
catch (...)
{
// NACK
size = -1;
}
// ack file size
fMsgHeader.messageId = RED_DATA_ACK;
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
fBs << size;
fIOSocket.write(fBs);
// reset to count the data received
size = 0;
sbs.reset();
}
void RedistributeWorkerThread::handleDataCont(SBS& sbs, size_t& size)
{
size_t ack = 0;
try
{
size_t bytesRcvd = 0;
*sbs >> bytesRcvd;
if (bytesRcvd != sbs->length())
{
ostringstream oss;
oss << "Incorrect data length: " << sbs->length() << ", expecting " << bytesRcvd;
fErrorMsg = oss.str();
fErrorCode = RED_EC_BS_TOO_SHORT;
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
errno = 0;
size_t n = fwrite(sbs->buf(), 1, bytesRcvd, fNewFilePtr);
if (n != bytesRcvd)
{
int e = errno;
ostringstream oss;
oss << "Fail to write file: " << strerror(e) << " (" << e << ")";
fErrorMsg = oss.str();
fErrorCode = RED_EC_FWRITE_FAIL;
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
ack = bytesRcvd;
size += ack;
}
catch (const std::exception&)
{
// NACK
size = -1;
}
catch (...)
{
// NACK
ack = -1;
}
// ack received data
sbs.reset();
fMsgHeader.messageId = RED_DATA_ACK;
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
fBs << ack;
fIOSocket.write(fBs);
}
void RedistributeWorkerThread::handleDataFinish(SBS& sbs, size_t& size)
{
size_t ack = 0;
// close open file
closeFile(fNewFilePtr);
fNewFilePtr = NULL;
try
{
size_t fileSize = 0;
*sbs >> fileSize;
if (fileSize != size)
{
ostringstream oss;
oss << "File size not match: local=" << size << ", remote=" << fileSize;
fErrorMsg = oss.str();
fErrorCode = RED_EC_FILE_SIZE_NOT_MATCH;
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
ack = size;
}
catch (const std::exception&)
{
// NACK
size = -1;
}
catch (...)
{
// NACK
ack = -1;
}
// ack received data
sbs.reset();
fMsgHeader.messageId = RED_DATA_ACK;
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
fBs << ack;
fIOSocket.write(fBs);
}
void RedistributeWorkerThread::handleDataCommit(SBS& sbs, size_t& size)
{
size_t ack = 0;
sbs.reset();
fMsgHeader.messageId = RED_DATA_ACK;
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
fBs << ack;
fIOSocket.write(fBs);
}
void RedistributeWorkerThread::handleDataAbort(SBS& sbs, size_t& size)
{
// close open file
if (fNewFilePtr != NULL)
closeFile(fNewFilePtr);
IDBFileSystem& fs = (IDBPolicy::useHdfs() ? IDBFileSystem::getFs(IDBDataFile::HDFS)
: IDBPolicy::useCloud() ? IDBFileSystem::getFs(IDBDataFile::CLOUD)
: IDBFileSystem::getFs(IDBDataFile::BUFFERED));
// remove local files
for (set<string>::iterator i = fNewDirSet.begin(); i != fNewDirSet.end(); i++)
{
fs.remove(i->c_str()); // ignoring return code
}
// send ack
sbs.reset();
size_t ack = 0;
fMsgHeader.messageId = RED_DATA_ACK;
fBs.restart();
fBs << (ByteStream::byte)WriteEngine::WE_SVR_REDISTRIBUTE; // dummy, keep for now.
fBs.append((const ByteStream::byte*)&fMsgHeader, sizeof(fMsgHeader));
fBs << ack;
fIOSocket.write(fBs);
}
void RedistributeWorkerThread::handleUnknowDataMsg()
{
ostringstream oss;
oss << "Unknown data message: " << fMsgHeader.messageId;
fErrorMsg = oss.str();
fErrorCode = RED_EC_UNKNOWN_DATA_MSG;
logMessage(fErrorMsg, __LINE__);
throw runtime_error(fErrorMsg);
}
void RedistributeWorkerThread::handleUnknowJobMsg()
{
ostringstream oss;
oss << "Unknown job message: " << fMsgHeader.messageId;
fErrorMsg = oss.str();
fErrorCode = RED_EC_UNKNOWN_JOB_MSG;
logMessage(fErrorMsg, __LINE__);
// protocol error, ignore and close connection.
}
void RedistributeWorkerThread::closeFile(FILE* f)
{
if (f == NULL)
return;
ostringstream oss;
oss << "close file* " << f << " ";
errno = 0;
int rc = fclose(f);
if (rc != 0)
oss << "error: " << strerror(errno) << " (" << errno << ")";
else
oss << "OK";
logMessage(oss.str(), __LINE__);
}
void RedistributeWorkerThread::logMessage(const string& msg, int line)
{
ostringstream oss;
oss << msg << " @workerThread:" << line;
RedistributeControl::instance()->logMessage(oss.str());
}
} // namespace redistribute