1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-06-06 08:21:01 +03:00
2020-05-26 12:42:11 -05:00

1679 lines
58 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
Copyright (C) 2019 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/**********************************************************************
* $Id: main.cpp 1000 2013-07-24 21:05:51Z pleblanc $
*
*
***********************************************************************/
/**
* @brief execution plan manager main program
*
* This is the "main" program for dealing with execution plans and
* result sets. It sits in a loop waiting for CalpontExecutionPlan
* from the FEP. It then passes the CalpontExecutionPlan to the
* JobListFactory from which a JobList is obtained. This is passed
* to the Query Manager running in the real-time portion of the
* EC. The ExecutionPlanManager waits until the Query Manager
* returns a result set for the job list. These results are passed
* into the CalpontResultFactory, which outputs a CalpontResultSet.
* The ExecutionPlanManager passes the CalpontResultSet into the
* VendorResultFactory which produces a result set tailored to the
* specific DBMS front end in use. The ExecutionPlanManager then
* sends the VendorResultSet back to the Calpont Database Connector
* on the Front-End Processor where it is returned to the DBMS
* front-end.
*/
#include <mariadb.h>
#undef set_bits // mariadb.h defines set_bits, which is incompatible with boost
#include <my_sys.h>
#include <iostream>
#include <cstdint>
#include <csignal>
#include <sys/resource.h>
#undef root_name
#include <boost/filesystem.hpp>
#include "calpontselectexecutionplan.h"
#include "activestatementcounter.h"
#include "distributedenginecomm.h"
#include "resourcemanager.h"
#include "configcpp.h"
#include "queryteleserverparms.h"
#include "iosocket.h"
#include "joblist.h"
#include "joblistfactory.h"
#include "oamcache.h"
#include "simplecolumn.h"
#include "bytestream.h"
#include "telestats.h"
#include "messageobj.h"
#include "messagelog.h"
#include "sqllogger.h"
#include "femsghandler.h"
#include "idberrorinfo.h"
#include "MonitorProcMem.h"
#include "liboamcpp.h"
#include "crashtrace.h"
#include "utils_utf8.h"
#include "mcsconfig.h"
#include <mutex>
#include <thread>
#include <condition_variable>
#include "dbrm.h"
namespace
{
// Mask of trace flags that ask for output to the screen: every bit except the
// two table-mode flags (TRACE_TUPLE_AUTOSWITCH and TRACE_TUPLE_OFF).
const uint32_t flagsWantOutput = (0xffffffff &
~execplan::CalpontSelectExecutionPlan::TRACE_TUPLE_AUTOSWITCH &
~execplan::CalpontSelectExecutionPlan::TRACE_TUPLE_OFF);
// Debug verbosity. SessionThread::operator() prints "###" diagnostics to stdout
// when this is non-zero; values above 1 also print them for internal queries.
int gDebug;
// Message ids handed to the logger for the various events reported below.
const unsigned logDefaultMsg = logging::M0000;
const unsigned logDbProfStartStatement = logging::M0028;
const unsigned logDbProfEndStatement = logging::M0029;
const unsigned logStartSql = logging::M0041;
const unsigned logEndSql = logging::M0042;
const unsigned logRssTooBig = logging::M0044;
const unsigned logDbProfQueryStats = logging::M0047;
const unsigned logExeMgrExcpt = logging::M0055;
// Logger shared by the session threads and the RSS monitor (constructed with id 16).
logging::Logger msgLog(16);
// Maps a session id to the highest memory percentage observed while its query
// runs. Entries are created/zeroed by initMaxMemPct(), raised by RssMonFcn and
// removed by deleteMaxMemPct(); every access is guarded by sessionMemMapMutex.
typedef std::map<uint32_t, size_t> SessionMemMap_t;
SessionMemMap_t sessionMemMap; // track memory% usage during a query
std::mutex sessionMemMapMutex;
//...The FrontEnd may establish more than 1 connection (which results in
// more than 1 ExeMgr thread) per session. These threads will share
// the same CalpontSystemCatalog object for that session. Here, we
// define a std::map to track how many threads are sharing each session, so
// that we know when we can safely delete a CalpontSystemCatalog object
// shared by multiple threads per session.
// All access to the map is guarded by threadCntPerSessionMapMutex.
typedef std::map<uint32_t, uint32_t> ThreadCntPerSessionMap_t;
ThreadCntPerSessionMap_t threadCntPerSessionMap;
std::mutex threadCntPerSessionMapMutex;
//This var is only accessed using thread-safe inc/dec calls
ActiveStatementCounter* statementsRunningCount;
// Engine communication object; handed to job lists by the session threads and
// re-initialised via Setup() by the SIGHUP handler when it is non-null.
joblist::DistributedEngineComm* ec;
// Process-wide resource manager instance (obtained with instance(true)).
auto rm = joblist::ResourceManager::instance(true);
// Convert a configuration text value to an int using
// config::Config::fromText(). An empty string yields -1.
int toInt(const std::string& val)
{
    return val.empty() ? -1 : static_cast<int>(config::Config::fromText(val));
}
// Name string "ExeMgr1"; it is not referenced in the portion of the file shown
// here. NOTE(review): presumably the configuration section name for this
// process -- confirm against its users.
const std::string ExeMgr("ExeMgr1");
/**
 * @brief Format newline-separated, space-separated "mini info" text as an
 *        aligned table with a fixed header row.
 *
 * The input is split into lines on '\n' (empty lines are dropped) and each
 * line into at most 10 space-separated fields (empty fields are dropped).
 * A header row is prepended, every column is left-justified and padded to the
 * width of its longest value, and each field is followed by a single space.
 * The result starts with an empty line and every row ends with '\n'.
 *
 * For data rows the last three characters of the 9th field ("Elapsed") are
 * stripped. A field shorter than three characters becomes empty; the previous
 * unsigned subtraction wrapped around and asked resize() for a huge length.
 *
 * Rows with fewer than 10 fields are printed with the fields they have. This
 * used to be enforced only by assert(), so release builds already behaved this
 * way while debug builds aborted the process.
 *
 * @param in  raw mini-info text
 * @return the formatted table
 */
const std::string prettyPrintMiniInfo(const std::string& in)
{
    const std::string header = "Desc Mode Table TableOID ReferencedColumns PIO LIO PBE Elapsed Rows";
    const size_t headerParts = 10;
    const size_t elapsedColumn = 8;
    const size_t elapsedSuffixLen = 3;

    // Split text on any character in delims, dropping empty tokens and
    // stopping once maxTokens tokens have been collected.
    auto split = [](const std::string& text, const char* delims, size_t maxTokens)
        -> std::vector<std::string>
    {
        std::vector<std::string> tokens;
        std::string::size_type begin = text.find_first_not_of(delims);

        while (begin != std::string::npos && tokens.size() < maxTokens)
        {
            const std::string::size_type end = text.find_first_of(delims, begin);
            tokens.push_back(text.substr(begin,
                                         end == std::string::npos ? std::string::npos : end - begin));
            begin = (end == std::string::npos) ? end : text.find_first_not_of(delims, end);
        }

        return tokens;
    };

    // Row 0 is always the header; the input lines follow.
    std::vector<std::string> lines;
    lines.push_back(header);
    const std::vector<std::string> inputLines = split(in, "\n", std::string::npos);
    lines.insert(lines.end(), inputLines.begin(), inputLines.end());

    // Tokenise every row and track the widest value seen in each column.
    std::vector<size_t> widths(headerParts, 0);
    std::vector<std::vector<std::string> > rows;
    rows.reserve(lines.size());

    for (size_t row = 0; row < lines.size(); ++row)
    {
        std::vector<std::string> parts = split(lines[row], " ", headerParts);

        for (size_t col = 0; col < parts.size(); ++col)
        {
            std::string& part = parts[col];

            // Strip the 3-character suffix of the Elapsed value, clamped so a
            // short value cannot underflow.
            if (row != 0 && col == elapsedColumn)
                part.resize(part.size() > elapsedSuffixLen ? part.size() - elapsedSuffixLen : 0);

            if (part.size() > widths[col])
                widths[col] = part.size();
        }

        rows.push_back(parts);
    }

    std::ostringstream oss;
    oss << "\n";

    for (const std::vector<std::string>& parts : rows)
    {
        for (size_t col = 0; col < parts.size(); ++col)
            oss << std::setw(static_cast<int>(widths[col])) << std::left << parts[col] << " ";

        oss << "\n";
    }

    return oss.str();
}
// Return the current local time in asctime() format, without the trailing
// newline that asctime appends.
const std::string timeNow()
{
    char stamp[32]; // ctime(3) says at least 26
    struct tm brokenDown;
    const time_t now = time(0);
#ifdef _MSC_VER
    asctime_s(stamp, 32, localtime_r(&now, &brokenDown));
#else
    asctime_r(localtime_r(&now, &brokenDown), stamp);
#endif
    // Index of the last character (0 for an empty string).
    const size_t stampLen = strlen(stamp);
    const size_t last = (stampLen > 0) ? stampLen - 1 : 0;

    if (stamp[last] == '\n')
        stamp[last] = 0;

    return stamp;
}
// Query telemetry server parameters; every SessionThread constructs its
// QueryTeleClient from this object.
querytele::QueryTeleServerParms gTeleServerParms;
class SessionThread
{
public:
// Build a session handler for one front-end connection.
//   ios - the connection's socket (copied into fIos)
//   ec  - engine communication object later given to each job list
//   rm  - resource manager used when building job lists
// The telemetry client is constructed from the global gTeleServerParms and the
// OAM cache pointer is obtained from oam::OamCache::makeOamCache().
SessionThread(const messageqcpp::IOSocket& ios, joblist::DistributedEngineComm* ec, joblist::ResourceManager* rm) :
fIos(ios), fEc(ec),
fRm(rm),
fStatsRetrieved(false),
fTeleClient(gTeleServerParms),
fOamCachePtr(oam::OamCache::makeOamCache())
{
}
private:
// Socket to the front end; requests are read from it and results written to it.
messageqcpp::IOSocket fIos;
// Engine communication object passed to JobList::putEngineComm().
joblist::DistributedEngineComm* fEc;
// Resource manager passed to JobListFactory::makeJobList() and updated by setRMParms().
joblist::ResourceManager* fRm;
// Statistics of the query currently being processed.
querystats::QueryStats fStats;
// True once fStats has been filled in from the job list for the current query
// (see formatQueryStats()); cleared by initStats().
bool fStatsRetrieved;
// Client used to post query start/summary telemetry.
querytele::QueryTeleClient fTeleClient;
// Source of the system and module names reported in telemetry.
oam::OamCache* fOamCachePtr; //this ptr is copyable...
//...Prepare the statistics for a query that is about to start: zero the
//...session's tracked memory percentage, reset fStats, stamp the start time
//...and record the session id and SQL text.
void initStats ( uint32_t sessionId, std::string& sqlText )
{
    initMaxMemPct ( sessionId );

    fStats.reset();
    fStats.setStartTime();
    fStats.fQuery = sqlText;
    fStats.fSessionID = sessionId;

    // Force formatQueryStats() to pull fresh numbers from the job list.
    fStatsRetrieved = false;
}
//...Get % memory usage during latest query for sesssionId.
//...SessionId >= 0x80000000 is system catalog query we can ignore.
static uint64_t getMaxMemPct ( uint32_t sessionId )
{
uint64_t maxMemoryPct = 0;
if ( sessionId < 0x80000000 )
{
std::lock_guard<std::mutex> lk(sessionMemMapMutex);
SessionMemMap_t::iterator mapIter =
sessionMemMap.find( sessionId );
if ( mapIter != sessionMemMap.end() )
{
maxMemoryPct = (uint64_t)mapIter->second;
}
}
return maxMemoryPct;
}
//...Remove the sessionMemMap entry (if any) holding the session's memory % use.
//...SessionId >= 0x80000000 is a system catalog query and is ignored.
static void deleteMaxMemPct ( uint32_t sessionId )
{
    if ( sessionId >= 0x80000000 )
        return;

    std::lock_guard<std::mutex> guard(sessionMemMapMutex);
    // Erasing by key does nothing when the key is absent.
    sessionMemMap.erase( sessionId );
}
//...Build the one- or two-line query statistics string.
//...On the first call for a query the statistics are fetched from the job
//...list (querySummary()/queryStats()) and cached in fStats together with the
//...session's max memory % and the returned row count; later calls for the
//...same query reuse the cached values.
//...  jl                - job list of the query
//...  label             - text placed in front of the statistics
//...  includeNewLine    - true: break the output into two lines
//...  vtableModeOn      - true reports Mode-Distributed, false Mode-Standard
//...  wantExtendedStats - passed to querySummary(); also triggers a short
//...                      sleep first so another thread can write its data
//...  rowsReturned      - row count stored in fStats.fRows
const std::string formatQueryStats (
    joblist::SJLP& jl,
    const std::string& label,
    bool includeNewLine,
    bool vtableModeOn,
    bool wantExtendedStats,
    uint64_t rowsReturned)
{
    if ( !fStatsRetrieved )
    {
        if (wantExtendedStats)
        {
            //wait for the ei data to be written by another thread
            struct timespec req = { 0, 250000 }; //250 usec
#ifdef _MSC_VER
            Sleep(20); //20ms on Windows
#else
            nanosleep(&req, 0);
#endif
        }

        jl->querySummary( wantExtendedStats );
        fStats = jl->queryStats();
        fStats.fMaxMemPct = getMaxMemPct( fStats.fSessionID );
        fStats.fRows = rowsReturned;
        fStatsRetrieved = true;
    }

    const std::string queryMode(vtableModeOn ? "Distributed" : "Standard");

    std::ostringstream os;
    os << label << ": MaxMemPct-" << fStats.fMaxMemPct;
    os << "; NumTempFiles-" << fStats.fNumFiles;
    os << "; TempFileSpace-" << roundBytes(fStats.fFileBytes);
    os << "; ApproxPhyI/O-" << fStats.fPhyIO;
    os << "; CacheI/O-" << fStats.fCacheIO;
    os << "; BlocksTouched-" << fStats.fMsgRcvCnt;

    if ( !includeNewLine )
        os << "; "; // continue on the same line
    else
        os << std::endl << " "; // start a second line

    os << "PartitionBlocksEliminated-" << fStats.fCPBlocksSkipped;
    os << "; MsgBytesIn-" << roundBytes(fStats.fMsgBytesIn);
    os << "; MsgBytesOut-" << roundBytes(fStats.fMsgBytesOut);
    os << "; Mode-" << queryMode;

    return os.str();
}
//...Add one to the count of threads using sessionId; a session seen for the
//...first time starts at 1.
static void incThreadCntPerSession(uint32_t sessionId)
{
    std::lock_guard<std::mutex> guard(threadCntPerSessionMapMutex);
    // operator[] creates a missing entry with a count of 0 before the increment.
    ++threadCntPerSessionMap[sessionId];
}
//...Decrement the number of threads using the specified sessionId.
//...When the thread count for a sessionId reaches 0, the corresponding
//...CalpontSystemCatalog objects are deleted.
//...The user query and its associated catalog query have a different
//...session Id where the highest bit is flipped.
//...The object with id(sessionId | 0x80000000) cannot be removed before
//...user query session completes because the sysdata may be used for
//...debugging/stats purpose, such as result graph, etc.
static void decThreadCntPerSession(uint32_t sessionId)
{
std::lock_guard<std::mutex> lk(threadCntPerSessionMapMutex);
ThreadCntPerSessionMap_t::iterator mapIter =
threadCntPerSessionMap.find(sessionId);
if (mapIter != threadCntPerSessionMap.end())
{
if (--mapIter->second == 0)
{
threadCntPerSessionMap.erase(mapIter);
execplan::CalpontSystemCatalog::removeCalpontSystemCatalog(sessionId);
execplan::CalpontSystemCatalog::removeCalpontSystemCatalog((sessionId ^ 0x80000000));
}
}
}
//...Set the sessionMemMap entry for the session to 0 memory %, creating the
//...entry when it does not exist yet.
//...SessionId >= 0x80000000 is a system catalog query and is ignored.
static void initMaxMemPct ( uint32_t sessionId )
{
    if ( sessionId >= 0x80000000 )
        return;

    std::lock_guard<std::mutex> guard(sessionMemMapMutex);
    sessionMemMap[sessionId] = 0;
}
//... Convert a byte count to a short string with a B/KB/MB/GB/TB suffix.
//... The value is divided by 1024 while it is greater than 1024 (so exactly
//... 1024 stays in the current unit) and at most four times. The result is
//... rounded up when bit 9 of the value before the final division is set,
//... i.e. when the discarded remainder is at least 512.
const std::string roundBytes(uint64_t value) const
{
    static const char* const suffixes[] = {"B", "KB", "MB", "GB", "TB"};

    uint64_t scaled = value;
    uint64_t unitIdx = 0;
    bool roundUp = false;

    for (; unitIdx < 4 && scaled > 1024; ++unitIdx)
    {
        roundUp = (scaled & 512) != 0;
        scaled /= 1024;
    }

    std::ostringstream out;
    out << (roundUp ? scaled + 1 : scaled) << suffixes[unitIdx];
    return out.str();
}
//...Convert a byte count to MB (1024*1024 bytes), rounding up when bit 19
//...(the half-MB bit, 524288) of the value is set.
uint64_t roundMB ( uint64_t value ) const
{
    const uint64_t wholeMB = value >> 20;
    const bool halfOrMore = (value & (1ULL << 19)) != 0;
    return halfOrMore ? wholeMB + 1 : wholeMB;
}
// Apply the resource-manager parameters carried by an execution plan.
// PMSMALLSIDEMEMORY and UMSMALLSIDEMEMORY entries are forwarded to the
// resource manager's per-session hash-join small-side maps; any other
// parameter id is ignored.
void setRMParms ( const execplan::CalpontSelectExecutionPlan::RMParmVec& parms )
{
    for (const auto& parm : parms)
    {
        if (parm.id == execplan::PMSMALLSIDEMEMORY)
            fRm->addHJPmMaxSmallSideMap(parm.sessionId, parm.value);
        else if (parm.id == execplan::UMSMALLSIDEMEMORY)
            fRm->addHJUmMaxSmallSideMap(parm.sessionId, parm.value);
    }
}
// Warm the system catalog's schema cache for every schema referenced by a plan.
//   csep - the execution plan to scan
//   csc  - system catalog of the session
// getSchemaInfo() is called for the schema of each SimpleColumn in the plan's
// column map, then the same is done recursively for each derived table.
// Derived-table entries that are not CalpontSelectExecutionPlan objects are
// skipped; previously the result of the dynamic_cast was dereferenced without
// a check, which is undefined behaviour when the cast fails.
void buildSysCache(const execplan::CalpontSelectExecutionPlan& csep, boost::shared_ptr<execplan::CalpontSystemCatalog> csc)
{
    const execplan::CalpontSelectExecutionPlan::ColumnMap& colMap = csep.columnMap();

    for (execplan::CalpontSelectExecutionPlan::ColumnMap::const_iterator it = colMap.begin();
         it != colMap.end(); ++it)
    {
        const auto sc = dynamic_cast<execplan::SimpleColumn*>((it->second).get());

        if (!sc)
            continue;

        const std::string schemaName = sc->schemaName();

        // only the first time a schema is got will actually query
        // system catalog. System catalog keeps a schema name std::map.
        // if a schema exists, the call getSchemaInfo returns without
        // doing anything.
        if (!schemaName.empty())
            csc->getSchemaInfo(schemaName);
    }

    execplan::CalpontSelectExecutionPlan::SelectList::const_iterator subIt;

    for (subIt = csep.derivedTableList().begin(); subIt != csep.derivedTableList().end(); ++subIt)
    {
        const auto subPlan = dynamic_cast<execplan::CalpontSelectExecutionPlan*>(subIt->get());

        if (subPlan)
            buildSysCache(*subPlan, csc);
    }
}
public:
/**
 * @brief Serve one front-end connection until it is closed.
 *
 * Outer loop (one iteration per statement):
 *   - hand the previous job list to a detached background thread for
 *     destruction, so that the session does not wait for it;
 *   - read a message: length 0 means the client closed the connection, a
 *     length below 4 is not a plan, a length of exactly 4 is a command word
 *     (4 = the client wants tuples and the plan follows, 5 = report the
 *     running/waiting statement counts and close, anything else closes);
 *   - unserialize the execution plan, post start telemetry, prime the schema
 *     cache, build the job list and start the query.
 *
 * Inner loop (one iteration per table request): command word 0 ends the
 * query, 1 switches to "swallow rows" mode, 2 asks for any remaining table,
 * 3 asks for the statistics string, 4 announces a new plan, and any other
 * value is taken as the table OID to project. A message longer than 4 bytes
 * is a new plan (self join). Bands of the selected table are projected and
 * written until an empty band is produced.
 *
 * Afterwards statistics are logged/printed, summary telemetry is posted and
 * the loop starts over. Exceptions end the session: the thread count and
 * statement count are released, the error is logged and the socket closed.
 *
 * Lifetime rules for the background destructor threads:
 *   - they reference jlMutex, jlCleanupDone and destructing, which live in
 *     this function's frame, so every exit path waits until destructing is 0
 *     (the early return in the row-projection error path used to skip that);
 *   - the end-of-statement thread gets its own copy of the LoggingID, because
 *     the original object is scoped to one loop iteration and may be gone by
 *     the time the thread logs (it used to be captured by reference).
 */
void operator()()
{
    messageqcpp::ByteStream bs, inbs;
    execplan::CalpontSelectExecutionPlan csep;
    csep.sessionID(0);
    joblist::SJLP jl;
    bool incSessionThreadCnt = true;
    std::mutex jlMutex;
    std::condition_variable jlCleanupDone;
    int destructing = 0;   // number of background joblist destructions in flight; guarded by jlMutex
    bool selfJoin = false;
    bool tryTuples = false;
    bool usingTuples = false;
    bool stmtCounted = false;

    // Block until every background joblist destruction has finished. Must be
    // called (without holding jlMutex) before leaving this function.
    auto waitForJobListCleanup = [&jlMutex, &jlCleanupDone, &destructing]
    {
        std::unique_lock<std::mutex> scoped(jlMutex);

        while (destructing > 0)
            jlCleanupDone.wait(scoped);
    };

    try
    {
        for (;;)
        {
            selfJoin = false;
            tryTuples = false;
            usingTuples = false;

            if (jl)
            {
                // puts the real destruction in another thread to avoid
                // making the whole session wait. It can take several seconds.
                std::unique_lock<std::mutex> scoped(jlMutex);
                destructing++;
                std::thread bgdtor([jl, &jlMutex, &jlCleanupDone, &destructing] {
                    std::unique_lock<std::mutex> scoped(jlMutex);
                    const_cast<joblist::SJLP &>(jl).reset(); // this happens second; does real destruction
                    if (--destructing == 0)
                        jlCleanupDone.notify_one();
                });
                jl.reset(); // this runs first
                bgdtor.detach();
            }

            bs = fIos.read();

            if (bs.length() == 0)
            {
                if (gDebug > 1 || (gDebug && !csep.isInternal()))
                    std::cout << "### Got a close(1) for session id " << csep.sessionID() << std::endl;

                // connection closed by client
                fIos.close();
                break;
            }
            else if (bs.length() < 4) //Not a CalpontSelectExecutionPlan
            {
                if (gDebug)
                    std::cout << "### Got a not-a-plan for session id " << csep.sessionID()
                              << " with length " << bs.length() << std::endl;

                fIos.close();
                break;
            }
            else if (bs.length() == 4) //possible tuple flag
            {
                messageqcpp::ByteStream::quadbyte qb;
                bs >> qb;

                if (qb == 4) //UM wants new tuple i/f
                {
                    if (gDebug)
                        std::cout << "### UM wants tuples" << std::endl;

                    tryTuples = true;
                    // now wait for the CSEP...
                    bs = fIos.read();
                }
                else if (qb == 5) //somebody wants stats
                {
                    bs.restart();
                    qb = statementsRunningCount->cur();
                    bs << qb;
                    qb = statementsRunningCount->waiting();
                    bs << qb;
                    fIos.write(bs);
                    fIos.close();
                    break;
                }
                else
                {
                    if (gDebug)
                        std::cout << "### Got a not-a-plan value " << qb << std::endl;

                    fIos.close();
                    break;
                }
            }

new_plan:
            csep.unserialize(bs);
            querytele::QueryTeleStats qts;

            if ( !csep.isInternal() &&
                    (csep.queryType() == "SELECT" || csep.queryType() == "INSERT_SELECT") )
            {
                qts.query_uuid = csep.uuid();
                qts.msg_type = querytele::QueryTeleStats::QT_START;
                qts.start_time = querytele::QueryTeleClient::timeNowms();
                qts.query = csep.data();
                qts.session_id = csep.sessionID();
                qts.query_type = csep.queryType();
                qts.system_name = fOamCachePtr->getSystemName();
                qts.module_name = fOamCachePtr->getModuleName();
                qts.local_query = csep.localQuery();
                qts.schema_name = csep.schemaName();
                fTeleClient.postQueryTele(qts);
            }

            if (gDebug > 1 || (gDebug && !csep.isInternal()))
                std::cout << "### For session id " << csep.sessionID() << ", got a CSEP" << std::endl;

            setRMParms(csep.rmParms());

            // @bug 1021. try to get schema cache for a come in query.
            // skip system catalog queries.
            if (!csep.isInternal())
            {
                boost::shared_ptr<execplan::CalpontSystemCatalog> csc =
                    execplan::CalpontSystemCatalog::makeCalpontSystemCatalog(csep.sessionID());
                buildSysCache(csep, csc);
            }

            // As soon as we have a session id for this thread, update the
            // thread count per session; only do this once per thread.
            // Mask 0x80000000 is for associate user query and csc query
            if (incSessionThreadCnt)
            {
                incThreadCntPerSession( csep.sessionID() | 0x80000000 );
                incSessionThreadCnt = false;
            }

            bool needDbProfEndStatementMsg = false;
            logging::Message::Args args;
            std::string sqlText = csep.data();
            logging::LoggingID li(16, csep.sessionID(), csep.txnID());

            // Initialize stats for this query, including
            // init sessionMemMap entry for this session to 0 memory %.
            // We will need this later for traceOn() or if we receive a
            // table request with qb=3 (see below). This is also recorded
            // as query start time.
            initStats( csep.sessionID(), sqlText );
            fStats.fQueryType = csep.queryType();

            // Log start and end statement if tracing is enabled. Keep in
            // mind the trace flag won't be set for system catalog queries.
            if (csep.traceOn())
            {
                args.reset();
                args.add((int)csep.statementID());
                args.add((int)csep.verID().currentScn);
                args.add(sqlText);
                msgLog.logMessage(logging::LOG_TYPE_DEBUG,
                                  logDbProfStartStatement,
                                  args,
                                  li);
                needDbProfEndStatementMsg = true;
            }

            //Don't log subsequent self joins after first.
            if (selfJoin)
                sqlText = "";

            std::ostringstream oss;
            oss << sqlText << "; |" << csep.schemaName() << "|";
            logging::SQLLogger sqlLog(oss.str(), li);

            statementsRunningCount->incr(stmtCounted);

            if (tryTuples)
            {
                try // @bug2244: try/catch around fIos.write() calls responding to makeTupleList
                {
                    std::string emsg("NOERROR");
                    messageqcpp::ByteStream emsgBs;
                    messageqcpp::ByteStream::quadbyte tflg = 0;
                    jl = joblist::JobListFactory::makeJobList(&csep, fRm, true, true);
                    // assign query stats
                    jl->queryStats(fStats);
                    messageqcpp::ByteStream tbs;

                    if ((jl->status()) == 0 && (jl->putEngineComm(fEc) == 0))
                    {
                        usingTuples = true;
                        //Tell the FE that we're sending tuples back, not TableBands
                        tbs << tflg;
                        fIos.write(tbs);
                        emsgBs.reset();
                        emsgBs << emsg;
                        fIos.write(emsgBs);
                        auto tjlp = dynamic_cast<joblist::TupleJobList*>(jl.get());
                        assert(tjlp);
                        tbs.restart();
                        tbs << tjlp->getOutputRowGroup();
                        fIos.write(tbs);
                    }
                    else
                    {
                        // Report the failure to the FE and wait for the next request.
                        statementsRunningCount->decr(stmtCounted);
                        tflg = jl->status();
                        emsg = jl->errMsg();
                        tbs << tflg;
                        fIos.write(tbs);
                        emsgBs.reset();
                        emsgBs << emsg;
                        fIos.write(emsgBs);
                        std::cerr << "ExeMgr: could not build a tuple joblist: " << emsg << std::endl;
                        continue;
                    }
                }
                catch (std::exception& ex)
                {
                    std::ostringstream errMsg;
                    errMsg << "ExeMgr: error writing makeJoblist "
                           "response; " << ex.what();
                    throw std::runtime_error( errMsg.str() );
                }
                catch (...)
                {
                    std::ostringstream errMsg;
                    errMsg << "ExeMgr: unknown error writing makeJoblist "
                           "response; ";
                    throw std::runtime_error( errMsg.str() );
                }

                if (!usingTuples)
                {
                    if (gDebug)
                        std::cout << "### UM wanted tuples but it didn't work out :-(" << std::endl;
                }
                else
                {
                    if (gDebug)
                        std::cout << "### UM wanted tuples and we'll do our best;-)" << std::endl;
                }
            }
            else
            {
                usingTuples = false;
                jl = joblist::JobListFactory::makeJobList(&csep, fRm, false, true);

                if (jl->status() == 0)
                {
                    std::string emsg;

                    if (jl->putEngineComm(fEc) != 0)
                        throw std::runtime_error(jl->errMsg());
                }
                else
                {
                    throw std::runtime_error("ExeMgr: could not build a JobList!");
                }
            }

            jl->doQuery();

            execplan::CalpontSystemCatalog::OID tableOID;
            bool swallowRows = false;
            joblist::DeliveredTableMap tm;
            uint64_t totalBytesSent = 0;
            uint64_t totalRowCount = 0;

            // Project each table as the FE asks for it
            for (;;)
            {
                bs = fIos.read();

                if (bs.length() == 0)
                {
                    if (gDebug > 1 || (gDebug && !csep.isInternal()))
                        std::cout << "### Got a close(2) for session id " << csep.sessionID() << std::endl;

                    break;
                }

                if (gDebug && bs.length() > 4)
                    std::cout << "### For session id " << csep.sessionID() << ", got too many bytes = " <<
                              bs.length() << std::endl;

                //TODO: Holy crud! Can this be right?
                //@bug 1444 Yes, if there is a self-join
                if (bs.length() > 4)
                {
                    selfJoin = true;
                    statementsRunningCount->decr(stmtCounted);
                    goto new_plan;
                }

                assert(bs.length() == 4);

                messageqcpp::ByteStream::quadbyte qb;

                try // @bug2244: try/catch around fIos.write() calls responding to qb command
                {
                    bs >> qb;

                    if (gDebug > 1 || (gDebug && !csep.isInternal()))
                        std::cout << "### For session id " << csep.sessionID() << ", got a command = " << qb << std::endl;

                    if (qb == 0)
                    {
                        // No more tables, query is done
                        break;
                    }
                    else if (qb == 1)
                    {
                        // super-secret flag indicating that the UM is going to scarf down all the rows in the
                        // query.
                        swallowRows = true;
                        tm = jl->deliveredTables();
                        continue;
                    }
                    else if (qb == 2)
                    {
                        // UM just wants any table
                        assert(swallowRows);
                        auto iter = tm.begin();

                        if (iter == tm.end())
                        {
                            if (gDebug > 1 || (gDebug && !csep.isInternal()))
                                std::cout << "### For session id " << csep.sessionID() << ", returning end flag" << std::endl;

                            bs.restart();
                            bs << (messageqcpp::ByteStream::byte)1;
                            fIos.write(bs);
                            continue;
                        }

                        tableOID = iter->first;
                    }
                    else if (qb == 3) //special option-UM wants job stats std::string
                    {
                        std::string statsString;

                        //Log stats std::string to be sent back to front end
                        statsString = formatQueryStats(
                                          jl,
                                          "Query Stats",
                                          false,
                                          !(csep.traceFlags() & execplan::CalpontSelectExecutionPlan::TRACE_TUPLE_OFF),
                                          (csep.traceFlags() & execplan::CalpontSelectExecutionPlan::TRACE_LOG),
                                          totalRowCount
                                      );

                        bs.restart();
                        bs << statsString;

                        if ((csep.traceFlags() & execplan::CalpontSelectExecutionPlan::TRACE_LOG) != 0)
                        {
                            bs << jl->extendedInfo();
                            bs << prettyPrintMiniInfo(jl->miniInfo());
                        }
                        else
                        {
                            std::string empty;
                            bs << empty;
                            bs << empty;
                        }

                        // send stats to connector for inserting to the querystats table
                        fStats.serialize(bs);
                        fIos.write(bs);
                        continue;
                    }
                    // for table mode handling
                    else if (qb == 4)
                    {
                        statementsRunningCount->decr(stmtCounted);
                        bs = fIos.read();
                        goto new_plan;
                    }
                    else // (qb > 3)
                    {
                        //Return table bands for the requested tableOID
                        tableOID = static_cast<execplan::CalpontSystemCatalog::OID>(qb);
                    }
                }
                catch (std::exception& ex)
                {
                    std::ostringstream errMsg;
                    errMsg << "ExeMgr: error writing qb response "
                           "for qb cmd " << qb <<
                           "; " << ex.what();
                    throw std::runtime_error( errMsg.str() );
                }
                catch (...)
                {
                    std::ostringstream errMsg;
                    errMsg << "ExeMgr: unknown error writing qb response "
                           "for qb cmd " << qb;
                    throw std::runtime_error( errMsg.str() );
                }

                if (swallowRows) tm.erase(tableOID);

                FEMsgHandler msgHandler(jl, &fIos);

                if (tableOID == 100)
                    msgHandler.start();

                //...Loop serializing table bands projected for the tableOID
                for (;;)
                {
                    uint32_t rowCount;

                    rowCount = jl->projectTable(tableOID, bs);

                    msgHandler.stop();

                    if (jl->status())
                    {
                        const auto errInfo = logging::IDBErrorInfo::instance();

                        if (jl->errMsg().length() != 0)
                            bs << jl->errMsg();
                        else
                            bs << errInfo->errorMsg(jl->status());
                    }

                    try // @bug2244: try/catch around fIos.write() calls projecting rows
                    {
                        if (csep.traceFlags() & execplan::CalpontSelectExecutionPlan::TRACE_NO_ROWS3)
                        {
                            // Skip the write to the front end until the last empty band. Used to time queries
                            // through without any front end waiting.
                            if (tableOID < 3000 || rowCount == 0)
                                fIos.write(bs);
                        }
                        else
                        {
                            fIos.write(bs);
                        }
                    }
                    catch (std::exception& ex)
                    {
                        msgHandler.stop();
                        std::ostringstream errMsg;
                        errMsg << "ExeMgr: error projecting rows "
                               "for tableOID: " << tableOID <<
                               "; rowCnt: " << rowCount <<
                               "; prevTotRowCnt: " << totalRowCount <<
                               "; " << ex.what();
                        jl->abort();

                        // Drain the remaining bands after the abort.
                        while (rowCount)
                            rowCount = jl->projectTable(tableOID, bs);

                        if (tableOID == 100 && msgHandler.aborted())
                        {
                            /* TODO: modularize the cleanup code, as well as
                             * the rest of this fcn */
                            decThreadCntPerSession( csep.sessionID() | 0x80000000 );
                            statementsRunningCount->decr(stmtCounted);
                            fIos.close();
                            // Background destructor threads reference locals of
                            // this frame; do not return while any is running.
                            waitForJobListCleanup();
                            return;
                        }

                        //std::cout << "connection drop\n";
                        throw std::runtime_error( errMsg.str() );
                    }
                    catch (...)
                    {
                        std::ostringstream errMsg;
                        msgHandler.stop();
                        errMsg << "ExeMgr: unknown error projecting rows "
                               "for tableOID: " <<
                               tableOID << "; rowCnt: " << rowCount <<
                               "; prevTotRowCnt: " << totalRowCount;
                        jl->abort();

                        while (rowCount)
                            rowCount = jl->projectTable(tableOID, bs);

                        throw std::runtime_error( errMsg.str() );
                    }

                    totalRowCount += rowCount;
                    totalBytesSent += bs.length();

                    if (rowCount == 0)
                    {
                        msgHandler.stop();
                        // No more bands, table is done
                        bs.reset();

                        // @bug 2083 decr active statement count here for table mode.
                        if (!usingTuples)
                            statementsRunningCount->decr(stmtCounted);

                        break;
                    }
                    else
                    {
                        bs.restart();
                    }
                } // End of loop to project and serialize table bands for a table
            } // End of loop to process tables

            // @bug 828
            if (csep.traceOn())
                jl->graph(csep.sessionID());

            if (needDbProfEndStatementMsg)
            {
                std::string ss;
                std::ostringstream prefix;
                prefix << "ses:" << csep.sessionID() << " Query Totals";

                //Log stats std::string to standard out
                ss = formatQueryStats(
                         jl,
                         prefix.str(),
                         true,
                         !(csep.traceFlags() & execplan::CalpontSelectExecutionPlan::TRACE_TUPLE_OFF),
                         (csep.traceFlags() & execplan::CalpontSelectExecutionPlan::TRACE_LOG),
                         totalRowCount);
                //@Bug 1306. Added timing info for real time tracking.
                std::cout << ss << " at " << timeNow() << std::endl;

                // log query stats to debug log file
                args.reset();
                args.add((int)csep.statementID());
                args.add(fStats.fMaxMemPct);
                args.add(fStats.fNumFiles);
                args.add(fStats.fFileBytes); // log raw byte count instead of MB
                args.add(fStats.fPhyIO);
                args.add(fStats.fCacheIO);
                args.add(fStats.fMsgRcvCnt);
                args.add(fStats.fMsgBytesIn);
                args.add(fStats.fMsgBytesOut);
                args.add(fStats.fCPBlocksSkipped);
                msgLog.logMessage(logging::LOG_TYPE_DEBUG,
                                  logDbProfQueryStats,
                                  args,
                                  li);
                //@bug 1327
                deleteMaxMemPct( csep.sessionID() );
                // Calling reset here, will cause joblist destructor to be
                // called, which "joins" the threads. We need to do that
                // here to make sure all syslogging from all the threads
                // are complete; and that our logDbProfEndStatement will
                // appear "last" in the syslog for this SQL statement.
                // puts the real destruction in another thread to avoid
                // making the whole session wait. It can take several seconds.
                int stmtID = csep.statementID();
                std::unique_lock<std::mutex> scoped(jlMutex);
                // C7's compiler complains about the msgLog capture here
                // msgLog is global scope, and passed by copy, so, unclear
                // what the warning is about.
                destructing++;
                // li is captured by value: it is scoped to this loop iteration,
                // while the detached thread may log after the iteration ended.
                std::thread bgdtor([jl, &jlMutex, &jlCleanupDone, stmtID, li, &destructing] {
                    std::unique_lock<std::mutex> scoped(jlMutex);
                    const_cast<joblist::SJLP &>(jl).reset(); // this happens second; does real destruction
                    logging::Message::Args args;
                    args.add(stmtID);
                    msgLog.logMessage(logging::LOG_TYPE_DEBUG,
                                      logDbProfEndStatement,
                                      args,
                                      li);
                    if (--destructing == 0)
                        jlCleanupDone.notify_one();
                });
                jl.reset(); // this happens first
                bgdtor.detach();
            }
            else
                // delete sessionMemMap entry for this session's memory % use
                deleteMaxMemPct( csep.sessionID() );

            std::string endtime(timeNow());

            if ((csep.traceFlags() & flagsWantOutput) && (csep.sessionID() < 0x80000000))
            {
                std::cout << "For session " << csep.sessionID() << ": " <<
                          totalBytesSent <<
                          " bytes sent back at " << endtime << std::endl;

                // @bug 663 - Implemented caltraceon(16) to replace the
                // $FIFO_SINK compiler definition in pColStep.
                // This option consumes rows in the project steps.
                if (csep.traceFlags() &
                        execplan::CalpontSelectExecutionPlan::TRACE_NO_ROWS4)
                {
                    std::cout << std::endl;
                    std::cout << "**** No data returned to DM. Rows consumed "
                              "in ProjectSteps - caltrace(16) is on (FIFO_SINK)."
                              " ****" << std::endl;
                    std::cout << std::endl;
                }
                else if (csep.traceFlags() &
                         execplan::CalpontSelectExecutionPlan::TRACE_NO_ROWS3)
                {
                    std::cout << std::endl;
                    std::cout << "**** No data returned to DM - caltrace(8) is "
                              "on (SWALLOW_ROWS_EXEMGR). ****" << std::endl;
                    std::cout << std::endl;
                }
            }

            statementsRunningCount->decr(stmtCounted);

            if ( !csep.isInternal() &&
                    (csep.queryType() == "SELECT" || csep.queryType() == "INSERT_SELECT") )
            {
                qts.msg_type = querytele::QueryTeleStats::QT_SUMMARY;
                qts.max_mem_pct = fStats.fMaxMemPct;
                qts.num_files = fStats.fNumFiles;
                qts.phy_io = fStats.fPhyIO;
                qts.cache_io = fStats.fCacheIO;
                qts.msg_rcv_cnt = fStats.fMsgRcvCnt;
                qts.cp_blocks_skipped = fStats.fCPBlocksSkipped;
                qts.msg_bytes_in = fStats.fMsgBytesIn;
                qts.msg_bytes_out = fStats.fMsgBytesOut;
                qts.rows = totalRowCount;
                qts.end_time = querytele::QueryTeleClient::timeNowms();
                qts.session_id = csep.sessionID();
                qts.query_type = csep.queryType();
                qts.query = csep.data();
                qts.system_name = fOamCachePtr->getSystemName();
                qts.module_name = fOamCachePtr->getModuleName();
                qts.local_query = csep.localQuery();
                fTeleClient.postQueryTele(qts);
            }
        }

        // Release CSC object (for sessionID) that was added by makeJobList()
        // Mask 0x80000000 is for associate user query and csc query.
        // (actual joblist destruction happens at the top of this loop)
        decThreadCntPerSession( csep.sessionID() | 0x80000000 );
    }
    catch (std::exception& ex)
    {
        decThreadCntPerSession( csep.sessionID() | 0x80000000 );
        statementsRunningCount->decr(stmtCounted);
        std::cerr << "### ExeMgr ses:" << csep.sessionID() << " caught: " << ex.what() << std::endl;
        logging::Message::Args args;
        logging::LoggingID li(16, csep.sessionID(), csep.txnID());
        args.add(ex.what());
        msgLog.logMessage(logging::LOG_TYPE_CRITICAL, logExeMgrExcpt, args, li);
        fIos.close();
    }
    catch (...)
    {
        decThreadCntPerSession( csep.sessionID() | 0x80000000 );
        statementsRunningCount->decr(stmtCounted);
        std::cerr << "### Exception caught!" << std::endl;
        logging::Message::Args args;
        logging::LoggingID li(16, csep.sessionID(), csep.txnID());
        args.add("ExeMgr caught unknown exception");
        msgLog.logMessage(logging::LOG_TYPE_CRITICAL, logExeMgrExcpt, args, li);
        fIos.close();
    }

    // make sure we don't leave scope while joblists are being destroyed
    waitForJobListCleanup();
}
};
// Memory monitor functor, run on its own thread by startRssMon().
// Each cycle it computes the process's memory use as a percentage of
// fMemTotal and
//   - when that exceeds fMaxPct: logs and exits the process if the limit is
//     95% or more (critical) or if no statement is running (warning);
//     otherwise only prints a notice, since statements are still running;
//   - raises every sessionMemMap entry that is below the current percentage;
// then pauses via pause_() before the next cycle.
class RssMonFcn : public utils::MonitorProcMem
{
public:
    RssMonFcn(size_t maxPct, int pauseSeconds) :
        MonitorProcMem(maxPct, 0, 21, pauseSeconds) {}

    /*virtual*/
    void operator()() const
    {
        for (;;)
        {
            const size_t rssMb = rss();
            const size_t pct = rssMb * 100 / fMemTotal;

            if (pct > fMaxPct)
            {
                const bool hardLimit = (fMaxPct >= 95);

                // The running-statement count is only consulted below the hard limit.
                if (hardLimit || statementsRunningCount->cur() == 0)
                {
                    std::cerr << "Too much memory allocated!" << std::endl;
                    logging::Message::Args args;
                    args.add((int)pct);
                    args.add((int)fMaxPct);
                    msgLog.logMessage(hardLimit ? logging::LOG_TYPE_CRITICAL : logging::LOG_TYPE_WARNING,
                                      logRssTooBig, args, logging::LoggingID(16));
                    exit(1);
                }

                std::cerr << "Too much memory allocated, but stmts running" << std::endl;
            }

            // Update sessionMemMap entries lower than current mem % use
            {
                std::lock_guard<std::mutex> guard(sessionMemMapMutex);

                for (auto& entry : sessionMemMap)
                {
                    if (pct > entry.second)
                        entry.second = pct;
                }
            }

            pause_();
        }
    }
};
#ifdef _MSC_VER
// Signal handler used on MSVC builds (installed for SIGINT and SIGTERM by
// setupSignalHandlers): terminates the process with exit status 0.
// The signal number is ignored.
void exit_(int /*signum*/)
{
    exit(0);
}
#endif
void added_a_pm(int)
{
logging::LoggingID logid(21, 0, 0);
logging::Message::Args args1;
logging::Message msg(1);
args1.add("exeMgr caught SIGHUP. Resetting connections");
msg.format( args1 );
std::cout << msg.msg().c_str() << std::endl;
logging::Logger logger(logid.fSubsysID);
logger.logMessage(logging::LOG_TYPE_DEBUG, msg, logid);
if (ec)
{
//set BUSY_INIT state while processing the add pm configuration change
oam::Oam oam;
try
{
oam.processInitComplete("ExeMgr", oam::BUSY_INIT);
}
catch (...)
{}
oam::OamCache* oamCache = oam::OamCache::makeOamCache();
oamCache->forceReload();
ec->Setup();
//set ACTIVE state
try
{
oam.processInitComplete("ExeMgr");
}
catch (...)
{}
}
}
// Signal handler (installed for SIGUSR1 by setupSignalHandlers): prints the
// value returned by the resource manager's availableMemory() to stdout.
// The signal number is not used.
void printTotalUmMemory(int sig)
{
    const int64_t available = rm->availableMemory();

    std::cout << "Total UM memory available: "
              << available
              << std::endl;
}
/**
 * @brief Install the process-wide signal dispositions.
 *
 * MSVC builds: SIGINT and SIGTERM -> exit_.
 * Other builds:
 *   SIGPIPE                  -> ignored
 *   SIGHUP                   -> added_a_pm
 *   SIGUSR1                  -> printTotalUmMemory
 *   SIGSEGV, SIGABRT, SIGFPE -> fatalHandler
 * Return values of sigaction()/signal() are not checked.
 */
void setupSignalHandlers()
{
#ifdef _MSC_VER
    signal(SIGINT, exit_);
    signal(SIGTERM, exit_);
#else
    // Installs `handler` for `signum` using an otherwise zeroed sigaction.
    auto install = [](int signum, void (*handler)(int))
    {
        struct sigaction action;
        memset(&action, 0, sizeof(action));
        action.sa_handler = handler;
        sigaction(signum, &action, 0);
    };

    install(SIGPIPE, SIG_IGN);
    install(SIGHUP, added_a_pm);
    install(SIGUSR1, printTotalUmMemory);
    install(SIGSEGV, fatalHandler);
    install(SIGABRT, fatalHandler);
    install(SIGFPE, fatalHandler);
#endif
}
/**
 * @brief Change the working directory to the configured scratch directory.
 *
 * Tries rm->getScWorkingDir() first; if the chdir fails or the resulting
 * current directory is not writable, falls back to "/tmp".
 *
 * @param rm resource manager supplying the working directory path
 * @return -5 if the last chdir attempted failed, otherwise that chdir's result
 */
int8_t setupCwd(joblist::ResourceManager* rm)
{
    const std::string scratchDir = rm->getScWorkingDir();
    int8_t status = chdir(scratchDir.c_str());

    // access() is only consulted when the chdir itself succeeded.
    const bool needFallback = (status < 0) || (access(".", W_OK) != 0);

    if (needFallback)
        status = chdir("/tmp");

    if (status < 0)
        return -5;

    return status;
}
void startRssMon(size_t maxPct, int pauseSeconds)
{
new boost::thread(RssMonFcn(maxPct, pauseSeconds));
}
/**
 * @brief Set the open-file limit (RLIMIT_NOFILE) to 65536.
 *
 * Both the soft and the hard limit are set to the same value. No-op on MSVC.
 *
 * @return  0 on success (or on MSVC),
 *         -1 if the initial getrlimit() fails,
 *         -2 if setrlimit() fails,
 *         -3 if the verifying getrlimit() fails,
 *         -4 if the soft limit read back is not 65536.
 */
int setupResources()
{
#ifdef _MSC_VER
    //FIXME:
#else
    const rlim_t wantedOpenFiles = 65536;
    struct rlimit limits;

    if (getrlimit(RLIMIT_NOFILE, &limits) != 0)
        return -1;

    limits.rlim_cur = wantedOpenFiles;
    limits.rlim_max = wantedOpenFiles;

    if (setrlimit(RLIMIT_NOFILE, &limits) != 0)
        return -2;

    // Read the limit back to verify that the new value really took effect.
    if (getrlimit(RLIMIT_NOFILE, &limits) != 0)
        return -3;

    if (limits.rlim_cur != wantedOpenFiles)
        return -4;

#endif
    return 0;
}
}
void cleanTempDir()
{
const auto config = config::Config::makeConfig();
std::string allowDJS = config->getConfig("HashJoin", "AllowDiskBasedJoin");
std::string tmpPrefix = config->getConfig("HashJoin", "TempFilePath");
if (allowDJS == "N" || allowDJS == "n")
return;
if (tmpPrefix.empty())
tmpPrefix = "/tmp/cs-diskjoin";
tmpPrefix += "/";
assert(tmpPrefix != "/");
/* This is quite scary as ExeMgr usually runs as root */
try
{
boost::filesystem::remove_all(tmpPrefix);
boost::filesystem::create_directories(tmpPrefix);
}
catch (const std::exception& ex)
{
std::cerr << ex.what() << std::endl;
logging::LoggingID logid(16, 0, 0);
logging::Message::Args args;
logging::Message message(8);
args.add("Execption whilst cleaning tmpdir: ");
args.add(ex.what());
message.format( args );
logging::Logger logger(logid.fSubsysID);
logger.logMessage(logging::LOG_TYPE_WARNING, message, logid);
}
catch (...)
{
std::cerr << "Caught unknown exception during tmpdir cleanup" << std::endl;
logging::LoggingID logid(16, 0, 0);
logging::Message::Args args;
logging::Message message(8);
args.add("Unknown execption whilst cleaning tmpdir");
message.format( args );
logging::Logger logger(logid.fSubsysID);
logger.logMessage(logging::LOG_TYPE_WARNING, message, logid);
}
}
int main(int argc, char* argv[])
{
// Set locale language
setlocale(LC_ALL, "");
setlocale(LC_NUMERIC, "C");
// Initialize the charset library
my_init();
// This is unset due to the way we start it
program_invocation_short_name = const_cast<char*>("ExeMgr");
gDebug = 0;
bool eFlg = false;
int c;
opterr = 0;
while ((c = getopt(argc, argv, "ed")) != EOF)
switch (c)
{
case 'd':
gDebug++;
break;
case 'e':
eFlg = true;
break;
case '?':
default:
break;
}
//set BUSY_INIT state
{
oam::Oam oam;
try
{
oam.processInitComplete("ExeMgr", oam::BUSY_INIT);
}
catch (...)
{
}
}
#ifdef _MSC_VER
//FIXME:
#else
// Make sure CSC thinks it's on a UM or else bucket reuse stuff below will stall
if (!eFlg)
setenv("CALPONT_CSC_IDENT", "um", 1);
#endif
setupSignalHandlers();
int err = 0;
if (!gDebug)
err = setupResources();
std::string errMsg;
switch (err)
{
case -1:
case -3:
errMsg = "Error getting file limits, please see non-root install documentation";
break;
case -2:
errMsg = "Error setting file limits, please see non-root install documentation";
break;
case -4:
errMsg = "Could not install file limits to required value, please see non-root install documentation";
break;
default:
errMsg = "Couldn't change working directory or unknown error";
break;
}
err = setupCwd(rm);
if (err < 0)
{
oam::Oam oam;
logging::Message::Args args;
logging::Message message;
args.add( errMsg );
message.format(args);
logging::LoggingID lid(16);
logging::MessageLog ml(lid);
ml.logCriticalMessage( message );
std::cerr << errMsg << std::endl;
try
{
oam.processInitFailure();
}
catch (...)
{
}
return 2;
}
cleanTempDir();
logging::MsgMap msgMap;
msgMap[logDefaultMsg] = logging::Message(logDefaultMsg);
msgMap[logDbProfStartStatement] = logging::Message(logDbProfStartStatement);
msgMap[logDbProfEndStatement] = logging::Message(logDbProfEndStatement);
msgMap[logStartSql] = logging::Message(logStartSql);
msgMap[logEndSql] = logging::Message(logEndSql);
msgMap[logRssTooBig] = logging::Message(logRssTooBig);
msgMap[logDbProfQueryStats] = logging::Message(logDbProfQueryStats);
msgMap[logExeMgrExcpt] = logging::Message(logExeMgrExcpt);
msgLog.msgMap(msgMap);
ec = joblist::DistributedEngineComm::instance(rm, true);
ec->Open();
bool tellUser = true;
messageqcpp::MessageQueueServer* mqs;
statementsRunningCount = new ActiveStatementCounter(rm->getEmExecQueueSize());
for (;;)
{
try
{
mqs = new messageqcpp::MessageQueueServer(ExeMgr, rm->getConfig(), messageqcpp::ByteStream::BlockSize, 64);
break;
}
catch (std::runtime_error& re)
{
std::string what = re.what();
if (what.find("Address already in use") != std::string::npos)
{
if (tellUser)
{
std::cerr << "Address already in use, retrying..." << std::endl;
tellUser = false;
}
sleep(5);
}
else
{
throw;
}
}
}
// class jobstepThreadPool is used by other processes. We can't call
// resourcemanaager (rm) functions during the static creation of threadpool
// because rm has a "isExeMgr" flag that is set upon creation (rm is a singleton).
// From the pools perspective, it has no idea if it is ExeMgr doing the
// creation, so it has no idea which way to set the flag. So we set the max here.
joblist::JobStep::jobstepThreadPool.setMaxThreads(rm->getJLThreadPoolSize());
joblist::JobStep::jobstepThreadPool.setName("ExeMgrJobList");
if (rm->getJlThreadPoolDebug() == "Y" || rm->getJlThreadPoolDebug() == "y")
{
joblist::JobStep::jobstepThreadPool.setDebug(true);
joblist::JobStep::jobstepThreadPool.invoke(threadpool::ThreadPoolMonitor(&joblist::JobStep::jobstepThreadPool));
}
int serverThreads = rm->getEmServerThreads();
int maxPct = rm->getEmMaxPct();
int pauseSeconds = rm->getEmSecondsBetweenMemChecks();
int priority = rm->getEmPriority();
FEMsgHandler::threadPool.setMaxThreads(serverThreads);
FEMsgHandler::threadPool.setName("FEMsgHandler");
if (maxPct > 0)
startRssMon(maxPct, pauseSeconds);
#ifndef _MSC_VER
setpriority(PRIO_PROCESS, 0, priority);
#endif
std::string teleServerHost(rm->getConfig()->getConfig("QueryTele", "Host"));
if (!teleServerHost.empty())
{
int teleServerPort = toInt(rm->getConfig()->getConfig("QueryTele", "Port"));
if (teleServerPort > 0)
{
gTeleServerParms.host = teleServerHost;
gTeleServerParms.port = teleServerPort;
}
}
std::cout << "Starting ExeMgr: st = " << serverThreads <<
", qs = " << rm->getEmExecQueueSize() << ", mx = " << maxPct << ", cf = " <<
rm->getConfig()->configFile() << std::endl;
//set ACTIVE state
{
oam::Oam oam;
try
{
oam.processInitComplete("ExeMgr");
}
catch (...)
{
}
}
if (getenv("SKIP_OAM_INIT"))
{
BRM::DBRM *dbrm = new BRM::DBRM();
dbrm->setSystemQueryReady(true);
delete dbrm;
}
threadpool::ThreadPool exeMgrThreadPool(serverThreads, 0);
exeMgrThreadPool.setName("ExeMgrServer");
if (rm->getExeMgrThreadPoolDebug() == "Y" || rm->getExeMgrThreadPoolDebug() == "y")
{
exeMgrThreadPool.setDebug(true);
exeMgrThreadPool.invoke(threadpool::ThreadPoolMonitor(&exeMgrThreadPool));
}
for (;;)
{
messageqcpp::IOSocket ios;
ios = mqs->accept();
exeMgrThreadPool.invoke(SessionThread(ios, ec, rm));
}
exeMgrThreadPool.wait();
return 0;
}
// vim:ts=4 sw=4: