/* Copyright (C) 2014 InfiniDB, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /*********************************************************************** * $Id: pcolstep.cpp 9655 2013-06-25 23:08:13Z xlou $ * * ***********************************************************************/ #include #include #include //#define NDEBUG #include #include #include using namespace std; #include "distributedenginecomm.h" #include "elementtype.h" #include "unique32generator.h" #include "messagequeue.h" using namespace messageqcpp; #include "configcpp.h" using namespace config; #include "messagelog.h" #include "messageobj.h" #include "loggingid.h" using namespace logging; #include "calpontsystemcatalog.h" #include "logicoperator.h" using namespace execplan; #include "brm.h" using namespace BRM; #include "idbcompress.h" #include "jlf_common.h" #include "primitivestep.h" // #define DEBUG 1 namespace joblist { #if 0 //const uint32_t defaultProjectBlockReqLimit = 32768; //const uint32_t defaultProjectBlockReqThreshold = 16384; struct pColStepPrimitive { pColStepPrimitive(pColStep* pColStep) : fPColStep(pColStep) {} pColStep* fPColStep; void operator()() { try { fPColStep->sendPrimitiveMessages(); } catch (exception& re) { cerr << "pColStep: send thread threw an exception: " << re.what() << "\t" << this << endl; } } }; struct pColStepAggregator { pColStepAggregator(pColStep* pColStep) : fPColStepCol(pColStep) {} pColStep* fPColStepCol; void operator()() { try { fPColStepCol->receivePrimitiveMessages(); } catch (exception& re) { cerr << fPColStepCol->toString() << ": recv thread threw an exception: " << re.what() << endl; } } }; #endif pColStep::pColStep( CalpontSystemCatalog::OID o, CalpontSystemCatalog::OID t, const CalpontSystemCatalog::ColType& ct, const JobInfo& jobInfo) : JobStep(jobInfo), fRm(jobInfo.rm), sysCat(jobInfo.csc), fOid(o), fTableOid(t), fColType(ct), fFilterCount(0), fBOP(BOP_NONE), ridList(0), msgsSent(0), msgsRecvd(0), finishedSending(false), recvWaiting(false), fIsDict(false), isEM(jobInfo.isExeMgr), ridCount(0), fFlushInterval(jobInfo.flushInterval), fSwallowRows(false), fProjectBlockReqLimit(fRm->getJlProjectBlockReqLimit()), fProjectBlockReqThreshold(fRm->getJlProjectBlockReqThreshold()), fStopSending(false), isFilterFeeder(false), fPhysicalIO(0), fCacheIO(0), fNumBlksSkipped(0), fMsgBytesIn(0), fMsgBytesOut(0) { if (fTableOid == 0) // cross engine support return; int err, i; uint32_t mask; if (fFlushInterval == 0 || !isEM) fOutputType = OT_BOTH; else fOutputType = OT_TOKEN; if (fOid < 1000) throw runtime_error("pColStep: invalid column"); compress::IDBCompressInterface cmpif; if (!cmpif.isCompressionAvail(fColType.compressionType)) { ostringstream oss; oss << "Unsupported compression type " << fColType.compressionType; oss << " for " << sysCat->colName(fOid); #ifdef SKIP_IDB_COMPRESSION oss << ". It looks you're running Community binaries on an Enterprise database."; #endif throw runtime_error(oss.str()); } realWidth = fColType.colWidth; if ( fColType.colDataType == CalpontSystemCatalog::VARCHAR ) { if (8 > fColType.colWidth && 4 <= fColType.colWidth ) fColType.colDataType = CalpontSystemCatalog::CHAR; fColType.colWidth++; } //If this is a dictionary column, fudge the numbers... if ((fColType.colDataType == CalpontSystemCatalog::VARBINARY) || (fColType.colDataType == CalpontSystemCatalog::BLOB) || (fColType.colDataType == CalpontSystemCatalog::TEXT)) { fColType.colWidth = 8; fIsDict = true; } else if (fColType.colWidth > 8 ) { fColType.colWidth = 8; fIsDict = true; //TODO: is this right? fColType.colDataType = CalpontSystemCatalog::VARCHAR; } //Round colWidth up if (fColType.colWidth == 3) fColType.colWidth = 4; else if (fColType.colWidth == 5 || fColType.colWidth == 6 || fColType.colWidth == 7) fColType.colWidth = 8; idbassert(fColType.colWidth > 0); ridsPerBlock = BLOCK_SIZE / fColType.colWidth; /* calculate some shortcuts for extent and block based arithmetic */ extentSize = (fRm->getExtentRows() * fColType.colWidth) / BLOCK_SIZE; for (i = 1, mask = 1, modMask = 0; i <= 32; i++) { mask <<= 1; modMask = (modMask << 1) | 1; if (extentSize & mask) { divShift = i; break; } } for (i++, mask <<= 1; i <= 32; i++, mask <<= 1) if (extentSize & mask) throw runtime_error("pColStep: Extent size must be a power of 2 in blocks"); /* calculate shortcuts for rid-based arithmetic */ for (i = 1, mask = 1, rpbMask = 0; i <= 32; i++) { mask <<= 1; rpbMask = (rpbMask << 1) | 1; if (ridsPerBlock & mask) { rpbShift = i; break; } } for (i++, mask <<= 1; i <= 32; i++, mask <<= 1) if (ridsPerBlock & mask) throw runtime_error("pColStep: Block size and column width must be a power of 2"); for (i = 0, mask = 1, blockSizeShift = 0; i < 32; i++) { if (mask == BLOCK_SIZE) { blockSizeShift = i; break; } mask <<= 1; } if (i == 32) throw runtime_error("pColStep: Block size must be a power of 2"); err = dbrm.getExtents(o, extents); if (err) { ostringstream os; os << "pColStep: BRM lookup error. Could not get extents for OID " << o; throw runtime_error(os.str()); } if (fOid > 3000) { lbidList.reset(new LBIDList(fOid, 0)); } sort(extents.begin(), extents.end(), ExtentSorter()); numExtents = extents.size(); // uniqueID = UniqueNumberGenerator::instance()->getUnique32(); // if (fDec) // fDec->addQueue(uniqueID); // initializeConfigParms ( ); } pColStep::pColStep(const pColScanStep& rhs) : JobStep(rhs), fRm(rhs.resourceManager()), fOid(rhs.oid()), fTableOid(rhs.tableOid()), fColType(rhs.colType()), fFilterCount(rhs.filterCount()), fBOP(rhs.BOP()), ridList(0), fFilterString(rhs.filterString()), msgsSent(0), msgsRecvd(0), finishedSending(false), recvWaiting(false), fIsDict(rhs.isDictCol()), ridCount(0), // Per Cindy, it's save to put fFlushInterval to be 0 fFlushInterval(0), fSwallowRows(false), fProjectBlockReqLimit(fRm->getJlProjectBlockReqLimit()), fProjectBlockReqThreshold(fRm->getJlProjectBlockReqThreshold()), fStopSending(false), fPhysicalIO(0), fCacheIO(0), fNumBlksSkipped(0), fMsgBytesIn(0), fMsgBytesOut(0), fFilters(rhs.getFilters()) { int err, i; uint32_t mask; if (fTableOid == 0) // cross engine support return; if (fOid < 1000) throw runtime_error("pColStep: invalid column"); ridsPerBlock = rhs.getRidsPerBlock(); /* calculate some shortcuts for extent and block based arithmetic */ extentSize = (fRm->getExtentRows() * fColType.colWidth) / BLOCK_SIZE; for (i = 1, mask = 1, modMask = 0; i <= 32; i++) { mask <<= 1; modMask = (modMask << 1) | 1; if (extentSize & mask) { divShift = i; break; } } for (i++, mask <<= 1; i <= 32; i++, mask <<= 1) if (extentSize & mask) throw runtime_error("pColStep: Extent size must be a power of 2 in blocks"); /* calculate shortcuts for rid-based arithmetic */ for (i = 1, mask = 1, rpbMask = 0; i <= 32; i++) { mask <<= 1; rpbMask = (rpbMask << 1) | 1; if (ridsPerBlock & mask) { rpbShift = i; break; } } for (i++, mask <<= 1; i <= 32; i++, mask <<= 1) if (ridsPerBlock & mask) throw runtime_error("pColStep: Block size and column width must be a power of 2"); for (i = 0, mask = 1, blockSizeShift = 0; i < 32; i++) { if (mask == BLOCK_SIZE) { blockSizeShift = i; break; } mask <<= 1; } if (i == 32) throw runtime_error("pColStep: Block size must be a power of 2"); err = dbrm.getExtents(fOid, extents); if (err) { ostringstream os; os << "pColStep: BRM lookup error. Could not get extents for OID " << fOid; throw runtime_error(os.str()); } lbidList = rhs.getlbidList(); sort(extents.begin(), extents.end(), ExtentSorter()); numExtents = extents.size(); fOnClauseFilter = rhs.onClauseFilter(); // uniqueID = UniqueNumberGenerator::instance()->getUnique32(); // if (fDec) // fDec->addQueue(uniqueID); // initializeConfigParms ( ); } pColStep::pColStep(const PassThruStep& rhs) : JobStep(rhs), fRm(rhs.resourceManager()), fOid(rhs.oid()), fTableOid(rhs.tableOid()), fColType(rhs.colType()), fFilterCount(0), fBOP(BOP_NONE), ridList(0), msgsSent(0), msgsRecvd(0), finishedSending(false), recvWaiting(false), fIsDict(rhs.isDictCol()), ridCount(0), // Per Cindy, it's save to put fFlushInterval to be 0 fFlushInterval(0), fSwallowRows(false), fProjectBlockReqLimit(fRm->getJlProjectBlockReqLimit()), fProjectBlockReqThreshold(fRm->getJlProjectBlockReqThreshold()), fStopSending(false), fPhysicalIO(0), fCacheIO(0), fNumBlksSkipped(0), fMsgBytesIn(0), fMsgBytesOut(0) { int err, i; uint32_t mask; if (fTableOid == 0) // cross engine support return; if (fOid < 1000) throw runtime_error("pColStep: invalid column"); ridsPerBlock = BLOCK_SIZE / fColType.colWidth; /* calculate some shortcuts for extent and block based arithmetic */ extentSize = (fRm->getExtentRows() * fColType.colWidth) / BLOCK_SIZE; for (i = 1, mask = 1, modMask = 0; i <= 32; i++) { mask <<= 1; modMask = (modMask << 1) | 1; if (extentSize & mask) { divShift = i; break; } } for (i++, mask <<= 1; i <= 32; i++, mask <<= 1) if (extentSize & mask) throw runtime_error("pColStep: Extent size must be a power of 2 in blocks"); /* calculate shortcuts for rid-based arithmetic */ for (i = 1, mask = 1, rpbMask = 0; i <= 32; i++) { mask <<= 1; rpbMask = (rpbMask << 1) | 1; if (ridsPerBlock & mask) { rpbShift = i; break; } } for (i++, mask <<= 1; i <= 32; i++, mask <<= 1) if (ridsPerBlock & mask) throw runtime_error("pColStep: Block size and column width must be a power of 2"); for (i = 0, mask = 1, blockSizeShift = 0; i < 32; i++) { if (mask == BLOCK_SIZE) { blockSizeShift = i; break; } mask <<= 1; } if (i == 32) throw runtime_error("pColStep: Block size must be a power of 2"); err = dbrm.getExtents(fOid, extents); if (err) { ostringstream os; os << "pColStep: BRM lookup error. Could not get extents for OID " << fOid; throw runtime_error(os.str()); } sort(extents.begin(), extents.end(), ExtentSorter()); numExtents = extents.size(); // uniqueID = UniqueNumberGenerator::instance()->getUnique32(); // if (fDec) // fDec->addQueue(uniqueID); // initializeConfigParms ( ); } pColStep::~pColStep() { // join? //delete lbidList; // if (fDec) // fDec->removeQueue(uniqueID); } //------------------------------------------------------------------------------ // Initialize configurable parameters //------------------------------------------------------------------------------ void pColStep::initializeConfigParms() { // const string section ( "JobList" ); // const string sendLimitName ( "ProjectBlockReqLimit" ); // const string sendThresholdName ( "ProjectBlockReqThreshold" ); // Config* cf = Config::makeConfig(); // // string strVal; // uint64_t numVal; //...Get the tuning parameters that throttle msgs sent to primproc //...fFilterRowReqLimit puts a cap on how many rids we will request from //... primproc, before pausing to let the consumer thread catch up. //... Without this limit, there is a chance that PrimProc could flood //... ExeMgr with thousands of messages that will consume massive //... amounts of memory for a 100 gigabyte database. //...fFilterRowReqThreshhold is the level at which the number of outstanding //... rids must fall below, before the producer can send more rids. // strVal = cf->getConfig(section, sendLimitName); // if (strVal.size() > 0) // { // errno = 0; // numVal = Config::uFromText(strVal); // if ( errno == 0 ) // fProjectBlockReqLimit = (uint32_t)numVal; // } // // strVal = cf->getConfig(section, sendThresholdName); // if (strVal.size() > 0) // { // errno = 0; // numVal = Config::uFromText(strVal); // if ( errno == 0 ) // fProjectBlockReqThreshold = (uint32_t)numVal; // } } void pColStep::startPrimitiveThread() { // pThread.reset(new boost::thread(pColStepPrimitive(this))); } void pColStep::startAggregationThread() { // cThread.reset(new boost::thread(pColStepAggregator(this))); } void pColStep::run() { // if (traceOn()) // { // syslogStartStep(16, // exemgr subsystem // std::string("pColStep")); // step name // } // // size_t sz = fInputJobStepAssociation.outSize(); // idbassert(sz > 0); // const AnyDataListSPtr& dl = fInputJobStepAssociation.outAt(0); // DataList_t* dlp = dl->dataList(); // DataList* strDlp = dl->stringDataList(); // if ( dlp ) // setRidList(dlp); // else // { // setStrRidList( strDlp ); // } // //Sort can be set through the jobstep or the input JSA if fFlushinterval is 0 // fToSort = (fFlushInterval) ? 0 : (!fToSort) ? fInputJobStepAssociation.toSort() : fToSort; // fToSort = 0; // //pthread_mutex_init(&mutex, NULL); // //pthread_cond_init(&condvar, NULL); // //pthread_cond_init(&flushed, NULL); // startPrimitiveThread(); // startAggregationThread(); } void pColStep::join() { // pThread->join(); // cThread->join(); // //pthread_mutex_destroy(&mutex); // //pthread_cond_destroy(&condvar); // //pthread_cond_destroy(&flushed); } void pColStep::addFilter(int8_t COP, float value) { fFilterString << (uint8_t) COP; fFilterString << (uint8_t) 0; fFilterString << *((uint32_t*) &value); fFilterCount++; } void pColStep::addFilter(int8_t COP, int64_t value, uint8_t roundFlag) { int8_t tmp8; int16_t tmp16; int32_t tmp32; fFilterString << (uint8_t) COP; fFilterString << roundFlag; // converts to a type of the appropriate width, then bitwise // copies into the filter ByteStream switch (fColType.colWidth) { case 1: tmp8 = value; fFilterString << *((uint8_t*) &tmp8); break; case 2: tmp16 = value; fFilterString << *((uint16_t*) &tmp16); break; case 4: tmp32 = value; fFilterString << *((uint32_t*) &tmp32); break; case 8: fFilterString << *((uint64_t*) &value); break; default: ostringstream o; o << "pColStep: CalpontSystemCatalog says OID " << fOid << " has a width of " << fColType.colWidth; throw runtime_error(o.str()); } fFilterCount++; } void pColStep::setRidList(DataList* dl) { ridList = dl; } void pColStep::setStrRidList(DataList* strDl) { strRidList = strDl; } void pColStep::setBOP(int8_t b) { fBOP = b; } void pColStep::setOutputType(int8_t OutputType) { fOutputType = OutputType; } void pColStep::setSwallowRows(const bool swallowRows) { fSwallowRows = swallowRows; } void pColStep::sendPrimitiveMessages() { // int it = -1; // int msgRidCount = 0; // int ridListIdx = 0; // bool more = false; // uint64_t absoluteRID = 0; // int64_t msgLBID = -1; // int64_t nextLBID = -1; // int64_t msgLargeBlock = -1; // int64_t nextLargeBlock = -1; // uint16_t blockRelativeRID; // uint32_t msgCount = 0; // uint32_t sentBlockCount = 0; // int msgsSkip=0; // bool scan=false; // bool scanThisBlock=false; // ElementType e; // UintRowGroup rw; // StringElementType strE; // StringRowGroup strRw; // // ByteStream msgRidList; // ByteStream primMsg(MAX_BUFFER_SIZE); //the MAX_BUFFER_SIZE as of 8/20 // // NewColRequestHeader hdr; // // AnyDataListSPtr dl; // FifoDataList *fifo = NULL; // StringFifoDataList* strFifo = NULL; // // const bool ignoreCP = ((fTraceFlags & CalpontSelectExecutionPlan::IGNORE_CP) != 0); // // //The presence of more than 1 input DL means we (probably) have a pDictionaryScan step feeding this step // // a list of tokens to get the rids for. Convert the input tokens to a filter string. We also have a rid // // list as the second input dl // if (fInputJobStepAssociation.outSize() > 1) // { // addFilters(); // if (fTableOid >= 3000) // cout << toString() << endl; // //If we got no input rids (as opposed to no input DL at all) then there were no matching rows from // // the previous step, so this step should not return any rows either. This would be the case, for // // instance, if P_NAME LIKE '%xxxx%' produced no signature matches. // if (fFilterCount == 0) // { // goto done; // } // } // // // determine which ranges/extents to eliminate from this step // //#ifdef DEBUG // if (fOid>=3000) // cout << "oid " << fOid << endl; //#endif // // scanFlags.resize(numExtents); // // for (uint32_t idx=0; idx CasualPartitionPredicate( // extents[idx].partition.cprange.lo_val, // extents[idx].partition.cprange.hi_val, // &fFilterString, // fFilterCount, // fColType, // fBOP) || ignoreCP; // scanFlags[idx]=flag; //#ifdef DEBUG // if (fOid >= 3000 && flushInterval == 0) // cout << (flag ? " will scan " : " will not scan ") // << "extent with range " << extents[idx].partition.cprange.lo_val // << "-" << extents[idx].partition.cprange.hi_val << endl; //#endif // // } // //// if (fOid>=3000) //// cout << " " << scanFlags[idx]; // } //// if (scanFlags.size()>0) //// cout << endl; // // // If there was more than 1 input DL, the first is a list of filters and the second is a list of rids, // // otherwise the first is the list of rids. // if (fInputJobStepAssociation.outSize() > 1) // ridListIdx = 1; // else // ridListIdx = 0; // // dl = fInputJobStepAssociation.outAt(ridListIdx); // ridList = dl->dataList(); // if ( ridList ) // { // fifo = dl->fifoDL(); // // if (fifo) // it = fifo->getIterator(); // else // it = ridList->getIterator(); // } // else // { // strRidList = dl->stringDataList(); // strFifo = dl->stringDL(); // // if (strFifo) // it = strFifo->getIterator(); // else // it = strRidList->getIterator(); // } // // if (ridList) // { // if (fifo) // { // more = fifo->next(it, &rw); // if (fOid>=3000 && dlTimes.FirstReadTime().tv_sec==0) { // dlTimes.setFirstReadTime(); // } // absoluteRID = rw.et[0].first; // } // else // { // more = ridList->next(it, &e); // if (fOid>=3000 && dlTimes.FirstReadTime().tv_sec==0) { // dlTimes.setFirstReadTime(); // } // absoluteRID = e.first; // rw.count = 1; // } // } // else // { // if (strFifo) // { // more = strFifo->next(it, &strRw); // if (fOid>=3000 && dlTimes.FirstReadTime().tv_sec==0) { // dlTimes.setFirstReadTime(); // } // absoluteRID = strRw.et[0].first; // } // else // { // more = strRidList->next(it, &strE); // if (fOid>=3000 && dlTimes.FirstReadTime().tv_sec==0) { // dlTimes.setFirstReadTime(); // } // absoluteRID = strE.first; // strRw.count = 1; // } // } // // if (more) // msgLBID = getLBID(absoluteRID, scan); // scanThisBlock = scan; // msgLargeBlock = absoluteRID >> blockSizeShift; // // while (more || msgRidCount > 0) { // uint64_t rwCount; // if ( ridList) // rwCount = rw.count; // else // rwCount = strRw.count; // // for (uint64_t i = 0; ((i < rwCount) || (!more && msgRidCount > 0)); ) // { // if ( ridList) // { // if (fifo) // absoluteRID = rw.et[i].first; // else // absoluteRID = e.first; // } // else // { // if (strFifo) // absoluteRID = strRw.et[i].first; // else // absoluteRID = strE.first; // } // // if (more) { // nextLBID = getLBID(absoluteRID, scan); // nextLargeBlock = absoluteRID >> blockSizeShift; // } // // //XXXPAT: need to prove N & S here // if (nextLBID == msgLBID && more) { //// blockRelativeRID = absoluteRID % ridsPerBlock; // blockRelativeRID = absoluteRID & rpbMask; // msgRidList << blockRelativeRID; // msgRidCount++; // ++i; // } // else { // //Bug 831: move building msg after the check of scanThisBlock // if (scanThisBlock==true) // { // hdr.ism.Interleave=0; // hdr.ism.Flags=planFlagsToPrimFlags(fTraceFlags); // hdr.ism.Command=COL_BY_SCAN; // hdr.ism.Size=sizeof(NewColRequestHeader) + fFilterString.length() + // msgRidList.length(); // hdr.ism.Type=2; // // hdr.hdr.SessionID = fSessionId; // //hdr.hdr.StatementID = 0; // hdr.hdr.TransactionID = fTxnId; // hdr.hdr.VerID = fVerId; // hdr.hdr.StepID = fStepId; // hdr.hdr.UniqueID = uniqueID; // // hdr.LBID = msgLBID; //// idbassert(hdr.LBID >= 0); // hdr.DataSize = fColType.colWidth; // hdr.DataType = fColType.colDataType; // hdr.CompType = fColType.compressionType; // hdr.OutputType = fOutputType; // hdr.BOP = fBOP; // hdr.NOPS = fFilterCount; // hdr.NVALS = msgRidCount; // hdr.sort = fToSort; // // primMsg.append((const uint8_t *) &hdr, sizeof(NewColRequestHeader)); // primMsg += fFilterString; // primMsg += msgRidList; // ridCount += msgRidCount; // ++sentBlockCount; // //#ifdef DEBUG // if (flushInterval == 0 && fOid >= 3000) // cout << "sending a prim msg for LBID " << msgLBID << endl; //#endif // ++msgCount; //// cout << "made a primitive\n"; // if (msgLargeBlock != nextLargeBlock || !more) { //// cout << "writing " << msgCount << " primitives\n"; // fMsgBytesOut += primMsg.lengthWithHdrOverhead(); // fDec->write(primMsg); // msgsSent += msgCount; // msgCount = 0; // primMsg.restart(); // msgLargeBlock = nextLargeBlock; // // // @bug 769 - Added "&& !fSwallowRows" condition below to fix problem with // // caltraceon(16) not working for tpch01 and some other queries. If a query // // ever held off requesting more blocks, it would lock and never finish. // //Bug 815 // if (( sentBlockCount >= fProjectBlockReqLimit) && !fSwallowRows && // (( msgsSent - msgsRecvd) > fProjectBlockReqThreshold)) // { // mutex.lock(); //pthread_mutex_lock(&mutex); // fStopSending = true; // // // @bug 836. Wake up the receiver if he's sleeping. // if (recvWaiting) // condvar.notify_one(); //pthread_cond_signal(&condvar); // flushed.wait(mutex); //pthread_cond_wait(&flushed, &mutex); // fStopSending = false; // mutex.unlock(); //pthread_mutex_unlock(&mutex); // sentBlockCount = 0; // } // } // } // else // { // msgsSkip++; // } // msgLBID = nextLBID; // msgRidList.restart(); // msgRidCount = 0; // // mutex.lock(); //pthread_mutex_lock(&mutex); // // if (scanThisBlock) { // if (recvWaiting) // condvar.notify_one(); //pthread_cond_signal(&condvar); // #ifdef DEBUG //// cout << "msgsSent++ = " << msgsSent << endl; // #endif // } // scanThisBlock = scan; // mutex.unlock(); //pthread_mutex_unlock(&mutex); // // // break the for loop // if (!more) // break; // } // } // for rw.count // // if (more) // { // if ( ridList ) // { // if (fifo) // { // rw.count = 0; // more = fifo->next(it, &rw); // } // else // { // rw.count = 1; // more = ridList->next(it, &e); // } // } // else // { // if (strFifo) // { // strRw.count = 0; // more = strFifo->next(it, &strRw); // } // else // { // strRw.count = 1; // more = strRidList->next(it, &strE); // } // } // } // } // // if (fOid>=3000) dlTimes.setLastReadTime(); // //done: // mutex.lock(); //pthread_mutex_lock(&mutex); // finishedSending = true; // if (recvWaiting) // condvar.notify_one(); //pthread_cond_signal(&condvar); // mutex.unlock(); //pthread_mutex_unlock(&mutex); // //#ifdef DEBUG // if (fOid >=3000) // cout << "pColStep msgSent " // << msgsSent << "/" << msgsSkip // << " rids " << ridCount // << " oid " << fOid << " " << msgLBID << endl; //#endif // //...Track the number of LBIDs we skip due to Casual Partioning. // fNumBlksSkipped += msgsSkip; } void pColStep::receivePrimitiveMessages() { // int64_t ridResults = 0; // AnyDataListSPtr dl = fOutputJobStepAssociation.outAt(0); // DataList_t* dlp = dl->dataList(); // uint64_t fbo; // FifoDataList *fifo = dl->fifoDL(); // UintRowGroup rw; // uint64_t ridBase; // boost::shared_ptr bs; // uint32_t i = 0, length; // // while (1) { // // sync with the send side // mutex.lock(); //pthread_mutex_lock(&mutex); // while (!finishedSending && msgsSent == msgsRecvd) { // recvWaiting = true; // #ifdef DEBUG // cout << "c sleeping" << endl; // #endif // // @bug 836. Wake up the sender if he's sleeping. // if (fStopSending) // flushed.notify_one(); //pthread_cond_signal(&flushed); // condvar.wait(mutex); //pthread_cond_wait(&condvar, &mutex); // #ifdef DEBUG // cout << "c waking" << endl; // #endif // recvWaiting = false; // } // if (msgsSent == msgsRecvd) { // mutex.unlock(); //pthread_mutex_unlock(&mutex); // break; // } // mutex.unlock(); //pthread_mutex_unlock(&mutex); // // // do the recv // fDec->read(uniqueID, bs); // fMsgBytesIn += bs->lengthWithHdrOverhead(); // // // no more messages, and buffered messages should be already processed by now. // if (bs->length() == 0) break; // // #ifdef DEBUG // cout << "msgsRecvd++ = " << msgsRecvd << ". RidResults = " << ridResults << endl; // cout << "Got a ColResultHeader!: " << bs.length() << " bytes" << endl; // #endif // // const ByteStream::byte* bsp = bs->buf(); // // // get the ISMPacketHeader out of the bytestream // //const ISMPacketHeader* ism = reinterpret_cast(bsp); // // // get the ColumnResultHeader out of the bytestream // const ColResultHeader* crh = reinterpret_cast // (&bsp[sizeof(ISMPacketHeader)]); // // bool firstRead = true; // length = bs->length(); // // i = 0; // uint32_t msgCount = 0; // while (i < length) { // ++msgCount; // // i += sizeof(ISMPacketHeader); // crh = reinterpret_cast(&bsp[i]); // // double check the sequence number is increased by one each time // i += sizeof(ColResultHeader); // // fCacheIO += crh->CacheIO; // fPhysicalIO += crh->PhysicalIO; // // // From this point on the rest of the bytestream is the data that comes back from the primitive server // // This needs to be fed to a datalist that is retrieved from the outputassociation object. // // fbo = getFBO(crh->LBID); // ridBase = fbo << rpbShift; // // #ifdef DEBUG //// cout << " NVALS = " << crh->NVALS << " fbo = " << fbo << " lbid = " << crh->LBID << endl; // #endif // // //Check output type // if ( fOutputType == OT_RID ) // { // ridResults += crh->NVALS; // } // // /* XXXPAT: This clause is executed when ExeMgr calls the new nextBand(BS) fcn. // // TODO: both classes have to agree // on which nextBand() variant will be called. pColStep // currently has to infer that from flushInterval and the // Table OID. It would be better to have a more explicit form // of agreement. // // The goal of the nextBand(BS) fcn is to avoid iterating over // every row except at unserialization. This clause copies // the raw results from the PrimProc response directly into // the memory used for the ElementType array. DeliveryStep // will also treat the ElementType array as raw memory and // serialize that. TableColumn now parses the packed data // instead of whole ElementTypes. // */ // else if (fOutputType == OT_TOKEN && fFlushInterval > 0 && !fIsDict) { // // if (fOid>=3000 && dlTimes.FirstInsertTime().tv_sec==0) // dlTimes.setFirstInsertTime(); // ridResults += crh->NVALS; // // /* memcpy the bytestream into the output set */ // uint32_t toCopy, bsPos = 0; // uint8_t *pos; // while (bsPos < crh->NVALS) { // toCopy = (crh->NVALS - bsPos > rw.ElementsPerGroup - rw.count ? // rw.ElementsPerGroup - rw.count : crh->NVALS - bsPos); // pos = ((uint8_t *) &rw.et[0]) + (rw.count * fColType.colWidth); // memcpy(pos, &bsp[i], toCopy * fColType.colWidth); // bsPos += toCopy; // i += toCopy * fColType.colWidth; // rw.count += toCopy; // if (rw.count == rw.ElementsPerGroup) { // if (!fSwallowRows) // fifo->insert(rw); // rw.count = 0; // } // } // } // else if ( fOutputType == OT_TOKEN) // { // uint64_t dv; // uint64_t rid; // // if (fOid>=3000 && dlTimes.FirstInsertTime().tv_sec==0) // dlTimes.setFirstInsertTime(); // ridResults += crh->NVALS; // for(int j = 0; j < crh->NVALS; ++j) // { // // XXXPAT: Only use this when the RID doesn't matter or when // // the response contains every row. // // rid = j + ridBase; // switch (fColType.colWidth) { // case 8: dv = *((const uint64_t *) &bsp[i]); i += 8; break; // case 4: dv = *((const uint32_t *) &bsp[i]); i += 4; break; // case 2: dv = *((const uint16_t *) &bsp[i]); i += 2; break; // case 1: dv = *((const uint8_t *) &bsp[i]); ++i; break; // default: // throw runtime_error("pColStep: invalid column width!"); // } // // // @bug 663 - Don't output any rows if fSwallowRows (caltraceon(16)) is on. // // This options swallows rows in the project steps. // if (!fSwallowRows) // { // if (fifo) // { // rw.et[rw.count].first = rid; // rw.et[rw.count++].second = dv; // if (rw.count == rw.ElementsPerGroup) // { // fifo->insert(rw); // rw.count = 0; // } // } // else // { // dlp->insert(ElementType(rid, dv)); // } // #ifdef DEBUG // //cout << " -- inserting <" << rid << ", " << dv << "> " << *prid << endl; // #endif // } // } // } // else if ( fOutputType == OT_BOTH ) // { // ridResults += crh->NVALS; // for(int j = 0; j < crh->NVALS; ++j) // { // uint64_t dv; // uint64_t rid; // // rid = *((const uint16_t *) &bsp[i]) + ridBase; // i += sizeof(uint16_t); // switch (fColType.colWidth) { // case 8: dv = *((const uint64_t *) &bsp[i]); i += 8; break; // case 4: dv = *((const uint32_t *) &bsp[i]); i += 4; break; // case 2: dv = *((const uint16_t *) &bsp[i]); i += 2; break; // case 1: dv = *((const uint8_t *) &bsp[i]); ++i; break; // default: // throw runtime_error("pColStep: invalid column width!"); // } // // // @bug 663 - Don't output any rows if fSwallowRows (caltraceon(16)) is on. // // This options swallows rows in the project steps. // if (!fSwallowRows) { // if (fOid>=3000 && dlTimes.FirstInsertTime().tv_sec==0) // dlTimes.setFirstInsertTime(); // if(fifo) // { //// rw.et[rw.count++] = ElementType(rid, dv); // rw.et[rw.count].first = rid; // rw.et[rw.count++].second = dv; // if (rw.count == rw.ElementsPerGroup) // { // fifo->insert(rw); // rw.count = 0; // } // } // else // { // dlp->insert(ElementType(rid, dv)); // } // #ifdef DEBUG // //cout << " -- inserting <" << rid << ", " << dv << "> " << *prid << endl; // #endif // } // } // } // } // unpacking the BS // // //Bug 815: Check whether we have enough to process // //++lockCount; // mutex.lock(); //pthread_mutex_lock(&mutex); // if ( fStopSending && ((msgsSent - msgsRecvd ) <= fProjectBlockReqThreshold) ) // { // flushed.notify_one(); //pthread_cond_signal(&flushed); // } // mutex.unlock(); //pthread_mutex_unlock(&mutex); // // firstRead = false; // msgsRecvd += msgCount; // } // read loop // // done reading // // if (fifo && rw.count > 0) // fifo->insert(rw); // // //...Casual partitioning could cause us to do no processing. In that // //...case these time stamps did not get set. So we set them here. // if (fOid>=3000 && dlTimes.FirstReadTime().tv_sec==0) { // dlTimes.setFirstReadTime(); // dlTimes.setLastReadTime(); // dlTimes.setFirstInsertTime(); // } // if (fOid>=3000) dlTimes.setEndOfInputTime(); // // //@bug 699: Reset StepMsgQueue // fDec->removeQueue(uniqueID); // // if (fifo) // fifo->endOfInput(); // else // dlp->endOfInput(); // // if (fTableOid >= 3000) // { // //...Construct timestamp using ctime_r() instead of ctime() not // //...necessarily due to re-entrancy, but because we want to strip // //...the newline ('\n') off the end of the formatted string. // time_t t = time(0); // char timeString[50]; // ctime_r(&t, timeString); // timeString[strlen(timeString)-1 ] = '\0'; // // FifoDataList* pFifo = 0; // uint64_t totalBlockedReadCount = 0; // uint64_t totalBlockedWriteCount = 0; // // //...Sum up the blocked FIFO reads for all input associations // size_t inDlCnt = fInputJobStepAssociation.outSize(); // for (size_t iDataList=0; iDataListfifoDL(); // if (pFifo) // { // totalBlockedReadCount += pFifo->blockedReadCount(); // } // } // // //...Sum up the blocked FIFO writes for all output associations // size_t outDlCnt = fOutputJobStepAssociation.outSize(); // for (size_t iDataList=0; iDataListfifoDL(); // if (pFifo) // { // totalBlockedWriteCount += pFifo->blockedWriteCount(); // } // } // // //...Roundoff msg byte counts to nearest KB for display // uint64_t msgBytesInKB = fMsgBytesIn >> 10; // uint64_t msgBytesOutKB = fMsgBytesOut >> 10; // if (fMsgBytesIn & 512) // msgBytesInKB++; // if (fMsgBytesOut & 512) // msgBytesOutKB++; // // // @bug 828 // if (fifo) // fifo->totalSize(ridResults); // // if (traceOn()) // { // //...Print job step completion information // ostringstream logStr; // logStr << "ses:" << fSessionId << // " st: " << fStepId << " finished at " << // timeString << "; PhyI/O-" << fPhysicalIO << "; CacheI/O-" << // fCacheIO << "; MsgsRvcd-" << msgsRecvd << // "; BlockedFifoIn/Out-" << totalBlockedReadCount << // "/" << totalBlockedWriteCount << // "; output size-" << ridResults << endl << // "\tPartitionBlocksEliminated-" << fNumBlksSkipped << // "; MsgBytesIn-" << msgBytesInKB << "KB" << // "; MsgBytesOut-" << msgBytesOutKB << "KB" << endl << // "\t1st read " << dlTimes.FirstReadTimeString() << // "; EOI " << dlTimes.EndOfInputTimeString() << "; runtime-" << // JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(),dlTimes.FirstReadTime()) << // "s" << endl; // // logEnd(logStr.str().c_str()); // // syslogReadBlockCounts(16, // exemgr sybsystem // fPhysicalIO, // # blocks read from disk // fCacheIO, // # blocks read from cache // fNumBlksSkipped); // # casual partition block hits // syslogProcessingTimes(16, // exemgr subsystem // dlTimes.FirstReadTime(), // first datalist read // dlTimes.LastReadTime(), // last datalist read // dlTimes.FirstInsertTime(), // first datalist write // dlTimes.EndOfInputTime()); // last (endOfInput) datalist write // syslogEndStep(16, // exemgr subsystem // totalBlockedReadCount, // blocked datalist input // totalBlockedWriteCount, // blocked datalist output // fMsgBytesIn, // incoming msg byte count // fMsgBytesOut); // outgoing msg byte count // } // } } const string pColStep::toString() const { ostringstream oss; oss << "pColStep ses:" << fSessionId << " txn:" << fTxnId << " ver:" << fVerId << " st:" << fStepId << " tb/col:" << fTableOid << "/" << fOid; if (alias().length()) oss << " alias:" << alias(); if (view().length()) oss << " view:" << view(); if (fOutputJobStepAssociation.outSize() > 0) oss << " " << omitOidInDL << fOutputJobStepAssociation.outAt(0) << showOidInDL; else oss << " (no output yet)"; oss << " nf:" << fFilterCount; oss << " in:"; for (unsigned i = 0; i < fInputJobStepAssociation.outSize(); i++) { oss << fInputJobStepAssociation.outAt(i) << ", "; } if (fSwallowRows) oss << " (sink)"; return oss.str(); } void pColStep::addFilters() { AnyDataListSPtr dl = fInputJobStepAssociation.outAt(0); DataList_t* bdl = dl->dataList(); FifoDataList* fifo = fInputJobStepAssociation.outAt(0)->fifoDL(); idbassert(bdl); int it = -1; bool more; ElementType e; int64_t token; if (fifo != NULL) { try { it = fifo->getIterator(); } catch (exception& ex) { cerr << "pColStep::addFilters: caught exception: " << ex.what() << " stepno: " << fStepId << endl; } catch (...) { cerr << "pColStep::addFilters: caught exception" << endl; } fBOP = BOP_OR; UintRowGroup rw; more = fifo->next(it, &rw); while (more) { for (uint64_t i = 0; i < rw.count; ++i) addFilter(COMPARE_EQ, (int64_t) rw.et[i].second); more = fifo->next(it, &rw); } } else { try { it = bdl->getIterator(); } catch (exception& ex) { cerr << "pColStep::addFilters: caught exception: " << ex.what() << " stepno: " << fStepId << endl; } catch (...) { cerr << "pColStep::addFilters: caught exception" << endl; } fBOP = BOP_OR; more = bdl->next(it, &e); while (more) { token = e.second; addFilter(COMPARE_EQ, token); more = bdl->next(it, &e); } } return; } /* This exists to avoid a DBRM lookup for every rid. */ inline uint64_t pColStep::getLBID(uint64_t rid, bool& scan) { uint32_t extentIndex, extentOffset; uint64_t fbo; fbo = rid >> rpbShift; extentIndex = fbo >> divShift; extentOffset = fbo & modMask; scan = (scanFlags[extentIndex] != 0); return extents[extentIndex].range.start + extentOffset; } inline uint64_t pColStep::getFBO(uint64_t lbid) { uint32_t i; uint64_t lastLBID; for (i = 0; i < numExtents; i++) { lastLBID = extents[i].range.start + (extents[i].range.size << 10) - 1; if (lbid >= (uint64_t) extents[i].range.start && lbid <= lastLBID) return (lbid - extents[i].range.start) + (i << divShift); } cerr << "pColStep: didn't find the FBO?\n"; throw logic_error("pColStep: didn't find the FBO?"); } void pColStep::appendFilter(const messageqcpp::ByteStream& filter, unsigned count) { fFilterString += filter; fFilterCount += count; } void pColStep::addFilter(const Filter* f) { if (NULL != f) fFilters.push_back(f); } void pColStep::appendFilter(const std::vector& fs) { fFilters.insert(fFilters.end(), fs.begin(), fs.end()); } } //namespace // vim:ts=4 sw=4: