/* Copyright (C) 2014 InfiniDB, Inc.
Copyright (C) 2016 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
// $Id: largehashjoin.cpp 9655 2013-06-25 23:08:13Z xlou $
#include <string>
#include <sstream>
#include <fstream>
#include <list>
#include <vector>
#include <typeinfo>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <ctime>
#include <sys/time.h>
#include <iomanip>
using namespace std;
#include <boost/thread/mutex.hpp>
#include "calpontsystemcatalog.h"
using namespace execplan;
#include "jobstep.h"
#include "largehashjoin.h"
#include "elementtype.h"
using namespace joblist;
#include "mcsconfig.h"
boost::mutex fileLock_g;
namespace
{
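// Append per-bucket disk I/O statistics (bytes, timestamps, duration, throughput)
// for the given datalist to trace/umdiskio.log. Only bucket- and zone-based
// datalists are logged; any other datalist type is silently skipped.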
void logDiskIoInfo(uint64_t stepId, const AnyDataListSPtr& spdl)
{
boost::mutex::scoped_lock lk(fileLock_g);
string umDiskioLog = string(MCSLOGDIR) + "/trace/umdiskio.log";
string umDiskioBak = string(MCSLOGDIR) + "/trace/umdiskio.bak";
ofstream umDiskIoFile(umDiskioLog.c_str(), ios_base::app);
CalpontSystemCatalog::OID oid;
uint64_t maxBuckets = 0;
list<DiskIoInfo>* infoList = NULL;
string bkt("bkt");
BucketDL<ElementType>* bdl = spdl->bucketDL();
BucketDL<StringElementType>* sbdl = spdl->stringBucketDL();
ZDL<ElementType>* zdl = spdl->zonedDL();
ZDL<StringElementType>* szdl = spdl->stringZonedDL();
if (bdl != NULL)
{
maxBuckets = bdl->bucketCount();
oid = bdl->OID();
}
else if (zdl != NULL)
{
maxBuckets = zdl->bucketCount();
oid = zdl->OID();
bkt = "zdl";
}
else if (sbdl != NULL)
{
maxBuckets = sbdl->bucketCount();
oid = sbdl->OID();
}
else if (szdl != NULL)
{
maxBuckets = szdl->bucketCount();
oid = szdl->OID();
bkt = "zdl";
}
else
{
// not logged for now.
return;
}
for (uint64_t i = 0; i < maxBuckets; i++)
{
if (bdl)
infoList = &(bdl->diskIoInfoList(i));
else if (zdl)
infoList = &(zdl->diskIoInfoList(i));
else if (sbdl)
infoList = &(sbdl->diskIoInfoList(i));
else if (szdl)
infoList = &(szdl->diskIoInfoList(i));
for (list<DiskIoInfo>::iterator j = infoList->begin(); j != infoList->end(); j++)
{
boost::posix_time::time_duration td = j->fEnd - j->fStart;
int64_t usecs = td.total_microseconds();
umDiskIoFile << setfill('0') << "st" << setw(2) << stepId << "oid" << oid << bkt << setw(3) << i
<< (j->fWrite ? " writes " : " reads ") << setw(7) << setfill(' ') << j->fBytes
<< " bytes, at " << j->fStart << " duration " << usecs << " mcs @ "
<< ((usecs > 0) ? (j->fBytes / usecs) : 0) << "MB/s" << endl; // guard against zero-duration I/O
}
}
streampos curPos = umDiskIoFile.tellp();
umDiskIoFile.close();
// Rotate: move the current log to .bak once it grows past 0.5 GB, capping the total log at 1 GB.
if (curPos > 0x20000000)
{
string cmd = "/bin/mv " + umDiskioLog + " " + umDiskioBak;
(void)system(cmd.c_str());
}
}
} // namespace
namespace joblist
{
const uint64_t ZDL_VEC_SIZE = 4096;
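// Batch size for inserts into zoned datalists: elements are buffered into a
// vector and inserted ZDL_VEC_SIZE at a time rather than one element per call.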
//@bug 686. Run the hash join in a separate thread and return to the caller
// immediately, so the other job steps can start.
struct HJRunner
{
HJRunner(LargeHashJoin* p) : joiner(p)
{
}
LargeHashJoin* joiner;
void operator()()
{
try
{
joiner->doHashJoin();
}
catch (std::exception& e)
{
ostringstream errMsg;
errMsg << "HJRunner caught: " << e.what();
joiner->errorLogging(errMsg.str());
joiner->unblockDatalists(logging::largeHashJoinErr);
}
catch (...)
{
string msg("HJRunner caught something not an exception!");
joiner->errorLogging(msg);
joiner->unblockDatalists(logging::largeHashJoinErr);
}
}
};
struct StringHJRunner
{
StringHJRunner(StringHashJoinStep* p) : joiner(p)
{
}
StringHashJoinStep* joiner;
void operator()()
{
try
{
joiner->doStringHashJoin();
}
catch (std::exception& e)
{
ostringstream errMsg;
errMsg << "StringHJRunner caught: " << e.what();
joiner->errorLogging(errMsg.str());
joiner->unblockDatalists(logging::stringHashJoinStepErr);
}
catch (...)
{
string msg("StringHJRunner caught something not an exception!");
joiner->errorLogging(msg);
joiner->unblockDatalists(logging::stringHashJoinStepErr);
}
}
};
// Thread function used by HashJoin. Each thread processes a contiguous range of
// buckets: for every bucket it hashes the smaller of the two input sets, probes
// that hash table with the larger set, and writes matches (plus unmatched rows
// for outer joins, via the sendAllHashSet/sendAllSearchSet flags) to the result
// datalists.
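// A minimal sketch of the build/probe pattern used below (illustrative only;
// the real table type is HashJoin<e_t>::hash_t, and emit() stands in for the
// datalist inserts):
//
// std::multimap<uint64_t, uint64_t> ht; // join value -> RID
// for (const auto& e : smallerSide) // build
// ht.insert({e.second, e.first});
// for (const auto& e : largerSide) // probe
// {
// auto range = ht.equal_range(e.second);
// for (auto it = range.first; it != range.second; ++it)
// emit(it->second, e.second);
// }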
template <typename e_t>
void* HashJoinByBucket_thr(void* arg)
{
typename HashJoin<e_t>::thrParams_t* params = (typename HashJoin<e_t>::thrParams_t*)arg;
HashJoin<e_t>* hjPtr = params->hjptr;
const uint32_t thrIdx = params->thrIdx;
long set1Size = 0;
long set2Size = 0;
bool sendAllHashSet = false;
bool sendAllSearchSet = false;
try
{
for (uint32_t idx = 0, bucketIdx = params->startBucket; idx < params->numBuckets; idx++, bucketIdx++)
{
#ifdef DEBUG
cout << "\tJoinByBucket() thr " << dec << thrIdx << " bkt " << bucketIdx << "/"
<< hjPtr->Set1()->bucketCount() << "/" << params->numBuckets << endl;
#endif
JoinType joinType = hjPtr->getJoinType();
set1Size = hjPtr->Set1()->size(bucketIdx);
set2Size = hjPtr->Set2()->size(bucketIdx);
if (set1Size <= 0 && set2Size <= 0)
{
continue;
}
else
{
if (set1Size > set2Size)
{
hjPtr->setSearchSet(hjPtr->Set1()->getBDL(), thrIdx);
hjPtr->setHashSet(hjPtr->Set2()->getBDL(), thrIdx);
hjPtr->setSearchResult(hjPtr->Result1(), thrIdx);
hjPtr->setHashResult(hjPtr->Result2(), thrIdx);
sendAllHashSet = (joinType == RIGHTOUTER);
sendAllSearchSet = (joinType == LEFTOUTER);
}
else
{
hjPtr->setHashSet(hjPtr->Set1()->getBDL(), thrIdx);
hjPtr->setSearchSet(hjPtr->Set2()->getBDL(), thrIdx);
hjPtr->setHashResult(hjPtr->Result1(), thrIdx);
hjPtr->setSearchResult(hjPtr->Result2(), thrIdx);
sendAllHashSet = (joinType == LEFTOUTER);
sendAllSearchSet = (joinType == RIGHTOUTER);
} // if set1Size > set2Size ...
} // if set1Size <=0 . . .
params->timeset.setTimer(createHashStr);
hjPtr->createHash(hjPtr->HashSet(thrIdx), hjPtr->HashTable(thrIdx), bucketIdx, sendAllHashSet,
hjPtr->HashResult(thrIdx), params->dlTimes, params->die);
params->timeset.holdTimer(createHashStr);
#ifdef DEBUG
long hashSetTotal = 0;
long searchSetTotal = 0;
for (uint32_t j = 0; j < hjPtr->HashSet(thrIdx)->bucketCount(); j++)
hashSetTotal += hjPtr->HashSet(thrIdx)->size(j); // hash sets are BucketDLs
for (uint32_t j = 0; j < hjPtr->SearchSet(thrIdx)->bucketCount(); j++)
searchSetTotal += hjPtr->SearchSet(thrIdx)->size(j); // can be any datalist
cout << "\t\tJoinByBucket() thr " << dec << thrIdx << " bkt " << bucketIdx << " hashSize "
<< hashSetTotal << " searchSize " << searchSetTotal << endl;
#endif
bool more;
e_t e;
e_t e2;
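// InvalidRID marks a hash-table entry as already matched: when the same join
// value shows up again on the search side, the hash-side rows are not
// re-emitted into the output datalist.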
const uint64_t InvalidRID = static_cast<uint64_t>(-1);
int iter = hjPtr->SearchSet(thrIdx)->getIterator(bucketIdx);
ZonedDL* zdl1 = dynamic_cast<ZonedDL*>(hjPtr->SearchResult(thrIdx));
ZonedDL* zdl2 = dynamic_cast<ZonedDL*>(hjPtr->HashResult(thrIdx));
vector<e_t> vec1;
vector<e_t> vec2;
std::pair<typename HashJoin<e_t>::hashIter_t, typename HashJoin<e_t>::hashIter_t> hashItPair;
typename HashJoin<e_t>::hashIter_t hashIt;
typename HashJoin<e_t>::hash_t* ht = hjPtr->HashTable(thrIdx);
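// Probe phase: scan the (larger) search set and look up each element's join
// value in the hash table built from the smaller set.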
params->timeset.setTimer(hashJoinStr);
for (more = hjPtr->SearchSet(thrIdx)->next(bucketIdx, iter, &e); more && !(*params->die);
more = hjPtr->SearchSet(thrIdx)->next(bucketIdx, iter, &e))
{
// If sendAllSearchSet=true, keep all of the search set. This handles an outer
// join such as
// col1 = col2 (+)
// where col1 is the SearchSet and col2 is the HashSet: we include all of col1
// regardless of whether there is a matching col2.
if (sendAllSearchSet)
{
if (zdl1)
{
vec1.push_back(e);
if (vec1.size() >= ZDL_VEC_SIZE)
{
params->timeset.setTimer(insertResultsStr);
hjPtr->SearchResult(thrIdx)->insert(vec1);
vec1.clear();
params->timeset.holdTimer(insertResultsStr);
}
}
else
hjPtr->SearchResult(thrIdx)->insert(e);
}
hashIt = ht->find(e.second);
if (hashIt != ht->end())
{
#ifdef DEBUG
if (hjPtr->SearchResult(thrIdx)->OID() >= 3000)
cout << "JoinByBucket() SearchResult add " << bucketIdx << " [" << e.first << "][" << e.second
<< "]" << endl;
uint32_t a = 0;
e_t b = e_t();
#endif
// If sendAllSearchSet=false, we already added the search result
// before the if condition above.
if (!sendAllSearchSet)
{
if (zdl1)
{
vec1.push_back(e);
if (vec1.size() >= ZDL_VEC_SIZE)
{
params->timeset.setTimer(insertResultsStr);
hjPtr->SearchResult(thrIdx)->insert(vec1);
vec1.clear();
params->timeset.holdTimer(insertResultsStr);
}
}
else
hjPtr->SearchResult(thrIdx)->insert(e);
}
// If sendAllHashSet=false, add the hash results to the output datalist.
// If it is a left outer join, then we already added all of the right side rows
// in the bucket via the createHash call earlier in this function.
if (!sendAllHashSet)
{
// If the matching pair has its RID set to invalid, it has already been encountered,
// so there is no reason to add it to the output datalist or keep searching for more matches.
if (hashIt->second != InvalidRID)
{
hashItPair = ht->equal_range(e.second);
for (hashIt = hashItPair.first; hashIt != hashItPair.second; hashIt++)
{
e2.first = hashIt->second;
e2.second = e.second;
if (zdl2)
{
vec2.push_back(e2);
if (vec2.size() >= ZDL_VEC_SIZE)
{
params->timeset.setTimer(insertResultsStr);
hjPtr->HashResult(thrIdx)->insert(vec2);
vec2.clear();
params->timeset.holdTimer(insertResultsStr);
}
}
else
hjPtr->HashResult(thrIdx)->insert(e2);
#ifdef DEBUG
a++;
b = e2;
#endif
// Set the RID to the invalid RID now that it's been matched and added to the
// output datalist. This keeps us from duplicating it if it is matched again.
hashIt->second = InvalidRID;
}
#ifdef DEBUG
cout << "\t\tadded " << b << " " << a << " times" << endl << endl;
#endif
}
}
} // if hashIt != ht->end()
} // for ( hjPtr...
params->timeset.holdTimer(hashJoinStr);
params->timeset.setTimer(insertLastResultsStr);
if (vec1.size() != 0)
{
hjPtr->SearchResult(thrIdx)->insert(vec1);
vec1.clear();
}
if (vec2.size() != 0)
{
hjPtr->HashResult(thrIdx)->insert(vec2);
vec2.clear();
}
params->timeset.holdTimer(insertLastResultsStr);
ht->clear();
} // for (bucketIdx...
} // try
// We don't have to call JSA.endOfInput() for this exception, because
// the parent thread takes care of that in performThreadedJoin().
catch (const logging::LargeDataListExcept& ex)
{
ostringstream errMsg;
if (typeid(e_t) == typeid(StringElementType))
{
errMsg << "HashJoinByBucket_thr<String>: caught LDL error: " << ex.what();
hjPtr->status(logging::stringHashJoinStepLargeDataListFileErr);
}
else
{
errMsg << "HashJoinByBucket_thr: caught LDL error: " << ex.what();
hjPtr->status(logging::largeHashJoinLargeDataListFileErr);
}
cerr << errMsg.str() << endl;
catchHandler(errMsg.str(), hjPtr->sessionId());
}
catch (const exception& ex)
{
ostringstream errMsg;
if (typeid(e_t) == typeid(StringElementType))
{
errMsg << "HashJoinByBucket_thr<String>: caught: " << ex.what();
hjPtr->status(logging::stringHashJoinStepErr);
}
else
{
errMsg << "HashJoinByBucket_thr: caught: " << ex.what();
hjPtr->status(logging::largeHashJoinErr);
}
cerr << errMsg.str() << endl;
catchHandler(errMsg.str(), hjPtr->sessionId());
}
catch (...)
{
ostringstream errMsg;
if (typeid(e_t) == typeid(StringElementType))
{
errMsg << "HashJoinByBucket_thr<String>: caught unknown exception: ";
hjPtr->status(logging::stringHashJoinStepErr);
}
else
{
errMsg << "HashJoinByBucket_thr: caught unknown exception";
hjPtr->status(logging::largeHashJoinErr);
}
cerr << errMsg.str() << endl;
catchHandler(errMsg.str(), hjPtr->sessionId());
}
return NULL;
} // HashJoinByBucket_thr
LargeHashJoin::LargeHashJoin(JoinType joinType, uint32_t sessionId, uint32_t txnId, uint32_t statementId,
ResourceManager* rm)
: fSessionId(sessionId)
, fTxnId(txnId)
, fStepId(0)
, fStatementId(statementId)
, fTableOID1(0)
, fTableOID2(0)
, fJoinType(joinType)
, fRm(rm)
, fAlias1()
, fAlias2()
{
}
LargeHashJoin::~LargeHashJoin()
{
if (traceOn())
{
for (uint64_t i = 0; i < fInputJobStepAssociation.outSize(); i++)
logDiskIoInfo(fStepId, fInputJobStepAssociation.outAt(i));
for (uint64_t i = 0; i < fOutputJobStepAssociation.outSize(); i++)
logDiskIoInfo(fStepId, fOutputJobStepAssociation.outAt(i));
}
}
void LargeHashJoin::join()
{
runner->join();
}
void LargeHashJoin::run()
{
if (traceOn())
{
syslogStartStep(16, // exemgr subsystem
std::string("LargeHashJoin")); // step name
}
runner.reset(new boost::thread(HJRunner(this)));
}
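// Mark both output datalists with the given error status and signal
// end-of-input so any consumers blocked on them can proceed.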
void LargeHashJoin::unblockDatalists(uint16_t status)
{
fOutputJobStepAssociation.status(status);
fOutputJobStepAssociation.outAt(0)->dataList()->endOfInput();
fOutputJobStepAssociation.outAt(1)->dataList()->endOfInput();
}
void LargeHashJoin::errorLogging(const string& msg) const
{
ostringstream errMsg;
errMsg << "Step " << stepId() << "; " << msg;
cerr << errMsg.str() << endl;
catchHandler(errMsg.str(), sessionId());
}
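// Main body of the join. Both inputs must already be BucketDataLists that were
// hash-partitioned on the join key; they are wrapped in BDLWrappers and handed
// to HashJoin<ElementType>::performJoin, which joins the matching buckets in
// parallel. Timing stats are collected afterwards for trace logging.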
void LargeHashJoin::doHashJoin()
{
string val;
idbassert(fInputJobStepAssociation.outSize() >= 2);
idbassert(fOutputJobStepAssociation.outSize() >= 2);
BucketDataList* Ap = 0;
BucketDataList* Bp = 0;
BucketDataList* tAp = 0;
BucketDataList* tBp = 0;
DataList_t* inDL1 = 0;
DataList_t* inDL2 = 0;
inDL1 = fInputJobStepAssociation.outAt(0)->dataList();
inDL2 = fInputJobStepAssociation.outAt(1)->dataList();
idbassert(inDL1);
idbassert(inDL2);
HashJoin<ElementType>* hj = 0;
double createWorkTime = 0;
double hashWorkTime = 0;
double insertWorkTime = 0;
DataList_t* resultA = fOutputJobStepAssociation.outAt(0)->dataList();
DataList_t* resultB = fOutputJobStepAssociation.outAt(1)->dataList();
if (0 < fInputJobStepAssociation.status())
{
unblockDatalists(fInputJobStepAssociation.status());
}
else
{
string currentAction("preparing join");
try
{
// If we're given BucketDL's, use them
if (typeid(*inDL1) == typeid(BucketDataList))
{
if (typeid(*inDL2) != typeid(BucketDataList))
{
throw logic_error("LargeHashJoin::run: expected either 0 or 2 BucketDL's!");
}
Ap = dynamic_cast<BucketDataList*>(inDL1);
Bp = dynamic_cast<BucketDataList*>(inDL2);
}
else
{
throw logic_error("HashJoin will take only BucketDLs as inputs");
// Unreachable while the throw above stands; kept from the pre-BucketDL path.
int maxBuckets = fRm->getHjMaxBuckets();
joblist::ridtype_t maxElems = fRm->getHjMaxElems();
tAp = new BucketDataList(maxBuckets, 1, maxElems, fRm);
tBp = new BucketDataList(maxBuckets, 1, maxElems, fRm);
tAp->setHashMode(1);
tBp->setHashMode(1);
ElementType element;
int id;
id = inDL1->getIterator();
while (inDL1->next(id, &element))
{
tAp->insert(element);
}
tAp->endOfInput();
id = inDL2->getIterator();
while (inDL2->next(id, &element))
{
tBp->insert(element);
}
tBp->endOfInput();
Ap = tAp;
Bp = tBp;
}
unsigned numThreads = fRm->getHjNumThreads();
BDLWrapper<ElementType> setA(Ap);
BDLWrapper<ElementType> setB(Bp);
hj = new HashJoin<ElementType>(setA, setB, resultA, resultB, fJoinType, &dlTimes,
fOutputJobStepAssociation.statusPtr(), sessionId(), &die);
if (fTableOID2 >= 3000)
{
ostringstream logStr2;
logStr2 << "LargeHashJoin::run: ses:" << fSessionId << " st:" << fStepId
<< " input sizes: " << setA.size() << "/" << setB.size() << endl;
cout << logStr2.str();
}
currentAction = "performing join";
if (fTableOID2 >= 3000)
{
dlTimes.setFirstReadTime();
dlTimes.setEndOfInputTime(dlTimes.FirstReadTime());
}
hj->performJoin(numThreads);
} // try
catch (const logging::LargeDataListExcept& ex)
{
ostringstream errMsg;
errMsg << __FILE__ << " doHashJoin: " << currentAction << ", caught LDL error: " << ex.what();
errorLogging(errMsg.str());
unblockDatalists(logging::largeHashJoinLargeDataListFileErr);
}
catch (const exception& ex)
{
ostringstream errMsg;
errMsg << __FILE__ << " doHashJoin: " << currentAction << ", caught: " << ex.what();
errorLogging(errMsg.str());
unblockDatalists(logging::largeHashJoinErr);
}
catch (...)
{
ostringstream errMsg;
errMsg << __FILE__ << " doHashJoin: " << currentAction << ", caught unknown exception";
errorLogging(errMsg.str());
unblockDatalists(logging::largeHashJoinErr);
}
if (hj)
{
//..hashWorkTime is the time to perform the hash join, excluding the
// output insertion time. insertWorkTime is the sum of both insert times.
// The end result is that createWorkTime + hashWorkTime + insertWorkTime
// roughly equates to the total work time.
createWorkTime = hj->getTimeSet()->totalTime(createHashStr);
hashWorkTime = hj->getTimeSet()->totalTime(hashJoinStr) - hj->getTimeSet()->totalTime(insertResultsStr);
insertWorkTime =
hj->getTimeSet()->totalTime(insertResultsStr) + hj->getTimeSet()->totalTime(insertLastResultsStr);
}
} // (fInputJobStepAssociation.status() == 0)
if (fTableOID2 >= 3000 && traceOn())
{
time_t finTime = time(0);
char finTimeString[50];
ctime_r(&finTime, finTimeString);
finTimeString[strlen(finTimeString) - 1] = '\0';
ostringstream logStr;
logStr << "ses:" << fSessionId << " st: " << fStepId << " finished at " << finTimeString << "; 1st read "
<< dlTimes.FirstReadTimeString() << "; EOI " << dlTimes.EndOfInputTimeString() << endl
<< "\tLargeHashJoin::run: output sizes: " << resultA->totalSize() << "/" << resultB->totalSize()
<< " run time: " << JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(), dlTimes.FirstReadTime())
<< fixed << setprecision(2) << "s\n\tTotal work times: create hash: " << createWorkTime
<< "s, hash join: " << hashWorkTime << "s, insert results: " << insertWorkTime << "s\n"
<< "\tJob completion status " << fInputJobStepAssociation.status() << endl;
logEnd(logStr.str().c_str());
syslogProcessingTimes(16, // exemgr subsystem
dlTimes.FirstReadTime(), // use join start time for first read time
dlTimes.EndOfInputTime(), // use join end time for last read time
dlTimes.FirstReadTime(), // use join start time for first write time
dlTimes.EndOfInputTime()); // use join end time for last write time
syslogEndStep(16, // exemgr subsystem
0, // no blocked datalist input to report
0); // no blocked datalist output to report
}
delete hj;
delete tAp;
delete tBp;
}
const string LargeHashJoin::toString() const
{
ostringstream oss;
CalpontSystemCatalog::OID oid1 = 0;
CalpontSystemCatalog::OID oid2 = 0;
DataList_t* dl1;
DataList_t* dl2;
size_t idlsz;
idlsz = fInputJobStepAssociation.outSize();
idbassert(idlsz == 2);
dl1 = fInputJobStepAssociation.outAt(0)->dataList();
if (dl1)
oid1 = dl1->OID();
dl2 = fInputJobStepAssociation.outAt(1)->dataList();
if (dl2)
oid2 = dl2->OID();
oss << "LargeHashJoin ses:" << fSessionId << " st:" << fStepId;
oss << omitOidInDL;
oss << " in tb/col1:" << fTableOID1 << "/" << oid1;
oss << " " << fInputJobStepAssociation.outAt(0);
oss << " in tb/col2:" << fTableOID2 << "/" << oid2;
oss << " " << fInputJobStepAssociation.outAt(1);
idlsz = fOutputJobStepAssociation.outSize();
idbassert(idlsz == 2);
dl1 = fOutputJobStepAssociation.outAt(0)->dataList();
if (dl1)
oid1 = dl1->OID();
dl2 = fOutputJobStepAssociation.outAt(1)->dataList();
if (dl2)
oid2 = dl2->OID();
oss << endl << " ";
oss << " out tb/col1:" << fTableOID1 << "/" << oid1;
oss << " " << fOutputJobStepAssociation.outAt(0);
oss << " out tb/col2:" << fTableOID2 << "/" << oid2;
oss << " " << fOutputJobStepAssociation.outAt(1) << endl;
return oss.str();
}
StringHashJoinStep::StringHashJoinStep(JoinType joinType, uint32_t sessionId, uint32_t txnId,
uint32_t statementId, ResourceManager* rm)
: LargeHashJoin(joinType, sessionId, txnId, statementId, rm)
{
}
StringHashJoinStep::~StringHashJoinStep()
{
}
void StringHashJoinStep::run()
{
if (traceOn())
{
syslogStartStep(16, // exemgr subsystem
std::string("StringHashJoinStep")); // step name
}
runner.reset(new boost::thread(StringHJRunner(this)));
}
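// String variant of doHashJoin. The join runs on StringElementTypes into two
// intermediate StringZonedDLs; afterwards each surviving row is converted to
// an ElementType (keeping only the RID), and rows carrying the "_CpNuLl_"
// null marker are dropped before insertion into the real output datalists.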
void StringHashJoinStep::doStringHashJoin()
{
string val;
idbassert(fInputJobStepAssociation.outSize() >= 2);
idbassert(fOutputJobStepAssociation.outSize() >= 2);
DataList<StringElementType>* inDL1 = fInputJobStepAssociation.outAt(0)->stringDataList();
DataList<StringElementType>* inDL2 = fInputJobStepAssociation.outAt(1)->stringDataList();
idbassert(inDL1);
idbassert(inDL2);
BucketDL<StringElementType>* Ap = 0;
BucketDL<StringElementType>* Bp = 0;
BucketDL<StringElementType>* tAp = 0;
BucketDL<StringElementType>* tBp = 0;
HashJoin<StringElementType>* hj = 0;
double createWorkTime = 0;
double hashWorkTime = 0;
double insertWorkTime = 0;
DataList_t* resultA = fOutputJobStepAssociation.outAt(0)->dataList();
DataList_t* resultB = fOutputJobStepAssociation.outAt(1)->dataList();
struct timeval start_time;
gettimeofday(&start_time, 0);
struct timeval end_time = start_time;
ZonedDL* bdl1 = 0;
ZonedDL* bdl2 = 0;
// The result from a hashjoin step is expected to be a BandedDataList, but
// HashJoin<StringElementType> returns a StringDataList. Also, nulls are
// reported as "_CpNuLl_" by pDictionaryStep. Create two string datalists
// for the intermediate results.
// @bug 721. use zdl.
StringZonedDL* dlA = new StringZonedDL(1, fRm);
dlA->setMultipleProducers(true);
StringZonedDL* dlB = new StringZonedDL(1, fRm);
dlB->setMultipleProducers(true);
if (0 < fInputJobStepAssociation.status())
{
unblockDatalists(fInputJobStepAssociation.status());
}
else
{
string currentAction("preparing join");
try
{
// If we're given BucketDL's, use them
if (typeid(*inDL1) == typeid(BucketDL<StringElementType>))
{
if (typeid(*inDL2) != typeid(BucketDL<StringElementType>))
{
throw logic_error("StringHashJoinStep::run: expected either 0 or 2 BucketDL's!");
}
Ap = dynamic_cast<BucketDL<StringElementType>*>(inDL1);
Bp = dynamic_cast<BucketDL<StringElementType>*>(inDL2);
}
else
{
int maxBuckets = fRm->getHjMaxBuckets();
joblist::ridtype_t maxElems = fRm->getHjMaxElems();
tAp = new BucketDL<StringElementType>(maxBuckets, 1, maxElems, fRm);
tBp = new BucketDL<StringElementType>(maxBuckets, 1, maxElems, fRm);
tAp->setHashMode(1);
tBp->setHashMode(1);
StringElementType element;
int id = inDL1->getIterator();
while (inDL1->next(id, &element))
{
tAp->insert(element);
}
tAp->endOfInput();
id = inDL2->getIterator();
while (inDL2->next(id, &element))
{
tBp->insert(element);
}
tBp->endOfInput();
Ap = tAp;
Bp = tBp;
}
unsigned numThreads = fRm->getHjNumThreads();
BDLWrapper<StringElementType> setA(Ap);
BDLWrapper<StringElementType> setB(Bp);
// Assign to the outer hj (a local declaration here would shadow it, leak the
// object, and defeat the "if (hj)" timing block and delete below).
hj = new HashJoin<StringElementType>(setA, setB, dlA, dlB, fJoinType, &dlTimes,
fOutputJobStepAssociation.statusPtr(), sessionId(), &die);
if ((dlA == NULL) || (dlB == NULL) || (hj == NULL))
{
ostringstream oss;
oss << "StringHashJoinStep::run() null pointer from new -- ";
oss << "StringDataList A(0x" << hex << (ptrdiff_t)dlA << "), B(0x" << (ptrdiff_t)dlB
<< "), HashJoin hj(0x" << (ptrdiff_t)hj << ")";
throw(runtime_error(oss.str().c_str()));
}
// leave this in
if (fTableOID2 >= 3000)
{
ostringstream logStr2;
logStr2 << "StringHashJoinStep::run: ses:" << fSessionId << " st:" << fStepId
<< " input sizes: " << setA.size() << "/" << setB.size() << endl;
cout << logStr2.str();
}
currentAction = "performing join";
if (fTableOID2 >= 3000)
{
dlTimes.setFirstReadTime();
dlTimes.setEndOfInputTime(dlTimes.FirstReadTime());
}
hj->performJoin(numThreads);
currentAction = "after join";
// convert from StringElementType to ElementType by grabbing the rid
// take _CpNuLl_ out of the result
StringElementType se;
ElementType e;
int id = dlA->getIterator();
bdl1 = dynamic_cast<ZonedDL*>(resultA);
bdl2 = dynamic_cast<ZonedDL*>(resultB);
vector<ElementType> v;
v.reserve(ZDL_VEC_SIZE);
if (bdl1)
{
while (dlA->next(id, &se))
{
if (se.second != CPNULLSTRMARK)
{
e.first = se.first;
v.push_back(e);
if (v.size() >= ZDL_VEC_SIZE)
{
resultA->insert(v);
v.clear();
}
}
}
if (v.size() > 0)
resultA->insert(v);
resultA->endOfInput();
}
else
{
while (dlA->next(id, &se))
{
if (se.second != CPNULLSTRMARK)
{
e.first = se.first;
resultA->insert(e);
}
}
resultA->endOfInput();
}
id = dlB->getIterator();
if (bdl2)
{
v.clear();
while (dlB->next(id, &se))
{
if (se.second != CPNULLSTRMARK)
{
e.first = se.first;
v.push_back(e);
if (v.size() >= ZDL_VEC_SIZE)
{
resultB->insert(v);
v.clear();
}
}
}
if (v.size() > 0)
resultB->insert(v);
resultB->endOfInput();
}
else
{
while (dlB->next(id, &se))
{
if (se.second != CPNULLSTRMARK)
{
e.first = se.first;
resultB->insert(e);
}
}
resultB->endOfInput();
}
} // try
catch (const logging::LargeDataListExcept& ex)
{
ostringstream errMsg;
errMsg << __FILE__ << " doStringHashJoin: " << currentAction << ", caught LDL error: " << ex.what();
errorLogging(errMsg.str());
unblockDatalists(logging::stringHashJoinStepLargeDataListFileErr);
dlA->endOfInput();
dlB->endOfInput();
}
catch (const exception& ex)
{
ostringstream errMsg;
errMsg << __FILE__ << " doStringHashJoin: " << currentAction << ", caught: " << ex.what();
errorLogging(errMsg.str());
unblockDatalists(logging::stringHashJoinStepErr);
dlA->endOfInput();
dlB->endOfInput();
}
catch (...)
{
ostringstream errMsg;
errMsg << __FILE__ << " doStringHashJoin: " << currentAction << ", caught unknown exception";
errorLogging(errMsg.str());
unblockDatalists(logging::stringHashJoinStepErr);
dlA->endOfInput();
dlB->endOfInput();
}
gettimeofday(&end_time, 0);
if (fTableOID2 >= 3000)
dlTimes.setEndOfInputTime();
if (hj)
{
//..hashWorkTime is the time to perform the hash join, excluding the
// output insertion time. insertWorkTime is the sum of both insert times.
// The end result is that createWorkTime + hashWorkTime + insertWorkTime
// roughly equates to the total work time.
createWorkTime = hj->getTimeSet()->totalTime(createHashStr);
hashWorkTime = hj->getTimeSet()->totalTime(hashJoinStr) - hj->getTimeSet()->totalTime(insertResultsStr);
insertWorkTime =
hj->getTimeSet()->totalTime(insertResultsStr) + hj->getTimeSet()->totalTime(insertLastResultsStr);
}
} // (fInputJobStepAssociation.status() == 0)
if (fTableOID2 >= 3000 && traceOn())
{
time_t finTime = time(0);
char finTimeString[50];
ctime_r(&finTime, finTimeString);
finTimeString[strlen(finTimeString) - 1] = '\0';
ostringstream logStr;
logStr << "ses:" << fSessionId << " st: " << fStepId << " finished at " << finTimeString << "; 1st read "
<< dlTimes.FirstReadTimeString() << "; EOI " << dlTimes.EndOfInputTimeString() << endl
<< "\tStringHashJoinStep::run: output sizes: " << dlA->totalSize() << "/" << dlB->totalSize()
<< " [";
if (bdl1 && bdl2)
logStr << bdl1->totalSize() << "/" << bdl2->totalSize();
logStr << "] run time: " << JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(), dlTimes.FirstReadTime())
<< fixed << setprecision(2) << "s\n\tTotal work times: create hash: " << createWorkTime
<< "s, hash join: " << hashWorkTime << "s, insert results: " << insertWorkTime << "s\n"
<< "\tJob completion status " << fInputJobStepAssociation.status() << endl;
logEnd(logStr.str().c_str());
syslogProcessingTimes(16, // exemgr subsystem
start_time, // use join start time for first read time
end_time, // use join end time for last read time
start_time, // use join start time for first write time
end_time); // use join end time for last write time
syslogEndStep(16, // exemgr subsystem
0, // no blocked datalist input to report
0); // no blocked datalist output to report
}
delete hj;
delete tAp;
delete tBp;
delete dlA;
delete dlB;
}
const string StringHashJoinStep::toString() const
{
ostringstream oss;
CalpontSystemCatalog::OID oid1 = 0;
CalpontSystemCatalog::OID oid2 = 0;
size_t idlsz = fInputJobStepAssociation.outSize();
idbassert(idlsz == 2);
DataList<StringElementType>* dl1 = fInputJobStepAssociation.outAt(0)->stringDataList();
if (dl1)
oid1 = dl1->OID();
DataList<StringElementType>* dl2 = fInputJobStepAssociation.outAt(1)->stringDataList();
if (dl2)
oid2 = dl2->OID();
oss << "StringHashJoinStep ses:" << fSessionId << " st:" << fStepId;
oss << omitOidInDL;
oss << " in tb/col1:" << fTableOID1 << "/" << oid1;
oss << " " << fInputJobStepAssociation.outAt(0);
oss << " in tb/col2:" << fTableOID2 << "/" << oid2;
oss << " " << fInputJobStepAssociation.outAt(1);
idlsz = fOutputJobStepAssociation.outSize();
idbassert(idlsz == 2);
DataList_t* dl3 = fOutputJobStepAssociation.outAt(0)->dataList();
if (dl3)
oid1 = dl3->OID();
DataList_t* dl4 = fOutputJobStepAssociation.outAt(1)->dataList();
if (dl4)
oid2 = dl4->OID();
oss << endl << " ";
oss << " out tb/col1:" << fTableOID1 << "/" << oid1;
oss << " " << fOutputJobStepAssociation.outAt(0);
oss << " out tb/col2:" << fTableOID2 << "/" << oid2;
oss << " " << fOutputJobStepAssociation.outAt(1);
return oss.str();
}
} // namespace joblist