1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-26 05:02:32 +03:00
Files
mariadb-columnstore-engine/writeengine/dictionary/we_dctnry.cpp
Andrew Hutchings 21ca6ca42f MCOL-1021 Fix dictionary de-duplication cache
The signatures were getting destroyed whilst processing before being
used in the dictionary de-duplication cache making the cache full of
empty strings. This fix resets the signature after insertion for the
cache.

This fix also lets the signature cache be read when there is 1000
entries in it.
2017-11-09 17:28:21 +00:00

1454 lines
48 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/*******************************************************************************
* $Id: we_dctnry.cpp 4737 2013-08-14 20:45:46Z bwilkinson $
*
*******************************************************************************/
/** @we_dctnry.cpp
* When a signature is given, the value will be stored in dictionary and
* a token will be issued. Given a token, the signature in the dictionary
* can be deleted.
* The whole file contains only one class Dctnry
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <sstream>
#include <inttypes.h>
#include <iostream>
using namespace std;
#include "bytestream.h"
#include "brmtypes.h"
#include "extentmap.h" // for DICT_COL_WIDTH
#include "we_stats.h"
#include "we_log.h"
#include "we_dctnry.h"
using namespace messageqcpp;
using namespace WriteEngine;
using namespace BRM;
#include "IDBDataFile.h"
#include "IDBPolicy.h"
#include "cacheutils.h"
using namespace idbdatafile;
// File-local constants describing the layout of a dictionary block header.
// The header comment on the DML insertDctnry() in this file quotes the unit
// sizes as: free space 2 bytes, next pointer 8 bytes, zero offset 2 bytes,
// end-of-header marker 2 bytes.
namespace
{
// These used to be member variables, hence the "m_" prefix. But they are
// all constants, so I removed them as member variables. May change the
// variable name later (to remove the m_ prefix) as time allows.
const uint16_t m_endHeader = DCTNRY_END_HEADER; // end of header flag (0xffff)
const uint16_t m_offSetZero= BYTE_PER_BLOCK; // value for 0 offset (8192)
const int m_lastOffSet= BYTE_PER_BLOCK; // end of last offset
// Fixed header of an empty block: free-space field + next pointer +
// "offset zero" entry + end-of-header marker.
const int m_totalHdrBytes = // # bytes in header
HDR_UNIT_SIZE + NEXT_PTR_BYTES + HDR_UNIT_SIZE + HDR_UNIT_SIZE;
// Block size minus the fixed header and one extra offset entry, i.e. the
// room left for the first signature stored in an otherwise empty block.
const int m_bigSpace = // free space in an empty block
BYTE_PER_BLOCK - (m_totalHdrBytes + HDR_UNIT_SIZE);
// Byte position just past the "offset zero" entry; insertDctnryHdr() indexes
// the per-signature offset slots relative to this position.
const int START_HDR1 = // start loc of 2nd offset (HDR1)
HDR_UNIT_SIZE + NEXT_PTR_BYTES + HDR_UNIT_SIZE;
const int PSEUDO_COL_WIDTH = DICT_COL_WIDTH; // used to convert row count to block count
// Largest signature size (in bytes) accepted by the insert functions in this
// file; larger values are rejected (DML) or replaced by a default/null token
// (bulk import).
const int MAX_BLOB_SIZE = 2100000000; // for safety, we use an 18bit block count of 8KB blocks
}
namespace WriteEngine
{
// We will make this a constant for now. If we ever decide to make
// INITIAL_EXTENT_ROWS_TO_DISK configurable, we will need to move this
// statement, and use Config class to get INITIAL_EXTENT_ROWS_TO_DISK.
//
// Number of blocks written to disk for the abbreviated first extent of a
// dictionary store file. createDctnry() uses it as the on-disk size when it
// creates a new file, and the insert functions compare m_numBlocks against
// it to decide when the abbreviated extent must be expanded.
// Note the integer division happens before the multiplication.
int NUM_BLOCKS_PER_INITIAL_EXTENT =
((INITIAL_EXTENT_ROWS_TO_DISK/BYTE_PER_BLOCK) * PSEUDO_COL_WIDTH);
/*******************************************************************************
* Description:
* Dctnry constructor
******************************************************************************/
Dctnry::Dctnry() :
    m_nextPtr(NOT_USED_PTR),
    m_partition(0),
    m_segment(0),
    m_dbRoot(1),
    m_numBlocks(0),
    m_lastFbo(0),
    m_hwm(0),
    m_newStartOffset(0),
    m_freeSpace(0),
    m_curOp(0),
    m_colWidth(0),
    m_importDataMode(IMPORT_DATA_TEXT)
{
    // Begin with zeroed header/block buffers and no LBID attached.
    memset(m_dctnryHeader, 0, sizeof(m_dctnryHeader));
    memset(m_curBlock.data, 0, sizeof(m_curBlock.data));
    m_curBlock.lbid = INVALID_LBID;

    // An empty block has everything but the fixed header available.
    m_freeSpace = BYTE_PER_BLOCK - m_totalHdrBytes;

    // Build the header image of an empty block in m_dctnryHeader2:
    //   [free space][next pointer][offset zero][end-of-header marker]
    // This image is later copied to the front of each freshly started block.
    const int nextPtrPos    = HDR_UNIT_SIZE;
    const int offsetZeroPos = nextPtrPos + NEXT_PTR_BYTES;
    const int endMarkerPos  = offsetZeroPos + HDR_UNIT_SIZE;

    memcpy(m_dctnryHeader2,                 &m_freeSpace,  HDR_UNIT_SIZE);
    memcpy(m_dctnryHeader2 + nextPtrPos,    &m_nextPtr,    NEXT_PTR_BYTES);
    memcpy(m_dctnryHeader2 + offsetZeroPos, &m_offSetZero, HDR_UNIT_SIZE);
    memcpy(m_dctnryHeader2 + endMarkerPos,  &m_endHeader,  HDR_UNIT_SIZE);

    // No current block position yet.
    m_curFbo  = INVALID_NUM;
    m_curLbid = INVALID_LBID;

    // Empty string (de-duplication) cache.
    memset(m_sigArray, 0, MAX_STRING_CACHE_SIZE * sizeof(Signature));
    m_arraySize = 0;

    clear(); // files
}
/*******************************************************************************
* Description:
* Dctnry destructor
******************************************************************************/
// Destructor: releases the signature buffers held by the string cache.
// It does not close or flush the dictionary file itself.
Dctnry::~Dctnry()
{
//clear string cache here!
freeStringCache( );
}
/*******************************************************************************
* Description:
* Free memory consumed by dictionary string cache
******************************************************************************/
void Dctnry::freeStringCache( )
{
for (int i=0; i<m_arraySize; i++)
{
delete [] m_sigArray[i].signature;
m_sigArray[i].signature = 0;
}
memset(m_sigArray, 0 , MAX_STRING_CACHE_SIZE*sizeof(Signature));
m_arraySize = 0;
}
/*******************************************************************************
* Description:
* Reset the in-memory state used to track the current dictionary store
* file (block buffer, offsets, op count and string cache bookkeeping).
* No file I/O is performed.
*
* PARAMETERS:
* none
*
* RETURN:
* NO_ERROR - always
******************************************************************************/
int Dctnry::init()
{
//cout <<"Init called! m_dctnryOID =" << m_dctnryOID << endl;
m_lastFbo =0;
m_hwm = 0;
m_newStartOffset =0;
m_freeSpace = 0;
m_curOp=0;
memset( m_curBlock.data, 0, sizeof(m_curBlock.data));
m_curBlock.lbid = INVALID_LBID;
memset(m_sigArray, 0 , MAX_STRING_CACHE_SIZE*sizeof(Signature));
m_arraySize =0;
return NO_ERROR;
}
/*******************************************************************************
* Description:
* Create a dictionary file and initialize the header, or can be used to
* just add an extent to an already open dictionary store file.
*
* PARAMETERS:
* input
* dctnryOID - dictionary OID
* colWidth - dictionary string width (not the token width)
* dbRoot - DBRoot where file is located
* partition - partition number associated with the file
* segment - segment number associated with the file
* startLbid - (out) starting LBID of the newly allocated extent
* flag - "true" indicates we are adding the first block and the
* file needs to be created with an abbreviated extent.
* "false" indicates we just want to add an extent to
* an existing file, and the file has already been opened.
*
* RETURN:
* success - successfully created file and/or extent
* failure - failed to create file and/or extent
******************************************************************************/
int Dctnry::createDctnry( const OID& dctnryOID, int colWidth,
const uint16_t dbRoot, const uint32_t partition, const uint16_t segment,
LBID_t& startLbid, bool flag)
{
int allocSize = 0;
char fileName[FILE_NAME_SIZE];
int rc;
std::map<FID,FID> oids;
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_ALLOC_DCT_EXTENT);
#endif
if (flag)
{
m_dctnryOID = dctnryOID;
m_partition = partition;
m_segment = segment;
m_dbRoot = dbRoot;
RETURN_ON_ERROR( ( rc = oid2FileName( m_dctnryOID, fileName, true,
m_dbRoot, m_partition, m_segment ) ) );
m_segFileName = fileName;
// if obsolete file exists, "w+b" will truncate and write over
m_dFile = createDctnryFile(fileName, colWidth, "w+b", DEFAULT_BUFSIZ);
}
else
{
RETURN_ON_ERROR( setFileOffset(m_dFile, 0, SEEK_END) );
}
rc = BRMWrapper::getInstance()->allocateDictStoreExtent(
(const OID)m_dctnryOID, m_dbRoot, m_partition, m_segment,
startLbid, allocSize);
if (rc != NO_ERROR)
{
if (flag)
{
closeDctnryFile(false, oids);
}
return rc;
}
// We allocate a full extent from BRM, but only write an abbreviated 256K
// rows to disk for 1st extent in each store file, to conserve disk usage.
int totalSize = allocSize;
if (flag)
{
totalSize = NUM_BLOCKS_PER_INITIAL_EXTENT;
}
if ( !isDiskSpaceAvail(Config::getDBRootByNum(m_dbRoot), totalSize) )
{
if (flag)
{
closeDctnryFile(false, oids);
}
return ERR_FILE_DISK_SPACE;
}
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_ALLOC_DCT_EXTENT);
#endif
if( m_dFile != NULL ) {
rc = FileOp::initDctnryExtent( m_dFile,
m_dbRoot,
totalSize,
m_dctnryHeader2,
m_totalHdrBytes,
false );
if (rc != NO_ERROR)
{
if (flag)
{
closeDctnryFile(false, oids);
}
return rc;
}
}
else
return ERR_FILE_CREATE;
if (flag)
{
closeDctnryFile(true, oids);
m_numBlocks = totalSize;
m_hwm = 0;
rc = BRMWrapper::getInstance()->setLocalHWM(
m_dctnryOID, m_partition, m_segment, m_hwm);
}
else
{
m_numBlocks = m_numBlocks + totalSize;
}
return rc;
}
/*******************************************************************************
* Description:
* This function should be called to expand an abbreviated dictionary extent
* into a full extent on disk.
*
* PARAMETERS:
* none
*
* RETURN:
* success - successfully expanded extent
* failure - failed to expand extent
******************************************************************************/
int Dctnry::expandDctnryExtent()
{
    RETURN_ON_NULL( m_dFile, ERR_FILE_SEEK );

    // Remember where we are so the position can be restored afterwards,
    // then move to the end of the file where the new blocks are appended.
    const off64_t savedPos = m_dFile->tell();
    RETURN_ON_ERROR( setFileOffset(m_dFile, 0, SEEK_END) );

    // Based on extent size, see how many blocks to add to fill the extent
    const int blksToAdd =
        ( ((int)BRMWrapper::getInstance()->getExtentRows() -
           INITIAL_EXTENT_ROWS_TO_DISK)/BYTE_PER_BLOCK ) * PSEUDO_COL_WIDTH;

    if ( !isDiskSpaceAvail(Config::getDBRootByNum(m_dbRoot), blksToAdd) )
        return ERR_FILE_DISK_SPACE;

    // Initialize the added blocks with the empty-block header image.
    const int initRc = FileOp::initDctnryExtent( m_dFile,
                                                 m_dbRoot,
                                                 blksToAdd,
                                                 m_dctnryHeader2,
                                                 m_totalHdrBytes,
                                                 true );

    if (initRc != NO_ERROR)
        return initRc;

    // Restore offset back to where we were before expanding the extent
    RETURN_ON_ERROR( setFileOffset(m_dFile, savedPos, SEEK_SET) );

    // Update block count to reflect disk space added by expanding the extent.
    m_numBlocks += blksToAdd;

    return initRc;
}
/*******************************************************************************
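* DESCRIPTION:
* Flush the current block if it is dirty, close the dictionary file when
* requested, and update BRM with the new HWM.
*
* PARAMETERS:
* realClose - true to actually close the file; forced to true for
* uncompressed files
*
* RETURN:
* NO_ERROR on success, else the error code from the failed block write
* or BRM HWM update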
******************************************************************************/
int Dctnry::closeDctnry(bool realClose)
{
if ( !m_dFile )
return NO_ERROR;
int rc;
CommBlock cb;
cb.file.oid = m_dctnryOID;
cb.file.pFile = m_dFile;
std::map<FID,FID> oids;
if (m_curBlock.state==BLK_WRITE)
{
rc = writeDBFile(cb, &m_curBlock, m_curBlock.lbid);
if (rc != NO_ERROR)
{
closeDctnryFile(false, oids);
return rc;
}
memset( m_curBlock.data, 0, sizeof(m_curBlock.data));
// m_curBlock.state== BLK_INIT;
}
//@Bug 5572. always close file for uncompressed file.
if (FileOp::compressionType() == 0)
realClose = true;
if (realClose) {
//@Bug 5689. Need pass oid to write to the right file.
oids[m_dctnryOID] = m_dctnryOID;
// dmc-error handling (should detect/report error in closing file)
closeDctnryFile(true, oids);
}
m_hwm = (HWM)m_lastFbo;
idbassert(m_dctnryOID>=0);
if (idbdatafile::IDBPolicy::useHdfs() && realClose)
{
BRM::FileInfo aFile;
std::vector<BRM::OID_t> oidsToFlush;
oidsToFlush.push_back(m_dctnryOID);
aFile.oid = m_dctnryOID;
aFile.partitionNum = m_partition;
aFile.segmentNum = m_segment;
aFile.dbRoot = m_dbRoot;
aFile.compType = FileOp::compressionType();
std::vector<BRM::FileInfo> aFileInfo;
aFileInfo.push_back(aFile);
cacheutils::purgePrimProcFdCache(aFileInfo, Config::getLocalModuleID());
cacheutils::flushOIDsFromCache(oidsToFlush);
}
rc = BRMWrapper::getInstance()->setLocalHWM(
m_dctnryOID, m_partition, m_segment, m_hwm);
if (rc != NO_ERROR)
return rc;
//cout <<"Init called! m_dctnryOID =" << m_dctnryOID << endl;
if (realClose)
freeStringCache( );
return NO_ERROR;
}
/*******************************************************************************
* DESCRIPTION:
* Close dictionary file without flushing block buffer or updating
* BRM with HWM.
*
* PARAMETERS:
* none
*
* RETURN:
* none
******************************************************************************/
int Dctnry::closeDctnryOnly( )
{
if ( !m_dFile )
return NO_ERROR;
// dmc-error handling (should detect/report error in closing file)
std::map<FID,FID> oids;
closeDctnryFile(false, oids);
freeStringCache( );
return NO_ERROR;
}
/*******************************************************************************
* DESCRIPTION:
* drop/delete dictionary file
*
* PARAMETERS:
* dctnryOID -- file number to drop
*
* RETURN:
* none
******************************************************************************/
int Dctnry::dropDctnry( const OID& dctnryOID)
{
    // Record which dictionary is being dropped; closeDctnry() reads
    // m_dctnryOID when it flushes the block and updates the HWM.
    m_dctnryOID = dctnryOID;

    // A file that is still open has to be closed before it can be deleted.
    const bool stillOpen = (m_dFile != NULL);

    if (stillOpen)
    {
        RETURN_ON_ERROR( closeDctnry() );
    }

    return deleteFile( dctnryOID);
}
/*******************************************************************************
* DESCRIPTION:
* open dictionary file
*
* PARAMETERS:
* dctnryOID-- for open dictionary file
* dbRoot -- DBRoot for dictionary store segment file
* partition-- partition for dictionary store segment file
* segment -- segment for dictionary store segment file
* useTmpSuffix - for Bulk HDFS usage: use or not use *.tmp file suffix
*
* RETURN:
* successful- NO_ERROR
* Fail - Error Code
******************************************************************************/
// @bug 5572 - HDFS usage: add *.tmp file backup flag
// Opens the dictionary store segment file identified by
// (dctnryOID, dbRoot, partition, segment), loads the block at the file's HWM
// into m_curBlock, and primes the member state (m_curFbo, m_curLbid,
// m_freeSpace, m_curOp) so that subsequent inserts continue in that block.
// On any failure after the file has been opened, the file is closed again
// (without the "real close" flag) and the error code is returned.
int Dctnry::openDctnry(const OID& dctnryOID,
const uint16_t dbRoot,
const uint32_t partition,
const uint16_t segment,
const bool useTmpSuffix)
{
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_OPEN_DCT_FILE);
#endif
int rc = NO_ERROR;
// Remember the identity of the file; openDctnryFile() takes no file
// identity arguments, so it presumably relies on these members.
m_dctnryOID = dctnryOID;
m_dbRoot = dbRoot;
m_partition = partition;
m_segment = segment;
m_dFile = openDctnryFile(useTmpSuffix);
if( m_dFile == NULL )
{
// Could not open the file: log "oid:partition:segment" and bail out.
ostringstream oss;
oss << "oid:partition:segment " <<
dctnryOID <<":"<<partition<<":"<<segment;
logging::Message::Args args;
logging::Message message(1);
args.add("Error opening dictionary file ");
args.add(oss.str());
args.add("");
args.add("");
message.format(args);
logging::LoggingID lid(21);
logging::MessageLog ml(lid);
ml.logErrorMessage( message );
return ERR_FILE_OPEN;
}
m_numBlocks = numOfBlocksInFile();
std::map<FID,FID> oids;
//Initialize other misc member variables
// (init() resets m_hwm/m_lastFbo/m_curOp/m_freeSpace and the string cache;
// it does not touch m_numBlocks, which was just set above.)
init();
int extState;
// Fetch the current HWM for this segment file from BRM.
rc=BRMWrapper::getInstance()->getLocalHWM(dctnryOID,
m_partition, m_segment, m_hwm, extState);
if (rc!=NO_ERROR)
{
closeDctnryFile(false, oids);
return rc;
}
// Inserts resume in the HWM block.
m_lastFbo = (int)m_hwm;
memset( m_curBlock.data, 0, sizeof(m_curBlock.data));
m_curFbo = m_lastFbo;
// Look up the LBID for the current FBO (result is stored in m_curLbid).
rc = BRMWrapper::getInstance()->getBrmInfo( m_dctnryOID,
m_partition, m_segment,
m_curFbo, m_curLbid);
if (rc!=NO_ERROR)
{
closeDctnryFile(false, oids);
return rc;
}
CommBlock cb;
cb.file.oid = m_dctnryOID;
cb.file.pFile = m_dFile;
#ifdef PROFILE
// We omit the call to readDBFile from OPEN_DCT_FILE stats, because com-
// pressed files have separate stats that readDBFile() will capture thru
// ChunkManager::fetchChunkFromFile().
Stats::stopParseEvent(WE_STATS_OPEN_DCT_FILE);
#endif
// Load the HWM block into the in-memory block buffer.
rc=readDBFile(cb, m_curBlock.data, m_curLbid);
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_OPEN_DCT_FILE);
#endif
if (rc!=NO_ERROR)
{
closeDctnryFile(false, oids);
return rc;
}
//@Bug 5567 Don't seek for compressed file.
if (m_compressionType == 0)
{
// Position file to the start of the current block;
// Determine file byte offset based on the current block offset (m_curFbo)
long long byteOffset = ((long long)m_curFbo) * (long)BYTE_PER_BLOCK;
rc = setFileOffset(m_dFile, byteOffset);
if (rc!=NO_ERROR)
{
closeDctnryFile(false, oids);
return rc;
}
}
// The buffer now mirrors what is on disk for m_curLbid.
m_curBlock.lbid = m_curLbid;
m_curBlock.state= BLK_READ;
int opCnt = 0;
// Get new free space (m_freeSpace) from header too! Here!!!!!!!!!!!!!!!
// getBlockOpCount() both counts the entries already stored in the block
// and reloads m_freeSpace from the block header.
getBlockOpCount( m_curBlock, opCnt);
m_curOp = opCnt;
// "If" this store file contains no more than 1 block, then we preload
// the string cache used to recognize duplicates during row insertion.
if (m_hwm == 0)
{
preLoadStringCache( m_curBlock );
}
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_OPEN_DCT_FILE);
#endif
return rc;
}
/*******************************************************************************
* Description:
* Determine if the specified signature is present in the string cache.
*
* PARAMETERS:
* input
* sig - signature to search for
*
* RETURN:
* true - if signature is found (sig.token is set to the cached token)
* false - if signature is not found
******************************************************************************/
bool Dctnry::getTokenFromArray(Signature& sig)
{
    // Linear scan over the populated part of the string cache.
    const int cachedCount = (int)m_arraySize;

    for (int idx = 0; idx < cachedCount; ++idx)
    {
        const Signature& candidate = m_sigArray[idx];

        // Cheap reject first: lengths must agree before comparing bytes.
        if (candidate.size != sig.size)
            continue;

        if (memcmp(sig.signature, candidate.signature, sig.size) != 0)
            continue;

        // Same bytes: hand the previously issued token back to the caller.
        sig.token = candidate.token;
        return true;
    }

    return false;
}
/*******************************************************************************
* Description:
* Used by bulk import to insert a signature into m_curBlock, and update
* the m_curBlock header accordingly.
*
* PARAMETERS:
* input
* sig - signature to be inserted
* output
* token - token that was assigned to the inserted signature
*
* RETURN:
* success - successfully write the signature to the block
* failure - failed to extend/create an extent for the block
******************************************************************************/
// Writes "sig" into the current block, spilling into following blocks when it
// does not fit. sig.token receives the starting LBID (fbo), the op number
// within that block (op) and the number of continuation blocks (bc).
// sig.size and sig.signature are consumed while writing and restored before
// the normal return.
// NOTE(review): the early returns via RETURN_ON_ERROR / "return rc" leave
// sig.size and sig.signature in their partially consumed state - confirm
// callers do not reuse sig after a failure.
int Dctnry::insertDctnry2(Signature& sig)
{
int rc = 0;
int write_size;
bool lbid_in_token = false;
// Saved so the caller gets its signature back unchanged on success.
size_t origSigSize = sig.size;
unsigned char* origSig = sig.signature;
sig.token.bc = 0;
// Each pass writes one chunk of the signature into m_curBlock.
while (sig.size > 0)
{
// Write as much as fits in this block, keeping m_totalHdrBytes in reserve.
if (sig.size > (m_freeSpace - m_totalHdrBytes))
{
write_size = (m_freeSpace - m_totalHdrBytes);
}
else
{
write_size = sig.size;
}
// Header first: it sets m_newStartOffset, which insertSgnture() writes to.
insertDctnryHdr(m_curBlock.data, write_size);
insertSgnture(m_curBlock.data, write_size, (unsigned char*)sig.signature);
sig.size -= write_size;
sig.signature += write_size;
m_curFbo = m_lastFbo;
// Only the first chunk's location goes into the token.
if (!lbid_in_token)
{
sig.token.fbo = m_curLbid;
sig.token.op = m_curOp;
lbid_in_token = true;
}
// More data left: flush this block and start a fresh one.
if (sig.size > 0)
{
CommBlock cb;
cb.file.oid = m_dctnryOID;
cb.file.pFile = m_dFile;
sig.token.bc++;
RETURN_ON_ERROR( writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo) );
// Reset the buffer to an empty block (header image only).
memset( m_curBlock.data, 0, sizeof(m_curBlock.data));
memcpy( m_curBlock.data, &m_dctnryHeader2, m_totalHdrBytes);
m_freeSpace = BYTE_PER_BLOCK - m_totalHdrBytes;
m_curBlock.state = BLK_WRITE;
m_curOp =0;
m_lastFbo++;
m_curFbo = m_lastFbo;
//...Expand current extent if it is an abbreviated initial extent
if ((m_curFbo == m_numBlocks) &&
(m_numBlocks == NUM_BLOCKS_PER_INITIAL_EXTENT))
{
RETURN_ON_ERROR( expandDctnryExtent() );
}
//...Allocate a new extent if we have reached the last block in the
// current extent.
if (m_curFbo == m_numBlocks)
{//last block
//for roll back the extent to use
//Save those empty extents in case of failure to rollback
std::vector<ExtentInfo> dictExtentInfo;
ExtentInfo info;
info.oid = m_dctnryOID;
info.partitionNum = m_partition;
info.segmentNum = m_segment;
info.dbRoot = m_dbRoot;
info.hwm = m_hwm;
info.newFile = false;
dictExtentInfo.push_back (info);
LBID_t startLbid;
// Add an extent.
rc = createDctnry(m_dctnryOID,
0, // dummy column width
m_dbRoot,
m_partition,
m_segment,
startLbid,
false) ;
if ( rc != NO_ERROR )
{
//roll back the extent
BRMWrapper::getInstance()->deleteEmptyDictStoreExtents(
dictExtentInfo);
return rc;
}
}
// Look up the LBID of the new current block.
RETURN_ON_ERROR( BRMWrapper::getInstance()->getBrmInfo(m_dctnryOID,
m_partition, m_segment,
m_curFbo, m_curLbid) );
m_curBlock.lbid = m_curLbid;
}
}
// Restore the caller's view of the signature (needed by the string cache).
sig.size = origSigSize;
sig.signature = origSig;
return NO_ERROR;
}
/*******************************************************************************
* Description:
* Used by bulk import to insert collection of strings into this store file.
* Function assumes that the file is already positioned to the current block.
*
* PARAMETERS:
* input
* buf - character buffer containing input strings
* pos - meta data describing data in "buf"
* totalRow - number of rows in "buf"
* col - column of strings to be parsed from "buf"
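* output
* tokenBuf - tokens assigned to inserted strings
* truncCount - incremented for each string truncated to the column width
*
* RETURN:
* success - NO_ERROR; a token was produced for every row
* failure - error code from writing a block or expanding/adding an extent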
******************************************************************************/
// Bulk-import entry point: for each of the "totalRow" rows, takes the string
// for column "col" out of "buf" (located via "pos"), stores it in the
// dictionary unless an identical string is already in the string cache, and
// appends the resulting 8-byte token to "tokenBuf".
// Empty/null/oversized inputs get the default value (m_defVal) if one is set,
// otherwise a null token. Strings longer than m_colWidth are truncated and
// counted in "truncCount".
int Dctnry::insertDctnry(const char* buf,
ColPosPair ** pos,
const int totalRow, const int col,
char* tokenBuf,
long long& truncCount)
{
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
int startPos = 0; // index of the row currently being processed
int totalUseSize = 0;
int outOffset = 0; // write position (in bytes) within tokenBuf
const char* pIn;
char* pOut = tokenBuf;
Signature curSig;
bool found = false; // row's token has been produced (cache hit or insert)
bool next = false; // current block is finished; advance to the next block
CommBlock cb;
cb.file.oid = m_dctnryOID;
cb.file.pFile = m_dFile;
WriteEngine::Token nullToken;
//...Loop through all the rows for the specified column
while(startPos < totalRow)
{
found = false;
memset(&curSig, 0, sizeof(curSig));
// The "offset" member of the position pair is used as the field length here.
curSig.size = pos[startPos][col].offset;
// Strip trailing null bytes '\0' (by adjusting curSig.size) if import-
// ing in binary mode. If entire string is binary zeros, then we treat
// as a NULL value.
if (m_importDataMode != IMPORT_DATA_TEXT)
{
if ((curSig.size > 0) &&
(curSig.size != COLPOSPAIR_NULL_TOKEN_OFFSET))
{
char* fld = (char*)buf + pos[startPos][col].start;
int kk = curSig.size-1;
for (; kk>=0; kk--)
{
if (fld[kk] != '\0')
break;
}
curSig.size = kk + 1;
}
}
// Read thread should validate against max size so that the entire row
// can be rejected up front. Once we get here in the parsing thread,
// it is too late to reject the row. However, as a precaution, we
// still check against max size & set to null token if needed.
if ((curSig.size == 0) ||
(curSig.size == COLPOSPAIR_NULL_TOKEN_OFFSET) ||
(curSig.size > MAX_BLOB_SIZE))
{
if (m_defVal.length() > 0) // use default string if available
{
pIn = m_defVal.c_str();
curSig.signature = (unsigned char*)pIn;
curSig.size = m_defVal.length();
}
else
{
// No default: emit a null token and move on to the next row.
memcpy( pOut + outOffset, &nullToken, 8 );
outOffset += 8;
startPos++;
continue;
}
}
else
{
pIn = (char*)buf + pos[startPos][col].start;
curSig.signature =(unsigned char*)pIn;
}
// @Bug 2565: Truncate any strings longer than schema's column width
if (curSig.size > m_colWidth)
{
curSig.size = m_colWidth;
++truncCount;
}
//...Search for the string in our string cache
//if it fits into one block (< 8KB)
if (curSig.size <= MAX_SIGNATURE_SIZE)
{
//Stats::startParseEvent("getTokenFromArray");
found = getTokenFromArray(curSig);
if(found)
{
// Duplicate: reuse the token already issued for this string.
memcpy( pOut + outOffset, &curSig.token, 8 );
outOffset += 8;
startPos++;
//Stats::stopParseEvent("getTokenFromArray");
continue;
}
//Stats::stopParseEvent("getTokenFromArray");
}
totalUseSize = m_totalHdrBytes + curSig.size;
//...String not found in cache, so proceed.
// If room is available in current block then insert into block.
// @bug 3960: Add MAX_OP_COUNT check to handle case after bulk rollback
// Strings larger than 8176 bytes never fit in one block, so they may
// start in any block that still has room beyond the header.
if( ((totalUseSize <= m_freeSpace) ||
((curSig.size > 8176) && (m_freeSpace > m_totalHdrBytes))) &&
(m_curOp < (MAX_OP_COUNT-1)) ) {
RETURN_ON_ERROR(insertDctnry2(curSig)); //m_freeSpace updated!
m_curBlock.state = BLK_WRITE;
memcpy( pOut + outOffset, &curSig.token, 8 );
outOffset += 8;
startPos++;
found = true;
//...If we have reached limit for the number of strings allowed in
// a block, then we write the current block so that we can start
// another block.
if (m_curOp>= MAX_OP_COUNT -1)
{
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
RETURN_ON_ERROR(writeDBFileNoVBCache(cb,&m_curBlock,m_curFbo));
m_curBlock.state = BLK_READ;
next = true;
}
//...Add string to cache, if we have not exceeded cache limit
// Don't cache big blobs
if ((m_arraySize < MAX_STRING_CACHE_SIZE) &&
(curSig.size <= MAX_SIGNATURE_SIZE))
{
addToStringCache( curSig );
}
}
else //...No room for this string in current block, so we write
// out the current block, so we can start another block
{
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
RETURN_ON_ERROR( writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo) );
m_curBlock.state = BLK_READ;
next = true;
found = false;
}//if m_freeSpace
//..."next" flag is used to indicate that we need to advance to the
// next block in the store file.
if (next)
{
// Reset the buffer to an empty block (header image only).
memset( m_curBlock.data, 0, sizeof(m_curBlock.data));
memcpy( m_curBlock.data, &m_dctnryHeader2, m_totalHdrBytes);
m_freeSpace = BYTE_PER_BLOCK - m_totalHdrBytes;
m_curBlock.state = BLK_WRITE;
m_curOp =0;
next = false;
m_lastFbo++;
m_curFbo = m_lastFbo;
//...Expand current extent if it is an abbreviated initial extent
if ((m_curFbo == m_numBlocks) &&
(m_numBlocks == NUM_BLOCKS_PER_INITIAL_EXTENT))
{
RETURN_ON_ERROR( expandDctnryExtent() );
}
//...Allocate a new extent if we have reached the last block in the
// current extent.
if (m_curFbo == m_numBlocks)
{//last block
LBID_t startLbid;
// Add an extent.
RETURN_ON_ERROR( createDctnry(m_dctnryOID,
m_colWidth,
m_dbRoot,
m_partition,
m_segment,
startLbid,
false) );
if (m_logger)
{
std::ostringstream oss;
oss << "Add dictionary extent OID-" << m_dctnryOID <<
"; DBRoot-" << m_dbRoot <<
"; part-" << m_partition <<
"; seg-" << m_segment <<
"; hwm-" << m_curFbo <<
"; LBID-" << startLbid <<
"; file-" << m_segFileName;
m_logger->logMsg( oss.str(), MSGLVL_INFO2 );
}
// The new block is the first block of the extent just added.
m_curLbid = startLbid;
// now seek back to the curFbo, after adding an extent
// @bug5769 For uncompressed only;
// ChunkManager manages the file offset for the compression case
if (m_compressionType == 0)
{
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
long long byteOffset = m_curFbo;
byteOffset *= BYTE_PER_BLOCK;
RETURN_ON_ERROR( setFileOffset(m_dFile, byteOffset) );
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
}
}
else
{
// LBIDs are numbered collectively and consecutively within an
// extent, so within an extent we can derive the LBID by simply
// incrementing it rather than having to go back to BRM to look
// up the LBID for each FBO.
m_curLbid++;
}
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
m_curBlock.lbid = m_curLbid;
//..."found" flag indicates whether the string was already found
// "or" added to the end of the previous block. If false, then
// we need to add the string to the new block.
if (!found)
{
RETURN_ON_ERROR(insertDctnry2(curSig)); //m_freeSpace updated!
m_curBlock.state = BLK_WRITE;
memcpy( pOut + outOffset, &curSig.token, 8 );
outOffset += 8;
startPos++;
//...Add string to cache, if we have not exceeded cache limit
if ((m_arraySize < MAX_STRING_CACHE_SIZE) &&
(curSig.size <= MAX_SIGNATURE_SIZE))
{
addToStringCache( curSig );
}
}
}//if next
}//end while
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
//Done
// If any data leftover and not written by subsequent call to
// insertDctnry(), then it will be written by closeDctnry().
return NO_ERROR;
}
/*******************************************************************************
* DESCRIPTION:
* Used by DML to insert a single string into this store file.
* (1) Insert a signature value into the block
* (2) The header information inserted at front
* (3) The signature inserted from back
* (4) Total minimum header size-- free space 2 bytes, next pointer 8 bytes,
* zero offset 2 bytes, end of header 2 bytes, total 14 bytes
* plus 2 bytes for new values' starting offset value storage
* total 16 bytes
* (5) Values size <=8176 =(8192-16) will not be split into two blocks
* (6) For smaller value <=8176, it has to fit into one block or
* unsuccessfully to insert
* (7) For large value > 8176,
* smaller space first then take up a whole block
* or a whole block first then some left over space in another
* block
* (8) limit to 8000 byte for this release size
*
* PARAMETERS:
* input dFile
* -- File handle
* Input sgnature_size
* -- how many bytes the signature occupies
* Input sgnature_value
* -- the value of the signature
* output token
* -- token structure carrying the assigned fbo and op
*
* RETURN:
* success - successfully insert the signature
* failure - it did not insert the signature
******************************************************************************/
// DML entry point: stores one signature of "sgnature_size" bytes, starting in
// the current block and continuing into following blocks when needed, and
// reports its location in "token" (fbo = starting LBID, op = entry number in
// that block, bc = number of continuation blocks).
// A size of 0 yields a null token; a size above MAX_BLOB_SIZE is rejected
// with ERR_DICT_SIZE_GT_2G.
// NOTE(review): a negative sgnature_size is not rejected here - confirm
// callers never pass one.
int Dctnry::insertDctnry(const int& sgnature_size,
const unsigned char* sgnature_value,
Token& token)
{
int rc = 0;
int i;
unsigned char* value = NULL; // next byte of the signature still to write
int size; // bytes of the signature still to write
int write_size;
bool lbid_in_token = false;
// Round down for safety. In theory we can take 262143 * 8176 bytes
if (sgnature_size > MAX_BLOB_SIZE)
{
return ERR_DICT_SIZE_GT_2G;
}
if (sgnature_size == 0)
{
WriteEngine::Token nullToken;
memcpy( &token, &nullToken, 8 );
return NO_ERROR;
}
CommBlock cb;
cb.file.oid = m_dctnryOID;
cb.file.pFile = m_dFile;
size = sgnature_size;
value = (unsigned char*)sgnature_value;
token.bc = 0;
// Walk forward block by block from the last block in use. m_numBlocks may
// grow inside the loop when the extent is expanded or a new one is added.
for (i = m_lastFbo; i < m_numBlocks; i++)
{
// @bug 3960: Add MAX_OP_COUNT check to handle case after bulk rollback
if( ((m_freeSpace>= (size + m_totalHdrBytes)) ||
((size > 8176) && (m_freeSpace > m_totalHdrBytes))) &&
(m_curOp < (MAX_OP_COUNT-1)) )
{ // found the perfect block; signature size fit in this block
// Write as much as fits here, keeping m_totalHdrBytes in reserve.
if (size > (m_freeSpace - m_totalHdrBytes))
{
write_size = (m_freeSpace - m_totalHdrBytes);
}
else
{
write_size = size;
}
// Header first: it sets m_newStartOffset, which insertSgnture() writes to.
insertDctnryHdr(m_curBlock.data, write_size);
insertSgnture(m_curBlock.data, write_size, value);
size -= write_size;
value += write_size;
m_curBlock.state = BLK_WRITE;
// We only want the start LBID for a multi-block dict in the token
if (!lbid_in_token)
{
token.fbo = m_curLbid;
token.op = m_curOp;
lbid_in_token = true;
}
if (size > 0)
token.bc++;
m_lastFbo = i;
m_curFbo = m_lastFbo;
// Everything written and the block can still take entries: done.
// The block stays in memory (state BLK_WRITE) and is not flushed here.
if ((m_curOp < (MAX_OP_COUNT-1)) && (size <= 0))
return NO_ERROR;
}//end Found
// Reached when the signature did not fit, is only partially written, or
// the block hit its op limit: flush this block and start an empty one.
//@bug 3832. check error code
RETURN_ON_ERROR( writeDBFile(cb, &m_curBlock, m_curLbid) );
memset( m_curBlock.data, 0, sizeof(m_curBlock.data));
memcpy( m_curBlock.data, &m_dctnryHeader2, m_totalHdrBytes);
m_freeSpace = BYTE_PER_BLOCK - m_totalHdrBytes;
m_curBlock.state = BLK_WRITE;
m_curOp =0;
m_lastFbo++;
m_curFbo = m_lastFbo;
//...Expand current extent if it is an abbreviated initial extent
if ((m_curFbo == m_numBlocks) &&
(m_numBlocks == NUM_BLOCKS_PER_INITIAL_EXTENT))
{
RETURN_ON_ERROR( expandDctnryExtent() );
}
//...Allocate a new extent if we have reached the last block in the
// current extent.
if (m_curFbo == m_numBlocks)
{//last block
//for roll back the extent to use
//Save those empty extents in case of failure to rollback
std::vector<ExtentInfo> dictExtentInfo;
ExtentInfo info;
info.oid = m_dctnryOID;
info.partitionNum = m_partition;
info.segmentNum = m_segment;
info.dbRoot = m_dbRoot;
info.hwm = m_hwm;
info.newFile = false;
dictExtentInfo.push_back (info);
LBID_t startLbid;
// Add an extent.
rc = createDctnry(m_dctnryOID,
0, // dummy column width
m_dbRoot,
m_partition,
m_segment,
startLbid,
false) ;
if ( rc != NO_ERROR )
{
//roll back the extent
BRMWrapper::getInstance()->deleteEmptyDictStoreExtents(
dictExtentInfo);
return rc;
}
}
// Look up the LBID of the new current block.
RETURN_ON_ERROR( BRMWrapper::getInstance()->getBrmInfo(m_dctnryOID,
m_partition, m_segment,
m_curFbo, m_curLbid) );
m_curBlock.lbid = m_curLbid;
}//end for loop for all of the blocks
// Ran out of blocks without completing the insert.
return ERR_DICT_NO_SPACE_INSERT;
}
/*******************************************************************************
* Description
* Update the block header (and data members like m_freeSpace,
* m_newStartOffset, etc), to reflect the insertion of string of size "size"
*
* PARAMETERS:
* input
* blockBuf
* --the block buffer
* input
* size
* --Size of the signature value
*
* RETURN:
* none
******************************************************************************/
void Dctnry::insertDctnryHdr( unsigned char* blockBuf,
                              const int& size)
{
    // Header slots involved, relative to the first offset slot (START_HDR1):
    //   prevOffsetLoc - offset slot of the previously stored value (for the
    //                   first value in a block this is the "offset zero"
    //                   entry written with the empty-block header)
    //   newOffsetLoc  - offset slot for the value being added; it currently
    //                   holds the end-of-header marker and is overwritten
    //   endHdrLoc     - new position of the end-of-header marker
    const int prevOffsetLoc = START_HDR1 + (m_curOp-1)*HDR_UNIT_SIZE;
    const int newOffsetLoc  = START_HDR1 + m_curOp*HDR_UNIT_SIZE;
    const int endHdrLoc     = START_HDR1 + (m_curOp+1)*HDR_UNIT_SIZE;

    // The value itself plus one more offset slot come out of the free space.
    m_freeSpace -= (size + HDR_UNIT_SIZE);

    // Move the end-of-header marker one slot further.
    memcpy(&blockBuf[endHdrLoc], &m_endHeader, HDR_UNIT_SIZE);

    // Read the previous value's start offset. Use memcpy rather than casting
    // the byte buffer to uint16_t*: the cast-and-dereference form violates
    // the aliasing rules and assumes the location is suitably aligned.
    uint16_t lastOffset = 0;
    memcpy(&lastOffset, &blockBuf[prevOffsetLoc], sizeof(lastOffset));

    // Values are laid out from the end of the block towards the front, so
    // the new value starts "size" bytes before the previous one.
    const uint16_t nextOffset = lastOffset - size;

    memcpy(&blockBuf[0], &m_freeSpace, HDR_UNIT_SIZE);
    memcpy(&blockBuf[newOffsetLoc], &nextOffset, HDR_UNIT_SIZE);

    // insertSgnture() copies the value to this position.
    m_newStartOffset = nextOffset;
    m_curOp++;
}
/*******************************************************************************
 * DESCRIPTION:
 *    Copy the specified string into the block buffer at m_newStartOffset.
 *
 * PARAMETERS:
 *    input  blockBuf -- block buffer that receives the string
 *    input  size     -- size in bytes of the signature value
 *    input  value    -- the signature bytes to copy
 *
 * RETURN:
 *    none
 ******************************************************************************/
void Dctnry::insertSgnture(unsigned char* blockBuf,
                           const int& size, unsigned char*value)
{
    // m_newStartOffset is assigned by insertDctnryHdr(), so the header for
    // this string must be inserted before the string itself.
    unsigned char* const dest = blockBuf + m_newStartOffset;
    memcpy(dest, value, size);
}
/*******************************************************************************
* Description:
* get the op count for a block
* input
* DataBlock& fileBlock -- the file block
* output
* op_count - total op count
******************************************************************************/
void Dctnry::getBlockOpCount( const DataBlock &fileBlock, int & op_count)
{
ByteStream bs;
ByteStream::byte inbuf[BYTE_PER_BLOCK];
memcpy(inbuf, fileBlock.data , BYTE_PER_BLOCK);
bs.load(inbuf, BYTE_PER_BLOCK);
ByteStream::doublebyte offset;
ByteStream::doublebyte dbyte;
bs >> m_freeSpace;
bs >> dbyte;
bs >> dbyte;
bs >> dbyte;
bs >> dbyte;
bs >> dbyte;
idbassert(dbyte == BYTE_PER_BLOCK);
bs >> offset;
while (offset < 0xffff)
{
op_count++;
bs >> offset;
}
}
/*******************************************************************************
* Description:
* Loads the string cache from the specified DataBlock, which should be
* the first block in the applicable dictionary store file.
* input
* DataBlock& fileBlock -- the file block
******************************************************************************/
void Dctnry::preLoadStringCache( const DataBlock& fileBlock )
{
int hdrOffsetBeg = HDR_UNIT_SIZE + NEXT_PTR_BYTES + HDR_UNIT_SIZE;
int hdrOffsetEnd = HDR_UNIT_SIZE + NEXT_PTR_BYTES;
uint16_t offBeg = 0;
uint16_t offEnd = 0;
memcpy( &offBeg, &fileBlock.data[hdrOffsetBeg], HDR_UNIT_SIZE );
memcpy( &offEnd, &fileBlock.data[hdrOffsetEnd], HDR_UNIT_SIZE );
int op = 1; // ordinal position of the string within the block
Signature aSig;
memset( &aSig, 0, sizeof(Signature));
while ((offBeg != DCTNRY_END_HEADER) &&
(op <= MAX_STRING_CACHE_SIZE))
{
unsigned int len = offEnd - offBeg;
aSig.size = len;
aSig.signature = new unsigned char[len];
memcpy(aSig.signature, &fileBlock.data[offBeg], len);
aSig.token.op = op;
aSig.token.fbo = m_curLbid;
m_sigArray[op-1] = aSig;
offEnd = offBeg;
hdrOffsetBeg += HDR_UNIT_SIZE;
memcpy( &offBeg, &fileBlock.data[hdrOffsetBeg], HDR_UNIT_SIZE );
op++;
}
m_arraySize = op - 1;
//std::cout << "Preloading strings..." << std::endl;
//char strSig[1000];
//uint64_t tokenVal;
//for (int i=0; i<m_arraySize; i++)
//{
// memcpy(strSig, m_sigArray[i].signature, m_sigArray[i].size );
// memcpy(&tokenVal, &m_sigArray[i].token, sizeof(uint64_t));
// strSig[m_sigArray[i].size] = '\0';
// std::cout << "op-" << m_sigArray[i].token.op <<
// "; fbo-" << m_sigArray[i].token.fbo <<
// "; sig-" << strSig <<
// "; token-" << tokenVal << std::endl;
//}
}
/*******************************************************************************
* Description:
* Add the specified signature (string) to the string cache.
* input
* newSig -- Signature string to be added to the string cache.
******************************************************************************/
void Dctnry::addToStringCache( const Signature& newSig )
{
Signature asig;
memset(&asig, 0, sizeof(Signature));
asig.signature = new unsigned char[newSig.size];
memcpy(asig.signature, newSig.signature, newSig.size );
asig.size = newSig.size;
asig.token = newSig.token;
m_sigArray[m_arraySize]=asig;
m_arraySize++;
}
/*******************************************************************************
 * DESCRIPTION:
 *    Read block "lbid" from the given file and locate the end of its header.
 *    Side effects: m_dFile is set to dFile; on a successful read
 *    m_dctnryHeader, m_freeSpace and m_nextPtr are refreshed from the block.
 *
 * PARAMETERS:
 *    input   dFile -- file handle
 *    input   lbid  -- block of interest
 *    output  endOp -- ordinal position of the end of header for "lbid"
 *                     (left at 1 when the block cannot be read)
 *
 * RETURN:
 *    The return code of readSubBlockEntry(). On a read failure the block is
 *    not parsed.
 ******************************************************************************/
int Dctnry::getEndOp(IDBDataFile* dFile, int lbid, int &endOp)
{
    DataBlock fileBlock;
    Offset    newOffset;
    CommBlock cb;

    cb.file.oid   = m_dctnryOID;
    cb.file.pFile = dFile;
    memset( fileBlock.data, 0, sizeof(fileBlock.data));
    m_dFile = dFile;

    // Give the output a defined value on every path.
    endOp = 1; //should be zero counting the end of header then

    const int rc = readSubBlockEntry( cb, &fileBlock, lbid, 0, 0,
                                      HDR_UNIT_SIZE  +
                                      NEXT_PTR_BYTES +
                                      HDR_UNIT_SIZE  +
                                      HDR_UNIT_SIZE,
                                      &m_dctnryHeader);

    // Do not parse a block that could not be read: its buffer is still
    // zero-filled, contains no end-of-header marker, and scanning it would
    // run off the end of the buffer.
    if (rc != NO_ERROR)
    {
        return rc;
    }

    memcpy(&m_freeSpace, &fileBlock.data[0], HDR_UNIT_SIZE);
    memcpy(&m_nextPtr, &fileBlock.data[HDR_UNIT_SIZE], NEXT_PTR_BYTES);

    // First string-offset entry of the header.
    newOffset.hdrLoc = HDR_UNIT_SIZE + NEXT_PTR_BYTES + HDR_UNIT_SIZE ;
    memcpy(&newOffset.offset, &fileBlock.data[newOffset.hdrLoc], HDR_UNIT_SIZE);

    // Step through the offset entries until the end-of-header marker.
    while ( newOffset.offset != DCTNRY_END_HEADER)
    {
        newOffset.hdrLoc += HDR_UNIT_SIZE;

        // Never read past the block, even if its header is corrupt and has
        // no end marker.
        // NOTE(review): a header without a terminator currently ends the
        // scan silently; consider returning a dedicated error code here.
        if (static_cast<size_t>(newOffset.hdrLoc) + HDR_UNIT_SIZE >
                sizeof(fileBlock.data))
        {
            break;
        }

        memcpy(&newOffset.offset, &fileBlock.data[newOffset.hdrLoc],
               HDR_UNIT_SIZE);
        endOp++;
    }

    return rc;
}
/*******************************************************************************
* Add a signature value to the dictionary store.
* Function first checks to see if the signature is already
* in our string cache, and returns the corresponding token
* if it is found in the cache.
******************************************************************************/
int Dctnry::updateDctnry(unsigned char* sigValue, int& sigSize,
Token& token)
{
int rc = NO_ERROR;
Signature sig;
sig.signature = sigValue;
sig.size = sigSize;
// Look for string in cache
// As long as the string <= 8000 bytes
if (sigSize <= MAX_SIGNATURE_SIZE)
{
bool found = false;
found = getTokenFromArray(sig);
if (found)
{
token = sig.token;
return NO_ERROR;
}
}
//Insert into Dictionary
rc = insertDctnry(sigSize, sigValue, token);
//Add the new signature and token into cache
//As long as the string is <= 8000 bytes
if ((m_arraySize < MAX_STRING_CACHE_SIZE) &&
(sigSize <= MAX_SIGNATURE_SIZE))
{
Signature sig;
sig.size = sigSize;
sig.signature = new unsigned char[sigSize];
memcpy (sig.signature, sigValue, sigSize);
sig.token = token;
m_sigArray[m_arraySize]=sig;
m_arraySize++;
}
return rc;
}
/*******************************************************************************
 * create dictionary file
 *    Forwards to openFile() with the caller-supplied name, mode and I/O buffer
 *    size; the unnamed int parameter is ignored.
 ******************************************************************************/
IDBDataFile* Dctnry::createDctnryFile(
    const char *name, int, const char *mode, int ioBuffSize)
{
    IDBDataFile* const pFile = openFile(name, mode, ioBuffSize, false);
    return pFile;
}
/*******************************************************************************
 * open dictionary file
 *    Opens the segment file identified by m_dctnryOID, m_dbRoot, m_partition
 *    and m_segment in "r+b" mode and returns the handle from openFile().
 ******************************************************************************/
// @bug 5572 - HDFS usage: add *.tmp file backup flag
IDBDataFile* Dctnry::openDctnryFile(bool useTmpSuffix)
{
    IDBDataFile* const pFile = openFile(m_dctnryOID,
                                        m_dbRoot,
                                        m_partition,
                                        m_segment,
                                        m_segFileName,
                                        "r+b",
                                        DEFAULT_COLSIZ,
                                        useTmpSuffix);
    return pFile;
}
/*******************************************************************************
 * close dictionary file
 *    Closes m_dFile through closeFile() and clears the member afterwards.
 *    Neither doFlush nor oids is consulted by this implementation.
 ******************************************************************************/
void Dctnry::closeDctnryFile(bool doFlush, std::map<FID,FID> & oids)
{
    (void) doFlush;   // unused
    (void) oids;      // unused

    closeFile(m_dFile);
    m_dFile = NULL;   // the handle must not be reused after closing
}
/*******************************************************************************
 * Return the number of whole BYTE_PER_BLOCK-sized blocks in m_dFile.
 * The return code of getFileSize() is ignored; the byte count starts at 0,
 * so 0 is returned unless getFileSize() stores a size.
 ******************************************************************************/
int Dctnry::numOfBlocksInFile()
{
    long long totalBytes = 0;

    //dmc-error handling (ignoring rc)
    getFileSize(m_dFile, totalBytes);

    return totalBytes / BYTE_PER_BLOCK;
}
/*******************************************************************************
 * Copy m_totalHdrBytes bytes of m_dctnryHeader2 into "buf".
 * The caller must supply a buffer of at least m_totalHdrBytes bytes.
 ******************************************************************************/
void Dctnry::copyDctnryHeader(void* buf)
{
    const void* const hdrBytes = m_dctnryHeader2;
    memcpy(buf, hdrBytes, m_totalHdrBytes);
}
} //end of namespace