// 1
// 0
// mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-04-21 19:45:56 +03:00
// mariadb-columnstore-engine/writeengine/bulk/we_columninfocompressed.cpp
// 2022-01-21 16:43:49 +00:00

// 501 lines
// 18 KiB
// C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/******************************************************************************
* $Id: we_columninfocompressed.cpp 4726 2013-08-07 03:38:36Z bwilkinson $
*
*******************************************************************************/
#include "we_columninfocompressed.h"
#include "we_define.h"
#include "we_log.h"
#include "we_type.h"
#include "we_rbmetawriter.h"
#include "we_stats.h"
#include "we_tableinfo.h"
#include "idbcompress.h"
using namespace compress;
#include "IDBFileSystem.h"
#include <iostream>
namespace WriteEngine
{
//------------------------------------------------------------------------------
// ColumnInfoCompressed constructor
//
// Forwards all of its arguments to the ColumnInfo base class constructor and
// caches the RBMetaWriter obtained from pTableInfo in fRBMetaWriter.
// pTableInfo is dereferenced in the initializer list, so it must not be null.
//------------------------------------------------------------------------------
ColumnInfoCompressed::ColumnInfoCompressed(Log* logger, int idIn, const JobColumn& columnIn,
DBRootExtentTracker* pDBRootExtTrk, TableInfo* pTableInfo)
: // The RBMetaWriter is fetched from pTableInfo rather than passed as its own argument.
ColumnInfo(logger, idIn, columnIn, pDBRootExtTrk, pTableInfo)
, fRBMetaWriter(pTableInfo->rbMetaWriter())
{
}
//------------------------------------------------------------------------------
// ColumnInfoCompressed destructor
//
// This class performs no explicit cleanup of its own, so the compiler
// generated destructor body is sufficient.
//------------------------------------------------------------------------------
ColumnInfoCompressed::~ColumnInfoCompressed() = default;
//------------------------------------------------------------------------------
// Close the current compressed Column file after first compressing/flushing
// any remaining data, and re-writing the headers as well.
//------------------------------------------------------------------------------
int ColumnInfoCompressed::closeColumnFile(bool bCompletingExtent, bool bAbort)
{
int rc = NO_ERROR;
if (curCol.dataFile.pFile)
{
if (!bAbort)
{
// If we are opening and closing a file in order to add an extent as
// part of preliminary block skipping, then we won't have a Column-
// BufferManger object yet. One will be created when the file is
// reopened to begin importing.
if (fColBufferMgr)
{
rc = fColBufferMgr->finishFile(bCompletingExtent);
if (rc != NO_ERROR)
{
WErrorCodes ec;
std::ostringstream oss;
oss << "Error closing compressed file; OID-" << curCol.dataFile.fid << "; DBRoot-"
<< curCol.dataFile.fDbRoot << "; part-" << curCol.dataFile.fPartition << "; seg-"
<< curCol.dataFile.fSegment << "; " << ec.errorString(rc);
fLog->logMsg(oss.str(), rc, MSGLVL_ERROR);
bAbort = true;
}
}
}
ColumnInfo::closeColumnFile(bCompletingExtent, bAbort);
}
return rc;
}
//------------------------------------------------------------------------------
// Prepare the initial compressed column segment file for import.
//
// Reads the compression headers of the currently open segment file, creates
// the output buffer manager for the column, and initializes the raw
// (uncompressed) byte offsets used to track how much of the file is in use.
//
// oldHwm - HWM before any block skipping; only used to detect whether block
//          skipping pushed the HWM past the end of an abbreviated extent.
// hwm    - HWM at which the import starts; passed to the buffer manager and
//          used to compute the starting byte offset.
//
// Returns NO_ERROR on success, else the error code from reading the headers,
// from ColumnBufferManager::setDbFile(), or from expandAbbrevExtent().
//------------------------------------------------------------------------------
int ColumnInfoCompressed::setupInitialColumnFile(HWM oldHwm, HWM hwm)
{
  char hdr[compress::CompressInterface::HDR_BUF_LEN * 2];
  RETURN_ON_ERROR(colOp->readHeaders(curCol.dataFile.pFile, hdr));

  // Initialize the output buffer manager for the column.
  // Dictionary columns use a fixed width of 8 instead of column.width.
  WriteEngine::ColumnBufferManager* mgr = 0;

  if (column.colType == COL_TYPE_DICT)
    mgr = new ColumnBufferManagerDctnry(this, 8, fLog, column.compressionType);
  else
    mgr = new ColumnBufferManager(this, column.width, fLog, column.compressionType);

  // Until mgr is stored in fColBufferMgr nothing else owns it, so it must be
  // released here if attaching the file fails; otherwise it would leak.
  // NOTE(review): deleting through the base pointer assumes
  // ColumnBufferManager has a virtual destructor - confirm.
  const int setFileRc = mgr->setDbFile(curCol.dataFile.pFile, hwm, hdr);

  if (setFileRc != NO_ERROR)
  {
    delete mgr;
    return setFileRc;
  }

  fColBufferMgr = mgr;

  // The file is treated as abbreviated when its header block count matches
  // the number of blocks initially written to disk for this column width.
  int abbrevFlag = (compress::CompressInterface::getBlockCount(hdr) ==
                    uint64_t(INITIAL_EXTENT_ROWS_TO_DISK * column.width / BYTE_PER_BLOCK));
  setFileSize(hwm, abbrevFlag);

  // See if dealing with abbreviated extent that will need expanding.
  // This only applies to the first extent of the first segment file.
  setAbbrevExtentCheck();

  // If we are dealing with initial extent, see if block skipping has
  // exceeded disk allocation, in which case we expand to a full extent.
  if (isAbbrevExtent())
  {
    unsigned int numBlksForFirstExtent = (INITIAL_EXTENT_ROWS_TO_DISK * column.width) / BYTE_PER_BLOCK;

    if (((oldHwm + 1) <= numBlksForFirstExtent) && ((hwm + 1) > numBlksForFirstExtent))
    {
      RETURN_ON_ERROR(expandAbbrevExtent(false));
    }
  }

  // Store the current allocated file size in availFileSize.
  // Keep in mind, these are raw uncompressed offsets.
  // NOTE: We don't call setFileOffset() to set the file position in the
  // column segment file at this point; we wait till we load the compressed
  // buffer later on in ColumnBufferCompressed::initToBeCompressedBuffer()
  long long byteOffset = (long long)hwm * (long long)BYTE_PER_BLOCK;
  fSizeWritten = byteOffset;
  fSizeWrittenStart = fSizeWritten;
  availFileSize = fileSize - fSizeWritten;

  if (fLog->isDebug(DEBUG_1))
  {
    std::ostringstream oss;
    oss << "Init raw data offsets in compressed column file OID-" << curCol.dataFile.fid << "; DBRoot-"
        << curCol.dataFile.fDbRoot << "; part-" << curCol.dataFile.fPartition << "; seg-"
        << curCol.dataFile.fSegment << "; abbrev-" << abbrevFlag << "; begByte-" << fSizeWritten
        << "; endByte-" << fileSize << "; freeBytes-" << availFileSize;
    fLog->logMsg(oss.str(), MSGLVL_INFO2);
  }

  return NO_ERROR;
}
//------------------------------------------------------------------------------
// Reset the file offset bookkeeping to the point where a new extent starts,
// and (if a buffer manager exists) reinitialize its to-be-compressed buffer.
//
// hdr - compression header buffer handed to the buffer manager's setDbFile().
//
// Returns NO_ERROR on success, else the first error code reported by
// setDbFile(), resetToBeCompressedColBuf() or setFileOffset().
//------------------------------------------------------------------------------
int ColumnInfoCompressed::resetFileOffsetsNewExtent(const char* hdr)
{
  setFileSize(curCol.dataFile.hwm, false);

  // Raw (uncompressed) byte position corresponding to the current HWM.
  fSizeWritten = static_cast<long long>(curCol.dataFile.hwm) * static_cast<long long>(BYTE_PER_BLOCK);
  fSizeWrittenStart = fSizeWritten;
  availFileSize = fileSize - fSizeWritten;

  // While adding an extent as part of preliminary block skipping there is no
  // ColumnBufferManager yet.  That is fine: only the empty extent is being
  // added at that point, so there is no buffer to reset.
  if (!fColBufferMgr)
    return NO_ERROR;

  RETURN_ON_ERROR(fColBufferMgr->setDbFile(curCol.dataFile.pFile, curCol.dataFile.hwm, hdr));

  // Reinitialize the column buffer for the next extent; the buffer manager
  // reports back the file offset of the chunk being added or updated.
  long long chunkFileOffset;
  RETURN_ON_ERROR(fColBufferMgr->resetToBeCompressedColBuf(chunkFileOffset));

  // Position the segment file at that chunk.
  RETURN_ON_ERROR(colOp->setFileOffset(curCol.dataFile.pFile, chunkFileOffset));

  return NO_ERROR;
}
//------------------------------------------------------------------------------
// Back up the HWM chunk of this column's compressed dictionary store file, so
// that bulk rollback can restore it should an error occur later.
//
// needBackup - (out) set to the value returned by
//              RBMetaWriter::backupDctnryHWMChunk(); left false on error.
//              @bug 5572 - HDFS usage: flag used to control *.tmp file usage.
//
// Returns NO_ERROR, or the error code carried by a caught WeException (which
// is also logged).
//------------------------------------------------------------------------------
int ColumnInfoCompressed::saveDctnryStoreHWMChunk(bool& needBackup)
{
#ifdef PROFILE
  Stats::startParseEvent(WE_STATS_COMPRESS_DCT_BACKUP_CHUNK);
#endif

  int status = NO_ERROR;
  needBackup = false;

  try
  {
    needBackup = fRBMetaWriter->backupDctnryHWMChunk(column.dctnry.dctnryOid, curCol.dataFile.fDbRoot,
                                                     curCol.dataFile.fPartition, curCol.dataFile.fSegment);
  }
  catch (WeException& ex)
  {
    // Convert the exception into a logged error code for the caller.
    status = ex.errorCode();
    fLog->logMsg(ex.what(), status, MSGLVL_ERROR);
  }

#ifdef PROFILE
  Stats::stopParseEvent(WE_STATS_COMPRESS_DCT_BACKUP_CHUNK);
#endif

  return status;
}
//------------------------------------------------------------------------------
// Truncate the specified dictionary store file for this column to the nearest
// extent boundary.  Only applies to compressed columns.
//
// dctnryOid - OID of the dictionary store
// root      - DBRoot of the segment file
// pNum      - partition number of the segment file
// sNum      - segment number of the segment file
//
// Returns NO_ERROR on success (including the cases where no truncation is
// performed), else an error code describing the failed open/read/verify/
// parse/truncate step.  Every failure is logged.
//
// Design notes carried over from the original author:
// This function may logically belong in a dictionary related class, but it
// was kept here to avoid putting a bulk import specific function into
// DctnryCompress1 (a wrapper class shared with DML/DDL) or Dctnry.
//
// The implementation is not the most efficient one: the file is reopened to
// perform the truncation instead of being truncated before it is closed.
// The compressed chunks must be flushed before the truncation size can be
// determined, but ChunkManager flushChunks() closes the file and clears
// itself right after it flushes the data, so by the time control returns to
// the application it is too late to truncate.  Extending the ChunkManager API
// to support a flush without a close would avoid the reopen.
//------------------------------------------------------------------------------
int ColumnInfoCompressed::truncateDctnryStore(OID dctnryOid, uint16_t root, uint32_t pNum,
                                              uint16_t sNum) const
{
  // @bug5769 Don't initialize extents or truncate db files on HDFS
  if (idbdatafile::IDBPolicy::useHdfs())
  {
    // The file size is intentionally not logged here: when adding data to an
    // "existing" file on HDFS we are writing to a *.cdf.tmp file, so the size
    // would not be correct.
    std::ostringstream doneMsg;
    doneMsg << "Finished writing dictionary file"
               ": OID-"
            << dctnryOid << "; DBRoot-" << root << "; part-" << pNum << "; seg-" << sNum;
    fLog->logMsg(doneMsg.str(), MSGLVL_INFO2);
    return NO_ERROR;
  }

  int status = NO_ERROR;

  // Reopen the dictionary store file to see if it can/should be truncated
  // (to the nearest extent).
  std::string segFileName;
  IDBDataFile* storeFile = fTruncateDctnryFileOp.openFile(dctnryOid, root, pNum, sNum, segFileName);

  if (storeFile == 0)
  {
    status = ERR_FILE_OPEN;
    std::ostringstream errMsg;
    errMsg << "Error opening compressed dictionary store segment "
              "file for truncation"
           << ": OID-" << dctnryOid << "; DbRoot-" << root << "; partition-" << pNum << "; segment-"
           << sNum;
    fLog->logMsg(errMsg.str(), status, MSGLVL_ERROR);
    return status;
  }

  // --- Control header: read, then verify ---
  char controlHdr[CompressInterface::HDR_BUF_LEN];
  status = fTruncateDctnryFileOp.readFile(storeFile, reinterpret_cast<unsigned char*>(controlHdr),
                                          CompressInterface::HDR_BUF_LEN);

  if (status != NO_ERROR)
  {
    WErrorCodes errCodes;
    std::ostringstream errMsg;
    errMsg << "Error reading compressed dictionary store control hdr "
              "for truncation"
           << ": OID-" << dctnryOid << "; DbRoot-" << root << "; partition-" << pNum << "; segment-"
           << sNum << "; " << errCodes.errorString(status);
    fLog->logMsg(errMsg.str(), status, MSGLVL_ERROR);
    fTruncateDctnryFileOp.closeFile(storeFile);
    return status;
  }

  const int verifyRc = compress::CompressInterface::verifyHdr(controlHdr);

  if (verifyRc != 0)
  {
    status = ERR_COMP_VERIFY_HDRS;
    std::ostringstream errMsg;
    errMsg << "Error verifying compressed dictionary store ptr hdr "
              "for truncation"
           << ": OID-" << dctnryOid << "; DbRoot-" << root << "; partition-" << pNum << "; segment-"
           << sNum << "; (" << verifyRc << ")";
    fLog->logMsg(errMsg.str(), status, MSGLVL_ERROR);
    fTruncateDctnryFileOp.closeFile(storeFile);
    return status;
  }

  // No need to truncate if the dictionary file just contains a single
  // abbreviated extent.  Truncating up to the nearest extent would actually
  // grow the file (something we don't want), because a full extent has not
  // yet been reserved on disk for this dictionary store file.
  const int PSEUDO_COL_WIDTH = 8;
  const uint64_t blockCount = compress::CompressInterface::getBlockCount(controlHdr);

  if (blockCount == uint64_t(INITIAL_EXTENT_ROWS_TO_DISK * PSEUDO_COL_WIDTH / BYTE_PER_BLOCK))
  {
    std::ostringstream skipMsg;
    skipMsg << "Skip truncating abbreviated dictionary file"
               ": OID-"
            << dctnryOid << "; DBRoot-" << root << "; part-" << pNum << "; seg-" << sNum << "; blocks-"
            << blockCount;
    fLog->logMsg(skipMsg.str(), MSGLVL_INFO2);
    fTruncateDctnryFileOp.closeFile(storeFile);
    return NO_ERROR;
  }

  // --- Pointer header: read, then parse into the chunk pointer list ---
  const uint64_t hdrSize = compress::CompressInterface::getHdrSize(controlHdr);
  const uint64_t ptrHdrSize = hdrSize - CompressInterface::HDR_BUF_LEN;
  char* pointerHdr = new char[ptrHdrSize];

  status = fTruncateDctnryFileOp.readFile(storeFile, reinterpret_cast<unsigned char*>(pointerHdr), ptrHdrSize);

  if (status != NO_ERROR)
  {
    WErrorCodes errCodes;
    std::ostringstream errMsg;
    errMsg << "Error reading compressed dictionary store pointer hdr "
              "for truncation"
           << ": OID-" << dctnryOid << "; DbRoot-" << root << "; partition-" << pNum << "; segment-"
           << sNum << "; " << errCodes.errorString(status);
    fLog->logMsg(errMsg.str(), status, MSGLVL_ERROR);
    fTruncateDctnryFileOp.closeFile(storeFile);
    delete[] pointerHdr;
    return status;
  }

  CompChunkPtrList chunkPtrs;
  const int parseRc = compress::CompressInterface::getPtrList(pointerHdr, ptrHdrSize, chunkPtrs);
  delete[] pointerHdr;  // raw header no longer needed once parsed

  if (parseRc != 0)
  {
    status = ERR_COMP_PARSE_HDRS;
    std::ostringstream errMsg;
    errMsg << "Error parsing compressed dictionary store ptr hdr "
              "for truncation"
           << ": OID-" << dctnryOid << "; DbRoot-" << root << "; partition-" << pNum << "; segment-"
           << sNum << "; (" << parseRc << ")";
    fLog->logMsg(errMsg.str(), status, MSGLVL_ERROR);
    fTruncateDctnryFileOp.closeFile(storeFile);
    return status;
  }

  // --- Truncate the dictionary store file to the nearest extent ---
  if (!chunkPtrs.empty())
  {
    // End of the last chunk (offset + length) minus the headers gives the
    // number of data bytes currently in the file.
    long long dataBytes = chunkPtrs.back().first + chunkPtrs.back().second - hdrSize;
    const long long extentBytes = fRowsPerExtent * PSEUDO_COL_WIDTH;
    const long long partial = dataBytes % extentBytes;

    // Round up to the next extent boundary.
    if (partial > 0)
    {
      dataBytes = dataBytes - partial + extentBytes;
    }

    const long long truncateFileSize = dataBytes + hdrSize;

    std::ostringstream truncMsg;
    truncMsg << "Truncating dictionary file"
                ": OID-"
             << dctnryOid << "; DBRoot-" << root << "; part-" << pNum << "; seg-" << sNum << "; size-"
             << truncateFileSize;
    fLog->logMsg(truncMsg.str(), MSGLVL_INFO2);

    //@bug3913-Catch truncate to 0 bytes
    status = (truncateFileSize > 0) ? fTruncateDctnryFileOp.truncateFile(storeFile, truncateFileSize)
                                    : ERR_COMP_TRUNCATE_ZERO;

    if (status != NO_ERROR)
    {
      WErrorCodes errCodes;
      std::ostringstream errMsg;
      errMsg << "Error truncating compressed dictionary store file"
                ": OID-"
             << dctnryOid << "; DbRoot-" << root << "; partition-" << pNum << "; segment-" << sNum << "; "
             << errCodes.errorString(status);
      fLog->logMsg(errMsg.str(), status, MSGLVL_ERROR);
      fTruncateDctnryFileOp.closeFile(storeFile);
      return status;
    }
  }

  fTruncateDctnryFileOp.closeFile(storeFile);
  return NO_ERROR;
}
//------------------------------------------------------------------------------
// Fill out existing partial extent to extent boundary, so that we can resume
// inserting rows on an extent boundary basis. This use case should only take
// place when a DBRoot with a partial extent has been moved from one PM to
// another.
//------------------------------------------------------------------------------
int ColumnInfoCompressed::extendColumnOldExtent(uint16_t dbRootNext, uint32_t partitionNext,
uint16_t segmentNext, HWM hwmNextIn)
{
const unsigned int BLKS_PER_EXTENT = (fRowsPerExtent * column.width) / BYTE_PER_BLOCK;
// Round up HWM to the end of the current extent
unsigned int nBlks = hwmNextIn + 1;
unsigned int nRem = nBlks % BLKS_PER_EXTENT;
HWM hwmNext = 0;
if (nRem > 0)
hwmNext = nBlks - nRem + BLKS_PER_EXTENT - 1;
else
hwmNext = nBlks - 1;
std::ostringstream oss;
oss << "Padding compressed partial extent to extent boundary in OID-" << curCol.dataFile.fid << "; DBRoot-"
<< dbRootNext << "; part-" << partitionNext << "; seg-" << segmentNext << "; hwm-" << hwmNext;
fLog->logMsg(oss.str(), MSGLVL_INFO2);
curCol.dataFile.pFile = 0;
curCol.dataFile.fDbRoot = dbRootNext;
curCol.dataFile.fPartition = partitionNext;
curCol.dataFile.fSegment = segmentNext;
curCol.dataFile.hwm = hwmNext;
curCol.dataFile.fSegFileName.clear();
std::string segFileName;
std::string errTask;
int rc = colOp->fillCompColumnExtentEmptyChunks(curCol.dataFile.fid, curCol.colWidth, column.emptyVal,
curCol.dataFile.fDbRoot, curCol.dataFile.fPartition,
curCol.dataFile.fSegment, curCol.colDataType,
curCol.dataFile.hwm, segFileName, errTask);
if (rc != NO_ERROR)
{
WErrorCodes ec;
std::ostringstream oss;
oss << "extendColumnOldExtent: error padding extent (" << errTask << "); "
<< "column OID-" << curCol.dataFile.fid << "; DBRoot-" << curCol.dataFile.fDbRoot << "; part-"
<< curCol.dataFile.fPartition << "; seg-" << curCol.dataFile.fSegment << "; newHwm-"
<< curCol.dataFile.hwm << "; " << ec.errorString(rc);
fLog->logMsg(oss.str(), rc, MSGLVL_CRITICAL);
fpTableInfo->fBRMReporter.addToErrMsgEntry(oss.str());
return rc;
}
addToSegFileList(curCol.dataFile, hwmNext);
return NO_ERROR;
}
} // namespace WriteEngine