1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-04-20 09:07:44 +03:00
Denis Khalikov 2e2fc7d4a3 [MCOL-5106] Support 48 extents per file for rebuildEM.
This patch increases the number of extents per segment file
for rebuildEM tool.
2022-08-04 13:58:17 +03:00

675 lines
22 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/******************************************************************************************
* $Id: idbcompress.cpp 3907 2013-06-18 13:32:46Z dcathey $
*
******************************************************************************************/
#include <cstring>
#include <iostream>
#include <stdexcept>
#include <unordered_map>
using namespace std;
#include "blocksize.h"
#include "logger.h"
#include "snappy.h"
#include "hasher.h"
#include "mcsconfig.h"
#ifdef HAVE_LZ4
#include "lz4.h"
#else
// Taken from lz4.h.
#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */
#define LZ4_COMPRESSBOUND(isize) \
((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize) / 255) + 16)
#endif
#define IDBCOMP_DLLEXPORT
#include "idbcompress.h"
#undef IDBCOMP_DLLEXPORT
namespace
{
const uint64_t MAGIC_NUMBER = 0xfdc119a384d0778eULL;
const uint64_t VERSION_NUM3 = 3;
// version 1.1 of the chunk data has a short header
// QuickLZ compressed data never has the high bit set on the first byte
const int SIG_OFFSET = 0;
const int CHECKSUM_OFFSET = 1;
const int LEN_OFFSET = 5;
const unsigned HEADER_SIZE = 9;
// The max number of lbids to be stored in segment file.
const uint32_t LBID_MAX_SIZE = 48;
struct CompressedDBFileHeader
{
uint64_t fMagicNumber;
uint64_t fVersionNum;
uint64_t fCompressionType;
uint64_t fHeaderSize;
uint64_t fBlockCount;
uint64_t fColumnWidth;
execplan::CalpontSystemCatalog::ColDataType fColDataType;
uint64_t fLBIDCount;
uint64_t fLBIDS[LBID_MAX_SIZE];
};
// Make the header to be 4K, regardless number of fields being defined/used in header.
union CompressedDBFileHeaderBlock
{
CompressedDBFileHeader fHeader;
char fDummy[compress::CompressInterface::HDR_BUF_LEN];
};
void initCompressedDBFileHeader(void* hdrBuf, uint32_t columnWidth,
execplan::CalpontSystemCatalog::ColDataType colDataType, int compressionType,
int hdrSize)
{
CompressedDBFileHeaderBlock* hdr = reinterpret_cast<CompressedDBFileHeaderBlock*>(hdrBuf);
hdr->fHeader.fMagicNumber = MAGIC_NUMBER;
hdr->fHeader.fVersionNum = VERSION_NUM3;
hdr->fHeader.fCompressionType = compressionType;
hdr->fHeader.fBlockCount = 0;
hdr->fHeader.fHeaderSize = hdrSize;
hdr->fHeader.fColumnWidth = columnWidth;
hdr->fHeader.fColDataType = colDataType;
hdr->fHeader.fLBIDCount = 0;
std::memset(hdr->fHeader.fLBIDS, 0, sizeof(hdr->fHeader.fLBIDS));
}
} // namespace
namespace compress
{
#ifndef SKIP_IDB_COMPRESSION
CompressInterface::CompressInterface(unsigned int numUserPaddingBytes)
: fNumUserPaddingBytes(numUserPaddingBytes)
{
}
/* V1 is really only available for decompression, we kill any DDL using V1 by hand.
* Maybe should have a new api, isDecompressionAvail() ? Any request to compress
* using V1 will silently be changed to V2.
*/
/*static*/
bool CompressInterface::isCompressionAvail(int compressionType)
{
return ((compressionType == 0) || (compressionType == 1) || (compressionType == 2) ||
(compressionType == 3));
}
size_t CompressInterface::getMaxCompressedSizeGeneric(size_t inLen)
{
return std::max(snappy::MaxCompressedLength(inLen), LZ4_COMPRESSBOUND(inLen)) + HEADER_SIZE;
}
//------------------------------------------------------------------------------
// Compress a block of data
//------------------------------------------------------------------------------
int CompressInterface::compressBlock(const char* in, const size_t inLen, unsigned char* out,
size_t& outLen) const
{
size_t snaplen = 0;
utils::Hasher128 hasher;
// loose input checking.
if (outLen < maxCompressedSize(inLen))
{
cerr << "got outLen = " << outLen << " for inLen = " << inLen << ", needed " << (maxCompressedSize(inLen))
<< endl;
return ERR_BADOUTSIZE;
}
auto rc = compress(in, inLen, reinterpret_cast<char*>(&out[HEADER_SIZE]), &outLen);
if (rc != ERR_OK)
{
return rc;
}
snaplen = outLen;
uint8_t* signature = (uint8_t*)&out[SIG_OFFSET];
uint32_t* checksum = (uint32_t*)&out[CHECKSUM_OFFSET];
uint32_t* len = (uint32_t*)&out[LEN_OFFSET];
*signature = getChunkMagicNumber();
*checksum = hasher((char*)&out[HEADER_SIZE], snaplen);
*len = snaplen;
// cerr << "cb: " << inLen << '/' << outLen << '/' << (snappy::MaxCompressedLength(inLen) + HEADER_SIZE) <<
// " : " << (snaplen + HEADER_SIZE) << endl;
outLen = snaplen + HEADER_SIZE;
return ERR_OK;
}
//------------------------------------------------------------------------------
// Decompress a block of data
//------------------------------------------------------------------------------
int CompressInterface::uncompressBlock(const char* in, const size_t inLen, unsigned char* out,
size_t& outLen) const
{
uint32_t realChecksum;
uint32_t storedChecksum;
uint32_t storedLen;
uint8_t storedMagic;
utils::Hasher128 hasher;
auto tmpOutLen = outLen;
outLen = 0;
if (inLen < 1)
return ERR_BADINPUT;
storedMagic = *((uint8_t*)&in[SIG_OFFSET]);
if (storedMagic == getChunkMagicNumber())
{
if (inLen < HEADER_SIZE)
return ERR_BADINPUT;
storedChecksum = *((uint32_t*)&in[CHECKSUM_OFFSET]);
storedLen = *((uint32_t*)(&in[LEN_OFFSET]));
if (inLen < storedLen + HEADER_SIZE)
return ERR_BADINPUT;
realChecksum = hasher(&in[HEADER_SIZE], storedLen);
if (storedChecksum != realChecksum)
return ERR_CHECKSUM;
auto rc = uncompress(&in[HEADER_SIZE], storedLen, reinterpret_cast<char*>(out), &tmpOutLen);
if (rc != ERR_OK)
{
cerr << "uncompressBlock failed!" << endl;
return ERR_DECOMPRESS;
}
outLen = tmpOutLen;
}
else
{
// v1 compression or bad header
return ERR_BADINPUT;
}
// cerr << "ub: " << inLen << " : " << outLen << endl;
return ERR_OK;
}
//------------------------------------------------------------------------------
// Verify the passed in buffer contains a valid compression file header.
//------------------------------------------------------------------------------
int CompressInterface::verifyHdr(const void* hdrBuf)
{
const CompressedDBFileHeader* hdr = reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf);
if (hdr->fMagicNumber != MAGIC_NUMBER)
return -1;
if (!isCompressionAvail(hdr->fCompressionType))
return -2;
return 0;
}
//------------------------------------------------------------------------------
// Extract compression pointer information out of the pointer buffer that is
// passed in. ptrBuf points to the pointer section of the compression hdr.
//------------------------------------------------------------------------------
int CompressInterface::getPtrList(const char* ptrBuf, const int ptrBufSize, CompChunkPtrList& chunkPtrs)
{
int rc = 0;
chunkPtrs.clear();
const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
for (unsigned int i = 0; (i < NUM_PTRS) && (rc == 0); i++)
{
if (ptrs[i + 1] == 0) // 0 offset means end of data
break;
if (ptrs[i + 1] > ptrs[i])
chunkPtrs.push_back(make_pair(ptrs[i], (ptrs[i + 1] - ptrs[i])));
else
rc = -1;
}
return rc;
}
//------------------------------------------------------------------------------
// Extract compression pointer information out of the file compression hdr.
// Function assume that the file is a column file that has just two 4096-hdrs,
// one for the file header, and one for the list of pointers.
// Wrapper of above method for backward compatibility.
//------------------------------------------------------------------------------
int CompressInterface::getPtrList(const char* hdrBuf, CompChunkPtrList& chunkPtrs)
{
return getPtrList(hdrBuf + HDR_BUF_LEN, HDR_BUF_LEN, chunkPtrs);
}
//------------------------------------------------------------------------------
// Count the number of chunk pointers in the pointer header(s)
//------------------------------------------------------------------------------
unsigned int CompressInterface::getPtrCount(const char* ptrBuf, const int ptrBufSize)
{
unsigned int chunkCount = 0;
const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
for (unsigned int i = 0; i < NUM_PTRS; i++)
{
if (ptrs[i + 1] == 0) // 0 offset means end of data
break;
chunkCount++;
}
return chunkCount;
}
//------------------------------------------------------------------------------
// Count the number of chunk pointers in the specified 8192 byte compression
// file header, which carries a single 4096 byte compression chunk header.
// This should not be used for compressed dictionary files which could have
// more compression chunk headers.
//------------------------------------------------------------------------------
unsigned int CompressInterface::getPtrCount(const char* hdrBuf)
{
return getPtrCount(hdrBuf + HDR_BUF_LEN, HDR_BUF_LEN);
}
//------------------------------------------------------------------------------
// Store list of compression pointers into the specified header.
//------------------------------------------------------------------------------
void CompressInterface::storePtrs(const std::vector<uint64_t>& ptrs, void* ptrBuf, int ptrSectionSize)
{
memset((ptrBuf), 0, ptrSectionSize); // reset the pointer section to 0
uint64_t* hdrPtrs = reinterpret_cast<uint64_t*>(ptrBuf);
for (unsigned i = 0; i < ptrs.size(); i++)
{
hdrPtrs[i] = ptrs[i];
}
}
//------------------------------------------------------------------------------
// Wrapper of above method for backward compatibility
//------------------------------------------------------------------------------
void CompressInterface::storePtrs(const std::vector<uint64_t>& ptrs, void* ptrBuf)
{
storePtrs(ptrs, reinterpret_cast<char*>(ptrBuf) + HDR_BUF_LEN, HDR_BUF_LEN);
}
//------------------------------------------------------------------------------
// Initialize the header blocks to be written at the start of a dictionary file.
//------------------------------------------------------------------------------
void CompressInterface::initHdr(void* hdrBuf, void* ptrBuf, uint32_t colWidth,
execplan::CalpontSystemCatalog::ColDataType columnType, int compressionType,
int hdrSize)
{
memset(hdrBuf, 0, HDR_BUF_LEN);
memset(ptrBuf, 0, hdrSize - HDR_BUF_LEN);
initCompressedDBFileHeader(hdrBuf, colWidth, columnType, compressionType, hdrSize);
}
//------------------------------------------------------------------------------
// Initialize the header blocks to be written at the start of a column file.
//------------------------------------------------------------------------------
void CompressInterface::initHdr(void* hdrBuf, uint32_t columnWidth,
execplan::CalpontSystemCatalog::ColDataType columnType, int compressionType)
{
memset(hdrBuf, 0, HDR_BUF_LEN * 2);
initCompressedDBFileHeader(hdrBuf, columnWidth, columnType, compressionType, HDR_BUF_LEN * 2);
}
//------------------------------------------------------------------------------
// Get the header's version number
//------------------------------------------------------------------------------
uint64_t CompressInterface::getVersionNumber(const void* hdrBuf)
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fVersionNum);
}
//------------------------------------------------------------------------------
// Set the file's block count
//------------------------------------------------------------------------------
void CompressInterface::setBlockCount(void* hdrBuf, uint64_t count)
{
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fBlockCount = count;
}
//------------------------------------------------------------------------------
// Get the file's block count
//------------------------------------------------------------------------------
uint64_t CompressInterface::getBlockCount(const void* hdrBuf)
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fBlockCount);
}
//------------------------------------------------------------------------------
// Get the file's compression type
//------------------------------------------------------------------------------
uint64_t CompressInterface::getCompressionType(const void* hdrBuf)
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fCompressionType);
}
//------------------------------------------------------------------------------
// Set the overall header size
//------------------------------------------------------------------------------
void CompressInterface::setHdrSize(void* hdrBuf, uint64_t size)
{
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fHeaderSize = size;
}
//------------------------------------------------------------------------------
// Get the overall header size
//------------------------------------------------------------------------------
uint64_t CompressInterface::getHdrSize(const void* hdrBuf)
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fHeaderSize);
}
//------------------------------------------------------------------------------
// Get column type
//-----------------------------------------------------------------------------
execplan::CalpontSystemCatalog::ColDataType CompressInterface::getColDataType(const void* hdrBuf)
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fColDataType);
}
//------------------------------------------------------------------------------
// Get column width
//------------------------------------------------------------------------------
uint64_t CompressInterface::getColumnWidth(const void* hdrBuf)
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fColumnWidth);
}
//------------------------------------------------------------------------------
// Get LBID by index
//------------------------------------------------------------------------------
uint64_t CompressInterface::getLBIDByIndex(const void* hdrBuf, uint64_t index)
{
if (index < LBID_MAX_SIZE)
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fLBIDS[index]);
return 0;
}
//------------------------------------------------------------------------------
// Set LBID by index
//------------------------------------------------------------------------------
void CompressInterface::setLBIDByIndex(void* hdrBuf, uint64_t lbid, uint64_t index)
{
if (lbid && index < LBID_MAX_SIZE)
{
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fLBIDS[index] = lbid;
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fLBIDCount =
std::max(index + 1, reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fLBIDCount);
}
}
//------------------------------------------------------------------------------
// Get LBID count
//------------------------------------------------------------------------------
uint64_t CompressInterface::getLBIDCount(void* hdrBuf)
{
return reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fLBIDCount;
}
//------------------------------------------------------------------------------
// Calculates the chunk and block offset within the chunk for the specified
// block number.
//------------------------------------------------------------------------------
void CompressInterface::locateBlock(unsigned int block, unsigned int& chunkIndex,
unsigned int& blockOffsetWithinChunk) const
{
const uint64_t BUFLEN = UNCOMPRESSED_INBUF_LEN;
uint64_t byteOffset = (uint64_t)block * BLOCK_SIZE;
uint64_t chunk = byteOffset / BUFLEN;
uint64_t blockInChunk = (byteOffset % BUFLEN) / BLOCK_SIZE;
chunkIndex = chunk;
blockOffsetWithinChunk = blockInChunk;
}
//------------------------------------------------------------------------------
// Round up the size of the buffer to the applicable compressed size increment,
// also expand to allow for user requested padding. Lastly, initialize padding
// bytes to 0.
//------------------------------------------------------------------------------
int CompressInterface::padCompressedChunks(unsigned char* buf, size_t& len, unsigned int maxLen) const
{
int nPaddingBytes = 0;
int nRem = len % COMPRESSED_CHUNK_INCREMENT_SIZE;
if (nRem != 0)
{
nPaddingBytes = COMPRESSED_CHUNK_INCREMENT_SIZE - nRem;
}
nPaddingBytes = nPaddingBytes + fNumUserPaddingBytes;
if (nPaddingBytes > 0)
{
if ((len + nPaddingBytes) > maxLen)
return -1;
memset(buf + len, 0, nPaddingBytes);
len = len + nPaddingBytes;
}
return 0;
}
// Snappy
CompressInterfaceSnappy::CompressInterfaceSnappy(uint32_t numUserPaddingBytes)
: CompressInterface(numUserPaddingBytes)
{
}
int32_t CompressInterfaceSnappy::compress(const char* in, size_t inLen, char* out, size_t* outLen) const
{
snappy::RawCompress(in, inLen, out, outLen);
#ifdef DEBUG_COMPRESSION
std::cout << "Snappy::compress: inLen " << inLen << ", outLen " << *outLen << std::endl;
#endif
return ERR_OK;
}
int32_t CompressInterfaceSnappy::uncompress(const char* in, size_t inLen, char* out, size_t* outLen) const
{
size_t realOutLen = 0;
auto rc = snappy::GetUncompressedLength(in, inLen, &realOutLen);
if (!rc || realOutLen > *outLen)
{
cerr << "snappy::GetUncompressedLength failed. InLen: " << inLen << ", outLen: " << *outLen
<< ", realOutLen: " << realOutLen << endl;
return ERR_DECOMPRESS;
}
rc = snappy::RawUncompress(in, inLen, out);
if (!rc)
{
cerr << "snappy::RawUnompress failed. InLen: " << inLen << ", outLen: " << *outLen << endl;
return ERR_DECOMPRESS;
}
#ifdef DEBUG_COMPRESSION
std::cout << "Snappy::uncompress: inLen " << inLen << ", outLen " << *outLen << std::endl;
#endif
*outLen = realOutLen;
return ERR_OK;
}
size_t CompressInterfaceSnappy::maxCompressedSize(size_t uncompSize) const
{
return (snappy::MaxCompressedLength(uncompSize) + HEADER_SIZE);
}
bool CompressInterfaceSnappy::getUncompressedSize(char* in, size_t inLen, size_t* outLen) const
{
return snappy::GetUncompressedLength(in, inLen, outLen);
}
uint8_t CompressInterfaceSnappy::getChunkMagicNumber() const
{
return CHUNK_MAGIC_SNAPPY;
}
// LZ4
CompressInterfaceLZ4::CompressInterfaceLZ4(uint32_t numUserPaddingBytes)
: CompressInterface(numUserPaddingBytes)
{
}
int32_t CompressInterfaceLZ4::compress(const char* in, size_t inLen, char* out, size_t* outLen) const
{
#ifdef HAVE_LZ4
auto compressedLen = LZ4_compress_default(in, out, inLen, *outLen);
if (!compressedLen)
{
cerr << "LZ_compress_default failed. InLen: " << inLen << ", compressedLen: " << compressedLen << endl;
return ERR_COMPRESS;
}
#ifdef DEBUG_COMPRESSION
std::cout << "LZ4::compress: inLen " << inLen << ", comressedLen " << compressedLen << std::endl;
#endif
*outLen = compressedLen;
return ERR_OK;
#else
return ERR_COMPRESS;
#endif
}
int32_t CompressInterfaceLZ4::uncompress(const char* in, size_t inLen, char* out, size_t* outLen) const
{
#ifdef HAVE_LZ4
auto decompressedLen = LZ4_decompress_safe(in, out, inLen, *outLen);
if (decompressedLen < 0)
{
cerr << "LZ_decompress_safe failed with error code " << decompressedLen << endl;
cerr << "InLen: " << inLen << ", outLen: " << *outLen << endl;
return ERR_DECOMPRESS;
}
*outLen = decompressedLen;
#ifdef DEBUG_COMPRESSION
std::cout << "LZ4::uncompress: inLen " << inLen << ", outLen " << *outLen << std::endl;
#endif
return ERR_OK;
#else
return ERR_DECOMPRESS;
#endif
}
size_t CompressInterfaceLZ4::maxCompressedSize(size_t uncompSize) const
{
return (LZ4_COMPRESSBOUND(uncompSize) + HEADER_SIZE);
}
bool CompressInterfaceLZ4::getUncompressedSize(char* in, size_t inLen, size_t* outLen) const
{
// LZ4 does not have such function.
idbassert(false);
return false;
}
uint8_t CompressInterfaceLZ4::getChunkMagicNumber() const
{
return CHUNK_MAGIC_LZ4;
}
CompressInterface* getCompressInterfaceByType(uint32_t compressionType, uint32_t numUserPaddingBytes)
{
switch (compressionType)
{
case 1:
case 2: return new CompressInterfaceSnappy(numUserPaddingBytes);
case 3: return new CompressInterfaceLZ4(numUserPaddingBytes);
}
return nullptr;
}
CompressInterface* getCompressInterfaceByName(const std::string& compressionName,
uint32_t numUserPaddingBytes)
{
if (compressionName == "SNAPPY")
return new CompressInterfaceSnappy(numUserPaddingBytes);
else if (compressionName == "LZ4")
return new CompressInterfaceLZ4(numUserPaddingBytes);
return nullptr;
}
void initializeCompressorPool(
std::unordered_map<uint32_t, std::shared_ptr<CompressInterface>>& compressorPool,
uint32_t numUserPaddingBytes)
{
compressorPool = {
make_pair(2, std::shared_ptr<CompressInterface>(new CompressInterfaceSnappy(numUserPaddingBytes))),
make_pair(3, std::shared_ptr<CompressInterface>(new CompressInterfaceLZ4(numUserPaddingBytes)))};
}
std::shared_ptr<CompressInterface> getCompressorByType(
std::unordered_map<uint32_t, std::shared_ptr<CompressInterface>>& compressorPool,
uint32_t compressionType)
{
switch (compressionType)
{
case 1:
case 2:
if (!compressorPool.count(2))
{
return nullptr;
}
return compressorPool[2];
case 3:
if (!compressorPool.count(3))
{
return nullptr;
}
return compressorPool[3];
}
return nullptr;
}
#endif
} // namespace compress