// Mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
// (synced 2025-04-18 21:44:02 +03:00; 529 lines, 17 KiB, C++)
/* Copyright (C) 2014 InfiniDB, Inc.
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; version 2 of
|
|
the License.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
MA 02110-1301, USA. */
|
|
|
|
/******************************************************************************************
|
|
* $Id: idbcompress.cpp 3907 2013-06-18 13:32:46Z dcathey $
|
|
*
|
|
******************************************************************************************/
|
|
#include <cstring>
|
|
#include <iostream>
|
|
#include <stdexcept>
|
|
using namespace std;
|
|
|
|
#include "blocksize.h"
|
|
#include "logger.h"
|
|
#include "snappy.h"
|
|
#include "hasher.h"
|
|
#include "version1.h"
|
|
|
|
#define IDBCOMP_DLLEXPORT
|
|
#include "idbcompress.h"
|
|
#undef IDBCOMP_DLLEXPORT
|
|
|
|
namespace
|
|
{
|
|
const uint64_t MAGIC_NUMBER = 0xfdc119a384d0778eULL;
|
|
const uint64_t VERSION_NUM1 = 1;
|
|
const uint64_t VERSION_NUM2 = 2;
|
|
const int COMPRESSED_CHUNK_INCREMENT_SIZE = 8192;
|
|
const int PTR_SECTION_OFFSET = compress::IDBCompressInterface::HDR_BUF_LEN;
|
|
|
|
// version 1.1 of the chunk data has a short header
|
|
// QuickLZ compressed data never has the high bit set on the first byte
|
|
const uint8_t CHUNK_MAGIC1 = 0xff;
|
|
const int SIG_OFFSET = 0;
|
|
const int CHECKSUM_OFFSET = 1;
|
|
const int LEN_OFFSET = 5;
|
|
const unsigned HEADER_SIZE = 9;
|
|
|
|
/* version 1.2 of the chunk data changes the hash function used to calculate
|
|
* checksums. We can no longer use the algorithm used in ver 1.1. Everything
|
|
* else is the same
|
|
*/
|
|
const uint8_t CHUNK_MAGIC2 = 0xfe;
|
|
|
|
/* version 2.0 of the chunk data uses a new compression algo. For us, because of
|
|
* the finite number of block sizes we compress, the first byte of the compressed
|
|
* data will always be 0x80, so it can't be confused with V1.0 data (that has no
|
|
* header).
|
|
*/
|
|
const uint8_t CHUNK_MAGIC3 = 0xfd;
|
|
|
|
struct CompressedDBFileHeader
|
|
{
|
|
uint64_t fMagicNumber;
|
|
uint64_t fVersionNum;
|
|
uint64_t fCompressionType;
|
|
uint64_t fHeaderSize;
|
|
uint64_t fBlockCount;
|
|
};
|
|
|
|
// Make the header to be 4K, regardless number of fields being defined/used in header.
|
|
union CompressedDBFileHeaderBlock
|
|
{
|
|
CompressedDBFileHeader fHeader;
|
|
char fDummy[compress::IDBCompressInterface::HDR_BUF_LEN];
|
|
};
|
|
|
|
void initCompressedDBFileHeader(void* hdrBuf, int compressionType, int hdrSize)
|
|
{
|
|
CompressedDBFileHeaderBlock* hdr = reinterpret_cast<CompressedDBFileHeaderBlock*>(hdrBuf);
|
|
hdr->fHeader.fMagicNumber = MAGIC_NUMBER;
|
|
hdr->fHeader.fVersionNum = VERSION_NUM2;
|
|
hdr->fHeader.fCompressionType = compressionType;
|
|
hdr->fHeader.fBlockCount = 0;
|
|
hdr->fHeader.fHeaderSize = hdrSize;
|
|
}
|
|
|
|
void log(const string& s)
|
|
{
|
|
logging::MessageLog logger((logging::LoggingID()));
|
|
logging::Message message;
|
|
logging::Message::Args args;
|
|
|
|
args.add(s);
|
|
message.format(args);
|
|
logger.logErrorMessage(message);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
namespace compress
|
|
{
|
|
#ifndef SKIP_IDB_COMPRESSION
|
|
|
|
// Remember how many extra padding bytes the caller wants appended to every
// compressed chunk (consumed by padCompressedChunks()).
IDBCompressInterface::IDBCompressInterface(unsigned int numUserPaddingBytes)
    : fNumUserPaddingBytes(numUserPaddingBytes)
{
}
|
|
|
|
// Nothing to release.
IDBCompressInterface::~IDBCompressInterface()
{
}
|
|
|
|
/* V1 is really only available for decompression, we kill any DDL using V1 by hand.
|
|
* Maybe should have a new api, isDecompressionAvail() ? Any request to compress
|
|
* using V1 will silently be changed to V2.
|
|
*/
|
|
bool IDBCompressInterface::isCompressionAvail(int compressionType) const
|
|
{
|
|
if ( (compressionType == 0) ||
|
|
(compressionType == 1) ||
|
|
(compressionType == 2) )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Compress a block of data
|
|
//------------------------------------------------------------------------------
|
|
int IDBCompressInterface::compressBlock(const char* in,
|
|
const size_t inLen,
|
|
unsigned char* out,
|
|
unsigned int& outLen) const
|
|
{
|
|
size_t snaplen = 0;
|
|
utils::Hasher128 hasher;
|
|
|
|
// loose input checking.
|
|
if (outLen < snappy::MaxCompressedLength(inLen) + HEADER_SIZE)
|
|
{
|
|
cerr << "got outLen = " << outLen << " for inLen = " << inLen << ", needed " <<
|
|
(snappy::MaxCompressedLength(inLen) + HEADER_SIZE) << endl;
|
|
return ERR_BADOUTSIZE;
|
|
}
|
|
|
|
//apparently this never fails?
|
|
snappy::RawCompress(in, inLen, reinterpret_cast<char*>(&out[HEADER_SIZE]), &snaplen);
|
|
|
|
uint8_t* signature = (uint8_t*) &out[SIG_OFFSET];
|
|
uint32_t* checksum = (uint32_t*) &out[CHECKSUM_OFFSET];
|
|
uint32_t* len = (uint32_t*) &out[LEN_OFFSET];
|
|
*signature = CHUNK_MAGIC3;
|
|
*checksum = hasher((char*) &out[HEADER_SIZE], snaplen);
|
|
*len = snaplen;
|
|
|
|
//cerr << "cb: " << inLen << '/' << outLen << '/' << (snappy::MaxCompressedLength(inLen) + HEADER_SIZE) <<
|
|
// " : " << (snaplen + HEADER_SIZE) << endl;
|
|
|
|
outLen = snaplen + HEADER_SIZE;
|
|
|
|
return ERR_OK;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Decompress a block of data
|
|
//------------------------------------------------------------------------------
|
|
int IDBCompressInterface::uncompressBlock(const char* in, const size_t inLen, unsigned char* out,
|
|
unsigned int& outLen) const
|
|
{
|
|
bool comprc = false;
|
|
size_t ol = 0;
|
|
|
|
uint32_t realChecksum;
|
|
uint32_t storedChecksum;
|
|
uint32_t storedLen;
|
|
uint8_t storedMagic;
|
|
utils::Hasher128 hasher;
|
|
|
|
outLen = 0;
|
|
|
|
if (inLen < 1)
|
|
{
|
|
return ERR_BADINPUT;
|
|
}
|
|
|
|
storedMagic = *((uint8_t*) &in[SIG_OFFSET]);
|
|
|
|
if (storedMagic == CHUNK_MAGIC3)
|
|
{
|
|
if (inLen < HEADER_SIZE)
|
|
{
|
|
return ERR_BADINPUT;
|
|
}
|
|
|
|
storedChecksum = *((uint32_t*) &in[CHECKSUM_OFFSET]);
|
|
storedLen = *((uint32_t*) (&in[LEN_OFFSET]));
|
|
|
|
if (inLen < storedLen + HEADER_SIZE)
|
|
{
|
|
return ERR_BADINPUT;
|
|
}
|
|
|
|
realChecksum = hasher(&in[HEADER_SIZE], storedLen);
|
|
|
|
if (storedChecksum != realChecksum)
|
|
{
|
|
return ERR_CHECKSUM;
|
|
}
|
|
|
|
comprc = snappy::GetUncompressedLength(&in[HEADER_SIZE], storedLen, &ol) &&
|
|
snappy::RawUncompress(&in[HEADER_SIZE], storedLen, reinterpret_cast<char*>(out));
|
|
}
|
|
else if (storedMagic == CHUNK_MAGIC1 || storedMagic == CHUNK_MAGIC2)
|
|
{
|
|
if (inLen < HEADER_SIZE)
|
|
{
|
|
return ERR_BADINPUT;
|
|
}
|
|
|
|
storedChecksum = *((uint32_t*) &in[CHECKSUM_OFFSET]);
|
|
storedLen = *((uint32_t*) (&in[LEN_OFFSET]));
|
|
|
|
if (inLen < storedLen + HEADER_SIZE)
|
|
{
|
|
return ERR_BADINPUT;
|
|
}
|
|
|
|
/* We can no longer verify the checksum on ver 1.1 */
|
|
if (storedMagic == CHUNK_MAGIC2)
|
|
{
|
|
realChecksum = hasher(&in[HEADER_SIZE], storedLen);
|
|
|
|
if (storedChecksum != realChecksum)
|
|
{
|
|
return ERR_CHECKSUM;
|
|
}
|
|
}
|
|
|
|
try
|
|
{
|
|
comprc = v1::decompress(&in[HEADER_SIZE], storedLen, out, &ol);
|
|
}
|
|
catch (runtime_error& rex)
|
|
{
|
|
//cerr << "decomp caught exception: " << rex.what() << endl;
|
|
ostringstream os;
|
|
os << "decomp caught exception: " << rex.what();
|
|
log(os.str());
|
|
comprc = false;
|
|
}
|
|
catch (exception& ex)
|
|
{
|
|
ostringstream os;
|
|
os << "decomp caught exception: " << ex.what();
|
|
log(os.str());
|
|
comprc = false;
|
|
}
|
|
catch (...)
|
|
{
|
|
comprc = false;
|
|
}
|
|
}
|
|
else if ((storedMagic & 0x80) != 0)
|
|
{
|
|
return ERR_BADINPUT;
|
|
}
|
|
else
|
|
{
|
|
comprc = v1::decompress(in, inLen, out, &ol);
|
|
}
|
|
|
|
if (!comprc)
|
|
{
|
|
cerr << "decomp failed!" << endl;
|
|
return ERR_DECOMPRESS;
|
|
}
|
|
|
|
outLen = ol;
|
|
//cerr << "ub: " << inLen << " : " << outLen << endl;
|
|
|
|
return ERR_OK;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Verify the passed in buffer contains a valid compression file header.
|
|
//------------------------------------------------------------------------------
|
|
int IDBCompressInterface::verifyHdr(const void* hdrBuf) const
|
|
{
|
|
const CompressedDBFileHeader* hdr = reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf);
|
|
|
|
if (hdr->fMagicNumber != MAGIC_NUMBER)
|
|
return -1;
|
|
|
|
if (!isCompressionAvail(hdr->fCompressionType))
|
|
return -2;
|
|
|
|
return 0;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Extract compression pointer information out of the pointer buffer that is
|
|
// passed in. ptrBuf points to the pointer section of the compression hdr.
|
|
//------------------------------------------------------------------------------
|
|
int IDBCompressInterface::getPtrList(const char* ptrBuf,
|
|
const int ptrBufSize,
|
|
CompChunkPtrList& chunkPtrs ) const
|
|
{
|
|
int rc = 0;
|
|
chunkPtrs.clear();
|
|
|
|
const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
|
|
const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
|
|
|
|
for (unsigned int i = 0; (i < NUM_PTRS) && (rc == 0); i++)
|
|
{
|
|
if (ptrs[i + 1] == 0) // 0 offset means end of data
|
|
break;
|
|
|
|
if (ptrs[i + 1] > ptrs[i])
|
|
chunkPtrs.push_back(make_pair( ptrs[i], (ptrs[i + 1] - ptrs[i])));
|
|
else
|
|
rc = -1;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Extract compression pointer information out of the file compression hdr.
|
|
// Function assume that the file is a column file that has just two 4096-hdrs,
|
|
// one for the file header, and one for the list of pointers.
|
|
// Wrapper of above method for backward compatibility.
|
|
//------------------------------------------------------------------------------
|
|
int IDBCompressInterface::getPtrList(const char* hdrBuf, CompChunkPtrList& chunkPtrs ) const
|
|
{
|
|
return getPtrList(hdrBuf + HDR_BUF_LEN, HDR_BUF_LEN, chunkPtrs);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Count the number of chunk pointers in the pointer header(s)
|
|
//------------------------------------------------------------------------------
|
|
unsigned int IDBCompressInterface::getPtrCount(const char* ptrBuf,
|
|
const int ptrBufSize) const
|
|
{
|
|
unsigned int chunkCount = 0;
|
|
|
|
const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
|
|
const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
|
|
|
|
for (unsigned int i = 0; i < NUM_PTRS; i++)
|
|
{
|
|
if (ptrs[i + 1] == 0) // 0 offset means end of data
|
|
break;
|
|
|
|
chunkCount++;
|
|
}
|
|
|
|
return chunkCount;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Count the number of chunk pointers in the specified 8192 byte compression
|
|
// file header, which carries a single 4096 byte compression chunk header.
|
|
// This should not be used for compressed dictionary files which could have
|
|
// more compression chunk headers.
|
|
//------------------------------------------------------------------------------
|
|
unsigned int IDBCompressInterface::getPtrCount(const char* hdrBuf) const
|
|
{
|
|
return getPtrCount(hdrBuf + HDR_BUF_LEN, HDR_BUF_LEN);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Store list of compression pointers into the specified header.
|
|
//------------------------------------------------------------------------------
|
|
void IDBCompressInterface::storePtrs(const std::vector<uint64_t>& ptrs,
|
|
void* ptrBuf,
|
|
int ptrSectionSize) const
|
|
{
|
|
memset((ptrBuf), 0, ptrSectionSize); // reset the pointer section to 0
|
|
uint64_t* hdrPtrs = reinterpret_cast<uint64_t*>(ptrBuf);
|
|
|
|
for (unsigned i = 0; i < ptrs.size(); i++)
|
|
{
|
|
hdrPtrs[i] = ptrs[i];
|
|
}
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Wrapper of above method for backward compatibility
|
|
//------------------------------------------------------------------------------
|
|
void IDBCompressInterface::storePtrs(const std::vector<uint64_t>& ptrs, void* ptrBuf) const
|
|
{
|
|
storePtrs(ptrs, reinterpret_cast<char*>(ptrBuf) + HDR_BUF_LEN, HDR_BUF_LEN);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Initialize the header blocks to be written at the start of a column file.
|
|
//------------------------------------------------------------------------------
|
|
void IDBCompressInterface::initHdr(void* hdrBuf, int compressionType) const
|
|
{
|
|
memset(hdrBuf, 0, HDR_BUF_LEN * 2);
|
|
initCompressedDBFileHeader(hdrBuf, compressionType, HDR_BUF_LEN * 2);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Initialize the header blocks to be written at the start of a dictionary file.
|
|
//------------------------------------------------------------------------------
|
|
void IDBCompressInterface::initHdr(void* hdrBuf, void* ptrBuf, int compressionType, int hdrSize) const
|
|
{
|
|
memset(hdrBuf, 0, HDR_BUF_LEN);
|
|
memset(ptrBuf, 0, hdrSize - HDR_BUF_LEN);
|
|
initCompressedDBFileHeader(hdrBuf, compressionType, hdrSize);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Set the file's block count
|
|
//------------------------------------------------------------------------------
|
|
void IDBCompressInterface::setBlockCount(void* hdrBuf, uint64_t count) const
|
|
{
|
|
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fBlockCount = count;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Get the file's block count
|
|
//------------------------------------------------------------------------------
|
|
uint64_t IDBCompressInterface::getBlockCount(const void* hdrBuf) const
|
|
{
|
|
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fBlockCount);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Set the overall header size
|
|
//------------------------------------------------------------------------------
|
|
void IDBCompressInterface::setHdrSize(void* hdrBuf, uint64_t size) const
|
|
{
|
|
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fHeaderSize = size;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Get the overall header size
|
|
//------------------------------------------------------------------------------
|
|
uint64_t IDBCompressInterface::getHdrSize(const void* hdrBuf) const
|
|
{
|
|
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fHeaderSize);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Calculates the chunk and block offset within the chunk for the specified
|
|
// block number.
|
|
//------------------------------------------------------------------------------
|
|
void IDBCompressInterface::locateBlock(unsigned int block,
|
|
unsigned int& chunkIndex,
|
|
unsigned int& blockOffsetWithinChunk) const
|
|
{
|
|
const uint64_t BUFLEN = UNCOMPRESSED_INBUF_LEN;
|
|
|
|
uint64_t byteOffset = (uint64_t)block * BLOCK_SIZE;
|
|
uint64_t chunk = byteOffset / BUFLEN;
|
|
uint64_t blockInChunk = (byteOffset % BUFLEN) / BLOCK_SIZE;
|
|
|
|
chunkIndex = chunk;
|
|
blockOffsetWithinChunk = blockInChunk;
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Round up the size of the buffer to the applicable compressed size increment,
|
|
// also expand to allow for user requested padding. Lastly, initialize padding
|
|
// bytes to 0.
|
|
//------------------------------------------------------------------------------
|
|
int IDBCompressInterface::padCompressedChunks(unsigned char* buf,
|
|
unsigned int& len,
|
|
unsigned int maxLen) const
|
|
{
|
|
int nPaddingBytes = 0;
|
|
int nRem = len % COMPRESSED_CHUNK_INCREMENT_SIZE;
|
|
|
|
if (nRem != 0)
|
|
{
|
|
nPaddingBytes = COMPRESSED_CHUNK_INCREMENT_SIZE - nRem;
|
|
}
|
|
|
|
nPaddingBytes = nPaddingBytes + fNumUserPaddingBytes;
|
|
|
|
if (nPaddingBytes > 0)
|
|
{
|
|
if ((len + nPaddingBytes) > maxLen)
|
|
return -1;
|
|
|
|
memset(buf + len, 0, nPaddingBytes);
|
|
len = len + nPaddingBytes;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* static */
|
|
uint64_t IDBCompressInterface::maxCompressedSize(uint64_t uncompSize)
|
|
{
|
|
return (snappy::MaxCompressedLength(uncompSize) + HEADER_SIZE);
|
|
}
|
|
|
|
int IDBCompressInterface::compress(const char* in, size_t inLen, char* out,
|
|
size_t* outLen) const
|
|
{
|
|
snappy::RawCompress(in, inLen, out, outLen);
|
|
return 0;
|
|
}
|
|
|
|
int IDBCompressInterface::uncompress(const char* in, size_t inLen, char* out) const
|
|
{
|
|
return !(snappy::RawUncompress(in, inLen, out));
|
|
}
|
|
|
|
/* static */
|
|
bool IDBCompressInterface::getUncompressedSize(char* in, size_t inLen, size_t* outLen)
|
|
{
|
|
return snappy::GetUncompressedLength(in, inLen, outLen);
|
|
}
|
|
|
|
#endif
|
|
|
|
} // namespace compress
|
|
// vim:ts=4 sw=4:
|
|
|