1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-10-24 10:12:58 +03:00
Files
mariadb-columnstore-engine/utils/compress/idbcompress.cpp
2016-01-06 14:08:59 -06:00

499 lines
16 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/******************************************************************************************
* $Id: idbcompress.cpp 3907 2013-06-18 13:32:46Z dcathey $
*
******************************************************************************************/
#include <cstring>
#include <iostream>
#include <stdexcept>
using namespace std;
#include "blocksize.h"
#include "logger.h"
#include "snappy.h"
#include "hasher.h"
#include "version1.h"
#define IDBCOMP_DLLEXPORT
#include "idbcompress.h"
#undef IDBCOMP_DLLEXPORT
namespace
{
const uint64_t MAGIC_NUMBER = 0xfdc119a384d0778eULL;
const uint64_t VERSION_NUM1 = 1;
const uint64_t VERSION_NUM2 = 2;
const int COMPRESSED_CHUNK_INCREMENT_SIZE = 8192;
const int PTR_SECTION_OFFSET = compress::IDBCompressInterface::HDR_BUF_LEN;
// version 1.1 of the chunk data has a short header
// QuickLZ compressed data never has the high bit set on the first byte
const uint8_t CHUNK_MAGIC1 = 0xff;
const int SIG_OFFSET = 0;
const int CHECKSUM_OFFSET = 1;
const int LEN_OFFSET = 5;
const unsigned HEADER_SIZE = 9;
/* version 1.2 of the chunk data changes the hash function used to calculate
* checksums. We can no longer use the algorithm used in ver 1.1. Everything
* else is the same
*/
const uint8_t CHUNK_MAGIC2 = 0xfe;
/* version 2.0 of the chunk data uses a new compression algo. For us, because of
* the finite number of block sizes we compress, the first byte of the compressed
* data will always be 0x80, so it can't be confused with V1.0 data (that has no
* header).
*/
const uint8_t CHUNK_MAGIC3 = 0xfd;
struct CompressedDBFileHeader
{
uint64_t fMagicNumber;
uint64_t fVersionNum;
uint64_t fCompressionType;
uint64_t fHeaderSize;
uint64_t fBlockCount;
};
// Make the header to be 4K, regardless number of fields being defined/used in header.
union CompressedDBFileHeaderBlock
{
CompressedDBFileHeader fHeader;
char fDummy[compress::IDBCompressInterface::HDR_BUF_LEN];
};
void initCompressedDBFileHeader(void* hdrBuf, int compressionType, int hdrSize)
{
CompressedDBFileHeaderBlock* hdr = reinterpret_cast<CompressedDBFileHeaderBlock*>(hdrBuf);
hdr->fHeader.fMagicNumber = MAGIC_NUMBER;
hdr->fHeader.fVersionNum = VERSION_NUM2;
hdr->fHeader.fCompressionType = compressionType;
hdr->fHeader.fBlockCount = 0;
hdr->fHeader.fHeaderSize = hdrSize;
}
void log(const string &s)
{
logging::MessageLog logger((logging::LoggingID()));
logging::Message message;
logging::Message::Args args;
args.add(s);
message.format(args);
logger.logErrorMessage(message);
}
} // namespace
namespace compress
{
#ifndef SKIP_IDB_COMPRESSION
IDBCompressInterface::IDBCompressInterface(unsigned int numUserPaddingBytes) :
fNumUserPaddingBytes(numUserPaddingBytes)
{ }
IDBCompressInterface::~IDBCompressInterface()
{ }
/* V1 is really only available for decompression, we kill any DDL using V1 by hand.
* Maybe should have a new api, isDecompressionAvail() ? Any request to compress
* using V1 will silently be changed to V2.
*/
bool IDBCompressInterface::isCompressionAvail(int compressionType) const
{
if ( (compressionType == 0) ||
(compressionType == 1) ||
(compressionType == 2) )
return true;
return false;
}
//------------------------------------------------------------------------------
// Compress a block of data
//------------------------------------------------------------------------------
int IDBCompressInterface::compressBlock(const char* in,
const size_t inLen,
unsigned char* out,
unsigned int& outLen) const
{
size_t snaplen = 0;
utils::Hasher128 hasher;
// loose input checking.
if (outLen < snappy::MaxCompressedLength(inLen) + HEADER_SIZE)
{
cerr << "got outLen = " << outLen << " for inLen = " << inLen << ", needed " <<
(snappy::MaxCompressedLength(inLen) + HEADER_SIZE) << endl;
return ERR_BADOUTSIZE;
}
//apparently this never fails?
snappy::RawCompress(in, inLen, reinterpret_cast<char*>(&out[HEADER_SIZE]), &snaplen);
uint8_t *signature = (uint8_t *) &out[SIG_OFFSET];
uint32_t *checksum = (uint32_t *) &out[CHECKSUM_OFFSET];
uint32_t *len = (uint32_t *) &out[LEN_OFFSET];
*signature = CHUNK_MAGIC3;
*checksum = hasher((char *) &out[HEADER_SIZE], snaplen);
*len = snaplen;
//cerr << "cb: " << inLen << '/' << outLen << '/' << (snappy::MaxCompressedLength(inLen) + HEADER_SIZE) <<
// " : " << (snaplen + HEADER_SIZE) << endl;
outLen = snaplen + HEADER_SIZE;
return ERR_OK;
}
//------------------------------------------------------------------------------
// Decompress a block of data
//------------------------------------------------------------------------------
int IDBCompressInterface::uncompressBlock(const char* in, const size_t inLen, unsigned char* out,
unsigned int& outLen) const
{
bool comprc = false;
size_t ol = 0;
uint32_t realChecksum;
uint32_t storedChecksum;
uint32_t storedLen;
uint8_t storedMagic;
utils::Hasher128 hasher;
outLen = 0;
if (inLen < 1) {
return ERR_BADINPUT;
}
storedMagic = *((uint8_t *) &in[SIG_OFFSET]);
if (storedMagic == CHUNK_MAGIC3)
{
if (inLen < HEADER_SIZE) {
return ERR_BADINPUT;
}
storedChecksum = *((uint32_t *) &in[CHECKSUM_OFFSET]);
storedLen = *((uint32_t *) (&in[LEN_OFFSET]));
if (inLen < storedLen + HEADER_SIZE) {
return ERR_BADINPUT;
}
realChecksum = hasher(&in[HEADER_SIZE], storedLen);
if (storedChecksum != realChecksum) {
return ERR_CHECKSUM;
}
comprc = snappy::GetUncompressedLength(&in[HEADER_SIZE], storedLen, &ol) &&
snappy::RawUncompress(&in[HEADER_SIZE], storedLen, reinterpret_cast<char*>(out));
}
else if (storedMagic == CHUNK_MAGIC1 || storedMagic == CHUNK_MAGIC2)
{
if (inLen < HEADER_SIZE) {
return ERR_BADINPUT;
}
storedChecksum = *((uint32_t *) &in[CHECKSUM_OFFSET]);
storedLen = *((uint32_t *) (&in[LEN_OFFSET]));
if (inLen < storedLen + HEADER_SIZE) {
return ERR_BADINPUT;
}
/* We can no longer verify the checksum on ver 1.1 */
if (storedMagic == CHUNK_MAGIC2) {
realChecksum = hasher(&in[HEADER_SIZE], storedLen);
if (storedChecksum != realChecksum) {
return ERR_CHECKSUM;
}
}
try {
comprc = v1::decompress(&in[HEADER_SIZE], storedLen, out, &ol);
} catch (runtime_error& rex) {
//cerr << "decomp caught exception: " << rex.what() << endl;
ostringstream os;
os << "decomp caught exception: " << rex.what();
log(os.str());
comprc = false;
} catch (exception& ex) {
ostringstream os;
os << "decomp caught exception: " << ex.what();
log(os.str());
comprc = false;
} catch (...) {
comprc = false;
}
}
else if ((storedMagic & 0x80) != 0)
{
return ERR_BADINPUT;
}
else
{
comprc = v1::decompress(in, inLen, out, &ol);
}
if (!comprc)
{
cerr << "decomp failed!" << endl;
return ERR_DECOMPRESS;
}
outLen = ol;
//cerr << "ub: " << inLen << " : " << outLen << endl;
return ERR_OK;
}
//------------------------------------------------------------------------------
// Verify the passed in buffer contains a valid compression file header.
//------------------------------------------------------------------------------
int IDBCompressInterface::verifyHdr(const void* hdrBuf) const
{
const CompressedDBFileHeader* hdr = reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf);
if (hdr->fMagicNumber != MAGIC_NUMBER)
return -1;
if (!isCompressionAvail(hdr->fCompressionType))
return -2;
return 0;
}
//------------------------------------------------------------------------------
// Extract compression pointer information out of the pointer buffer that is
// passed in. ptrBuf points to the pointer section of the compression hdr.
//------------------------------------------------------------------------------
int IDBCompressInterface::getPtrList(const char* ptrBuf,
const int ptrBufSize,
CompChunkPtrList& chunkPtrs ) const
{
int rc = 0;
chunkPtrs.clear();
const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
for (unsigned int i = 0; (i < NUM_PTRS) && (rc == 0); i++)
{
if (ptrs[i+1] == 0) // 0 offset means end of data
break;
if (ptrs[i+1] > ptrs[i])
chunkPtrs.push_back(make_pair( ptrs[i], (ptrs[i+1]-ptrs[i])));
else
rc = -1;
}
return rc;
}
//------------------------------------------------------------------------------
// Extract compression pointer information out of the file compression hdr.
// Function assume that the file is a column file that has just two 4096-hdrs,
// one for the file header, and one for the list of pointers.
// Wrapper of above method for backward compatibility.
//------------------------------------------------------------------------------
int IDBCompressInterface::getPtrList(const char* hdrBuf, CompChunkPtrList& chunkPtrs ) const
{
return getPtrList(hdrBuf+HDR_BUF_LEN, HDR_BUF_LEN, chunkPtrs);
}
//------------------------------------------------------------------------------
// Count the number of chunk pointers in the pointer header(s)
//------------------------------------------------------------------------------
unsigned int IDBCompressInterface::getPtrCount(const char* ptrBuf,
const int ptrBufSize) const
{
unsigned int chunkCount = 0;
const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
for (unsigned int i = 0; i < NUM_PTRS; i++)
{
if (ptrs[i+1] == 0) // 0 offset means end of data
break;
chunkCount++;
}
return chunkCount;
}
//------------------------------------------------------------------------------
// Count the number of chunk pointers in the specified 8192 byte compression
// file header, which carries a single 4096 byte compression chunk header.
// This should not be used for compressed dictionary files which could have
// more compression chunk headers.
//------------------------------------------------------------------------------
unsigned int IDBCompressInterface::getPtrCount(const char* hdrBuf) const
{
return getPtrCount(hdrBuf+HDR_BUF_LEN, HDR_BUF_LEN);
}
//------------------------------------------------------------------------------
// Store list of compression pointers into the specified header.
//------------------------------------------------------------------------------
void IDBCompressInterface::storePtrs(const std::vector<uint64_t>& ptrs,
void* ptrBuf,
int ptrSectionSize) const
{
memset((ptrBuf), 0, ptrSectionSize); // reset the pointer section to 0
uint64_t* hdrPtrs = reinterpret_cast<uint64_t*>(ptrBuf);
for (unsigned i=0; i<ptrs.size(); i++)
{
hdrPtrs[i] = ptrs[i];
}
}
//------------------------------------------------------------------------------
// Wrapper of above method for backward compatibility
//------------------------------------------------------------------------------
void IDBCompressInterface::storePtrs(const std::vector<uint64_t>& ptrs, void* ptrBuf) const
{
storePtrs(ptrs, reinterpret_cast<char*>(ptrBuf) + HDR_BUF_LEN, HDR_BUF_LEN);
}
//------------------------------------------------------------------------------
// Initialize the header blocks to be written at the start of a column file.
//------------------------------------------------------------------------------
void IDBCompressInterface::initHdr(void* hdrBuf, int compressionType) const
{
memset(hdrBuf, 0, HDR_BUF_LEN*2);
initCompressedDBFileHeader(hdrBuf, compressionType, HDR_BUF_LEN*2);
}
//------------------------------------------------------------------------------
// Initialize the header blocks to be written at the start of a dictionary file.
//------------------------------------------------------------------------------
void IDBCompressInterface::initHdr(void* hdrBuf,void* ptrBuf,int compressionType,int hdrSize) const
{
memset(hdrBuf, 0, HDR_BUF_LEN);
memset(ptrBuf, 0, hdrSize - HDR_BUF_LEN);
initCompressedDBFileHeader(hdrBuf, compressionType, hdrSize);
}
//------------------------------------------------------------------------------
// Set the file's block count
//------------------------------------------------------------------------------
void IDBCompressInterface::setBlockCount(void* hdrBuf, uint64_t count) const
{
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fBlockCount = count;
}
//------------------------------------------------------------------------------
// Get the file's block count
//------------------------------------------------------------------------------
uint64_t IDBCompressInterface::getBlockCount(const void* hdrBuf) const
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fBlockCount);
}
//------------------------------------------------------------------------------
// Set the overall header size
//------------------------------------------------------------------------------
void IDBCompressInterface::setHdrSize(void* hdrBuf, uint64_t size) const
{
reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fHeaderSize = size;
}
//------------------------------------------------------------------------------
// Get the overall header size
//------------------------------------------------------------------------------
uint64_t IDBCompressInterface::getHdrSize(const void* hdrBuf) const
{
return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fHeaderSize);
}
//------------------------------------------------------------------------------
// Calculates the chunk and block offset within the chunk for the specified
// block number.
//------------------------------------------------------------------------------
void IDBCompressInterface::locateBlock(unsigned int block,
unsigned int& chunkIndex,
unsigned int& blockOffsetWithinChunk) const
{
const uint64_t BUFLEN = UNCOMPRESSED_INBUF_LEN;
uint64_t byteOffset = (uint64_t)block * BLOCK_SIZE;
uint64_t chunk = byteOffset / BUFLEN;
uint64_t blockInChunk = (byteOffset % BUFLEN) / BLOCK_SIZE;
chunkIndex = chunk;
blockOffsetWithinChunk = blockInChunk;
}
//------------------------------------------------------------------------------
// Round up the size of the buffer to the applicable compressed size increment,
// also expand to allow for user requested padding. Lastly, initialize padding
// bytes to 0.
//------------------------------------------------------------------------------
int IDBCompressInterface::padCompressedChunks(unsigned char* buf,
unsigned int& len,
unsigned int maxLen) const
{
int nPaddingBytes = 0;
int nRem = len % COMPRESSED_CHUNK_INCREMENT_SIZE;
if (nRem != 0)
{
nPaddingBytes = COMPRESSED_CHUNK_INCREMENT_SIZE - nRem;
}
nPaddingBytes = nPaddingBytes + fNumUserPaddingBytes;
if (nPaddingBytes > 0)
{
if ((len + nPaddingBytes) > maxLen)
return -1;
memset(buf+len, 0, nPaddingBytes);
len = len + nPaddingBytes;
}
return 0;
}
/* static */
uint64_t IDBCompressInterface::maxCompressedSize(uint64_t uncompSize)
{
return (snappy::MaxCompressedLength(uncompSize) + HEADER_SIZE);
}
int IDBCompressInterface::compress(const char *in, size_t inLen, char *out,
size_t *outLen) const
{
snappy::RawCompress(in, inLen, out, outLen);
return 0;
}
int IDBCompressInterface::uncompress(const char *in, size_t inLen, char *out) const
{
return !(snappy::RawUncompress(in, inLen, out));
}
/* static */
bool IDBCompressInterface::getUncompressedSize(char *in, size_t inLen, size_t *outLen)
{
return snappy::GetUncompressedLength(in, inLen, outLen);
}
#endif
} // namespace compress
// vim:ts=4 sw=4: