mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git (synced 2025-07-30 19:23:07 +03:00)
the beginning
utils/compress/idbcompress.cpp (Normal file, 498 lines)
@@ -0,0 +1,498 @@
/* Copyright (C) 2014 InfiniDB, Inc.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; version 2 of
   the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   MA 02110-1301, USA. */

/******************************************************************************************
 * $Id: idbcompress.cpp 3907 2013-06-18 13:32:46Z dcathey $
 *
 ******************************************************************************************/
#include <cstring>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
using namespace std;

#include "blocksize.h"
#include "logger.h"
#include "snappy.h"
#include "hasher.h"
#include "version1.h"

#define IDBCOMP_DLLEXPORT
#include "idbcompress.h"
#undef IDBCOMP_DLLEXPORT

namespace
{
const uint64_t MAGIC_NUMBER = 0xfdc119a384d0778eULL;
const uint64_t VERSION_NUM1 = 1;
const uint64_t VERSION_NUM2 = 2;
const int COMPRESSED_CHUNK_INCREMENT_SIZE = 8192;
const int PTR_SECTION_OFFSET = compress::IDBCompressInterface::HDR_BUF_LEN;

// version 1.1 of the chunk data has a short header
// QuickLZ compressed data never has the high bit set on the first byte
const uint8_t CHUNK_MAGIC1 = 0xff;
const int SIG_OFFSET = 0;
const int CHECKSUM_OFFSET = 1;
const int LEN_OFFSET = 5;
const unsigned HEADER_SIZE = 9;
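// Resulting chunk header layout (implied by the offsets above): byte 0 holds the
// magic byte, bytes 1-4 the 32-bit checksum, bytes 5-8 the 32-bit compressed
// length, and the compressed payload starts at byte HEADER_SIZE (9).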

/* version 1.2 of the chunk data changes the hash function used to calculate
 * checksums. We can no longer use the algorithm used in ver 1.1. Everything
 * else is the same.
 */
const uint8_t CHUNK_MAGIC2 = 0xfe;

/* version 2.0 of the chunk data uses a new compression algo. For us, because of
 * the finite number of block sizes we compress, the first byte of the compressed
 * data will always be 0x80, so it can't be confused with V1.0 data (that has no
 * header).
 */
const uint8_t CHUNK_MAGIC3 = 0xfd;

struct CompressedDBFileHeader
{
    uint64_t fMagicNumber;
    uint64_t fVersionNum;
    uint64_t fCompressionType;
    uint64_t fHeaderSize;
    uint64_t fBlockCount;
};

// Make the header 4K, regardless of the number of fields defined/used in the header.
union CompressedDBFileHeaderBlock
{
    CompressedDBFileHeader fHeader;
    char fDummy[compress::IDBCompressInterface::HDR_BUF_LEN];
};

void initCompressedDBFileHeader(void* hdrBuf, int compressionType, int hdrSize)
{
    CompressedDBFileHeaderBlock* hdr = reinterpret_cast<CompressedDBFileHeaderBlock*>(hdrBuf);
    hdr->fHeader.fMagicNumber = MAGIC_NUMBER;
    hdr->fHeader.fVersionNum = VERSION_NUM2;
    hdr->fHeader.fCompressionType = compressionType;
    hdr->fHeader.fBlockCount = 0;
    hdr->fHeader.fHeaderSize = hdrSize;
}

void log(const string &s)
{
    logging::MessageLog logger((logging::LoggingID()));
    logging::Message message;
    logging::Message::Args args;

    args.add(s);
    message.format(args);
    logger.logErrorMessage(message);
}

} // namespace


namespace compress
{
#ifndef SKIP_IDB_COMPRESSION

IDBCompressInterface::IDBCompressInterface(unsigned int numUserPaddingBytes) :
    fNumUserPaddingBytes(numUserPaddingBytes)
{ }

IDBCompressInterface::~IDBCompressInterface()
{ }

/* V1 is really only available for decompression; we kill any DDL using V1 by hand.
 * Maybe there should be a new API, isDecompressionAvail()? Any request to compress
 * using V1 will silently be changed to V2.
 */
bool IDBCompressInterface::isCompressionAvail(int compressionType) const
{
    if ( (compressionType == 0) ||
         (compressionType == 1) ||
         (compressionType == 2) )
        return true;
    return false;
}

//------------------------------------------------------------------------------
// Compress a block of data
//------------------------------------------------------------------------------
int IDBCompressInterface::compressBlock(const char* in,
                                        const size_t inLen,
                                        unsigned char* out,
                                        unsigned int& outLen) const
{
    size_t snaplen = 0;
    utils::Hasher128 hasher;

    // loose input checking.
    if (outLen < snappy::MaxCompressedLength(inLen) + HEADER_SIZE)
    {
        cerr << "got outLen = " << outLen << " for inLen = " << inLen << ", needed " <<
            (snappy::MaxCompressedLength(inLen) + HEADER_SIZE) << endl;
        return ERR_BADOUTSIZE;
    }

    //apparently this never fails?
    snappy::RawCompress(in, inLen, reinterpret_cast<char*>(&out[HEADER_SIZE]), &snaplen);

    uint8_t *signature = (uint8_t *) &out[SIG_OFFSET];
    uint32_t *checksum = (uint32_t *) &out[CHECKSUM_OFFSET];
    uint32_t *len = (uint32_t *) &out[LEN_OFFSET];
    *signature = CHUNK_MAGIC3;
    *checksum = hasher((char *) &out[HEADER_SIZE], snaplen);
    *len = snaplen;

    //cerr << "cb: " << inLen << '/' << outLen << '/' << (snappy::MaxCompressedLength(inLen) + HEADER_SIZE) <<
    //    " : " << (snaplen + HEADER_SIZE) << endl;

    outLen = snaplen + HEADER_SIZE;

    return ERR_OK;
}

//------------------------------------------------------------------------------
// Decompress a block of data
//------------------------------------------------------------------------------
int IDBCompressInterface::uncompressBlock(const char* in, const size_t inLen, unsigned char* out,
                                          unsigned int& outLen) const
{
    bool comprc = false;
    size_t ol = 0;

    uint32_t realChecksum;
    uint32_t storedChecksum;
    uint32_t storedLen;
    uint8_t storedMagic;
    utils::Hasher128 hasher;

    outLen = 0;
    if (inLen < 1) {
        return ERR_BADINPUT;
    }
    storedMagic = *((uint8_t *) &in[SIG_OFFSET]);

    if (storedMagic == CHUNK_MAGIC3)
    {
        if (inLen < HEADER_SIZE) {
            return ERR_BADINPUT;
        }
        storedChecksum = *((uint32_t *) &in[CHECKSUM_OFFSET]);
        storedLen = *((uint32_t *) (&in[LEN_OFFSET]));
        if (inLen < storedLen + HEADER_SIZE) {
            return ERR_BADINPUT;
        }

        realChecksum = hasher(&in[HEADER_SIZE], storedLen);
        if (storedChecksum != realChecksum) {
            return ERR_CHECKSUM;
        }

        comprc = snappy::GetUncompressedLength(&in[HEADER_SIZE], storedLen, &ol) &&
                 snappy::RawUncompress(&in[HEADER_SIZE], storedLen, reinterpret_cast<char*>(out));
    }
    else if (storedMagic == CHUNK_MAGIC1 || storedMagic == CHUNK_MAGIC2)
    {
        if (inLen < HEADER_SIZE) {
            return ERR_BADINPUT;
        }
        storedChecksum = *((uint32_t *) &in[CHECKSUM_OFFSET]);
        storedLen = *((uint32_t *) (&in[LEN_OFFSET]));
        if (inLen < storedLen + HEADER_SIZE) {
            return ERR_BADINPUT;
        }
        /* We can no longer verify the checksum on ver 1.1 */
        if (storedMagic == CHUNK_MAGIC2) {
            realChecksum = hasher(&in[HEADER_SIZE], storedLen);
            if (storedChecksum != realChecksum) {
                return ERR_CHECKSUM;
            }
        }

        try {
            comprc = v1::decompress(&in[HEADER_SIZE], storedLen, out, &ol);
        } catch (runtime_error& rex) {
            //cerr << "decomp caught exception: " << rex.what() << endl;
            ostringstream os;
            os << "decomp caught exception: " << rex.what();
            log(os.str());
            comprc = false;
        } catch (exception& ex) {
            ostringstream os;
            os << "decomp caught exception: " << ex.what();
            log(os.str());
            comprc = false;
        } catch (...) {
            comprc = false;
        }
    }
    else if ((storedMagic & 0x80) != 0)
    {
        return ERR_BADINPUT;
    }
    else
    {
        comprc = v1::decompress(in, inLen, out, &ol);
    }

    if (!comprc)
    {
        cerr << "decomp failed!" << endl;
        return ERR_DECOMPRESS;
    }

    outLen = ol;
    //cerr << "ub: " << inLen << " : " << outLen << endl;

    return ERR_OK;
}

//------------------------------------------------------------------------------
// Verify the passed in buffer contains a valid compression file header.
//------------------------------------------------------------------------------
int IDBCompressInterface::verifyHdr(const void* hdrBuf) const
{
    const CompressedDBFileHeader* hdr = reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf);
    if (hdr->fMagicNumber != MAGIC_NUMBER)
        return -1;
    if (!isCompressionAvail(hdr->fCompressionType))
        return -2;

    return 0;
}

//------------------------------------------------------------------------------
// Extract compression pointer information out of the pointer buffer that is
// passed in. ptrBuf points to the pointer section of the compression hdr.
//------------------------------------------------------------------------------
int IDBCompressInterface::getPtrList(const char* ptrBuf,
                                     const int ptrBufSize,
                                     CompChunkPtrList& chunkPtrs ) const
{
    int rc = 0;
    chunkPtrs.clear();

    const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
    const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
    for (unsigned int i = 0; (i < NUM_PTRS) && (rc == 0); i++)
    {
        if (ptrs[i+1] == 0) // 0 offset means end of data
            break;

        if (ptrs[i+1] > ptrs[i])
            chunkPtrs.push_back(make_pair( ptrs[i], (ptrs[i+1]-ptrs[i])));
        else
            rc = -1;
    }

    return rc;
}
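
// Illustrative example: if ptrBuf holds the offsets { 8192, 20480, 53248, 0, ... },
// getPtrList() returns two (offset, size) pairs, (8192, 12288) and (20480, 32768);
// the trailing 0 marks the end of the pointer list.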

//------------------------------------------------------------------------------
// Extract compression pointer information out of the file compression hdr.
// Function assumes that the file is a column file that has just two 4096-byte hdrs,
// one for the file header, and one for the list of pointers.
// Wrapper of above method for backward compatibility.
//------------------------------------------------------------------------------
int IDBCompressInterface::getPtrList(const char* hdrBuf, CompChunkPtrList& chunkPtrs ) const
{
    return getPtrList(hdrBuf+HDR_BUF_LEN, HDR_BUF_LEN, chunkPtrs);
}

//------------------------------------------------------------------------------
// Count the number of chunk pointers in the pointer header(s)
//------------------------------------------------------------------------------
unsigned int IDBCompressInterface::getPtrCount(const char* ptrBuf,
                                               const int ptrBufSize) const
{
    unsigned int chunkCount = 0;

    const uint64_t* ptrs = reinterpret_cast<const uint64_t*>(ptrBuf);
    const unsigned int NUM_PTRS = ptrBufSize / sizeof(uint64_t);
    for (unsigned int i = 0; i < NUM_PTRS; i++)
    {
        if (ptrs[i+1] == 0) // 0 offset means end of data
            break;

        chunkCount++;
    }

    return chunkCount;
}

//------------------------------------------------------------------------------
// Count the number of chunk pointers in the specified 8192 byte compression
// file header, which carries a single 4096 byte compression chunk header.
// This should not be used for compressed dictionary files which could have
// more compression chunk headers.
//------------------------------------------------------------------------------
unsigned int IDBCompressInterface::getPtrCount(const char* hdrBuf) const
{
    return getPtrCount(hdrBuf+HDR_BUF_LEN, HDR_BUF_LEN);
}

//------------------------------------------------------------------------------
// Store list of compression pointers into the specified header.
//------------------------------------------------------------------------------
void IDBCompressInterface::storePtrs(const std::vector<uint64_t>& ptrs,
                                     void* ptrBuf,
                                     int ptrSectionSize) const
{
    memset((ptrBuf), 0, ptrSectionSize); // reset the pointer section to 0
    uint64_t* hdrPtrs = reinterpret_cast<uint64_t*>(ptrBuf);

    for (unsigned i=0; i<ptrs.size(); i++)
    {
        hdrPtrs[i] = ptrs[i];
    }
}

//------------------------------------------------------------------------------
// Wrapper of above method for backward compatibility
//------------------------------------------------------------------------------
void IDBCompressInterface::storePtrs(const std::vector<uint64_t>& ptrs, void* ptrBuf) const
{
    storePtrs(ptrs, reinterpret_cast<char*>(ptrBuf) + HDR_BUF_LEN, HDR_BUF_LEN);
}

//------------------------------------------------------------------------------
// Initialize the header blocks to be written at the start of a column file.
//------------------------------------------------------------------------------
void IDBCompressInterface::initHdr(void* hdrBuf, int compressionType) const
{
    memset(hdrBuf, 0, HDR_BUF_LEN*2);
    initCompressedDBFileHeader(hdrBuf, compressionType, HDR_BUF_LEN*2);
}

//------------------------------------------------------------------------------
// Initialize the header blocks to be written at the start of a dictionary file.
//------------------------------------------------------------------------------
void IDBCompressInterface::initHdr(void* hdrBuf, void* ptrBuf, int compressionType, int hdrSize) const
{
    memset(hdrBuf, 0, HDR_BUF_LEN);
    memset(ptrBuf, 0, hdrSize - HDR_BUF_LEN);
    initCompressedDBFileHeader(hdrBuf, compressionType, hdrSize);
}

//------------------------------------------------------------------------------
// Set the file's block count
//------------------------------------------------------------------------------
void IDBCompressInterface::setBlockCount(void* hdrBuf, uint64_t count) const
{
    reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fBlockCount = count;
}

//------------------------------------------------------------------------------
// Get the file's block count
//------------------------------------------------------------------------------
uint64_t IDBCompressInterface::getBlockCount(const void* hdrBuf) const
{
    return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fBlockCount);
}

//------------------------------------------------------------------------------
// Set the overall header size
//------------------------------------------------------------------------------
void IDBCompressInterface::setHdrSize(void* hdrBuf, uint64_t size) const
{
    reinterpret_cast<CompressedDBFileHeader*>(hdrBuf)->fHeaderSize = size;
}

//------------------------------------------------------------------------------
// Get the overall header size
//------------------------------------------------------------------------------
uint64_t IDBCompressInterface::getHdrSize(const void* hdrBuf) const
{
    return (reinterpret_cast<const CompressedDBFileHeader*>(hdrBuf)->fHeaderSize);
}

//------------------------------------------------------------------------------
// Calculates the chunk and block offset within the chunk for the specified
// block number.
//------------------------------------------------------------------------------
void IDBCompressInterface::locateBlock(unsigned int block,
                                       unsigned int& chunkIndex,
                                       unsigned int& blockOffsetWithinChunk) const
{
    const uint64_t BUFLEN = UNCOMPRESSED_INBUF_LEN;

    uint64_t byteOffset = (uint64_t)block * BLOCK_SIZE;
    uint64_t chunk = byteOffset / BUFLEN;
    uint64_t blockInChunk = (byteOffset % BUFLEN) / BLOCK_SIZE;

    chunkIndex = chunk;
    blockOffsetWithinChunk = blockInChunk;
}
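
// Illustrative example (assuming BLOCK_SIZE = 8192 and UNCOMPRESSED_INBUF_LEN =
// 512 * BLOCK_SIZE; see blocksize.h / idbcompress.h for the actual values):
// block 1000 maps to byte offset 8192000, so chunkIndex = 1 and
// blockOffsetWithinChunk = 488.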

//------------------------------------------------------------------------------
// Round up the size of the buffer to the applicable compressed size increment,
// also expand to allow for user requested padding. Lastly, initialize padding
// bytes to 0.
//------------------------------------------------------------------------------
int IDBCompressInterface::padCompressedChunks(unsigned char* buf,
                                              unsigned int& len,
                                              unsigned int maxLen) const
{
    int nPaddingBytes = 0;
    int nRem = len % COMPRESSED_CHUNK_INCREMENT_SIZE;
    if (nRem != 0)
    {
        nPaddingBytes = COMPRESSED_CHUNK_INCREMENT_SIZE - nRem;
    }

    nPaddingBytes = nPaddingBytes + fNumUserPaddingBytes;

    if (nPaddingBytes > 0)
    {
        if ((len + nPaddingBytes) > maxLen)
            return -1;

        memset(buf+len, 0, nPaddingBytes);
        len = len + nPaddingBytes;
    }

    return 0;
}
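
// Illustrative example: with len = 10000, COMPRESSED_CHUNK_INCREMENT_SIZE = 8192 and
// fNumUserPaddingBytes = 0, the chunk is padded with 6384 zero bytes and len becomes
// 16384; if maxLen is smaller than that, -1 is returned instead.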

/* static */
uint64_t IDBCompressInterface::maxCompressedSize(uint64_t uncompSize)
{
    return (snappy::MaxCompressedLength(uncompSize) + HEADER_SIZE);
}

int IDBCompressInterface::compress(const char *in, size_t inLen, char *out,
                                   size_t *outLen) const
{
    snappy::RawCompress(in, inLen, out, outLen);
    return 0;
}

int IDBCompressInterface::uncompress(const char *in, size_t inLen, char *out) const
{
    return !(snappy::RawUncompress(in, inLen, out));
}

/* static */
bool IDBCompressInterface::getUncompressedSize(char *in, size_t inLen, size_t *outLen)
{
    return snappy::GetUncompressedLength(in, inLen, outLen);
}

#endif

} // namespace compress
// vim:ts=4 sw=4:
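
For reference, a minimal caller-side sketch of the block round trip implemented above. This is hypothetical usage, not part of the commit; it assumes the IDBCompressInterface declarations from idbcompress.h (in particular that the ERR_* codes evaluate to 0 for ERR_OK and non-zero otherwise) and a snappy-enabled build.

// roundtrip_sketch.cpp (illustrative only)
#include <cstddef>
#include <vector>
#include "idbcompress.h"

int roundTrip(const char* block, size_t blockLen)
{
    compress::IDBCompressInterface comp(0);   // 0 extra user padding bytes

    // Size the output buffer with maxCompressedSize(), which accounts for the
    // worst-case snappy output plus the 9-byte chunk header.
    std::vector<unsigned char> compressed(compress::IDBCompressInterface::maxCompressedSize(blockLen));
    unsigned int compLen = static_cast<unsigned int>(compressed.size());
    if (comp.compressBlock(block, blockLen, compressed.data(), compLen) != 0)
        return -1;                            // e.g. ERR_BADOUTSIZE

    std::vector<unsigned char> restored(blockLen);
    unsigned int outLen = 0;
    if (comp.uncompressBlock(reinterpret_cast<const char*>(compressed.data()), compLen,
                             restored.data(), outLen) != 0)
        return -1;                            // checksum mismatch or decompression failure

    return (outLen == blockLen) ? 0 : -1;     // round trip should restore the original length
}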