You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-29 08:21:15 +03:00
360 lines
11 KiB
C++
360 lines
11 KiB
C++
/* Copyright (C) 2014 InfiniDB, Inc.
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; version 2 of
|
|
the License.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
MA 02110-1301, USA. */
|
|
|
|
// $Id: we_dctnry.h 4726 2013-08-07 03:38:36Z bwilkinson $
|
|
|
|
/** @file we_dctnry.h
 * Defines the Dctnry class.
 * When a signature is given, its value is stored in the dictionary and
 * a token is issued. Given a token, the signature in the dictionary
 * can be deleted.
 */
|
|
|
|
#pragma once
|
|
|
|
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <memory>
#include <set>
#include <string>

#include "we_dbfileop.h"
#include "we_type.h"
#include "we_brm.h"
#include "bytestream.h"
#include "nullstring.h"
|
|
|
|
#define EXPORT
|
|
|
|
namespace arrow
|
|
{
|
|
class Array;
|
|
}
|
|
|
|
/** Namespace WriteEngine */
|
|
namespace WriteEngine
|
|
{
|
|
//---------------------------------------------------------------------------
|
|
// Structure used to store signatures in string cache
|
|
//---------------------------------------------------------------------------
|
|
typedef struct Signature
|
|
{
|
|
int size;
|
|
unsigned char* signature;
|
|
Token token;
|
|
} Signature;
|
|
|
|
struct sig_compare
|
|
{
|
|
bool operator()(const Signature& a, const Signature& b) const
|
|
{
|
|
if (a.size == b.size)
|
|
{
|
|
return memcmp(a.signature, b.signature, a.size) < 0;
|
|
}
|
|
else if (a.size < b.size)
|
|
{
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
};
|
|
|
|
/**
|
|
* @brief Class to interface with dictionary store files.
|
|
*/
|
|
class Dctnry : public DbFileOp
|
|
{
|
|
//--------------------------------------------------------------------------
|
|
// Public members
|
|
//--------------------------------------------------------------------------
|
|
public:
|
|
/**
|
|
* @brief Dctnry Constructor
|
|
*/
|
|
EXPORT Dctnry();
|
|
|
|
/**
|
|
* @brief Dctnry Destructor
|
|
*/
|
|
EXPORT virtual ~Dctnry();
|
|
|
|
/**
|
|
* @brief Close the dictionary file handle.
|
|
*/
|
|
EXPORT int closeDctnry(bool realClose = true);
|
|
|
|
/**
|
|
* @brief Close the dictionary file handle without flushing the current blk
|
|
* buffer or updating HWM to BRM.
|
|
*/
|
|
EXPORT int closeDctnryOnly();
|
|
|
|
/**
|
|
* @brief Create a dictionary extent
|
|
*
|
|
* If 'flag' is true, a new file is created with an abbreviated extent.
|
|
* If 'flag' is false, then function adds a full exent to an already open
|
|
* file, basically assuming that the file already has 1 or more extents.
|
|
*
|
|
* @param dctnryOID - dictionary file OID
|
|
* @param colWidth - dictionary string width (not the token width)
|
|
* @param dbRoot - DBRoot for store file
|
|
* @param partition - partition number for store file
|
|
* @param segment - column segment number for store file
|
|
* @param flag - indicates whether extent is added to new file (true)
|
|
* @param startLbid - starting LBID for the newly allocated extent
|
|
*/
|
|
EXPORT int createDctnry(const OID& dctnryOID, int colWidth, const uint16_t dbRoot, const uint32_t partition,
|
|
const uint16_t segment, BRM::LBID_t& startLbid, bool flag = true);
|
|
|
|
/**
|
|
* @brief Drop dictionary store
|
|
*
|
|
* @param dctnryOID- OID of dictionary store file to be deleted
|
|
*/
|
|
EXPORT int dropDctnry(const OID& dctnryOID);
|
|
|
|
/**
|
|
* @brief Accessors
|
|
*/
|
|
const std::string& getFileName() const
|
|
{
|
|
return m_segFileName;
|
|
}
|
|
HWM getHWM() const
|
|
{
|
|
return m_hwm;
|
|
}
|
|
EXPORT bool getTokenFromArray(Signature& sig);
|
|
EXPORT uint64_t getCurLbid()
|
|
{
|
|
return m_curLbid;
|
|
}
|
|
const unsigned char* getDctnryHeader2() const
|
|
{
|
|
return m_dctnryHeader2;
|
|
}
|
|
|
|
/**
|
|
* @brief Insert a signature value to a file block and return token/pointer.
|
|
* (for DDL/DML use)
|
|
*
|
|
* @param sgnature_size - size of signature to be inserted
|
|
* @param sgnature_value - signature to be inserted
|
|
* @param token - (output) token associated with inserted signature
|
|
*/
|
|
EXPORT int insertDctnry(const int& sgnature_size, const unsigned char* sgnature_value, Token& token);
|
|
|
|
/**
|
|
* @brief Insert signature value to a file block and return token/pointer
|
|
* (for Bulk use)
|
|
*
|
|
* @param columnData - arrow array containing strings to be parsed
|
|
* @param startRowIdx - start position for current batch parquet data
|
|
* @param totalRow - total number of rows in buf
|
|
* @param col - the column to be parsed from buf
|
|
* @param tokenBuf - (output) list of tokens for the parsed strings
|
|
*/
|
|
EXPORT int insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx, const int totalRow,
|
|
const int col, char* tokenBuf, long long& truncCount,
|
|
const CHARSET_INFO* cs, const WriteEngine::ColType& weType);
|
|
|
|
/**
|
|
* @brief Insert a signature value to a file block and return token/pointer
|
|
* (for Bulk use)
|
|
*
|
|
* @param buf - bulk buffer containing strings to be parsed
|
|
* @param pos - list of offsets into buf
|
|
* @param totalRow - total number of rows in buf
|
|
* @param col - the column to be parsed from buf
|
|
* @param tokenBuf - (output) list of tokens for the parsed strings
|
|
*/
|
|
EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col,
|
|
char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs,
|
|
const WriteEngine::ColType& weType);
|
|
|
|
/**
|
|
* @brief Update dictionary store with tokenized strings (for DDL/DML use)
|
|
*
|
|
* @param sigValue - signature value
|
|
* @param sigSize - signature size
|
|
* @param token - (output) token that was added
|
|
*/
|
|
EXPORT int updateDctnry(unsigned char* sigValue, int& sigSize, Token& token);
|
|
|
|
/**
|
|
* @brief open dictionary store
|
|
*
|
|
* @param dctnryOID - dictionary file OID
|
|
* @param dbRoot - DBRoot for store file
|
|
* @param partition - partition number for store file
|
|
* @param segment - column segment number for store file
|
|
* @param useTmpSuffix - for Bulk HDFS usage: use or not use *.tmp file suffix
|
|
*/
|
|
EXPORT int openDctnry(const OID& dctnryOID, const uint16_t dbRoot, const uint32_t partition,
|
|
const uint16_t segment, const bool useTmpSuffix);
|
|
|
|
/**
|
|
* @brief copy the dictionary header to buffer
|
|
*/
|
|
void copyDctnryHeader(void* buf);
|
|
|
|
/**
|
|
* @brief Set logger that can be used for logging (primarily by bulk load)
|
|
*/
|
|
void setLogger(Log* logger)
|
|
{
|
|
m_logger = logger;
|
|
}
|
|
|
|
/**
|
|
* @brief Set dictionary column width for this column
|
|
*/
|
|
void setColWidth(int colWidth)
|
|
{
|
|
m_colWidth = colWidth;
|
|
}
|
|
|
|
/**
|
|
* @brief Set dictionary default for this column
|
|
*/
|
|
void setDefault(const utils::NullString& defVal)
|
|
{
|
|
m_defVal = defVal;
|
|
}
|
|
|
|
void setImportDataMode(ImportDataMode importMode)
|
|
{
|
|
m_importDataMode = importMode;
|
|
}
|
|
|
|
virtual int checkFixLastDictChunk()
|
|
{
|
|
return NO_ERROR;
|
|
}
|
|
/**
|
|
* @brief Use this only in Unit Tests and not in prod
|
|
*/
|
|
virtual IDBDataFile* createDctnryFileUnit(const char* name, int width, const char* mode, int ioBuffSize)
|
|
{
|
|
return createDctnryFile(name, width, mode, ioBuffSize);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
// Protected members
|
|
//------------------------------------------------------------------------------
|
|
protected:
|
|
//
|
|
// Add the specified signature (string) to the string cache
|
|
//
|
|
void addToStringCache(const Signature& newSig);
|
|
|
|
//
|
|
// Clear the dictionary store.
|
|
//
|
|
void clear()
|
|
{
|
|
m_dFile = NULL;
|
|
m_dctnryOID = (OID)INVALID_NUM;
|
|
}
|
|
|
|
// Expand an abbreviated extent on disk.
|
|
int expandDctnryExtent();
|
|
|
|
// Free memory consumed by strings in the string cache
|
|
void freeStringCache();
|
|
|
|
//
|
|
// Functions to read data:
|
|
// getBlockOpCount - get the ordinal position (OP) count from the header
|
|
// getEndOp - read OP of the end of header for specified fbo
|
|
//
|
|
void getBlockOpCount(const DataBlock& fileBlock, int& op_count);
|
|
int getEndOp(IDBDataFile* dFile, int fbo, int& op);
|
|
|
|
//
|
|
// Initialization
|
|
//
|
|
int init();
|
|
|
|
//
|
|
// Support functions for inserting values into dictionary.
|
|
// insertDctnryHdr inserts the new value info into the header.
|
|
// insertSgnture inserts the new value into the block.
|
|
//
|
|
int insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
|
|
int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
|
|
const CHARSET_INFO* cs, const WriteEngine::ColType& weType);
|
|
int insertDctnry2(Signature& sig);
|
|
void insertDctnryHdr(unsigned char* blockBuf, const int& size);
|
|
void insertSgnture(unsigned char* blockBuf, const int& size, unsigned char* value);
|
|
|
|
//
|
|
// Preloads the strings from the specified DataBlock. Currently
|
|
// used to preload the first block, of a store file having only 1 block.
|
|
//
|
|
void preLoadStringCache(const DataBlock& fileBlock);
|
|
|
|
// methods to be overriden by compression classes
|
|
// (width argument in createDctnryFile() is string width, not token width)
|
|
virtual IDBDataFile* createDctnryFile(const char* name, int width, const char* mode, int ioBuffSize,
|
|
BRM::LBID_t lbid = -1);
|
|
virtual IDBDataFile* openDctnryFile(bool useTmpSuffix);
|
|
virtual void closeDctnryFile(bool doFlush, std::map<FID, FID>& oids);
|
|
virtual int numOfBlocksInFile();
|
|
|
|
std::set<Signature, sig_compare> m_sigArray;
|
|
int m_arraySize; // num strings in m_sigArray
|
|
|
|
// m_dctnryHeader used for hdr when readSubBlockEntry is used to read a blk
|
|
// m_dctnryHeader2 contains filled in template used to initialize new blocks
|
|
unsigned char m_dctnryHeader[DCTNRY_HEADER_SIZE]; // first 14 bytes of hdr
|
|
unsigned char m_dctnryHeader2[DCTNRY_HEADER_SIZE]; // first 14 bytes of hdr
|
|
|
|
uint64_t m_nextPtr; // next pointer
|
|
|
|
// relate to different Dictionary file
|
|
FID m_dctnryOID; // OID for the dctnry file
|
|
IDBDataFile* m_dFile; // dictionary file
|
|
uint32_t m_partition; // partition associated with OID
|
|
uint16_t m_segment; // segment associated with OID
|
|
uint16_t m_dbRoot; // DBRoot associated with OID
|
|
std::string m_segFileName; // current column segment file
|
|
int m_numBlocks; // num "raw" uncompressed blocks in file
|
|
int m_lastFbo;
|
|
HWM m_hwm;
|
|
// Need to be initialized for different Dictionary file
|
|
int m_newStartOffset; // start offset
|
|
uint16_t m_freeSpace; // free space (bytes) within current block
|
|
int m_curOp; // current ordinal pointer within m_curFbo
|
|
int m_curFbo; // current "raw" (uncompressed) FBO
|
|
BRM::LBID_t m_curLbid; // LBID associated with m_curFbo
|
|
DataBlock m_curBlock; // current "raw" (uncompressed) data block
|
|
Log* m_logger; // logger, mainly for bulk load
|
|
int m_colWidth; // width of this dictionary column
|
|
utils::NullString m_defVal; // optional default string value
|
|
ImportDataMode m_importDataMode; // Import data in text or binary mode
|
|
|
|
}; // end of class
|
|
|
|
} // namespace WriteEngine
|
|
|
|
#undef EXPORT
|