1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-29 08:21:15 +03:00
Files
mariadb-columnstore-engine/writeengine/dictionary/we_dctnry.h
2023-11-30 01:47:13 +04:00

360 lines
11 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
// $Id: we_dctnry.h 4726 2013-08-07 03:38:36Z bwilkinson $
/** @file we_dctnry.h
* Defines the Dctnry class.
* When a signature is given, its value is stored in the dictionary and
* a token is issued. Given a token, the signature in the dictionary
* can be deleted.
*/
#pragma once
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include "we_dbfileop.h"
#include "we_type.h"
#include "we_brm.h"
#include "bytestream.h"
#include "nullstring.h"
#define EXPORT
namespace arrow
{
class Array;
}
/** Namespace WriteEngine */
namespace WriteEngine
{
//---------------------------------------------------------------------------
// Entry kept in the dictionary string cache: a raw byte string (the
// "signature") together with the token associated with it.
//---------------------------------------------------------------------------
struct Signature
{
  int size;                  // number of bytes addressed by `signature`
  unsigned char* signature;  // raw signature bytes
  Token token;               // token associated with this signature
};
/**
 * @brief Strict weak ordering for Signature objects.
 *
 * Signatures are ordered first by length (shorter sorts first) and, for
 * equal lengths, by the raw byte content as compared by memcmp. The token
 * member does not take part in the comparison.
 */
struct sig_compare
{
  /**
   * @param a - left-hand signature
   * @param b - right-hand signature
   * @return true when a sorts strictly before b
   */
  bool operator()(const Signature& a, const Signature& b) const
  {
    if (a.size != b.size)
    {
      return a.size < b.size;
    }

    // Equal lengths. With nothing to compare the two entries are equivalent;
    // returning here also keeps memcmp away from a possibly-null pointer
    // (undefined behaviour even for a zero length) and from a negative size
    // being converted into a huge size_t.
    if (a.size <= 0)
    {
      return false;
    }

    return std::memcmp(a.signature, b.signature, static_cast<std::size_t>(a.size)) < 0;
  }
};
/**
 * @brief Class to interface with dictionary store files.
 *
 * Extends DbFileOp. Most member functions are only declared here; their
 * definitions live outside this header.
 */
class Dctnry : public DbFileOp
{
  //--------------------------------------------------------------------------
  // Public members
  //--------------------------------------------------------------------------
 public:
  /**
   * @brief Dctnry Constructor
   */
  EXPORT Dctnry();

  /**
   * @brief Dctnry Destructor
   */
  EXPORT virtual ~Dctnry();

  /**
   * @brief Close the dictionary file handle.
   *
   * @param realClose - defaults to true.
   *                    NOTE(review): exact effect of passing false is not
   *                    visible from this header - confirm in the definition.
   */
  EXPORT int closeDctnry(bool realClose = true);

  /**
   * @brief Close the dictionary file handle without flushing the current blk
   *        buffer or updating HWM to BRM.
   */
  EXPORT int closeDctnryOnly();

  /**
   * @brief Create a dictionary extent
   *
   * If 'flag' is true, a new file is created with an abbreviated extent.
   * If 'flag' is false, then function adds a full extent to an already open
   * file, basically assuming that the file already has 1 or more extents.
   *
   * @param dctnryOID - dictionary file OID
   * @param colWidth  - dictionary string width (not the token width)
   * @param dbRoot    - DBRoot for store file
   * @param partition - partition number for store file
   * @param segment   - column segment number for store file
   * @param startLbid - starting LBID for the newly allocated extent
   *                    (passed by non-const reference)
   * @param flag      - indicates whether extent is added to new file (true)
   */
  EXPORT int createDctnry(const OID& dctnryOID, int colWidth, const uint16_t dbRoot, const uint32_t partition,
                          const uint16_t segment, BRM::LBID_t& startLbid, bool flag = true);

  /**
   * @brief Drop dictionary store
   *
   * @param dctnryOID - OID of dictionary store file to be deleted
   */
  EXPORT int dropDctnry(const OID& dctnryOID);

  /**
   * @brief Accessors
   */

  /** @return name of the current column segment file (m_segFileName) */
  const std::string& getFileName() const
  {
    return m_segFileName;
  }

  /** @return the stored HWM value (m_hwm) */
  HWM getHWM() const
  {
    return m_hwm;
  }

  // NOTE(review): presumably looks `sig` up in the string cache (m_sigArray)
  // and reports whether it was found - confirm against the definition.
  EXPORT bool getTokenFromArray(Signature& sig);

  /** @return LBID associated with the current FBO (m_curLbid), as uint64_t */
  EXPORT uint64_t getCurLbid() const
  {
    return m_curLbid;
  }

  /** @return pointer to the header template buffer (m_dctnryHeader2) */
  const unsigned char* getDctnryHeader2() const
  {
    return m_dctnryHeader2;
  }

  /**
   * @brief Insert a signature value to a file block and return token/pointer.
   *        (for DDL/DML use)
   *
   * @param sgnature_size  - size of signature to be inserted
   * @param sgnature_value - signature to be inserted
   * @param token          - (output) token associated with inserted signature
   */
  EXPORT int insertDctnry(const int& sgnature_size, const unsigned char* sgnature_value, Token& token);

  /**
   * @brief Insert signature value to a file block and return token/pointer
   *        (for Bulk use)
   *
   * @param columnData  - arrow array containing strings to be parsed
   * @param startRowIdx - start position for current batch parquet data
   * @param totalRow    - total number of rows in buf
   * @param col         - the column to be parsed from buf
   * @param tokenBuf    - (output) list of tokens for the parsed strings
   * @param truncCount  - NOTE(review): presumably a running count of
   *                      truncated strings, updated by the call - confirm
   * @param cs          - character set information for the column
   *                      (usage not visible from this header)
   * @param weType      - WriteEngine column type of the column
   *                      (usage not visible from this header)
   */
  EXPORT int insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx, const int totalRow,
                                 const int col, char* tokenBuf, long long& truncCount,
                                 const CHARSET_INFO* cs, const WriteEngine::ColType& weType);

  /**
   * @brief Insert a signature value to a file block and return token/pointer
   *        (for Bulk use)
   *
   * @param buf        - bulk buffer containing strings to be parsed
   * @param pos        - list of offsets into buf
   * @param totalRow   - total number of rows in buf
   * @param col        - the column to be parsed from buf
   * @param tokenBuf   - (output) list of tokens for the parsed strings
   * @param truncCount - NOTE(review): presumably a running count of
   *                     truncated strings, updated by the call - confirm
   * @param cs         - character set information for the column
   *                     (usage not visible from this header)
   * @param weType     - WriteEngine column type of the column
   *                     (usage not visible from this header)
   */
  EXPORT int insertDctnry(const char* buf, ColPosPair** pos, const int totalRow, const int col,
                          char* tokenBuf, long long& truncCount, const CHARSET_INFO* cs,
                          const WriteEngine::ColType& weType);

  /**
   * @brief Update dictionary store with tokenized strings (for DDL/DML use)
   *
   * @param sigValue - signature value
   * @param sigSize  - signature size (passed by non-const reference)
   * @param token    - (output) token that was added
   */
  EXPORT int updateDctnry(unsigned char* sigValue, int& sigSize, Token& token);

  /**
   * @brief open dictionary store
   *
   * @param dctnryOID    - dictionary file OID
   * @param dbRoot       - DBRoot for store file
   * @param partition    - partition number for store file
   * @param segment      - column segment number for store file
   * @param useTmpSuffix - for Bulk HDFS usage: use or not use *.tmp file suffix
   */
  EXPORT int openDctnry(const OID& dctnryOID, const uint16_t dbRoot, const uint32_t partition,
                        const uint16_t segment, const bool useTmpSuffix);

  /**
   * @brief copy the dictionary header to buffer
   *
   * @param buf - destination buffer
   */
  void copyDctnryHeader(void* buf);

  /**
   * @brief Set logger that can be used for logging (primarily by bulk load)
   */
  void setLogger(Log* logger)
  {
    m_logger = logger;
  }

  /**
   * @brief Set dictionary column width for this column
   */
  void setColWidth(int colWidth)
  {
    m_colWidth = colWidth;
  }

  /**
   * @brief Set dictionary default for this column
   */
  void setDefault(const utils::NullString& defVal)
  {
    m_defVal = defVal;
  }

  /**
   * @brief Set the import data mode (text or binary) for this column
   */
  void setImportDataMode(ImportDataMode importMode)
  {
    m_importDataMode = importMode;
  }

  /**
   * @brief Hook that this base class implements as a no-op returning
   *        NO_ERROR; virtual so that derived classes can replace it.
   */
  virtual int checkFixLastDictChunk()
  {
    return NO_ERROR;
  }

  /**
   * @brief Use this only in Unit Tests and not in prod
   *
   * Public forwarder to the protected createDctnryFile(); the lbid argument
   * is left at its default.
   */
  virtual IDBDataFile* createDctnryFileUnit(const char* name, int width, const char* mode, int ioBuffSize)
  {
    return createDctnryFile(name, width, mode, ioBuffSize);
  }

  //------------------------------------------------------------------------------
  // Protected members
  //------------------------------------------------------------------------------
 protected:
  //
  // Add the specified signature (string) to the string cache
  //
  void addToStringCache(const Signature& newSig);

  //
  // Clear the dictionary store state held by this object: the file pointer
  // is reset to null and the OID to INVALID_NUM. The file handle itself is
  // neither closed nor freed here.
  //
  void clear()
  {
    m_dFile = nullptr;
    m_dctnryOID = (OID)INVALID_NUM;
  }

  // Expand an abbreviated extent on disk.
  int expandDctnryExtent();

  // Free memory consumed by strings in the string cache
  void freeStringCache();

  //
  // Functions to read data:
  // getBlockOpCount - get the ordinal position (OP) count from the header
  // getEndOp        - read OP of the end of header for specified fbo
  //
  void getBlockOpCount(const DataBlock& fileBlock, int& op_count);
  int getEndOp(IDBDataFile* dFile, int fbo, int& op);

  //
  // Initialization
  //
  int init();

  //
  // Support functions for inserting values into dictionary.
  // insertDctnryHdr inserts the new value info into the header.
  // insertSgnture inserts the new value into the block.
  //
  int insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
                    int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
                    const CHARSET_INFO* cs, const WriteEngine::ColType& weType);
  int insertDctnry2(Signature& sig);
  void insertDctnryHdr(unsigned char* blockBuf, const int& size);
  void insertSgnture(unsigned char* blockBuf, const int& size, unsigned char* value);

  //
  // Preloads the strings from the specified DataBlock. Currently
  // used to preload the first block, of a store file having only 1 block.
  //
  void preLoadStringCache(const DataBlock& fileBlock);

  // methods to be overridden by compression classes
  // (width argument in createDctnryFile() is string width, not token width)
  virtual IDBDataFile* createDctnryFile(const char* name, int width, const char* mode, int ioBuffSize,
                                        BRM::LBID_t lbid = -1);
  virtual IDBDataFile* openDctnryFile(bool useTmpSuffix);
  virtual void closeDctnryFile(bool doFlush, std::map<FID, FID>& oids);
  virtual int numOfBlocksInFile();

  std::set<Signature, sig_compare> m_sigArray;  // string cache, ordered by sig_compare
  int m_arraySize;                              // num strings in m_sigArray

  // m_dctnryHeader used for hdr when readSubBlockEntry is used to read a blk
  // m_dctnryHeader2 contains filled in template used to initialize new blocks
  unsigned char m_dctnryHeader[DCTNRY_HEADER_SIZE];   // first 14 bytes of hdr
  unsigned char m_dctnryHeader2[DCTNRY_HEADER_SIZE];  // first 14 bytes of hdr
  uint64_t m_nextPtr;                                 // next pointer

  // relate to different Dictionary file
  FID m_dctnryOID;            // OID for the dctnry file
  IDBDataFile* m_dFile;       // dictionary file
  uint32_t m_partition;       // partition associated with OID
  uint16_t m_segment;         // segment associated with OID
  uint16_t m_dbRoot;          // DBRoot associated with OID
  std::string m_segFileName;  // current column segment file
  int m_numBlocks;            // num "raw" uncompressed blocks in file
  int m_lastFbo;
  HWM m_hwm;

  // Need to be initialized for different Dictionary file
  int m_newStartOffset;    // start offset
  uint16_t m_freeSpace;    // free space (bytes) within current block
  int m_curOp;             // current ordinal pointer within m_curFbo
  int m_curFbo;            // current "raw" (uncompressed) FBO
  BRM::LBID_t m_curLbid;   // LBID associated with m_curFbo
  DataBlock m_curBlock;    // current "raw" (uncompressed) data block
  Log* m_logger;           // logger, mainly for bulk load
  int m_colWidth;          // width of this dictionary column
  utils::NullString m_defVal;       // optional default string value
  ImportDataMode m_importDataMode;  // Import data in text or binary mode
};  // end of class
} // namespace WriteEngine
#undef EXPORT