You've already forked mariadb-columnstore-engine
							
							
				mirror of
				https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
				synced 2025-10-24 10:12:58 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			349 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			349 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /* Copyright (C) 2014 InfiniDB, Inc.
 | |
| 
 | |
|    This program is free software; you can redistribute it and/or
 | |
|    modify it under the terms of the GNU General Public License
 | |
|    as published by the Free Software Foundation; version 2 of
 | |
|    the License.
 | |
| 
 | |
|    This program is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU General Public License for more details.
 | |
| 
 | |
|    You should have received a copy of the GNU General Public License
 | |
|    along with this program; if not, write to the Free Software
 | |
|    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 | |
|    MA 02110-1301, USA. */
 | |
| 
 | |
| /*********************************************************************
 | |
|  *   $Id: we_bulkloadbuffer.h 4489 2013-01-30 18:47:53Z dcathey $
 | |
|  *
 | |
|  ********************************************************************/
 | |
| #ifndef _WE_BULKLOADBUFFER_H
 | |
| #define _WE_BULKLOADBUFFER_H
 | |
| 
 | |
| #include "we_type.h"
 | |
| #include "limits"
 | |
| #include "string"
 | |
| #include "vector"
 | |
| #include "boost/thread/mutex.hpp"
 | |
| #include "boost/ptr_container/ptr_vector.hpp"
 | |
| #include "we_columninfo.h"
 | |
| #include "calpontsystemcatalog.h"
 | |
| 
 | |
| namespace WriteEngine 
 | |
| {
 | |
| class Log;
 | |
| 
 | |
| // Used to collect stats about a BulkLoadBuffer buffer that is being parsed
 | |
| class BLBufferStats
 | |
| {
 | |
|   public:
 | |
|     int64_t minBufferVal;
 | |
|     int64_t maxBufferVal;
 | |
|     int64_t satCount;
 | |
|     BLBufferStats(ColDataType colDataType) : satCount(0)
 | |
|     {
 | |
|         if (isUnsigned(colDataType))
 | |
|         {
 | |
|             minBufferVal = static_cast<int64_t>(MAX_UBIGINT);
 | |
|             maxBufferVal = static_cast<int64_t>(MIN_UBIGINT);
 | |
|         }
 | |
|         else
 | |
|         {
 | |
|             minBufferVal = MAX_BIGINT;
 | |
|             maxBufferVal = MIN_BIGINT;
 | |
|         }
 | |
|     }
 | |
| };
 | |
| 
 | |
| class BulkLoadBuffer
 | |
| {
 | |
| private:    
 | |
| 
 | |
|     //--------------------------------------------------------------------------
 | |
|     // Private Data Members
 | |
|     //--------------------------------------------------------------------------
 | |
| 
 | |
|     char *fData;                        // Buffer with data read from tbl file
 | |
|     char *fDataParser;                  // for temporary use by parser 
 | |
| 
 | |
|     char *fOverflowBuf;                 // Overflow data held for next buffer
 | |
|     unsigned fOverflowSize;             // Current size of fOverflowBuf
 | |
| 
 | |
|     // Information about the locker and status for each column in this buffer.
 | |
|     // Note that TableInfo::fSyncUpdatesTI mutex is used to synchronize
 | |
|     // access to fColumnLocks and fParseComplete from both read and parse
 | |
|     // threads.  Table-scope lock (fSyncUpdatesTI) is used instead of
 | |
|     // buffer-scope lock (fSyncUpdatesBLB), to keep state for all buffers
 | |
|     // static while we are scanning fColumnLocks for the buffers in a table
 | |
|     std::vector<LockInfo> fColumnLocks;
 | |
| 
 | |
|     // Note that TableInfo::fSyncUpdatesTI mutex (not fStatusBLB) is used
 | |
|     // to synchronize getting/setting fStatusBLB between threads.
 | |
|     Status fStatusBLB;                  // Status of buffer
 | |
| 
 | |
|     // TableInfo::fSyncUpdatesTI mutex should be locked when accessing
 | |
|     // this data member (see fColumnLocks discussion).
 | |
|     unsigned fParseComplete;            // Num of columns that are parseComplete
 | |
| 
 | |
|     unsigned fTotalRows;                // Max rows this buffer can now hold;
 | |
|                                         //   size of fTokens array
 | |
|     std::vector< std::pair<RID,std::string> > fRowStatus;//Status of bad rows
 | |
|     std::vector<std::string> fErrRows; // Rejected rows to write to .bad file
 | |
| 
 | |
|     uint32_t fTotalReadRows;                // Total valid rows read into buffer;
 | |
|                                         //   this count excludes rejected rows
 | |
|     uint32_t fTotalReadRowsParser;          // for temporary use by parser
 | |
| 
 | |
|     uint32_t fTotalReadRowsForLog;          // Total rows read into this buffer
 | |
|                                         //   including invalid rows
 | |
| 
 | |
|     RID fStartRow;                      // Starting row id for rows in buffer,
 | |
|                                         //   relative to start of job.
 | |
|                                         //   Rejected rows are excluded.
 | |
|     RID fStartRowParser;                // for temporary use by parser
 | |
| 
 | |
|     RID fStartRowForLogging;            // Starting row id for rows in buffer,
 | |
|                                         //   relative to start of current input
 | |
|                                         //   file.  All rows are counted.
 | |
|     RID fStartRowForLoggingParser;      // for temporary use by parser
 | |
| 
 | |
|     uint32_t fAutoIncGenCount;              // How many auto-increment values are
 | |
|                                         //   to be generated for current buffer
 | |
|     uint32_t fAutoIncGenCountParser;        // for temporary use by parser
 | |
| 
 | |
|     uint64_t fAutoIncNextValue;         // Next auto-increment value assign to
 | |
|                                         //   a row in this buffer
 | |
|     unsigned fNumberOfColumns;          // Number of ColumnInfo objs in table
 | |
| 
 | |
|     ColPosPair **fTokens;               // Vector of start and offsets for the
 | |
|                                         //   column values read from tbl files
 | |
|     ColPosPair **fTokensParser;         // for temporary use by parser
 | |
| 
 | |
|     char fColDelim;                     // Character to delimit columns in a row
 | |
|     unsigned fBufferSize;               // Size of input read buffer (fData)
 | |
|     unsigned fReadSize;                 // Number of bytes in read buffer(fData)
 | |
|     boost::mutex fSyncUpdatesBLB;       // Mutex to synchronize updates
 | |
|     Log* fLog;                          // Logger object
 | |
|     bool fNullStringMode;               // Indicates if "NULL" string is to be
 | |
|                                         //   treated as a NULL value or not
 | |
|     char fEnclosedByChar;               // Optional char to enclose col values
 | |
|     char fEscapeChar;                   // Used to escape enclosed character
 | |
|     int fBufferId;                      // Id for this read buffer
 | |
|     std::string fTableName;             // Table assigned to this read buffer
 | |
|     JobFieldRefList fFieldList;         // Complete list of cols and flds
 | |
|     unsigned int fNumFieldsInFile;      // Number of fields in input file
 | |
|                                         //   (including fields to be ignored)
 | |
|     unsigned int fNumColsInFile;        // Number of flds in input file targeted
 | |
|                                         //   for db cols (omits default cols)
 | |
|     bool fbTruncationAsError;           // Treat string truncation as error
 | |
|     ImportDataMode fImportDataMode;     // Import data in text or binary mode
 | |
|     unsigned int fFixedBinaryRecLen;    // Fixed rec len used in binary mode
 | |
| 
 | |
|     //--------------------------------------------------------------------------
 | |
|     // Private Functions
 | |
|     //--------------------------------------------------------------------------
 | |
| 
 | |
|     /** @brief Copy constructor
 | |
|      */
 | |
|     BulkLoadBuffer(const BulkLoadBuffer &buffer);
 | |
| 
 | |
|     /** @brief Assignment operator
 | |
|      */
 | |
|     BulkLoadBuffer & operator =(const BulkLoadBuffer & buffer);
 | |
| 
 | |
|     /** @brief Convert the buffer data depending upon the data type
 | |
|      */
 | |
|     void convert(char *field, int fieldLength,
 | |
|                  bool nullFlag, unsigned char *output,
 | |
|                  const JobColumn & column,
 | |
|                  BLBufferStats& bufStats);
 | |
| 
 | |
|     /** @brief Copy the overflow data
 | |
|      */
 | |
|     void copyOverflow(const BulkLoadBuffer & buffer);
 | |
| 
 | |
|     /** @brief Parse a Read buffer for a nonDictionary column
 | |
|      */
 | |
|     int parseCol(ColumnInfo &columnInfo);
 | |
| 
 | |
|     /** @brief Parse a Read buffer for a nonDictionary column
 | |
|      */
 | |
|     void parseColLogMinMax(std::ostringstream& oss,
 | |
|                            ColDataType         colDataType,
 | |
|                            int64_t             minBufferVal,
 | |
|                            int64_t             maxBufferVal) const;
 | |
| 
 | |
|     /** @brief Parse a Read buffer for a Dictionary column
 | |
|      */
 | |
|     int parseDict(ColumnInfo &columnInfo);
 | |
| 
 | |
|     /** @brief Parse a Dictionary Read buffer into a ColumnBufferSection.
 | |
|      * 
 | |
|      * Parses the Read buffer into a section up to the point at which
 | |
|      * the buffer crosses an extent boundary.
 | |
|      *
 | |
|      * @param columnInfo    Column being parsed
 | |
|      * @oaram tokenPos      Position of rows to be parsed, in fTokens.
 | |
|      * @param startRow      Row id of first row in buffer to be parsed.
 | |
|      *        Row id is relative to all the rows in this import.
 | |
|      * @param totalReadRows Number of buffer rows ready to be parsed
 | |
|      * @param nRowsParsed   Number of buffer rows that were parsed
 | |
|      */
 | |
|     int parseDictSection(ColumnInfo &columnInfo, int tokenPos,
 | |
|                          RID startRow, uint32_t totalReadRows,
 | |
|                          uint32_t& nRowsParsed);
 | |
| 
 | |
|     /** @brief Expand the size of the fTokens array
 | |
|      */
 | |
|     void resizeTokenArray();
 | |
| 
 | |
|     /** @brief tokenize the buffer contents and fill up the token array.
 | |
|      */
 | |
|     void tokenize(const boost::ptr_vector<ColumnInfo>& columnsInfo,
 | |
|                   unsigned int allowedErrCntThisCall);
 | |
| 
 | |
|     /** @brief Binary tokenization of the buffer, and fill up the token array.
 | |
|      */
 | |
|     int tokenizeBinary(const boost::ptr_vector<ColumnInfo>& columnsInfo,
 | |
|                   unsigned int allowedErrCntThisCall,
 | |
|                   bool bEndOfData);
 | |
| 
 | |
|     /** @brief Determine if specified value is NULL or not.
 | |
|      */
 | |
|     bool isBinaryFieldNull(void* val, WriteEngine::ColType ct,
 | |
|                   execplan::CalpontSystemCatalog::ColDataType dt);
 | |
| 
 | |
| public:
 | |
| 
 | |
|     /** @brief Constructor
 | |
|      * @param noOfCol Number of columns
 | |
|      * @param bufferSize Buffer size
 | |
|      * @param logger The Log object used for logging
 | |
|      * @param bufferId Id assigned to this buffer
 | |
|      * @param tableName Name of table associated with this read buffer
 | |
|      * @param jobFieldRefList Complete list of cols/flds listed in Job XML file
 | |
|      */
 | |
|     BulkLoadBuffer(unsigned noOfCols,
 | |
|                    unsigned bufferSize, Log* logger,
 | |
|                    int bufferId, const std::string& tableName,
 | |
|                    const JobFieldRefList& jobFieldRefList);
 | |
| 
 | |
|     /** @brief Destructor
 | |
|     */
 | |
|     ~BulkLoadBuffer();
 | |
| 
 | |
|     /** @brief Resets the values of the members (excluding column locks)
 | |
|      */
 | |
|     void reset();
 | |
| 
 | |
|     /** @brief Resets the column locks.
 | |
|      * TableInfo::fSyncUpdatesTI mutex should be locked when calling this 
 | |
|      * function (see fColumnLocks discussion).
 | |
|      */
 | |
|     void resetColumnLocks();
 | |
| 
 | |
|     /** @brief Get the buffer status
 | |
|      */
 | |
|     Status getStatusBLB() const {return fStatusBLB;}
 | |
| 
 | |
|     /** @brief Set the buffer status
 | |
|      */
 | |
|     void setStatusBLB(const Status & status){fStatusBLB = status;}
 | |
| 
 | |
|     /** @brief Try to lock a column for the buffer
 | |
|      * TableInfo::fSyncUpdatesTI mutex should be locked when calling this 
 | |
|      * function (see fColumnLocks discussion).
 | |
|      */
 | |
|     bool tryAndLockColumn(const int & columnId, const int & id);
 | |
| 
 | |
|     /** @brief Read the table data into the buffer
 | |
|      */
 | |
|     int fillFromFile(const BulkLoadBuffer& overFlowBufIn,
 | |
|         FILE * handle, RID & totalRows, RID & correctTotalRows,
 | |
|         const boost::ptr_vector<ColumnInfo>& columnsInfo,
 | |
|         unsigned int allowedErrCntThisCall); 
 | |
| 
 | |
|     /** @brief Get the overflow size
 | |
|      */
 | |
|     int getOverFlowSize()  const {return fOverflowSize;}
 | |
| 
 | |
|     /** @brief Parse the buffer data
 | |
|      */
 | |
|     int parse(ColumnInfo &columnInfo);
 | |
| 
 | |
|     /** @brief Set the delimiter used to delimit the columns within a row
 | |
|      */
 | |
|     void setColDelimiter(const char & delim){fColDelim = delim;}
 | |
| 
 | |
|     /** @brief Set mode to treat "NULL" string as NULL value or not.
 | |
|      */
 | |
|     void setNullStringMode( bool bMode ) { fNullStringMode = bMode; }
 | |
| 
 | |
|     /** @brief Set character optionally used to enclose input column values.
 | |
|      */
 | |
|     void setEnclosedByChar( char enChar ) { fEnclosedByChar = enChar; }
 | |
| 
 | |
|     /** @brief Set escape char to use in conjunction with enclosed by char.
 | |
|      */
 | |
|     void setEscapeChar    ( char esChar ) { fEscapeChar  = esChar; }
 | |
| 
 | |
|     /** @brief Get the column status
 | |
|      *  TableInfo::fSyncUpdatesTI mutex should be locked when calling this 
 | |
|      *  function (see fColumnLocks discussion).
 | |
|      */
 | |
|     Status getColumnStatus(const int & columnId) const
 | |
|     { return fColumnLocks[columnId].status; }
 | |
| 
 | |
|     /** @brief Set the column status
 | |
|      *  TableInfo::fSyncUpdatesTI mutex should be locked when calling this 
 | |
|      *  function (see fColumnLocks discussion).
 | |
|      *  @returns TRUE if all columns in the buffer are complete.
 | |
|      */
 | |
|     bool setColumnStatus(const int &columnId, const Status & status);
 | |
| 
 | |
|     /** @brief Get the error row status's
 | |
|      */
 | |
|     const std::vector< std::pair<RID,std::string> >& getErrorRows() const
 | |
|     {return fRowStatus;}
 | |
| 
 | |
|     /** @brief Get the error rows
 | |
|      */
 | |
|     const std::vector<std::string>& getExactErrorRows() const
 | |
|     {return fErrRows;}
 | |
|        
 | |
|     void clearErrRows() {fRowStatus.clear();fErrRows.clear();}
 | |
| 
 | |
|     /** @brief Get the column locker.
 | |
|      *  TableInfo::fSyncUpdatesTI mutex should be locked when calling this 
 | |
|      *  function (see fColumnLocks discussion).
 | |
|      */
 | |
|     const int getColumnLocker(const int & columnId) const
 | |
|     { return fColumnLocks[columnId].locker; }
 | |
| 
 | |
|     /** @brief set truncation as error for this import.
 | |
|      */
 | |
|     void setTruncationAsError(bool bTruncationAsError) 
 | |
|     { fbTruncationAsError = bTruncationAsError; }
 | |
| 
 | |
|     /** @brief retrieve the tuncation as error setting for this
 | |
|      *  import. When set, this causes char and varchar strings
 | |
|      *  that are longer than the column definition to be treated
 | |
|      *  as errors instead of warnings.
 | |
|      */
 | |
|     bool getTruncationAsError() const
 | |
|     { return fbTruncationAsError; }
 | |
| 
 | |
|     /** @brief Set text vs binary import mode along with corresponding fixed
 | |
|      *         record length that is used if the binary mode is set to TRUE.
 | |
|      */
 | |
|     void setImportDataMode( ImportDataMode importMode,
 | |
|         unsigned int fixedBinaryRecLen )
 | |
|     { fImportDataMode    = importMode;
 | |
|       fFixedBinaryRecLen = fixedBinaryRecLen; }
 | |
| };
 | |
| 
 | |
| }
 | |
| #endif
 |