You've already forked mariadb-columnstore-engine
							
							
				mirror of
				https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
				synced 2025-10-31 18:30:33 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			412 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			412 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /* Copyright (C) 2014 InfiniDB, Inc.
 | |
| 
 | |
|    This program is free software; you can redistribute it and/or
 | |
|    modify it under the terms of the GNU General Public License
 | |
|    as published by the Free Software Foundation; version 2 of
 | |
|    the License.
 | |
| 
 | |
|    This program is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU General Public License for more details.
 | |
| 
 | |
|    You should have received a copy of the GNU General Public License
 | |
|    along with this program; if not, write to the Free Software
 | |
|    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 | |
|    MA 02110-1301, USA. */
 | |
| 
 | |
| /*********************************************************************
 | |
|  *   $Id: we_bulkloadbuffer.h 4489 2013-01-30 18:47:53Z dcathey $
 | |
|  *
 | |
|  ********************************************************************/
 | |
| #pragma once
 | |
| 
 | |
| #include "we_type.h"
 | |
| #include "limits"
 | |
| #include "string"
 | |
| #include "vector"
 | |
| #include "boost/thread/mutex.hpp"
 | |
| #pragma GCC diagnostic push
 | |
| #pragma GCC diagnostic ignored "-Wunused-parameter"
 | |
| #include "boost/ptr_container/ptr_vector.hpp"
 | |
| #pragma GCC diagnostic pop
 | |
| #include "we_columninfo.h"
 | |
| #include "calpontsystemcatalog.h"
 | |
| #include "dataconvert.h"
 | |
| 
 | |
| namespace WriteEngine
 | |
| {
 | |
| class Log;
 | |
| 
 | |
| // Used to collect stats about a BulkLoadBuffer buffer that is being parsed
 | |
| class BLBufferStats
 | |
| {
 | |
|  public:
 | |
|   int64_t minBufferVal;
 | |
|   int64_t maxBufferVal;
 | |
|   int64_t satCount;
 | |
|   union
 | |
|   {
 | |
|     int128_t bigMinBufferVal;
 | |
|     int64_t minBufferVal_;
 | |
|   };
 | |
|   union
 | |
|   {
 | |
|     int128_t bigMaxBufferVal;
 | |
|     int64_t maxBufferVal_;
 | |
|   };
 | |
|   BLBufferStats(ColDataType colDataType) : satCount(0)
 | |
|   {
 | |
|     if (isUnsigned(colDataType))
 | |
|     {
 | |
|       minBufferVal = static_cast<int64_t>(MAX_UBIGINT);
 | |
|       maxBufferVal = static_cast<int64_t>(MIN_UBIGINT);
 | |
|       bigMinBufferVal = -1;
 | |
|       bigMaxBufferVal = 0;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
|       minBufferVal = MAX_BIGINT;
 | |
|       maxBufferVal = MIN_BIGINT;
 | |
|       utils::int128Max(bigMinBufferVal);
 | |
|       utils::int128Min(bigMaxBufferVal);
 | |
|     }
 | |
|   }
 | |
| };
 | |
| 
 | |
| class BulkLoadBuffer
 | |
| {
 | |
|  private:
 | |
|   //--------------------------------------------------------------------------
 | |
|   // Private Data Members
 | |
|   //--------------------------------------------------------------------------
 | |
| 
 | |
|   char* fData;        // Buffer with data read from tbl file
 | |
|   char* fDataParser;  // for temporary use by parser
 | |
| 
 | |
|   char* fOverflowBuf;      // Overflow data held for next buffer
 | |
|   unsigned fOverflowSize;  // Current size of fOverflowBuf
 | |
| 
 | |
|   // Information about the locker and status for each column in this buffer.
 | |
|   // Note that TableInfo::fSyncUpdatesTI mutex is used to synchronize
 | |
|   // access to fColumnLocks and fParseComplete from both read and parse
 | |
|   // threads.  Table-scope lock (fSyncUpdatesTI) is used instead of
 | |
|   // buffer-scope lock (fSyncUpdatesBLB), to keep state for all buffers
 | |
|   // static while we are scanning fColumnLocks for the buffers in a table
 | |
|   std::vector<LockInfo> fColumnLocks;
 | |
| 
 | |
|   // Note that TableInfo::fSyncUpdatesTI mutex (not fStatusBLB) is used
 | |
|   // to synchronize getting/setting fStatusBLB between threads.
 | |
|   Status fStatusBLB;  // Status of buffer
 | |
| 
 | |
|   // TableInfo::fSyncUpdatesTI mutex should be locked when accessing
 | |
|   // this data member (see fColumnLocks discussion).
 | |
|   unsigned fParseComplete;  // Num of columns that are parseComplete
 | |
| 
 | |
|   unsigned fTotalRows;  // Max rows this buffer can now hold;
 | |
|   //   size of fTokens array
 | |
|   std::vector<std::pair<RID, std::string> > fRowStatus;  // Status of bad rows
 | |
|   std::vector<std::string> fErrRows;                     // Rejected rows to write to .bad file
 | |
| 
 | |
|   uint32_t fTotalReadRows;  // Total valid rows read into buffer;
 | |
|   //   this count excludes rejected rows
 | |
|   uint32_t fTotalReadRowsParser;  // for temporary use by parser
 | |
| 
 | |
|   uint32_t fTotalReadRowsForLog;  // Total rows read into this buffer
 | |
|   //   including invalid rows
 | |
| 
 | |
|   RID fStartRow;  // Starting row id for rows in buffer,
 | |
|   //   relative to start of job.
 | |
|   //   Rejected rows are excluded.
 | |
|   RID fStartRowParser;  // for temporary use by parser
 | |
| 
 | |
|   RID fStartRowForLogging;  // Starting row id for rows in buffer,
 | |
|   //   relative to start of current input
 | |
|   //   file.  All rows are counted.
 | |
|   RID fStartRowForLoggingParser;  // for temporary use by parser
 | |
| 
 | |
|   uint32_t fAutoIncGenCount;  // How many auto-increment values are
 | |
|   //   to be generated for current buffer
 | |
|   uint32_t fAutoIncGenCountParser;  // for temporary use by parser
 | |
| 
 | |
|   uint64_t fAutoIncNextValue;  // Next auto-increment value assign to
 | |
|   //   a row in this buffer
 | |
|   unsigned fNumberOfColumns;  // Number of ColumnInfo objs in table
 | |
| 
 | |
|   ColPosPair** fTokens;  // Vector of start and offsets for the
 | |
|   //   column values read from tbl files
 | |
|   ColPosPair** fTokensParser;  // for temporary use by parser
 | |
| 
 | |
|   char fColDelim;                // Character to delimit columns in a row
 | |
|   unsigned fBufferSize;          // Size of input read buffer (fData)
 | |
|   unsigned fReadSize;            // Number of bytes in read buffer(fData)
 | |
|   boost::mutex fSyncUpdatesBLB;  // Mutex to synchronize updates
 | |
|   Log* fLog;                     // Logger object
 | |
|   bool fNullStringMode;          // Indicates if "NULL" string is to be
 | |
|   //   treated as a NULL value or not
 | |
|   char fEnclosedByChar;           // Optional char to enclose col values
 | |
|   char fEscapeChar;               // Used to escape enclosed character
 | |
|   int fBufferId;                  // Id for this read buffer
 | |
|   std::string fTableName;         // Table assigned to this read buffer
 | |
|   JobFieldRefList fFieldList;     // Complete list of cols and flds
 | |
|   unsigned int fNumFieldsInFile;  // Number of fields in input file
 | |
|   //   (including fields to be ignored)
 | |
|   unsigned int fNumColsInFile;  // Number of flds in input file targeted
 | |
|   //   for db cols (omits default cols)
 | |
|   bool fbTruncationAsError;         // Treat string truncation as error
 | |
|   ImportDataMode fImportDataMode;   // Import data in text or binary mode
 | |
|   long fTimeZone;                   // Timezone offset (in seconds) relative to UTC,
 | |
|                                     // to use for TIMESTAMP data type. For example,
 | |
|                                     // for EST which is UTC-5:00, offset will be -18000s.
 | |
|   unsigned int fFixedBinaryRecLen;  // Fixed rec len used in binary mode
 | |
| 
 | |
|   //--------------------------------------------------------------------------
 | |
|   // Private Functions
 | |
|   //--------------------------------------------------------------------------
 | |
| 
 | |
|   /** @brief Copy constructor
 | |
|    */
 | |
|   BulkLoadBuffer(const BulkLoadBuffer& buffer);
 | |
| 
 | |
|   /** @brief Assignment operator
 | |
|    */
 | |
|   BulkLoadBuffer& operator=(const BulkLoadBuffer& buffer);
 | |
| 
 | |
|   /** @brief Convert the buffer data depending upon the data type
 | |
|    */
 | |
|   void convert(char* field, int fieldLength, bool nullFlag, unsigned char* output, const JobColumn& column,
 | |
|                BLBufferStats& bufStats);
 | |
| 
 | |
|   /** @brief Copy the overflow data
 | |
|    */
 | |
|   void copyOverflow(const BulkLoadBuffer& buffer);
 | |
| 
 | |
|   /** @brief Parse a Read buffer for a nonDictionary column
 | |
|    */
 | |
|   int parseCol(ColumnInfo& columnInfo);
 | |
| 
 | |
|   /** @brief Parse a Read buffer for a nonDictionary column
 | |
|    */
 | |
|   void parseColLogMinMax(std::ostringstream& oss, ColDataType colDataType, int64_t minBufferVal,
 | |
|                          int64_t maxBufferVal) const;
 | |
| 
 | |
|   /** @brief Parse a Read buffer for a Dictionary column
 | |
|    */
 | |
|   int parseDict(ColumnInfo& columnInfo);
 | |
| 
 | |
|   /** @brief Parse a Dictionary Read buffer into a ColumnBufferSection.
 | |
|    *
 | |
|    * Parses the Read buffer into a section up to the point at which
 | |
|    * the buffer crosses an extent boundary.
 | |
|    *
 | |
|    * @param columnInfo    Column being parsed
 | |
|    * @oaram tokenPos      Position of rows to be parsed, in fTokens.
 | |
|    * @param startRow      Row id of first row in buffer to be parsed.
 | |
|    *        Row id is relative to all the rows in this import.
 | |
|    * @param totalReadRows Number of buffer rows ready to be parsed
 | |
|    * @param nRowsParsed   Number of buffer rows that were parsed
 | |
|    */
 | |
|   int parseDictSection(ColumnInfo& columnInfo, int tokenPos, RID startRow, uint32_t totalReadRows,
 | |
|                        uint32_t& nRowsParsed);
 | |
| 
 | |
|   /** @brief Expand the size of the fTokens array
 | |
|    */
 | |
|   void resizeTokenArray();
 | |
| 
 | |
|   /** @brief tokenize the buffer contents and fill up the token array.
 | |
|    */
 | |
|   void tokenize(const boost::ptr_vector<ColumnInfo>& columnsInfo, int allowedErrCntThisCall,
 | |
|                 size_t& skipRows);
 | |
| 
 | |
|   /** @brief Binary tokenization of the buffer, and fill up the token array.
 | |
|    */
 | |
|   int tokenizeBinary(const boost::ptr_vector<ColumnInfo>& columnsInfo, int allowedErrCntThisCall,
 | |
|                      bool bEndOfData);
 | |
| 
 | |
|   /** @brief Determine if specified value is NULL or not.
 | |
|    */
 | |
|   bool isBinaryFieldNull(void* val, WriteEngine::ColType ct, execplan::CalpontSystemCatalog::ColDataType dt);
 | |
| 
 | |
|  public:
 | |
|   /** @brief Constructor
 | |
|    * @param noOfCol Number of columns
 | |
|    * @param bufferSize Buffer size
 | |
|    * @param logger The Log object used for logging
 | |
|    * @param bufferId Id assigned to this buffer
 | |
|    * @param tableName Name of table associated with this read buffer
 | |
|    * @param jobFieldRefList Complete list of cols/flds listed in Job XML file
 | |
|    */
 | |
|   BulkLoadBuffer(unsigned noOfCols, unsigned bufferSize, Log* logger, int bufferId,
 | |
|                  const std::string& tableName, const JobFieldRefList& jobFieldRefList);
 | |
| 
 | |
|   /** @brief Destructor
 | |
|    */
 | |
|   ~BulkLoadBuffer();
 | |
| 
 | |
|   /** @brief Resets the values of the members (excluding column locks)
 | |
|    */
 | |
|   void reset();
 | |
| 
 | |
|   /** @brief Resets the column locks.
 | |
|    * TableInfo::fSyncUpdatesTI mutex should be locked when calling this
 | |
|    * function (see fColumnLocks discussion).
 | |
|    */
 | |
|   void resetColumnLocks();
 | |
| 
 | |
|   /** @brief Get the buffer status
 | |
|    */
 | |
|   Status getStatusBLB() const
 | |
|   {
 | |
|     return fStatusBLB;
 | |
|   }
 | |
| 
 | |
|   /** @brief Set the buffer status
 | |
|    */
 | |
|   void setStatusBLB(const Status& status)
 | |
|   {
 | |
|     fStatusBLB = status;
 | |
|   }
 | |
| 
 | |
|   /** @brief Try to lock a column for the buffer
 | |
|    * TableInfo::fSyncUpdatesTI mutex should be locked when calling this
 | |
|    * function (see fColumnLocks discussion).
 | |
|    */
 | |
|   bool tryAndLockColumn(const int& columnId, const int& id);
 | |
| 
 | |
|   int fillFromMemory(const BulkLoadBuffer& overFlowBufIn, const char* input, size_t length,
 | |
|                      size_t* parse_length, size_t& skipRows, RID& totalReadRows, RID& correctTotalRows,
 | |
|                      const boost::ptr_vector<ColumnInfo>& columnsInfo, int allowedErrCntThisCall);
 | |
| 
 | |
|   /** @brief Read the table data into the buffer
 | |
|    */
 | |
|   int fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* handle, size_t& skipRows, RID& totalRows,
 | |
|                    RID& correctTotalRows, const boost::ptr_vector<ColumnInfo>& columnsInfo,
 | |
|                    int allowedErrCntThisCall);
 | |
| 
 | |
|   /** @brief Get the overflow size
 | |
|    */
 | |
|   int getOverFlowSize() const
 | |
|   {
 | |
|     return fOverflowSize;
 | |
|   }
 | |
| 
 | |
|   /** @brief Parse the buffer data
 | |
|    */
 | |
|   int parse(ColumnInfo& columnInfo);
 | |
| 
 | |
|   /** @brief Set the delimiter used to delimit the columns within a row
 | |
|    */
 | |
|   void setColDelimiter(const char& delim)
 | |
|   {
 | |
|     fColDelim = delim;
 | |
|   }
 | |
| 
 | |
|   /** @brief Set mode to treat "NULL" string as NULL value or not.
 | |
|    */
 | |
|   void setNullStringMode(bool bMode)
 | |
|   {
 | |
|     fNullStringMode = bMode;
 | |
|   }
 | |
| 
 | |
|   /** @brief Set character optionally used to enclose input column values.
 | |
|    */
 | |
|   void setEnclosedByChar(char enChar)
 | |
|   {
 | |
|     fEnclosedByChar = enChar;
 | |
|   }
 | |
| 
 | |
|   /** @brief Set escape char to use in conjunction with enclosed by char.
 | |
|    */
 | |
|   void setEscapeChar(char esChar)
 | |
|   {
 | |
|     fEscapeChar = esChar;
 | |
|   }
 | |
| 
 | |
|   /** @brief Get the column status
 | |
|    *  TableInfo::fSyncUpdatesTI mutex should be locked when calling this
 | |
|    *  function (see fColumnLocks discussion).
 | |
|    */
 | |
|   Status getColumnStatus(const int& columnId) const
 | |
|   {
 | |
|     return fColumnLocks[columnId].status;
 | |
|   }
 | |
| 
 | |
|   /** @brief Set the column status
 | |
|    *  TableInfo::fSyncUpdatesTI mutex should be locked when calling this
 | |
|    *  function (see fColumnLocks discussion).
 | |
|    *  @returns TRUE if all columns in the buffer are complete.
 | |
|    */
 | |
|   bool setColumnStatus(const int& columnId, const Status& status);
 | |
| 
 | |
|   /** @brief Get the error row status's
 | |
|    */
 | |
|   const std::vector<std::pair<RID, std::string> >& getErrorRows() const
 | |
|   {
 | |
|     return fRowStatus;
 | |
|   }
 | |
| 
 | |
|   /** @brief Get the error rows
 | |
|    */
 | |
|   const std::vector<std::string>& getExactErrorRows() const
 | |
|   {
 | |
|     return fErrRows;
 | |
|   }
 | |
| 
 | |
|   void clearErrRows()
 | |
|   {
 | |
|     fRowStatus.clear();
 | |
|     fErrRows.clear();
 | |
|   }
 | |
| 
 | |
|   /** @brief Get the column locker.
 | |
|    *  TableInfo::fSyncUpdatesTI mutex should be locked when calling this
 | |
|    *  function (see fColumnLocks discussion).
 | |
|    */
 | |
|   int getColumnLocker(const int& columnId) const
 | |
|   {
 | |
|     return fColumnLocks[columnId].locker;
 | |
|   }
 | |
| 
 | |
|   /** @brief set truncation as error for this import.
 | |
|    */
 | |
|   void setTruncationAsError(bool bTruncationAsError)
 | |
|   {
 | |
|     fbTruncationAsError = bTruncationAsError;
 | |
|   }
 | |
| 
 | |
|   /** @brief retrieve the tuncation as error setting for this
 | |
|    *  import. When set, this causes char and varchar strings
 | |
|    *  that are longer than the column definition to be treated
 | |
|    *  as errors instead of warnings.
 | |
|    */
 | |
|   bool getTruncationAsError() const
 | |
|   {
 | |
|     return fbTruncationAsError;
 | |
|   }
 | |
| 
 | |
|   /** @brief Set text vs binary import mode along with corresponding fixed
 | |
|    *         record length that is used if the binary mode is set to TRUE.
 | |
|    */
 | |
|   void setImportDataMode(ImportDataMode importMode, unsigned int fixedBinaryRecLen)
 | |
|   {
 | |
|     fImportDataMode = importMode;
 | |
|     fFixedBinaryRecLen = fixedBinaryRecLen;
 | |
|   }
 | |
| 
 | |
|   /** @brief set timezone.
 | |
|    */
 | |
|   void setTimeZone(long timeZone)
 | |
|   {
 | |
|     fTimeZone = timeZone;
 | |
|   }
 | |
| };
 | |
| 
 | |
| inline bool isTrueWord(const char* field, int fieldLength)
 | |
| {
 | |
|   // return false;
 | |
|   return fieldLength == 4 && (field[0] == 'T' || field[0] == 't') && (field[1] == 'R' || field[1] == 'r') &&
 | |
|          (field[2] == 'U' || field[2] == 'u') && (field[3] == 'E' || field[3] == 'e');
 | |
| }
 | |
| 
 | |
| }  // namespace WriteEngine
 |