1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-5505 add parquet support for cpimport and add mcs_parquet_ddl and mcs_parquet_gen tools

This commit is contained in:
HanpyBin
2023-08-20 16:01:58 +08:00
committed by Leonid Fedorov
parent 94a680ea60
commit fe597ec78c
25 changed files with 4677 additions and 251 deletions

View File

@ -30,7 +30,7 @@
#include "we_columninfo.h"
#include "calpontsystemcatalog.h"
#include "dataconvert.h"
#include <utility>
#include <arrow/api.h>
namespace WriteEngine
{
class Log;
@ -84,6 +84,9 @@ class BulkLoadBuffer
char* fOverflowBuf; // Overflow data held for next buffer
unsigned fOverflowSize; // Current size of fOverflowBuf
std::shared_ptr<arrow::RecordBatch> fParquetBatch; // Batch read from the parquet file, waiting to be parsed
std::shared_ptr<arrow::RecordBatch> fParquetBatchParser; // Batch handle for temporary use by the parser
std::shared_ptr<::arrow::RecordBatchReader> fParquetReader; // Reader used to read batches of parquet data; installed via setParquetReader()
// Information about the locker and status for each column in this buffer.
// Note that TableInfo::fSyncUpdatesTI mutex is used to synchronize
// access to fColumnLocks and fParseComplete from both read and parse
@ -174,6 +177,19 @@ class BulkLoadBuffer
void convert(char* field, int fieldLength, bool nullFlag, unsigned char* output, const JobColumn& column,
BLBufferStats& bufStats);
/** @brief Parse a batch of parquet data in the read buffer for a non-dictionary column
 *
 * @param columnInfo Column the batch is parsed for (presumably also receives the
 *                   parsed values -- confirm against the definition).
 * @return Integer status code. NOTE(review): the success/failure values are not
 *         visible from this declaration -- confirm against the definition.
 */
int parseColParquet(ColumnInfo& columnInfo);
/** @brief Convert batch parquet data depending upon the data type
 *
 * @param columnData Arrow array holding the parquet values to convert
 *                   (presumably the values of a single column -- confirm).
 * @param buf        Raw byte buffer; presumably the destination of the converted
 *                   values, since it is non-const -- confirm against the definition.
 * @param column     Job description of the column being converted.
 * @param bufStats   Buffer statistics, passed by non-const reference.
 * @param lastInputRowInExtent, columnInfo, updateCPInfoPendingFlag, section
 *                   NOTE(review): the same arguments that updateCPMinMax() takes;
 *                   presumably forwarded to it per row -- confirm.
 */
void convertParquet(std::shared_ptr<arrow::Array> columnData, unsigned char* buf, const JobColumn& column,
BLBufferStats& bufStats, RID& lastInputRowInExtent, ColumnInfo& columnInfo,
bool& updateCPInfoPendingFlag, ColumnBufferSection* section);
/** @brief NOTE(review): undocumented in the original. Judging by the name, this
 * presumably updates the min/max (CP) bookkeeping for the row at index curRow --
 * confirm against the definition.
 *
 * All reference parameters are non-const, so any of them may be modified.
 *
 * The function is declared inline but no body is visible at this point; an
 * inline function must be defined in every translation unit that calls it.
 */
inline void updateCPMinMax(ColumnInfo& columnInfo, RID& lastInputRowInExtent, BLBufferStats& bufStats,
bool& updateCPInfoPendingFlag, ColumnBufferSection* section, uint32_t curRow);
/** @brief Copy the overflow data
 *
 * @param buffer Buffer passed by const reference; presumably the source whose
 *               overflow data is copied into this object -- confirm against
 *               the definition.
 */
void copyOverflow(const BulkLoadBuffer& buffer);
@ -263,6 +279,11 @@ class BulkLoadBuffer
fStatusBLB = status;
}
/** @brief Set the reader used to read batches of parquet data
 *
 * @param reader Record batch reader to store in fParquetReader. It is taken by
 *               value and moved into the member, so storing it costs no extra
 *               atomic reference-count increment/decrement; callers may pass
 *               either an lvalue (one copy) or an rvalue (no copy).
 */
void setParquetReader(std::shared_ptr<::arrow::RecordBatchReader> reader)
{
  fParquetReader = std::move(reader);
}
/** @brief Try to lock a column for the buffer
* TableInfo::fSyncUpdatesTI mutex should be locked when calling this
* function (see fColumnLocks discussion).
@ -273,6 +294,10 @@ class BulkLoadBuffer
size_t* parse_length, RID& totalReadRows, RID& correctTotalRows,
const boost::ptr_vector<ColumnInfo>& columnsInfo, unsigned int allowedErrCntThisCall);
/** @brief Read the batch data into the buffer
 *
 * @param totalReadRows    Row counter passed by non-const reference; presumably
 *                         updated with the number of rows read -- confirm.
 * @param correctTotalRows Row counter passed by non-const reference; presumably
 *                         updated with the number of valid rows -- confirm.
 * @return Integer status code. NOTE(review): the success/failure values are not
 *         visible from this declaration -- confirm against the definition.
 */
int fillFromFileParquet(RID& totalReadRows, RID& correctTotalRows);
/** @brief Read the table data into the buffer
*/
int fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* handle, RID& totalRows, RID& correctTotalRows,