
MCOL-5505 add parquet support for cpimport and add mcs_parquet_ddl and mcs_parquet_gen tools

Commit: fe597ec78c
Parent: 94a680ea60
Author: HanpyBin
Date: 2023-08-20 16:01:58 +08:00
Committed by: Leonid Fedorov
25 changed files with 4677 additions and 251 deletions
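For context on the new Parquet path: TableInfo::openTableFileParquet() (added below) drives Arrow's C++ Parquet reader. The following is a minimal standalone sketch of that open/scan/batch flow, using the same Arrow calls the commit uses; the file name and the batch size of 1000 are placeholders here, not values mandated by cpimport.

#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

#include <iostream>
#include <memory>
#include <string>

int main()
{
  // Hypothetical input file; cpimport takes the real name from fLoadFileList.
  const std::string fileName = "example.parquet";

  // Open the file through Arrow's I/O layer.
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(infile,
      arrow::io::ReadableFile::Open(fileName, arrow::default_memory_pool()));

  // Wrap it in a parquet::arrow::FileReader.
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));

  // Read in fixed-size record batches, mirroring the batch size used in the commit.
  reader->set_batch_size(1000);

  // ScanContents over column 0 yields the total row count (hence the int64_t).
  int64_t totalRows = 0;
  PARQUET_THROW_NOT_OK(reader->ScanContents({0}, 1000, &totalRows));

  // The record batch reader is what the read buffers later pull batches from.
  std::shared_ptr<arrow::RecordBatchReader> batchReader;
  PARQUET_THROW_NOT_OK(reader->GetRecordBatchReader(&batchReader));

  std::cout << fileName << ": " << totalRows << " rows" << std::endl;
  return 0;
}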


@@ -55,6 +55,9 @@ using namespace querytele;
#include "oamcache.h"
#include "cacheutils.h"
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>
namespace
{
const std::string BAD_FILE_SUFFIX = ".bad"; // Reject data file suffix
@@ -153,6 +156,8 @@ TableInfo::TableInfo(Log* logger, const BRM::TxnID txnID, const string& processN
, fRejectErrCnt(0)
, fExtentStrAlloc(tableOID, logger)
, fOamCachePtr(oam::OamCache::makeOamCache())
, fParquetReader(NULL)
, fReader(nullptr)
{
fBuffers.clear();
fColumns.clear();
@@ -266,24 +271,44 @@ int TableInfo::readTableData()
{
RID validTotalRows = 0;
RID totalRowsPerInputFile = 0;
int64_t totalRowsParquet = 0; // total Parquet row count; int64_t because the
// Arrow reader calls made later expect that type
int filesTBProcessed = fLoadFileList.size();
int fileCounter = 0;
unsigned long long qtSentAt = 0;
if (fHandle == NULL)
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFile();
if (rc != NO_ERROR)
if (fHandle == NULL)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
fFileName = fLoadFileList[fileCounter];
int rc = openTableFile();
if (rc != NO_ERROR)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
}
fileCounter++;
}
}
else
{
if (fParquetReader == NULL)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFileParquet(totalRowsParquet);
if (rc != NO_ERROR)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
}
fileCounter++;
}
fileCounter++;
}
timeval readStart;
@@ -419,16 +444,23 @@ int TableInfo::readTableData()
// validTotalRows is ongoing total of valid rows read for all files
// pertaining to this DB table.
int readRc;
if (fReadFromS3)
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
readRc = fBuffers[readBufNo].fillFromMemory(fBuffers[prevReadBuf], fFileBuffer, fS3ReadLength,
&fS3ParseLength, totalRowsPerInputFile, validTotalRows,
fColumns, allowedErrCntThisCall);
if (fReadFromS3)
{
readRc = fBuffers[readBufNo].fillFromMemory(fBuffers[prevReadBuf], fFileBuffer, fS3ReadLength,
&fS3ParseLength, totalRowsPerInputFile, validTotalRows,
fColumns, allowedErrCntThisCall);
}
else
{
readRc = fBuffers[readBufNo].fillFromFile(fBuffers[prevReadBuf], fHandle, totalRowsPerInputFile,
validTotalRows, fColumns, allowedErrCntThisCall);
}
}
else
{
readRc = fBuffers[readBufNo].fillFromFile(fBuffers[prevReadBuf], fHandle, totalRowsPerInputFile,
validTotalRows, fColumns, allowedErrCntThisCall);
readRc = fBuffers[readBufNo].fillFromFileParquet(totalRowsPerInputFile, validTotalRows);
}
if (readRc != NO_ERROR)
@@ -530,7 +562,7 @@ int TableInfo::readTableData()
fCurrentReadBuffer = (fCurrentReadBuffer + 1) % fReadBufCount;
// bufferCount++;
if ((fHandle && feof(fHandle)) || (fReadFromS3 && (fS3ReadLength == fS3ParseLength)))
if ((fHandle && feof(fHandle)) || (fReadFromS3 && (fS3ReadLength == fS3ParseLength)) || (totalRowsPerInputFile == (RID)totalRowsParquet))
{
timeval readFinished;
gettimeofday(&readFinished, NULL);
@@ -567,7 +599,15 @@
if (fileCounter < filesTBProcessed)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFile();
int rc;
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
rc = openTableFile();
}
else
{
rc = openTableFileParquet(totalRowsParquet);
}
if (rc != NO_ERROR)
{
@@ -1252,6 +1292,45 @@ void TableInfo::addColumn(ColumnInfo* info)
fExtentStrAlloc.addColumn(info->column.mapOid, info->column.width, info->column.dataType);
}
int TableInfo::openTableFileParquet(int64_t &totalRowsParquet)
{
if (fParquetReader != NULL)
return NO_ERROR;
std::shared_ptr<arrow::io::ReadableFile> infile;
try
{
PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(fFileName, arrow::default_memory_pool()));
PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &fReader));
fReader->set_batch_size(1000);
PARQUET_THROW_NOT_OK(fReader->ScanContents({0}, 1000, &totalRowsParquet));
PARQUET_THROW_NOT_OK(fReader->GetRecordBatchReader(&fParquetReader));
}
catch (std::exception& ex)
{
ostringstream oss;
oss << "Error opening import file " << fFileName << ".";
fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);
return ERR_FILE_OPEN;
}
catch (...)
{
ostringstream oss;
oss << "Error opening import file " << fFileName << ".";
fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);
return ERR_FILE_OPEN;
}
// initialize fBuffers batch source
for (int i = 0; i < fReadBufCount; ++i)
{
fBuffers[i].setParquetReader(fParquetReader);
}
return NO_ERROR;
}
//------------------------------------------------------------------------------
// Open the file corresponding to fFileName so that we can import its contents.
// A buffer is also allocated and passed to setvbuf().
@@ -1331,24 +1410,32 @@ int TableInfo::openTableFile()
//------------------------------------------------------------------------------
void TableInfo::closeTableFile()
{
if (fHandle)
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
// If reading from stdin, we don't delete the buffer out from under
// the file handle, because stdin is still open. This will cause a
// memory leak, but when using stdin, we can only read in 1 table.
// So it's not like we will be leaking multiple buffers for several
// tables over the life of the job.
if (!fReadFromStdin)
if (fHandle)
{
fclose(fHandle);
delete[] fFileBuffer;
// If reading from stdin, we don't delete the buffer out from under
// the file handle, because stdin is still open. This will cause a
// memory leak, but when using stdin, we can only read in 1 table.
// So it's not like we will be leaking multiple buffers for several
// tables over the life of the job.
if (!fReadFromStdin)
{
fclose(fHandle);
delete[] fFileBuffer;
}
fHandle = 0;
}
else if (ms3)
{
ms3_free((uint8_t*)fFileBuffer);
}
fHandle = 0;
}
else if (ms3)
else
{
ms3_free((uint8_t*)fFileBuffer);
fReader.reset();
fParquetReader.reset();
}
}
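The buffer-side consumption (setParquetReader() and fillFromFileParquet()) lives in BulkLoadBuffer and is not part of this file's hunks. As a rough sketch of the pattern, assuming each read buffer drains one record batch at a time from the shared reader until it is exhausted (the helper name and row handling are illustrative, not the commit's code):

#include <arrow/api.h>
#include <parquet/exception.h>

#include <memory>

// Hypothetical helper: pull the next batch from a shared RecordBatchReader and
// return how many rows it held; 0 means the reader is exhausted. This is roughly
// the shape of the work fillFromFileParquet() has to do per read buffer.
int64_t readOneBatch(const std::shared_ptr<arrow::RecordBatchReader>& reader)
{
  std::shared_ptr<arrow::RecordBatch> batch;
  PARQUET_THROW_NOT_OK(reader->ReadNext(&batch));

  if (!batch)  // a null batch signals end of stream
    return 0;

  // Each column of the batch is an arrow::Array; the real code converts these
  // values into ColumnStore's internal row representation.
  for (int col = 0; col < batch->num_columns(); ++col)
  {
    std::shared_ptr<arrow::Array> column = batch->column(col);
    (void)column;  // conversion elided in this sketch
  }
  return batch->num_rows();
}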