MCOL-5505 add parquet support for cpimport and add mcs_parquet_ddl and mcs_parquet_gen tools
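Background for review: the new import path is built on the Apache Arrow / Parquet C++ API. Below is a minimal standalone sketch (not part of this commit) of the record-batch reading pattern the diff wires into TableInfo and its read buffers: open the file, wrap it in a parquet::arrow::FileReader, set a batch size of 1000, and pull arrow::RecordBatch objects from a RecordBatchReader until it returns null. The file name, the main() wrapper, and the printout are illustrative assumptions; the API calls mirror the ones used in openTableFileParquet() further down.

// sketch.cpp - minimal Arrow/Parquet batch-reading example (illustrative only)
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

#include <iostream>
#include <memory>

int main()
{
  // Open the input file (hypothetical name) and wrap it in a Parquet-to-Arrow reader.
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(
      infile, arrow::io::ReadableFile::Open("example.parquet", arrow::default_memory_pool()));

  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  reader->set_batch_size(1000);  // same batch size openTableFileParquet() uses below

  // Stream the file as record batches.
  std::shared_ptr<arrow::RecordBatchReader> batchReader;
  PARQUET_THROW_NOT_OK(reader->GetRecordBatchReader(&batchReader));

  std::shared_ptr<arrow::RecordBatch> batch;
  while (true)
  {
    PARQUET_THROW_NOT_OK(batchReader->ReadNext(&batch));
    if (!batch)  // a null batch signals end of file
      break;
    std::cout << "read " << batch->num_rows() << " rows\n";
  }
  return 0;
}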
@@ -55,6 +55,9 @@ using namespace querytele;
#include "oamcache.h"
#include "cacheutils.h"

#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>
namespace
{
const std::string BAD_FILE_SUFFIX = ".bad"; // Reject data file suffix
@@ -153,6 +156,8 @@ TableInfo::TableInfo(Log* logger, const BRM::TxnID txnID, const string& processN
, fRejectErrCnt(0)
, fExtentStrAlloc(tableOID, logger)
, fOamCachePtr(oam::OamCache::makeOamCache())
, fParquetReader(NULL)
, fReader(nullptr)
{
fBuffers.clear();
fColumns.clear();
@@ -266,24 +271,44 @@ int TableInfo::readTableData()
{
RID validTotalRows = 0;
RID totalRowsPerInputFile = 0;
int64_t totalRowsParquet = 0; // totalRowsParquet to be used in later function
// needs int64_t type
int filesTBProcessed = fLoadFileList.size();
int fileCounter = 0;
unsigned long long qtSentAt = 0;

if (fHandle == NULL)
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFile();

if (rc != NO_ERROR)
if (fHandle == NULL)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
fFileName = fLoadFileList[fileCounter];
int rc = openTableFile();

if (rc != NO_ERROR)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
}
fileCounter++;
}
}
else
{
if (fParquetReader == NULL)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFileParquet(totalRowsParquet);
if (rc != NO_ERROR)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
}
fileCounter++;
}

fileCounter++;
}

timeval readStart;
@@ -419,16 +444,23 @@ int TableInfo::readTableData()
// validTotalRows is ongoing total of valid rows read for all files
// pertaining to this DB table.
int readRc;
if (fReadFromS3)
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
readRc = fBuffers[readBufNo].fillFromMemory(fBuffers[prevReadBuf], fFileBuffer, fS3ReadLength,
&fS3ParseLength, totalRowsPerInputFile, validTotalRows,
fColumns, allowedErrCntThisCall);
if (fReadFromS3)
{
readRc = fBuffers[readBufNo].fillFromMemory(fBuffers[prevReadBuf], fFileBuffer, fS3ReadLength,
&fS3ParseLength, totalRowsPerInputFile, validTotalRows,
fColumns, allowedErrCntThisCall);
}
else
{
readRc = fBuffers[readBufNo].fillFromFile(fBuffers[prevReadBuf], fHandle, totalRowsPerInputFile,
validTotalRows, fColumns, allowedErrCntThisCall);
}
}
else
{
readRc = fBuffers[readBufNo].fillFromFile(fBuffers[prevReadBuf], fHandle, totalRowsPerInputFile,
validTotalRows, fColumns, allowedErrCntThisCall);
readRc = fBuffers[readBufNo].fillFromFileParquet(totalRowsPerInputFile, validTotalRows);
}

if (readRc != NO_ERROR)
@@ -530,7 +562,7 @@ int TableInfo::readTableData()
fCurrentReadBuffer = (fCurrentReadBuffer + 1) % fReadBufCount;

// bufferCount++;
if ((fHandle && feof(fHandle)) || (fReadFromS3 && (fS3ReadLength == fS3ParseLength)))
if ((fHandle && feof(fHandle)) || (fReadFromS3 && (fS3ReadLength == fS3ParseLength)) || (totalRowsPerInputFile == (RID)totalRowsParquet))
{
timeval readFinished;
gettimeofday(&readFinished, NULL);
@@ -567,7 +599,15 @@ int TableInfo::readTableData()
if (fileCounter < filesTBProcessed)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFile();
int rc;
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
rc = openTableFile();
}
else
{
rc = openTableFileParquet(totalRowsParquet);
}

if (rc != NO_ERROR)
{
@@ -1252,6 +1292,45 @@ void TableInfo::addColumn(ColumnInfo* info)
fExtentStrAlloc.addColumn(info->column.mapOid, info->column.width, info->column.dataType);
}

int TableInfo::openTableFileParquet(int64_t &totalRowsParquet)
{
if (fParquetReader != NULL)
return NO_ERROR;
std::shared_ptr<arrow::io::ReadableFile> infile;
try
{
PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(fFileName, arrow::default_memory_pool()));
PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &fReader));
fReader->set_batch_size(1000);
PARQUET_THROW_NOT_OK(fReader->ScanContents({0}, 1000, &totalRowsParquet));
PARQUET_THROW_NOT_OK(fReader->GetRecordBatchReader(&fParquetReader));
}
catch (std::exception& ex)
{
ostringstream oss;
oss << "Error opening import file " << fFileName << ".";
fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);

return ERR_FILE_OPEN;
}
catch (...)
{
ostringstream oss;
oss << "Error opening import file " << fFileName << ".";
fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);

return ERR_FILE_OPEN;
}
// initialize fBuffers batch source
for (int i = 0; i < fReadBufCount; ++i)
{
fBuffers[i].setParquetReader(fParquetReader);
}
return NO_ERROR;

}

//------------------------------------------------------------------------------
// Open the file corresponding to fFileName so that we can import its contents.
// A buffer is also allocated and passed to setvbuf().
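A note on the row count above: openTableFileParquet() obtains the per-file total by scanning column 0 with ScanContents({0}, 1000, &totalRowsParquet). If the Parquet footer can be trusted, the same figure is also available from the file metadata without a scan; a possible alternative, offered only as an assumption and not what this commit does:

// hypothetical alternative: read the row count from the Parquet footer instead of scanning column 0
int64_t totalRowsParquet = fReader->parquet_reader()->metadata()->num_rows();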
@@ -1331,24 +1410,32 @@ int TableInfo::openTableFile()
//------------------------------------------------------------------------------
void TableInfo::closeTableFile()
{
if (fHandle)
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
// If reading from stdin, we don't delete the buffer out from under
// the file handle, because stdin is still open. This will cause a
// memory leak, but when using stdin, we can only read in 1 table.
// So it's not like we will be leaking multiple buffers for several
// tables over the life of the job.
if (!fReadFromStdin)
if (fHandle)
{
fclose(fHandle);
delete[] fFileBuffer;
// If reading from stdin, we don't delete the buffer out from under
// the file handle, because stdin is still open. This will cause a
// memory leak, but when using stdin, we can only read in 1 table.
// So it's not like we will be leaking multiple buffers for several
// tables over the life of the job.
if (!fReadFromStdin)
{
fclose(fHandle);
delete[] fFileBuffer;
}

fHandle = 0;
}
else if (ms3)
{
ms3_free((uint8_t*)fFileBuffer);
}

fHandle = 0;
}
else if (ms3)
else
{
ms3_free((uint8_t*)fFileBuffer);
fReader.reset();
fParquetReader.reset();
}
}