1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

feature(cpimport): MCOL-5164 ignore all errors (-e all)

This commit is contained in:
Aleksei Antipovskii
2025-06-20 17:20:40 +02:00
committed by Leonid Fedorov
parent 7dca1da8f2
commit 1ce46b5e0b
15 changed files with 209 additions and 56 deletions

View File

@ -530,14 +530,14 @@ int BulkLoad::preProcess(Job& job, int tableNo, std::shared_ptr<TableInfo>& tabl
if (pwd)
tableInfo->setUIDGID(pwd->pw_uid, pwd->pw_gid);
if (fMaxErrors != -1)
if (fMaxErrors != MAX_ERRORS_DEFAULT)
tableInfo->setMaxErrorRows(fMaxErrors);
else
tableInfo->setMaxErrorRows(job.jobTableList[tableNo].maxErrNum);
// @bug 3929: cpimport.bin error messaging using up too much memory.
// Validate that max allowed error count is within valid range
long long maxErrNum = tableInfo->getMaxErrorRows();
int maxErrNum = tableInfo->getMaxErrorRows();
if (maxErrNum > MAX_ALLOW_ERROR_COUNT)
{

View File

@ -129,7 +129,7 @@ class BulkLoad : public FileOp
void setEscapeChar(char esChar);
void setSkipRows(size_t skipRows);
void setKeepRbMetaFiles(bool keepMeta);
void setMaxErrorCount(unsigned int maxErrors);
void setMaxErrorCount(int maxErrors);
void setNoOfParseThreads(int parseThreads);
void setNoOfReadThreads(int readThreads);
void setNullStringMode(bool bMode);
@ -184,13 +184,13 @@ class BulkLoad : public FileOp
Log fLog; // logger
int fNumOfParser; // total number of parser
int fNumOfParser{0}; // total number of parser
char fColDelim{0}; // delimits col values within a row
int fNoOfBuffers{-1}; // Number of read buffers
int fBufferSize{-1}; // Read buffer size
int fFileVbufSize{-1}; // Internal file system buffer size
long long fMaxErrors{-1}; // Max allowable errors per job
long long fMaxErrors{MAX_ERRORS_DEFAULT}; // Max allowable errors per job
std::string fAlternateImportDir; // Alternate bulk import directory
std::string fErrorDir; // Opt. where error records record
std::string fProcessName; // Application process name
@ -429,10 +429,7 @@ inline void BulkLoad::setKeepRbMetaFiles(bool keepMeta)
fKeepRbMetaFiles = keepMeta;
}
// Mutator takes an unsigned int, but we store in a long long, because...
// TableInfo which eventually needs this attribute, takes an unsigned int,
// but we want to be able to init to -1, to indicate when it has not been set.
inline void BulkLoad::setMaxErrorCount(unsigned int maxErrors)
// Set the maximum number of allowable errors for this job (stored in
// fMaxErrors). preProcess() forwards the value to each TableInfo unless it is
// still MAX_ERRORS_DEFAULT, in which case the per-table maxErrNum from the job
// description is used instead.
inline void BulkLoad::setMaxErrorCount(int maxErrors)
{
fMaxErrors = maxErrors;
}

View File

@ -2049,7 +2049,7 @@ int BulkLoadBuffer::parseDictSection(ColumnInfo& columnInfo, int tokenPos, RID s
int BulkLoadBuffer::fillFromMemory(const BulkLoadBuffer& overFlowBufIn, const char* input, size_t length,
size_t* parse_length, size_t& skipRows, RID& totalReadRows,
RID& correctTotalRows, const boost::ptr_vector<ColumnInfo>& columnsInfo,
unsigned int allowedErrCntThisCall)
int allowedErrCntThisCall)
{
boost::mutex::scoped_lock lock(fSyncUpdatesBLB);
reset();
@ -2153,7 +2153,7 @@ int BulkLoadBuffer::fillFromMemory(const BulkLoadBuffer& overFlowBufIn, const ch
int BulkLoadBuffer::fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* handle, size_t& skipRows,
RID& totalReadRows, RID& correctTotalRows,
const boost::ptr_vector<ColumnInfo>& columnsInfo,
unsigned int allowedErrCntThisCall)
int allowedErrCntThisCall)
{
boost::mutex::scoped_lock lock(fSyncUpdatesBLB);
reset();
@ -2277,7 +2277,7 @@ int BulkLoadBuffer::fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* hand
// depending on whether the user has enabled the "enclosed by" feature.
//------------------------------------------------------------------------------
void BulkLoadBuffer::tokenize(const boost::ptr_vector<ColumnInfo>& columnsInfo,
unsigned int allowedErrCntThisCall, size_t& skipRows)
int allowedErrCntThisCall, size_t& skipRows)
{
unsigned offset = 0; // length of field
unsigned curCol = 0; // dest db column counter within a row
@ -2789,7 +2789,7 @@ void BulkLoadBuffer::tokenize(const boost::ptr_vector<ColumnInfo>& columnsInfo,
// Quit if we exceed max allowable errors for this call.
// We set lastRowHead = p, so that the code that follows this
// loop won't try to save any data in fOverflowBuf.
if (errorCount > allowedErrCntThisCall)
if (allowedErrCntThisCall != MAX_ERRORS_ALL && errorCount > static_cast<unsigned>(allowedErrCntThisCall))
{
lastRowHead = p + 1;
p++;
@ -2928,7 +2928,7 @@ void BulkLoadBuffer::resizeTokenArray()
// then tokenize() will stop reading data and exit.
//------------------------------------------------------------------------------
int BulkLoadBuffer::tokenizeBinary(const boost::ptr_vector<ColumnInfo>& columnsInfo,
unsigned int allowedErrCntThisCall, bool bEndOfData)
int allowedErrCntThisCall, bool bEndOfData)
{
unsigned curCol = 0; // dest db column counter within a row
unsigned curRowNum = 0; // "total" number of rows read during this call
@ -3082,7 +3082,7 @@ int BulkLoadBuffer::tokenizeBinary(const boost::ptr_vector<ColumnInfo>& columnsI
errorCount++;
// Quit if we exceed max allowable errors for this call
if (errorCount > allowedErrCntThisCall)
if (allowedErrCntThisCall != MAX_ERRORS_ALL && errorCount > static_cast<unsigned>(allowedErrCntThisCall))
break;
}

View File

@ -215,12 +215,12 @@ class BulkLoadBuffer
/** @brief tokenize the buffer contents and fill up the token array.
*/
void tokenize(const boost::ptr_vector<ColumnInfo>& columnsInfo, unsigned int allowedErrCntThisCall,
void tokenize(const boost::ptr_vector<ColumnInfo>& columnsInfo, int allowedErrCntThisCall,
size_t& skipRows);
/** @brief Binary tokenization of the buffer, and fill up the token array.
*/
int tokenizeBinary(const boost::ptr_vector<ColumnInfo>& columnsInfo, unsigned int allowedErrCntThisCall,
int tokenizeBinary(const boost::ptr_vector<ColumnInfo>& columnsInfo, int allowedErrCntThisCall,
bool bEndOfData);
/** @brief Determine if specified value is NULL or not.
@ -275,13 +275,13 @@ class BulkLoadBuffer
int fillFromMemory(const BulkLoadBuffer& overFlowBufIn, const char* input, size_t length,
size_t* parse_length, size_t& skipRows, RID& totalReadRows, RID& correctTotalRows,
const boost::ptr_vector<ColumnInfo>& columnsInfo, unsigned int allowedErrCntThisCall);
const boost::ptr_vector<ColumnInfo>& columnsInfo, int allowedErrCntThisCall);
/** @brief Read the table data into the buffer
*/
int fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* handle, size_t& skipRows, RID& totalRows,
RID& correctTotalRows, const boost::ptr_vector<ColumnInfo>& columnsInfo,
unsigned int allowedErrCntThisCall);
int allowedErrCntThisCall);
/** @brief Get the overflow size
*/

View File

@ -70,8 +70,7 @@ WECmdArgs::WECmdArgs(int argc, char** argv)
DECLARE_INT_ARG("read-buffer-size,c", fReadBufSize, 1, INT_MAX,
"Application read buffer size (in bytes)")
DECLARE_INT_ARG("debug,d", fDebugLvl, 1, 3, "Print different level(1-3) debug message")
DECLARE_INT_ARG("max-errors,e", fMaxErrors, 0, INT_MAX,
"Maximum number of allowable error per table per PM")
("max-errors,e", po::value<string>(), "Maximum number (or 'all') of allowable error per table per PM")
("file-path,f", po::value<string>(&fPmFilePath),
"Data file directory path. Default is current working directory.\n"
"\tIn Mode 1, represents the local input file path.\n"
@ -304,6 +303,24 @@ void WECmdArgs::parseCmdLineArgs(int argc, char** argv)
fAllowMissingColumn = true;
}
}
// Handle --max-errors/-e. The argument is either the literal "all", which
// selects the MAX_ERRORS_ALL sentinel (error limit disabled), or a decimal
// integer in the range [0, INT_MAX].
if (vm.contains("max-errors"))
{
  // Named maxErrArg rather than optarg to avoid shadowing the getopt() global.
  const auto& maxErrArg = vm["max-errors"].as<string>();
  if (maxErrArg == "all")
  {
    fMaxErrors = MAX_ERRORS_ALL;
  }
  else
  {
    // strtol() returns 0 WITHOUT setting errno when it finds no digits, and it
    // stops silently at the first character it cannot convert. Inspect the end
    // pointer so that "", "abc" and "10x" are rejected instead of being
    // accepted as 0, 0 and 10 respectively.
    const char* const begin = maxErrArg.c_str();
    char* end = nullptr;
    errno = 0;
    const long lValue = strtol(begin, &end, 10);
    const bool parsedWholeArg = (end != begin) && (*end == '\0');
    if (!parsedWholeArg || errno != 0 || lValue < 0 || lValue > INT_MAX)
    {
      startupError("Option --max-errors/-e is invalid or out of range");
    }
    fMaxErrors = lValue;
  }
}
if (fArgMode != -1)
fMode = fArgMode; // BUG 4210
@ -337,10 +354,7 @@ void WECmdArgs::fillParams(BulkLoad& curJob, std::string& sJobIdStr, std::string
curJob.setReadBufferCount(fIOReadBufSize);
curJob.setReadBufferSize(fReadBufSize);
if (fMaxErrors >= 0)
{
curJob.setMaxErrorCount(fMaxErrors);
}
curJob.setMaxErrorCount(fMaxErrors);
if (!fPmFilePath.empty())
{
importPath = fPmFilePath;

View File

@ -91,7 +91,7 @@ private:
int fNoOfReadThrds{1}; // No. of read buffers
int fDebugLvl{0}; // Debug level
int fMaxErrors{-1}; // Max allowable errors
int fMaxErrors{MAX_ERRORS_DEFAULT}; // Max allowable errors
int fReadBufSize{-1}; // Read buffer size
int fIOReadBufSize{-1}; // I/O read buffer size
int fSetBufSize{0}; // Buff size w/setvbuf

View File

@ -412,7 +412,11 @@ int TableInfo::readTableData()
// We keep a running total of read errors; fMaxErrorRows specifies
// the error limit. Here's where we see how many more errors we
// still have below the limit, and we pass this to fillFromFile().
unsigned allowedErrCntThisCall = ((fMaxErrorRows > fTotalErrRows) ? (fMaxErrorRows - fTotalErrRows) : 0);
int allowedErrCntThisCall;
if (fMaxErrorRows == MAX_ERRORS_ALL)
allowedErrCntThisCall = MAX_ERRORS_ALL;
else
allowedErrCntThisCall = static_cast<unsigned>(fMaxErrorRows) > fTotalErrRows ? fMaxErrorRows - fTotalErrRows : 0;
// Fill in the specified buffer.
// fTotalReadRowsPerInputFile is ongoing total number of rows read,
@ -485,7 +489,7 @@ int TableInfo::readTableData()
writeErrorList(&fBuffers[readBufNo].getErrorRows(), &fBuffers[readBufNo].getExactErrorRows(), false);
fBuffers[readBufNo].clearErrRows();
if (fTotalErrRows > fMaxErrorRows)
if (fMaxErrorRows != MAX_ERRORS_ALL && fTotalErrRows > static_cast<unsigned>(fMaxErrorRows))
{
// flush the reject data file and output the rejected rows
// flush err file and output the rejected row id and the reason.

View File

@ -85,7 +85,7 @@ class TableInfo : public WeUIDGID
// for this table. Is volatile to
// insure parser & reader threads
// see the latest value.
unsigned fMaxErrorRows; // Maximum error rows
int fMaxErrorRows; // Maximum error rows
int fLastBufferId; // Id of the last buffer
char* fFileBuffer; // File buffer passed to setvbuf()
int fCurrentParseBuffer; // Id of leading current buffer being
@ -298,7 +298,7 @@ class TableInfo : public WeUIDGID
/** @brief Get the number of maximum allowed error rows
*/
unsigned getMaxErrorRows() const;
int getMaxErrorRows() const;
/** @brief retrieve the truncation as error setting for this
* import. When set, this causes char and varchar strings
@ -309,7 +309,7 @@ class TableInfo : public WeUIDGID
/** @brief set the maximum number of error rows allowed
*/
void setMaxErrorRows(const unsigned int maxErrorRows);
void setMaxErrorRows(int maxErrorRows);
/** @brief Set mode to treat "NULL" string as NULL value or not.
*/
@ -513,7 +513,7 @@ inline Status TableInfo::getStatusTI() const
return fStatusTI;
}
inline unsigned TableInfo::getMaxErrorRows() const
// Return the maximum number of error rows configured for this table
// (fMaxErrorRows). The value may be the MAX_ERRORS_ALL sentinel, for which
// readTableData() skips the error-limit check entirely.
inline int TableInfo::getMaxErrorRows() const
{
return fMaxErrorRows;
}
@ -630,7 +630,7 @@ inline void TableInfo::setLoadFilesInput(bool bReadFromStdin, bool bReadFromS3,
fS3Region = s3region;
}
inline void TableInfo::setMaxErrorRows(const unsigned int maxErrorRows)
// Store the maximum number of error rows allowed for this table in
// fMaxErrorRows. The value is stored unchanged; readTableData() treats
// MAX_ERRORS_ALL as "no limit" and skips the error-limit check for it.
inline void TableInfo::setMaxErrorRows(int maxErrorRows)
{
fMaxErrorRows = maxErrorRows;
}