1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-10 01:22:48 +03:00
Files
mariadb-columnstore-engine/writeengine/bulk/cpimport.cpp
Gagan Goel 973e5024d8 MCOL-4957 Fix performance slowdown for processing TIMESTAMP columns.
Part 1:
 As part of MCOL-3776 to address synchronization issue while accessing
 the fTimeZone member of the Func class, mutex locks were added to the
 accessor and mutator methods. However, this slows down processing
 of TIMESTAMP columns in PrimProc significantly as all threads across
 all concurrently running queries would serialize on the mutex. This
 is because PrimProc only has a single global object for the functor
 class (class derived from Func in utils/funcexp/functor.h) for a given
 function name. To fix this problem:

   (1) We remove the fTimeZone as a member of the Func derived classes
   (hence removing the mutexes) and instead use the fOperationType
   member of the FunctionColumn class to propagate the timezone values
   down to the individual functor processing functions such as
   FunctionColumn::getStrVal(), FunctionColumn::getIntVal(), etc.

   (2) To achieve (1), a timezone member is added to the
   execplan::CalpontSystemCatalog::ColType class.

Part 2:
 Several functors in the Funcexp code call dataconvert::gmtSecToMySQLTime()
 and dataconvert::mySQLTimeToGmtSec() functions for conversion between seconds
 since unix epoch and broken-down representation. These functions in turn call
 the C library function localtime_r() which currently has a known bug of holding
 a global lock via a call to __tz_convert. This significantly reduces performance
 in multi-threaded applications where multiple threads concurrently call
 localtime_r(). More details on the bug:
   https://sourceware.org/bugzilla/show_bug.cgi?id=16145

 This bug in localtime_r() caused processing of the Functors in PrimProc to
 slow down significantly, since query execution causes the Functors code to
 be processed in a multi-threaded manner.

 As a fix, we remove the calls to localtime_r() from gmtSecToMySQLTime()
 and mySQLTimeToGmtSec() by performing the timezone-to-offset conversion
 (done in dataconvert::timeZoneToOffset()) during the execution plan
 creation in the plugin. Note that localtime_r() is only called when the
 time_zone system variable is set to "SYSTEM".

 This fix also required changing the timezone type from a std::string to
 a long across the system.
2022-02-14 14:12:27 -05:00

1368 lines
42 KiB
C++

/* Copyright (C) 2014 InfiniDB, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/*******************************************************************************
* $Id: cpimport.cpp 4726 2013-08-07 03:38:36Z bwilkinson $
*
*******************************************************************************/
#include <iostream>
#include <sstream>
#include <fstream>
#include <clocale>
#include <sys/types.h>
#include <unistd.h>
#include <csignal>
#include <cstring>
#include <string>
#include <cerrno>
#include <cstdlib>
#include <sys/time.h>
#ifndef _MSC_VER
#include <sys/resource.h>
#else
#include <cstdio>
#endif
#include <boost/filesystem/path.hpp>
#include "idberrorinfo.h"
#include "we_simplesyslog.h"
#include "we_bulkload.h"
#include "we_bulkstatus.h"
#include "we_config.h"
#include "we_xmljob.h"
#include "we_xmlgenproc.h"
#include "we_tempxmlgendata.h"
#include "liboamcpp.h"
#include "IDBPolicy.h"
#include "MonitorProcMem.h"
#include "dataconvert.h"
#include "mcsconfig.h"
using namespace std;
using namespace WriteEngine;
using namespace execplan;
// File-local state and constants used by the helper functions and main()
// in this file.
namespace
{
// Program name; main() assigns argv[0] to it, and startupError() prints it
// in the "Try '<pgm> -h'" hint.
char* pgmName = 0;
// Import directory value that printInputSource() treats as "the current
// working directory".
const std::string IMPORT_PATH_CWD(".");
// Set to true by parseCmdLineArgs() when -d is given a level in 1..3;
// main() uses it to emit extra progress messages through logInitiateMsg().
bool bDebug = false;
//@bug 4643: cpimport job ended during setup w/o any err msg.
// Added a try/catch with logging to main() in case
// the process was dying with an uncaught exception.
// Identifies the startup step main() is currently performing.
enum TASK
{
TASK_CMD_LINE_PARSING = 1,
TASK_INIT_CONFIG_CACHE = 2,
TASK_BRM_STATE_READY = 3,
TASK_BRM_STATE_READ_WRITE = 4,
TASK_SHUTDOWN_PENDING = 5,
TASK_SUSPEND_PENDING = 6,
TASK_ESTABLISH_JOBFILE = 7,
TASK_LOAD_JOBFILE = 8,
TASK_PROCESS_DATA = 9
};
// Text labels that line up positionally with the TASK values above; entry 0
// is an empty placeholder because TASK numbering starts at 1.
// NOTE(review): presumably indexed by TASK when reporting a failure; the use
// is not visible in this part of the file - confirm.
const char* taskLabels[] = {"",
"parsing command line options",
"initializing config cache",
"checking BRM Ready state",
"checking BRM Read/Write state",
"checking for pending shutdown",
"checking for pending suspend",
"establishing job file",
"loading job file",
"processing data"};
} // namespace
//------------------------------------------------------------------------------
// Print command line usage to stdout and then terminate the process with
// EXIT_SUCCESS. This function does not return to its caller.
//------------------------------------------------------------------------------
void printUsage()
{
// Synopsis for "simple" mode: schema/table given as positional parameters,
// no XML job file needed.
cout << endl
<< "Simple usage using positional parameters "
"(no XML job file):"
<< endl
<< " cpimport.bin dbName tblName [loadFile] [-j jobID] " << endl
<< " [-h] [-r readers] [-w parsers] [-s c] [-f path] [-b readBufs] " << endl
<< " [-c readBufSize] [-e maxErrs] [-B libBufSize] [-n NullOption] " << endl
<< " [-E encloseChar] [-C escapeChar] [-I binaryOpt] [-S] "
"[-d debugLevel] [-i] "
<< endl
<< " [-D] [-N] [-L rejectDir] [-T timeZone]" << endl
<< " [-U username]" << endl
<< endl;
// Synopsis for "traditional" mode: the job is described by an XML job file
// selected with -j.
cout << endl
<< "Traditional usage without positional parameters "
"(XML job file required):"
<< endl
<< " cpimport.bin -j jobID " << endl
<< " [-h] [-r readers] [-w parsers] [-s c] [-f path] [-b readBufs] " << endl
<< " [-c readBufSize] [-e maxErrs] [-B libBufSize] [-n NullOption] " << endl
<< " [-E encloseChar] [-C escapeChar] [-I binaryOpt] [-S] "
"[-d debugLevel] [-i] "
<< endl
<< " [-p path] [-l loadFile]" << endl
<< " [-D] [-N] [-L rejectDir] [-T timeZone]" << endl
<< " [-U username]" << endl
<< endl;
// Description of the positional parameters.
cout << " Positional parameters:" << endl
<< " dbName Name of database to load" << endl
<< " tblName Name of table to load" << endl
<< " loadFile Optional input file name in current directory, "
<< "unless a fully" << endl
<< " qualified name is given. If not given, "
<< "input read from stdin." << endl
<< endl;
// Description of each documented option. Options accepted by
// parseCmdLineArgs() but not listed here (e.g. -k, -m, -u, -P, -R, -X) are
// intentionally left out of the help text.
cout << " Options:" << endl
<< " -b Number of read buffers" << endl
<< " -c Application read buffer size (in bytes)" << endl
<< " -d Print different level (1-3) debug message " << endl
<< " -e Maximum number of allowable errors per table" << endl
<< " -f Data file directory path; " << endl
<< " In simple usage:" << endl
<< " Default is current working directory." << endl
<< " -f option only applies if loadFile is specified." << endl
<< " In traditional usage: " << endl
<< " Default is <BulkRoot>/data/import." << endl
<< " 'STDIN' (all caps) redirects input from stdin." << endl
<< " -h Print this message" << endl
<< " -i Print extended info to console, else this info only goes "
"to log file."
<< endl
<< " -j Job id. In simple usage, default is the table OID." << endl
<< " -l Name of input file to be loaded, relative to -f path," << endl
<< " unless a fully qualified input file name is given." << endl
<< " -n NullOption (0-treat the string NULL as data (default);" << endl
<< " 1-treat the string NULL as a NULL value)" << endl
<< " -p Path for XML job description file" << endl
<< " -r Number of readers" << endl
<< " -s 'c' is the delimiter between column values" << endl
<< " -w Number of parsers" << endl
<< " -B I/O library read buffer size (in bytes)" << endl
<< " -E Enclosed by character if field values are enclosed" << endl
<< " -C Escape character used in conjunction with 'enclosed by' "
<< "character," << endl
<< " or as part of NULL escape sequence ('\\N'); default is '\\'" << endl
<< " -I Binary import; binaryOpt 1-import NULL values" << endl
<< " 2-saturate NULL values" << endl
<< " -S Treat string truncations as errors" << endl
<< " -D Disable timeout when waiting for table lock" << endl
<< " -N Disable console output" << endl
<< " -L send *.err and *.bad (reject) files here" << endl
<< " -T Timezone used for TIMESTAMP datatype" << endl
<< " Possible values: \"SYSTEM\" (default)" << endl
<< " : Offset in the form +/-HH:MM" << endl
<< endl
<< " -y S3 Authentication Key (for S3 imports)" << endl
<< " -K S3 Authentication Secret (for S3 imports)" << endl
<< " -t S3 Bucket (for S3 imports)" << endl
<< " -H S3 Hostname (for S3 imports, Amazon's S3 default)" << endl
<< " -g S3 Regions (for S3 imports)" << endl
<< " -U username of new data files owner. Default is mysql" << endl;
// Worked examples.
cout << " Example1:" << endl
<< " cpimport.bin -j 1234" << endl
<< " Example2: Some column values are enclosed within double quotes." << endl
<< " cpimport.bin -j 3000 -E '\"'" << endl
<< " Example3: Import a nation table without a Job XML file" << endl
<< " cpimport.bin -j 301 tpch nation nation.tbl" << endl;
// Asking for help is not an error, so exit with a success status.
exit(EXIT_SUCCESS);
}
//------------------------------------------------------------------------------
// Signal handler to catch SIGTERM signal to terminate the process
//------------------------------------------------------------------------------
void handleSigTerm(int i)
{
std::cout << "Received SIGTERM to terminate the process..." << std::endl;
BulkStatus::setJobStatus(EXIT_FAILURE);
}
//------------------------------------------------------------------------------
// SIGINT (Control-C) handler.
// Announces the interrupt on stdout unless console output is disabled, then
// records EXIT_FAILURE as the job status.
// i - signal number delivered to the handler (not used)
//------------------------------------------------------------------------------
void handleControlC(int i)
{
  const bool consoleEnabled = !BulkLoad::disableConsoleOutput();

  if (consoleEnabled)
  {
    std::cout << "Received Control-C to terminate the process..." << std::endl;
  }

  BulkStatus::setJobStatus(EXIT_FAILURE);
}
#ifdef _MSC_VER
// Console control handler, compiled only when _MSC_VER is defined.
// setupSignalHandlers() registers it through SetConsoleCtrlHandler().
// It writes a debug-level message to syslog, forwards the control type to
// handleControlC() (which marks the job as failed), and returns true.
// dwCtrlType - control type value supplied by the caller; passed through
//              unchanged to handleControlC().
BOOL WINAPI HandlerCtrlCRoutine(_In_ DWORD dwCtrlType)
{
// Log to syslog
logging::Message::Args errMsgArgs;
errMsgArgs.add("Received Break to terminate the process");
SimpleSysLog::instance()->logMsg(errMsgArgs, logging::LOG_TYPE_DEBUG, logging::M0087);
// Reuse the common Control-C handling.
handleControlC(dwCtrlType);
return true;
}
#endif
//------------------------------------------------------------------------------
// If error occurs during startup, this function is called to log the specified
// message and terminate the process.
//------------------------------------------------------------------------------
void startupError(const std::string& errMsg, bool showHint)
{
// Log to console
if (!BulkLoad::disableConsoleOutput())
cerr << errMsg << endl;
if (showHint)
{
std::ostringstream oss;
oss << "Try '" << pgmName << " -h' for more information.";
if (!BulkLoad::disableConsoleOutput())
cerr << oss.str() << endl;
}
// Log to syslog
logging::Message::Args errMsgArgs;
errMsgArgs.add(errMsg);
SimpleSysLog::instance()->logMsg(errMsgArgs, logging::LOG_TYPE_ERROR, logging::M0087);
std::string jobIdStr("0");
logging::Message::Args endMsgArgs;
endMsgArgs.add(jobIdStr);
endMsgArgs.add("FAILED");
SimpleSysLog::instance()->logMsg(endMsgArgs, logging::LOG_TYPE_INFO, logging::M0082);
exit(EXIT_FAILURE);
}
//------------------------------------------------------------------------------
// Install the process-wide signal dispositions.
// When _MSC_VER is defined: register HandlerCtrlCRoutine as the console
// control handler.
// Otherwise: SIGPIPE and SIGHUP are ignored, SIGINT is routed to
// handleControlC() and SIGTERM to handleSigTerm().
//------------------------------------------------------------------------------
void setupSignalHandlers()
{
#ifdef _MSC_VER
  BOOL brtn = SetConsoleCtrlHandler(HandlerCtrlCRoutine, true);
#else
  struct sigaction action;

  // SIGPIPE and SIGHUP are simply ignored.
  const int ignoredSignals[] = {SIGPIPE, SIGHUP};
  const size_t ignoredCount = sizeof(ignoredSignals) / sizeof(ignoredSignals[0]);

  for (size_t idx = 0; idx < ignoredCount; ++idx)
  {
    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_IGN;
    sigaction(ignoredSignals[idx], &action, 0);
  }

  // @bug 4344: SIGINT (Control-C) is deliberately NOT ignored; it is caught
  // so that the program can be terminated from the keyboard.
  memset(&action, 0, sizeof(action));
  action.sa_handler = handleControlC;
  sigaction(SIGINT, &action, 0);

  // SIGTERM is caught as well so the job can be flagged as failed.
  memset(&action, 0, sizeof(action));
  action.sa_handler = handleSigTerm;
  sigaction(SIGTERM, &action, 0);
#endif
}
//------------------------------------------------------------------------------
// Parse the command line arguments
//------------------------------------------------------------------------------
void parseCmdLineArgs(int argc, char** argv, BulkLoad& curJob, std::string& sJobIdStr,
std::string& sXMLJobDir, std::string& sModuleIDandPID, bool& bLogInfo2ToConsole,
std::string& xmlGenSchema, std::string& xmlGenTable, bool& bValidateColumnList)
{
std::string importPath;
std::string rptFileName;
int option;
bool bImportFileArg = false;
BulkModeType bulkMode = BULK_MODE_LOCAL;
std::string jobUUID;
while ((option = getopt(argc, argv, "b:c:d:e:f:hij:kl:m:n:p:r:s:u:w:B:C:DE:I:P:R:ST:X:NL:y:K:t:H:g:U:")) !=
EOF)
{
switch (option)
{
case 'b': // -b: no. of read buffers
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 1) || (lValue > INT_MAX))
{
startupError(std::string("Option -b is invalid or out of range."), true);
}
int noOfReadBuffers = lValue;
curJob.setReadBufferCount(noOfReadBuffers);
break;
}
case 'c': // -c: read buffer size
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 1) || (lValue > INT_MAX))
{
startupError(std::string("Option -c is invalid or out of range."), true);
}
int readBufferSize = lValue;
curJob.setReadBufferSize(readBufferSize);
break;
}
case 'd': // -d: debug level
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 1) || (lValue > INT_MAX))
{
startupError(std::string("Option -d is invalid or out of range."), true);
}
int debugLevel = lValue;
if (debugLevel > 0 && debugLevel <= 3)
{
bDebug = true;
curJob.setAllDebug((DebugLevel)debugLevel);
if (!BulkLoad::disableConsoleOutput())
cout << "\nDebug level is set to " << debugLevel << endl;
}
break;
}
case 'e': // -e: max allowed errors
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 0) || (lValue > INT_MAX))
{
startupError(std::string("Option -e is invalid or out of range."), true);
}
int maxErrors = lValue;
curJob.setMaxErrorCount(maxErrors);
break;
}
case 'f': // -f: import path
{
importPath = optarg;
std::string setAltErrMsg;
if (curJob.setAlternateImportDir(importPath, setAltErrMsg) != NO_ERROR)
startupError(setAltErrMsg, false);
break;
}
case 'h': // -h: help
{
printUsage();
break;
}
case 'i': // -i: log info to console
{
bLogInfo2ToConsole = true;
break;
}
case 'j': // -j: jobID
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 0) || (lValue > INT_MAX))
{
startupError(std::string("Option -j is invalid or out of range."), true);
}
sJobIdStr = optarg;
break;
}
case 'k': // -k: hidden option to keep (not delete)
{
// bulk rollback meta-data files
curJob.setKeepRbMetaFiles(true);
break;
}
case 'l': // -l: import load file(s)
{
bImportFileArg = true;
curJob.addToCmdLineImportFileList(std::string(optarg));
break;
}
case 'm': // -m: bulk load mode
{
bulkMode = (BulkModeType)atoi(optarg);
if ((bulkMode != BULK_MODE_REMOTE_SINGLE_SRC) && (bulkMode != BULK_MODE_REMOTE_MULTIPLE_SRC) &&
(bulkMode != BULK_MODE_LOCAL))
{
startupError(std::string("Invalid bulk mode; can be 1,2, or 3"), true);
}
break;
}
case 'n': // -n: treat "NULL" as null
{
int nullStringMode = atoi(optarg);
if ((nullStringMode != 0) && (nullStringMode != 1))
{
startupError(std::string("Invalid NULL option; value can be 0 or 1"), true);
}
if (nullStringMode)
curJob.setNullStringMode(true);
else
curJob.setNullStringMode(false);
break;
}
case 'p': // -p: Job XML path
{
sXMLJobDir = optarg;
break;
}
case 'r': // -r: num read threads
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 1) || (lValue > INT_MAX))
{
startupError(std::string("Option -r is invalid or out of range."), true);
}
int numOfReaders = lValue;
#if !defined(__LP64__) && !defined(_MSC_VER)
if (numOfReaders > 1)
{
cerr << "Note: resetting number of read threads to maximum" << endl;
numOfReaders = 1;
}
#endif
curJob.setNoOfReadThreads(numOfReaders);
if (!BulkLoad::disableConsoleOutput())
cout << "number of read threads : " << numOfReaders << endl;
break;
}
case 's': // -s: column delimiter
{
char delim;
if (!strcmp(optarg, "\\t"))
{
delim = '\t';
if (!BulkLoad::disableConsoleOutput())
cout << "Column delimiter : "
<< "\\t" << endl;
}
else
{
delim = optarg[0];
if (delim == '\t') // special case to print a <TAB>
{
if (!BulkLoad::disableConsoleOutput())
cout << "Column delimiter : '\\t'" << endl;
}
else
{
if (!BulkLoad::disableConsoleOutput())
cout << "Column delimiter : " << delim << endl;
}
}
curJob.setColDelimiter(delim);
break;
}
case 'u': // -u: import job UUID
{
jobUUID = optarg;
curJob.setJobUUID(jobUUID);
break;
}
case 'w': // -w: num parse threads
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 1) || (lValue > INT_MAX))
{
startupError(std::string("Option -w is invalid or out of range."), true);
}
int numOfParser = lValue;
#if !defined(__LP64__) && !defined(_MSC_VER)
if (numOfParser > 3)
{
cerr << "Note: resetting number of parse threads to maximum" << endl;
numOfParser = 3;
}
#endif
curJob.setNoOfParseThreads(numOfParser);
if (!BulkLoad::disableConsoleOutput())
cout << "number of parse threads : " << numOfParser << endl;
break;
}
case 'B': // -B: setvbuf read size
{
errno = 0;
long lValue = strtol(optarg, 0, 10);
if ((errno != 0) || (lValue < 1) || (lValue > INT_MAX))
{
startupError(std::string("Option -B is invalid or out of range."), true);
}
int vbufReadSize = lValue;
curJob.setVbufReadSize(vbufReadSize);
break;
}
case 'C': // -C: enclosed escape char
{
curJob.setEscapeChar(optarg[0]);
if (!BulkLoad::disableConsoleOutput())
cout << "Escape Character : " << optarg[0] << endl;
break;
}
case 'E': // -E: enclosed by char
{
curJob.setEnclosedByChar(optarg[0]);
if (!BulkLoad::disableConsoleOutput())
cout << "Enclosed by Character : " << optarg[0] << endl;
break;
}
case 'I': // -I: Binary import mode
{
ImportDataMode importMode = (ImportDataMode)atoi(optarg);
if ((importMode != IMPORT_DATA_BIN_ACCEPT_NULL) && (importMode != IMPORT_DATA_BIN_SAT_NULL))
{
startupError(std::string("Invalid binary import option; value can be 1"
"(accept NULL values) or 2(saturate NULL values)"),
true);
}
curJob.setImportDataMode(importMode);
break;
}
case 'L': // -L: Error log directory
{
curJob.setErrorDir(optarg);
break;
}
case 'P': // -P: Calling moduleid
{
// and PID
sModuleIDandPID = optarg;
break;
}
case 'R': // -R: distributed mode
{
// report file
rptFileName = optarg;
break;
}
case 'S': // -S: Char & VarChar data
{
// greater than col def
curJob.setTruncationAsError(true); // are reported as err
break;
}
case 'T':
{
std::string timeZone = optarg;
long offset;
if (dataconvert::timeZoneToOffset(timeZone.c_str(), timeZone.size(), &offset))
{
startupError(std::string("Value for option -T is invalid"), true);
}
curJob.setTimeZone(offset);
break;
}
case 'X': // Hidden extra options
{
if (!strcmp(optarg, "AllowMissingColumn"))
bValidateColumnList = false;
break;
}
case 'D': // disable table lock waiting timeout
{
curJob.disableTimeOut(true);
break;
}
case 'N': // silent the output to console
{
BulkLoad::disableConsoleOutput(true);
break;
}
case 'y':
{
curJob.setS3Key(optarg);
break;
}
case 'K':
{
curJob.setS3Secret(optarg);
break;
}
case 't':
{
curJob.setS3Bucket(optarg);
break;
}
case 'H':
{
curJob.setS3Host(optarg);
break;
}
case 'g':
{
curJob.setS3Region(optarg);
break;
}
case 'U':
{
curJob.setUsername(optarg);
break;
}
default:
{
ostringstream oss;
oss << "Unrecognized command line option (" << option << ")";
startupError(oss.str(), true);
}
}
}
curJob.setDefaultJobUUID();
// Inconsistent to specify -f STDIN with -l importFile
if ((bImportFileArg) && (importPath == "STDIN"))
{
startupError(std::string("-f STDIN is invalid with -l importFile."), true);
}
// If distributed mode, make sure report filename is specified and that we
// can create the file using the specified path.
if ((bulkMode == BULK_MODE_REMOTE_SINGLE_SRC) || (bulkMode == BULK_MODE_REMOTE_MULTIPLE_SRC))
{
if (rptFileName.empty())
{
startupError(std::string("Bulk modes 1 and 2 require -R rptFileName."), true);
}
else
{
std::ofstream rptFile(rptFileName.c_str());
if (rptFile.fail())
{
std::ostringstream oss;
oss << "Unable to open report file " << rptFileName;
startupError(oss.str(), false);
}
rptFile.close();
}
curJob.setBulkLoadMode(bulkMode, rptFileName);
}
// Get positional arguments, User can provide:
// 1. no positional parameters
// 2. Two positional parameters (schema and table names)
// 3. Three positional parameters (schema, table, and import file name)
if (optind < argc) // see if db schema name is given
{
xmlGenSchema = argv[optind]; // 1st pos parm
optind++;
if (optind < argc) // see if table name is given
{
// Validate invalid options in conjunction with 2-3 positional
// parameter mode, which means we are using temp Job XML file.
if (bImportFileArg)
{
startupError(std::string("-l importFile is invalid with positional parameters"), true);
}
if (!sXMLJobDir.empty())
{
startupError(std::string("-p path is invalid with positional parameters."), true);
}
if (importPath == "STDIN")
{
startupError(std::string("-f STDIN is invalid with positional parameters."), true);
}
xmlGenTable = argv[optind]; // 2nd pos parm
optind++;
if (optind < argc) // see if input file name is given
{
// 3rd pos parm
curJob.addToCmdLineImportFileList(std::string(argv[optind]));
// Default to CWD if loadfile name given w/o -f path
if (importPath.empty())
{
std::string setAltErrMsg;
if (curJob.setAlternateImportDir(std::string("."), setAltErrMsg) != NO_ERROR)
startupError(setAltErrMsg, false);
}
}
else
{
// Invalid to specify -f if no load file name given
if (!importPath.empty())
{
startupError(std::string("-f requires 3rd positional parameter (load file name)."), true);
}
// Default to STDIN if no import file name given
std::string setAltErrMsg;
if (curJob.setAlternateImportDir(std::string("STDIN"), setAltErrMsg) != NO_ERROR)
startupError(setAltErrMsg, false);
}
}
else
{
startupError(std::string("No table name specified with schema."), true);
}
}
else
{
// JobID is a required parameter with no positional parm mode,
// because we need the jobid to identify the input job xml file.
if (sJobIdStr.empty())
{
startupError(std::string("No JobID specified."), true);
}
}
}
//------------------------------------------------------------------------------
// Print the path of the input load file(s), and the name of the job xml file.
// Nothing is printed when console output has been disabled (-N).
// alternateImportDir - alternate directory for input data files; "." means
//                      the current working directory, empty means the
//                      Bulkload root directory is used
// jobDescFile        - name of the job description (xml) file
// S3Bucket           - S3 bucket name; when not empty the input is reported
//                      as coming from S3 instead of a directory
//------------------------------------------------------------------------------
void printInputSource(const std::string& alternateImportDir, const std::string& jobDescFile,
                      const std::string& S3Bucket)
{
  // One check covers every message below, so the S3 message honors the
  // silent flag just like the others.
  if (BulkLoad::disableConsoleOutput())
    return;

  if (!S3Bucket.empty())
  {
    cout << "Input file will be read from S3 Bucket : " << S3Bucket << ", file/object : " << jobDescFile
         << endl;
  }
  else if (!alternateImportDir.empty())
  {
    if (alternateImportDir == IMPORT_PATH_CWD)
    {
      char cwdBuf[4096];
      const char* cwd = ::getcwd(cwdBuf, sizeof(cwdBuf));

      // getcwd() returns NULL on failure, and streaming a NULL char* is
      // undefined behavior; fall back to the relative directory name.
      cout << "Input file(s) will be read from : " << (cwd ? cwd : alternateImportDir.c_str()) << endl;
    }
    else
    {
      cout << "Input file(s) will be read from : " << alternateImportDir << endl;
    }
  }
  else
  {
    cout << "Input file(s) will be read from Bulkload root directory : " << Config::getBulkRoot() << endl;
  }

  cout << "Job description file : " << jobDescFile << endl;
}
//------------------------------------------------------------------------------
// Get TableOID string for the specified db and table name.
//------------------------------------------------------------------------------
void getTableOID(const std::string& xmlGenSchema, const std::string& xmlGenTable, std::string& tableOIDStr)
{
OID tableOID = 0;
execplan::CalpontSystemCatalog::TableName tbl(xmlGenSchema, xmlGenTable);
try
{
boost::shared_ptr<CalpontSystemCatalog> cat =
CalpontSystemCatalog::makeCalpontSystemCatalog(BULK_SYSCAT_SESSION_ID);
cat->identity(CalpontSystemCatalog::EC);
tableOID = cat->tableRID(tbl).objnum;
}
catch (std::exception& ex)
{
std::ostringstream oss;
oss << "Unable to set default JobID; "
<< "Error getting OID for table " << tbl.schema << '.' << tbl.table << ": " << ex.what();
startupError(oss.str(), false);
}
catch (...)
{
std::ostringstream oss;
oss << "Unable to set default JobID; "
<< "Unknown error getting OID for table " << tbl.schema << '.' << tbl.table;
startupError(oss.str(), false);
}
std::ostringstream oss;
oss << tableOID;
tableOIDStr = oss.str();
}
//------------------------------------------------------------------------------
// Construct temporary Job XML file if user provided schema, job, and
// optional load filename.
// tempJobDir - directory used to store temporary job xml file
// sJobIdStr - job id (-j) specified by user
// xmlGenSchema - db schema name specified by user (1st positional parm)
// xmlGenTable - db table name specified by user (2nd positional parm)
// alternateImportDir - alternate directory for input data files
// sFileName(out)-filename path for temporary job xml file that is created
//------------------------------------------------------------------------------
void constructTempXmlFile(const std::string& tempJobDir, const std::string& sJobIdStr,
const std::string& xmlGenSchema, const std::string& xmlGenTable,
const std::string& alternateImportDir, const std::string& S3Bucket,
boost::filesystem::path& sFileName)
{
// Construct the job description file name
std::string xmlErrMsg;
int rc = 0;
std::string tableOIDStr;
getTableOID(xmlGenSchema, xmlGenTable, tableOIDStr);
rc = XMLJob::genJobXMLFileName(std::string(), tempJobDir, sJobIdStr,
true, // using temp job xml file
xmlGenSchema, xmlGenTable, sFileName, xmlErrMsg, tableOIDStr);
if (rc != NO_ERROR)
{
std::ostringstream oss;
oss << "cpimport.bin error creating temporary Job XML file name: " << xmlErrMsg;
startupError(oss.str(), false);
}
printInputSource(alternateImportDir, sFileName.string(), S3Bucket);
TempXMLGenData genData(sJobIdStr, xmlGenSchema, xmlGenTable);
XMLGenProc genProc(&genData,
false, // don't log to Jobxml_nnn.log
false); // generate XML file (not a syscat report)
try
{
genProc.startXMLFile();
execplan::CalpontSystemCatalog::TableName tbl(xmlGenSchema, xmlGenTable);
genProc.makeTableData(tbl);
if (!genProc.makeColumnData(tbl))
{
std::ostringstream oss;
oss << "No columns for " << xmlGenSchema << '.' << xmlGenTable;
startupError(oss.str(), false);
}
}
catch (runtime_error& ex)
{
std::ostringstream oss;
oss << "cpimport.bin runtime exception constructing temporary "
"Job XML file: "
<< ex.what();
startupError(oss.str(), false);
}
catch (exception& ex)
{
std::ostringstream oss;
oss << "cpimport.bin exception constructing temporary "
"Job XML file: "
<< ex.what();
startupError(oss.str(), false);
}
catch (...)
{
startupError(std::string("cpimport.bin "
"unknown exception constructing temporary Job XML file"),
false);
}
genProc.writeXMLFile(sFileName.string());
}
//------------------------------------------------------------------------------
// Make sure this process runs on a PM node.
// If the local module type reported by Config is anything other than "pm",
// the problem is reported through startupError(), which terminates the
// process.
//------------------------------------------------------------------------------
void verifyNode()
{
  const std::string moduleType = Config::getLocalModuleType();

  // Nothing to do when we are on a PM
  if (moduleType == "pm")
    return;

  startupError(std::string("Exiting, cpimport.bin can only be run on a PM node"), true);
}
//------------------------------------------------------------------------------
// Log initiate message
//------------------------------------------------------------------------------
void logInitiateMsg(const char* initText)
{
logging::Message::Args initMsgArgs;
initMsgArgs.add(initText);
SimpleSysLog::instance()->logMsg(initMsgArgs, logging::LOG_TYPE_INFO, logging::M0086);
}
//------------------------------------------------------------------------------
// Main entry point into the cpimport.bin program
//------------------------------------------------------------------------------
int main(int argc, char** argv)
{
#ifdef _MSC_VER
_setmaxstdio(2048);
#endif
setupSignalHandlers();
// Set locale language
const char* pLoc = setlocale(LC_ALL, "");
if (pLoc)
{
// Log one line
cout << "Locale = " << pLoc;
}
else
{
cout << "Failed to set locale ";
}
setlocale(LC_NUMERIC, "C");
// Initialize singleton instance of syslogging
if (argc > 0)
pgmName = argv[0];
logging::IDBErrorInfo::instance();
SimpleSysLog::instance()->setLoggingID(logging::LoggingID(SUBSYSTEM_ID_WE_BULK));
// Log job initiation unless user is asking for help
std::ostringstream ossArgList;
bool bHelpFlag = false;
for (int m = 1; m < argc; m++)
{
if (strcmp(argv[m], "-h") == 0)
{
bHelpFlag = true;
break;
}
if (!strcmp(argv[m], "\t")) // special case to print a <TAB>
ossArgList << "'\\t'" << ' ';
else
ossArgList << argv[m] << ' ';
}
if (!bHelpFlag)
{
logInitiateMsg(ossArgList.str().c_str());
}
BulkLoad curJob;
string sJobIdStr;
string sXMLJobDir;
string sModuleIDandPID;
bool bLogInfo2ToConsole = false;
bool bValidateColumnList = true;
bool bRollback = false;
bool bForce = false;
int rc = NO_ERROR;
std::string exceptionMsg;
TASK task; // track tasks being performed
// set this upfront
curJob.setErrorDir(string(MCSLOGDIR) + "/cpimport/");
try
{
//--------------------------------------------------------------------------
// Parse the command line arguments
//--------------------------------------------------------------------------
task = TASK_CMD_LINE_PARSING;
string xmlGenSchema;
string xmlGenTable;
parseCmdLineArgs(argc, argv, curJob, sJobIdStr, sXMLJobDir, sModuleIDandPID, bLogInfo2ToConsole,
xmlGenSchema, xmlGenTable, bValidateColumnList);
//--------------------------------------------------------------------------
// Save basename portion of program path from argv[0]
//--------------------------------------------------------------------------
string base;
string::size_type startBase = string(argv[0]).rfind('/');
if (startBase == string::npos)
base.assign(argv[0]);
else
base.assign(argv[0] + startBase + 1);
curJob.setProcessName(base);
if (bDebug)
logInitiateMsg("Command line arguments parsed");
//--------------------------------------------------------------------------
// Init singleton classes (other than syslogging that we already setup)
//--------------------------------------------------------------------------
task = TASK_INIT_CONFIG_CACHE;
// Initialize cache used to store configuration parms from Columnstore.xml
Config::initConfigCache();
// Setup signal handlers "again" because HDFS plugin seems to be
// changing our settings to ignore ctrl-C and sigterm
setupSignalHandlers();
// initialize singleton BRM Wrapper. Also init ExtentRows (in dbrm) from
// main thread, since ExtentMap::getExtentRows is not thread safe.
BRMWrapper::getInstance()->getInstance()->getExtentRows();
//--------------------------------------------------------------------------
// Validate running on valid node
//--------------------------------------------------------------------------
verifyNode();
//--------------------------------------------------------------------------
// Set scheduling priority for this cpimport.bin process
//--------------------------------------------------------------------------
#ifdef _MSC_VER
// FIXME
#else
setpriority(PRIO_PROCESS, 0, Config::getBulkProcessPriority());
#endif
if (bDebug)
logInitiateMsg("Config cache initialized");
//--------------------------------------------------------------------------
// Make sure DMLProc startup has completed before running a cpimport.bin job
//--------------------------------------------------------------------------
task = TASK_BRM_STATE_READY;
if (!BRMWrapper::getInstance()->isSystemReady())
{
startupError(std::string("System is not ready. Verify that ColumnStore is up and ready "
"before running cpimport."),
false);
}
if (bDebug)
logInitiateMsg("BRM state verified: state is Ready");
//--------------------------------------------------------------------------
// Verify that the state of BRM is read/write
//--------------------------------------------------------------------------
task = TASK_BRM_STATE_READ_WRITE;
int brmReadWriteStatus = BRMWrapper::getInstance()->isReadWrite();
if (brmReadWriteStatus != NO_ERROR)
{
WErrorCodes ec;
std::ostringstream oss;
oss << ec.errorString(brmReadWriteStatus) << " cpimport.bin is terminating.";
startupError(oss.str(), false);
}
if (bDebug)
logInitiateMsg("BRM state is Read/Write");
//--------------------------------------------------------------------------
// Make sure we're not about to shutdown
//--------------------------------------------------------------------------
task = TASK_SHUTDOWN_PENDING;
int brmShutdownPending = BRMWrapper::getInstance()->isShutdownPending(bRollback, bForce);
if (brmShutdownPending != NO_ERROR)
{
WErrorCodes ec;
std::ostringstream oss;
oss << ec.errorString(brmShutdownPending) << " cpimport.bin is terminating.";
startupError(oss.str(), false);
}
if (bDebug)
logInitiateMsg("Verified no shutdown operation is pending");
//--------------------------------------------------------------------------
// Make sure we're not write suspended
//--------------------------------------------------------------------------
task = TASK_SUSPEND_PENDING;
int brmSuspendPending = BRMWrapper::getInstance()->isSuspendPending();
if (brmSuspendPending != NO_ERROR)
{
WErrorCodes ec;
std::ostringstream oss;
oss << ec.errorString(brmSuspendPending) << " cpimport.bin is terminating.";
startupError(oss.str(), false);
}
if (bDebug)
logInitiateMsg("Verified no suspend operation is pending");
//--------------------------------------------------------------------------
// Set some flags
//--------------------------------------------------------------------------
task = TASK_ESTABLISH_JOBFILE;
BRMWrapper::setUseVb(false);
Cache::setUseCache(false);
//--------------------------------------------------------------------------
// Construct temporary Job XML file if user provided schema, job, and
// optional load filename.
//--------------------------------------------------------------------------
boost::filesystem::path sFileName;
bool bUseTempJobFile = false;
if (!BulkLoad::disableConsoleOutput())
cout << std::endl; // print blank line before we start
// Start tracking time to create/load jobfile;
// The elapsed time for this step is logged at the end of loadJobInfo()
curJob.startTimer();
if (!xmlGenSchema.empty()) // create temporary job file name
{
// If JobID is not provided, then default to the table OID
if (sJobIdStr.empty())
{
std::string tableOIDStr;
getTableOID(xmlGenSchema, xmlGenTable, tableOIDStr);
if (!(BulkLoad::disableConsoleOutput()))
cout << "Using table OID " << tableOIDStr << " as the default JOB ID" << std::endl;
sJobIdStr = tableOIDStr;
}
// No need to validate column list in job XML file for user errors,
// if cpimport.bin just generated the job XML file on-the-fly.
bValidateColumnList = false;
bUseTempJobFile = true;
constructTempXmlFile(curJob.getTempJobDir(), sJobIdStr, xmlGenSchema, xmlGenTable,
curJob.getAlternateImportDir(), curJob.getS3Bucket(), sFileName);
}
else // create user's persistent job file name
{
// Construct the job description file name
std::string xmlErrMsg;
std::string tableOIdStr("");
rc = XMLJob::genJobXMLFileName(sXMLJobDir, curJob.getJobDir(), sJobIdStr, bUseTempJobFile,
std::string(), std::string(), sFileName, xmlErrMsg, tableOIdStr);
if (rc != NO_ERROR)
{
std::ostringstream oss;
oss << "cpimport.bin error creating Job XML file name: " << xmlErrMsg;
startupError(oss.str(), false);
}
printInputSource(curJob.getAlternateImportDir(), sFileName.string(), curJob.getS3Bucket());
}
if (bDebug)
logInitiateMsg("Job xml file is established");
//-------------------------------------------------------------------------
// Bug 5415 Add HDFS MemBuffer vs. FileBuffer decision logic.
// MemoryCheckPercent controls what percentage of total memory may be
// consumed by all processes before we switch from HdfsRdwrMemBuffer to
// HdfsRdwrFileBuffer. This is only used in HDFS installations.
//-------------------------------------------------------------------------
config::Config* cf = config::Config::makeConfig();
int checkPct = 95;
string strCheckPct = cf->getConfig("SystemConfig", "MemoryCheckPercent");
if (strCheckPct.length() != 0)
checkPct = cf->uFromText(strCheckPct);
//--------------------------------------------------------------------------
// If we're HDFS, start the monitor thread.
// Otherwise, we don't need it, so don't waste the resources.
//--------------------------------------------------------------------------
if (idbdatafile::IDBPolicy::useHdfs())
{
new boost::thread(utils::MonitorProcMem(0, checkPct, SUBSYSTEM_ID_WE_BULK));
}
//--------------------------------------------------------------------------
// This is the real business
//--------------------------------------------------------------------------
task = TASK_LOAD_JOBFILE;
rc = curJob.loadJobInfo(sFileName.string(), bUseTempJobFile, argc, argv, bLogInfo2ToConsole,
bValidateColumnList);
if (rc != NO_ERROR)
{
WErrorCodes ec;
std::ostringstream oss;
oss << "Error in loading job information; " << ec.errorString(rc) << "; cpimport.bin is terminating.";
startupError(oss.str(), false);
}
if (bDebug)
logInitiateMsg("Job xml file is loaded");
task = TASK_PROCESS_DATA;
// Log start of job to INFO log
logging::Message::Args startMsgArgs;
startMsgArgs.add(sJobIdStr);
startMsgArgs.add(curJob.getSchema());
SimpleSysLog::instance()->logMsg(startMsgArgs, logging::LOG_TYPE_INFO, logging::M0081);
curJob.printJob();
rc = curJob.processJob();
if (rc != NO_ERROR)
{
if (!BulkLoad::disableConsoleOutput())
cerr << endl << "Error in loading job data" << endl;
}
}
catch (std::exception& ex)
{
std::ostringstream oss;
oss << "Uncaught exception caught in cpimport.bin main() while " << taskLabels[task] << "; " << ex.what();
exceptionMsg = oss.str();
if (task != TASK_PROCESS_DATA)
{
startupError(exceptionMsg, false);
}
rc = ERR_UNKNOWN;
}
//--------------------------------------------------------------------------
// Log end of job to INFO log
//--------------------------------------------------------------------------
logging::Message::Args endMsgArgs;
endMsgArgs.add(sJobIdStr);
if (rc != NO_ERROR)
{
std::string failMsg("FAILED");
if (exceptionMsg.length() > 0)
{
failMsg += "; ";
failMsg += exceptionMsg;
}
endMsgArgs.add(failMsg.c_str());
}
else
{
endMsgArgs.add("SUCCESS");
}
SimpleSysLog::instance()->logMsg(endMsgArgs, logging::LOG_TYPE_INFO, logging::M0082);
if (rc != NO_ERROR)
return (EXIT_FAILURE);
else
return (EXIT_SUCCESS);
}