1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-29 08:21:15 +03:00

MCOL-5505 add parquet support for cpimport and add mcs_parquet_ddl and mcs_parquet_gen tools

This commit is contained in:
HanpyBin
2023-08-20 16:01:58 +08:00
committed by Leonid Fedorov
parent 94a680ea60
commit fe597ec78c
25 changed files with 4677 additions and 251 deletions

View File

@ -29,6 +29,7 @@
#include <stdlib.h>
#include <string.h>
#include <type_traits>
#include <chrono>
#include "mcs_decimal.h"
using namespace std;
#include <boost/algorithm/string/case_conv.hpp>
@ -1572,6 +1573,44 @@ boost::any DataConvert::StringToTimestamp(const datatypes::ConvertFromStringPara
return value;
}
//------------------------------------------------------------------------------
// Convert date32 parquet data to binary date. Used by BulkLoad.
//
// dayVal - number of whole days relative to the Unix epoch (1970-01-01).
// status - set to -1 when the value cannot be converted; not modified on
//          success.
// Returns the packed Date value, or 0 when the conversion fails.
//------------------------------------------------------------------------------
int32_t DataConvert::ConvertArrowColumnDate(int32_t dayVal, int& status)
{
  int32_t value = 0;

  // Build the time_t directly from the day count. Going through
  // std::chrono::system_clock::time_point would rescale the value to the
  // clock's tick (nanoseconds on libstdc++), which overflows 64 bits for
  // dates roughly beyond the years 1677..2262.
  const std::time_t ttime = static_cast<std::time_t>(static_cast<int64_t>(dayVal) * 86400);

  // A date32 value carries no time zone, so it must be broken down in UTC.
  // Using local time would shift the result by one day on hosts whose zone
  // is west of UTC.
  // NOTE: std::gmtime returns a pointer to shared static storage.
  const std::tm* timeInfo = std::gmtime(&ttime);

  if (timeInfo == nullptr)
  {
    status = -1;
    return value;
  }

  const int inYear = timeInfo->tm_year + 1900;
  const int inMonth = timeInfo->tm_mon + 1;
  const int inDay = timeInfo->tm_mday;

  if (!isDateValid(inDay, inMonth, inYear))
  {
    status = -1;
    return value;
  }

  Date aDay;
  aDay.year = inYear;
  aDay.month = inMonth;
  aDay.day = inDay;
  memcpy(&value, &aDay, 4);
  return value;
}
//------------------------------------------------------------------------------
// Convert date string to binary date. Used by BulkLoad.
//------------------------------------------------------------------------------
@ -1658,6 +1697,100 @@ bool DataConvert::isColumnDateValid(int32_t date)
return (isDateValid(d.day, d.month, d.year));
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data, expressed in milliseconds since the Unix
// epoch, to binary datetime. Used by BulkLoad.
//
// timeVal - milliseconds relative to the Unix epoch (may be negative).
// status  - set to -1 when the value cannot be converted; not modified on
//           success.
// Returns the packed DateTime value, or 0 when the conversion fails.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetime(int64_t timeVal, int& status)
{
  int64_t value = 0;

  // Floor-divide so that pre-epoch values keep a non-negative fraction and
  // the whole-second part moves to the earlier second (C++ '/' and '%'
  // truncate toward zero).
  int64_t wholeSeconds = timeVal / 1000;
  int64_t fracMillis = timeVal % 1000;

  if (fracMillis < 0)
  {
    fracMillis += 1000;
    --wholeSeconds;
  }

  // time_t is built directly from the second count; routing the value
  // through system_clock::time_point would rescale it to the clock's tick
  // (nanoseconds on libstdc++) and overflow for dates far from 1970.
  const std::time_t ttime = static_cast<std::time_t>(wholeSeconds);
  // NOTE: std::gmtime returns a pointer to shared static storage.
  const std::tm* timeInfo = std::gmtime(&ttime);

  if (timeInfo == nullptr)
  {
    status = -1;
    return value;
  }

  const int inYear = timeInfo->tm_year + 1900;
  const int inMonth = timeInfo->tm_mon + 1;
  const int inDay = timeInfo->tm_mday;
  const int inHour = timeInfo->tm_hour;
  const int inMinute = timeInfo->tm_min;
  const int inSecond = timeInfo->tm_sec;
  // The msecond field is filled with microseconds by
  // convertArrowColumnDatetimeUs, so the millisecond fraction is scaled to
  // the same unit here.
  const int inMicrosecond = static_cast<int>(fracMillis) * 1000;

  if (!(isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond)))
  {
    status = -1;
    return value;
  }

  DateTime aDatetime;
  aDatetime.year = inYear;
  aDatetime.month = inMonth;
  aDatetime.day = inDay;
  aDatetime.hour = inHour;
  aDatetime.minute = inMinute;
  aDatetime.second = inSecond;
  aDatetime.msecond = inMicrosecond;
  memcpy(&value, &aDatetime, 8);
  return value;
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data, expressed in microseconds since the Unix
// epoch, to binary datetime. Used by BulkLoad.
//
// timeVal - microseconds relative to the Unix epoch (may be negative).
// status  - set to -1 when the value cannot be converted; not modified on
//           success.
// Returns the packed DateTime value, or 0 when the conversion fails.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetimeUs(int64_t timeVal, int& status)
{
  int64_t value = 0;

  // Floor-divide so that pre-epoch values keep a non-negative fraction and
  // the whole-second part moves to the earlier second (C++ '/' and '%'
  // truncate toward zero).
  int64_t wholeSeconds = timeVal / 1000000;
  int64_t fracMicros = timeVal % 1000000;

  if (fracMicros < 0)
  {
    fracMicros += 1000000;
    --wholeSeconds;
  }

  // time_t is built directly from the second count; routing the value
  // through system_clock::time_point would rescale it to the clock's tick
  // (nanoseconds on libstdc++) and overflow for dates far from 1970.
  const std::time_t ttime = static_cast<std::time_t>(wholeSeconds);
  // NOTE: std::gmtime returns a pointer to shared static storage.
  const std::tm* timeInfo = std::gmtime(&ttime);

  if (timeInfo == nullptr)
  {
    status = -1;
    return value;
  }

  const int inYear = timeInfo->tm_year + 1900;
  const int inMonth = timeInfo->tm_mon + 1;
  const int inDay = timeInfo->tm_mday;
  const int inHour = timeInfo->tm_hour;
  const int inMinute = timeInfo->tm_min;
  const int inSecond = timeInfo->tm_sec;
  const int inMicrosecond = static_cast<int>(fracMicros);

  if (!(isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond)))
  {
    status = -1;
    return value;
  }

  DateTime aDatetime;
  aDatetime.year = inYear;
  aDatetime.month = inMonth;
  aDatetime.day = inDay;
  aDatetime.hour = inHour;
  aDatetime.minute = inMinute;
  aDatetime.second = inSecond;
  aDatetime.msecond = inMicrosecond;
  memcpy(&value, &aDatetime, 8);
  return value;
}
//------------------------------------------------------------------------------
// Convert date/time string to binary date/time. Used by BulkLoad.
//------------------------------------------------------------------------------
@ -1798,6 +1931,127 @@ int64_t DataConvert::convertColumnDatetime(const char* dataOrg, CalpontDateTimeF
return value;
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data, expressed in milliseconds since the Unix
// epoch, to binary timestamp. Used by BulkLoad.
//
// timeVal - milliseconds relative to the Unix epoch.
// status  - set to -1 when the value cannot be converted; not modified on
//           success.
// Returns the packed TimeStamp value, or 0 when the conversion fails.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestamp(int64_t timeVal, int& status)
{
  int64_t value = 0;

  // Floor-divide so that a negative input keeps a non-negative fraction and
  // the whole-second part moves to the earlier second (C++ '/' and '%'
  // truncate toward zero).
  int64_t wholeSeconds = timeVal / 1000;
  int64_t fracMillis = timeVal % 1000;

  if (fracMillis < 0)
  {
    fracMillis += 1000;
    --wholeSeconds;
  }

  // time_t is built directly from the second count; routing the value
  // through system_clock::time_point would rescale it to the clock's tick
  // (nanoseconds on libstdc++) and overflow for dates far from 1970.
  const std::time_t ttime = static_cast<std::time_t>(wholeSeconds);
  // NOTE: std::gmtime returns a pointer to shared static storage.
  const std::tm* timeInfo = std::gmtime(&ttime);

  if (timeInfo == nullptr)
  {
    status = -1;
    return value;
  }

  const int inYear = timeInfo->tm_year + 1900;
  const int inMonth = timeInfo->tm_mon + 1;
  const int inDay = timeInfo->tm_mday;
  const int inHour = timeInfo->tm_hour;
  const int inMinute = timeInfo->tm_min;
  const int inSecond = timeInfo->tm_sec;
  // The fractional field is filled with microseconds by
  // convertArrowColumnTimestampUs, so the millisecond fraction is scaled to
  // the same unit here.
  const int inMicrosecond = static_cast<int>(fracMillis) * 1000;

  if (!(isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond)))
  {
    status = -1;
    return value;
  }

  MySQLTime m_time;
  m_time.year = inYear;
  m_time.month = inMonth;
  m_time.day = inDay;
  m_time.hour = inHour;
  m_time.minute = inMinute;
  m_time.second = inSecond;
  m_time.second_part = inMicrosecond;

  // NOTE(review): the second argument is presumably a zero UTC offset, since
  // the fields above were produced by std::gmtime - confirm against
  // mySQLTimeToGmtSec.
  bool isValid = true;
  int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);

  if (!isValid)
  {
    status = -1;
    return value;
  }

  TimeStamp timestamp;
  timestamp.second = seconds;
  timestamp.msecond = m_time.second_part;
  memcpy(&value, &timestamp, 8);
  return value;
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data, expressed in microseconds since the Unix
// epoch, to binary timestamp. Used by BulkLoad.
//
// timeVal - microseconds relative to the Unix epoch.
// status  - set to -1 when the value cannot be converted; not modified on
//           success.
// Returns the packed TimeStamp value, or 0 when the conversion fails.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestampUs(int64_t timeVal, int& status)
{
  int64_t value = 0;

  // Floor-divide so that a negative input keeps a non-negative fraction and
  // the whole-second part moves to the earlier second (C++ '/' and '%'
  // truncate toward zero).
  int64_t wholeSeconds = timeVal / 1000000;
  int64_t fracMicros = timeVal % 1000000;

  if (fracMicros < 0)
  {
    fracMicros += 1000000;
    --wholeSeconds;
  }

  // time_t is built directly from the second count; routing the value
  // through system_clock::time_point would rescale it to the clock's tick
  // (nanoseconds on libstdc++) and overflow for dates far from 1970.
  const std::time_t ttime = static_cast<std::time_t>(wholeSeconds);
  // NOTE: std::gmtime returns a pointer to shared static storage.
  const std::tm* timeInfo = std::gmtime(&ttime);

  if (timeInfo == nullptr)
  {
    status = -1;
    return value;
  }

  const int inYear = timeInfo->tm_year + 1900;
  const int inMonth = timeInfo->tm_mon + 1;
  const int inDay = timeInfo->tm_mday;
  const int inHour = timeInfo->tm_hour;
  const int inMinute = timeInfo->tm_min;
  const int inSecond = timeInfo->tm_sec;
  const int inMicrosecond = static_cast<int>(fracMicros);

  if (!(isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond)))
  {
    status = -1;
    return value;
  }

  MySQLTime m_time;
  m_time.year = inYear;
  m_time.month = inMonth;
  m_time.day = inDay;
  m_time.hour = inHour;
  m_time.minute = inMinute;
  m_time.second = inSecond;
  m_time.second_part = inMicrosecond;

  // NOTE(review): the second argument is presumably a zero UTC offset, since
  // the fields above were produced by std::gmtime - confirm against
  // mySQLTimeToGmtSec.
  bool isValid = true;
  int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);

  if (!isValid)
  {
    status = -1;
    return value;
  }

  TimeStamp timestamp;
  timestamp.second = seconds;
  timestamp.msecond = m_time.second_part;
  memcpy(&value, &timestamp, 8);
  return value;
}
//------------------------------------------------------------------------------
// Convert timestamp string to binary timestamp. Used by BulkLoad.
// Most of this code is taken from DataConvert::convertColumnDatetime
@ -1972,6 +2226,123 @@ int64_t DataConvert::convertColumnTimestamp(const char* dataOrg, CalpontDateTime
return value;
}
//------------------------------------------------------------------------------
// Convert time32 parquet data, expressed in milliseconds, to binary time.
// Used by BulkLoad.
//
// timeVal - time of day in milliseconds.
// status  - set to -1 when the value is rejected by isTimeValid; not modified
//           on success.
// Returns the packed Time value. On failure this is a saturated time when the
// hour is outside +/-838, and 0 otherwise.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime32(int32_t timeVal, int& status)
{
  int64_t value = 0;
  const bool isNeg = (timeVal < 0);

  // Split the magnitude so that minute/second/fraction are never negative;
  // the sign is carried by the hour and by is_neg. The magnitude is widened
  // to 64 bits so that negating INT32_MIN is well defined.
  int64_t remaining = isNeg ? -static_cast<int64_t>(timeVal) : static_cast<int64_t>(timeVal);

  const int64_t hours = remaining / 3600000;
  remaining %= 3600000;
  const int inMinute = static_cast<int>(remaining / 60000);
  remaining %= 60000;
  const int inSecond = static_cast<int>(remaining / 1000);
  // The fractional field holds microseconds (the saturation value below is
  // 999999), so the millisecond remainder is scaled by 1000.
  const int inMicrosecond = static_cast<int>(remaining % 1000) * 1000;
  const int inHour = static_cast<int>(isNeg ? -hours : hours);

  if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    Time atime;
    atime.hour = inHour;
    atime.minute = inMinute;
    atime.second = inSecond;
    atime.msecond = inMicrosecond;
    atime.is_neg = isNeg;
    memcpy(&value, &atime, 8);
    return value;
  }

  // Emulate MariaDB's time saturation
  if (inHour > 838)
  {
    Time atime;
    atime.hour = 838;
    atime.minute = 59;
    atime.second = 59;
    atime.msecond = 999999;
    atime.is_neg = false;
    memcpy(&value, &atime, 8);
  }
  else if (inHour < -838)
  {
    Time atime;
    atime.hour = -838;
    atime.minute = 59;
    atime.second = 59;
    atime.msecond = 999999;
    atime.is_neg = false;
    memcpy(&value, &atime, 8);
  }

  // If neither of the above match then we return a 0 time
  status = -1;
  return value;
}
//------------------------------------------------------------------------------
// Convert time64 parquet data, expressed in microseconds, to binary time.
// Used by BulkLoad.
//
// timeVal - time of day in microseconds. Nanosecond inputs are not rescaled
//           by this function.
// status  - set to -1 when the value is rejected by isTimeValid; not modified
//           on success.
// Returns the packed Time value. On failure this is a saturated time when the
// hour is outside +/-838, and 0 otherwise.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime64(int64_t timeVal, int& status)
{
  int64_t value = 0;
  const bool isNeg = (timeVal < 0);

  // Split the magnitude so that minute/second/fraction are never negative;
  // the sign is carried by the hour and by is_neg. Unsigned negation is well
  // defined even for INT64_MIN.
  uint64_t remaining = static_cast<uint64_t>(timeVal);

  if (isNeg)
    remaining = 0 - remaining;

  uint64_t hours = remaining / 3600000000ULL;
  remaining %= 3600000000ULL;
  const int inMinute = static_cast<int>(remaining / 60000000ULL);
  remaining %= 60000000ULL;
  const int inSecond = static_cast<int>(remaining / 1000000ULL);
  const int inMicrosecond = static_cast<int>(remaining % 1000000ULL);

  // The hour count of a 64-bit microsecond value can exceed INT_MAX. Every
  // hour beyond 838 takes the saturation path below, so clamp just past that
  // limit before narrowing to int instead of letting the conversion wrap.
  if (hours > 839)
    hours = 839;

  const int inHour = isNeg ? -static_cast<int>(hours) : static_cast<int>(hours);

  if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    Time atime;
    atime.hour = inHour;
    atime.minute = inMinute;
    atime.second = inSecond;
    atime.msecond = inMicrosecond;
    atime.is_neg = isNeg;
    memcpy(&value, &atime, 8);
    return value;
  }

  // Emulate MariaDB's time saturation
  if (inHour > 838)
  {
    Time atime;
    atime.hour = 838;
    atime.minute = 59;
    atime.second = 59;
    atime.msecond = 999999;
    atime.is_neg = false;
    memcpy(&value, &atime, 8);
  }
  else if (inHour < -838)
  {
    Time atime;
    atime.hour = -838;
    atime.minute = 59;
    atime.second = 59;
    atime.msecond = 999999;
    atime.is_neg = false;
    memcpy(&value, &atime, 8);
  }

  // If neither of the above match then we return a 0 time
  status = -1;
  return value;
}
//------------------------------------------------------------------------------
// Convert time string to binary time. Used by BulkLoad.
// Most of this is taken from str_to_time in sql-common/my_time.c

View File

@ -1170,6 +1170,14 @@ class DataConvert
EXPORT static std::string timeToString1(long long timevalue);
static inline void timeToString1(long long timevalue, char* buf, unsigned int buflen);
/**
* @brief convert parquet date32 data to its native format. This function is for bulkload to use.
*
* @param dayVal the input data representing a number of days relative to the unix epoch
* @param status set to -1 on failure; not modified on success
* @return the packed date value, or 0 on failure
*/
EXPORT static int32_t ConvertArrowColumnDate(int32_t dayVal, int& status);
/**
* @brief convert a date column data, represnted as a string, to it's native
* format. This function is for bulkload to use.
@ -1188,6 +1196,22 @@ class DataConvert
*/
EXPORT static bool isColumnDateValid(int32_t date);
/**
* @brief convert parquet datetime data (millisecond) to its native format. This function is for bulkload
* to use.
*
* @param timeVal the input data representing milliseconds from unix epoch
* @param status set to -1 on failure; not modified on success
* @return the packed datetime value, or 0 on failure
*/
EXPORT static int64_t convertArrowColumnDatetime(int64_t timeVal, int& status);
/**
* @brief convert parquet datetime data (microsecond) to its native format. This function is for bulkload
* to use.
*
* @param timeVal the input data representing microseconds from unix epoch
* @param status set to -1 on failure; not modified on success
* @return the packed datetime value, or 0 on failure
*/
EXPORT static int64_t convertArrowColumnDatetimeUs(int64_t timeVal, int& status);
/**
* @brief convert a datetime column data, represented as a string,
* to it's native format. This function is for bulkload to use.
@ -1201,6 +1225,22 @@ class DataConvert
EXPORT static int64_t convertColumnDatetime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
int& status, unsigned int dataOrgLen);
/**
* @brief convert parquet timestamp data (millisecond) to its native format. This function is for bulkload
* to use.
*
* @param timeVal the input data representing milliseconds from unix epoch
* @param status set to -1 on failure; not modified on success
* @return the packed timestamp value, or 0 on failure
*/
EXPORT static int64_t convertArrowColumnTimestamp(int64_t timeVal, int& status);
/**
* @brief convert parquet timestamp data (microsecond) to its native format. This function is for bulkload
* to use.
*
* @param timeVal the input data representing microseconds from unix epoch
* @param status set to -1 on failure; not modified on success
* @return the packed timestamp value, or 0 on failure
*/
EXPORT static int64_t convertArrowColumnTimestampUs(int64_t timeVal, int& status);
/**
* @brief convert a timestamp column data, represented as a string,
* to it's native format. This function is for bulkload to use.
@ -1228,6 +1268,22 @@ class DataConvert
EXPORT static int64_t convertColumnTime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
int& status, unsigned int dataOrgLen);
/**
* @brief convert parquet time32 data to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing milliseconds since midnight
* @param status set to -1 on failure; not modified on success
* @return the packed time value; on failure a saturated time when the hour is outside +/-838, else 0
*/
EXPORT static int64_t convertArrowColumnTime32(int32_t timeVal, int& status);
/**
* @brief convert parquet time64 data to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing microseconds since midnight.
*        NOTE(review): the implementation divides by 3600000000 per hour, i.e. it treats the value as
*        microseconds; nanosecond input is not rescaled by this function - confirm the caller converts it.
* @param status set to -1 on failure; not modified on success
* @return the packed time value; on failure a saturated time when the hour is outside +/-838, else 0
*/
EXPORT static int64_t convertArrowColumnTime64(int64_t timeVal, int& status);
/**
* @brief Is specified datetime valid; used by binary bulk load
*/