1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-05 16:15:50 +03:00

chore(arrow): bump Apache Arrow version and fix test load generator memory usage (#3149)

* bump apache arrow version and fix test load generator for memory usage
* limit Arrow runtime SIMD level to SSE4.2
This commit is contained in:
Leonid Fedorov
2024-03-22 19:47:55 +03:00
committed by GitHub
parent 49757ba8d5
commit 9fe6efe84a
2 changed files with 362 additions and 350 deletions

View File

@@ -31,14 +31,15 @@ set(ARROW_CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}"
"-DARROW_DATASET=ON" "-DARROW_DATASET=ON"
"-DARROW_PARQUET=ON" "-DARROW_PARQUET=ON"
"-DARROW_FILESYSTEM=ON" "-DARROW_FILESYSTEM=ON"
"-DARROW_RUNTIME_SIMD_LEVEL=SSE4_2"
"-DThrift_ROOT=${CMAKE_CURRENT_BINARY_DIR}/external/thrift" "-DThrift_ROOT=${CMAKE_CURRENT_BINARY_DIR}/external/thrift"
) )
set(ARROW_INCLUDE_DIR "${ARROW_PREFIX}/include") set(ARROW_INCLUDE_DIR "${ARROW_PREFIX}/include")
set(ARROW_BUILD_BYPRODUCTS "${ARROW_STATIC_LIB}" "${PARQUET_STATIC_LIB}") set(ARROW_BUILD_BYPRODUCTS "${ARROW_STATIC_LIB}" "${PARQUET_STATIC_LIB}")
externalproject_add(external_arrow externalproject_add(external_arrow
URL https://github.com/apache/arrow/archive/refs/tags/go/v13.0.0.tar.gz URL https://github.com/apache/arrow/archive/refs/tags/apache-arrow-15.0.2.tar.gz
URL_HASH SHA256=ea4a79a4103379573ecbcf19229437a4ba547c0146a7f3c3be0a7e0b3de5de6c URL_HASH SHA256=4735b349845bff1fe95ed11abbfed204eb092cabc37523aa13a80cb830fe5b5e
SOURCE_SUBDIR cpp SOURCE_SUBDIR cpp
BINARY_DIR "${ARROW_BINARY_DIR}" BINARY_DIR "${ARROW_BINARY_DIR}"
CMAKE_ARGS "${ARROW_CMAKE_ARGS}" CMAKE_ARGS "${ARROW_CMAKE_ARGS}"

View File

@@ -5,8 +5,14 @@
#include <arrow/io/api.h> #include <arrow/io/api.h>
#include <parquet/exception.h> #include <parquet/exception.h>
#include <parquet/arrow/reader.h> #include <parquet/arrow/reader.h>
#include <parquet/arrow/schema.h>
#include <arrow/type.h>
#include <parquet/arrow/writer.h> #include <parquet/arrow/writer.h>
#include <iostream>
#include <array>
#include <type_traits>
static void usage(const std::string& pname) static void usage(const std::string& pname)
{ {
std::cout << "usage: " << pname << " [-dalbscih]" << std::endl; std::cout << "usage: " << pname << " [-dalbscih]" << std::endl;
@@ -23,7 +29,7 @@ static void usage(const std::string& pname)
/** /**
* generate one parquet file with INT32 data type * generate one parquet file with INT32 data type
*/ */
void generateIntTable(std::string fileDir) void generateIntTable(std::string fileDir)
{ {
// generate data // generate data
@@ -36,7 +42,7 @@ void generateIntTable(std::string fileDir)
validity[3] = 0; validity[3] = 0;
std::vector<int32_t> values; std::vector<int32_t> values;
for (int32_t i = 0; i < reserve_num-1; i++) for (int32_t i = 0; i < reserve_num - 1; i++)
values.push_back(i); values.push_back(i);
values.push_back(static_cast<int32_t>(2147483648)); values.push_back(static_cast<int32_t>(2147483648));
@@ -49,15 +55,14 @@ void generateIntTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/int32.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/int32.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with INT64 data type * generate one parquet file with INT64 data type
*/ */
void generateInt64Table(std::string fileDir) void generateInt64Table(std::string fileDir)
{ {
// generate data // generate data
@@ -69,29 +74,27 @@ void generateInt64Table(std::string fileDir)
validity[2] = 0; validity[2] = 0;
validity[3] = 0; validity[3] = 0;
std::vector<int64_t> values; std::vector<int64_t> values;
for (int64_t i = 0; i < reserve_num-1; i++) for (int64_t i = 0; i < reserve_num - 1; i++)
values.push_back(i); values.push_back(i);
values.push_back(2147483648); values.push_back(2147483648);
PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity)); PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity));
std::shared_ptr<arrow::Array> array; std::shared_ptr<arrow::Array> array;
PARQUET_THROW_NOT_OK(builder.Finish(&array)); PARQUET_THROW_NOT_OK(builder.Finish(&array));
std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::int64())}); std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::int64())});
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/int64.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/int64.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with FLOAT data type * generate one parquet file with FLOAT data type
*/ */
void generateFloatTable(std::string fileDir) void generateFloatTable(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -101,7 +104,7 @@ void generateFloatTable(std::string fileDir)
validity[2] = 0; validity[2] = 0;
std::vector<float> values; std::vector<float> values;
for (int i = 0; i < reserve_num; i++) for (int i = 0; i < reserve_num; i++)
values.push_back(i+1.5); values.push_back(i + 1.5);
PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity)); PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity));
std::shared_ptr<arrow::Array> array; std::shared_ptr<arrow::Array> array;
PARQUET_THROW_NOT_OK(builder.Finish(&array)); PARQUET_THROW_NOT_OK(builder.Finish(&array));
@@ -112,15 +115,14 @@ void generateFloatTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/float.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/float.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with DOUBLE data type * generate one parquet file with DOUBLE data type
*/ */
void generateDoubleTable(std::string fileDir) void generateDoubleTable(std::string fileDir)
{ {
// -----------------Float64----------------------- // -----------------Float64-----------------------
@@ -131,7 +133,7 @@ void generateDoubleTable(std::string fileDir)
dvalidity[3] = 0; dvalidity[3] = 0;
std::vector<double> dvalues; std::vector<double> dvalues;
for (int i = 0; i < reserve_num; i++) for (int i = 0; i < reserve_num; i++)
dvalues.push_back(i+2.5); dvalues.push_back(i + 2.5);
PARQUET_THROW_NOT_OK(doublebuilder.AppendValues(dvalues, dvalidity)); PARQUET_THROW_NOT_OK(doublebuilder.AppendValues(dvalues, dvalidity));
std::shared_ptr<arrow::Array> doublearray; std::shared_ptr<arrow::Array> doublearray;
PARQUET_THROW_NOT_OK(doublebuilder.Finish(&doublearray)); PARQUET_THROW_NOT_OK(doublebuilder.Finish(&doublearray));
@@ -142,15 +144,14 @@ void generateDoubleTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/double.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/double.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with TIME data type * generate one parquet file with TIME data type
*/ */
void generateTimeTable(std::string fileDir) void generateTimeTable(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -160,26 +161,26 @@ void generateTimeTable(std::string fileDir)
std::vector<bool> time32validity(reserve_num, true); std::vector<bool> time32validity(reserve_num, true);
std::vector<int32_t> time32values; std::vector<int32_t> time32values;
for (int32_t i = 0; i < reserve_num; i++) for (int32_t i = 0; i < reserve_num; i++)
time32values.push_back(i*3605000); time32values.push_back(i * 3605000);
PARQUET_THROW_NOT_OK(time32builder.AppendValues(time32values, time32validity)); PARQUET_THROW_NOT_OK(time32builder.AppendValues(time32values, time32validity));
std::shared_ptr<arrow::Array> time32array; std::shared_ptr<arrow::Array> time32array;
PARQUET_THROW_NOT_OK(time32builder.Finish(&time32array)); PARQUET_THROW_NOT_OK(time32builder.Finish(&time32array));
std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::time32(arrow::TimeUnit::MILLI))}); std::shared_ptr<arrow::Schema> schema =
arrow::schema({arrow::field("col1", arrow::time32(arrow::TimeUnit::MILLI))});
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {time32array}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {time32array});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/time.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/time.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with TIME64(microsecond) data type * generate one parquet file with TIME64(microsecond) data type
*/ */
void generateTime64Table(std::string fileDir) void generateTime64Table(std::string fileDir)
{ {
int64_t reserve_num = 30; int64_t reserve_num = 30;
@@ -189,26 +190,26 @@ void generateTime64Table(std::string fileDir)
std::vector<bool> time64validity(reserve_num, true); std::vector<bool> time64validity(reserve_num, true);
std::vector<int64_t> time64values; std::vector<int64_t> time64values;
for (int64_t i = 0; i < reserve_num; i++) for (int64_t i = 0; i < reserve_num; i++)
time64values.push_back(i*3605001); time64values.push_back(i * 3605001);
PARQUET_THROW_NOT_OK(time64builder.AppendValues(time64values, time64validity)); PARQUET_THROW_NOT_OK(time64builder.AppendValues(time64values, time64validity));
std::shared_ptr<arrow::Array> time64array; std::shared_ptr<arrow::Array> time64array;
PARQUET_THROW_NOT_OK(time64builder.Finish(&time64array)); PARQUET_THROW_NOT_OK(time64builder.Finish(&time64array));
std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::time64(arrow::TimeUnit::MICRO))}); std::shared_ptr<arrow::Schema> schema =
arrow::schema({arrow::field("col1", arrow::time64(arrow::TimeUnit::MICRO))});
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {time64array}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {time64array});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/time64.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/time64.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with STRING data type * generate one parquet file with STRING data type
*/ */
void generateStringTable(std::string fileDir) void generateStringTable(std::string fileDir)
{ {
const int reserve_num = 30; const int reserve_num = 30;
@@ -218,7 +219,7 @@ void generateStringTable(std::string fileDir)
uint8_t validity1[reserve_num]; uint8_t validity1[reserve_num];
std::vector<std::string> values1; std::vector<std::string> values1;
for (int64_t i = reserve_num-1; i >= 0; i--) for (int64_t i = reserve_num - 1; i >= 0; i--)
{ {
// values1.push_back(std::string("hhhh")); // values1.push_back(std::string("hhhh"));
validity1[i] = 1; validity1[i] = 1;
@@ -268,15 +269,14 @@ void generateStringTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/string.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/string.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with TIMESTAMP data type(millisecond) * generate one parquet file with TIMESTAMP data type(millisecond)
*/ */
void generateTimestampTable(std::string fileDir) void generateTimestampTable(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -291,21 +291,21 @@ void generateTimestampTable(std::string fileDir)
std::shared_ptr<arrow::Array> tsarray; std::shared_ptr<arrow::Array> tsarray;
PARQUET_THROW_NOT_OK(tsbuilder.Finish(&tsarray)); PARQUET_THROW_NOT_OK(tsbuilder.Finish(&tsarray));
std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::timestamp(arrow::TimeUnit::MILLI))}); std::shared_ptr<arrow::Schema> schema =
arrow::schema({arrow::field("col1", arrow::timestamp(arrow::TimeUnit::MILLI))});
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {tsarray}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {tsarray});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/ts.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/ts.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with TIMESTAMP data type(microsecond) * generate one parquet file with TIMESTAMP data type(microsecond)
*/ */
void generateTimestampUsTable(std::string fileDir) void generateTimestampUsTable(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -320,21 +320,21 @@ void generateTimestampUsTable(std::string fileDir)
std::shared_ptr<arrow::Array> tsarray; std::shared_ptr<arrow::Array> tsarray;
PARQUET_THROW_NOT_OK(tsbuilder.Finish(&tsarray)); PARQUET_THROW_NOT_OK(tsbuilder.Finish(&tsarray));
std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::timestamp(arrow::TimeUnit::MICRO))}); std::shared_ptr<arrow::Schema> schema =
arrow::schema({arrow::field("col1", arrow::timestamp(arrow::TimeUnit::MICRO))});
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {tsarray}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {tsarray});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/ts.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/ts.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with DATE data type * generate one parquet file with DATE data type
*/ */
void generateDateTable(std::string fileDir) void generateDateTable(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -355,15 +355,14 @@ void generateDateTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/date.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/date.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with INT16 data type * generate one parquet file with INT16 data type
*/ */
void generateInt16Table(std::string fileDir) void generateInt16Table(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -384,15 +383,14 @@ void generateInt16Table(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/int16.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/int16.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with INT8 data type * generate one parquet file with INT8 data type
*/ */
void generateInt8Table(std::string fileDir) void generateInt8Table(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -407,22 +405,20 @@ void generateInt8Table(std::string fileDir)
std::shared_ptr<arrow::Array> i8array; std::shared_ptr<arrow::Array> i8array;
PARQUET_THROW_NOT_OK(i8builder.Finish(&i8array)); PARQUET_THROW_NOT_OK(i8builder.Finish(&i8array));
std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::int8())}); std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::int8())});
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {i8array}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {i8array});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/int8.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/int8.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with DECIMAL data type * generate one parquet file with DECIMAL data type
*/ */
void generateDecimalTable(std::string fileDir) void generateDecimalTable(std::string fileDir)
{ {
// ----------------------decimal // ----------------------decimal
@@ -443,15 +439,14 @@ void generateDecimalTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/decimal.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/decimal.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with UNSIGNED INT data type * generate one parquet file with UNSIGNED INT data type
*/ */
void generateUintTable(std::string fileDir) void generateUintTable(std::string fileDir)
{ {
// generate data // generate data
@@ -463,29 +458,27 @@ void generateUintTable(std::string fileDir)
validity[2] = 0; validity[2] = 0;
validity[3] = 0; validity[3] = 0;
std::vector<uint32_t> values; std::vector<uint32_t> values;
for (uint32_t i = 0; i < reserve_num-1; i++) for (uint32_t i = 0; i < reserve_num - 1; i++)
values.push_back(i); values.push_back(i);
values.push_back(2147483648); values.push_back(2147483648);
PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity)); PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity));
std::shared_ptr<arrow::Array> array; std::shared_ptr<arrow::Array> array;
PARQUET_THROW_NOT_OK(builder.Finish(&array)); PARQUET_THROW_NOT_OK(builder.Finish(&array));
std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::uint32())}); std::shared_ptr<arrow::Schema> schema = arrow::schema({arrow::field("col1", arrow::uint32())});
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint32.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint32.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with UNSIGNED INT16 data type * generate one parquet file with UNSIGNED INT16 data type
*/ */
void generateUint16Table(std::string fileDir) void generateUint16Table(std::string fileDir)
{ {
uint16_t reserve_num = 30; uint16_t reserve_num = 30;
@@ -506,15 +499,14 @@ void generateUint16Table(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint16.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint16.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with UNSIGNED INT8 data type * generate one parquet file with UNSIGNED INT8 data type
*/ */
void generateUint8Table(std::string fileDir) void generateUint8Table(std::string fileDir)
{ {
uint8_t reserve_num = 30; uint8_t reserve_num = 30;
@@ -535,15 +527,14 @@ void generateUint8Table(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint8.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint8.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with UNSIGNED INT64 data type * generate one parquet file with UNSIGNED INT64 data type
*/ */
void generateUint64Table(std::string fileDir) void generateUint64Table(std::string fileDir)
{ {
// generate data // generate data
@@ -555,7 +546,7 @@ void generateUint64Table(std::string fileDir)
validity[2] = 0; validity[2] = 0;
validity[3] = 0; validity[3] = 0;
std::vector<uint64_t> values; std::vector<uint64_t> values;
for (uint64_t i = 0; i < reserve_num-1; i++) for (uint64_t i = 0; i < reserve_num - 1; i++)
values.push_back(i); values.push_back(i);
values.push_back(2147483648); values.push_back(2147483648);
PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity)); PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity));
@@ -568,15 +559,14 @@ void generateUint64Table(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint64.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/uint64.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with BOOLEAN data type * generate one parquet file with BOOLEAN data type
*/ */
void generateBoolTable(std::string fileDir) void generateBoolTable(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -598,15 +588,14 @@ void generateBoolTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/bool.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/bool.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate one parquet file with NULL data type * generate one parquet file with NULL data type
*/ */
void generateNullTable(std::string fileDir) void generateNullTable(std::string fileDir)
{ {
int reserve_num = 30; int reserve_num = 30;
@@ -623,15 +612,14 @@ void generateNullTable(std::string fileDir)
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/null.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/null.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 3));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
} }
/** /**
* generate different parquet files with one data type * generate different parquet files with one data type
*/ */
void generateTable(std::string fileDir) void generateTable(std::string fileDir)
{ {
generateBoolTable(fileDir); generateBoolTable(fileDir);
@@ -655,7 +643,7 @@ void generateTable(std::string fileDir)
/** /**
* generate one parquet file with different data types * generate one parquet file with different data types
*/ */
void generateAllTable(std::string fileDir) void generateAllTable(std::string fileDir)
{ {
const int reserve_num = 30; const int reserve_num = 30;
@@ -679,7 +667,7 @@ void generateAllTable(std::string fileDir)
int32Validity[2] = 0; int32Validity[2] = 0;
int32Validity[3] = 0; int32Validity[3] = 0;
std::vector<int32_t> int32Values; std::vector<int32_t> int32Values;
for (int32_t i = 0; i < reserve_num-1; i++) for (int32_t i = 0; i < reserve_num - 1; i++)
int32Values.push_back(i); int32Values.push_back(i);
int32Values.push_back(static_cast<int32_t>(2147483648)); int32Values.push_back(static_cast<int32_t>(2147483648));
PARQUET_THROW_NOT_OK(int32Builder.AppendValues(int32Values, int32Validity)); PARQUET_THROW_NOT_OK(int32Builder.AppendValues(int32Values, int32Validity));
@@ -694,7 +682,7 @@ void generateAllTable(std::string fileDir)
int64Validity[2] = 0; int64Validity[2] = 0;
int64Validity[3] = 0; int64Validity[3] = 0;
std::vector<int64_t> int64Values; std::vector<int64_t> int64Values;
for (int64_t i = 0; i < reserve_num-1; i++) for (int64_t i = 0; i < reserve_num - 1; i++)
int64Values.push_back(i); int64Values.push_back(i);
int64Values.push_back(2147483648); int64Values.push_back(2147483648);
PARQUET_THROW_NOT_OK(int64Builder.AppendValues(int64Values, int64Validity)); PARQUET_THROW_NOT_OK(int64Builder.AppendValues(int64Values, int64Validity));
@@ -708,7 +696,7 @@ void generateAllTable(std::string fileDir)
floatValidity[2] = 0; floatValidity[2] = 0;
std::vector<float> floatValues; std::vector<float> floatValues;
for (int i = 0; i < reserve_num; i++) for (int i = 0; i < reserve_num; i++)
floatValues.push_back(i+1.5); floatValues.push_back(i + 1.5);
PARQUET_THROW_NOT_OK(floatBuilder.AppendValues(floatValues, floatValidity)); PARQUET_THROW_NOT_OK(floatBuilder.AppendValues(floatValues, floatValidity));
std::shared_ptr<arrow::Array> floatArray; std::shared_ptr<arrow::Array> floatArray;
PARQUET_THROW_NOT_OK(floatBuilder.Finish(&floatArray)); PARQUET_THROW_NOT_OK(floatBuilder.Finish(&floatArray));
@@ -720,7 +708,7 @@ void generateAllTable(std::string fileDir)
dvalidity[3] = 0; dvalidity[3] = 0;
std::vector<double> dvalues; std::vector<double> dvalues;
for (int i = 0; i < reserve_num; i++) for (int i = 0; i < reserve_num; i++)
dvalues.push_back(i+2.5); dvalues.push_back(i + 2.5);
PARQUET_THROW_NOT_OK(doubleBuilder.AppendValues(dvalues, dvalidity)); PARQUET_THROW_NOT_OK(doubleBuilder.AppendValues(dvalues, dvalidity));
std::shared_ptr<arrow::Array> doubleArray; std::shared_ptr<arrow::Array> doubleArray;
PARQUET_THROW_NOT_OK(doubleBuilder.Finish(&doubleArray)); PARQUET_THROW_NOT_OK(doubleBuilder.Finish(&doubleArray));
@@ -731,7 +719,7 @@ void generateAllTable(std::string fileDir)
std::vector<bool> time32validity(reserve_num, true); std::vector<bool> time32validity(reserve_num, true);
std::vector<int32_t> time32values; std::vector<int32_t> time32values;
for (int32_t i = 0; i < reserve_num; i++) for (int32_t i = 0; i < reserve_num; i++)
time32values.push_back(i*3605001); time32values.push_back(i * 3605001);
PARQUET_THROW_NOT_OK(time32builder.AppendValues(time32values, time32validity)); PARQUET_THROW_NOT_OK(time32builder.AppendValues(time32values, time32validity));
std::shared_ptr<arrow::Array> time32array; std::shared_ptr<arrow::Array> time32array;
PARQUET_THROW_NOT_OK(time32builder.Finish(&time32array)); PARQUET_THROW_NOT_OK(time32builder.Finish(&time32array));
@@ -743,7 +731,7 @@ void generateAllTable(std::string fileDir)
std::vector<bool> time64validity(reserve_num, true); std::vector<bool> time64validity(reserve_num, true);
std::vector<int64_t> time64values; std::vector<int64_t> time64values;
for (int64_t i = 0; i < reserve_num64; i++) for (int64_t i = 0; i < reserve_num64; i++)
time64values.push_back(i*3605000001); time64values.push_back(i * 3605000001);
PARQUET_THROW_NOT_OK(time64builder.AppendValues(time64values, time64validity)); PARQUET_THROW_NOT_OK(time64builder.AppendValues(time64values, time64validity));
std::shared_ptr<arrow::Array> time64array; std::shared_ptr<arrow::Array> time64array;
PARQUET_THROW_NOT_OK(time64builder.Finish(&time64array)); PARQUET_THROW_NOT_OK(time64builder.Finish(&time64array));
@@ -753,7 +741,7 @@ void generateAllTable(std::string fileDir)
PARQUET_THROW_NOT_OK(strbuilder.Reserve(reserve_num)); PARQUET_THROW_NOT_OK(strbuilder.Reserve(reserve_num));
uint8_t validity1[reserve_num]; uint8_t validity1[reserve_num];
std::vector<std::string> values1; std::vector<std::string> values1;
for (int64_t i = reserve_num-1; i >= 0; i--) for (int64_t i = reserve_num - 1; i >= 0; i--)
{ {
validity1[i] = 1; validity1[i] = 1;
} }
@@ -895,7 +883,7 @@ void generateAllTable(std::string fileDir)
uint32Validity[2] = 0; uint32Validity[2] = 0;
uint32Validity[3] = 0; uint32Validity[3] = 0;
std::vector<uint32_t> uint32Values; std::vector<uint32_t> uint32Values;
for (uint32_t i = 0; i < reserve_num-1; i++) for (uint32_t i = 0; i < reserve_num - 1; i++)
uint32Values.push_back(i); uint32Values.push_back(i);
uint32Values.push_back(2147483648); uint32Values.push_back(2147483648);
PARQUET_THROW_NOT_OK(uint32Builder.AppendValues(uint32Values, uint32Validity)); PARQUET_THROW_NOT_OK(uint32Builder.AppendValues(uint32Values, uint32Validity));
@@ -933,7 +921,7 @@ void generateAllTable(std::string fileDir)
uint64Validity[2] = 0; uint64Validity[2] = 0;
uint64Validity[3] = 0; uint64Validity[3] = 0;
std::vector<uint64_t> uint64Values; std::vector<uint64_t> uint64Values;
for (uint64_t i = 0; i < ureserve_num-1; i++) for (uint64_t i = 0; i < ureserve_num - 1; i++)
uint64Values.push_back(i); uint64Values.push_back(i);
uint64Values.push_back(2147483648); uint64Values.push_back(2147483648);
PARQUET_THROW_NOT_OK(uint64Builder.AppendValues(uint64Values, uint64Validity)); PARQUET_THROW_NOT_OK(uint64Builder.AppendValues(uint64Values, uint64Validity));
@@ -955,7 +943,7 @@ void generateAllTable(std::string fileDir)
uint8_t binaryValidity[reserve_num]; uint8_t binaryValidity[reserve_num];
std::vector<std::string> binaryValues; std::vector<std::string> binaryValues;
for (int32_t i = reserve_num-1; i >= 0; i--) for (int32_t i = reserve_num - 1; i >= 0; i--)
{ {
binaryValidity[i] = 1; binaryValidity[i] = 1;
} }
@@ -1006,8 +994,8 @@ void generateAllTable(std::string fileDir)
// make schema // make schema
// 28 cols // 28 cols
std::shared_ptr<arrow::Schema> schema = arrow::schema({ std::shared_ptr<arrow::Schema> schema =
arrow::field("col1", arrow::int32()), arrow::schema({arrow::field("col1", arrow::int32()),
arrow::field("col2", arrow::int64()), arrow::field("col2", arrow::int64()),
arrow::field("col3", arrow::float32()), arrow::field("col3", arrow::float32()),
arrow::field("col4", arrow::float64()), arrow::field("col4", arrow::float64()),
@@ -1034,44 +1022,18 @@ void generateAllTable(std::string fileDir)
arrow::field("col25", arrow::timestamp(arrow::TimeUnit::MICRO)), arrow::field("col25", arrow::timestamp(arrow::TimeUnit::MICRO)),
arrow::field("col26", arrow::timestamp(arrow::TimeUnit::MICRO)), arrow::field("col26", arrow::timestamp(arrow::TimeUnit::MICRO)),
arrow::field("col27", arrow::binary()), arrow::field("col27", arrow::binary()),
arrow::field("col28", arrow::fixed_size_binary(4)) arrow::field("col28", arrow::fixed_size_binary(4))});
}); std::shared_ptr<arrow::Table> table = arrow::Table::Make(
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, { schema,
int32Array, {int32Array, int64Array, floatArray, doubleArray, time32array, strarray, strarray,
int64Array, strarray, strarray, strarray, strarray, tsarray, date32array, tsarray,
floatArray, i16array, i8array, decimalArray, uint32Array, ui16array, ui8array, uint64Array,
doubleArray, boolArray, decimalArray1, time64array, tsarray1, tsarray1, binaryArray, fixedSizeArray});
time32array,
strarray,
strarray,
strarray,
strarray,
strarray,
strarray,
tsarray,
date32array,
tsarray,
i16array,
i8array,
decimalArray,
uint32Array,
ui16array,
ui8array,
uint64Array,
boolArray,
decimalArray1,
time64array,
tsarray1,
tsarray1,
binaryArray,
fixedSizeArray
});
// write to file // write to file
arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::MemoryPool* pool = arrow::default_memory_pool();
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/tests.parquet"));
outfile, arrow::io::FileOutputStream::Open(fileDir + "/tests.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 1000)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 1000));
PARQUET_THROW_NOT_OK(outfile->Close()); PARQUET_THROW_NOT_OK(outfile->Close());
@@ -1082,8 +1044,8 @@ void generateAllTable(std::string fileDir)
std::shared_ptr<arrow::Array> nullarray; std::shared_ptr<arrow::Array> nullarray;
PARQUET_THROW_NOT_OK(nullBuilder.Finish(&nullarray)); PARQUET_THROW_NOT_OK(nullBuilder.Finish(&nullarray));
std::shared_ptr<arrow::Schema> schema1 = arrow::schema({ std::shared_ptr<arrow::Schema> schema1 =
arrow::field("col1", arrow::null()), arrow::schema({arrow::field("col1", arrow::null()),
arrow::field("col2", arrow::null()), arrow::field("col2", arrow::null()),
arrow::field("col3", arrow::null()), arrow::field("col3", arrow::null()),
arrow::field("col4", arrow::null()), arrow::field("col4", arrow::null()),
@@ -1110,123 +1072,172 @@ void generateAllTable(std::string fileDir)
arrow::field("col25", arrow::timestamp(arrow::TimeUnit::MICRO)), arrow::field("col25", arrow::timestamp(arrow::TimeUnit::MICRO)),
arrow::field("col26", arrow::timestamp(arrow::TimeUnit::MILLI)), arrow::field("col26", arrow::timestamp(arrow::TimeUnit::MILLI)),
arrow::field("col27", arrow::null()), arrow::field("col27", arrow::null()),
arrow::field("col28", arrow::null()) arrow::field("col28", arrow::null())});
}); std::shared_ptr<arrow::Table> table1 = arrow::Table::Make(
std::shared_ptr<arrow::Table> table1 = arrow::Table::Make(schema1, { schema1, {nullarray, nullarray, nullarray, nullarray, nullarray, nullarray, nullarray,
nullarray, strarray, nullarray, nullarray, strarray, tsarray, nullarray, nullarray,
nullarray, nullarray, nullarray, nullarray, nullarray, nullarray, nullarray, nullarray,
nullarray, nullarray, nullarray, nullarray, tsarray1, tsarray, nullarray, nullarray});
nullarray,
nullarray,
nullarray,
nullarray,
strarray,
nullarray,
nullarray,
strarray,
tsarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
nullarray,
tsarray1,
tsarray,
nullarray,
nullarray
});
std::shared_ptr<arrow::io::FileOutputStream> outfile1; std::shared_ptr<arrow::io::FileOutputStream> outfile1;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(outfile1, arrow::io::FileOutputStream::Open(fileDir + "/nulls.parquet"));
outfile1, arrow::io::FileOutputStream::Open(fileDir + "/nulls.parquet"));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table1, pool, outfile1, 3)); PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table1, pool, outfile1, 3));
PARQUET_THROW_NOT_OK(outfile1->Close()); PARQUET_THROW_NOT_OK(outfile1->Close());
} }
/**
* generate large volume parquet files class LargeDataProducer : public arrow::RecordBatchReader
*/
void generateLargeTable(int64_t reserve_num, std::string rowNum, std::string fileDir)
{ {
// int32 public:
arrow::Int32Builder builder; LargeDataProducer(size_t numberOfRowsToProduce, size_t chunkSize)
// int64_t reserve_num = 1000000; : numberOfRowsToProduce_(numberOfRowsToProduce), chunkSize_(chunkSize)
PARQUET_THROW_NOT_OK(builder.Reserve(reserve_num));
std::vector<bool> validity(reserve_num, true);
std::vector<int32_t> values;
for (int32_t i = 0; i < reserve_num; i++)
values.push_back(i);
PARQUET_THROW_NOT_OK(builder.AppendValues(values, validity));
std::shared_ptr<arrow::Array> array;
PARQUET_THROW_NOT_OK(builder.Finish(&array));
// timestamp
arrow::TimestampBuilder tsbuilder(arrow::timestamp(arrow::TimeUnit::MILLI), arrow::default_memory_pool());
PARQUET_THROW_NOT_OK(tsbuilder.Reserve(reserve_num));
std::vector<bool> tsvalidity(reserve_num, true);
std::vector<int64_t> tsvalues;
for (int64_t i = 0; i < reserve_num; i++)
tsvalues.push_back(i * 10000001);
PARQUET_THROW_NOT_OK(tsbuilder.AppendValues(tsvalues, tsvalidity));
std::shared_ptr<arrow::Array> tsarray;
PARQUET_THROW_NOT_OK(tsbuilder.Finish(&tsarray));
// string
arrow::StringBuilder strbuilder;
PARQUET_THROW_NOT_OK(strbuilder.Reserve(reserve_num));
std::vector<std::string> values1;
for (int64_t i = reserve_num-1; i >= 0; i--)
{ {
values1.push_back(std::string("hhhh")); PARQUET_THROW_NOT_OK(i32builder_.Reserve(chunkSize));
PARQUET_THROW_NOT_OK(tsbuilder_.Reserve(chunkSize));
PARQUET_THROW_NOT_OK(d128builder_.Reserve(chunkSize));
PARQUET_THROW_NOT_OK(strbuilder_.Reserve(chunkSize));
PARQUET_THROW_NOT_OK(doublebuilder_.Reserve(chunkSize));
} }
PARQUET_THROW_NOT_OK(strbuilder.AppendValues(values1));
std::shared_ptr<arrow::Array> strarray;
PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray));
// decimal virtual std::shared_ptr<arrow::Schema> schema() const
auto t = arrow::Decimal128Type::Make(38, 10); {
PARQUET_ASSIGN_OR_THROW(auto t1, t); return arrow::schema(
arrow::Decimal128Builder d128builder(t1, arrow::default_memory_pool()); {arrow::field("col1", arrow::int32()), arrow::field("col2", arrow::timestamp(arrow::TimeUnit::MILLI)),
for (int64_t i = 0; i < reserve_num; i++) arrow::field("col3", arrow::utf8()), arrow::field("col4", arrow::decimal128(38, 10)),
PARQUET_THROW_NOT_OK(d128builder.Append(arrow::Decimal128("1234567890987654321.12345678"))); arrow::field("col5", arrow::float64()), arrow::field("col6", arrow::utf8())});
std::shared_ptr<arrow::Array> decimalArray; }
PARQUET_THROW_NOT_OK(d128builder.Finish(&decimalArray));
// double virtual arrow::Status ReadNext(std::shared_ptr<arrow::RecordBatch>* batch)
arrow::DoubleBuilder doublebuilder; {
PARQUET_THROW_NOT_OK(doublebuilder.Reserve(reserve_num)); if (restChunkSize() == 0)
std::vector<bool> dvalidity(reserve_num, true); {
batch = nullptr;
}
else
{
*batch = arrow::RecordBatch::Make(schema(), restChunkSize(),
{ProduceInts32(), ProduceTimeStamps(), ProduceStrings(),
ProduceDecimals(), ProduceDoubles(), ProduceStrings()});
}
rowsProduced_ += restChunkSize();
return arrow::Status::OK();
}
private:
size_t restChunkSize()
{
return std::min(chunkSize_, numberOfRowsToProduce_ - rowsProduced_);
}
std::shared_ptr<arrow::Array> ProduceInts32()
{
size_t num = restChunkSize();
std::vector<bool> validity(num, true);
std::vector<int32_t> values;
values.reserve(num);
for (int32_t i = (int32_t)rowsProduced_; i < (int32_t)(rowsProduced_ + num); i++)
values.push_back(i);
PARQUET_THROW_NOT_OK(i32builder_.AppendValues(values, validity));
std::shared_ptr<arrow::Array> array;
PARQUET_THROW_NOT_OK(i32builder_.Finish(&array));
i32builder_.Reset();
return array;
}
std::shared_ptr<arrow::Array> ProduceDoubles()
{
size_t num = restChunkSize();
std::vector<bool> dvalidity(num, true);
std::vector<double> dvalues; std::vector<double> dvalues;
for (int i = 0; i < reserve_num; i++) for (size_t i = rowsProduced_; i < rowsProduced_ + num; i++)
dvalues.push_back(i+2.5); dvalues.push_back(i + 2.5);
PARQUET_THROW_NOT_OK(doublebuilder.AppendValues(dvalues, dvalidity)); PARQUET_THROW_NOT_OK(doublebuilder_.AppendValues(dvalues, dvalidity));
std::shared_ptr<arrow::Array> doublearray; std::shared_ptr<arrow::Array> doublearray;
PARQUET_THROW_NOT_OK(doublebuilder.Finish(&doublearray)); PARQUET_THROW_NOT_OK(doublebuilder_.Finish(&doublearray));
doublebuilder_.Reset();
return doublearray;
}
std::shared_ptr<arrow::Schema> schema = arrow::schema({ std::shared_ptr<arrow::Array> ProduceStrings()
arrow::field("col1", arrow::int32()), {
arrow::field("col2", arrow::timestamp(arrow::TimeUnit::MILLI)), size_t num = restChunkSize();
arrow::field("col3", arrow::utf8()), std::vector<std::string> values;
arrow::field("col4", arrow::decimal128(38, 10)), for (size_t i = 0; i < num ; i++)
arrow::field("col5", arrow::float64()), {
arrow::field("col6", arrow::utf8()) values.push_back(std::string("hhhh"));
}); }
std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array, tsarray, strarray, decimalArray, doublearray, strarray}); PARQUET_THROW_NOT_OK(strbuilder_.AppendValues(values));
std::shared_ptr<arrow::Array> strarray;
PARQUET_THROW_NOT_OK(strbuilder_.Finish(&strarray));
strbuilder_.Reset();
return strarray;
}
// write to file std::shared_ptr<arrow::Array> ProduceTimeStamps()
arrow::MemoryPool* pool = arrow::default_memory_pool(); {
size_t num = restChunkSize();
std::vector<bool> tsvalidity(num, true);
std::vector<int64_t> tsvalues;
for (int64_t i = 0; i < int64_t(num); i++)
tsvalues.push_back(i * 10000001);
PARQUET_THROW_NOT_OK(tsbuilder_.AppendValues(tsvalues, tsvalidity));
std::shared_ptr<arrow::Array> tsarray;
PARQUET_THROW_NOT_OK(tsbuilder_.Finish(&tsarray));
tsbuilder_.Reset();
return tsarray;
}
std::shared_ptr<arrow::Array> ProduceDecimals()
{
size_t num = restChunkSize();
for (size_t i = 0; i < num; i++)
PARQUET_THROW_NOT_OK(d128builder_.Append(arrow::Decimal128("1234567890987654321.12345678")));
std::shared_ptr<arrow::Array> decimalArray;
PARQUET_THROW_NOT_OK(d128builder_.Finish(&decimalArray));
d128builder_.Reset();
return decimalArray;
}
private:
arrow::Int32Builder i32builder_;
arrow::DoubleBuilder doublebuilder_;
arrow::StringBuilder strbuilder_;
arrow::TimestampBuilder tsbuilder_{arrow::timestamp(arrow::TimeUnit::MILLI), arrow::default_memory_pool()};
arrow::Decimal128Builder d128builder_{arrow::Decimal128Type::Make(38, 10).ValueOrDie()};
size_t numberOfRowsToProduce_ = 100;
size_t chunkSize_ = 10;
size_t rowsProduced_ = 0;
};
void generateLargeTable(size_t reserve_num, std::string rowNum, std::string fileDir)
{
// Data is in RBR
std::shared_ptr<arrow::RecordBatchReader> batch_stream = std::make_shared<LargeDataProducer>(reserve_num, 1000000);
// Choose compression
std::shared_ptr<parquet::WriterProperties> props = parquet::WriterProperties::Builder().build();
// Opt to store Arrow schema for easier reads back into Arrow
std::shared_ptr<parquet::ArrowWriterProperties> arrow_props = parquet::ArrowWriterProperties::Builder().build();
// Create a writer
std::shared_ptr<arrow::io::FileOutputStream> outfile; std::shared_ptr<arrow::io::FileOutputStream> outfile;
PARQUET_ASSIGN_OR_THROW(outfile, arrow::io::FileOutputStream::Open(fileDir + "/" + rowNum + "MRows.parquet"));
std::unique_ptr<parquet::arrow::FileWriter> writer;
PARQUET_ASSIGN_OR_THROW( PARQUET_ASSIGN_OR_THROW(
outfile, arrow::io::FileOutputStream::Open(fileDir + "/" + rowNum + "MRows.parquet")); writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), arrow::default_memory_pool(),
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, pool, outfile, 100000)); outfile, props, arrow_props));
PARQUET_THROW_NOT_OK(outfile->Close());
// Write each batch as a row_group
for (arrow::Result<std::shared_ptr<arrow::RecordBatch>> maybe_batch : *batch_stream)
{
PARQUET_ASSIGN_OR_THROW(auto batch, maybe_batch);
PARQUET_ASSIGN_OR_THROW(auto table, arrow::Table::FromRecordBatches(batch->schema(), {batch}));
PARQUET_THROW_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows()));
}
// Write file footer and close
PARQUET_THROW_NOT_OK(writer->Close());
} }
int main(int argc, char** argv) int main(int argc, char** argv)