
MCOL-5505 add parquet support for cpimport and add mcs_parquet_ddl and mcs_parquet_gen tools

HanpyBin authored 2023-08-20 16:01:58 +08:00; committed by Leonid Fedorov
parent 94a680ea60
commit fe597ec78c
25 changed files with 4677 additions and 251 deletions
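This change wires Apache Arrow/Parquet into the bulk loader and ships two helper tools. As a quick orientation, a hypothetical end-to-end session distilled from the MTR tests in this commit (paths and database name illustrative) would be:

mcs_parquet_gen -a -f /path/to/std_data
mcs_parquet_ddl /path/to/std_data/tests.parquet /path/to/std_data/parquet_test_table.ddl
mysql my_db < /path/to/std_data/parquet_test_table.ddl
cpimport my_db parquet_test_table /path/to/std_data/tests.parquet

mcs_parquet_gen writes sample .parquet files (-a for the all-types files, -l for the large-volume ones), mcs_parquet_ddl derives a ColumnStore CREATE TABLE script from a parquet schema (the table is named after the output file), and cpimport detects the .parquet extension and switches to the Arrow read path.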


@ -0,0 +1,77 @@
DROP DATABASE IF EXISTS mcol_5505_parquet_large_volume;
CREATE DATABASE mcol_5505_parquet_large_volume;
USE mcol_5505_parquet_large_volume;
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t2(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t3(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t4(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
SELECT * FROM t1 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t1;
COUNT(*)
1000000
SELECT * FROM t2 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t2;
COUNT(*)
10000000
SELECT * FROM t3 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t3;
COUNT(*)
50000000
SELECT * FROM t4 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t4;
COUNT(*)
100000000
DROP DATABASE mcol_5505_parquet_large_volume;


@ -0,0 +1,100 @@
DROP DATABASE IF EXISTS mcol_5505_cpimport_parquet;
CREATE DATABASE mcol_5505_cpimport_parquet;
USE mcol_5505_cpimport_parquet;
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 BIGINT,
col3 FLOAT,
col4 DOUBLE,
col5 TIME(3),
col6 VARCHAR(2),
col7 VARCHAR(5),
col8 VARCHAR(20),
col9 CHAR(2),
col10 CHAR(5),
col11 CHAR(20),
col12 TIMESTAMP(3),
col13 DATE,
col14 DATETIME(3),
col15 SMALLINT,
col16 TINYINT,
col17 DECIMAL(9,3),
col18 INT UNSIGNED,
col19 SMALLINT UNSIGNED,
col20 TINYINT UNSIGNED,
col21 BIGINT UNSIGNED,
col22 BOOLEAN,
col23 DECIMAL(38,10),
col24 TIME(6),
col25 TIMESTAMP(6),
col26 DATETIME(6),
col27 CHAR(4),
col28 CHAR(4)
) ENGINE=Columnstore;
SELECT * FROM t1;
col1 col2 col3 col4 col5 col6 col7 col8 col9 col10 col11 col12 col13 col14 col15 col16 col17 col18 col19 col20 col21 col22 col23 col24 col25 col26 col27 col28
0 0 1.5 2.5 00:00:00.000 a a a a a a 0000-00-00 00:00:00.000 1970-01-01 1970-01-01 00:00:00.000 0 0 1383.433 0 0 0 0 1 12345678909876543.2112345678 00:00:00.000000 0000-00-00 00:00:00.000000 1970-01-01 00:00:00.000000 a abcd
NULL NULL 2.5 3.5 01:00:05.001 NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.001 1970-01-11 1970-01-01 02:46:40.001 1 1 NULL NULL 1 1 NULL 1 12345678909876543.2112345678 01:00:05.000001 1970-01-01 10:46:40.000001 1970-01-01 02:46:40.000001 ab abcd
NULL NULL NULL 4.5 02:00:10.002 ab abcd abcd ab abcd abcd 1970-01-01 13:33:20.002 1970-01-21 1970-01-01 05:33:20.002 2 2 532235.234 NULL 2 2 NULL 1 12345678909876543.2112345678 02:00:10.000002 1970-01-01 13:33:20.000002 1970-01-01 05:33:20.000002 abcd abcd
NULL NULL 4.5 NULL 03:00:15.003 ab abcde abcde ab abcde abcde 1970-01-01 16:20:00.003 1970-01-31 1970-01-01 08:20:00.003 3 3 NULL NULL 3 3 NULL 1 12345678909876543.2112345678 03:00:15.000003 1970-01-01 16:20:00.000003 1970-01-01 08:20:00.000003 abcd abcd
4 4 5.5 6.5 04:00:20.004 ab abcde abcdefg ab abcde abcdefg 1970-01-01 19:06:40.004 1970-02-10 1970-01-01 11:06:40.004 4 4 5325.234 4 4 4 4 1 12345678909876543.2112345678 04:00:20.000004 1970-01-01 19:06:40.000004 1970-01-01 11:06:40.000004 abcd abcd
5 5 6.5 7.5 05:00:25.005 Wh Whlg1 Whlg1xXAxP Wh Whlg1 Whlg1xXAxP 1970-01-01 21:53:20.005 1970-02-20 1970-01-01 13:53:20.005 5 5 NULL 5 5 5 5 0 12345678909876543.2112345678 05:00:25.000005 1970-01-01 21:53:20.000005 1970-01-01 13:53:20.000005 Whlg abcd
6 6 7.5 8.5 06:00:30.006 4N 4Nimz 4NimzSQzMD 4N 4Nimz 4NimzSQzMD 1970-01-02 00:40:00.006 1970-03-02 1970-01-01 16:40:00.006 6 6 1383.433 6 6 6 6 1 12345678909876543.2112345678 06:00:30.000006 1970-01-02 00:40:00.000006 1970-01-01 16:40:00.000006 4Nim abcd
7 7 8.5 9.5 07:00:35.007 G2 G23ne G23ne3j92Ky0wBF G2 G23ne G23ne3j92Ky0wBF 1970-01-02 03:26:40.007 1970-03-12 1970-01-01 19:26:40.007 7 7 NULL 7 7 7 7 1 12345678909876543.2112345678 07:00:35.000007 1970-01-02 03:26:40.000007 1970-01-01 19:26:40.000007 G23n abcd
8 8 9.5 10.5 08:00:40.008 F4 F4z F4z F4 F4z F4z 1970-01-02 06:13:20.008 1970-03-22 1970-01-01 22:13:20.008 8 8 532235.234 8 8 8 8 1 12345678909876543.2112345678 08:00:40.000008 1970-01-02 06:13:20.000008 1970-01-01 22:13:20.000008 F4z abcd
9 9 10.5 11.5 09:00:45.009 8J 8JCVT 8JCVTsGYB7V 8J 8JCVT 8JCVTsGYB7V 1970-01-02 09:00:00.009 1970-04-01 1970-01-02 01:00:00.009 9 9 NULL 9 9 9 9 1 12345678909876543.2112345678 09:00:45.000009 1970-01-02 09:00:00.000009 1970-01-02 01:00:00.000009 8JCV abcd
10 10 11.5 12.5 10:00:50.010 23 23235 23235 23 23235 23235 1970-01-02 11:46:40.010 1970-04-11 1970-01-02 03:46:40.010 10 10 5325.234 10 10 10 10 1 12345678909876543.2112345678 10:00:50.000010 1970-01-02 11:46:40.000010 1970-01-02 03:46:40.000010 2323 abcd
11 11 12.5 13.5 11:00:55.011 sd sda22 sda22 sd sda22 sda22 1970-01-02 14:33:20.011 1970-04-21 1970-01-02 06:33:20.011 11 11 NULL 11 11 11 11 1 12345678909876543.2112345678 11:00:55.000011 1970-01-02 14:33:20.000011 1970-01-02 06:33:20.000011 sda2 abcd
12 12 13.5 14.5 12:01:00.012 SD SD7sd SD7sdFD7 SD SD7sd SD7sdFD7 1970-01-02 17:20:00.012 1970-05-01 1970-01-02 09:20:00.012 12 12 1383.433 12 12 12 12 1 12345678909876543.2112345678 12:01:00.000012 1970-01-02 17:20:00.000012 1970-01-02 09:20:00.000012 SD7s abcd
13 13 14.5 15.5 13:01:05.013 gv gvv3h gvv3hYwdfOD gv gvv3h gvv3hYwdfOD 1970-01-02 20:06:40.013 1970-05-11 1970-01-02 12:06:40.013 13 13 NULL 13 13 13 13 1 12345678909876543.2112345678 13:01:05.000013 1970-01-02 20:06:40.000013 1970-01-02 12:06:40.000013 gvv3 abcd
14 14 15.5 16.5 14:01:10.014 y8 y8wjo y8wjo4v50s6 y8 y8wjo y8wjo4v50s6 1970-01-02 22:53:20.014 1970-05-21 1970-01-02 14:53:20.014 14 14 532235.234 14 14 14 14 1 12345678909876543.2112345678 14:01:10.000014 1970-01-02 22:53:20.000014 1970-01-02 14:53:20.000014 y8wj abcd
15 15 16.5 17.5 15:01:15.015 aN aNJW5 aNJW56SJieE8KVV aN aNJW5 aNJW56SJieE8KVV 1970-01-03 01:40:00.015 1970-05-31 1970-01-02 17:40:00.015 15 15 NULL 15 15 15 15 1 12345678909876543.2112345678 15:01:15.000015 1970-01-03 01:40:00.000015 1970-01-02 17:40:00.000015 aNJW abcd
16 16 17.5 18.5 16:01:20.016 1+ 1+2=3 1+2=3 1+ 1+2=3 1+2=3 1970-01-03 04:26:40.016 1970-06-10 1970-01-02 20:26:40.016 16 16 5325.234 16 16 16 16 1 12345678909876543.2112345678 16:01:20.000016 1970-01-03 04:26:40.000016 1970-01-02 20:26:40.000016 1+2= abcd
17 17 18.5 19.5 17:01:25.017 He Hello Hello World! He Hello Hello World! 1970-01-03 07:13:20.017 1970-06-20 1970-01-02 23:13:20.017 17 17 NULL 17 17 17 17 1 12345678909876543.2112345678 17:01:25.000017 1970-01-03 07:13:20.000017 1970-01-02 23:13:20.000017 Hell abcd
18 18 19.5 20.5 18:01:30.018 1! 1!!!1 1!!!1 1! 1!!!1 1!!!1 1970-01-03 10:00:00.018 1970-06-30 1970-01-03 02:00:00.018 18 18 1383.433 18 18 18 18 1 12345678909876543.2112345678 18:01:30.000018 1970-01-03 10:00:00.000018 1970-01-03 02:00:00.000018 1!!! abcd
19 19 20.5 21.5 19:01:35.019 82 82440 824407880313877 82 82440 824407880313877 1970-01-03 12:46:40.019 1970-07-10 1970-01-03 04:46:40.019 19 19 NULL 19 19 19 19 1 12345678909876543.2112345678 19:01:35.000019 1970-01-03 12:46:40.000019 1970-01-03 04:46:40.000019 8244 abcd
20 20 21.5 22.5 20:01:40.020 19 1970- 1970-01-01 08:02:23 19 1970- 1970-01-01 08:02:23 1970-01-03 15:33:20.020 1970-07-20 1970-01-03 07:33:20.020 20 20 532235.234 20 20 20 20 1 12345678909876543.2112345678 20:01:40.000020 1970-01-03 15:33:20.000020 1970-01-03 07:33:20.000020 1970 abcd
21 21 22.5 23.5 21:01:45.021 19 1970- 1970-05-31 19 1970- 1970-05-31 1970-01-03 18:20:00.021 1970-07-30 1970-01-03 10:20:00.021 21 21 NULL 21 21 21 21 1 12345678909876543.2112345678 21:01:45.000021 1970-01-03 18:20:00.000021 1970-01-03 10:20:00.000021 1970 abcd
22 22 23.5 24.5 22:01:50.022 xx xxx xxx xx xxx xxx 1970-01-03 21:06:40.022 1970-08-09 1970-01-03 13:06:40.022 22 22 5325.234 22 22 22 22 1 12345678909876543.2112345678 22:01:50.000022 1970-01-03 21:06:40.000022 1970-01-03 13:06:40.000022 xxx abcd
23 23 24.5 25.5 23:01:55.023 ON ONMKM ONMKMQVBRWBUTWT ON ONMKM ONMKMQVBRWBUTWT 1970-01-03 23:53:20.023 1970-08-19 1970-01-03 15:53:20.023 23 23 NULL 23 23 23 23 1 12345678909876543.2112345678 23:01:55.000023 1970-01-03 23:53:20.000023 1970-01-03 15:53:20.000023 ONMK abcd
24 24 25.5 26.5 24:02:00.024 ZW ZWMWH ZWMWHSEZDYODQWP ZW ZWMWH ZWMWHSEZDYODQWP 1970-01-04 02:40:00.024 1970-08-29 1970-01-03 18:40:00.024 24 24 1383.433 24 24 24 24 1 12345678909876543.2112345678 24:02:00.000024 1970-01-04 02:40:00.000024 1970-01-03 18:40:00.000024 ZWMW abcd
25 25 26.5 27.5 25:02:05.025 Ho HoCYp HoCYpJ Ho HoCYp HoCYpJ 1970-01-04 05:26:40.025 1970-09-08 1970-01-03 21:26:40.025 25 25 NULL 25 25 25 25 1 12345678909876543.2112345678 25:02:05.000025 1970-01-04 05:26:40.000025 1970-01-03 21:26:40.000025 HoCY abcd
26 26 27.5 28.5 26:02:10.026 -1 -100 -100 -1 -100 -100 1970-01-04 08:13:20.026 1970-09-18 1970-01-04 00:13:20.026 26 26 532235.234 26 26 26 26 1 12345678909876543.2112345678 26:02:10.000026 1970-01-04 08:13:20.000026 1970-01-04 00:13:20.000026 -100 abcd
27 27 28.5 29.5 27:02:15.027 Iq Iqa8N Iqa8Nr Iq Iqa8N Iqa8Nr 1970-01-04 11:00:00.027 1970-09-28 1970-01-04 03:00:00.027 27 27 NULL 27 27 27 27 1 12345678909876543.2112345678 27:02:15.000027 1970-01-04 11:00:00.000027 1970-01-04 03:00:00.000027 Iqa8 abcd
28 28 29.5 30.5 28:02:20.028 nD nD274 nD274v nD nD274 nD274v 1970-01-04 13:46:40.028 1970-10-08 1970-01-04 05:46:40.028 28 28 5325.234 28 28 28 28 1 12345678909876543.2112345678 28:02:20.000028 1970-01-04 13:46:40.000028 1970-01-04 05:46:40.000028 nD27 abcd
-2147483646 2147483648 30.5 31.5 29:02:25.029 6y 6y0Jy 6y0JyW 6y 6y0Jy 6y0JyW 1970-01-04 16:33:20.029 1970-10-18 1970-01-04 08:33:20.029 29 29 NULL 2147483648 29 29 2147483648 1 12345678909876543.2112345678 29:02:25.000029 1970-01-04 16:33:20.000029 1970-01-04 08:33:20.000029 6y0J abcd
NULL NULL NULL NULL NULL NULL NULL a NULL NULL a 0000-00-00 00:00:00.000 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 0000-00-00 00:00:00.000000 1970-01-01 00:00:00.000000 NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.001 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.000001 1970-01-01 02:46:40.000001 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcd NULL NULL abcd 1970-01-01 13:33:20.002 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 13:33:20.000002 1970-01-01 05:33:20.000002 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcde NULL NULL abcde 1970-01-01 16:20:00.003 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 16:20:00.000003 1970-01-01 08:20:00.000003 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcdefg NULL NULL abcdefg 1970-01-01 19:06:40.004 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 19:06:40.000004 1970-01-01 11:06:40.000004 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Whlg1xXAxP NULL NULL Whlg1xXAxP 1970-01-01 21:53:20.005 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 21:53:20.000005 1970-01-01 13:53:20.000005 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 4NimzSQzMD NULL NULL 4NimzSQzMD 1970-01-02 00:40:00.006 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 00:40:00.000006 1970-01-01 16:40:00.000006 NULL NULL
NULL NULL NULL NULL NULL NULL NULL G23ne3j92Ky0wBF NULL NULL G23ne3j92Ky0wBF 1970-01-02 03:26:40.007 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 03:26:40.000007 1970-01-01 19:26:40.000007 NULL NULL
NULL NULL NULL NULL NULL NULL NULL F4z NULL NULL F4z 1970-01-02 06:13:20.008 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 06:13:20.000008 1970-01-01 22:13:20.000008 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 8JCVTsGYB7V NULL NULL 8JCVTsGYB7V 1970-01-02 09:00:00.009 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 09:00:00.000009 1970-01-02 01:00:00.000009 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 23235 NULL NULL 23235 1970-01-02 11:46:40.010 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 11:46:40.000010 1970-01-02 03:46:40.000010 NULL NULL
NULL NULL NULL NULL NULL NULL NULL sda22 NULL NULL sda22 1970-01-02 14:33:20.011 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 14:33:20.000011 1970-01-02 06:33:20.000011 NULL NULL
NULL NULL NULL NULL NULL NULL NULL SD7sdFD7 NULL NULL SD7sdFD7 1970-01-02 17:20:00.012 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 17:20:00.000012 1970-01-02 09:20:00.000012 NULL NULL
NULL NULL NULL NULL NULL NULL NULL gvv3hYwdfOD NULL NULL gvv3hYwdfOD 1970-01-02 20:06:40.013 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 20:06:40.000013 1970-01-02 12:06:40.000013 NULL NULL
NULL NULL NULL NULL NULL NULL NULL y8wjo4v50s6 NULL NULL y8wjo4v50s6 1970-01-02 22:53:20.014 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 22:53:20.000014 1970-01-02 14:53:20.000014 NULL NULL
NULL NULL NULL NULL NULL NULL NULL aNJW56SJieE8KVV NULL NULL aNJW56SJieE8KVV 1970-01-03 01:40:00.015 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 01:40:00.000015 1970-01-02 17:40:00.000015 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1+2=3 NULL NULL 1+2=3 1970-01-03 04:26:40.016 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 04:26:40.000016 1970-01-02 20:26:40.000016 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Hello World! NULL NULL Hello World! 1970-01-03 07:13:20.017 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 07:13:20.000017 1970-01-02 23:13:20.000017 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1!!!1 NULL NULL 1!!!1 1970-01-03 10:00:00.018 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 10:00:00.000018 1970-01-03 02:00:00.000018 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 824407880313877 NULL NULL 824407880313877 1970-01-03 12:46:40.019 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 12:46:40.000019 1970-01-03 04:46:40.000019 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1970-01-01 08:02:23 NULL NULL 1970-01-01 08:02:23 1970-01-03 15:33:20.020 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 15:33:20.000020 1970-01-03 07:33:20.000020 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1970-05-31 NULL NULL 1970-05-31 1970-01-03 18:20:00.021 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 18:20:00.000021 1970-01-03 10:20:00.000021 NULL NULL
NULL NULL NULL NULL NULL NULL NULL xxx NULL NULL xxx 1970-01-03 21:06:40.022 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 21:06:40.000022 1970-01-03 13:06:40.000022 NULL NULL
NULL NULL NULL NULL NULL NULL NULL ONMKMQVBRWBUTWT NULL NULL ONMKMQVBRWBUTWT 1970-01-03 23:53:20.023 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 23:53:20.000023 1970-01-03 15:53:20.000023 NULL NULL
NULL NULL NULL NULL NULL NULL NULL ZWMWHSEZDYODQWP NULL NULL ZWMWHSEZDYODQWP 1970-01-04 02:40:00.024 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 02:40:00.000024 1970-01-03 18:40:00.000024 NULL NULL
NULL NULL NULL NULL NULL NULL NULL HoCYpJ NULL NULL HoCYpJ 1970-01-04 05:26:40.025 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 05:26:40.000025 1970-01-03 21:26:40.000025 NULL NULL
NULL NULL NULL NULL NULL NULL NULL -100 NULL NULL -100 1970-01-04 08:13:20.026 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 08:13:20.000026 1970-01-04 00:13:20.000026 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Iqa8Nr NULL NULL Iqa8Nr 1970-01-04 11:00:00.027 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 11:00:00.000027 1970-01-04 03:00:00.000027 NULL NULL
NULL NULL NULL NULL NULL NULL NULL nD274v NULL NULL nD274v 1970-01-04 13:46:40.028 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 13:46:40.000028 1970-01-04 05:46:40.000028 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 6y0JyW NULL NULL 6y0JyW 1970-01-04 16:33:20.029 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 16:33:20.000029 1970-01-04 08:33:20.000029 NULL NULL
SELECT COUNT(*) FROM t1;
COUNT(*)
60
DROP DATABASE mcol_5505_cpimport_parquet;


@ -0,0 +1,36 @@
DROP DATABASE IF EXISTS mcol_5505_parquet_ddl;
CREATE DATABASE mcol_5505_parquet_ddl;
USE mcol_5505_parquet_ddl;
SHOW CREATE TABLE parquet_test_table;
Table Create Table
parquet_test_table CREATE TABLE `parquet_test_table` (
`col1` int(11) DEFAULT NULL,
`col2` bigint(20) DEFAULT NULL,
`col3` float DEFAULT NULL,
`col4` double DEFAULT NULL,
`col5` time(3) DEFAULT NULL,
`col6` varchar(2000) DEFAULT NULL,
`col7` varchar(2000) DEFAULT NULL,
`col8` varchar(2000) DEFAULT NULL,
`col9` varchar(2000) DEFAULT NULL,
`col10` varchar(2000) DEFAULT NULL,
`col11` varchar(2000) DEFAULT NULL,
`col12` timestamp(3) NULL DEFAULT NULL,
`col13` date DEFAULT NULL,
`col14` timestamp(3) NULL DEFAULT NULL,
`col15` smallint(6) DEFAULT NULL,
`col16` tinyint(4) DEFAULT NULL,
`col17` decimal(9,3) DEFAULT NULL,
`col18` int(10) unsigned DEFAULT NULL,
`col19` smallint(5) unsigned DEFAULT NULL,
`col20` tinyint(3) unsigned DEFAULT NULL,
`col21` bigint(20) unsigned DEFAULT NULL,
`col22` tinyint(1) DEFAULT NULL,
`col23` decimal(38,10) DEFAULT NULL,
`col24` time(6) DEFAULT NULL,
`col25` timestamp(6) NULL DEFAULT NULL,
`col26` timestamp(6) NULL DEFAULT NULL,
`col27` varbinary(8000) DEFAULT NULL,
`col28` char(4) DEFAULT NULL
) ENGINE=Columnstore DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci
DROP DATABASE mcol_5505_parquet_ddl;


@ -0,0 +1,82 @@
#
# parquet support for large volume data file
# Author: Bin Ruan, binruan0227@gmail.com
#
if (!$MYSQL_TEST_ROOT){
skip Should be run by root to execute cpimport;
}
-- source ../include/have_columnstore.inc
--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_parquet_large_volume;
--enable_warnings
CREATE DATABASE mcol_5505_parquet_large_volume;
USE mcol_5505_parquet_large_volume;
SET time_zone = '+8:00';
# Create table
Create TABLE t1(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t2(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t3(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t4(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
# Generate data
--exec mcs_parquet_gen -l -f $MTR_SUITE_DIR/../std_data
# Valid data and table
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t1 $MTR_SUITE_DIR/../std_data/1MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t2 $MTR_SUITE_DIR/../std_data/10MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t3 $MTR_SUITE_DIR/../std_data/50MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t4 $MTR_SUITE_DIR/../std_data/100MRows.parquet >/dev/null
SELECT * FROM t1 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t1;
SELECT * FROM t2 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t2;
SELECT * FROM t3 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t3;
SELECT * FROM t4 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t4;
# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/1MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/10MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/50MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/100MRows.parquet
DROP DATABASE mcol_5505_parquet_large_volume;


@ -0,0 +1,64 @@
#
# Check the parquet support for different data types
# Author: Bin Ruan, binruan0227@gmail.com
#
if (!$MYSQL_TEST_ROOT){
skip Should be run by root to execute cpimport;
}
-- source ../include/have_columnstore.inc
--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_cpimport_parquet;
--enable_warnings
CREATE DATABASE mcol_5505_cpimport_parquet;
USE mcol_5505_cpimport_parquet;
# Create table
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 BIGINT,
col3 FLOAT,
col4 DOUBLE,
col5 TIME(3),
col6 VARCHAR(2),
col7 VARCHAR(5),
col8 VARCHAR(20),
col9 CHAR(2),
col10 CHAR(5),
col11 CHAR(20),
col12 TIMESTAMP(3),
col13 DATE,
col14 DATETIME(3),
col15 SMALLINT,
col16 TINYINT,
col17 DECIMAL(9,3),
col18 INT UNSIGNED,
col19 SMALLINT UNSIGNED,
col20 TINYINT UNSIGNED,
col21 BIGINT UNSIGNED,
col22 BOOLEAN,
col23 DECIMAL(38,10),
col24 TIME(6),
col25 TIMESTAMP(6),
col26 DATETIME(6),
col27 CHAR(4),
col28 CHAR(4)
) ENGINE=Columnstore;
# Generate data
--exec mcs_parquet_gen -a -f $MTR_SUITE_DIR/../std_data
# Valid data and table
--exec $MCS_CPIMPORT mcol_5505_cpimport_parquet t1 $MTR_SUITE_DIR/../std_data/tests.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_cpimport_parquet t1 $MTR_SUITE_DIR/../std_data/nulls.parquet >/dev/null
SELECT * FROM t1;
SELECT COUNT(*) FROM t1;
# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/tests.parquet
--exec rm $MTR_SUITE_DIR/../std_data/nulls.parquet
DROP DATABASE mcol_5505_cpimport_parquet;


@ -0,0 +1,33 @@
#
# check mcs_parquet_ddl tool
# Author: Bin Ruan, binruan0227@gmail.com
#
-- source ../include/have_columnstore.inc
--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_parquet_ddl;
--enable_warnings
--disable_result_log
--exec mcs_parquet_gen -a -f $MTR_SUITE_DIR/../std_data
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/tests.parquet $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl
# Wrong source file type
--error 3
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/int8.par $MTR_SUITE_DIR/../std_data/int8table.ddl
# Wrong number of argument files
--error 4
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/tests.parquet
--enable_result_log
# Create table
CREATE DATABASE mcol_5505_parquet_ddl;
--exec $MYSQL mcol_5505_parquet_ddl < $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl >/dev/null
USE mcol_5505_parquet_ddl;
SHOW CREATE TABLE parquet_test_table;
# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/tests.parquet
--exec rm $MTR_SUITE_DIR/../std_data/nulls.parquet
--exec rm $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl
DROP DATABASE mcol_5505_parquet_ddl;


@ -13,3 +13,5 @@ add_subdirectory(idbmeminfo)
add_subdirectory(rebuildEM)
add_subdirectory(passwd)
add_subdirectory(configMgt)
add_subdirectory(parquetGen)
add_subdirectory(parquetDDL)


@ -0,0 +1,6 @@
include_directories(${ENGINE_COMMON_INCLUDES})
set(parquetDDL_SRCS main.cpp)
add_executable(mcs_parquet_ddl ${parquetDDL_SRCS})
target_link_libraries(mcs_parquet_ddl arrow parquet)
install(TARGETS mcs_parquet_ddl DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)

tools/parquetDDL/main.cpp (new file, 285 lines)

@ -0,0 +1,285 @@
#include <iostream>
#include <string>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/exception.h>
#include <parquet/arrow/reader.h>
#include <vector>
#include <fstream>
#include <unistd.h>
enum STATUS_CODE
{
NO_ERROR,
EMPTY_FIELD,
UNSUPPORTED_DATA_TYPE,
UNSUPPORTED_FILE_TYPE,
FILE_NUM_ERROR
};
/**
* print the usage information
*/
static void usage()
{
std::cout << "usage: " << std::endl;
std::cout << "Reading parquet then output its corresponding .ddl file." << std::endl;
std::cout << "mcs_parquet_ddl [input_parquet_file] [output_ddl_file]" << std::endl;
}
/**
* get the schema of the parquet file
*/
void getSchema(std::string filePath, std::shared_ptr<arrow::Schema>* parquetSchema)
{
std::shared_ptr<arrow::io::ReadableFile> infile;
PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(filePath, arrow::default_memory_pool()));
std::unique_ptr<parquet::arrow::FileReader> reader;
PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
PARQUET_THROW_NOT_OK(reader->GetSchema(parquetSchema));
PARQUET_THROW_NOT_OK(infile->Close());
}
/**
* convert arrow data type id to corresponding columnstore type string
*/
int convert2mcs(std::shared_ptr<arrow::DataType> dataType, arrow::Type::type typeId, std::string& colType)
{
switch (typeId)
{
case arrow::Type::type::BOOL:
{
colType = "BOOLEAN";
break;
}
case arrow::Type::type::UINT8:
{
colType = "TINYINT UNSIGNED";
break;
}
case arrow::Type::type::INT8:
{
colType = "TINYINT";
break;
}
case arrow::Type::type::UINT16:
{
colType = "SMALLINT UNSIGNED";
break;
}
case arrow::Type::type::INT16:
{
colType = "SMALLINT";
break;
}
case arrow::Type::type::UINT32:
{
colType = "INT UNSIGNED";
break;
}
case arrow::Type::type::INT32:
{
colType = "INT";
break;
}
case arrow::Type::type::UINT64:
{
colType = "BIGINT UNSIGNED";
break;
}
case arrow::Type::type::INT64:
{
colType = "BIGINT";
break;
}
case arrow::Type::type::FLOAT:
{
colType = "FLOAT";
break;
}
case arrow::Type::type::DOUBLE:
{
colType = "DOUBLE";
break;
}
case arrow::Type::type::STRING:
{
// set 2000 as the maximum length and VARCHAR as column type
colType = "VARCHAR(2000)";
break;
}
case arrow::Type::type::BINARY:
{
// set 8000 as the maximum length; the binary character set makes this the equivalent of VARBINARY(8000)
colType = "VARCHAR(8000) character set 'binary'";
break;
}
case arrow::Type::type::FIXED_SIZE_BINARY:
{
std::shared_ptr<arrow::FixedSizeBinaryType> fType = std::static_pointer_cast<arrow::FixedSizeBinaryType>(dataType);
int byteWidth = fType->byte_width();
colType = "CHAR(" + std::to_string(byteWidth) + ")";
break;
}
case arrow::Type::type::DATE32:
{
colType = "DATE";
break;
}
case arrow::Type::type::DATE64:
{
colType = "DATE";
break;
}
case arrow::Type::type::TIMESTAMP:
{
std::shared_ptr<arrow::TimestampType> fType = std::static_pointer_cast<arrow::TimestampType>(dataType);
if (fType->unit() == arrow::TimeUnit::MILLI)
colType = "TIMESTAMP(3)";
else if (fType->unit() == arrow::TimeUnit::MICRO)
colType = "TIMESTAMP(6)";
else
return UNSUPPORTED_DATA_TYPE;
break;
}
case arrow::Type::type::TIME32:
{
colType = "TIME(3)";
break;
}
case arrow::Type::type::TIME64:
{
std::shared_ptr<arrow::Time64Type> fType = std::static_pointer_cast<arrow::Time64Type>(dataType);
if (fType->unit() == arrow::TimeUnit::MICRO)
colType = "TIME(6)";
else
return UNSUPPORTED_DATA_TYPE;
break;
}
case arrow::Type::type::DECIMAL128:
{
// get precision and scale
std::shared_ptr<arrow::DecimalType> fType = std::static_pointer_cast<arrow::DecimalType>(dataType);
int32_t fPrecision = fType->precision();
int32_t fScale = fType->scale();
colType = "DECIMAL(" + std::to_string(fPrecision) + "," + std::to_string(fScale) + ")";
break;
}
default:
{
return UNSUPPORTED_DATA_TYPE;
}
}
return NO_ERROR;
}
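// Mapping notes implicit in the switch above: TIME32 values are mapped to
// TIME(3) without inspecting the unit (i.e. they are assumed to be
// milliseconds); TIMESTAMP is accepted only with milli- or microsecond
// units; DECIMAL128 carries its precision and scale straight into
// DECIMAL(p,s). Anything else is rejected as UNSUPPORTED_DATA_TYPE.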
/**
* main function to generate DDL file
*/
int generateDDL(std::string filePath, std::string targetPath, std::string tableName)
{
std::shared_ptr<arrow::Schema> parquetSchema;
getSchema(filePath, &parquetSchema);
std::vector<std::string> parquetCols;
std::vector<std::string> parquetTypes;
int rc = NO_ERROR;
int fieldsNum = parquetSchema->num_fields();
if (fieldsNum == 0)
{
return EMPTY_FIELD;
}
for (int i = 0; i < fieldsNum; i++)
{
const std::shared_ptr<arrow::Field> tField = parquetSchema->field(i);
const std::string tName = tField->name();
std::string colType;
auto tType = tField->type();
parquetCols.push_back(tName);
rc = convert2mcs(tType, tType->id(), colType);
if (rc != NO_ERROR)
{
std::cout << "Not allowed data type: " << tName << std::endl;
return rc;
}
parquetTypes.push_back(colType);
}
std::string str1 = "CREATE TABLE " + tableName + "(\n";
std::string str2 = ") ENGINE=Columnstore;";
for (int i = 0; i < fieldsNum; i++)
{
str1 += parquetCols[i] + " " + parquetTypes[i] + (i == fieldsNum-1 ? "\n" : ",\n");
}
str1 += str2;
std::ofstream outfile(targetPath + tableName + ".ddl");
outfile << str1;
outfile.close();
std::cout << "Successfully generate " + tableName + ".ddl" << std::endl;
return rc;
}
int main(int argc, char** argv)
{
int32_t option;
while ((option = getopt(argc, argv, "h")) != EOF)
{
switch (option)
{
case 'h':
case '?':
default:
usage();
return (option == 'h' ? 0 : -1);
break;
}
}
// argv[1]: input parquet file
// argv[2]: output ddl file
// argc must be exactly 3: the program name plus the two file paths
if (argc != 3)
{
std::cout << "Please input source parquet file and target ddl file" << std::endl;
return FILE_NUM_ERROR;
}
std::string parquetFile(argv[1]);
std::string ddlFile(argv[2]);
// check file extension
std::string::size_type endBase = ddlFile.rfind('.');
std::string::size_type endBase1 = parquetFile.rfind('.');
if (endBase == std::string::npos || endBase1 == std::string::npos ||
parquetFile.substr(endBase1 + 1) != "parquet" ||
ddlFile.substr(endBase + 1) != "ddl")
{
std::cout << "File type not supported" << std::endl;
usage();
return UNSUPPORTED_FILE_TYPE;
}
std::string targetPath;
std::string tableName;
std::string::size_type startBase = ddlFile.rfind('/');
targetPath.assign(argv[2], startBase + 1);
tableName.assign(argv[2] + startBase + 1, endBase - startBase - 1);
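// Worked example (hypothetical path): for argv[2] = "/tmp/parquet_test_table.ddl",
// startBase is 4 (the last '/') and endBase is 23 (the last '.'), so
// targetPath becomes "/tmp/" and tableName "parquet_test_table". For a bare
// "foo.ddl", rfind('/') returns npos and npos + 1 wraps to 0, so targetPath
// is empty and the DDL file lands in the current directory.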
std::cout << "Reading " + parquetFile << std::endl;
int rc = generateDDL(parquetFile, targetPath, tableName);
if (rc != NO_ERROR)
{
std::cout << "Input parquet file illegal: no data field" << std::endl;
}
return rc;
}


@ -0,0 +1,6 @@
include_directories(${ENGINE_COMMON_INCLUDES})
set(parquetGen_SRCS main.cpp)
add_executable(mcs_parquet_gen ${parquetGen_SRCS})
target_link_libraries(mcs_parquet_gen boost_system boost_filesystem arrow parquet)
install(TARGETS mcs_parquet_gen DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)

tools/parquetGen/main.cpp (new file, 1342 lines)

File diff suppressed because it is too large.


@ -29,6 +29,7 @@
#include <stdlib.h>
#include <string.h>
#include <type_traits>
#include <chrono>
#include "mcs_decimal.h"
using namespace std;
#include <boost/algorithm/string/case_conv.hpp>
@ -1572,6 +1573,44 @@ boost::any DataConvert::StringToTimestamp(const datatypes::ConvertFromStringPara
return value;
}
//------------------------------------------------------------------------------
// Convert date32 parquet data to binary date. Used by BulkLoad.
//------------------------------------------------------------------------------
int32_t DataConvert::ConvertArrowColumnDate(int32_t dayVal, int& status)
{
int inYear;
int inMonth;
int inDay;
int32_t value = 0;
int64_t secondsSinceEpoch = dayVal;
secondsSinceEpoch *= 86400;
std::chrono::seconds duration(secondsSinceEpoch);
std::chrono::system_clock::time_point timePoint(duration);
std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
std::tm* timeInfo = std::localtime(&ttime);
inYear = timeInfo->tm_year + 1900;
inMonth = timeInfo->tm_mon + 1;
inDay = timeInfo->tm_mday;
if (isDateValid(inDay, inMonth, inYear))
{
Date aDay;
aDay.year = inYear;
aDay.month = inMonth;
aDay.day = inDay;
memcpy(&value, &aDay, 4);
}
else
{
status = -1;
}
return value;
}
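// Illustrative caller sketch (not part of this change): Arrow DATE32 holds
// days since the Unix epoch, so under UTC dayVal = 0 breaks down to
// 1970-01-01. Note std::localtime above applies the server's time zone.
//   int status = 0;
//   int32_t binDate = DataConvert::ConvertArrowColumnDate(0, status);
//   // status stays 0 on success; -1 flags an out-of-range date.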
//------------------------------------------------------------------------------
// Convert date string to binary date. Used by BulkLoad.
//------------------------------------------------------------------------------
@ -1658,6 +1697,100 @@ bool DataConvert::isColumnDateValid(int32_t date)
return (isDateValid(d.day, d.month, d.year));
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data to binary datetime(millisecond). Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetime(int64_t timeVal, int& status)
{
int64_t value = 0;
int inYear;
int inMonth;
int inDay;
int inHour;
int inMinute;
int inSecond;
int inMicrosecond;
std::chrono::milliseconds duration(timeVal);
std::chrono::system_clock::time_point timePoint(duration);
std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
std::tm* timeInfo = std::gmtime(&ttime);
inYear = timeInfo->tm_year + 1900;
inMonth = timeInfo->tm_mon + 1;
inDay = timeInfo->tm_mday;
inHour = timeInfo->tm_hour;
inMinute = timeInfo->tm_min;
inSecond = timeInfo->tm_sec;
inMicrosecond = duration.count() % 1000;
if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
{
DateTime aDatetime;
aDatetime.year = inYear;
aDatetime.month = inMonth;
aDatetime.day = inDay;
aDatetime.hour = inHour;
aDatetime.minute = inMinute;
aDatetime.second = inSecond;
aDatetime.msecond = inMicrosecond;
memcpy(&value, &aDatetime, 8);
}
else
{
status = -1;
}
return value;
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data to binary datetime (microseconds). Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetimeUs(int64_t timeVal, int& status)
{
int64_t value = 0;
int inYear;
int inMonth;
int inDay;
int inHour;
int inMinute;
int inSecond;
int inMicrosecond;
std::chrono::microseconds duration(timeVal);
std::chrono::system_clock::time_point timePoint(duration);
std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
std::tm* timeInfo = std::gmtime(&ttime);
inYear = timeInfo->tm_year + 1900;
inMonth = timeInfo->tm_mon + 1;
inDay = timeInfo->tm_mday;
inHour = timeInfo->tm_hour;
inMinute = timeInfo->tm_min;
inSecond = timeInfo->tm_sec;
inMicrosecond = duration.count() % 1000000;
if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
{
DateTime aDatetime;
aDatetime.year = inYear;
aDatetime.month = inMonth;
aDatetime.day = inDay;
aDatetime.hour = inHour;
aDatetime.minute = inMinute;
aDatetime.second = inSecond;
aDatetime.msecond = inMicrosecond;
memcpy(&value, &aDatetime, 8);
}
else
{
status = -1;
}
return value;
}
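// Worked example for the pair above: timeVal = 86400123 ms is one day plus
// 123 ms past the epoch, so gmtime() breaks it down to 1970-01-02 00:00:00
// and duration.count() % 1000 leaves 123 for the fractional field; the
// microsecond variant does the same with % 1000000.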
//------------------------------------------------------------------------------
// Convert date/time string to binary date/time. Used by BulkLoad.
//------------------------------------------------------------------------------
@ -1798,6 +1931,127 @@ int64_t DataConvert::convertColumnDatetime(const char* dataOrg, CalpontDateTimeF
return value;
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data (milliseconds) to binary timestamp. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestamp(int64_t timeVal, int& status)
{
int64_t value = 0;
int inYear;
int inMonth;
int inDay;
int inHour;
int inMinute;
int inSecond;
int inMicrosecond;
std::chrono::milliseconds duration(timeVal);
std::chrono::system_clock::time_point timePoint(duration);
std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
std::tm* timeInfo = std::gmtime(&ttime);
inYear = timeInfo->tm_year + 1900;
inMonth = timeInfo->tm_mon + 1;
inDay = timeInfo->tm_mday;
inHour = timeInfo->tm_hour;
inMinute = timeInfo->tm_min;
inSecond = timeInfo->tm_sec;
inMicrosecond = duration.count() % 1000;
if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
{
MySQLTime m_time;
m_time.year = inYear;
m_time.month = inMonth;
m_time.day = inDay;
m_time.hour = inHour;
m_time.minute = inMinute;
m_time.second = inSecond;
m_time.second_part = inMicrosecond;
bool isValid = true;
int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);
if (!isValid)
{
status = -1;
return value;
}
TimeStamp timestamp;
timestamp.second = seconds;
timestamp.msecond = m_time.second_part;
memcpy(&value, &timestamp, 8);
}
else
{
status = -1;
}
return value;
}
//------------------------------------------------------------------------------
// Convert timestamp parquet data (microseconds) to binary timestamp. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestampUs(int64_t timeVal, int& status)
{
int64_t value = 0;
int inYear;
int inMonth;
int inDay;
int inHour;
int inMinute;
int inSecond;
int inMicrosecond;
std::chrono::microseconds duration(timeVal);
std::chrono::system_clock::time_point timePoint(duration);
std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
std::tm* timeInfo = std::gmtime(&ttime);
inYear = timeInfo->tm_year + 1900;
inMonth = timeInfo->tm_mon + 1;
inDay = timeInfo->tm_mday;
inHour = timeInfo->tm_hour;
inMinute = timeInfo->tm_min;
inSecond = timeInfo->tm_sec;
inMicrosecond = static_cast<int>(duration.count() % 1000000);
if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
{
MySQLTime m_time;
m_time.year = inYear;
m_time.month = inMonth;
m_time.day = inDay;
m_time.hour = inHour;
m_time.minute = inMinute;
m_time.second = inSecond;
m_time.second_part = inMicrosecond;
bool isValid = true;
int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);
if (!isValid)
{
status = -1;
return value;
}
TimeStamp timestamp;
timestamp.second = seconds;
timestamp.msecond = m_time.second_part;
memcpy(&value, &timestamp, 8);
}
else
{
status = -1;
}
return value;
}
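// Note the split between the converter families: the Datetime variants above
// pack the broken-down calendar fields directly into a DateTime struct,
// while these Timestamp variants push the same fields back through
// mySQLTimeToGmtSec() and store seconds since the epoch plus the sub-second
// part in a TimeStamp.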
//------------------------------------------------------------------------------
// Convert timestamp string to binary timestamp. Used by BulkLoad.
// Most of this code is taken from DataConvert::convertColumnDatetime
@ -1972,6 +2226,123 @@ int64_t DataConvert::convertColumnTimestamp(const char* dataOrg, CalpontDateTime
return value;
}
//------------------------------------------------------------------------------
// Convert time32 parquet data to binary time. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime32(int32_t timeVal, int& status)
{
int64_t value = 0;
// convert millisecond to time
int inHour, inMinute, inSecond, inMicrosecond;
inHour = inMinute = inSecond = inMicrosecond = 0;
bool isNeg = false;
if (timeVal < 0)
isNeg = true;
inHour = timeVal / 3600000;
inMinute = (timeVal - inHour * 3600000) / 60000;
inSecond = (timeVal - inHour * 3600000 - inMinute * 60000) / 1000;
inMicrosecond = timeVal - inHour * 3600000 - inMinute * 60000 - inSecond * 1000;
if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
{
Time atime;
atime.hour = inHour;
atime.minute = inMinute;
atime.second = inSecond;
atime.msecond = inMicrosecond;
atime.is_neg = isNeg;
memcpy(&value, &atime, 8);
}
else
{
// Emulate MariaDB's time saturation
if (inHour > 838)
{
Time atime;
atime.hour = 838;
atime.minute = 59;
atime.second = 59;
atime.msecond = 999999;
atime.is_neg = false;
memcpy(&value, &atime, 8);
}
else if (inHour < -838)
{
Time atime;
atime.hour = -838;
atime.minute = 59;
atime.second = 59;
atime.msecond = 999999;
atime.is_neg = false;
memcpy(&value, &atime, 8);
}
// If neither of the above match then we return a 0 time
status = -1;
}
return value;
}
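// Worked example: timeVal = 3661001 ms splits as 3661001 / 3600000 = 1 hour,
// remainder 61001 -> 1 minute, remainder 1001 -> 1 second, leaving 1 ms for
// the fractional field, i.e. 01:01:01.001.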
//------------------------------------------------------------------------------
// Convert time64 parquet data to binary time. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime64(int64_t timeVal, int& status)
{
int64_t value = 0;
// convert microseconds to time
int inHour, inMinute, inSecond, inMicrosecond;
inHour = inMinute = inSecond = inMicrosecond = 0;
bool isNeg = false;
if (timeVal < 0)
isNeg = true;
inHour = timeVal / 3600000000;
inMinute = (timeVal - inHour * 3600000000) / 60000000;
inSecond = (timeVal - inHour * 3600000000 - inMinute * 60000000) / 1000000;
inMicrosecond = timeVal - inHour * 3600000000 - inMinute * 60000000 - inSecond * 1000000;
if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
{
Time atime;
atime.hour = inHour;
atime.minute = inMinute;
atime.second = inSecond;
atime.msecond = inMicrosecond;
atime.is_neg = isNeg;
memcpy(&value, &atime, 8);
}
else
{
// Emulate MariaDB's time saturation
if (inHour > 838)
{
Time atime;
atime.hour = 838;
atime.minute = 59;
atime.second = 59;
atime.msecond = 999999;
atime.is_neg = false;
memcpy(&value, &atime, 8);
}
else if (inHour < -838)
{
Time atime;
atime.hour = -838;
atime.minute = 59;
atime.second = 59;
atime.msecond = 999999;
atime.is_neg = false;
memcpy(&value, &atime, 8);
}
// If neither of the above match then we return a 0 time
status = -1;
}
return value;
}
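// Same arithmetic in microseconds: timeVal = 3661000001 us -> 1 hour,
// 1 minute, 1 second and 1 us, i.e. 01:01:01.000001. Out-of-range values
// saturate to the +/-838:59:59.999999 MariaDB TIME bounds, as in the 32-bit
// variant above.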
//------------------------------------------------------------------------------
// Convert time string to binary time. Used by BulkLoad.
// Most of this is taken from str_to_time in sql-common/my_time.c


@ -1170,6 +1170,14 @@ class DataConvert
EXPORT static std::string timeToString1(long long timevalue);
static inline void timeToString1(long long timevalue, char* buf, unsigned int buflen);
/**
* @brief convert parquet date data to its native format. This function is for bulkload to use.
*
* @param dayVal the input data representing days since the Unix epoch
* @param status 0 - success, -1 - fail
*/
EXPORT static int32_t ConvertArrowColumnDate(int32_t dayVal, int& status);
/**
* @brief convert a date column data, represented as a string, to its native
* format. This function is for bulkload to use.
@ -1188,6 +1196,22 @@ class DataConvert
*/
EXPORT static bool isColumnDateValid(int32_t date);
/**
* @brief convert parquet datetime data to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing milliseconds since the Unix epoch
* @param status 0 - success, -1 - fail
*/
EXPORT static int64_t convertArrowColumnDatetime(int64_t timeVal, int& status);
/**
* @brief convert parquet datetime data to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing microseconds since the Unix epoch
* @param status 0 - success, -1 - fail
*/
EXPORT static int64_t convertArrowColumnDatetimeUs(int64_t timeVal, int& status);
/**
* @brief convert a datetime column data, represented as a string,
* to its native format. This function is for bulkload to use.
@ -1201,6 +1225,22 @@ class DataConvert
EXPORT static int64_t convertColumnDatetime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
int& status, unsigned int dataOrgLen);
/**
* @brief convert parquet timestamp data (milliseconds) to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing milliseconds since the Unix epoch
* @param status 0 - success, -1 - fail
*/
EXPORT static int64_t convertArrowColumnTimestamp(int64_t timeVal, int& status);
/**
* @brief convert parquet timestamp data (microseconds) to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing microseconds since the Unix epoch
* @param status 0 - success, -1 - fail
*/
EXPORT static int64_t convertArrowColumnTimestampUs(int64_t timeVal, int& status);
/**
* @brief convert a timestamp column data, represented as a string,
* to its native format. This function is for bulkload to use.
@ -1228,6 +1268,22 @@ class DataConvert
EXPORT static int64_t convertColumnTime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
int& status, unsigned int dataOrgLen);
/**
* @brief convert parquet time data to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing milliseconds since midnight
* @param status 0 - success, -1 - fail
*/
EXPORT static int64_t convertArrowColumnTime32(int32_t timeVal, int& status);
/**
* @brief convert parquet time data to its native format. This function is for bulkload to use.
*
* @param timeVal the input data representing microseconds since midnight
* @param status 0 - success, -1 - fail
*/
EXPORT static int64_t convertArrowColumnTime64(int64_t timeVal, int& status);
/**
* @brief Is specified datetime valid; used by binary bulk load
*/


@ -40,7 +40,11 @@ set(cpimport.bin_SRCS cpimport.cpp)
add_executable(cpimport.bin ${cpimport.bin_SRCS})
add_dependencies(cpimport.bin marias3)
target_link_libraries(cpimport.bin ${ENGINE_LDFLAGS} ${NETSNMP_LIBRARIES} ${ENGINE_WRITE_LIBS} ${S3API_DEPS} we_bulk we_xml)
FIND_PACKAGE(Arrow)
FIND_PACKAGE(Parquet)
target_link_libraries(cpimport.bin arrow)
target_link_libraries(cpimport.bin parquet)
install(TARGETS cpimport.bin DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)


@ -1210,6 +1210,32 @@ int BulkLoad::manageImportDataFileList(Job& job, int tableNo, TableInfo* tableIn
std::vector<std::string> loadFilesList;
bool bUseStdin = false;
// Check whether all of the import files are parquet files
bool isParquet = false;
for (unsigned int i = 0; i < fCmdLineImportFiles.size(); i++)
{
if (fCmdLineImportFiles[i].rfind(".parquet") != std::string::npos)
{
isParquet = true;
}
else
{
if (isParquet)
{
ostringstream oss;
oss << "Import files exist parquet file while not all of them are parquet files.";
fLog.logMsg(oss.str(), ERR_FILE_TYPE_DIFF, MSGLVL_ERROR);
return ERR_FILE_TYPE_DIFF;
}
}
}
if (isParquet)
{
setImportDataMode(IMPORT_DATA_PARQUET);
}
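// Import files are classified by looking for a ".parquet" extension in the
// name; mixing parquet and non-parquet inputs in one job is rejected up
// front so a single read path (text/S3 or Arrow) serves the whole import.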
// Take loadFileName from the command line argument override, if one exists,
// else we take from the Job xml file
std::string loadFileName;

File diff suppressed because it is too large.


@ -30,7 +30,7 @@
#include "we_columninfo.h"
#include "calpontsystemcatalog.h"
#include "dataconvert.h"
#include <arrow/api.h>
namespace WriteEngine
{
class Log;
@ -84,6 +84,9 @@ class BulkLoadBuffer
char* fOverflowBuf; // Overflow data held for next buffer
unsigned fOverflowSize; // Current size of fOverflowBuf
std::shared_ptr<arrow::RecordBatch> fParquetBatch; // Batch of parquet file to be parsed
std::shared_ptr<arrow::RecordBatch> fParquetBatchParser; // for temporary use by parser
std::shared_ptr<::arrow::RecordBatchReader> fParquetReader; // Reader for read batches of parquet data
// Information about the locker and status for each column in this buffer.
// Note that TableInfo::fSyncUpdatesTI mutex is used to synchronize
// access to fColumnLocks and fParseComplete from both read and parse
@ -174,6 +177,19 @@ class BulkLoadBuffer
void convert(char* field, int fieldLength, bool nullFlag, unsigned char* output, const JobColumn& column,
BLBufferStats& bufStats);
/** @brief Parse a batch of parquet data in read buffer for a nonDictionary column
*/
int parseColParquet(ColumnInfo& columnInfo);
/** @brief Convert batch parquet data depending upon the data type
*/
void convertParquet(std::shared_ptr<arrow::Array> columnData, unsigned char* buf, const JobColumn& column,
BLBufferStats& bufStats, RID& lastInputRowInExtent, ColumnInfo& columnInfo,
bool& updateCPInfoPendingFlag, ColumnBufferSection* section);
inline void updateCPMinMax(ColumnInfo& columnInfo, RID& lastInputRowInExtent, BLBufferStats& bufStats,
bool& updateCPInfoPendingFlag, ColumnBufferSection* section, uint32_t curRow);
/** @brief Copy the overflow data
*/
void copyOverflow(const BulkLoadBuffer& buffer);
@ -263,6 +279,11 @@ class BulkLoadBuffer
fStatusBLB = status;
}
void setParquetReader(std::shared_ptr<::arrow::RecordBatchReader> reader)
{
fParquetReader = reader;
}
/** @brief Try to lock a column for the buffer
* TableInfo::fSyncUpdatesTI mutex should be locked when calling this
* function (see fColumnLocks discussion).
@ -273,6 +294,10 @@ class BulkLoadBuffer
size_t* parse_length, RID& totalReadRows, RID& correctTotalRows,
const boost::ptr_vector<ColumnInfo>& columnsInfo, unsigned int allowedErrCntThisCall);
/** @brief Read the batch data into the buffer
*/
int fillFromFileParquet(RID& totalReadRows, RID& correctTotalRows);
/** @brief Read the table data into the buffer
*/
int fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* handle, RID& totalRows, RID& correctTotalRows,


@ -1657,6 +1657,41 @@ int ColumnInfo::closeDctnryStore(bool bAbort)
return rc;
}
//--------------------------------------------------------------------------------------
// Update dictionary store file with string column parquet data, and return the assigned
// tokens (tokenbuf) to be stored in the corresponding column token file.
//--------------------------------------------------------------------------------------
int ColumnInfo::updateDctnryStoreParquet(std::shared_ptr<arrow::Array> columnData, int tokenPos, const int totalRow, char* tokenBuf)
{
long long truncCount = 0;
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif
boost::mutex::scoped_lock lock(fDictionaryMutex);
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif
int rc = fStore->insertDctnryParquet(columnData, tokenPos, totalRow, id, tokenBuf, truncCount, column.cs, column.weType);
if (rc != NO_ERROR)
{
WErrorCodes ec;
std::ostringstream oss;
oss << "updateDctnryStore: error adding rows to store file for "
<< "OID-" << column.dctnry.dctnryOid << "; DBRoot-" << curCol.dataFile.fDbRoot << "; part-"
<< curCol.dataFile.fPartition << "; seg-" << curCol.dataFile.fSegment << "; " << ec.errorString(rc);
fLog->logMsg(oss.str(), rc, MSGLVL_CRITICAL);
fpTableInfo->fBRMReporter.addToErrMsgEntry(oss.str());
return rc;
}
incSaturatedCnt(truncCount);
return NO_ERROR;
}
//------------------------------------------------------------------------------
// Update dictionary store file with specified strings, and return the assigned
// tokens (tokenbuf) to be stored in the corresponding column token file.


@ -200,6 +200,13 @@ class ColumnInfo : public WeUIDGID
*/
void lastInputRowInExtentInc();
/** @brief Update dictionary for arrow/parquet format
* Parse and store the parquet data into the store file, and
* returns the assigned tokens (tokenBuf) to be stored in the
* corresponding column token file.
*/
int updateDctnryStoreParquet(std::shared_ptr<arrow::Array> columnData, int tokenPos, const int totalRow, char* tokenBuf);
/** @brief Update dictionary method.
* Parses and stores specified strings into the store file, and
* returns the assigned tokens (tokenBuf) to be stored in the


@ -55,6 +55,9 @@ using namespace querytele;
#include "oamcache.h"
#include "cacheutils.h"
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>
namespace
{
const std::string BAD_FILE_SUFFIX = ".bad"; // Reject data file suffix
@ -153,6 +156,8 @@ TableInfo::TableInfo(Log* logger, const BRM::TxnID txnID, const string& processN
, fRejectErrCnt(0)
, fExtentStrAlloc(tableOID, logger)
, fOamCachePtr(oam::OamCache::makeOamCache())
, fParquetReader(NULL)
, fReader(nullptr)
{
fBuffers.clear();
fColumns.clear();
@ -266,24 +271,44 @@ int TableInfo::readTableData()
{
RID validTotalRows = 0;
RID totalRowsPerInputFile = 0;
int64_t totalRowsParquet = 0; // total row count of the parquet file; the
// functions called later need the int64_t type
int filesTBProcessed = fLoadFileList.size();
int fileCounter = 0;
unsigned long long qtSentAt = 0;
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
if (fHandle == NULL)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFile();
if (rc != NO_ERROR)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
}
fileCounter++;
}
}
else
{
if (fParquetReader == NULL)
{
fFileName = fLoadFileList[fileCounter];
int rc = openTableFileParquet(totalRowsParquet);
if (rc != NO_ERROR)
{
// Mark the table status as error and exit.
boost::mutex::scoped_lock lock(fSyncUpdatesTI);
fStatusTI = WriteEngine::ERR;
return rc;
}
fileCounter++;
}
}
timeval readStart;
@ -419,16 +444,23 @@ int TableInfo::readTableData()
// validTotalRows is ongoing total of valid rows read for all files
// pertaining to this DB table.
int readRc;
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
if (fReadFromS3)
{
readRc = fBuffers[readBufNo].fillFromMemory(fBuffers[prevReadBuf], fFileBuffer, fS3ReadLength,
&fS3ParseLength, totalRowsPerInputFile, validTotalRows,
fColumns, allowedErrCntThisCall);
}
else
{
readRc = fBuffers[readBufNo].fillFromFile(fBuffers[prevReadBuf], fHandle, totalRowsPerInputFile,
validTotalRows, fColumns, allowedErrCntThisCall);
}
}
else
{
readRc = fBuffers[readBufNo].fillFromFileParquet(totalRowsPerInputFile, validTotalRows);
}
if (readRc != NO_ERROR)
@ -530,7 +562,7 @@ int TableInfo::readTableData()
fCurrentReadBuffer = (fCurrentReadBuffer + 1) % fReadBufCount;
// bufferCount++;
if ((fHandle && feof(fHandle)) || (fReadFromS3 && (fS3ReadLength == fS3ParseLength)) ||
(totalRowsPerInputFile == (RID)totalRowsParquet))
{
timeval readFinished;
gettimeofday(&readFinished, NULL);
@ -567,7 +599,15 @@ int TableInfo::readTableData()
if (fileCounter < filesTBProcessed)
{
fFileName = fLoadFileList[fileCounter];
int rc;
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
rc = openTableFile();
}
else
{
rc = openTableFileParquet(totalRowsParquet);
}
if (rc != NO_ERROR)
{
@ -1252,6 +1292,45 @@ void TableInfo::addColumn(ColumnInfo* info)
fExtentStrAlloc.addColumn(info->column.mapOid, info->column.width, info->column.dataType);
}
int TableInfo::openTableFileParquet(int64_t &totalRowsParquet)
{
if (fParquetReader != NULL)
return NO_ERROR;
std::shared_ptr<arrow::io::ReadableFile> infile;
try
{
PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(fFileName, arrow::default_memory_pool()));
PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &fReader));
fReader->set_batch_size(1000);
PARQUET_THROW_NOT_OK(fReader->ScanContents({0}, 1000, &totalRowsParquet));
PARQUET_THROW_NOT_OK(fReader->GetRecordBatchReader(&fParquetReader));
}
catch (std::exception& ex)
{
ostringstream oss;
oss << "Error opening import file " << fFileName << ".";
fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);
return ERR_FILE_OPEN;
}
catch (...)
{
ostringstream oss;
oss << "Error opening import file " << fFileName << ".";
fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);
return ERR_FILE_OPEN;
}
// initialize fBuffers batch source
for (int i = 0; i < fReadBufCount; ++i)
{
fBuffers[i].setParquetReader(fParquetReader);
}
return NO_ERROR;
}
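// Reader setup notes: set_batch_size(1000) caps each Arrow RecordBatch at
// 1000 rows, and ScanContents({0}, 1000, &totalRowsParquet) walks column 0
// once to obtain the total row count, which readTableData() later compares
// against totalRowsPerInputFile to detect end-of-file on the parquet path.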
//------------------------------------------------------------------------------
// Open the file corresponding to fFileName so that we can import its contents.
// A buffer is also allocated and passed to setvbuf().
@ -1331,24 +1410,32 @@ int TableInfo::openTableFile()
//------------------------------------------------------------------------------
void TableInfo::closeTableFile()
{
if (fImportDataMode != IMPORT_DATA_PARQUET)
{
if (fHandle)
{
// If reading from stdin, we don't delete the buffer out from under
// the file handle, because stdin is still open. This will cause a
// memory leak, but when using stdin, we can only read in 1 table.
// So it's not like we will be leaking multiple buffers for several
// tables over the life of the job.
if (!fReadFromStdin)
{
fclose(fHandle);
delete[] fFileBuffer;
}
fHandle = 0;
}
else if (ms3)
{
ms3_free((uint8_t*)fFileBuffer);
}
}
else
{
fReader.reset();
fParquetReader.reset();
}
}
View File
@ -30,6 +30,9 @@
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/uuid/uuid.hpp>
#include <arrow/api.h>
#include <parquet/arrow/reader.h>
#include <libmarias3/marias3.h>
#include "we_type.h"
@ -170,22 +173,25 @@ class TableInfo : public WeUIDGID
oam::OamCache* fOamCachePtr; // OamCache: ptr is copyable
boost::uuids::uuid fJobUUID; // Job UUID
std::vector<BRM::LBID_t> fDictFlushBlks; // dict blks to be flushed from cache
std::shared_ptr<arrow::RecordBatchReader> fParquetReader; // Batch reader to read batches of data
std::unique_ptr<parquet::arrow::FileReader> fReader; // Reader to read parquet file
//--------------------------------------------------------------------------
// Private Functions
//--------------------------------------------------------------------------
  int changeTableLockState();            // Change state of table lock to cleanup
  void closeTableFile();                 // Close current tbl file; free buffer
  void closeOpenDbFiles();               // Close DB files left open at job's end
  int confirmDBFileChanges();            // Confirm DB file changes (on HDFS)
  void deleteTempDBFileChanges();        // Delete DB temp swap files (on HDFS)
  int finishBRM();                       // Finish reporting updates for BRM
  void freeProcessingBuffers();          // Free up Processing Buffers
  bool isBufferAvailable(bool report);   // Is tbl buffer available for reading
  int openTableFileParquet(int64_t& totalRowsParquet);  // Open parquet data file and set batch reader for each buffer
  int openTableFile();                   // Open data file and set the buffer
  void reportTotals(double elapsedSec);  // Report summary totals
  void sleepMS(long int ms);             // Sleep method
  // Compare column HWM with the exemplar HWM.
int compareHWMs(const int smallestColumnId, const int widerColumnId, const uint32_t smallerColumnWidth,
const uint32_t widerColumnWidth, const std::vector<DBRootExtentInfo>& segFileInfo,
View File
@ -35,6 +35,8 @@
#include <iostream>
using namespace std;
#include "bytestream.h"
#include "brmtypes.h"
#include "extentmap.h" // for DICT_COL_WIDTH
@ -745,6 +747,365 @@ int Dctnry::insertDctnry2(Signature& sig)
return NO_ERROR;
}
int Dctnry::insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
const CHARSET_INFO* cs, const WriteEngine::ColType& weType)
{
if (cs->mbmaxlen > 1)
{
// For TEXT columns, we truncate based on the number of bytes,
// and not based on the number of characters, as for CHAR/VARCHAR
// columns in the else block.
if (weType == WriteEngine::WR_TEXT)
{
if (curSig.size > m_colWidth)
{
uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
curSig.size = m_colWidth - truncate_point;
truncCount++;
}
}
else
{
const char* start = (const char*) curSig.signature;
const char* end = (const char*)(curSig.signature + curSig.size);
size_t numChars = cs->numchars(start, end);
size_t maxCharLength = m_colWidth / cs->mbmaxlen;
if (numChars > maxCharLength)
{
MY_STRCOPY_STATUS status;
cs->well_formed_char_length(start, end, maxCharLength, &status);
curSig.size = status.m_source_end_pos - start;
truncCount++;
}
}
}
else // cs->mbmaxlen == 1
{
if (curSig.size > m_colWidth)
{
curSig.size = m_colWidth;
truncCount++;
}
}
//...Search for the string in our string cache
// if it fits into one block (< 8KB)
if (curSig.size <= MAX_SIGNATURE_SIZE)
{
// Stats::startParseEvent("getTokenFromArray");
found = getTokenFromArray(curSig);
if (found)
{
memcpy(pOut + outOffset, &curSig.token, 8);
outOffset += 8;
startPos++;
// Stats::stopParseEvent("getTokenFromArray");
return NO_ERROR;
}
// Stats::stopParseEvent("getTokenFromArray");
}
totalUseSize = m_totalHdrBytes + curSig.size;
//...String not found in cache, so proceed.
// If room is available in current block then insert into block.
// @bug 3960: Add MAX_OP_COUNT check to handle case after bulk rollback
if (((totalUseSize <= m_freeSpace - HDR_UNIT_SIZE) ||
((curSig.size > 8176) && (m_freeSpace > HDR_UNIT_SIZE))) &&
(m_curOp < (MAX_OP_COUNT - 1)))
{
RETURN_ON_ERROR(insertDctnry2(curSig)); // m_freeSpace updated!
m_curBlock.state = BLK_WRITE;
memcpy(pOut + outOffset, &curSig.token, 8);
outOffset += 8;
startPos++;
found = true;
//...If we have reached limit for the number of strings allowed in
// a block, then we write the current block so that we can start
// another block.
if (m_curOp >= MAX_OP_COUNT - 1)
{
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
RETURN_ON_ERROR(writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo));
m_curBlock.state = BLK_READ;
next = true;
}
//...Add string to cache, if we have not exceeded cache limit
// Don't cache big blobs
if ((m_arraySize < MAX_STRING_CACHE_SIZE) && (curSig.size <= MAX_SIGNATURE_SIZE))
{
addToStringCache(curSig);
}
}
else //...No room for this string in current block, so we write
// out the current block, so we can start another block
{
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
RETURN_ON_ERROR(writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo));
m_curBlock.state = BLK_READ;
next = true;
found = false;
} // if m_freeSpace
//..."next" flag is used to indicate that we need to advance to the
// next block in the store file.
if (next)
{
memset(m_curBlock.data, 0, sizeof(m_curBlock.data));
memcpy(m_curBlock.data, &m_dctnryHeader2, m_totalHdrBytes);
m_freeSpace = BYTE_PER_BLOCK - m_totalHdrBytes;
m_curBlock.state = BLK_WRITE;
m_curOp = 0;
next = false;
m_lastFbo++;
m_curFbo = m_lastFbo;
//...Expand current extent if it is an abbreviated initial extent
if ((m_curFbo == m_numBlocks) && (m_numBlocks == NUM_BLOCKS_PER_INITIAL_EXTENT))
{
RETURN_ON_ERROR(expandDctnryExtent());
}
//...Allocate a new extent if we have reached the last block in the
// current extent.
if (m_curFbo == m_numBlocks)
{
// last block
LBID_t startLbid;
// Add an extent.
RETURN_ON_ERROR(
createDctnry(m_dctnryOID, m_colWidth, m_dbRoot, m_partition, m_segment, startLbid, false));
if (m_logger)
{
std::ostringstream oss;
oss << "Add dictionary extent OID-" << m_dctnryOID << "; DBRoot-" << m_dbRoot << "; part-"
<< m_partition << "; seg-" << m_segment << "; hwm-" << m_curFbo << "; LBID-" << startLbid
<< "; file-" << m_segFileName;
m_logger->logMsg(oss.str(), MSGLVL_INFO2);
}
m_curLbid = startLbid;
// now seek back to the curFbo, after adding an extent
// @bug5769 For uncompressed only;
// ChunkManager manages the file offset for the compression case
if (m_compressionType == 0)
{
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
long long byteOffset = m_curFbo;
byteOffset *= BYTE_PER_BLOCK;
RETURN_ON_ERROR(setFileOffset(m_dFile, byteOffset));
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
}
}
else
{
// LBIDs are numbered collectively and consecutively within an
// extent, so within an extent we can derive the LBID by simply
// incrementing it rather than having to go back to BRM to look
// up the LBID for each FBO.
m_curLbid++;
}
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
m_curBlock.lbid = m_curLbid;
//..."found" flag indicates whether the string was already found
// "or" added to the end of the previous block. If false, then
// we need to add the string to the new block.
if (!found)
{
RETURN_ON_ERROR(insertDctnry2(curSig)); // m_freeSpace updated!
m_curBlock.state = BLK_WRITE;
memcpy(pOut + outOffset, &curSig.token, 8);
outOffset += 8;
startPos++;
//...Add string to cache, if we have not exceeded cache limit
if ((m_arraySize < MAX_STRING_CACHE_SIZE) && (curSig.size <= MAX_SIGNATURE_SIZE))
{
addToStringCache(curSig);
}
}
} // if next
return NO_ERROR;
}
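The truncation logic above relies on the server's charset API (cs->numchars(), cs->well_formed_char_length()) so a multibyte string is never cut mid-character. A rough standalone illustration of the same idea for UTF-8, assuming well-formed input (utf8TruncateBytes is hypothetical, not part of this commit):

// Illustrative only: truncate well-formed UTF-8 to at most maxChars
// characters without splitting a multibyte sequence; this is the job
// cs->well_formed_char_length() performs with full charset support.
#include <cstddef>

size_t utf8TruncateBytes(const char* s, size_t len, size_t maxChars)
{
  size_t bytes = 0, chars = 0;
  while (bytes < len && chars < maxChars)
  {
    unsigned char lead = (unsigned char)s[bytes];
    // Sequence length from the lead byte (assumes well-formed input).
    size_t step = (lead < 0x80) ? 1 : (lead < 0xE0) ? 2 : (lead < 0xF0) ? 3 : 4;
    if (bytes + step > len)
      break;  // incomplete trailing sequence; drop it
    bytes += step;
    ++chars;
  }
  return bytes;  // byte length to keep, analogous to the adjusted curSig.size
}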
/*******************************************************************************
 * Description:
 * Used by bulk import to insert a batch of parquet strings into this store
 * file. Function assumes that the file is already positioned to the current
 * block.
 *
 * PARAMETERS:
 *   input
 *     columnData  - arrow array containing the input strings
 *     startRowIdx - start position of the current batch within columnData
 *     totalRow    - number of rows to process from columnData
 *     col         - column of strings to be parsed
 *   output
 *     tokenBuf    - tokens assigned to the inserted strings
 *
 * RETURN:
 *   success - strings were tokenized and written to the store file
 *   failure - an error occurred while writing to the store file
 ******************************************************************************/
int Dctnry::insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx,
const int totalRow, const int col, char* tokenBuf,
long long& truncCount, const CHARSET_INFO* cs,
const WriteEngine::ColType& weType)
{
#ifdef PROFILE
Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
int startPos = 0;
int totalUseSize = 0;
int outOffset = 0;
const char* pIn;
char* pOut = tokenBuf;
Signature curSig;
bool found = false;
bool next = false;
CommBlock cb;
cb.file.oid = m_dctnryOID;
cb.file.pFile = m_dFile;
WriteEngine::Token nullToken;
bool isNonNullArray = true;
std::shared_ptr<arrow::BinaryArray> binaryArray;
std::shared_ptr<arrow::FixedSizeBinaryArray> fixedSizeBinaryArray;
if (columnData->type_id() != arrow::Type::type::FIXED_SIZE_BINARY)
binaryArray = std::static_pointer_cast<arrow::BinaryArray>(columnData);
else
fixedSizeBinaryArray = std::static_pointer_cast<arrow::FixedSizeBinaryArray>(columnData);
  // check whether the imported column data is a NULL array
if (columnData->type_id() == arrow::Type::type::NA)
isNonNullArray = false;
//...Loop through all the rows for the specified column
while (startPos < totalRow)
{
found = false;
void* curSigPtr = static_cast<void*>(&curSig);
memset(curSigPtr, 0, sizeof(curSig));
    // if this column contains non-null data
if (isNonNullArray)
{
const uint8_t* data;
// if (binaryArray != nullptr)
// {
// data = binaryArray->GetValue(startPos + startRowIdx, &curSig.size);
// }
// else
// {
// data = fixedSizeBinaryArray->GetValue(startPos + startRowIdx);
// std::shared_ptr<arrow::DataType> tType = fixedSizeBinaryArray->type();
// curSig.size = tType->byte_width();
// }
      // Commenting out the line below and uncommenting the block above reproduces the error.
data = binaryArray->GetValue(startPos + startRowIdx, &curSig.size);
const char* dataPtr = reinterpret_cast<const char*>(data);
// Strip trailing null bytes '\0' (by adjusting curSig.size) if import-
// ing in binary mode. If entire string is binary zeros, then we treat
// as a NULL value.
if (curSig.size > 0)
{
const char* fld = dataPtr;
int kk = curSig.size - 1;
for (; kk >= 0; kk--)
{
if (fld[kk] != '\0')
break;
}
curSig.size = kk + 1;
}
// Read thread should validate against max size so that the entire row
// can be rejected up front. Once we get here in the parsing thread,
// it is too late to reject the row. However, as a precaution, we
// still check against max size & set to null token if needed.
if ((curSig.size == 0) || (curSig.size > MAX_BLOB_SIZE))
{
if (m_defVal.length() > 0) // use default string if available
{
pIn = m_defVal.str();
curSig.signature = (unsigned char*)pIn;
curSig.size = m_defVal.length();
}
else
{
memcpy(pOut + outOffset, &nullToken, 8);
outOffset += 8;
startPos++;
continue;
}
}
else
{
pIn = dataPtr;
curSig.signature = (unsigned char*)pIn;
}
}
else
{
curSig.size = 0;
if (m_defVal.length() > 0) // use default string if available
{
pIn = m_defVal.str();
curSig.signature = (unsigned char*)pIn;
curSig.size = m_defVal.length();
}
else
{
memcpy(pOut + outOffset, &nullToken, 8);
outOffset += 8;
startPos++;
continue;
}
}
RETURN_ON_ERROR(insertDctnry1(curSig, found, pOut, outOffset, startPos, totalUseSize, cb, next, truncCount,
cs, weType));
}
#ifdef PROFILE
Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
// Done
  // If any data is left over and not written by a subsequent call to
  // insertDctnry(), it will be written by closeDctnry().
return NO_ERROR;
}
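Note the disabled branch above: as committed, only the variable-width BinaryArray path runs, and the FIXED_SIZE_BINARY cast at the top of the function is never consulted when fetching values. A minimal sketch of the case split the commented-out block is aiming for (getBinaryValue is illustrative, not part of the commit):

// Illustrative only: fetch one value from either a variable-width
// BinaryArray or a FixedSizeBinaryArray.
#include <arrow/api.h>

const uint8_t* getBinaryValue(const std::shared_ptr<arrow::Array>& arr, int64_t row, int32_t& size)
{
  if (arr->type_id() == arrow::Type::FIXED_SIZE_BINARY)
  {
    auto fixed = std::static_pointer_cast<arrow::FixedSizeBinaryArray>(arr);
    size = fixed->byte_width();       // every value shares the same width
    return fixed->GetValue(row);
  }
  auto binary = std::static_pointer_cast<arrow::BinaryArray>(arr);
  return binary->GetValue(row, &size);  // size is set per value
}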
/*******************************************************************************
* Description:
 * Used by bulk import to insert a collection of strings into this store file.
@ -838,201 +1199,8 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow,
curSig.signature = (unsigned char*)pIn;
}
RETURN_ON_ERROR(insertDctnry1(curSig, found, pOut, outOffset, startPos, totalUseSize, cb, next, truncCount,
cs, weType));
} // end while
#ifdef PROFILE
View File
@ -37,6 +37,8 @@
#include "bytestream.h"
#include "nullstring.h"
#include <arrow/api.h>
#define EXPORT
/** Namespace WriteEngine */
@ -157,6 +159,20 @@ class Dctnry : public DbFileOp
*/
EXPORT int insertDctnry(const int& sgnature_size, const unsigned char* sgnature_value, Token& token);
/**
* @brief Insert signature value to a file block and return token/pointer
* (for Bulk use)
*
* @param columnData - arrow array containing strings to be parsed
* @param startRowIdx - start position for current batch parquet data
 * @param totalRow - total number of rows to process from columnData
 * @param col - the column to be parsed
* @param tokenBuf - (output) list of tokens for the parsed strings
*/
EXPORT int insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx, const int totalRow,
const int col, char* tokenBuf, long long& truncCount,
const CHARSET_INFO* cs, const WriteEngine::ColType& weType);
/**
* @brief Insert a signature value to a file block and return token/pointer
* (for Bulk use)
@ -280,6 +296,9 @@ class Dctnry : public DbFileOp
// insertDctnryHdr inserts the new value info into the header.
// insertSgnture inserts the new value into the block.
//
int insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
const CHARSET_INFO* cs, const WriteEngine::ColType& weType);
int insertDctnry2(Signature& sig);
void insertDctnryHdr(unsigned char* blockBuf, const int& size);
void insertSgnture(unsigned char* blockBuf, const int& size, unsigned char* value);
View File
@ -113,7 +113,7 @@ const int ERR_COMPBASE = 1650; // Compression errors
const int ERR_AUTOINCBASE = 1700; // Auto-increment errors
const int ERR_BLKCACHEBASE = 1750; // Block cache flush errors
const int ERR_METABKUPBASE = 1800; // Backup bulk meta file errors
const int ERR_PARQUETBASE = 1850; // Parquet importing errors
//--------------------------------------------------------------------------
// Generic error
//--------------------------------------------------------------------------
@ -152,6 +152,7 @@ const int ERR_FILE_GLOBBING = ERR_FILEBASE + 19; // Error globbing a file
const int ERR_FILE_EOF = ERR_FILEBASE + 20; // EOF
const int ERR_FILE_CHOWN = ERR_FILEBASE + 21;      // Error changing file ownership
const int ERR_INTERNAL = ERR_FILEBASE + 22;        // Internal error
const int ERR_FILE_TYPE_DIFF = ERR_FILEBASE + 23;  // File import types are different
//--------------------------------------------------------------------------
// XML level error
@ -389,6 +390,11 @@ const int ERR_METADATABKUP_COMP_READ_BULK_BKUP =
ERR_METABKUPBASE + 7; // Error reading from backup chunk file */
const int ERR_METADATABKUP_COMP_RENAME = ERR_METABKUPBASE + 8; // Error renaming chunk file */
//--------------------------------------------------------------------------
// Parquet errors when importing
//--------------------------------------------------------------------------
const int ERR_PARQUET_AUX = ERR_PARQUETBASE + 1; // Error when creating aux column for parquet file
//------------------------------------------------------------------------------
// Class used to convert an error code to a corresponding error message string
//------------------------------------------------------------------------------
View File
@ -137,11 +137,13 @@ enum BulkModeType
// Import Mode 0-text Import (default)
// 1-Binary Import with NULL values
// 2-Binary Import with saturated NULL values
// 3-Import from parquet file
enum ImportDataMode
{
IMPORT_DATA_TEXT = 0,
IMPORT_DATA_BIN_ACCEPT_NULL = 1,
  IMPORT_DATA_BIN_SAT_NULL = 2,
  IMPORT_DATA_PARQUET = 3
};
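A hypothetical sketch of how a caller could map a numeric mode value onto the extended enum (parseImportMode is illustrative; cpimport's real option handling lives in its option-parsing code, not shown in this diff):

// Illustrative only: values 0-2 mirror the existing text/binary modes;
// 3 is the parquet mode added by this commit.
ImportDataMode parseImportMode(int modeValue)
{
  switch (modeValue)
  {
    case 1: return IMPORT_DATA_BIN_ACCEPT_NULL;
    case 2: return IMPORT_DATA_BIN_SAT_NULL;
    case 3: return IMPORT_DATA_PARQUET;
    default: return IMPORT_DATA_TEXT;  // 0 and anything unrecognized
  }
}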
/**