MCOL-5505 add parquet support for cpimport and add mcs_parquet_ddl and mcs_parquet_gen tools
This commit is contained in: parent 94a680ea60, commit fe597ec78c
@@ -0,0 +1,77 @@
DROP DATABASE IF EXISTS mcol_5505_parquet_large_volume;
CREATE DATABASE mcol_5505_parquet_large_volume;
USE mcol_5505_parquet_large_volume;
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t2(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t3(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t4(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
SELECT * FROM t1 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t1;
COUNT(*)
1000000
SELECT * FROM t2 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t2;
COUNT(*)
10000000
SELECT * FROM t3 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t3;
COUNT(*)
50000000
SELECT * FROM t4 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t4;
COUNT(*)
100000000
DROP DATABASE mcol_5505_parquet_large_volume;
mysql-test/columnstore/basic/r/mcol-5505-cpimport-parquet.result (new file, 100 lines)
@@ -0,0 +1,100 @@
DROP DATABASE IF EXISTS mcol_5505_cpimport_parquet;
CREATE DATABASE mcol_5505_cpimport_parquet;
USE mcol_5505_cpimport_parquet;
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 BIGINT,
col3 FLOAT,
col4 DOUBLE,
col5 TIME(3),
col6 VARCHAR(2),
col7 VARCHAR(5),
col8 VARCHAR(20),
col9 CHAR(2),
col10 CHAR(5),
col11 CHAR(20),
col12 TIMESTAMP(3),
col13 DATE,
col14 DATETIME(3),
col15 SMALLINT,
col16 TINYINT,
col17 DECIMAL(9,3),
col18 INT UNSIGNED,
col19 SMALLINT UNSIGNED,
col20 TINYINT UNSIGNED,
col21 BIGINT UNSIGNED,
col22 BOOLEAN,
col23 DECIMAL(38,10),
col24 TIME(6),
col25 TIMESTAMP(6),
col26 DATETIME(6),
col27 CHAR(4),
col28 CHAR(4)
) ENGINE=Columnstore;
SELECT * FROM t1;
col1 col2 col3 col4 col5 col6 col7 col8 col9 col10 col11 col12 col13 col14 col15 col16 col17 col18 col19 col20 col21 col22 col23 col24 col25 col26 col27 col28
0 0 1.5 2.5 00:00:00.000 a a a a a a 0000-00-00 00:00:00.000 1970-01-01 1970-01-01 00:00:00.000 0 0 1383.433 0 0 0 0 1 12345678909876543.2112345678 00:00:00.000000 0000-00-00 00:00:00.000000 1970-01-01 00:00:00.000000 a abcd
NULL NULL 2.5 3.5 01:00:05.001 NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.001 1970-01-11 1970-01-01 02:46:40.001 1 1 NULL NULL 1 1 NULL 1 12345678909876543.2112345678 01:00:05.000001 1970-01-01 10:46:40.000001 1970-01-01 02:46:40.000001 ab abcd
NULL NULL NULL 4.5 02:00:10.002 ab abcd abcd ab abcd abcd 1970-01-01 13:33:20.002 1970-01-21 1970-01-01 05:33:20.002 2 2 532235.234 NULL 2 2 NULL 1 12345678909876543.2112345678 02:00:10.000002 1970-01-01 13:33:20.000002 1970-01-01 05:33:20.000002 abcd abcd
NULL NULL 4.5 NULL 03:00:15.003 ab abcde abcde ab abcde abcde 1970-01-01 16:20:00.003 1970-01-31 1970-01-01 08:20:00.003 3 3 NULL NULL 3 3 NULL 1 12345678909876543.2112345678 03:00:15.000003 1970-01-01 16:20:00.000003 1970-01-01 08:20:00.000003 abcd abcd
4 4 5.5 6.5 04:00:20.004 ab abcde abcdefg ab abcde abcdefg 1970-01-01 19:06:40.004 1970-02-10 1970-01-01 11:06:40.004 4 4 5325.234 4 4 4 4 1 12345678909876543.2112345678 04:00:20.000004 1970-01-01 19:06:40.000004 1970-01-01 11:06:40.000004 abcd abcd
5 5 6.5 7.5 05:00:25.005 Wh Whlg1 Whlg1xXAxP Wh Whlg1 Whlg1xXAxP 1970-01-01 21:53:20.005 1970-02-20 1970-01-01 13:53:20.005 5 5 NULL 5 5 5 5 0 12345678909876543.2112345678 05:00:25.000005 1970-01-01 21:53:20.000005 1970-01-01 13:53:20.000005 Whlg abcd
6 6 7.5 8.5 06:00:30.006 4N 4Nimz 4NimzSQzMD 4N 4Nimz 4NimzSQzMD 1970-01-02 00:40:00.006 1970-03-02 1970-01-01 16:40:00.006 6 6 1383.433 6 6 6 6 1 12345678909876543.2112345678 06:00:30.000006 1970-01-02 00:40:00.000006 1970-01-01 16:40:00.000006 4Nim abcd
7 7 8.5 9.5 07:00:35.007 G2 G23ne G23ne3j92Ky0wBF G2 G23ne G23ne3j92Ky0wBF 1970-01-02 03:26:40.007 1970-03-12 1970-01-01 19:26:40.007 7 7 NULL 7 7 7 7 1 12345678909876543.2112345678 07:00:35.000007 1970-01-02 03:26:40.000007 1970-01-01 19:26:40.000007 G23n abcd
8 8 9.5 10.5 08:00:40.008 F4 F4z F4z F4 F4z F4z 1970-01-02 06:13:20.008 1970-03-22 1970-01-01 22:13:20.008 8 8 532235.234 8 8 8 8 1 12345678909876543.2112345678 08:00:40.000008 1970-01-02 06:13:20.000008 1970-01-01 22:13:20.000008 F4z abcd
9 9 10.5 11.5 09:00:45.009 8J 8JCVT 8JCVTsGYB7V 8J 8JCVT 8JCVTsGYB7V 1970-01-02 09:00:00.009 1970-04-01 1970-01-02 01:00:00.009 9 9 NULL 9 9 9 9 1 12345678909876543.2112345678 09:00:45.000009 1970-01-02 09:00:00.000009 1970-01-02 01:00:00.000009 8JCV abcd
10 10 11.5 12.5 10:00:50.010 23 23235 23235 23 23235 23235 1970-01-02 11:46:40.010 1970-04-11 1970-01-02 03:46:40.010 10 10 5325.234 10 10 10 10 1 12345678909876543.2112345678 10:00:50.000010 1970-01-02 11:46:40.000010 1970-01-02 03:46:40.000010 2323 abcd
11 11 12.5 13.5 11:00:55.011 sd sda22 sda22 sd sda22 sda22 1970-01-02 14:33:20.011 1970-04-21 1970-01-02 06:33:20.011 11 11 NULL 11 11 11 11 1 12345678909876543.2112345678 11:00:55.000011 1970-01-02 14:33:20.000011 1970-01-02 06:33:20.000011 sda2 abcd
12 12 13.5 14.5 12:01:00.012 SD SD7sd SD7sdFD7 SD SD7sd SD7sdFD7 1970-01-02 17:20:00.012 1970-05-01 1970-01-02 09:20:00.012 12 12 1383.433 12 12 12 12 1 12345678909876543.2112345678 12:01:00.000012 1970-01-02 17:20:00.000012 1970-01-02 09:20:00.000012 SD7s abcd
13 13 14.5 15.5 13:01:05.013 gv gvv3h gvv3hYwdfOD gv gvv3h gvv3hYwdfOD 1970-01-02 20:06:40.013 1970-05-11 1970-01-02 12:06:40.013 13 13 NULL 13 13 13 13 1 12345678909876543.2112345678 13:01:05.000013 1970-01-02 20:06:40.000013 1970-01-02 12:06:40.000013 gvv3 abcd
14 14 15.5 16.5 14:01:10.014 y8 y8wjo y8wjo4v50s6 y8 y8wjo y8wjo4v50s6 1970-01-02 22:53:20.014 1970-05-21 1970-01-02 14:53:20.014 14 14 532235.234 14 14 14 14 1 12345678909876543.2112345678 14:01:10.000014 1970-01-02 22:53:20.000014 1970-01-02 14:53:20.000014 y8wj abcd
15 15 16.5 17.5 15:01:15.015 aN aNJW5 aNJW56SJieE8KVV aN aNJW5 aNJW56SJieE8KVV 1970-01-03 01:40:00.015 1970-05-31 1970-01-02 17:40:00.015 15 15 NULL 15 15 15 15 1 12345678909876543.2112345678 15:01:15.000015 1970-01-03 01:40:00.000015 1970-01-02 17:40:00.000015 aNJW abcd
16 16 17.5 18.5 16:01:20.016 1+ 1+2=3 1+2=3 1+ 1+2=3 1+2=3 1970-01-03 04:26:40.016 1970-06-10 1970-01-02 20:26:40.016 16 16 5325.234 16 16 16 16 1 12345678909876543.2112345678 16:01:20.000016 1970-01-03 04:26:40.000016 1970-01-02 20:26:40.000016 1+2= abcd
17 17 18.5 19.5 17:01:25.017 He Hello Hello World! He Hello Hello World! 1970-01-03 07:13:20.017 1970-06-20 1970-01-02 23:13:20.017 17 17 NULL 17 17 17 17 1 12345678909876543.2112345678 17:01:25.000017 1970-01-03 07:13:20.000017 1970-01-02 23:13:20.000017 Hell abcd
18 18 19.5 20.5 18:01:30.018 1! 1!!!1 1!!!1 1! 1!!!1 1!!!1 1970-01-03 10:00:00.018 1970-06-30 1970-01-03 02:00:00.018 18 18 1383.433 18 18 18 18 1 12345678909876543.2112345678 18:01:30.000018 1970-01-03 10:00:00.000018 1970-01-03 02:00:00.000018 1!!! abcd
19 19 20.5 21.5 19:01:35.019 82 82440 824407880313877 82 82440 824407880313877 1970-01-03 12:46:40.019 1970-07-10 1970-01-03 04:46:40.019 19 19 NULL 19 19 19 19 1 12345678909876543.2112345678 19:01:35.000019 1970-01-03 12:46:40.000019 1970-01-03 04:46:40.000019 8244 abcd
20 20 21.5 22.5 20:01:40.020 19 1970- 1970-01-01 08:02:23 19 1970- 1970-01-01 08:02:23 1970-01-03 15:33:20.020 1970-07-20 1970-01-03 07:33:20.020 20 20 532235.234 20 20 20 20 1 12345678909876543.2112345678 20:01:40.000020 1970-01-03 15:33:20.000020 1970-01-03 07:33:20.000020 1970 abcd
21 21 22.5 23.5 21:01:45.021 19 1970- 1970-05-31 19 1970- 1970-05-31 1970-01-03 18:20:00.021 1970-07-30 1970-01-03 10:20:00.021 21 21 NULL 21 21 21 21 1 12345678909876543.2112345678 21:01:45.000021 1970-01-03 18:20:00.000021 1970-01-03 10:20:00.000021 1970 abcd
22 22 23.5 24.5 22:01:50.022 xx xxx xxx xx xxx xxx 1970-01-03 21:06:40.022 1970-08-09 1970-01-03 13:06:40.022 22 22 5325.234 22 22 22 22 1 12345678909876543.2112345678 22:01:50.000022 1970-01-03 21:06:40.000022 1970-01-03 13:06:40.000022 xxx abcd
23 23 24.5 25.5 23:01:55.023 ON ONMKM ONMKMQVBRWBUTWT ON ONMKM ONMKMQVBRWBUTWT 1970-01-03 23:53:20.023 1970-08-19 1970-01-03 15:53:20.023 23 23 NULL 23 23 23 23 1 12345678909876543.2112345678 23:01:55.000023 1970-01-03 23:53:20.000023 1970-01-03 15:53:20.000023 ONMK abcd
24 24 25.5 26.5 24:02:00.024 ZW ZWMWH ZWMWHSEZDYODQWP ZW ZWMWH ZWMWHSEZDYODQWP 1970-01-04 02:40:00.024 1970-08-29 1970-01-03 18:40:00.024 24 24 1383.433 24 24 24 24 1 12345678909876543.2112345678 24:02:00.000024 1970-01-04 02:40:00.000024 1970-01-03 18:40:00.000024 ZWMW abcd
25 25 26.5 27.5 25:02:05.025 Ho HoCYp HoCYpJ Ho HoCYp HoCYpJ 1970-01-04 05:26:40.025 1970-09-08 1970-01-03 21:26:40.025 25 25 NULL 25 25 25 25 1 12345678909876543.2112345678 25:02:05.000025 1970-01-04 05:26:40.000025 1970-01-03 21:26:40.000025 HoCY abcd
26 26 27.5 28.5 26:02:10.026 -1 -100 -100 -1 -100 -100 1970-01-04 08:13:20.026 1970-09-18 1970-01-04 00:13:20.026 26 26 532235.234 26 26 26 26 1 12345678909876543.2112345678 26:02:10.000026 1970-01-04 08:13:20.000026 1970-01-04 00:13:20.000026 -100 abcd
27 27 28.5 29.5 27:02:15.027 Iq Iqa8N Iqa8Nr Iq Iqa8N Iqa8Nr 1970-01-04 11:00:00.027 1970-09-28 1970-01-04 03:00:00.027 27 27 NULL 27 27 27 27 1 12345678909876543.2112345678 27:02:15.000027 1970-01-04 11:00:00.000027 1970-01-04 03:00:00.000027 Iqa8 abcd
28 28 29.5 30.5 28:02:20.028 nD nD274 nD274v nD nD274 nD274v 1970-01-04 13:46:40.028 1970-10-08 1970-01-04 05:46:40.028 28 28 5325.234 28 28 28 28 1 12345678909876543.2112345678 28:02:20.000028 1970-01-04 13:46:40.000028 1970-01-04 05:46:40.000028 nD27 abcd
-2147483646 2147483648 30.5 31.5 29:02:25.029 6y 6y0Jy 6y0JyW 6y 6y0Jy 6y0JyW 1970-01-04 16:33:20.029 1970-10-18 1970-01-04 08:33:20.029 29 29 NULL 2147483648 29 29 2147483648 1 12345678909876543.2112345678 29:02:25.000029 1970-01-04 16:33:20.000029 1970-01-04 08:33:20.000029 6y0J abcd
NULL NULL NULL NULL NULL NULL NULL a NULL NULL a 0000-00-00 00:00:00.000 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 0000-00-00 00:00:00.000000 1970-01-01 00:00:00.000000 NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.001 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.000001 1970-01-01 02:46:40.000001 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcd NULL NULL abcd 1970-01-01 13:33:20.002 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 13:33:20.000002 1970-01-01 05:33:20.000002 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcde NULL NULL abcde 1970-01-01 16:20:00.003 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 16:20:00.000003 1970-01-01 08:20:00.000003 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcdefg NULL NULL abcdefg 1970-01-01 19:06:40.004 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 19:06:40.000004 1970-01-01 11:06:40.000004 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Whlg1xXAxP NULL NULL Whlg1xXAxP 1970-01-01 21:53:20.005 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 21:53:20.000005 1970-01-01 13:53:20.000005 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 4NimzSQzMD NULL NULL 4NimzSQzMD 1970-01-02 00:40:00.006 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 00:40:00.000006 1970-01-01 16:40:00.000006 NULL NULL
NULL NULL NULL NULL NULL NULL NULL G23ne3j92Ky0wBF NULL NULL G23ne3j92Ky0wBF 1970-01-02 03:26:40.007 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 03:26:40.000007 1970-01-01 19:26:40.000007 NULL NULL
NULL NULL NULL NULL NULL NULL NULL F4z NULL NULL F4z 1970-01-02 06:13:20.008 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 06:13:20.000008 1970-01-01 22:13:20.000008 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 8JCVTsGYB7V NULL NULL 8JCVTsGYB7V 1970-01-02 09:00:00.009 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 09:00:00.000009 1970-01-02 01:00:00.000009 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 23235 NULL NULL 23235 1970-01-02 11:46:40.010 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 11:46:40.000010 1970-01-02 03:46:40.000010 NULL NULL
NULL NULL NULL NULL NULL NULL NULL sda22 NULL NULL sda22 1970-01-02 14:33:20.011 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 14:33:20.000011 1970-01-02 06:33:20.000011 NULL NULL
NULL NULL NULL NULL NULL NULL NULL SD7sdFD7 NULL NULL SD7sdFD7 1970-01-02 17:20:00.012 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 17:20:00.000012 1970-01-02 09:20:00.000012 NULL NULL
NULL NULL NULL NULL NULL NULL NULL gvv3hYwdfOD NULL NULL gvv3hYwdfOD 1970-01-02 20:06:40.013 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 20:06:40.000013 1970-01-02 12:06:40.000013 NULL NULL
NULL NULL NULL NULL NULL NULL NULL y8wjo4v50s6 NULL NULL y8wjo4v50s6 1970-01-02 22:53:20.014 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 22:53:20.000014 1970-01-02 14:53:20.000014 NULL NULL
NULL NULL NULL NULL NULL NULL NULL aNJW56SJieE8KVV NULL NULL aNJW56SJieE8KVV 1970-01-03 01:40:00.015 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 01:40:00.000015 1970-01-02 17:40:00.000015 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1+2=3 NULL NULL 1+2=3 1970-01-03 04:26:40.016 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 04:26:40.000016 1970-01-02 20:26:40.000016 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Hello World! NULL NULL Hello World! 1970-01-03 07:13:20.017 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 07:13:20.000017 1970-01-02 23:13:20.000017 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1!!!1 NULL NULL 1!!!1 1970-01-03 10:00:00.018 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 10:00:00.000018 1970-01-03 02:00:00.000018 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 824407880313877 NULL NULL 824407880313877 1970-01-03 12:46:40.019 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 12:46:40.000019 1970-01-03 04:46:40.000019 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1970-01-01 08:02:23 NULL NULL 1970-01-01 08:02:23 1970-01-03 15:33:20.020 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 15:33:20.000020 1970-01-03 07:33:20.000020 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1970-05-31 NULL NULL 1970-05-31 1970-01-03 18:20:00.021 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 18:20:00.000021 1970-01-03 10:20:00.000021 NULL NULL
NULL NULL NULL NULL NULL NULL NULL xxx NULL NULL xxx 1970-01-03 21:06:40.022 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 21:06:40.000022 1970-01-03 13:06:40.000022 NULL NULL
NULL NULL NULL NULL NULL NULL NULL ONMKMQVBRWBUTWT NULL NULL ONMKMQVBRWBUTWT 1970-01-03 23:53:20.023 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 23:53:20.000023 1970-01-03 15:53:20.000023 NULL NULL
NULL NULL NULL NULL NULL NULL NULL ZWMWHSEZDYODQWP NULL NULL ZWMWHSEZDYODQWP 1970-01-04 02:40:00.024 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 02:40:00.000024 1970-01-03 18:40:00.000024 NULL NULL
NULL NULL NULL NULL NULL NULL NULL HoCYpJ NULL NULL HoCYpJ 1970-01-04 05:26:40.025 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 05:26:40.000025 1970-01-03 21:26:40.000025 NULL NULL
NULL NULL NULL NULL NULL NULL NULL -100 NULL NULL -100 1970-01-04 08:13:20.026 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 08:13:20.000026 1970-01-04 00:13:20.000026 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Iqa8Nr NULL NULL Iqa8Nr 1970-01-04 11:00:00.027 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 11:00:00.000027 1970-01-04 03:00:00.000027 NULL NULL
NULL NULL NULL NULL NULL NULL NULL nD274v NULL NULL nD274v 1970-01-04 13:46:40.028 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 13:46:40.000028 1970-01-04 05:46:40.000028 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 6y0JyW NULL NULL 6y0JyW 1970-01-04 16:33:20.029 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 16:33:20.000029 1970-01-04 08:33:20.000029 NULL NULL
SELECT COUNT(*) FROM t1;
COUNT(*)
60
DROP DATABASE mcol_5505_cpimport_parquet;
mysql-test/columnstore/basic/r/mcol-5505-parquet-ddl.result (new file, 36 lines)
@@ -0,0 +1,36 @@
DROP DATABASE IF EXISTS mcol_5505_parquet_ddl;
CREATE DATABASE mcol_5505_parquet_ddl;
USE mcol_5505_parquet_ddl;
SHOW CREATE TABLE parquet_test_table;
Table Create Table
parquet_test_table CREATE TABLE `parquet_test_table` (
`col1` int(11) DEFAULT NULL,
`col2` bigint(20) DEFAULT NULL,
`col3` float DEFAULT NULL,
`col4` double DEFAULT NULL,
`col5` time(3) DEFAULT NULL,
`col6` varchar(2000) DEFAULT NULL,
`col7` varchar(2000) DEFAULT NULL,
`col8` varchar(2000) DEFAULT NULL,
`col9` varchar(2000) DEFAULT NULL,
`col10` varchar(2000) DEFAULT NULL,
`col11` varchar(2000) DEFAULT NULL,
`col12` timestamp(3) NULL DEFAULT NULL,
`col13` date DEFAULT NULL,
`col14` timestamp(3) NULL DEFAULT NULL,
`col15` smallint(6) DEFAULT NULL,
`col16` tinyint(4) DEFAULT NULL,
`col17` decimal(9,3) DEFAULT NULL,
`col18` int(10) unsigned DEFAULT NULL,
`col19` smallint(5) unsigned DEFAULT NULL,
`col20` tinyint(3) unsigned DEFAULT NULL,
`col21` bigint(20) unsigned DEFAULT NULL,
`col22` tinyint(1) DEFAULT NULL,
`col23` decimal(38,10) DEFAULT NULL,
`col24` time(6) DEFAULT NULL,
`col25` timestamp(6) NULL DEFAULT NULL,
`col26` timestamp(6) NULL DEFAULT NULL,
`col27` varbinary(8000) DEFAULT NULL,
`col28` char(4) DEFAULT NULL
) ENGINE=Columnstore DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci
DROP DATABASE mcol_5505_parquet_ddl;
@@ -0,0 +1,82 @@
#
# parquet support for large volume data file
# Author: Bin Ruan, binruan0227@gmail.com
#
if (!$MYSQL_TEST_ROOT){
skip Should be run by root to execute cpimport;
}

-- source ../include/have_columnstore.inc

--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_parquet_large_volume;
--enable_warnings

CREATE DATABASE mcol_5505_parquet_large_volume;
USE mcol_5505_parquet_large_volume;
SET time_zone = '+8:00';
# Create table
Create TABLE t1(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

Create TABLE t2(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

Create TABLE t3(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

Create TABLE t4(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

# Generate data
--exec mcs_parquet_gen -l -f $MTR_SUITE_DIR/../std_data


# Valid data and table
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t1 $MTR_SUITE_DIR/../std_data/1MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t2 $MTR_SUITE_DIR/../std_data/10MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t3 $MTR_SUITE_DIR/../std_data/50MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t4 $MTR_SUITE_DIR/../std_data/100MRows.parquet >/dev/null

SELECT * FROM t1 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t1;

SELECT * FROM t2 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t2;

SELECT * FROM t3 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t3;

SELECT * FROM t4 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t4;

# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/1MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/10MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/50MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/100MRows.parquet
DROP DATABASE mcol_5505_parquet_large_volume;
@@ -0,0 +1,64 @@
#
# Check the parquet support for different data types
# Author: Bin Ruan, binruan0227@gmail.com
#
if (!$MYSQL_TEST_ROOT){
skip Should be run by root to execute cpimport;
}

-- source ../include/have_columnstore.inc

--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_cpimport_parquet;
--enable_warnings

CREATE DATABASE mcol_5505_cpimport_parquet;
USE mcol_5505_cpimport_parquet;
# Create table
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 BIGINT,
col3 FLOAT,
col4 DOUBLE,
col5 TIME(3),
col6 VARCHAR(2),
col7 VARCHAR(5),
col8 VARCHAR(20),
col9 CHAR(2),
col10 CHAR(5),
col11 CHAR(20),
col12 TIMESTAMP(3),
col13 DATE,
col14 DATETIME(3),
col15 SMALLINT,
col16 TINYINT,
col17 DECIMAL(9,3),
col18 INT UNSIGNED,
col19 SMALLINT UNSIGNED,
col20 TINYINT UNSIGNED,
col21 BIGINT UNSIGNED,
col22 BOOLEAN,
col23 DECIMAL(38,10),
col24 TIME(6),
col25 TIMESTAMP(6),
col26 DATETIME(6),
col27 CHAR(4),
col28 CHAR(4)
) ENGINE=Columnstore;

# Generate data
--exec mcs_parquet_gen -a -f $MTR_SUITE_DIR/../std_data


# Valid data and table
--exec $MCS_CPIMPORT mcol_5505_cpimport_parquet t1 $MTR_SUITE_DIR/../std_data/tests.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_cpimport_parquet t1 $MTR_SUITE_DIR/../std_data/nulls.parquet >/dev/null

SELECT * FROM t1;
SELECT COUNT(*) FROM t1;

# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/tests.parquet
--exec rm $MTR_SUITE_DIR/../std_data/nulls.parquet
DROP DATABASE mcol_5505_cpimport_parquet;
mysql-test/columnstore/basic/t/mcol-5505-parquet-ddl.test (new file, 33 lines)
@@ -0,0 +1,33 @@
#
# check mcs_parquet_ddl tool
# Author: Bin Ruan, binruan0227@gmail.com
#
-- source ../include/have_columnstore.inc

--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_parquet_ddl;
--enable_warnings

--disable_result_log
--exec mcs_parquet_gen -a -f $MTR_SUITE_DIR/../std_data
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/tests.parquet $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl
# Wrong source file type
--error 3
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/int8.par $MTR_SUITE_DIR/../std_data/int8table.ddl
# Wrong number of argument files
--error 4
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/tests.parquet
--enable_result_log

# Create table
CREATE DATABASE mcol_5505_parquet_ddl;

--exec $MYSQL mcol_5505_parquet_ddl < $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl >/dev/null
USE mcol_5505_parquet_ddl;
SHOW CREATE TABLE parquet_test_table;

# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/tests.parquet
--exec rm $MTR_SUITE_DIR/../std_data/nulls.parquet
--exec rm $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl
DROP DATABASE mcol_5505_parquet_ddl;
@@ -13,3 +13,5 @@ add_subdirectory(idbmeminfo)
add_subdirectory(rebuildEM)
add_subdirectory(passwd)
add_subdirectory(configMgt)
add_subdirectory(parquetGen)
add_subdirectory(parquetDDL)
tools/parquetDDL/CMakeLists.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
include_directories(${ENGINE_COMMON_INCLUDES})

set(parquetDDL_SRCS main.cpp)
add_executable(mcs_parquet_ddl ${parquetDDL_SRCS})
target_link_libraries(mcs_parquet_ddl arrow parquet)
install(TARGETS mcs_parquet_ddl DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)
tools/parquetDDL/main.cpp (new file, 285 lines)
@@ -0,0 +1,285 @@
#include <iostream>
#include <string>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/exception.h>
#include <parquet/arrow/reader.h>
#include <vector>
#include <fstream>
#include <unistd.h>

enum STATUS_CODE
{
  NO_ERROR,
  EMPTY_FIELD,
  UNSUPPORTED_DATA_TYPE,
  UNSUPPORTED_FILE_TYPE,
  FILE_NUM_ERROR
};

/**
 * print the usage information
 */
static void usage()
{
  std::cout << "usage: " << std::endl;
  std::cout << "Read a parquet file and output the corresponding .ddl file." << std::endl;
  std::cout << "mcs_parquet_ddl [input_parquet_file] [output_ddl_file]" << std::endl;
}

/**
 * get the schema of the parquet file
 */
void getSchema(std::string filePath, std::shared_ptr<arrow::Schema>* parquetSchema)
{
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(filePath, arrow::default_memory_pool()));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  PARQUET_THROW_NOT_OK(reader->GetSchema(parquetSchema));
  PARQUET_THROW_NOT_OK(infile->Close());
}

/**
 * convert arrow data type id to corresponding columnstore type string
 */
int convert2mcs(std::shared_ptr<arrow::DataType> dataType, arrow::Type::type typeId, std::string& colType)
{
  switch (typeId)
  {
    case arrow::Type::type::BOOL:
    {
      colType = "BOOLEAN";
      break;
    }
    case arrow::Type::type::UINT8:
    {
      colType = "TINYINT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT8:
    {
      colType = "TINYINT";
      break;
    }
    case arrow::Type::type::UINT16:
    {
      colType = "SMALLINT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT16:
    {
      colType = "SMALLINT";
      break;
    }
    case arrow::Type::type::UINT32:
    {
      colType = "INT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT32:
    {
      colType = "INT";
      break;
    }
    case arrow::Type::type::UINT64:
    {
      colType = "BIGINT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT64:
    {
      colType = "BIGINT";
      break;
    }
    case arrow::Type::type::FLOAT:
    {
      colType = "FLOAT";
      break;
    }
    case arrow::Type::type::DOUBLE:
    {
      colType = "DOUBLE";
      break;
    }
    case arrow::Type::type::STRING:
    {
      // set 2000 as the maximum length and VARCHAR as column type
      colType = "VARCHAR(2000)";
      break;
    }
    case arrow::Type::type::BINARY:
    {
      // set 8000 as the maximum length and VARCHAR as column type
      colType = "VARCHAR(8000) character set 'binary'";
      break;
    }
    case arrow::Type::type::FIXED_SIZE_BINARY:
    {
      std::shared_ptr<arrow::FixedSizeBinaryType> fType = std::static_pointer_cast<arrow::FixedSizeBinaryType>(dataType);
      int byteWidth = fType->byte_width();
      colType = "CHAR(" + std::to_string(byteWidth) + ")";
      break;
    }
    case arrow::Type::type::DATE32:
    {
      colType = "DATE";
      break;
    }
    case arrow::Type::type::DATE64:
    {
      colType = "DATE";
      break;
    }
    case arrow::Type::type::TIMESTAMP:
    {
      std::shared_ptr<arrow::TimestampType> fType = std::static_pointer_cast<arrow::TimestampType>(dataType);

      if (fType->unit() == arrow::TimeUnit::MILLI)
        colType = "TIMESTAMP(3)";
      else if (fType->unit() == arrow::TimeUnit::MICRO)
        colType = "TIMESTAMP(6)";
      else
        return UNSUPPORTED_DATA_TYPE;

      break;
    }
    case arrow::Type::type::TIME32:
    {
      colType = "TIME(3)";
      break;
    }
    case arrow::Type::type::TIME64:
    {
      std::shared_ptr<arrow::Time64Type> fType = std::static_pointer_cast<arrow::Time64Type>(dataType);

      if (fType->unit() == arrow::TimeUnit::MICRO)
        colType = "TIME(6)";
      else
        return UNSUPPORTED_DATA_TYPE;

      break;
    }
    case arrow::Type::type::DECIMAL128:
    {
      // get precision and scale
      std::shared_ptr<arrow::DecimalType> fType = std::static_pointer_cast<arrow::DecimalType>(dataType);
      int32_t fPrecision = fType->precision();
      int32_t fScale = fType->scale();
      colType = "DECIMAL(" + std::to_string(fPrecision) + "," + std::to_string(fScale) + ")";
      break;
    }
    default:
    {
      return UNSUPPORTED_DATA_TYPE;
    }
  }
  return NO_ERROR;
}

/**
 * main function to generate DDL file
 */
int generateDDL(std::string filePath, std::string targetPath, std::string tableName)
{
  std::shared_ptr<arrow::Schema> parquetSchema;
  getSchema(filePath, &parquetSchema);
  std::vector<std::string> parquetCols;
  std::vector<std::string> parquetTypes;
  int rc = NO_ERROR;
  int fieldsNum = parquetSchema->num_fields();

  if (fieldsNum == 0)
  {
    return EMPTY_FIELD;
  }

  for (int i = 0; i < fieldsNum; i++)
  {
    const std::shared_ptr<arrow::Field> tField = parquetSchema->field(i);
    const std::string tName = tField->name();
    std::string colType;
    auto tType = tField->type();
    parquetCols.push_back(tName);
    rc = convert2mcs(tType, tType->id(), colType);

    if (rc != NO_ERROR)
    {
      std::cout << "Unsupported data type for column: " << tName << std::endl;
      return rc;
    }

    parquetTypes.push_back(colType);
  }

  std::string str1 = "CREATE TABLE " + tableName + "(\n";
  std::string str2 = ") ENGINE=Columnstore;";

  for (int i = 0; i < fieldsNum; i++)
  {
    str1 += parquetCols[i] + " " + parquetTypes[i] + (i == fieldsNum-1 ? "\n" : ",\n");
  }

  str1 += str2;
  std::ofstream outfile(targetPath + tableName + ".ddl");
  outfile << str1;
  outfile.close();
  std::cout << "Successfully generated " + tableName + ".ddl" << std::endl;
  return rc;
}

int main(int argc, char** argv)
{
  int32_t option;

  while ((option = getopt(argc, argv, "h")) != EOF)
  {
    switch (option)
    {
      case 'h':
      case '?':
      default:
        usage();
        return (option == 'h' ? 0 : -1);
        break;
    }
  }

  // argv[1]: input parquet file
  // argv[2]: output ddl file
  // argc must be exactly 3 (the program name plus the two file arguments)
  if (argc != 3)
  {
    std::cout << "Please input source parquet file and target ddl file" << std::endl;
    return FILE_NUM_ERROR;
  }
  std::string parquetFile(argv[1]);
  std::string ddlFile(argv[2]);

  // check file extension
  std::string::size_type endBase = ddlFile.rfind('.');
  std::string::size_type endBase1 = parquetFile.rfind('.');
  if (endBase == std::string::npos || endBase1 == std::string::npos ||
      parquetFile.substr(endBase1 + 1) != "parquet" ||
      ddlFile.substr(endBase + 1) != "ddl")
  {
    std::cout << "File type not supported" << std::endl;
    usage();
    return UNSUPPORTED_FILE_TYPE;
  }

  std::string targetPath;
  std::string tableName;
  std::string::size_type startBase = ddlFile.rfind('/');
  targetPath.assign(argv[2], startBase + 1);
  tableName.assign(argv[2] + startBase + 1, endBase - startBase - 1);
  std::cout << "Reading " + parquetFile << std::endl;
  int rc = generateDDL(parquetFile, targetPath, tableName);

  if (rc != NO_ERROR)
  {
    std::cout << "Failed to generate DDL from the input parquet file" << std::endl;
  }

  return rc;
}
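
For orientation, a minimal standalone sketch (an editor's illustration, not part of the patch) of the schema walk mcs_parquet_ddl performs before type mapping. It uses only the Arrow/Parquet calls that already appear above, plus arrow::DataType::ToString(); the input path is hypothetical.

#include <iostream>
#include <memory>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

int main()
{
  // Open the parquet file and read its Arrow schema, as getSchema() does.
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(
      infile, arrow::io::ReadableFile::Open("/tmp/tests.parquet", arrow::default_memory_pool()));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::Schema> schema;
  PARQUET_THROW_NOT_OK(reader->GetSchema(&schema));

  // Print each field name with its Arrow type; convert2mcs() would map
  // these type ids to ColumnStore column types.
  for (int i = 0; i < schema->num_fields(); i++)
    std::cout << schema->field(i)->name() << ": " << schema->field(i)->type()->ToString() << std::endl;
  return 0;
}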
tools/parquetGen/CMakeLists.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
include_directories(${ENGINE_COMMON_INCLUDES})

set(parquetGen_SRCS main.cpp)
add_executable(mcs_parquet_gen ${parquetGen_SRCS})
target_link_libraries(mcs_parquet_gen boost_system boost_filesystem arrow parquet)
install(TARGETS mcs_parquet_gen DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)
tools/parquetGen/main.cpp (new file, 1342 lines)
(file diff suppressed because it is too large)
@@ -29,6 +29,7 @@
#include <stdlib.h>
#include <string.h>
#include <type_traits>
#include <chrono>
#include "mcs_decimal.h"
using namespace std;
#include <boost/algorithm/string/case_conv.hpp>
@@ -1572,6 +1573,44 @@ boost::any DataConvert::StringToTimestamp(const datatypes::ConvertFromStringPara
  return value;
}

//------------------------------------------------------------------------------
// Convert date32 parquet data to binary date. Used by BulkLoad.
//------------------------------------------------------------------------------
int32_t DataConvert::ConvertArrowColumnDate(int32_t dayVal, int& status)
{
  int inYear;
  int inMonth;
  int inDay;
  int32_t value = 0;

  int64_t secondsSinceEpoch = dayVal;
  secondsSinceEpoch *= 86400;
  std::chrono::seconds duration(secondsSinceEpoch);

  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::localtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;

  if (isDateValid(inDay, inMonth, inYear))
  {
    Date aDay;
    aDay.year = inYear;
    aDay.month = inMonth;
    aDay.day = inDay;
    memcpy(&value, &aDay, 4);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert date string to binary date. Used by BulkLoad.
//------------------------------------------------------------------------------
@@ -1658,6 +1697,100 @@ bool DataConvert::isColumnDateValid(int32_t date)
  return (isDateValid(d.day, d.month, d.year));
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data to binary datetime (millisecond). Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetime(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::milliseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = duration.count() % 1000;
  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    DateTime aDatetime;
    aDatetime.year = inYear;
    aDatetime.month = inMonth;
    aDatetime.day = inDay;
    aDatetime.hour = inHour;
    aDatetime.minute = inMinute;
    aDatetime.second = inSecond;
    aDatetime.msecond = inMicrosecond;

    memcpy(&value, &aDatetime, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data to binary datetime (microsecond). Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetimeUs(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::microseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = duration.count() % 1000000;
  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    DateTime aDatetime;
    aDatetime.year = inYear;
    aDatetime.month = inMonth;
    aDatetime.day = inDay;
    aDatetime.hour = inHour;
    aDatetime.minute = inMinute;
    aDatetime.second = inSecond;
    aDatetime.msecond = inMicrosecond;

    memcpy(&value, &aDatetime, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert date/time string to binary date/time. Used by BulkLoad.
//------------------------------------------------------------------------------
@@ -1798,6 +1931,127 @@ int64_t DataConvert::convertColumnDatetime(const char* dataOrg, CalpontDateTimeF
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data (millisecond) to binary timestamp. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestamp(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::milliseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = duration.count() % 1000;
  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    MySQLTime m_time;
    m_time.year = inYear;
    m_time.month = inMonth;
    m_time.day = inDay;
    m_time.hour = inHour;
    m_time.minute = inMinute;
    m_time.second = inSecond;
    m_time.second_part = inMicrosecond;

    bool isValid = true;
    int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);

    if (!isValid)
    {
      status = -1;
      return value;
    }

    TimeStamp timestamp;
    timestamp.second = seconds;
    timestamp.msecond = m_time.second_part;

    memcpy(&value, &timestamp, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data (microsecond) to binary timestamp. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestampUs(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::microseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = static_cast<int>(duration.count() % 1000000);

  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    MySQLTime m_time;
    m_time.year = inYear;
    m_time.month = inMonth;
    m_time.day = inDay;
    m_time.hour = inHour;
    m_time.minute = inMinute;
    m_time.second = inSecond;
    m_time.second_part = inMicrosecond;

    bool isValid = true;
    int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);

    if (!isValid)
    {
      status = -1;
      return value;
    }

    TimeStamp timestamp;
    timestamp.second = seconds;
    timestamp.msecond = m_time.second_part;

    memcpy(&value, &timestamp, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp string to binary timestamp. Used by BulkLoad.
// Most of this code is taken from DataConvert::convertColumnDatetime
@@ -1972,6 +2226,123 @@ int64_t DataConvert::convertColumnTimestamp(const char* dataOrg, CalpontDateTime
  return value;
}

//------------------------------------------------------------------------------
// Convert time32 parquet data to binary time. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime32(int32_t timeVal, int& status)
{
  int64_t value = 0;
  // convert millisecond to time
  int inHour, inMinute, inSecond, inMicrosecond;
  inHour = inMinute = inSecond = inMicrosecond = 0;
  bool isNeg = false;
  if (timeVal < 0)
    isNeg = true;
  inHour = timeVal / 3600000;
  inMinute = (timeVal - inHour * 3600000) / 60000;
  inSecond = (timeVal - inHour * 3600000 - inMinute * 60000) / 1000;
  inMicrosecond = timeVal - inHour * 3600000 - inMinute * 60000 - inSecond * 1000;
  if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    Time atime;
    atime.hour = inHour;
    atime.minute = inMinute;
    atime.second = inSecond;
    atime.msecond = inMicrosecond;
    atime.is_neg = isNeg;

    memcpy(&value, &atime, 8);
  }
  else
  {
    // Emulate MariaDB's time saturation
    if (inHour > 838)
    {
      Time atime;
      atime.hour = 838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }
    else if (inHour < -838)
    {
      Time atime;
      atime.hour = -838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }

    // If neither of the above match then we return a 0 time

    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert time64 parquet data to binary time. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime64(int64_t timeVal, int& status)
{
  int64_t value = 0;
  // convert microsecond to time
  int inHour, inMinute, inSecond, inMicrosecond;
  inHour = inMinute = inSecond = inMicrosecond = 0;
  bool isNeg = false;
  if (timeVal < 0)
    isNeg = true;
  inHour = timeVal / 3600000000;
  inMinute = (timeVal - inHour * 3600000000) / 60000000;
  inSecond = (timeVal - inHour * 3600000000 - inMinute * 60000000) / 1000000;
  inMicrosecond = timeVal - inHour * 3600000000 - inMinute * 60000000 - inSecond * 1000000;
  if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    Time atime;
    atime.hour = inHour;
    atime.minute = inMinute;
    atime.second = inSecond;
    atime.msecond = inMicrosecond;
    atime.is_neg = isNeg;

    memcpy(&value, &atime, 8);
  }
  else
  {
    // Emulate MariaDB's time saturation
    if (inHour > 838)
    {
      Time atime;
      atime.hour = 838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }
    else if (inHour < -838)
    {
      Time atime;
      atime.hour = -838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }

    // If neither of the above match then we return a 0 time

    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert time string to binary time. Used by BulkLoad.
// Most of this is taken from str_to_time in sql-common/my_time.c
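// Editor's illustration (not part of the patch): the TIME32 millisecond
// split used by convertArrowColumnTime32, checked by hand for
// timeVal = 3723004 ms (1 h = 3600000 ms, 2 min = 120000 ms, 3 s = 3000 ms,
// remainder 4 ms).
#include <cstdint>
#include <cstdio>

int main()
{
  int32_t timeVal = 3723004;                                // ms since midnight
  int h = timeVal / 3600000;                                // 1
  int m = (timeVal - h * 3600000) / 60000;                  // 2
  int s = (timeVal - h * 3600000 - m * 60000) / 1000;       // 3
  int frac = timeVal - h * 3600000 - m * 60000 - s * 1000;  // 4
  std::printf("%02d:%02d:%02d.%03d\n", h, m, s, frac);      // 01:02:03.004
  return 0;
}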
@@ -1170,6 +1170,14 @@ class DataConvert
  EXPORT static std::string timeToString1(long long timevalue);
  static inline void timeToString1(long long timevalue, char* buf, unsigned int buflen);

  /**
   * @brief convert parquet date data to its native format. This function is for bulkload to use.
   *
   * @param dayVal the input data representing days
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int32_t ConvertArrowColumnDate(int32_t dayVal, int& status);

  /**
   * @brief convert a date column data, represented as a string, to its native
   * format. This function is for bulkload to use.
@@ -1188,6 +1196,22 @@ class DataConvert
   */
  EXPORT static bool isColumnDateValid(int32_t date);

  /**
   * @brief convert parquet datetime data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing milliseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnDatetime(int64_t timeVal, int& status);

  /**
   * @brief convert parquet datetime data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing microseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnDatetimeUs(int64_t timeVal, int& status);

  /**
   * @brief convert a datetime column data, represented as a string,
   * to its native format. This function is for bulkload to use.
@@ -1201,6 +1225,22 @@ class DataConvert
  EXPORT static int64_t convertColumnDatetime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
                                              int& status, unsigned int dataOrgLen);

  /**
   * @brief convert parquet timestamp data (millisecond) to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing milliseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTimestamp(int64_t timeVal, int& status);

  /**
   * @brief convert parquet timestamp data (microsecond) to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing microseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTimestampUs(int64_t timeVal, int& status);

  /**
   * @brief convert a timestamp column data, represented as a string,
   * to its native format. This function is for bulkload to use.
@@ -1228,6 +1268,22 @@ class DataConvert
  EXPORT static int64_t convertColumnTime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
                                          int& status, unsigned int dataOrgLen);

  /**
   * @brief convert parquet time data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing milliseconds since midnight
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTime32(int32_t timeVal, int& status);

  /**
   * @brief convert parquet time data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing either microseconds or nanoseconds since midnight
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTime64(int64_t timeVal, int& status);

  /**
   * @brief Is specified datetime valid; used by binary bulk load
   */
@@ -40,7 +40,11 @@ set(cpimport.bin_SRCS cpimport.cpp)

add_executable(cpimport.bin ${cpimport.bin_SRCS})
add_dependencies(cpimport.bin marias3)
target_link_libraries(cpimport.bin ${ENGINE_LDFLAGS} ${NETSNMP_LIBRARIES} ${ENGINE_WRITE_LIBS} ${S3API_DEPS} we_bulk we_xml)

target_link_libraries(cpimport.bin ${ENGINE_LDFLAGS} ${NETSNMP_LIBRARIES} ${ENGINE_WRITE_LIBS} ${S3API_DEPS} we_bulk we_xml)
FIND_PACKAGE(Arrow)
FIND_PACKAGE(Parquet)
target_link_libraries(cpimport.bin arrow)
target_link_libraries(cpimport.bin parquet)
install(TARGETS cpimport.bin DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)
@@ -1210,6 +1210,32 @@ int BulkLoad::manageImportDataFileList(Job& job, int tableNo, TableInfo* tableIn
  std::vector<std::string> loadFilesList;
  bool bUseStdin = false;

  // Check if all the import files are parquet files
  bool isParquet = false;
  for (unsigned int i = 0; i < fCmdLineImportFiles.size(); i++)
  {
    if (fCmdLineImportFiles[i].rfind(".parquet") != std::string::npos)
    {
      if (!isParquet)
        isParquet = true;
    }
    else
    {
      if (isParquet)
      {
        ostringstream oss;
        oss << "Import file list mixes parquet and non-parquet files; all import files must be parquet.";
        fLog.logMsg(oss.str(), ERR_FILE_TYPE_DIFF, MSGLVL_ERROR);
        return ERR_FILE_TYPE_DIFF;
      }
    }
  }

  if (isParquet)
  {
    setImportDataMode(IMPORT_DATA_PARQUET);
  }

  // Take loadFileName from command line argument override "if" one exists,
  // else we take from the Job xml file
  std::string loadFileName;
(file diff suppressed because it is too large)
@@ -30,7 +30,7 @@
#include "we_columninfo.h"
#include "calpontsystemcatalog.h"
#include "dataconvert.h"

#include <arrow/api.h>
namespace WriteEngine
{
class Log;
@@ -84,6 +84,9 @@ class BulkLoadBuffer
  char* fOverflowBuf;      // Overflow data held for next buffer
  unsigned fOverflowSize;  // Current size of fOverflowBuf

  std::shared_ptr<arrow::RecordBatch> fParquetBatch;        // Batch of parquet file to be parsed
  std::shared_ptr<arrow::RecordBatch> fParquetBatchParser;  // for temporary use by parser
  std::shared_ptr<::arrow::RecordBatchReader> fParquetReader;  // Reader for reading batches of parquet data
  // Information about the locker and status for each column in this buffer.
  // Note that TableInfo::fSyncUpdatesTI mutex is used to synchronize
  // access to fColumnLocks and fParseComplete from both read and parse
@@ -174,6 +177,19 @@ class BulkLoadBuffer
  void convert(char* field, int fieldLength, bool nullFlag, unsigned char* output, const JobColumn& column,
               BLBufferStats& bufStats);

  /** @brief Parse a batch of parquet data in read buffer for a nonDictionary column
   */
  int parseColParquet(ColumnInfo& columnInfo);

  /** @brief Convert batch parquet data depending upon the data type
   */
  void convertParquet(std::shared_ptr<arrow::Array> columnData, unsigned char* buf, const JobColumn& column,
                      BLBufferStats& bufStats, RID& lastInputRowInExtent, ColumnInfo& columnInfo,
                      bool& updateCPInfoPendingFlag, ColumnBufferSection* section);

  inline void updateCPMinMax(ColumnInfo& columnInfo, RID& lastInputRowInExtent, BLBufferStats& bufStats,
                             bool& updateCPInfoPendingFlag, ColumnBufferSection* section, uint32_t curRow);
  /** @brief Copy the overflow data
   */
  void copyOverflow(const BulkLoadBuffer& buffer);
@@ -263,6 +279,11 @@ class BulkLoadBuffer
    fStatusBLB = status;
  }

  void setParquetReader(std::shared_ptr<::arrow::RecordBatchReader> reader)
  {
    fParquetReader = reader;
  }

  /** @brief Try to lock a column for the buffer
   * TableInfo::fSyncUpdatesTI mutex should be locked when calling this
   * function (see fColumnLocks discussion).
@@ -273,6 +294,10 @@ class BulkLoadBuffer
                size_t* parse_length, RID& totalReadRows, RID& correctTotalRows,
                const boost::ptr_vector<ColumnInfo>& columnsInfo, unsigned int allowedErrCntThisCall);

  /** @brief Read the batch data into the buffer
   */
  int fillFromFileParquet(RID& totalReadRows, RID& correctTotalRows);

  /** @brief Read the table data into the buffer
   */
  int fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* handle, RID& totalRows, RID& correctTotalRows,
@@ -1657,6 +1657,41 @@ int ColumnInfo::closeDctnryStore(bool bAbort)
  return rc;
}

//--------------------------------------------------------------------------------------
// Update dictionary store file with string column parquet data, and return the assigned
// tokens (tokenBuf) to be stored in the corresponding column token file.
//--------------------------------------------------------------------------------------
int ColumnInfo::updateDctnryStoreParquet(std::shared_ptr<arrow::Array> columnData, int tokenPos, const int totalRow, char* tokenBuf)
{
  long long truncCount = 0;

#ifdef PROFILE
  Stats::startParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif
  boost::mutex::scoped_lock lock(fDictionaryMutex);
#ifdef PROFILE
  Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif

  int rc = fStore->insertDctnryParquet(columnData, tokenPos, totalRow, id, tokenBuf, truncCount, column.cs, column.weType);

  if (rc != NO_ERROR)
  {
    WErrorCodes ec;
    std::ostringstream oss;
    oss << "updateDctnryStoreParquet: error adding rows to store file for "
        << "OID-" << column.dctnry.dctnryOid << "; DBRoot-" << curCol.dataFile.fDbRoot << "; part-"
        << curCol.dataFile.fPartition << "; seg-" << curCol.dataFile.fSegment << "; " << ec.errorString(rc);
    fLog->logMsg(oss.str(), rc, MSGLVL_CRITICAL);
    fpTableInfo->fBRMReporter.addToErrMsgEntry(oss.str());
    return rc;
  }

  incSaturatedCnt(truncCount);

  return NO_ERROR;
}

//------------------------------------------------------------------------------
// Update dictionary store file with specified strings, and return the assigned
// tokens (tokenBuf) to be stored in the corresponding column token file.
@@ -200,6 +200,13 @@ class ColumnInfo
  */
  void lastInputRowInExtentInc();

  /** @brief Update dictionary for arrow/parquet format.
   * Parses and stores the parquet data into the store file, and
   * returns the assigned tokens (tokenBuf) to be stored in the
   * corresponding column token file.
   */
  int updateDctnryStoreParquet(std::shared_ptr<arrow::Array> columnData, int tokenPos, const int totalRow, char* tokenBuf);

  /** @brief Update dictionary method.
   * Parses and stores specified strings into the store file, and
   * returns the assigned tokens (tokenBuf) to be stored in the
@@ -55,6 +55,9 @@ using namespace querytele;
#include "oamcache.h"
#include "cacheutils.h"

#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>
namespace
{
const std::string BAD_FILE_SUFFIX = ".bad"; // Reject data file suffix
@@ -153,6 +156,8 @@ TableInfo::TableInfo(Log* logger, const BRM::TxnID txnID, const string& processN
  , fRejectErrCnt(0)
  , fExtentStrAlloc(tableOID, logger)
  , fOamCachePtr(oam::OamCache::makeOamCache())
  , fParquetReader(NULL)
  , fReader(nullptr)
{
  fBuffers.clear();
  fColumns.clear();
@@ -266,24 +271,44 @@ int TableInfo::readTableData()
{
  RID validTotalRows = 0;
  RID totalRowsPerInputFile = 0;
  int64_t totalRowsParquet = 0; // totalRowsParquet is used by a later function
                                // that needs the int64_t type
  int filesTBProcessed = fLoadFileList.size();
  int fileCounter = 0;
  unsigned long long qtSentAt = 0;

  if (fImportDataMode != IMPORT_DATA_PARQUET)
  {
    if (fHandle == NULL)
    {
      fFileName = fLoadFileList[fileCounter];
      int rc = openTableFile();

      if (rc != NO_ERROR)
      {
        // Mark the table status as error and exit.
        boost::mutex::scoped_lock lock(fSyncUpdatesTI);
        fStatusTI = WriteEngine::ERR;
        return rc;
      }
      fileCounter++;
    }
  }
  else
  {
    if (fParquetReader == NULL)
    {
      fFileName = fLoadFileList[fileCounter];
      int rc = openTableFileParquet(totalRowsParquet);
      if (rc != NO_ERROR)
      {
        // Mark the table status as error and exit.
        boost::mutex::scoped_lock lock(fSyncUpdatesTI);
        fStatusTI = WriteEngine::ERR;
        return rc;
      }
      fileCounter++;
    }
  }

  timeval readStart;
@@ -419,16 +444,23 @@
      // validTotalRows is ongoing total of valid rows read for all files
      // pertaining to this DB table.
      int readRc;
      if (fImportDataMode != IMPORT_DATA_PARQUET)
      {
        if (fReadFromS3)
        {
          readRc = fBuffers[readBufNo].fillFromMemory(fBuffers[prevReadBuf], fFileBuffer, fS3ReadLength,
                                                      &fS3ParseLength, totalRowsPerInputFile, validTotalRows,
                                                      fColumns, allowedErrCntThisCall);
        }
        else
        {
          readRc = fBuffers[readBufNo].fillFromFile(fBuffers[prevReadBuf], fHandle, totalRowsPerInputFile,
                                                    validTotalRows, fColumns, allowedErrCntThisCall);
        }
      }
      else
      {
        readRc = fBuffers[readBufNo].fillFromFileParquet(totalRowsPerInputFile, validTotalRows);
      }

      if (readRc != NO_ERROR)
@@ -530,7 +562,7 @@
      fCurrentReadBuffer = (fCurrentReadBuffer + 1) % fReadBufCount;

      // bufferCount++;
      if ((fHandle && feof(fHandle)) || (fReadFromS3 && (fS3ReadLength == fS3ParseLength)) || (totalRowsPerInputFile == (RID)totalRowsParquet))
      {
        timeval readFinished;
        gettimeofday(&readFinished, NULL);
@@ -567,7 +599,15 @@
      if (fileCounter < filesTBProcessed)
      {
        fFileName = fLoadFileList[fileCounter];
        int rc;
        if (fImportDataMode != IMPORT_DATA_PARQUET)
        {
          rc = openTableFile();
        }
        else
        {
          rc = openTableFileParquet(totalRowsParquet);
        }

        if (rc != NO_ERROR)
        {
@@ -1252,6 +1292,45 @@ void TableInfo::addColumn(ColumnInfo* info)
  fExtentStrAlloc.addColumn(info->column.mapOid, info->column.width, info->column.dataType);
}

int TableInfo::openTableFileParquet(int64_t &totalRowsParquet)
{
  if (fParquetReader != NULL)
    return NO_ERROR;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  try
  {
    PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(fFileName, arrow::default_memory_pool()));
    PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &fReader));
    fReader->set_batch_size(1000);
    PARQUET_THROW_NOT_OK(fReader->ScanContents({0}, 1000, &totalRowsParquet));
    PARQUET_THROW_NOT_OK(fReader->GetRecordBatchReader(&fParquetReader));
  }
  catch (std::exception& ex)
  {
    ostringstream oss;
    oss << "Error opening import file " << fFileName << ".";
    fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);

    return ERR_FILE_OPEN;
  }
  catch (...)
  {
    ostringstream oss;
    oss << "Error opening import file " << fFileName << ".";
    fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);

    return ERR_FILE_OPEN;
  }
  // initialize fBuffers batch source
  for (int i = 0; i < fReadBufCount; ++i)
  {
    fBuffers[i].setParquetReader(fParquetReader);
  }
  return NO_ERROR;
}
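
As a cross-check, the same Arrow/Parquet call sequence can be exercised in isolation. The sketch below is illustrative only: the file name, the main() wrapper, and the drain loop are assumptions, while the API calls (ReadableFile::Open, parquet::arrow::OpenFile, set_batch_size, GetRecordBatchReader) mirror the function above.

#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

#include <iostream>
#include <memory>

// Minimal sketch of the API sequence used by openTableFileParquet():
// open the file, wrap it in a parquet::arrow::FileReader, then stream
// record batches of up to 1000 rows. "example.parquet" is illustrative.
int main()
{
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(infile,
      arrow::io::ReadableFile::Open("example.parquet", arrow::default_memory_pool()));

  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  reader->set_batch_size(1000);

  std::shared_ptr<arrow::RecordBatchReader> batchReader;
  PARQUET_THROW_NOT_OK(reader->GetRecordBatchReader(&batchReader));

  std::shared_ptr<arrow::RecordBatch> batch;
  PARQUET_THROW_NOT_OK(batchReader->ReadNext(&batch));
  while (batch)
  {
    std::cout << "batch rows: " << batch->num_rows() << std::endl;
    PARQUET_THROW_NOT_OK(batchReader->ReadNext(&batch));
  }
  return 0;
}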

//------------------------------------------------------------------------------
// Open the file corresponding to fFileName so that we can import its contents.
// A buffer is also allocated and passed to setvbuf().
@@ -1331,24 +1410,32 @@ int TableInfo::openTableFile()
//------------------------------------------------------------------------------
void TableInfo::closeTableFile()
{
  if (fImportDataMode != IMPORT_DATA_PARQUET)
  {
    if (fHandle)
    {
      // If reading from stdin, we don't delete the buffer out from under
      // the file handle, because stdin is still open. This will cause a
      // memory leak, but when using stdin, we can only read in 1 table.
      // So it's not like we will be leaking multiple buffers for several
      // tables over the life of the job.
      if (!fReadFromStdin)
      {
        fclose(fHandle);
        delete[] fFileBuffer;
      }

      fHandle = 0;
    }
    else if (ms3)
    {
      ms3_free((uint8_t*)fFileBuffer);
    }
  }
  else
  {
    fReader.reset();
    fParquetReader.reset();
  }
}
@@ -30,6 +30,9 @@
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/uuid/uuid.hpp>

#include <arrow/api.h>
#include <parquet/arrow/reader.h>

#include <libmarias3/marias3.h>

#include "we_type.h"
@@ -170,22 +173,25 @@ class TableInfo : public WeUIDGID
  oam::OamCache* fOamCachePtr;             // OamCache: ptr is copyable
  boost::uuids::uuid fJobUUID;             // Job UUID
  std::vector<BRM::LBID_t> fDictFlushBlks; // dict blks to be flushed from cache

  std::shared_ptr<arrow::RecordBatchReader> fParquetReader; // Batch reader to read batches of data
  std::unique_ptr<parquet::arrow::FileReader> fReader;      // Reader to read the parquet file
  //--------------------------------------------------------------------------
  // Private Functions
  //--------------------------------------------------------------------------

  int changeTableLockState();             // Change state of table lock to cleanup
  void closeTableFile();                  // Close current tbl file; free buffer
  void closeOpenDbFiles();                // Close DB files left open at job's end
  int confirmDBFileChanges();             // Confirm DB file changes (on HDFS)
  void deleteTempDBFileChanges();         // Delete DB temp swap files (on HDFS)
  int finishBRM();                        // Finish reporting updates for BRM
  void freeProcessingBuffers();           // Free up Processing Buffers
  bool isBufferAvailable(bool report);    // Is tbl buffer available for reading
  int openTableFileParquet(int64_t &totalRowsParquet); // Open parquet data file and set batch reader for each buffer
  int openTableFile();                    // Open data file and set the buffer
  void reportTotals(double elapsedSec);   // Report summary totals
  void sleepMS(long int ms);              // Sleep method
  // Compare column HWM with the exemplar HWM.
  int compareHWMs(const int smallestColumnId, const int widerColumnId, const uint32_t smallerColumnWidth,
                  const uint32_t widerColumnWidth, const std::vector<DBRootExtentInfo>& segFileInfo,
@@ -35,6 +35,8 @@
#include <iostream>
using namespace std;

#include "bytestream.h"
#include "brmtypes.h"
#include "extentmap.h" // for DICT_COL_WIDTH
@@ -745,6 +747,365 @@ int Dctnry::insertDctnry2(Signature& sig)
  return NO_ERROR;
}

int Dctnry::insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
                          int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
                          const CHARSET_INFO* cs, const WriteEngine::ColType& weType)
{
  if (cs->mbmaxlen > 1)
  {
    // For TEXT columns, we truncate based on the number of bytes,
    // and not based on the number of characters, as for CHAR/VARCHAR
    // columns in the else block.
    if (weType == WriteEngine::WR_TEXT)
    {
      if (curSig.size > m_colWidth)
      {
        uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
        curSig.size = m_colWidth - truncate_point;
        truncCount++;
      }
    }
    else
    {
      const char* start = (const char*)curSig.signature;
      const char* end = (const char*)(curSig.signature + curSig.size);
      size_t numChars = cs->numchars(start, end);
      size_t maxCharLength = m_colWidth / cs->mbmaxlen;

      if (numChars > maxCharLength)
      {
        MY_STRCOPY_STATUS status;
        cs->well_formed_char_length(start, end, maxCharLength, &status);
        curSig.size = status.m_source_end_pos - start;
        truncCount++;
      }
    }
  }
  else // cs->mbmaxlen == 1
  {
    if (curSig.size > m_colWidth)
    {
      curSig.size = m_colWidth;
      truncCount++;
    }
  }

  //...Search for the string in our string cache
  //   if it fits into one block (< 8KB)
  if (curSig.size <= MAX_SIGNATURE_SIZE)
  {
    // Stats::startParseEvent("getTokenFromArray");
    found = getTokenFromArray(curSig);

    if (found)
    {
      memcpy(pOut + outOffset, &curSig.token, 8);
      outOffset += 8;
      startPos++;
      // Stats::stopParseEvent("getTokenFromArray");
      return NO_ERROR;
    }

    // Stats::stopParseEvent("getTokenFromArray");
  }

  totalUseSize = m_totalHdrBytes + curSig.size;

  //...String not found in cache, so proceed.
  //   If room is available in current block then insert into block.
  //   @bug 3960: Add MAX_OP_COUNT check to handle case after bulk rollback
  if (((totalUseSize <= m_freeSpace - HDR_UNIT_SIZE) ||
       ((curSig.size > 8176) && (m_freeSpace > HDR_UNIT_SIZE))) &&
      (m_curOp < (MAX_OP_COUNT - 1)))
  {
    RETURN_ON_ERROR(insertDctnry2(curSig)); // m_freeSpace updated!
    m_curBlock.state = BLK_WRITE;
    memcpy(pOut + outOffset, &curSig.token, 8);
    outOffset += 8;
    startPos++;
    found = true;

    //...If we have reached the limit for the number of strings allowed in
    //   a block, then we write the current block so that we can start
    //   another block.
    if (m_curOp >= MAX_OP_COUNT - 1)
    {
#ifdef PROFILE
      Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
      RETURN_ON_ERROR(writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo));
      m_curBlock.state = BLK_READ;
      next = true;
    }

    //...Add string to cache, if we have not exceeded cache limit.
    //   Don't cache big blobs
    if ((m_arraySize < MAX_STRING_CACHE_SIZE) && (curSig.size <= MAX_SIGNATURE_SIZE))
    {
      addToStringCache(curSig);
    }
  }
  else //...No room for this string in current block, so we write
       //   out the current block, so we can start another block
  {
#ifdef PROFILE
    Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
    RETURN_ON_ERROR(writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo));
    m_curBlock.state = BLK_READ;
    next = true;
    found = false;
  } // if m_freeSpace

  //..."next" flag is used to indicate that we need to advance to the
  //   next block in the store file.
  if (next)
  {
    memset(m_curBlock.data, 0, sizeof(m_curBlock.data));
    memcpy(m_curBlock.data, &m_dctnryHeader2, m_totalHdrBytes);
    m_freeSpace = BYTE_PER_BLOCK - m_totalHdrBytes;
    m_curBlock.state = BLK_WRITE;
    m_curOp = 0;
    next = false;
    m_lastFbo++;
    m_curFbo = m_lastFbo;

    //...Expand current extent if it is an abbreviated initial extent
    if ((m_curFbo == m_numBlocks) && (m_numBlocks == NUM_BLOCKS_PER_INITIAL_EXTENT))
    {
      RETURN_ON_ERROR(expandDctnryExtent());
    }

    //...Allocate a new extent if we have reached the last block in the
    //   current extent.
    if (m_curFbo == m_numBlocks)
    {
      // last block
      LBID_t startLbid;

      // Add an extent.
      RETURN_ON_ERROR(
          createDctnry(m_dctnryOID, m_colWidth, m_dbRoot, m_partition, m_segment, startLbid, false));

      if (m_logger)
      {
        std::ostringstream oss;
        oss << "Add dictionary extent OID-" << m_dctnryOID << "; DBRoot-" << m_dbRoot << "; part-"
            << m_partition << "; seg-" << m_segment << "; hwm-" << m_curFbo << "; LBID-" << startLbid
            << "; file-" << m_segFileName;
        m_logger->logMsg(oss.str(), MSGLVL_INFO2);
      }

      m_curLbid = startLbid;

      // now seek back to the curFbo, after adding an extent
      // @bug5769 For uncompressed only;
      // ChunkManager manages the file offset for the compression case
      if (m_compressionType == 0)
      {
#ifdef PROFILE
        Stats::startParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
        long long byteOffset = m_curFbo;
        byteOffset *= BYTE_PER_BLOCK;
        RETURN_ON_ERROR(setFileOffset(m_dFile, byteOffset));
#ifdef PROFILE
        Stats::stopParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
      }
    }
    else
    {
      // LBIDs are numbered collectively and consecutively within an
      // extent, so within an extent we can derive the LBID by simply
      // incrementing it rather than having to go back to BRM to look
      // up the LBID for each FBO.
      m_curLbid++;
    }

#ifdef PROFILE
    Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
    m_curBlock.lbid = m_curLbid;

    //..."found" flag indicates whether the string was already found
    //   "or" added to the end of the previous block. If false, then
    //   we need to add the string to the new block.
    if (!found)
    {
      RETURN_ON_ERROR(insertDctnry2(curSig)); // m_freeSpace updated!
      m_curBlock.state = BLK_WRITE;
      memcpy(pOut + outOffset, &curSig.token, 8);
      outOffset += 8;
      startPos++;

      //...Add string to cache, if we have not exceeded cache limit
      if ((m_arraySize < MAX_STRING_CACHE_SIZE) && (curSig.size <= MAX_SIGNATURE_SIZE))
      {
        addToStringCache(curSig);
      }
    }
  } // if next

  return NO_ERROR;
}
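
The CHAR/VARCHAR branch above truncates by character count rather than byte count. As a rough standalone illustration (UTF-8 only, assuming well-formed input; the real code goes through the collation handler via cs->numchars and well_formed_char_length), a character-capped truncation might look like this; truncateUtf8Chars is a hypothetical name.

#include <cstddef>
#include <string>

// Hypothetical illustration of the CHAR/VARCHAR truncation rule above:
// cap a UTF-8 string at maxChars characters (not bytes), never splitting
// a multi-byte sequence. Assumes well-formed UTF-8 input.
static std::string truncateUtf8Chars(const std::string& in, size_t maxChars)
{
  size_t bytes = 0, chars = 0;
  while (bytes < in.size() && chars < maxChars)
  {
    unsigned char lead = static_cast<unsigned char>(in[bytes]);
    // Determine the byte length of this code point from its lead byte.
    size_t seqLen = (lead < 0x80) ? 1 : (lead < 0xE0) ? 2 : (lead < 0xF0) ? 3 : 4;
    bytes += seqLen;  // skip the continuation bytes of this code point
    ++chars;
  }
  return in.substr(0, bytes);
}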

/*******************************************************************************
 * Description:
 * Used by bulk import to insert a batch of parquet strings into this store file.
 * Function assumes that the file is already positioned to the current block.
 *
 * PARAMETERS:
 *   input
 *     columnData  - arrow array containing the input strings
 *     startRowIdx - start position for the current batch of parquet data
 *     totalRow    - number of rows in columnData
 *     col         - column of strings to be parsed from columnData
 *   output
 *     tokenBuf    - tokens assigned to the inserted strings
 *
 * RETURN:
 *   success - successfully wrote the header to the block
 *   failure - did not write the header to the block
 ******************************************************************************/
int Dctnry::insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx,
                                const int totalRow, const int col, char* tokenBuf,
                                long long& truncCount, const CHARSET_INFO* cs,
                                const WriteEngine::ColType& weType)
{
#ifdef PROFILE
  Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
  int startPos = 0;
  int totalUseSize = 0;

  int outOffset = 0;
  const char* pIn;
  char* pOut = tokenBuf;
  Signature curSig;
  bool found = false;
  bool next = false;
  CommBlock cb;
  cb.file.oid = m_dctnryOID;
  cb.file.pFile = m_dFile;
  WriteEngine::Token nullToken;

  bool isNonNullArray = true;
  std::shared_ptr<arrow::BinaryArray> binaryArray;
  std::shared_ptr<arrow::FixedSizeBinaryArray> fixedSizeBinaryArray;

  if (columnData->type_id() != arrow::Type::type::FIXED_SIZE_BINARY)
    binaryArray = std::static_pointer_cast<arrow::BinaryArray>(columnData);
  else
    fixedSizeBinaryArray = std::static_pointer_cast<arrow::FixedSizeBinaryArray>(columnData);

  // Check whether the imported column data is a NULL array
  if (columnData->type_id() == arrow::Type::type::NA)
    isNonNullArray = false;

  //...Loop through all the rows for the specified column
  while (startPos < totalRow)
  {
    found = false;
    void* curSigPtr = static_cast<void*>(&curSig);
    memset(curSigPtr, 0, sizeof(curSig));

    // if this column is not null data
    if (isNonNullArray)
    {
      const uint8_t* data;

      // if (binaryArray != nullptr)
      // {
      //   data = binaryArray->GetValue(startPos + startRowIdx, &curSig.size);
      // }
      // else
      // {
      //   data = fixedSizeBinaryArray->GetValue(startPos + startRowIdx);
      //   std::shared_ptr<arrow::DataType> tType = fixedSizeBinaryArray->type();
      //   curSig.size = tType->byte_width();
      // }

      // Commenting out the line below and uncommenting the block above reproduces the error
      data = binaryArray->GetValue(startPos + startRowIdx, &curSig.size);

      const char* dataPtr = reinterpret_cast<const char*>(data);

      // Strip trailing null bytes '\0' (by adjusting curSig.size) if import-
      // ing in binary mode. If the entire string is binary zeros, then we
      // treat it as a NULL value.
      if (curSig.size > 0)
      {
        const char* fld = dataPtr;
        int kk = curSig.size - 1;

        for (; kk >= 0; kk--)
        {
          if (fld[kk] != '\0')
            break;
        }
        curSig.size = kk + 1;
      }

      // Read thread should validate against max size so that the entire row
      // can be rejected up front. Once we get here in the parsing thread,
      // it is too late to reject the row. However, as a precaution, we
      // still check against max size & set to null token if needed.
      if ((curSig.size == 0) || (curSig.size > MAX_BLOB_SIZE))
      {
        if (m_defVal.length() > 0) // use default string if available
        {
          pIn = m_defVal.str();
          curSig.signature = (unsigned char*)pIn;
          curSig.size = m_defVal.length();
        }
        else
        {
          memcpy(pOut + outOffset, &nullToken, 8);
          outOffset += 8;
          startPos++;
          continue;
        }
      }
      else
      {
        pIn = dataPtr;
        curSig.signature = (unsigned char*)pIn;
      }
    }
    else
    {
      curSig.size = 0;

      if (m_defVal.length() > 0) // use default string if available
      {
        pIn = m_defVal.str();
        curSig.signature = (unsigned char*)pIn;
        curSig.size = m_defVal.length();
      }
      else
      {
        memcpy(pOut + outOffset, &nullToken, 8);
        outOffset += 8;
        startPos++;
        continue;
      }
    }

    RETURN_ON_ERROR(insertDctnry1(curSig, found, pOut, outOffset, startPos, totalUseSize, cb, next, truncCount,
                                  cs, weType));
  }

#ifdef PROFILE
  Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
  // Done.
  // If any data is left over and not written by a subsequent call to
  // insertDctnry(), then it will be written by closeDctnry().

  return NO_ERROR;
}
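
The trailing-zero stripping above is the whole NULL-detection rule for binary string values: shrink the length to the last non-zero byte, and treat a resulting length of 0 as NULL. A minimal sketch of just that step (strippedLength is a hypothetical name):

#include <cstdint>

// Hypothetical illustration of the trailing-'\0' stripping above: shrink
// the length until the last non-zero byte; a length of 0 afterwards means
// the value was all binary zeros and is treated as NULL.
static int32_t strippedLength(const uint8_t* data, int32_t size)
{
  while (size > 0 && data[size - 1] == '\0')
    --size;
  return size;  // 0 => treat as NULL
}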

/*******************************************************************************
 * Description:
 * Used by bulk import to insert a collection of strings into this store file.
@@ -838,201 +1199,8 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow,
      curSig.signature = (unsigned char*)pIn;
    }

    RETURN_ON_ERROR(insertDctnry1(curSig, found, pOut, outOffset, startPos, totalUseSize, cb, next, truncCount,
                                  cs, weType));
  } // end while

#ifdef PROFILE
@@ -37,6 +37,8 @@
#include "bytestream.h"
#include "nullstring.h"

#include <arrow/api.h>

#define EXPORT

/** Namespace WriteEngine */
@@ -157,6 +159,20 @@ class Dctnry : public DbFileOp
 */
EXPORT int insertDctnry(const int& sgnature_size, const unsigned char* sgnature_value, Token& token);

/**
 * @brief Insert signature values to file blocks and return their tokens/pointers
 * (for Bulk use)
 *
 * @param columnData  - arrow array containing strings to be parsed
 * @param startRowIdx - start position for the current batch of parquet data
 * @param totalRow    - total number of rows in columnData
 * @param col         - the column to be parsed from columnData
 * @param tokenBuf    - (output) list of tokens for the parsed strings
 * @param truncCount  - (output) running count of truncated values
 */
EXPORT int insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx, const int totalRow,
                               const int col, char* tokenBuf, long long& truncCount,
                               const CHARSET_INFO* cs, const WriteEngine::ColType& weType);

/**
 * @brief Insert a signature value to a file block and return token/pointer
 * (for Bulk use)
@@ -280,6 +296,9 @@ class Dctnry : public DbFileOp
  // insertDctnryHdr inserts the new value info into the header.
  // insertSgnture inserts the new value into the block.
  //
  int insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
                    int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
                    const CHARSET_INFO* cs, const WriteEngine::ColType& weType);
  int insertDctnry2(Signature& sig);
  void insertDctnryHdr(unsigned char* blockBuf, const int& size);
  void insertSgnture(unsigned char* blockBuf, const int& size, unsigned char* value);
@@ -113,7 +113,7 @@ const int ERR_COMPBASE = 1650; // Compression errors
const int ERR_AUTOINCBASE = 1700;  // Auto-increment errors
const int ERR_BLKCACHEBASE = 1750; // Block cache flush errors
const int ERR_METABKUPBASE = 1800; // Backup bulk meta file errors

const int ERR_PARQUETBASE = 1850;  // Parquet importing errors
//--------------------------------------------------------------------------
// Generic error
//--------------------------------------------------------------------------
@@ -152,6 +152,7 @@ const int ERR_FILE_GLOBBING = ERR_FILEBASE + 19; // Error globbing a file
const int ERR_FILE_EOF = ERR_FILEBASE + 20;       // EOF
const int ERR_FILE_CHOWN = ERR_FILEBASE + 21;     // Error changing file ownership
const int ERR_INTERNAL = ERR_FILEBASE + 22;       // Internal error
const int ERR_FILE_TYPE_DIFF = ERR_FILEBASE + 23; // Import files are of different types

//--------------------------------------------------------------------------
// XML level error
@@ -389,6 +390,11 @@ const int ERR_METADATABKUP_COMP_READ_BULK_BKUP =
    ERR_METABKUPBASE + 7;                                      // Error reading from backup chunk file
const int ERR_METADATABKUP_COMP_RENAME = ERR_METABKUPBASE + 8; // Error renaming chunk file

//--------------------------------------------------------------------------
// Parquet errors when importing
//--------------------------------------------------------------------------
const int ERR_PARQUET_AUX = ERR_PARQUETBASE + 1; // Error when creating aux column for parquet file

//------------------------------------------------------------------------------
// Class used to convert an error code to a corresponding error message string
//------------------------------------------------------------------------------
@@ -137,11 +137,13 @@ enum BulkModeType
// Import Mode 0-text Import (default)
//             1-Binary Import with NULL values
//             2-Binary Import with saturated NULL values
//             3-Import from parquet file
enum ImportDataMode
{
  IMPORT_DATA_TEXT = 0,
  IMPORT_DATA_BIN_ACCEPT_NULL = 1,
  IMPORT_DATA_BIN_SAT_NULL = 2,
  IMPORT_DATA_PARQUET = 3
};

/**