MCOL-5505 add parquet support for cpimport and add mcs_parquet_ddl and mcs_parquet_gen tools
This commit is contained in: parent 94a680ea60, commit fe597ec78c
@@ -0,0 +1,77 @@
DROP DATABASE IF EXISTS mcol_5505_parquet_large_volume;
CREATE DATABASE mcol_5505_parquet_large_volume;
USE mcol_5505_parquet_large_volume;
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t2(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t3(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
Create TABLE t4(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;
SELECT * FROM t1 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t1;
COUNT(*)
1000000
SELECT * FROM t2 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t2;
COUNT(*)
10000000
SELECT * FROM t3 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t3;
COUNT(*)
50000000
SELECT * FROM t4 ORDER BY col1 LIMIT 5;
col1 col2 col3 col4 col5 col6
0 0000-00-00 00:00:00.000 hhhh 12345678909876543.2112345678 2.5 hhhh
1 1970-01-01 10:46:40.001 hhhh 12345678909876543.2112345678 3.5 hhhh
2 1970-01-01 13:33:20.002 hhhh 12345678909876543.2112345678 4.5 hhhh
3 1970-01-01 16:20:00.003 hhhh 12345678909876543.2112345678 5.5 hhhh
4 1970-01-01 19:06:40.004 hhhh 12345678909876543.2112345678 6.5 hhhh
SELECT COUNT(*) FROM t4;
COUNT(*)
100000000
DROP DATABASE mcol_5505_parquet_large_volume;
mysql-test/columnstore/basic/r/mcol-5505-cpimport-parquet.result (new file, 100 lines)
@@ -0,0 +1,100 @@
DROP DATABASE IF EXISTS mcol_5505_cpimport_parquet;
CREATE DATABASE mcol_5505_cpimport_parquet;
USE mcol_5505_cpimport_parquet;
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 BIGINT,
col3 FLOAT,
col4 DOUBLE,
col5 TIME(3),
col6 VARCHAR(2),
col7 VARCHAR(5),
col8 VARCHAR(20),
col9 CHAR(2),
col10 CHAR(5),
col11 CHAR(20),
col12 TIMESTAMP(3),
col13 DATE,
col14 DATETIME(3),
col15 SMALLINT,
col16 TINYINT,
col17 DECIMAL(9,3),
col18 INT UNSIGNED,
col19 SMALLINT UNSIGNED,
col20 TINYINT UNSIGNED,
col21 BIGINT UNSIGNED,
col22 BOOLEAN,
col23 DECIMAL(38,10),
col24 TIME(6),
col25 TIMESTAMP(6),
col26 DATETIME(6),
col27 CHAR(4),
col28 CHAR(4)
) ENGINE=Columnstore;
SELECT * FROM t1;
col1 col2 col3 col4 col5 col6 col7 col8 col9 col10 col11 col12 col13 col14 col15 col16 col17 col18 col19 col20 col21 col22 col23 col24 col25 col26 col27 col28
0 0 1.5 2.5 00:00:00.000 a a a a a a 0000-00-00 00:00:00.000 1970-01-01 1970-01-01 00:00:00.000 0 0 1383.433 0 0 0 0 1 12345678909876543.2112345678 00:00:00.000000 0000-00-00 00:00:00.000000 1970-01-01 00:00:00.000000 a abcd
NULL NULL 2.5 3.5 01:00:05.001 NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.001 1970-01-11 1970-01-01 02:46:40.001 1 1 NULL NULL 1 1 NULL 1 12345678909876543.2112345678 01:00:05.000001 1970-01-01 10:46:40.000001 1970-01-01 02:46:40.000001 ab abcd
NULL NULL NULL 4.5 02:00:10.002 ab abcd abcd ab abcd abcd 1970-01-01 13:33:20.002 1970-01-21 1970-01-01 05:33:20.002 2 2 532235.234 NULL 2 2 NULL 1 12345678909876543.2112345678 02:00:10.000002 1970-01-01 13:33:20.000002 1970-01-01 05:33:20.000002 abcd abcd
NULL NULL 4.5 NULL 03:00:15.003 ab abcde abcde ab abcde abcde 1970-01-01 16:20:00.003 1970-01-31 1970-01-01 08:20:00.003 3 3 NULL NULL 3 3 NULL 1 12345678909876543.2112345678 03:00:15.000003 1970-01-01 16:20:00.000003 1970-01-01 08:20:00.000003 abcd abcd
4 4 5.5 6.5 04:00:20.004 ab abcde abcdefg ab abcde abcdefg 1970-01-01 19:06:40.004 1970-02-10 1970-01-01 11:06:40.004 4 4 5325.234 4 4 4 4 1 12345678909876543.2112345678 04:00:20.000004 1970-01-01 19:06:40.000004 1970-01-01 11:06:40.000004 abcd abcd
5 5 6.5 7.5 05:00:25.005 Wh Whlg1 Whlg1xXAxP Wh Whlg1 Whlg1xXAxP 1970-01-01 21:53:20.005 1970-02-20 1970-01-01 13:53:20.005 5 5 NULL 5 5 5 5 0 12345678909876543.2112345678 05:00:25.000005 1970-01-01 21:53:20.000005 1970-01-01 13:53:20.000005 Whlg abcd
6 6 7.5 8.5 06:00:30.006 4N 4Nimz 4NimzSQzMD 4N 4Nimz 4NimzSQzMD 1970-01-02 00:40:00.006 1970-03-02 1970-01-01 16:40:00.006 6 6 1383.433 6 6 6 6 1 12345678909876543.2112345678 06:00:30.000006 1970-01-02 00:40:00.000006 1970-01-01 16:40:00.000006 4Nim abcd
7 7 8.5 9.5 07:00:35.007 G2 G23ne G23ne3j92Ky0wBF G2 G23ne G23ne3j92Ky0wBF 1970-01-02 03:26:40.007 1970-03-12 1970-01-01 19:26:40.007 7 7 NULL 7 7 7 7 1 12345678909876543.2112345678 07:00:35.000007 1970-01-02 03:26:40.000007 1970-01-01 19:26:40.000007 G23n abcd
8 8 9.5 10.5 08:00:40.008 F4 F4z F4z F4 F4z F4z 1970-01-02 06:13:20.008 1970-03-22 1970-01-01 22:13:20.008 8 8 532235.234 8 8 8 8 1 12345678909876543.2112345678 08:00:40.000008 1970-01-02 06:13:20.000008 1970-01-01 22:13:20.000008 F4z abcd
9 9 10.5 11.5 09:00:45.009 8J 8JCVT 8JCVTsGYB7V 8J 8JCVT 8JCVTsGYB7V 1970-01-02 09:00:00.009 1970-04-01 1970-01-02 01:00:00.009 9 9 NULL 9 9 9 9 1 12345678909876543.2112345678 09:00:45.000009 1970-01-02 09:00:00.000009 1970-01-02 01:00:00.000009 8JCV abcd
10 10 11.5 12.5 10:00:50.010 23 23235 23235 23 23235 23235 1970-01-02 11:46:40.010 1970-04-11 1970-01-02 03:46:40.010 10 10 5325.234 10 10 10 10 1 12345678909876543.2112345678 10:00:50.000010 1970-01-02 11:46:40.000010 1970-01-02 03:46:40.000010 2323 abcd
11 11 12.5 13.5 11:00:55.011 sd sda22 sda22 sd sda22 sda22 1970-01-02 14:33:20.011 1970-04-21 1970-01-02 06:33:20.011 11 11 NULL 11 11 11 11 1 12345678909876543.2112345678 11:00:55.000011 1970-01-02 14:33:20.000011 1970-01-02 06:33:20.000011 sda2 abcd
12 12 13.5 14.5 12:01:00.012 SD SD7sd SD7sdFD7 SD SD7sd SD7sdFD7 1970-01-02 17:20:00.012 1970-05-01 1970-01-02 09:20:00.012 12 12 1383.433 12 12 12 12 1 12345678909876543.2112345678 12:01:00.000012 1970-01-02 17:20:00.000012 1970-01-02 09:20:00.000012 SD7s abcd
13 13 14.5 15.5 13:01:05.013 gv gvv3h gvv3hYwdfOD gv gvv3h gvv3hYwdfOD 1970-01-02 20:06:40.013 1970-05-11 1970-01-02 12:06:40.013 13 13 NULL 13 13 13 13 1 12345678909876543.2112345678 13:01:05.000013 1970-01-02 20:06:40.000013 1970-01-02 12:06:40.000013 gvv3 abcd
14 14 15.5 16.5 14:01:10.014 y8 y8wjo y8wjo4v50s6 y8 y8wjo y8wjo4v50s6 1970-01-02 22:53:20.014 1970-05-21 1970-01-02 14:53:20.014 14 14 532235.234 14 14 14 14 1 12345678909876543.2112345678 14:01:10.000014 1970-01-02 22:53:20.000014 1970-01-02 14:53:20.000014 y8wj abcd
15 15 16.5 17.5 15:01:15.015 aN aNJW5 aNJW56SJieE8KVV aN aNJW5 aNJW56SJieE8KVV 1970-01-03 01:40:00.015 1970-05-31 1970-01-02 17:40:00.015 15 15 NULL 15 15 15 15 1 12345678909876543.2112345678 15:01:15.000015 1970-01-03 01:40:00.000015 1970-01-02 17:40:00.000015 aNJW abcd
16 16 17.5 18.5 16:01:20.016 1+ 1+2=3 1+2=3 1+ 1+2=3 1+2=3 1970-01-03 04:26:40.016 1970-06-10 1970-01-02 20:26:40.016 16 16 5325.234 16 16 16 16 1 12345678909876543.2112345678 16:01:20.000016 1970-01-03 04:26:40.000016 1970-01-02 20:26:40.000016 1+2= abcd
17 17 18.5 19.5 17:01:25.017 He Hello Hello World! He Hello Hello World! 1970-01-03 07:13:20.017 1970-06-20 1970-01-02 23:13:20.017 17 17 NULL 17 17 17 17 1 12345678909876543.2112345678 17:01:25.000017 1970-01-03 07:13:20.000017 1970-01-02 23:13:20.000017 Hell abcd
18 18 19.5 20.5 18:01:30.018 1! 1!!!1 1!!!1 1! 1!!!1 1!!!1 1970-01-03 10:00:00.018 1970-06-30 1970-01-03 02:00:00.018 18 18 1383.433 18 18 18 18 1 12345678909876543.2112345678 18:01:30.000018 1970-01-03 10:00:00.000018 1970-01-03 02:00:00.000018 1!!! abcd
19 19 20.5 21.5 19:01:35.019 82 82440 824407880313877 82 82440 824407880313877 1970-01-03 12:46:40.019 1970-07-10 1970-01-03 04:46:40.019 19 19 NULL 19 19 19 19 1 12345678909876543.2112345678 19:01:35.000019 1970-01-03 12:46:40.000019 1970-01-03 04:46:40.000019 8244 abcd
20 20 21.5 22.5 20:01:40.020 19 1970- 1970-01-01 08:02:23 19 1970- 1970-01-01 08:02:23 1970-01-03 15:33:20.020 1970-07-20 1970-01-03 07:33:20.020 20 20 532235.234 20 20 20 20 1 12345678909876543.2112345678 20:01:40.000020 1970-01-03 15:33:20.000020 1970-01-03 07:33:20.000020 1970 abcd
21 21 22.5 23.5 21:01:45.021 19 1970- 1970-05-31 19 1970- 1970-05-31 1970-01-03 18:20:00.021 1970-07-30 1970-01-03 10:20:00.021 21 21 NULL 21 21 21 21 1 12345678909876543.2112345678 21:01:45.000021 1970-01-03 18:20:00.000021 1970-01-03 10:20:00.000021 1970 abcd
22 22 23.5 24.5 22:01:50.022 xx xxx xxx xx xxx xxx 1970-01-03 21:06:40.022 1970-08-09 1970-01-03 13:06:40.022 22 22 5325.234 22 22 22 22 1 12345678909876543.2112345678 22:01:50.000022 1970-01-03 21:06:40.000022 1970-01-03 13:06:40.000022 xxx abcd
23 23 24.5 25.5 23:01:55.023 ON ONMKM ONMKMQVBRWBUTWT ON ONMKM ONMKMQVBRWBUTWT 1970-01-03 23:53:20.023 1970-08-19 1970-01-03 15:53:20.023 23 23 NULL 23 23 23 23 1 12345678909876543.2112345678 23:01:55.000023 1970-01-03 23:53:20.000023 1970-01-03 15:53:20.000023 ONMK abcd
24 24 25.5 26.5 24:02:00.024 ZW ZWMWH ZWMWHSEZDYODQWP ZW ZWMWH ZWMWHSEZDYODQWP 1970-01-04 02:40:00.024 1970-08-29 1970-01-03 18:40:00.024 24 24 1383.433 24 24 24 24 1 12345678909876543.2112345678 24:02:00.000024 1970-01-04 02:40:00.000024 1970-01-03 18:40:00.000024 ZWMW abcd
25 25 26.5 27.5 25:02:05.025 Ho HoCYp HoCYpJ Ho HoCYp HoCYpJ 1970-01-04 05:26:40.025 1970-09-08 1970-01-03 21:26:40.025 25 25 NULL 25 25 25 25 1 12345678909876543.2112345678 25:02:05.000025 1970-01-04 05:26:40.000025 1970-01-03 21:26:40.000025 HoCY abcd
26 26 27.5 28.5 26:02:10.026 -1 -100 -100 -1 -100 -100 1970-01-04 08:13:20.026 1970-09-18 1970-01-04 00:13:20.026 26 26 532235.234 26 26 26 26 1 12345678909876543.2112345678 26:02:10.000026 1970-01-04 08:13:20.000026 1970-01-04 00:13:20.000026 -100 abcd
27 27 28.5 29.5 27:02:15.027 Iq Iqa8N Iqa8Nr Iq Iqa8N Iqa8Nr 1970-01-04 11:00:00.027 1970-09-28 1970-01-04 03:00:00.027 27 27 NULL 27 27 27 27 1 12345678909876543.2112345678 27:02:15.000027 1970-01-04 11:00:00.000027 1970-01-04 03:00:00.000027 Iqa8 abcd
28 28 29.5 30.5 28:02:20.028 nD nD274 nD274v nD nD274 nD274v 1970-01-04 13:46:40.028 1970-10-08 1970-01-04 05:46:40.028 28 28 5325.234 28 28 28 28 1 12345678909876543.2112345678 28:02:20.000028 1970-01-04 13:46:40.000028 1970-01-04 05:46:40.000028 nD27 abcd
-2147483646 2147483648 30.5 31.5 29:02:25.029 6y 6y0Jy 6y0JyW 6y 6y0Jy 6y0JyW 1970-01-04 16:33:20.029 1970-10-18 1970-01-04 08:33:20.029 29 29 NULL 2147483648 29 29 2147483648 1 12345678909876543.2112345678 29:02:25.000029 1970-01-04 16:33:20.000029 1970-01-04 08:33:20.000029 6y0J abcd
NULL NULL NULL NULL NULL NULL NULL a NULL NULL a 0000-00-00 00:00:00.000 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 0000-00-00 00:00:00.000000 1970-01-01 00:00:00.000000 NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.001 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 10:46:40.000001 1970-01-01 02:46:40.000001 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcd NULL NULL abcd 1970-01-01 13:33:20.002 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 13:33:20.000002 1970-01-01 05:33:20.000002 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcde NULL NULL abcde 1970-01-01 16:20:00.003 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 16:20:00.000003 1970-01-01 08:20:00.000003 NULL NULL
NULL NULL NULL NULL NULL NULL NULL abcdefg NULL NULL abcdefg 1970-01-01 19:06:40.004 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 19:06:40.000004 1970-01-01 11:06:40.000004 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Whlg1xXAxP NULL NULL Whlg1xXAxP 1970-01-01 21:53:20.005 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-01 21:53:20.000005 1970-01-01 13:53:20.000005 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 4NimzSQzMD NULL NULL 4NimzSQzMD 1970-01-02 00:40:00.006 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 00:40:00.000006 1970-01-01 16:40:00.000006 NULL NULL
NULL NULL NULL NULL NULL NULL NULL G23ne3j92Ky0wBF NULL NULL G23ne3j92Ky0wBF 1970-01-02 03:26:40.007 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 03:26:40.000007 1970-01-01 19:26:40.000007 NULL NULL
NULL NULL NULL NULL NULL NULL NULL F4z NULL NULL F4z 1970-01-02 06:13:20.008 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 06:13:20.000008 1970-01-01 22:13:20.000008 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 8JCVTsGYB7V NULL NULL 8JCVTsGYB7V 1970-01-02 09:00:00.009 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 09:00:00.000009 1970-01-02 01:00:00.000009 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 23235 NULL NULL 23235 1970-01-02 11:46:40.010 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 11:46:40.000010 1970-01-02 03:46:40.000010 NULL NULL
NULL NULL NULL NULL NULL NULL NULL sda22 NULL NULL sda22 1970-01-02 14:33:20.011 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 14:33:20.000011 1970-01-02 06:33:20.000011 NULL NULL
NULL NULL NULL NULL NULL NULL NULL SD7sdFD7 NULL NULL SD7sdFD7 1970-01-02 17:20:00.012 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 17:20:00.000012 1970-01-02 09:20:00.000012 NULL NULL
NULL NULL NULL NULL NULL NULL NULL gvv3hYwdfOD NULL NULL gvv3hYwdfOD 1970-01-02 20:06:40.013 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 20:06:40.000013 1970-01-02 12:06:40.000013 NULL NULL
NULL NULL NULL NULL NULL NULL NULL y8wjo4v50s6 NULL NULL y8wjo4v50s6 1970-01-02 22:53:20.014 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-02 22:53:20.000014 1970-01-02 14:53:20.000014 NULL NULL
NULL NULL NULL NULL NULL NULL NULL aNJW56SJieE8KVV NULL NULL aNJW56SJieE8KVV 1970-01-03 01:40:00.015 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 01:40:00.000015 1970-01-02 17:40:00.000015 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1+2=3 NULL NULL 1+2=3 1970-01-03 04:26:40.016 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 04:26:40.000016 1970-01-02 20:26:40.000016 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Hello World! NULL NULL Hello World! 1970-01-03 07:13:20.017 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 07:13:20.000017 1970-01-02 23:13:20.000017 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1!!!1 NULL NULL 1!!!1 1970-01-03 10:00:00.018 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 10:00:00.000018 1970-01-03 02:00:00.000018 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 824407880313877 NULL NULL 824407880313877 1970-01-03 12:46:40.019 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 12:46:40.000019 1970-01-03 04:46:40.000019 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1970-01-01 08:02:23 NULL NULL 1970-01-01 08:02:23 1970-01-03 15:33:20.020 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 15:33:20.000020 1970-01-03 07:33:20.000020 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 1970-05-31 NULL NULL 1970-05-31 1970-01-03 18:20:00.021 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 18:20:00.000021 1970-01-03 10:20:00.000021 NULL NULL
NULL NULL NULL NULL NULL NULL NULL xxx NULL NULL xxx 1970-01-03 21:06:40.022 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 21:06:40.000022 1970-01-03 13:06:40.000022 NULL NULL
NULL NULL NULL NULL NULL NULL NULL ONMKMQVBRWBUTWT NULL NULL ONMKMQVBRWBUTWT 1970-01-03 23:53:20.023 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-03 23:53:20.000023 1970-01-03 15:53:20.000023 NULL NULL
NULL NULL NULL NULL NULL NULL NULL ZWMWHSEZDYODQWP NULL NULL ZWMWHSEZDYODQWP 1970-01-04 02:40:00.024 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 02:40:00.000024 1970-01-03 18:40:00.000024 NULL NULL
NULL NULL NULL NULL NULL NULL NULL HoCYpJ NULL NULL HoCYpJ 1970-01-04 05:26:40.025 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 05:26:40.000025 1970-01-03 21:26:40.000025 NULL NULL
NULL NULL NULL NULL NULL NULL NULL -100 NULL NULL -100 1970-01-04 08:13:20.026 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 08:13:20.000026 1970-01-04 00:13:20.000026 NULL NULL
NULL NULL NULL NULL NULL NULL NULL Iqa8Nr NULL NULL Iqa8Nr 1970-01-04 11:00:00.027 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 11:00:00.000027 1970-01-04 03:00:00.000027 NULL NULL
NULL NULL NULL NULL NULL NULL NULL nD274v NULL NULL nD274v 1970-01-04 13:46:40.028 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 13:46:40.000028 1970-01-04 05:46:40.000028 NULL NULL
NULL NULL NULL NULL NULL NULL NULL 6y0JyW NULL NULL 6y0JyW 1970-01-04 16:33:20.029 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1970-01-04 16:33:20.000029 1970-01-04 08:33:20.000029 NULL NULL
SELECT COUNT(*) FROM t1;
COUNT(*)
60
DROP DATABASE mcol_5505_cpimport_parquet;
mysql-test/columnstore/basic/r/mcol-5505-parquet-ddl.result (new file, 36 lines)
@@ -0,0 +1,36 @@
DROP DATABASE IF EXISTS mcol_5505_parquet_ddl;
CREATE DATABASE mcol_5505_parquet_ddl;
USE mcol_5505_parquet_ddl;
SHOW CREATE TABLE parquet_test_table;
Table Create Table
parquet_test_table CREATE TABLE `parquet_test_table` (
`col1` int(11) DEFAULT NULL,
`col2` bigint(20) DEFAULT NULL,
`col3` float DEFAULT NULL,
`col4` double DEFAULT NULL,
`col5` time(3) DEFAULT NULL,
`col6` varchar(2000) DEFAULT NULL,
`col7` varchar(2000) DEFAULT NULL,
`col8` varchar(2000) DEFAULT NULL,
`col9` varchar(2000) DEFAULT NULL,
`col10` varchar(2000) DEFAULT NULL,
`col11` varchar(2000) DEFAULT NULL,
`col12` timestamp(3) NULL DEFAULT NULL,
`col13` date DEFAULT NULL,
`col14` timestamp(3) NULL DEFAULT NULL,
`col15` smallint(6) DEFAULT NULL,
`col16` tinyint(4) DEFAULT NULL,
`col17` decimal(9,3) DEFAULT NULL,
`col18` int(10) unsigned DEFAULT NULL,
`col19` smallint(5) unsigned DEFAULT NULL,
`col20` tinyint(3) unsigned DEFAULT NULL,
`col21` bigint(20) unsigned DEFAULT NULL,
`col22` tinyint(1) DEFAULT NULL,
`col23` decimal(38,10) DEFAULT NULL,
`col24` time(6) DEFAULT NULL,
`col25` timestamp(6) NULL DEFAULT NULL,
`col26` timestamp(6) NULL DEFAULT NULL,
`col27` varbinary(8000) DEFAULT NULL,
`col28` char(4) DEFAULT NULL
) ENGINE=Columnstore DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci
DROP DATABASE mcol_5505_parquet_ddl;
@@ -0,0 +1,82 @@
#
# parquet support for large volume data file
# Author: Bin Ruan, binruan0227@gmail.com
#
if (!$MYSQL_TEST_ROOT){
skip Should be run by root to execute cpimport;
}

-- source ../include/have_columnstore.inc

--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_parquet_large_volume;
--enable_warnings

CREATE DATABASE mcol_5505_parquet_large_volume;
USE mcol_5505_parquet_large_volume;
SET time_zone = '+8:00';
# Create table
Create TABLE t1(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

Create TABLE t2(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

Create TABLE t3(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

Create TABLE t4(
col1 INT,
col2 TIMESTAMP(3),
col3 CHAR(6),
col4 DECIMAL(38,10),
col5 DOUBLE,
col6 VARCHAR(20)
) ENGINE=Columnstore;

# Generate data
--exec mcs_parquet_gen -l -f $MTR_SUITE_DIR/../std_data


# Valid data and table
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t1 $MTR_SUITE_DIR/../std_data/1MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t2 $MTR_SUITE_DIR/../std_data/10MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t3 $MTR_SUITE_DIR/../std_data/50MRows.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_parquet_large_volume t4 $MTR_SUITE_DIR/../std_data/100MRows.parquet >/dev/null

SELECT * FROM t1 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t1;

SELECT * FROM t2 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t2;

SELECT * FROM t3 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t3;

SELECT * FROM t4 ORDER BY col1 LIMIT 5;
SELECT COUNT(*) FROM t4;

# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/1MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/10MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/50MRows.parquet
--exec rm $MTR_SUITE_DIR/../std_data/100MRows.parquet
DROP DATABASE mcol_5505_parquet_large_volume;
@@ -0,0 +1,64 @@
#
# Check the parquet support for different data types
# Author: Bin Ruan, binruan0227@gmail.com
#
if (!$MYSQL_TEST_ROOT){
skip Should be run by root to execute cpimport;
}

-- source ../include/have_columnstore.inc

--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_cpimport_parquet;
--enable_warnings

CREATE DATABASE mcol_5505_cpimport_parquet;
USE mcol_5505_cpimport_parquet;
# Create table
SET time_zone = '+8:00';
Create TABLE t1(
col1 INT,
col2 BIGINT,
col3 FLOAT,
col4 DOUBLE,
col5 TIME(3),
col6 VARCHAR(2),
col7 VARCHAR(5),
col8 VARCHAR(20),
col9 CHAR(2),
col10 CHAR(5),
col11 CHAR(20),
col12 TIMESTAMP(3),
col13 DATE,
col14 DATETIME(3),
col15 SMALLINT,
col16 TINYINT,
col17 DECIMAL(9,3),
col18 INT UNSIGNED,
col19 SMALLINT UNSIGNED,
col20 TINYINT UNSIGNED,
col21 BIGINT UNSIGNED,
col22 BOOLEAN,
col23 DECIMAL(38,10),
col24 TIME(6),
col25 TIMESTAMP(6),
col26 DATETIME(6),
col27 CHAR(4),
col28 CHAR(4)
) ENGINE=Columnstore;

# Generate data
--exec mcs_parquet_gen -a -f $MTR_SUITE_DIR/../std_data


# Valid data and table
--exec $MCS_CPIMPORT mcol_5505_cpimport_parquet t1 $MTR_SUITE_DIR/../std_data/tests.parquet >/dev/null
--exec $MCS_CPIMPORT mcol_5505_cpimport_parquet t1 $MTR_SUITE_DIR/../std_data/nulls.parquet >/dev/null

SELECT * FROM t1;
SELECT COUNT(*) FROM t1;

# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/tests.parquet
--exec rm $MTR_SUITE_DIR/../std_data/nulls.parquet
DROP DATABASE mcol_5505_cpimport_parquet;
mysql-test/columnstore/basic/t/mcol-5505-parquet-ddl.test (new file, 33 lines)
@@ -0,0 +1,33 @@
#
# check mcs_parquet_ddl tool
# Author: Bin Ruan, binruan0227@gmail.com
#
-- source ../include/have_columnstore.inc

--disable_warnings
DROP DATABASE IF EXISTS mcol_5505_parquet_ddl;
--enable_warnings

--disable_result_log
--exec mcs_parquet_gen -a -f $MTR_SUITE_DIR/../std_data
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/tests.parquet $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl
# Wrong source file type
--error 3
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/int8.par $MTR_SUITE_DIR/../std_data/int8table.ddl
# Wrong number of argument files
--error 4
--exec mcs_parquet_ddl $MTR_SUITE_DIR/../std_data/tests.parquet
--enable_result_log

# Create table
CREATE DATABASE mcol_5505_parquet_ddl;

--exec $MYSQL mcol_5505_parquet_ddl < $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl >/dev/null
USE mcol_5505_parquet_ddl;
SHOW CREATE TABLE parquet_test_table;

# Clean up
--exec rm $MTR_SUITE_DIR/../std_data/tests.parquet
--exec rm $MTR_SUITE_DIR/../std_data/nulls.parquet
--exec rm $MTR_SUITE_DIR/../std_data/parquet_test_table.ddl
DROP DATABASE mcol_5505_parquet_ddl;
@@ -13,3 +13,5 @@ add_subdirectory(idbmeminfo)
add_subdirectory(rebuildEM)
add_subdirectory(passwd)
add_subdirectory(configMgt)
add_subdirectory(parquetGen)
add_subdirectory(parquetDDL)
tools/parquetDDL/CMakeLists.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
include_directories(${ENGINE_COMMON_INCLUDES})

set(parquetDDL_SRCS main.cpp)
add_executable(mcs_parquet_ddl ${parquetDDL_SRCS})
target_link_libraries(mcs_parquet_ddl arrow parquet)
install(TARGETS mcs_parquet_ddl DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)
tools/parquetDDL/main.cpp (new file, 285 lines)
@@ -0,0 +1,285 @@
#include <iostream>
#include <string>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/exception.h>
#include <parquet/arrow/reader.h>
#include <vector>
#include <fstream>
#include <unistd.h>

enum STATUS_CODE
{
  NO_ERROR,
  EMPTY_FIELD,
  UNSUPPORTED_DATA_TYPE,
  UNSUPPORTED_FILE_TYPE,
  FILE_NUM_ERROR
};

/**
 * print the usage information
 */
static void usage()
{
  std::cout << "usage: " << std::endl;
  std::cout << "Read a parquet file and output the corresponding .ddl file." << std::endl;
  std::cout << "mcs_parquet_ddl [input_parquet_file] [output_ddl_file]" << std::endl;
}

/**
 * get the schema of the parquet file
 */
void getSchema(std::string filePath, std::shared_ptr<arrow::Schema>* parquetSchema)
{
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(filePath, arrow::default_memory_pool()));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  PARQUET_THROW_NOT_OK(reader->GetSchema(parquetSchema));
  PARQUET_THROW_NOT_OK(infile->Close());
}

/**
 * convert arrow data type id to corresponding columnstore type string
 */
int convert2mcs(std::shared_ptr<arrow::DataType> dataType, arrow::Type::type typeId, std::string& colType)
{
  switch (typeId)
  {
    case arrow::Type::type::BOOL:
    {
      colType = "BOOLEAN";
      break;
    }
    case arrow::Type::type::UINT8:
    {
      colType = "TINYINT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT8:
    {
      colType = "TINYINT";
      break;
    }
    case arrow::Type::type::UINT16:
    {
      colType = "SMALLINT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT16:
    {
      colType = "SMALLINT";
      break;
    }
    case arrow::Type::type::UINT32:
    {
      colType = "INT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT32:
    {
      colType = "INT";
      break;
    }
    case arrow::Type::type::UINT64:
    {
      colType = "BIGINT UNSIGNED";
      break;
    }
    case arrow::Type::type::INT64:
    {
      colType = "BIGINT";
      break;
    }
    case arrow::Type::type::FLOAT:
    {
      colType = "FLOAT";
      break;
    }
    case arrow::Type::type::DOUBLE:
    {
      colType = "DOUBLE";
      break;
    }
    case arrow::Type::type::STRING:
    {
      // set 2000 as the maximum length and VARCHAR as column type
      colType = "VARCHAR(2000)";
      break;
    }
    case arrow::Type::type::BINARY:
    {
      // set 8000 as the maximum length and VARCHAR as column type
      colType = "VARCHAR(8000) character set 'binary'";
      break;
    }
    case arrow::Type::type::FIXED_SIZE_BINARY:
    {
      std::shared_ptr<arrow::FixedSizeBinaryType> fType = std::static_pointer_cast<arrow::FixedSizeBinaryType>(dataType);
      int byteWidth = fType->byte_width();
      colType = "CHAR(" + std::to_string(byteWidth) + ")";
      break;
    }
    case arrow::Type::type::DATE32:
    {
      colType = "DATE";
      break;
    }
    case arrow::Type::type::DATE64:
    {
      colType = "DATE";
      break;
    }
    case arrow::Type::type::TIMESTAMP:
    {
      std::shared_ptr<arrow::TimestampType> fType = std::static_pointer_cast<arrow::TimestampType>(dataType);

      if (fType->unit() == arrow::TimeUnit::MILLI)
        colType = "TIMESTAMP(3)";
      else if (fType->unit() == arrow::TimeUnit::MICRO)
        colType = "TIMESTAMP(6)";
      else
        return UNSUPPORTED_DATA_TYPE;

      break;
    }
    case arrow::Type::type::TIME32:
    {
      colType = "TIME(3)";
      break;
    }
    case arrow::Type::type::TIME64:
    {
      std::shared_ptr<arrow::Time64Type> fType = std::static_pointer_cast<arrow::Time64Type>(dataType);

      if (fType->unit() == arrow::TimeUnit::MICRO)
        colType = "TIME(6)";
      else
        return UNSUPPORTED_DATA_TYPE;

      break;
    }
    case arrow::Type::type::DECIMAL128:
    {
      // get precision and scale
      std::shared_ptr<arrow::DecimalType> fType = std::static_pointer_cast<arrow::DecimalType>(dataType);
      int32_t fPrecision = fType->precision();
      int32_t fScale = fType->scale();
      colType = "DECIMAL(" + std::to_string(fPrecision) + "," + std::to_string(fScale) + ")";
      break;
    }
    default:
    {
      return UNSUPPORTED_DATA_TYPE;
    }
  }
  return NO_ERROR;
}

/**
 * main function to generate DDL file
 */
int generateDDL(std::string filePath, std::string targetPath, std::string tableName)
{
  std::shared_ptr<arrow::Schema> parquetSchema;
  getSchema(filePath, &parquetSchema);
  std::vector<std::string> parquetCols;
  std::vector<std::string> parquetTypes;
  int rc = NO_ERROR;
  int fieldsNum = parquetSchema->num_fields();

  if (fieldsNum == 0)
  {
    return EMPTY_FIELD;
  }

  for (int i = 0; i < fieldsNum; i++)
  {
    const std::shared_ptr<arrow::Field> tField = parquetSchema->field(i);
    const std::string tName = tField->name();
    std::string colType;
    auto tType = tField->type();
    parquetCols.push_back(tName);
    rc = convert2mcs(tType, tType->id(), colType);

    if (rc != NO_ERROR)
    {
      std::cout << "Unsupported data type for column: " << tName << std::endl;
      return rc;
    }

    parquetTypes.push_back(colType);
  }

  std::string str1 = "CREATE TABLE " + tableName + "(\n";
  std::string str2 = ") ENGINE=Columnstore;";

  for (int i = 0; i < fieldsNum; i++)
  {
    str1 += parquetCols[i] + " " + parquetTypes[i] + (i == fieldsNum-1 ? "\n" : ",\n");
  }

  str1 += str2;
  std::ofstream outfile(targetPath + tableName + ".ddl");
  outfile << str1;
  outfile.close();
  std::cout << "Successfully generated " + tableName + ".ddl" << std::endl;
  return rc;
}

int main(int argc, char** argv)
{
  int32_t option;

  while ((option = getopt(argc, argv, "h")) != EOF)
  {
    switch (option)
    {
      case 'h':
      case '?':
      default:
        usage();
        return (option == 'h' ? 0 : -1);
        break;
    }
  }

  // argv[1]: input parquet file
  // argv[2]: output ddl file
  // argc must be exactly 3 (the program name plus the two file arguments)
  if (argc != 3)
  {
    std::cout << "Please input source parquet file and target ddl file" << std::endl;
    return FILE_NUM_ERROR;
  }
  std::string parquetFile(argv[1]);
  std::string ddlFile(argv[2]);

  // check file extension
  std::string::size_type endBase = ddlFile.rfind('.');
  std::string::size_type endBase1 = parquetFile.rfind('.');
  if (endBase == std::string::npos || endBase1 == std::string::npos ||
      parquetFile.substr(endBase1 + 1) != "parquet" ||
      ddlFile.substr(endBase + 1) != "ddl")
  {
    std::cout << "File type not supported" << std::endl;
    usage();
    return UNSUPPORTED_FILE_TYPE;
  }

  std::string targetPath;
  std::string tableName;
  std::string::size_type startBase = ddlFile.rfind('/');
  targetPath.assign(argv[2], startBase + 1);
  tableName.assign(argv[2] + startBase + 1, endBase - startBase - 1);
  std::cout << "Reading " + parquetFile << std::endl;
  int rc = generateDDL(parquetFile, targetPath, tableName);

  if (rc != NO_ERROR)
  {
    std::cout << "Failed to generate DDL from the input parquet file" << std::endl;
  }

  return rc;
}
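
For orientation, a minimal standalone sketch (an editor's illustration, not part of the patch) of the schema walk mcs_parquet_ddl performs before type mapping. It uses only the Arrow/Parquet calls that already appear above, plus arrow::DataType::ToString(); the input path is hypothetical.

#include <iostream>
#include <memory>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

int main()
{
  // Open the parquet file and read its Arrow schema, as getSchema() does.
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(
      infile, arrow::io::ReadableFile::Open("/tmp/tests.parquet", arrow::default_memory_pool()));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::Schema> schema;
  PARQUET_THROW_NOT_OK(reader->GetSchema(&schema));

  // Print each field name with its Arrow type; convert2mcs() would map
  // these type ids to ColumnStore column types.
  for (int i = 0; i < schema->num_fields(); i++)
    std::cout << schema->field(i)->name() << ": " << schema->field(i)->type()->ToString() << std::endl;
  return 0;
}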
tools/parquetGen/CMakeLists.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
include_directories(${ENGINE_COMMON_INCLUDES})

set(parquetGen_SRCS main.cpp)
add_executable(mcs_parquet_gen ${parquetGen_SRCS})
target_link_libraries(mcs_parquet_gen boost_system boost_filesystem arrow parquet)
install(TARGETS mcs_parquet_gen DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)
tools/parquetGen/main.cpp (new file, 1342 lines)
(file diff suppressed because it is too large)
@@ -29,6 +29,7 @@
#include <stdlib.h>
#include <string.h>
#include <type_traits>
#include <chrono>
#include "mcs_decimal.h"
using namespace std;
#include <boost/algorithm/string/case_conv.hpp>
@@ -1572,6 +1573,44 @@ boost::any DataConvert::StringToTimestamp(const datatypes::ConvertFromStringPara
  return value;
}

//------------------------------------------------------------------------------
// Convert date32 parquet data to binary date. Used by BulkLoad.
//------------------------------------------------------------------------------
int32_t DataConvert::ConvertArrowColumnDate(int32_t dayVal, int& status)
{
  int inYear;
  int inMonth;
  int inDay;
  int32_t value = 0;

  int64_t secondsSinceEpoch = dayVal;
  secondsSinceEpoch *= 86400;
  std::chrono::seconds duration(secondsSinceEpoch);

  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::localtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;

  if (isDateValid(inDay, inMonth, inYear))
  {
    Date aDay;
    aDay.year = inYear;
    aDay.month = inMonth;
    aDay.day = inDay;
    memcpy(&value, &aDay, 4);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert date string to binary date. Used by BulkLoad.
//------------------------------------------------------------------------------
@@ -1658,6 +1697,100 @@ bool DataConvert::isColumnDateValid(int32_t date)
  return (isDateValid(d.day, d.month, d.year));
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data to binary datetime (millisecond). Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetime(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::milliseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = duration.count() % 1000;
  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    DateTime aDatetime;
    aDatetime.year = inYear;
    aDatetime.month = inMonth;
    aDatetime.day = inDay;
    aDatetime.hour = inHour;
    aDatetime.minute = inMinute;
    aDatetime.second = inSecond;
    aDatetime.msecond = inMicrosecond;

    memcpy(&value, &aDatetime, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data to binary datetime (microsecond). Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnDatetimeUs(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::microseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = duration.count() % 1000000;
  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    DateTime aDatetime;
    aDatetime.year = inYear;
    aDatetime.month = inMonth;
    aDatetime.day = inDay;
    aDatetime.hour = inHour;
    aDatetime.minute = inMinute;
    aDatetime.second = inSecond;
    aDatetime.msecond = inMicrosecond;

    memcpy(&value, &aDatetime, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert date/time string to binary date/time. Used by BulkLoad.
//------------------------------------------------------------------------------
@@ -1798,6 +1931,127 @@ int64_t DataConvert::convertColumnDatetime(const char* dataOrg, CalpontDateTimeF
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data (millisecond) to binary timestamp. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestamp(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::milliseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = duration.count() % 1000;
  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    MySQLTime m_time;
    m_time.year = inYear;
    m_time.month = inMonth;
    m_time.day = inDay;
    m_time.hour = inHour;
    m_time.minute = inMinute;
    m_time.second = inSecond;
    m_time.second_part = inMicrosecond;

    bool isValid = true;
    int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);

    if (!isValid)
    {
      status = -1;
      return value;
    }

    TimeStamp timestamp;
    timestamp.second = seconds;
    timestamp.msecond = m_time.second_part;

    memcpy(&value, &timestamp, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp parquet data (microsecond) to binary timestamp. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTimestampUs(int64_t timeVal, int& status)
{
  int64_t value = 0;
  int inYear;
  int inMonth;
  int inDay;
  int inHour;
  int inMinute;
  int inSecond;
  int inMicrosecond;

  std::chrono::microseconds duration(timeVal);
  std::chrono::system_clock::time_point timePoint(duration);

  std::time_t ttime = std::chrono::system_clock::to_time_t(timePoint);
  std::tm* timeInfo = std::gmtime(&ttime);

  inYear = timeInfo->tm_year + 1900;
  inMonth = timeInfo->tm_mon + 1;
  inDay = timeInfo->tm_mday;
  inHour = timeInfo->tm_hour;
  inMinute = timeInfo->tm_min;
  inSecond = timeInfo->tm_sec;
  inMicrosecond = static_cast<int>(duration.count() % 1000000);

  if (isDateValid(inDay, inMonth, inYear) && isDateTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    MySQLTime m_time;
    m_time.year = inYear;
    m_time.month = inMonth;
    m_time.day = inDay;
    m_time.hour = inHour;
    m_time.minute = inMinute;
    m_time.second = inSecond;
    m_time.second_part = inMicrosecond;

    bool isValid = true;
    int64_t seconds = mySQLTimeToGmtSec(m_time, 0, isValid);

    if (!isValid)
    {
      status = -1;
      return value;
    }

    TimeStamp timestamp;
    timestamp.second = seconds;
    timestamp.msecond = m_time.second_part;

    memcpy(&value, &timestamp, 8);
  }
  else
  {
    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert timestamp string to binary timestamp. Used by BulkLoad.
// Most of this code is taken from DataConvert::convertColumnDatetime
@@ -1972,6 +2226,123 @@ int64_t DataConvert::convertColumnTimestamp(const char* dataOrg, CalpontDateTime
  return value;
}

//------------------------------------------------------------------------------
// Convert time32 parquet data to binary time. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime32(int32_t timeVal, int& status)
{
  int64_t value = 0;
  // convert millisecond to time
  int inHour, inMinute, inSecond, inMicrosecond;
  inHour = inMinute = inSecond = inMicrosecond = 0;
  bool isNeg = false;
  if (timeVal < 0)
    isNeg = true;
  inHour = timeVal / 3600000;
  inMinute = (timeVal - inHour * 3600000) / 60000;
  inSecond = (timeVal - inHour * 3600000 - inMinute * 60000) / 1000;
  inMicrosecond = timeVal - inHour * 3600000 - inMinute * 60000 - inSecond * 1000;
  if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    Time atime;
    atime.hour = inHour;
    atime.minute = inMinute;
    atime.second = inSecond;
    atime.msecond = inMicrosecond;
    atime.is_neg = isNeg;

    memcpy(&value, &atime, 8);
  }
  else
  {
    // Emulate MariaDB's time saturation
    if (inHour > 838)
    {
      Time atime;
      atime.hour = 838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }
    else if (inHour < -838)
    {
      Time atime;
      atime.hour = -838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }

    // If neither of the above match then we return a 0 time

    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert time64 parquet data to binary time. Used by BulkLoad.
//------------------------------------------------------------------------------
int64_t DataConvert::convertArrowColumnTime64(int64_t timeVal, int& status)
{
  int64_t value = 0;
  // convert microsecond to time
  int inHour, inMinute, inSecond, inMicrosecond;
  inHour = inMinute = inSecond = inMicrosecond = 0;
  bool isNeg = false;
  if (timeVal < 0)
    isNeg = true;
  inHour = timeVal / 3600000000;
  inMinute = (timeVal - inHour * 3600000000) / 60000000;
  inSecond = (timeVal - inHour * 3600000000 - inMinute * 60000000) / 1000000;
  inMicrosecond = timeVal - inHour * 3600000000 - inMinute * 60000000 - inSecond * 1000000;
  if (isTimeValid(inHour, inMinute, inSecond, inMicrosecond))
  {
    Time atime;
    atime.hour = inHour;
    atime.minute = inMinute;
    atime.second = inSecond;
    atime.msecond = inMicrosecond;
    atime.is_neg = isNeg;

    memcpy(&value, &atime, 8);
  }
  else
  {
    // Emulate MariaDB's time saturation
    if (inHour > 838)
    {
      Time atime;
      atime.hour = 838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }
    else if (inHour < -838)
    {
      Time atime;
      atime.hour = -838;
      atime.minute = 59;
      atime.second = 59;
      atime.msecond = 999999;
      atime.is_neg = false;
      memcpy(&value, &atime, 8);
    }

    // If neither of the above match then we return a 0 time

    status = -1;
  }
  return value;
}

//------------------------------------------------------------------------------
// Convert time string to binary time. Used by BulkLoad.
// Most of this is taken from str_to_time in sql-common/my_time.c
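// Editor's illustration (not part of the patch): the TIME32 millisecond
// split used by convertArrowColumnTime32, checked by hand for
// timeVal = 3723004 ms (1 h = 3600000 ms, 2 min = 120000 ms, 3 s = 3000 ms,
// remainder 4 ms).
#include <cstdint>
#include <cstdio>

int main()
{
  int32_t timeVal = 3723004;                                // ms since midnight
  int h = timeVal / 3600000;                                // 1
  int m = (timeVal - h * 3600000) / 60000;                  // 2
  int s = (timeVal - h * 3600000 - m * 60000) / 1000;       // 3
  int frac = timeVal - h * 3600000 - m * 60000 - s * 1000;  // 4
  std::printf("%02d:%02d:%02d.%03d\n", h, m, s, frac);      // 01:02:03.004
  return 0;
}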
@@ -1170,6 +1170,14 @@ class DataConvert
  EXPORT static std::string timeToString1(long long timevalue);
  static inline void timeToString1(long long timevalue, char* buf, unsigned int buflen);

  /**
   * @brief convert parquet date data to its native format. This function is for bulkload to use.
   *
   * @param dayVal the input data representing days
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int32_t ConvertArrowColumnDate(int32_t dayVal, int& status);

  /**
   * @brief convert a date column data, represented as a string, to its native
   * format. This function is for bulkload to use.
@@ -1188,6 +1196,22 @@ class DataConvert
   */
  EXPORT static bool isColumnDateValid(int32_t date);

  /**
   * @brief convert parquet datetime data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing milliseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnDatetime(int64_t timeVal, int& status);

  /**
   * @brief convert parquet datetime data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing microseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnDatetimeUs(int64_t timeVal, int& status);

  /**
   * @brief convert a datetime column data, represented as a string,
   * to its native format. This function is for bulkload to use.
@@ -1201,6 +1225,22 @@ class DataConvert
  EXPORT static int64_t convertColumnDatetime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
                                              int& status, unsigned int dataOrgLen);

  /**
   * @brief convert parquet timestamp data (millisecond) to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing milliseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTimestamp(int64_t timeVal, int& status);

  /**
   * @brief convert parquet timestamp data (microsecond) to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing microseconds since the Unix epoch
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTimestampUs(int64_t timeVal, int& status);

  /**
   * @brief convert a timestamp column data, represented as a string,
   * to its native format. This function is for bulkload to use.
@@ -1228,6 +1268,22 @@ class DataConvert
  EXPORT static int64_t convertColumnTime(const char* dataOrg, CalpontDateTimeFormat datetimeFormat,
                                          int& status, unsigned int dataOrgLen);

  /**
   * @brief convert parquet time data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing milliseconds since midnight
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTime32(int32_t timeVal, int& status);

  /**
   * @brief convert parquet time data to its native format. This function is for bulkload to use.
   *
   * @param timeVal the input data representing either microseconds or nanoseconds since midnight
   * @param status 0 - success, -1 - fail
   */
  EXPORT static int64_t convertArrowColumnTime64(int64_t timeVal, int& status);

  /**
   * @brief Is specified datetime valid; used by binary bulk load
   */
@@ -40,7 +40,11 @@ set(cpimport.bin_SRCS cpimport.cpp)

add_executable(cpimport.bin ${cpimport.bin_SRCS})
add_dependencies(cpimport.bin marias3)
target_link_libraries(cpimport.bin ${ENGINE_LDFLAGS} ${NETSNMP_LIBRARIES} ${ENGINE_WRITE_LIBS} ${S3API_DEPS} we_bulk we_xml)

target_link_libraries(cpimport.bin ${ENGINE_LDFLAGS} ${NETSNMP_LIBRARIES} ${ENGINE_WRITE_LIBS} ${S3API_DEPS} we_bulk we_xml)
FIND_PACKAGE(Arrow)
FIND_PACKAGE(Parquet)
target_link_libraries(cpimport.bin arrow)
target_link_libraries(cpimport.bin parquet)
install(TARGETS cpimport.bin DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)
@@ -1210,6 +1210,32 @@ int BulkLoad::manageImportDataFileList(Job& job, int tableNo, TableInfo* tableIn
  std::vector<std::string> loadFilesList;
  bool bUseStdin = false;

  // Check if all the import files are parquet files
  bool isParquet = false;
  for (unsigned int i = 0; i < fCmdLineImportFiles.size(); i++)
  {
    if (fCmdLineImportFiles[i].rfind(".parquet") != std::string::npos)
    {
      if (!isParquet)
        isParquet = true;
    }
    else
    {
      if (isParquet)
      {
        ostringstream oss;
        oss << "Import file list mixes parquet and non-parquet files; all import files must be parquet.";
        fLog.logMsg(oss.str(), ERR_FILE_TYPE_DIFF, MSGLVL_ERROR);
        return ERR_FILE_TYPE_DIFF;
      }
    }
  }

  if (isParquet)
  {
    setImportDataMode(IMPORT_DATA_PARQUET);
  }

  // Take loadFileName from command line argument override "if" one exists,
  // else we take from the Job xml file
  std::string loadFileName;
(file diff suppressed because it is too large)
@@ -30,7 +30,7 @@
#include "we_columninfo.h"
#include "calpontsystemcatalog.h"
#include "dataconvert.h"

#include <arrow/api.h>
namespace WriteEngine
{
class Log;
@@ -84,6 +84,9 @@ class BulkLoadBuffer
  char* fOverflowBuf;      // Overflow data held for next buffer
  unsigned fOverflowSize;  // Current size of fOverflowBuf

  std::shared_ptr<arrow::RecordBatch> fParquetBatch;        // Batch of parquet file to be parsed
  std::shared_ptr<arrow::RecordBatch> fParquetBatchParser;  // for temporary use by parser
  std::shared_ptr<::arrow::RecordBatchReader> fParquetReader;  // Reader for reading batches of parquet data
  // Information about the locker and status for each column in this buffer.
  // Note that TableInfo::fSyncUpdatesTI mutex is used to synchronize
  // access to fColumnLocks and fParseComplete from both read and parse
@@ -174,6 +177,19 @@ class BulkLoadBuffer
  void convert(char* field, int fieldLength, bool nullFlag, unsigned char* output, const JobColumn& column,
               BLBufferStats& bufStats);

  /** @brief Parse a batch of parquet data in read buffer for a nonDictionary column
   */
  int parseColParquet(ColumnInfo& columnInfo);

  /** @brief Convert batch parquet data depending upon the data type
   */
  void convertParquet(std::shared_ptr<arrow::Array> columnData, unsigned char* buf, const JobColumn& column,
                      BLBufferStats& bufStats, RID& lastInputRowInExtent, ColumnInfo& columnInfo,
                      bool& updateCPInfoPendingFlag, ColumnBufferSection* section);

  inline void updateCPMinMax(ColumnInfo& columnInfo, RID& lastInputRowInExtent, BLBufferStats& bufStats,
                             bool& updateCPInfoPendingFlag, ColumnBufferSection* section, uint32_t curRow);
  /** @brief Copy the overflow data
   */
  void copyOverflow(const BulkLoadBuffer& buffer);
@@ -263,6 +279,11 @@ class BulkLoadBuffer
    fStatusBLB = status;
  }

  void setParquetReader(std::shared_ptr<::arrow::RecordBatchReader> reader)
  {
    fParquetReader = reader;
  }

  /** @brief Try to lock a column for the buffer
   * TableInfo::fSyncUpdatesTI mutex should be locked when calling this
   * function (see fColumnLocks discussion).
@@ -273,6 +294,10 @@ class BulkLoadBuffer
                size_t* parse_length, RID& totalReadRows, RID& correctTotalRows,
                const boost::ptr_vector<ColumnInfo>& columnsInfo, unsigned int allowedErrCntThisCall);

  /** @brief Read the batch data into the buffer
   */
  int fillFromFileParquet(RID& totalReadRows, RID& correctTotalRows);

  /** @brief Read the table data into the buffer
   */
  int fillFromFile(const BulkLoadBuffer& overFlowBufIn, FILE* handle, RID& totalRows, RID& correctTotalRows,
@@ -1657,6 +1657,41 @@ int ColumnInfo::closeDctnryStore(bool bAbort)
  return rc;
}

//--------------------------------------------------------------------------------------
// Update dictionary store file with string column parquet data, and return the assigned
// tokens (tokenBuf) to be stored in the corresponding column token file.
//--------------------------------------------------------------------------------------
int ColumnInfo::updateDctnryStoreParquet(std::shared_ptr<arrow::Array> columnData, int tokenPos, const int totalRow, char* tokenBuf)
{
  long long truncCount = 0;

#ifdef PROFILE
  Stats::startParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif
  boost::mutex::scoped_lock lock(fDictionaryMutex);
#ifdef PROFILE
  Stats::stopParseEvent(WE_STATS_WAIT_TO_PARSE_DCT);
#endif

  int rc = fStore->insertDctnryParquet(columnData, tokenPos, totalRow, id, tokenBuf, truncCount, column.cs, column.weType);

  if (rc != NO_ERROR)
  {
    WErrorCodes ec;
    std::ostringstream oss;
    oss << "updateDctnryStoreParquet: error adding rows to store file for "
        << "OID-" << column.dctnry.dctnryOid << "; DBRoot-" << curCol.dataFile.fDbRoot << "; part-"
        << curCol.dataFile.fPartition << "; seg-" << curCol.dataFile.fSegment << "; " << ec.errorString(rc);
    fLog->logMsg(oss.str(), rc, MSGLVL_CRITICAL);
    fpTableInfo->fBRMReporter.addToErrMsgEntry(oss.str());
    return rc;
  }

  incSaturatedCnt(truncCount);

  return NO_ERROR;
}

//------------------------------------------------------------------------------
// Update dictionary store file with specified strings, and return the assigned
// tokens (tokenBuf) to be stored in the corresponding column token file.
@@ -200,6 +200,13 @@ class ColumnInfo
  */
  void lastInputRowInExtentInc();

  /** @brief Update dictionary for arrow/parquet format.
   * Parses and stores the parquet data into the store file, and
   * returns the assigned tokens (tokenBuf) to be stored in the
   * corresponding column token file.
   */
  int updateDctnryStoreParquet(std::shared_ptr<arrow::Array> columnData, int tokenPos, const int totalRow, char* tokenBuf);

  /** @brief Update dictionary method.
   * Parses and stores specified strings into the store file, and
   * returns the assigned tokens (tokenBuf) to be stored in the
@@ -55,6 +55,9 @@ using namespace querytele;
#include "oamcache.h"
#include "cacheutils.h"

#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>
namespace
{
const std::string BAD_FILE_SUFFIX = ".bad"; // Reject data file suffix
@@ -153,6 +156,8 @@ TableInfo::TableInfo(Log* logger, const BRM::TxnID txnID, const string& processN
  , fRejectErrCnt(0)
  , fExtentStrAlloc(tableOID, logger)
  , fOamCachePtr(oam::OamCache::makeOamCache())
  , fParquetReader(NULL)
  , fReader(nullptr)
{
  fBuffers.clear();
  fColumns.clear();
@@ -266,24 +271,44 @@ int TableInfo::readTableData()
{
  RID validTotalRows = 0;
  RID totalRowsPerInputFile = 0;
  int64_t totalRowsParquet = 0; // totalRowsParquet is used by a later function
                                // that needs the int64_t type
  int filesTBProcessed = fLoadFileList.size();
  int fileCounter = 0;
  unsigned long long qtSentAt = 0;

  if (fImportDataMode != IMPORT_DATA_PARQUET)
  {
    if (fHandle == NULL)
    {
      fFileName = fLoadFileList[fileCounter];
      int rc = openTableFile();

      if (rc != NO_ERROR)
      {
        // Mark the table status as error and exit.
        boost::mutex::scoped_lock lock(fSyncUpdatesTI);
        fStatusTI = WriteEngine::ERR;
        return rc;
      }
      fileCounter++;
    }
  }
  else
  {
    if (fParquetReader == NULL)
    {
      fFileName = fLoadFileList[fileCounter];
      int rc = openTableFileParquet(totalRowsParquet);
      if (rc != NO_ERROR)
      {
        // Mark the table status as error and exit.
        boost::mutex::scoped_lock lock(fSyncUpdatesTI);
        fStatusTI = WriteEngine::ERR;
        return rc;
      }
      fileCounter++;
    }
  }

  timeval readStart;
@@ -419,16 +444,23 @@
      // validTotalRows is ongoing total of valid rows read for all files
      // pertaining to this DB table.
      int readRc;
      if (fImportDataMode != IMPORT_DATA_PARQUET)
      {
        if (fReadFromS3)
        {
          readRc = fBuffers[readBufNo].fillFromMemory(fBuffers[prevReadBuf], fFileBuffer, fS3ReadLength,
                                                      &fS3ParseLength, totalRowsPerInputFile, validTotalRows,
                                                      fColumns, allowedErrCntThisCall);
        }
        else
        {
          readRc = fBuffers[readBufNo].fillFromFile(fBuffers[prevReadBuf], fHandle, totalRowsPerInputFile,
                                                    validTotalRows, fColumns, allowedErrCntThisCall);
        }
      }
      else
      {
        readRc = fBuffers[readBufNo].fillFromFileParquet(totalRowsPerInputFile, validTotalRows);
      }

      if (readRc != NO_ERROR)
@@ -530,7 +562,7 @@
      fCurrentReadBuffer = (fCurrentReadBuffer + 1) % fReadBufCount;

      // bufferCount++;
      if ((fHandle && feof(fHandle)) || (fReadFromS3 && (fS3ReadLength == fS3ParseLength)) || (totalRowsPerInputFile == (RID)totalRowsParquet))
      {
        timeval readFinished;
        gettimeofday(&readFinished, NULL);
@@ -567,7 +599,15 @@
      if (fileCounter < filesTBProcessed)
      {
        fFileName = fLoadFileList[fileCounter];
        int rc;
        if (fImportDataMode != IMPORT_DATA_PARQUET)
        {
          rc = openTableFile();
        }
        else
        {
          rc = openTableFileParquet(totalRowsParquet);
        }

        if (rc != NO_ERROR)
        {
@@ -1252,6 +1292,45 @@ void TableInfo::addColumn(ColumnInfo* info)
  fExtentStrAlloc.addColumn(info->column.mapOid, info->column.width, info->column.dataType);
}

int TableInfo::openTableFileParquet(int64_t &totalRowsParquet)
{
  if (fParquetReader != NULL)
    return NO_ERROR;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  try
  {
    PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(fFileName, arrow::default_memory_pool()));
    PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &fReader));
    fReader->set_batch_size(1000);
    PARQUET_THROW_NOT_OK(fReader->ScanContents({0}, 1000, &totalRowsParquet));
    PARQUET_THROW_NOT_OK(fReader->GetRecordBatchReader(&fParquetReader));
  }
  catch (std::exception& ex)
  {
    ostringstream oss;
    oss << "Error opening import file " << fFileName << ".";
    fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);

    return ERR_FILE_OPEN;
  }
  catch (...)
  {
    ostringstream oss;
    oss << "Error opening import file " << fFileName << ".";
    fLog->logMsg(oss.str(), ERR_FILE_OPEN, MSGLVL_ERROR);

    return ERR_FILE_OPEN;
  }
  // initialize fBuffers batch source
  for (int i = 0; i < fReadBufCount; ++i)
  {
    fBuffers[i].setParquetReader(fParquetReader);
  }
  return NO_ERROR;
}
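
As a cross-check, the same Arrow/Parquet call sequence can be exercised in isolation. The sketch below is illustrative only: the file name, the main() wrapper, and the drain loop are assumptions, while the API calls (ReadableFile::Open, parquet::arrow::OpenFile, set_batch_size, GetRecordBatchReader) mirror the function above.

#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>

#include <iostream>
#include <memory>

// Minimal sketch of the API sequence used by openTableFileParquet():
// open the file, wrap it in a parquet::arrow::FileReader, then stream
// record batches of up to 1000 rows. "example.parquet" is illustrative.
int main()
{
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(infile,
      arrow::io::ReadableFile::Open("example.parquet", arrow::default_memory_pool()));

  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  reader->set_batch_size(1000);

  std::shared_ptr<arrow::RecordBatchReader> batchReader;
  PARQUET_THROW_NOT_OK(reader->GetRecordBatchReader(&batchReader));

  std::shared_ptr<arrow::RecordBatch> batch;
  PARQUET_THROW_NOT_OK(batchReader->ReadNext(&batch));
  while (batch)
  {
    std::cout << "batch rows: " << batch->num_rows() << std::endl;
    PARQUET_THROW_NOT_OK(batchReader->ReadNext(&batch));
  }
  return 0;
}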

//------------------------------------------------------------------------------
// Open the file corresponding to fFileName so that we can import its contents.
// A buffer is also allocated and passed to setvbuf().
@@ -1331,24 +1410,32 @@ int TableInfo::openTableFile()
//------------------------------------------------------------------------------
void TableInfo::closeTableFile()
{
  if (fImportDataMode != IMPORT_DATA_PARQUET)
  {
    if (fHandle)
    {
      // If reading from stdin, we don't delete the buffer out from under
      // the file handle, because stdin is still open. This will cause a
      // memory leak, but when using stdin, we can only read in 1 table.
      // So it's not like we will be leaking multiple buffers for several
      // tables over the life of the job.
      if (!fReadFromStdin)
      {
        fclose(fHandle);
        delete[] fFileBuffer;
      }

      fHandle = 0;
    }
    else if (ms3)
    {
      ms3_free((uint8_t*)fFileBuffer);
    }
  }
  else
  {
    fReader.reset();
    fParquetReader.reset();
  }
}
@@ -30,6 +30,9 @@
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/uuid/uuid.hpp>

#include <arrow/api.h>
#include <parquet/arrow/reader.h>

#include <libmarias3/marias3.h>

#include "we_type.h"
@@ -170,22 +173,25 @@ class TableInfo : public WeUIDGID
  oam::OamCache* fOamCachePtr;             // OamCache: ptr is copyable
  boost::uuids::uuid fJobUUID;             // Job UUID
  std::vector<BRM::LBID_t> fDictFlushBlks; // dict blks to be flushed from cache

  std::shared_ptr<arrow::RecordBatchReader> fParquetReader; // Batch reader to read batches of data
  std::unique_ptr<parquet::arrow::FileReader> fReader;      // Reader to read the parquet file
  //--------------------------------------------------------------------------
  // Private Functions
  //--------------------------------------------------------------------------

  int changeTableLockState();             // Change state of table lock to cleanup
  void closeTableFile();                  // Close current tbl file; free buffer
  void closeOpenDbFiles();                // Close DB files left open at job's end
  int confirmDBFileChanges();             // Confirm DB file changes (on HDFS)
  void deleteTempDBFileChanges();         // Delete DB temp swap files (on HDFS)
  int finishBRM();                        // Finish reporting updates for BRM
  void freeProcessingBuffers();           // Free up Processing Buffers
  bool isBufferAvailable(bool report);    // Is tbl buffer available for reading
  int openTableFileParquet(int64_t &totalRowsParquet); // Open parquet data file and set batch reader for each buffer
  int openTableFile();                    // Open data file and set the buffer
  void reportTotals(double elapsedSec);   // Report summary totals
  void sleepMS(long int ms);              // Sleep method
  // Compare column HWM with the exemplar HWM.
  int compareHWMs(const int smallestColumnId, const int widerColumnId, const uint32_t smallerColumnWidth,
                  const uint32_t widerColumnWidth, const std::vector<DBRootExtentInfo>& segFileInfo,
@@ -35,6 +35,8 @@
#include <iostream>
using namespace std;

#include "bytestream.h"
#include "brmtypes.h"
#include "extentmap.h" // for DICT_COL_WIDTH
@@ -745,6 +747,365 @@ int Dctnry::insertDctnry2(Signature& sig)
  return NO_ERROR;
}

int Dctnry::insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
                          int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
                          const CHARSET_INFO* cs, const WriteEngine::ColType& weType)
{
  if (cs->mbmaxlen > 1)
  {
    // For TEXT columns, we truncate based on the number of bytes,
    // and not based on the number of characters, as for CHAR/VARCHAR
    // columns in the else block.
    if (weType == WriteEngine::WR_TEXT)
    {
      if (curSig.size > m_colWidth)
      {
        uint8_t truncate_point = utf8::utf8_truncate_point((const char*)curSig.signature, m_colWidth);
        curSig.size = m_colWidth - truncate_point;
        truncCount++;
      }
    }
    else
    {
      const char* start = (const char*)curSig.signature;
      const char* end = (const char*)(curSig.signature + curSig.size);
      size_t numChars = cs->numchars(start, end);
      size_t maxCharLength = m_colWidth / cs->mbmaxlen;

      if (numChars > maxCharLength)
      {
        MY_STRCOPY_STATUS status;
        cs->well_formed_char_length(start, end, maxCharLength, &status);
        curSig.size = status.m_source_end_pos - start;
        truncCount++;
      }
    }
  }
  else // cs->mbmaxlen == 1
  {
    if (curSig.size > m_colWidth)
    {
      curSig.size = m_colWidth;
      truncCount++;
    }
  }

  //...Search for the string in our string cache
  //   if it fits into one block (< 8KB)
  if (curSig.size <= MAX_SIGNATURE_SIZE)
  {
    // Stats::startParseEvent("getTokenFromArray");
    found = getTokenFromArray(curSig);

    if (found)
    {
      memcpy(pOut + outOffset, &curSig.token, 8);
      outOffset += 8;
      startPos++;
      // Stats::stopParseEvent("getTokenFromArray");
      return NO_ERROR;
    }

    // Stats::stopParseEvent("getTokenFromArray");
  }

  totalUseSize = m_totalHdrBytes + curSig.size;

  //...String not found in cache, so proceed.
  //   If room is available in current block then insert into block.
  //   @bug 3960: Add MAX_OP_COUNT check to handle case after bulk rollback
  if (((totalUseSize <= m_freeSpace - HDR_UNIT_SIZE) ||
       ((curSig.size > 8176) && (m_freeSpace > HDR_UNIT_SIZE))) &&
      (m_curOp < (MAX_OP_COUNT - 1)))
  {
    RETURN_ON_ERROR(insertDctnry2(curSig)); // m_freeSpace updated!
    m_curBlock.state = BLK_WRITE;
    memcpy(pOut + outOffset, &curSig.token, 8);
    outOffset += 8;
    startPos++;
    found = true;

    //...If we have reached the limit for the number of strings allowed in
    //   a block, then we write the current block so that we can start
    //   another block.
    if (m_curOp >= MAX_OP_COUNT - 1)
    {
#ifdef PROFILE
      Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
      RETURN_ON_ERROR(writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo));
      m_curBlock.state = BLK_READ;
      next = true;
    }

    //...Add string to cache, if we have not exceeded cache limit.
    //   Don't cache big blobs
    if ((m_arraySize < MAX_STRING_CACHE_SIZE) && (curSig.size <= MAX_SIGNATURE_SIZE))
    {
      addToStringCache(curSig);
    }
  }
  else //...No room for this string in current block, so we write
       //   out the current block, so we can start another block
  {
#ifdef PROFILE
    Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
    RETURN_ON_ERROR(writeDBFileNoVBCache(cb, &m_curBlock, m_curFbo));
    m_curBlock.state = BLK_READ;
    next = true;
    found = false;
  } // if m_freeSpace

  //..."next" flag is used to indicate that we need to advance to the
  //   next block in the store file.
  if (next)
  {
    memset(m_curBlock.data, 0, sizeof(m_curBlock.data));
    memcpy(m_curBlock.data, &m_dctnryHeader2, m_totalHdrBytes);
    m_freeSpace = BYTE_PER_BLOCK - m_totalHdrBytes;
    m_curBlock.state = BLK_WRITE;
    m_curOp = 0;
    next = false;
    m_lastFbo++;
    m_curFbo = m_lastFbo;

    //...Expand current extent if it is an abbreviated initial extent
    if ((m_curFbo == m_numBlocks) && (m_numBlocks == NUM_BLOCKS_PER_INITIAL_EXTENT))
    {
      RETURN_ON_ERROR(expandDctnryExtent());
    }

    //...Allocate a new extent if we have reached the last block in the
    //   current extent.
    if (m_curFbo == m_numBlocks)
    {
      // last block
      LBID_t startLbid;

      // Add an extent.
      RETURN_ON_ERROR(
          createDctnry(m_dctnryOID, m_colWidth, m_dbRoot, m_partition, m_segment, startLbid, false));

      if (m_logger)
      {
        std::ostringstream oss;
        oss << "Add dictionary extent OID-" << m_dctnryOID << "; DBRoot-" << m_dbRoot << "; part-"
            << m_partition << "; seg-" << m_segment << "; hwm-" << m_curFbo << "; LBID-" << startLbid
            << "; file-" << m_segFileName;
        m_logger->logMsg(oss.str(), MSGLVL_INFO2);
      }

      m_curLbid = startLbid;

      // now seek back to the curFbo, after adding an extent
      // @bug5769 For uncompressed only;
      // ChunkManager manages the file offset for the compression case
      if (m_compressionType == 0)
      {
#ifdef PROFILE
        Stats::startParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
        long long byteOffset = m_curFbo;
        byteOffset *= BYTE_PER_BLOCK;
        RETURN_ON_ERROR(setFileOffset(m_dFile, byteOffset));
#ifdef PROFILE
        Stats::stopParseEvent(WE_STATS_PARSE_DCT_SEEK_EXTENT_BLK);
#endif
      }
    }
    else
    {
      // LBIDs are numbered collectively and consecutively within an
      // extent, so within an extent we can derive the LBID by simply
      // incrementing it rather than having to go back to BRM to look
      // up the LBID for each FBO.
      m_curLbid++;
    }

#ifdef PROFILE
    Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
    m_curBlock.lbid = m_curLbid;

    //..."found" flag indicates whether the string was already found
    //   "or" added to the end of the previous block. If false, then
    //   we need to add the string to the new block.
    if (!found)
    {
      RETURN_ON_ERROR(insertDctnry2(curSig)); // m_freeSpace updated!
      m_curBlock.state = BLK_WRITE;
      memcpy(pOut + outOffset, &curSig.token, 8);
      outOffset += 8;
      startPos++;

      //...Add string to cache, if we have not exceeded cache limit
      if ((m_arraySize < MAX_STRING_CACHE_SIZE) && (curSig.size <= MAX_SIGNATURE_SIZE))
      {
        addToStringCache(curSig);
      }
    }
  } // if next

  return NO_ERROR;
}
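
The CHAR/VARCHAR branch above truncates by character count rather than byte count. As a rough standalone illustration (UTF-8 only, assuming well-formed input; the real code goes through the collation handler via cs->numchars and well_formed_char_length), a character-capped truncation might look like this; truncateUtf8Chars is a hypothetical name.

#include <cstddef>
#include <string>

// Hypothetical illustration of the CHAR/VARCHAR truncation rule above:
// cap a UTF-8 string at maxChars characters (not bytes), never splitting
// a multi-byte sequence. Assumes well-formed UTF-8 input.
static std::string truncateUtf8Chars(const std::string& in, size_t maxChars)
{
  size_t bytes = 0, chars = 0;
  while (bytes < in.size() && chars < maxChars)
  {
    unsigned char lead = static_cast<unsigned char>(in[bytes]);
    // Determine the byte length of this code point from its lead byte.
    size_t seqLen = (lead < 0x80) ? 1 : (lead < 0xE0) ? 2 : (lead < 0xF0) ? 3 : 4;
    bytes += seqLen;  // skip the continuation bytes of this code point
    ++chars;
  }
  return in.substr(0, bytes);
}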

/*******************************************************************************
 * Description:
 * Used by bulk import to insert a batch of parquet strings into this store file.
 * Function assumes that the file is already positioned to the current block.
 *
 * PARAMETERS:
 *   input
 *     columnData  - arrow array containing the input strings
 *     startRowIdx - start position for the current batch of parquet data
 *     totalRow    - number of rows in columnData
 *     col         - column of strings to be parsed from columnData
 *   output
 *     tokenBuf    - tokens assigned to the inserted strings
 *
 * RETURN:
 *   success - successfully wrote the header to the block
 *   failure - did not write the header to the block
 ******************************************************************************/
int Dctnry::insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx,
                                const int totalRow, const int col, char* tokenBuf,
                                long long& truncCount, const CHARSET_INFO* cs,
                                const WriteEngine::ColType& weType)
{
#ifdef PROFILE
  Stats::startParseEvent(WE_STATS_PARSE_DCT);
#endif
  int startPos = 0;
  int totalUseSize = 0;

  int outOffset = 0;
  const char* pIn;
  char* pOut = tokenBuf;
  Signature curSig;
  bool found = false;
  bool next = false;
  CommBlock cb;
  cb.file.oid = m_dctnryOID;
  cb.file.pFile = m_dFile;
  WriteEngine::Token nullToken;

  bool isNonNullArray = true;
  std::shared_ptr<arrow::BinaryArray> binaryArray;
  std::shared_ptr<arrow::FixedSizeBinaryArray> fixedSizeBinaryArray;

  if (columnData->type_id() != arrow::Type::type::FIXED_SIZE_BINARY)
    binaryArray = std::static_pointer_cast<arrow::BinaryArray>(columnData);
  else
    fixedSizeBinaryArray = std::static_pointer_cast<arrow::FixedSizeBinaryArray>(columnData);

  // Check whether the imported column data is a NULL array
  if (columnData->type_id() == arrow::Type::type::NA)
    isNonNullArray = false;

  //...Loop through all the rows for the specified column
  while (startPos < totalRow)
  {
    found = false;
    void* curSigPtr = static_cast<void*>(&curSig);
    memset(curSigPtr, 0, sizeof(curSig));

    // if this column is not null data
    if (isNonNullArray)
    {
      const uint8_t* data;

      // if (binaryArray != nullptr)
      // {
      //   data = binaryArray->GetValue(startPos + startRowIdx, &curSig.size);
      // }
      // else
      // {
      //   data = fixedSizeBinaryArray->GetValue(startPos + startRowIdx);
      //   std::shared_ptr<arrow::DataType> tType = fixedSizeBinaryArray->type();
      //   curSig.size = tType->byte_width();
      // }

      // Commenting out the line below and uncommenting the block above reproduces the error
      data = binaryArray->GetValue(startPos + startRowIdx, &curSig.size);

      const char* dataPtr = reinterpret_cast<const char*>(data);

      // Strip trailing null bytes '\0' (by adjusting curSig.size) if import-
      // ing in binary mode. If the entire string is binary zeros, then we
      // treat it as a NULL value.
      if (curSig.size > 0)
      {
        const char* fld = dataPtr;
        int kk = curSig.size - 1;

        for (; kk >= 0; kk--)
        {
          if (fld[kk] != '\0')
            break;
        }
        curSig.size = kk + 1;
      }

      // Read thread should validate against max size so that the entire row
      // can be rejected up front. Once we get here in the parsing thread,
      // it is too late to reject the row. However, as a precaution, we
      // still check against max size & set to null token if needed.
      if ((curSig.size == 0) || (curSig.size > MAX_BLOB_SIZE))
      {
        if (m_defVal.length() > 0) // use default string if available
        {
          pIn = m_defVal.str();
          curSig.signature = (unsigned char*)pIn;
          curSig.size = m_defVal.length();
        }
        else
        {
          memcpy(pOut + outOffset, &nullToken, 8);
          outOffset += 8;
          startPos++;
          continue;
        }
      }
      else
      {
        pIn = dataPtr;
        curSig.signature = (unsigned char*)pIn;
      }
    }
    else
    {
      curSig.size = 0;

      if (m_defVal.length() > 0) // use default string if available
      {
        pIn = m_defVal.str();
        curSig.signature = (unsigned char*)pIn;
        curSig.size = m_defVal.length();
      }
      else
      {
        memcpy(pOut + outOffset, &nullToken, 8);
        outOffset += 8;
        startPos++;
        continue;
      }
    }

    RETURN_ON_ERROR(insertDctnry1(curSig, found, pOut, outOffset, startPos, totalUseSize, cb, next, truncCount,
                                  cs, weType));
  }

#ifdef PROFILE
  Stats::stopParseEvent(WE_STATS_PARSE_DCT);
#endif
  // Done.
  // If any data is left over and not written by a subsequent call to
  // insertDctnry(), then it will be written by closeDctnry().

  return NO_ERROR;
}
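
The trailing-zero stripping above is the whole NULL-detection rule for binary string values: shrink the length to the last non-zero byte, and treat a resulting length of 0 as NULL. A minimal sketch of just that step (strippedLength is a hypothetical name):

#include <cstdint>

// Hypothetical illustration of the trailing-'\0' stripping above: shrink
// the length until the last non-zero byte; a length of 0 afterwards means
// the value was all binary zeros and is treated as NULL.
static int32_t strippedLength(const uint8_t* data, int32_t size)
{
  while (size > 0 && data[size - 1] == '\0')
    --size;
  return size;  // 0 => treat as NULL
}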

/*******************************************************************************
 * Description:
 * Used by bulk import to insert a collection of strings into this store file.
@@ -838,201 +1199,8 @@ int Dctnry::insertDctnry(const char* buf, ColPosPair** pos, const int totalRow,
      curSig.signature = (unsigned char*)pIn;
    }

    RETURN_ON_ERROR(insertDctnry1(curSig, found, pOut, outOffset, startPos, totalUseSize, cb, next, truncCount,
                                  cs, weType));
  } // end while

#ifdef PROFILE
@@ -37,6 +37,8 @@
#include "bytestream.h"
#include "nullstring.h"

#include <arrow/api.h>

#define EXPORT

/** Namespace WriteEngine */
@@ -157,6 +159,20 @@ class Dctnry : public DbFileOp
 */
EXPORT int insertDctnry(const int& sgnature_size, const unsigned char* sgnature_value, Token& token);

/**
 * @brief Insert signature values to file blocks and return their tokens/pointers
 * (for Bulk use)
 *
 * @param columnData  - arrow array containing strings to be parsed
 * @param startRowIdx - start position for the current batch of parquet data
 * @param totalRow    - total number of rows in columnData
 * @param col         - the column to be parsed from columnData
 * @param tokenBuf    - (output) list of tokens for the parsed strings
 * @param truncCount  - (output) running count of truncated values
 */
EXPORT int insertDctnryParquet(std::shared_ptr<arrow::Array> columnData, int startRowIdx, const int totalRow,
                               const int col, char* tokenBuf, long long& truncCount,
                               const CHARSET_INFO* cs, const WriteEngine::ColType& weType);

/**
 * @brief Insert a signature value to a file block and return token/pointer
 * (for Bulk use)
@@ -280,6 +296,9 @@ class Dctnry : public DbFileOp
  // insertDctnryHdr inserts the new value info into the header.
  // insertSgnture inserts the new value into the block.
  //
  int insertDctnry1(Signature& curSig, bool found, char* pOut, int& outOffset, int& startPos,
                    int& totalUseSize, CommBlock& cb, bool& next, long long& truncCount,
                    const CHARSET_INFO* cs, const WriteEngine::ColType& weType);
  int insertDctnry2(Signature& sig);
  void insertDctnryHdr(unsigned char* blockBuf, const int& size);
  void insertSgnture(unsigned char* blockBuf, const int& size, unsigned char* value);
@@ -113,7 +113,7 @@ const int ERR_COMPBASE = 1650; // Compression errors
const int ERR_AUTOINCBASE = 1700;  // Auto-increment errors
const int ERR_BLKCACHEBASE = 1750; // Block cache flush errors
const int ERR_METABKUPBASE = 1800; // Backup bulk meta file errors

const int ERR_PARQUETBASE = 1850;  // Parquet importing errors
//--------------------------------------------------------------------------
// Generic error
//--------------------------------------------------------------------------
@@ -152,6 +152,7 @@ const int ERR_FILE_GLOBBING = ERR_FILEBASE + 19; // Error globbing a file
const int ERR_FILE_EOF = ERR_FILEBASE + 20;       // EOF
const int ERR_FILE_CHOWN = ERR_FILEBASE + 21;     // Error changing file ownership
const int ERR_INTERNAL = ERR_FILEBASE + 22;       // Internal error
const int ERR_FILE_TYPE_DIFF = ERR_FILEBASE + 23; // Import files are of different types

//--------------------------------------------------------------------------
// XML level error
@@ -389,6 +390,11 @@ const int ERR_METADATABKUP_COMP_READ_BULK_BKUP =
    ERR_METABKUPBASE + 7;                                      // Error reading from backup chunk file
const int ERR_METADATABKUP_COMP_RENAME = ERR_METABKUPBASE + 8; // Error renaming chunk file

//--------------------------------------------------------------------------
// Parquet errors when importing
//--------------------------------------------------------------------------
const int ERR_PARQUET_AUX = ERR_PARQUETBASE + 1; // Error when creating aux column for parquet file

//------------------------------------------------------------------------------
// Class used to convert an error code to a corresponding error message string
//------------------------------------------------------------------------------
@@ -137,11 +137,13 @@ enum BulkModeType
// Import Mode 0-text Import (default)
//             1-Binary Import with NULL values
//             2-Binary Import with saturated NULL values
//             3-Import from parquet file
enum ImportDataMode
{
  IMPORT_DATA_TEXT = 0,
  IMPORT_DATA_BIN_ACCEPT_NULL = 1,
  IMPORT_DATA_BIN_SAT_NULL = 2,
  IMPORT_DATA_PARQUET = 3
};

/**