From 6e995e2e80a68660150c0a4ccbffc35ea67bebb9 Mon Sep 17 00:00:00 2001 From: Serguey Zefirov Date: Tue, 9 Jul 2024 10:29:08 +0300 Subject: [PATCH] fix: MCOL-5755: incorrect handling of BLOB (and TEXT) in GROUP BY BLOB fields did not work as grouping keys at all, they were assigned value NULL for any value, be it NULL or not. The fix is in the rowaggregation.cpp in the initMapping(), a switch/case branch was added to handle BLOB field copying there. Also, TEXT columns did not distinguish between NULL and empty string in the grouping algorithm, now they do. The fix is in the equals() function, now we specifically check for isNull() equality between values. --- dbcon/execplan/aggregatecolumn.cpp | 11 ++++++- .../basic/r/mcs35_select_group_by.result | 6 +++- ...incorrect-group-by-handling-of-BLOB.result | 29 +++++++++++++++++++ ...5-incorrect-group-by-handling-of-BLOB.test | 18 ++++++++++++ utils/common/conststring.h | 2 +- utils/rowgroup/rowaggregation.cpp | 12 ++++---- utils/rowgroup/rowgroup.cpp | 8 ++++- utils/rowgroup/rowgroup.h | 3 ++ 8 files changed, 79 insertions(+), 10 deletions(-) create mode 100644 mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result create mode 100644 mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test diff --git a/dbcon/execplan/aggregatecolumn.cpp b/dbcon/execplan/aggregatecolumn.cpp index a715de6f2..2546e9f18 100644 --- a/dbcon/execplan/aggregatecolumn.cpp +++ b/dbcon/execplan/aggregatecolumn.cpp @@ -586,7 +586,16 @@ void AggregateColumn::evaluate(Row& row, bool& isNull) break; case CalpontSystemCatalog::VARBINARY: - case CalpontSystemCatalog::BLOB: isNull = true; break; + case CalpontSystemCatalog::BLOB: + { + auto const str = row.getConstString(fInputIndex); + fResult.strVal.dropString(); + if (!str.isNull()) + fResult.strVal.assign((const uint8_t*)str.str(), str.length()); + + isNull = isNull || fResult.strVal.isNull(); + } + break; default: // treat as int64 if (row.equals<8>(BIGINTNULL, fInputIndex)) diff --git a/mysql-test/columnstore/basic/r/mcs35_select_group_by.result b/mysql-test/columnstore/basic/r/mcs35_select_group_by.result index eb7dbc28b..7cf594ab5 100644 --- a/mysql-test/columnstore/basic/r/mcs35_select_group_by.result +++ b/mysql-test/columnstore/basic/r/mcs35_select_group_by.result @@ -86,7 +86,11 @@ COUNT(*) 9 SELECT COUNT(*) FROM t1 GROUP BY t1_blob; COUNT(*) -14 +1 +1 +3 +3 +6 SELECT COUNT(*) FROM t1 GROUP BY t1_text; COUNT(*) 1 diff --git a/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result new file mode 100644 index 000000000..944999a55 --- /dev/null +++ b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result @@ -0,0 +1,29 @@ +DROP DATABASE IF EXISTS MCOL5755; +CREATE DATABASE MCOL5755; +USE MCOL5755; +CREATE TABLE t1 (t TEXT,c CHAR(10),b BLOB) ENGINE = columnstore; +INSERT INTO t1 VALUES (NULL,NULL,NULL); +INSERT INTO t1 VALUES ("","",""); +INSERT INTO t1 VALUES ("hello","hello","hello"); +INSERT INTO t1 VALUES ("HELLO","HELLO","HELLO"); +INSERT INTO t1 VALUES ("HELLO MY","HELLO MY","HELLO MY"); +INSERT INTO t1 VALUES ("a","a","a"); +INSERT INTO t1 VALUES (1,1,1); +SELECT b FROM t1 GROUP BY b; +b + +1 +HELLO +HELLO MY +NULL +a +hello +SELECT t FROM t1 GROUP BY t; +t + +1 +HELLO MY +NULL +a +hello +DROP DATABASE MCOL5755; diff --git a/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test new file mode 100644 index 000000000..ff0160be6 --- /dev/null +++ b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test @@ -0,0 +1,18 @@ +--disable_warnings +DROP DATABASE IF EXISTS MCOL5755; +--enable_warnings +CREATE DATABASE MCOL5755; +USE MCOL5755; +CREATE TABLE t1 (t TEXT,c CHAR(10),b BLOB) ENGINE = columnstore; +INSERT INTO t1 VALUES (NULL,NULL,NULL); +INSERT INTO t1 VALUES ("","",""); +INSERT INTO t1 VALUES ("hello","hello","hello"); +INSERT INTO t1 VALUES ("HELLO","HELLO","HELLO"); +INSERT INTO t1 VALUES ("HELLO MY","HELLO MY","HELLO MY"); +INSERT INTO t1 VALUES ("a","a","a"); +INSERT INTO t1 VALUES (1,1,1); +--sorted_result +SELECT b FROM t1 GROUP BY b; +--sorted_result +SELECT t FROM t1 GROUP BY t; +DROP DATABASE MCOL5755; diff --git a/utils/common/conststring.h b/utils/common/conststring.h index 021529bf5..103b84a1c 100644 --- a/utils/common/conststring.h +++ b/utils/common/conststring.h @@ -81,7 +81,7 @@ class ConstString } bool eq(const ConstString& rhs) const { - if (!mStr) + if (!mStr || !rhs.mStr) { return mStr == rhs.mStr; } diff --git a/utils/rowgroup/rowaggregation.cpp b/utils/rowgroup/rowaggregation.cpp index f2d5ec2a8..b68aad4f8 100644 --- a/utils/rowgroup/rowaggregation.cpp +++ b/utils/rowgroup/rowaggregation.cpp @@ -953,6 +953,10 @@ void RowAggregation::initMapData(const Row& rowIn) break; } + case execplan::CalpontSystemCatalog::VARBINARY: + case execplan::CalpontSystemCatalog::BLOB: + fRow.setStringField(rowIn.getConstString(colIn), colOut); + break; case execplan::CalpontSystemCatalog::DOUBLE: case execplan::CalpontSystemCatalog::UDOUBLE: @@ -1304,6 +1308,8 @@ void RowAggregation::doSelectSome(const Row& rowIn, int64_t colIn, int64_t colOu case execplan::CalpontSystemCatalog::CHAR: case execplan::CalpontSystemCatalog::VARCHAR: case execplan::CalpontSystemCatalog::TEXT: + case execplan::CalpontSystemCatalog::CLOB: + case execplan::CalpontSystemCatalog::BLOB: { auto valIn = rowIn.getStringField(colIn); fRow.setStringField(valIn, colOut); @@ -1343,12 +1349,6 @@ void RowAggregation::doSelectSome(const Row& rowIn, int64_t colIn, int64_t colOu break; } - case execplan::CalpontSystemCatalog::CLOB: - case execplan::CalpontSystemCatalog::BLOB: - { - fRow.setVarBinaryField(rowIn.getVarBinaryField(colIn), rowIn.getVarBinaryLength(colIn), colOut); - break; - } default: { idbassert_s(0, "unknown data type in doSelectSome()"); diff --git a/utils/rowgroup/rowgroup.cpp b/utils/rowgroup/rowgroup.cpp index 5d05b9cdc..b8f835d7f 100644 --- a/utils/rowgroup/rowgroup.cpp +++ b/utils/rowgroup/rowgroup.cpp @@ -977,8 +977,14 @@ bool Row::equals(const Row& r2, uint32_t lastCol) const cscDataType columnType = getColType(col); if (UNLIKELY(typeHasCollation(columnType))) { + auto c1 = getConstString(col); + auto c2 = r2.getConstString(col); + if (c1.isNull() != c2.isNull()) + { + return false; + } datatypes::Charset cs(getCharset(col)); - if (cs.strnncollsp(getConstString(col), r2.getConstString(col))) + if (cs.strnncollsp(c1, c2)) { return false; } diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h index 41af8a0a8..113f0ae6e 100644 --- a/utils/rowgroup/rowgroup.h +++ b/utils/rowgroup/rowgroup.h @@ -507,6 +507,9 @@ class Row inline void setBinaryField(const T* value, uint32_t colIndex); template inline void setBinaryField_offset(const T* value, uint32_t width, uint32_t colIndex); + // XXX: TODO: I'd deprecate these two functions in favor of get/setStringField. + // getSetStringField properly support binary data of up to 4G bytes + // and also provide perfomant interface through use of ConstString. // support VARBINARY // Add 2-byte length at the CHARSET_INFO*beginning of the field. nullptr and zero length field are // treated the same, could use one of the length bit to distinguish these two cases.