From 3bb2496ca1c32deddf53f3db4cb78f2ca50b92fb Mon Sep 17 00:00:00 2001 From: Serguey Zefirov Date: Tue, 9 Jul 2024 10:29:08 +0300 Subject: [PATCH] fix: MCOL-5755: incorrect handling of BLOB (and TEXT) in GROUP BY BLOB fields did not work as grouping keys at all, they were assigned value NULL for any value, be it NULL or not. The fix is in the rowaggregation.cpp in the initMapping(), a switch/case branch was added to handle BLOB field copying there. Also, TEXT columns did not distinguish between NULL and empty string in the grouping algorithm, now they do. The fix is in the equals() function, now we specifically check for isNull() equality between values. --- dbcon/execplan/aggregatecolumn.cpp | 11 ++++++- .../basic/r/mcs35_select_group_by.result | 6 +++- ...incorrect-group-by-handling-of-BLOB.result | 29 +++++++++++++++++++ ...5-incorrect-group-by-handling-of-BLOB.test | 18 ++++++++++++ utils/common/conststring.h | 2 +- utils/rowgroup/rowaggregation.cpp | 12 ++++---- utils/rowgroup/rowgroup.cpp | 8 ++++- utils/rowgroup/rowgroup.h | 3 ++ 8 files changed, 79 insertions(+), 10 deletions(-) create mode 100644 mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result create mode 100644 mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test diff --git a/dbcon/execplan/aggregatecolumn.cpp b/dbcon/execplan/aggregatecolumn.cpp index 0fffad794..8dd85727c 100644 --- a/dbcon/execplan/aggregatecolumn.cpp +++ b/dbcon/execplan/aggregatecolumn.cpp @@ -587,7 +587,16 @@ void AggregateColumn::evaluate(Row& row, bool& isNull) break; case CalpontSystemCatalog::VARBINARY: - case CalpontSystemCatalog::BLOB: isNull = true; break; + case CalpontSystemCatalog::BLOB: + { + auto const str = row.getConstString(fInputIndex); + fResult.strVal.dropString(); + if (!str.isNull()) + fResult.strVal.assign((const uint8_t*)str.str(), str.length()); + + isNull = isNull || fResult.strVal.isNull(); + } + break; default: // treat as int64 if (row.equals<8>(BIGINTNULL, fInputIndex)) diff --git a/mysql-test/columnstore/basic/r/mcs35_select_group_by.result b/mysql-test/columnstore/basic/r/mcs35_select_group_by.result index e94df43a9..7de2749c0 100644 --- a/mysql-test/columnstore/basic/r/mcs35_select_group_by.result +++ b/mysql-test/columnstore/basic/r/mcs35_select_group_by.result @@ -86,7 +86,11 @@ COUNT(*) 9 SELECT COUNT(*) FROM t1 GROUP BY t1_blob; COUNT(*) -14 +1 +1 +3 +3 +6 SELECT COUNT(*) FROM t1 GROUP BY t1_text; COUNT(*) 1 diff --git a/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result new file mode 100644 index 000000000..944999a55 --- /dev/null +++ b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.result @@ -0,0 +1,29 @@ +DROP DATABASE IF EXISTS MCOL5755; +CREATE DATABASE MCOL5755; +USE MCOL5755; +CREATE TABLE t1 (t TEXT,c CHAR(10),b BLOB) ENGINE = columnstore; +INSERT INTO t1 VALUES (NULL,NULL,NULL); +INSERT INTO t1 VALUES ("","",""); +INSERT INTO t1 VALUES ("hello","hello","hello"); +INSERT INTO t1 VALUES ("HELLO","HELLO","HELLO"); +INSERT INTO t1 VALUES ("HELLO MY","HELLO MY","HELLO MY"); +INSERT INTO t1 VALUES ("a","a","a"); +INSERT INTO t1 VALUES (1,1,1); +SELECT b FROM t1 GROUP BY b; +b + +1 +HELLO +HELLO MY +NULL +a +hello +SELECT t FROM t1 GROUP BY t; +t + +1 +HELLO MY +NULL +a +hello +DROP DATABASE MCOL5755; diff --git a/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test new file mode 100644 index 000000000..ff0160be6 --- /dev/null +++ b/mysql-test/columnstore/bugfixes/MCOL-5755-incorrect-group-by-handling-of-BLOB.test @@ -0,0 +1,18 @@ +--disable_warnings +DROP DATABASE IF EXISTS MCOL5755; +--enable_warnings +CREATE DATABASE MCOL5755; +USE MCOL5755; +CREATE TABLE t1 (t TEXT,c CHAR(10),b BLOB) ENGINE = columnstore; +INSERT INTO t1 VALUES (NULL,NULL,NULL); +INSERT INTO t1 VALUES ("","",""); +INSERT INTO t1 VALUES ("hello","hello","hello"); +INSERT INTO t1 VALUES ("HELLO","HELLO","HELLO"); +INSERT INTO t1 VALUES ("HELLO MY","HELLO MY","HELLO MY"); +INSERT INTO t1 VALUES ("a","a","a"); +INSERT INTO t1 VALUES (1,1,1); +--sorted_result +SELECT b FROM t1 GROUP BY b; +--sorted_result +SELECT t FROM t1 GROUP BY t; +DROP DATABASE MCOL5755; diff --git a/utils/common/conststring.h b/utils/common/conststring.h index 021529bf5..103b84a1c 100644 --- a/utils/common/conststring.h +++ b/utils/common/conststring.h @@ -81,7 +81,7 @@ class ConstString } bool eq(const ConstString& rhs) const { - if (!mStr) + if (!mStr || !rhs.mStr) { return mStr == rhs.mStr; } diff --git a/utils/rowgroup/rowaggregation.cpp b/utils/rowgroup/rowaggregation.cpp index a4d266da0..631c58284 100644 --- a/utils/rowgroup/rowaggregation.cpp +++ b/utils/rowgroup/rowaggregation.cpp @@ -955,6 +955,10 @@ void RowAggregation::initMapData(const Row& rowIn) break; } + case execplan::CalpontSystemCatalog::VARBINARY: + case execplan::CalpontSystemCatalog::BLOB: + fRow.setStringField(rowIn.getConstString(colIn), colOut); + break; case execplan::CalpontSystemCatalog::DOUBLE: case execplan::CalpontSystemCatalog::UDOUBLE: @@ -1306,6 +1310,8 @@ void RowAggregation::doSelectSome(const Row& rowIn, int64_t colIn, int64_t colOu case execplan::CalpontSystemCatalog::CHAR: case execplan::CalpontSystemCatalog::VARCHAR: case execplan::CalpontSystemCatalog::TEXT: + case execplan::CalpontSystemCatalog::CLOB: + case execplan::CalpontSystemCatalog::BLOB: { auto valIn = rowIn.getStringField(colIn); fRow.setStringField(valIn, colOut); @@ -1345,12 +1351,6 @@ void RowAggregation::doSelectSome(const Row& rowIn, int64_t colIn, int64_t colOu break; } - case execplan::CalpontSystemCatalog::CLOB: - case execplan::CalpontSystemCatalog::BLOB: - { - fRow.setVarBinaryField(rowIn.getVarBinaryField(colIn), rowIn.getVarBinaryLength(colIn), colOut); - break; - } default: { idbassert_s(0, "unknown data type in doSelectSome()"); diff --git a/utils/rowgroup/rowgroup.cpp b/utils/rowgroup/rowgroup.cpp index 0b620db45..cfc2aa770 100644 --- a/utils/rowgroup/rowgroup.cpp +++ b/utils/rowgroup/rowgroup.cpp @@ -1107,8 +1107,14 @@ bool Row::equals(const Row& r2, uint32_t lastCol) const cscDataType columnType = getColType(col); if (UNLIKELY(typeHasCollation(columnType))) { + auto c1 = getConstString(col); + auto c2 = r2.getConstString(col); + if (c1.isNull() != c2.isNull()) + { + return false; + } datatypes::Charset cs(getCharset(col)); - if (cs.strnncollsp(getConstString(col), r2.getConstString(col))) + if (cs.strnncollsp(c1, c2)) { return false; } diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h index e1b95b421..50baf681c 100644 --- a/utils/rowgroup/rowgroup.h +++ b/utils/rowgroup/rowgroup.h @@ -562,6 +562,9 @@ class Row inline void setBinaryField(const T* value, uint32_t colIndex); template inline void setBinaryField_offset(const T* value, uint32_t width, uint32_t colIndex); + // XXX: TODO: I'd deprecate these two functions in favor of get/setStringField. + // getSetStringField properly support binary data of up to 4G bytes + // and also provide perfomant interface through use of ConstString. // support VARBINARY // Add 2-byte length at the CHARSET_INFO*beginning of the field. nullptr and zero length field are // treated the same, could use one of the length bit to distinguish these two cases.