From bd4cbb542d1527320d2f7e60cbe0a17d388ceb15 Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Tue, 18 May 2021 11:29:21 +0400 Subject: [PATCH] MCOL-4721 CHAR(1) is not collation-aware for GROUP/DISTINCT --- datatypes/mcs_datatype.h | 9 +++ .../ctype_cmp_char1_latin1_swedish_ci.result | 22 +++++++ .../t/ctype_cmp_char1_latin1_swedish_ci.test | 17 +++++ utils/rowgroup/rowgroup.cpp | 62 +------------------ utils/rowgroup/rowgroup.h | 11 +++- 5 files changed, 61 insertions(+), 60 deletions(-) create mode 100644 mysql-test/columnstore/basic/r/ctype_cmp_char1_latin1_swedish_ci.result create mode 100644 mysql-test/columnstore/basic/t/ctype_cmp_char1_latin1_swedish_ci.test diff --git a/datatypes/mcs_datatype.h b/datatypes/mcs_datatype.h index 4202ddb8e..77f167e92 100644 --- a/datatypes/mcs_datatype.h +++ b/datatypes/mcs_datatype.h @@ -372,6 +372,15 @@ inline bool isCharType(const datatypes::SystemCatalog::ColDataType type) datatypes::SystemCatalog::TEXT == type); } + +inline bool typeHasCollation(const datatypes::SystemCatalog::ColDataType type) +{ + return datatypes::SystemCatalog::VARCHAR == type || + datatypes::SystemCatalog::CHAR == type || + datatypes::SystemCatalog::TEXT == type; +} + + /** convenience function to determine if column type is a * numeric type */ diff --git a/mysql-test/columnstore/basic/r/ctype_cmp_char1_latin1_swedish_ci.result b/mysql-test/columnstore/basic/r/ctype_cmp_char1_latin1_swedish_ci.result new file mode 100644 index 000000000..dca6ea26d --- /dev/null +++ b/mysql-test/columnstore/basic/r/ctype_cmp_char1_latin1_swedish_ci.result @@ -0,0 +1,22 @@ +SET NAMES utf8; +# +# MCOL-4721 CHAR(1) is not collation-aware for GROUP/DISTINCT +# +CREATE TABLE t1 (c1 CHAR(1) CHARACTER SET latin1 COLLATE latin1_swedish_ci); +INSERT INTO t1 VALUES ('a'),('A'); +SELECT c1, COUNT(*) FROM t1 GROUP BY c1; +c1 COUNT(*) +a 2 +SELECT DISTINCT c1 FROM t1; +c1 +a +INSERT INTO t1 VALUES ('ä'),('Ä'),('ã'),('Ã'); +SELECT c1, COUNT(*) FROM t1 GROUP BY c1 ORDER BY c1; +c1 COUNT(*) +a 4 +ä 2 +SELECT DISTINCT c1 FROM t1 ORDER BY c1; +c1 +a +ä +DROP TABLE t1; diff --git a/mysql-test/columnstore/basic/t/ctype_cmp_char1_latin1_swedish_ci.test b/mysql-test/columnstore/basic/t/ctype_cmp_char1_latin1_swedish_ci.test new file mode 100644 index 000000000..59f3eea84 --- /dev/null +++ b/mysql-test/columnstore/basic/t/ctype_cmp_char1_latin1_swedish_ci.test @@ -0,0 +1,17 @@ +--source ../include/have_columnstore.inc +--source ctype_cmp_combinations.inc + +SET NAMES utf8; + +--echo # +--echo # MCOL-4721 CHAR(1) is not collation-aware for GROUP/DISTINCT +--echo # + +CREATE TABLE t1 (c1 CHAR(1) CHARACTER SET latin1 COLLATE latin1_swedish_ci); +INSERT INTO t1 VALUES ('a'),('A'); +SELECT c1, COUNT(*) FROM t1 GROUP BY c1; +SELECT DISTINCT c1 FROM t1; +INSERT INTO t1 VALUES ('ä'),('Ä'),('ã'),('Ã'); +SELECT c1, COUNT(*) FROM t1 GROUP BY c1 ORDER BY c1; +SELECT DISTINCT c1 FROM t1 ORDER BY c1; +DROP TABLE t1; diff --git a/utils/rowgroup/rowgroup.cpp b/utils/rowgroup/rowgroup.cpp index b2dacc98d..ec9c70438 100644 --- a/utils/rowgroup/rowgroup.cpp +++ b/utils/rowgroup/rowgroup.cpp @@ -1130,54 +1130,6 @@ bool Row::equals(const std::string& val, uint32_t col) const return true; } -bool Row::equals(const Row& r2, const std::vector& keyCols) const -{ - for (uint32_t i = 0; i < keyCols.size(); i++) - { - const uint32_t& col = keyCols[i]; - cscDataType columnType = getColType(col); - - if (UNLIKELY(columnType == execplan::CalpontSystemCatalog::VARCHAR || - (columnType == execplan::CalpontSystemCatalog::CHAR && (colWidths[col] > 1)) || - columnType == execplan::CalpontSystemCatalog::TEXT)) - { - CHARSET_INFO* cs = getCharset(col); - if (cs->strnncollsp(getStringPointer(col), getStringLength(col), - r2.getStringPointer(col), r2.getStringLength(col))) - { - return false; - } - } - else if (UNLIKELY(columnType == execplan::CalpontSystemCatalog::BLOB)) - { - if (getStringLength(col) != r2.getStringLength(col)) - return false; - - if (memcmp(getStringPointer(col), r2.getStringPointer(col), getStringLength(col))) - return false; - } - else - { - if (UNLIKELY(columnType == execplan::CalpontSystemCatalog::LONGDOUBLE)) - { - if (getLongDoubleField(col) != r2.getLongDoubleField(col)) - return false; - } - else if (UNLIKELY(datatypes::isWideDecimalType(columnType, colWidths[col]))) - { - if (*getBinaryField(col) != *r2.getBinaryField(col)) - return false; - } - else if (getUintField(col) != r2.getUintField(col)) - { - return false; - } - } - } - - return true; -} - bool Row::equals(const Row& r2, uint32_t lastCol) const { // This check fires with empty r2 only. @@ -1196,9 +1148,7 @@ bool Row::equals(const Row& r2, uint32_t lastCol) const for (uint32_t col = 0; col <= lastCol; col++) { cscDataType columnType = getColType(col); - if (UNLIKELY(columnType == execplan::CalpontSystemCatalog::VARCHAR || - (columnType == execplan::CalpontSystemCatalog::CHAR && (colWidths[col] > 1)) || - columnType == execplan::CalpontSystemCatalog::TEXT)) + if (UNLIKELY(typeHasCollation(columnType))) { CHARSET_INFO* cs = getCharset(col); if (cs->strnncollsp(getStringPointer(col), getStringLength(col), @@ -1304,10 +1254,7 @@ RowGroup::RowGroup(uint32_t colCount, else stOffsets[i + 1] = stOffsets[i] + colWidths[i]; - execplan::CalpontSystemCatalog::ColDataType type = types[i]; - if ((type == execplan::CalpontSystemCatalog::CHAR && (colWidths[i] > 1)) || - type == execplan::CalpontSystemCatalog::VARCHAR || - type == execplan::CalpontSystemCatalog::TEXT) + if (colHasCollation(i)) { hasCollation = true; } @@ -1916,10 +1863,7 @@ RowGroup RowGroup::truncate(uint32_t cols) ret.hasLongStringField = true; } - execplan::CalpontSystemCatalog::ColDataType type = types[i]; - if ((type == execplan::CalpontSystemCatalog::CHAR && (colWidths[i] > 1)) || - type == execplan::CalpontSystemCatalog::VARCHAR || - type == execplan::CalpontSystemCatalog::TEXT) + if (colHasCollation(i)) { ret.hasCollation = true; } diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h index 1d54b5dd2..341d6feb7 100644 --- a/utils/rowgroup/rowgroup.h +++ b/utils/rowgroup/rowgroup.h @@ -372,6 +372,11 @@ public: inline bool isShortString(uint32_t colIndex) const; inline bool isLongString(uint32_t colIndex) const; + bool colHasCollation(uint32_t colIndex) const + { + return datatypes::typeHasCollation(getColType(colIndex)); + } + template inline uint64_t getUintField(uint32_t colIndex) const; inline uint64_t getUintField(uint32_t colIndex) const; template inline int64_t getIntField(uint32_t colIndex) const; @@ -555,7 +560,6 @@ public: inline uint64_t hash() const; // generates a hash for all cols inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const; - bool equals(const Row&, const std::vector& keyColumns) const; bool equals(const Row&, uint32_t lastCol) const; inline bool equals(const Row&) const; @@ -1508,6 +1512,11 @@ public: inline bool isShortString(uint32_t colIndex) const; inline bool isLongString(uint32_t colIndex) const; + bool colHasCollation(uint32_t colIndex) const + { + return datatypes::typeHasCollation(getColType(colIndex)); + } + inline const std::vector& getScale() const; inline const std::vector& getPrecision() const;