From 8c360a1a27ed6bc2efd5d23f8136302c8b14cdc1 Mon Sep 17 00:00:00 2001
From: Roman Nozdrin <rnozdrin@mariadb.com>
Date: Thu, 24 Jun 2021 14:38:01 +0000
Subject: [PATCH] MCOL-4759 Upmerge of the MCOL-4564 code that implements a
 hash-merging family to reduce the performance penalty of MDB hashing functions

---
 utils/common/hashfamily.h |  57 +++++++++++++++++++
 utils/rowgroup/rowgroup.h | 117 +++++++++++++++++++++-----------------
 2 files changed, 122 insertions(+), 52 deletions(-)
 create mode 100644 utils/common/hashfamily.h

diff --git a/utils/common/hashfamily.h b/utils/common/hashfamily.h
new file mode 100644
index 000000000..b324ea856
--- /dev/null
+++ b/utils/common/hashfamily.h
@@ -0,0 +1,57 @@
+/* Copyright (C) 2021 Mariadb Corporation.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#ifndef UTILS_HASHFAMILY_H
+#define UTILS_HASHFAMILY_H
+
+#include "hasher.h"
+#include "collation.h"
+
+namespace utils
+{
+
+class HashFamily  // Merges the results of a Hasher_r and a MariaDBHasher into one 64-bit value.
+{
+  public:
+    HashFamily(const utils::Hasher_r& h,              // stored by reference, not copied
+               const uint64_t intermediateHash,       // first argument later given to h.finalize()
+               const uint64_t len,                    // second argument later given to h.finalize()
+               const datatypes::MariaDBHasher& hM) : mHasher(h),  // hM is stored by reference too
+                                                     mMariaDBHasher(hM),
+                                                     mHasher_rHash(intermediateHash),
+                                                     mHasher_rLen(len)  // NOTE: uint64_t narrowed to uint32_t
+    { }
+
+    // Algorithm, seed and factor are taken from this discussion
+    // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations
+    inline uint64_t finalize() const  // (seed * factor + Hasher_r result) * factor + MariaDBHasher result
+    {
+      return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize();
+    }
+  private:
+    constexpr static uint64_t seed = 1009ULL;    // starting value of the combination
+    constexpr static uint64_t factor = 9176ULL;  // multiplier applied before each hash is added
+
+    const utils::Hasher_r& mHasher;                  // reference member: referent must outlive this object
+    const datatypes::MariaDBHasher& mMariaDBHasher;  // reference member: referent must outlive this object
+    const uint64_t mHasher_rHash;  // value passed as first argument to mHasher.finalize()
+    const uint32_t mHasher_rLen;   // value passed as second argument to mHasher.finalize()
+};
+
+}
+#endif
+// vim:ts=2 sw=2:
diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
index 5f2322d9b..79a0f4b29 100644
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@@ -60,7 +60,7 @@
 #include "../winport/winport.h"
 
 #include "collation.h"
-
+#include "common/hashfamily.h"
 
 // Workaround for my_global.h #define of isnan(X) causing a std::std namespace
 
@@ -70,57 +70,57 @@ namespace rowgroup
 const int16_t rgCommonSize = 8192;
 
 /*
-    The RowGroup family of classes encapsulate the data moved through the 
+    The RowGroup family of classes encapsulate the data moved through the
     system.
-    
+
      - RowGroup specifies the format of the data primarily (+ some other metadata),
      - RGData (aka RowGroup Data) encapsulates the data,
      - Row is used to extract fields from the data and iterate.
-    
+
     JobListFactory instantiates the RowGroups to be used by each stage of processing.
-    RGDatas are passed between stages, and their RowGroup instances are used 
+    RGDatas are passed between stages, and their RowGroup instances are used
     to interpret them.
-    
+
     Historically, row data was just a chunk of contiguous memory, a uint8_t *.
-    Every field had a fixed width, which allowed for quick offset 
+    Every field had a fixed width, which allowed for quick offset
     calculation when assigning or retrieving individual fields.  That worked
     well for a few years, but at some point it became common to declare
     all strings as max-length, and to manipulate them in queries.
-    
-    Having fixed-width fields, even for strings, required an unreasonable 
-    amount of memory.  RGData & StringStore were introduced to handle strings 
+
+    Having fixed-width fields, even for strings, required an unreasonable
+    amount of memory.  RGData & StringStore were introduced to handle strings
     more efficiently, at least with respect to memory.  The row data would
-    still be a uint8_t *, and columns would be fixed-width, but string fields 
-    above a certain width would contain a 'Pointer' that referenced a string in 
-    StringStore.  Strings are stored efficiently in StringStore, so there is 
+    still be a uint8_t *, and columns would be fixed-width, but string fields
+    above a certain width would contain a 'Pointer' that referenced a string in
+    StringStore.  Strings are stored efficiently in StringStore, so there is
     no longer wasted space.
-    
-    StringStore comes with a different inefficiency however.  When a value 
-    is overwritten, the original string cannot be freed independently of the 
-    others, so it continues to use space.  If values are only set once, as is 
-    the typical case, then StringStore is efficient.  When it is necessary 
-    to overwrite string fields, it is possible to configure these classes 
-    to use the original data format so that old string fields do not accumulate 
-    in memory.  Of course, be careful, because blobs and text fields in CS are 
+
+    StringStore comes with a different inefficiency however.  When a value
+    is overwritten, the original string cannot be freed independently of the
+    others, so it continues to use space.  If values are only set once, as is
+    the typical case, then StringStore is efficient.  When it is necessary
+    to overwrite string fields, it is possible to configure these classes
+    to use the original data format so that old string fields do not accumulate
+    in memory.  Of course, be careful, because blobs and text fields in CS are
     declared as 2GB strings!
-    
+
     A single RGData contains up to one 'logical block' worth of data,
     which is 8192 rows.  One RGData is usually treated as one unit of work by
-    PrimProc and the JobSteps, but the rows an RGData contains and how many are 
+    PrimProc and the JobSteps, but the rows an RGData contains and how many are
     treated as a work unit depend on the operation being done.
-    
-    For example, PrimProc works in units of 8192 contiguous rows 
-    that come from disk.  If half of the rows were filtered out, then the 
+
+    For example, PrimProc works in units of 8192 contiguous rows
+    that come from disk.  If half of the rows were filtered out, then the
     RGData it passes to the next stage would only contain 4096 rows.
 
-    Others build results incrementally before passing them along, such as 
-    group-by.  If one group contains 11111 values, then group-by will 
+    Others build results incrementally before passing them along, such as
+    group-by.  If one group contains 11111 values, then group-by will
     return 2 RGDatas for that group, one with 8192 rows, and one with 2919.
-    
+
     Note: There is no synchronization in any of these classes for obvious
-    performance reasons.  Likewise, although it's technically safe for many 
-    readers to access an RGData simultaneously, that would not be an 
-    efficient thing to do.  Try to stick to designs where a single RGData 
+    performance reasons.  Likewise, although it's technically safe for many
+    readers to access an RGData simultaneously, that would not be an
+    efficient thing to do.  Try to stick to designs where a single RGData
     is used by a single thread at a time.
 */
 
@@ -138,7 +138,7 @@ inline T derefFromTwoVectorPtrs(const std::vector<T>* outer,
                          const T innerIdx)
 {
     auto outerIdx = inner->operator[](innerIdx);
-    return outer->operator[](outerIdx); 
+    return outer->operator[](outerIdx);
 }
 
 class StringStore
@@ -375,7 +375,7 @@ public:
     inline execplan::CalpontSystemCatalog::ColDataType* getColTypes();
     inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const;
     inline uint32_t getCharsetNumber(uint32_t colIndex) const;
-    
+
     // this returns true if the type is not CHAR or VARCHAR
     inline bool isCharType(uint32_t colIndex) const;
     inline bool isUnsigned(uint32_t colIndex) const;
@@ -429,7 +429,7 @@ public:
     inline bool equals(long double val, uint32_t colIndex) const;
     bool equals(const std::string& val, uint32_t colIndex) const;
     inline bool equals(const int128_t& val, uint32_t colIndex) const;
-    
+
     inline double getDoubleField(uint32_t colIndex) const;
     inline float getFloatField(uint32_t colIndex) const;
     inline datatypes::Decimal getDecimalField(uint32_t colIndex) const
@@ -513,7 +513,7 @@ public:
     inline T* getBinaryField(T* argtype, uint32_t colIndex) const;
     template <typename T>
     inline T* getBinaryField_offset(uint32_t offset) const;
-    
+
     inline boost::shared_ptr<mcsv1sdk::UserData> getUserData(uint32_t colIndex) const;
     inline void setUserData(mcsv1sdk::mcsv1Context& context,
                             boost::shared_ptr<mcsv1sdk::UserData> userData,
@@ -569,18 +569,21 @@ public:
     // a fcn to check the type defs seperately doesn't exist yet.  No normalization.
     inline uint64_t hash(uint32_t lastCol) const;  // generates a hash for cols [0-lastCol]
     inline uint64_t hash() const;  // generates a hash for all cols
-    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const;
-    inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t keyColsIdx,
-                                               const std::vector<uint32_t>& keyCols,
-                                               const std::vector<uint32_t>* smallSideKeyColumnsIds,
-                                               const std::vector<uint32_t>* smallSideColumnsWidths) const;
+    inline void colUpdateHasher(datatypes::MariaDBHasher& hM,
+                                const utils::Hasher_r& h,
+                                const uint32_t col,
+                                uint32_t& intermediateHash) const;
+    inline void colUpdateHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t keyColsIdx,
+                                        const std::vector<uint32_t>& keyCols,
+                                        const std::vector<uint32_t>* smallSideKeyColumnsIds,
+                                        const std::vector<uint32_t>* smallSideColumnsWidths) const;
     inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols,
                                  const std::vector<uint32_t>* smallSideKeyColumnsIds,
                                  const std::vector<uint32_t>* smallSideColumnsWidths) const
     {
         datatypes::MariaDBHasher h;
         for (uint32_t i = 0; i < keyCols.size(); i++)
-            colUpdateMariaDBHasherTypeless(h, i, keyCols, smallSideKeyColumnsIds, smallSideColumnsWidths);
+            colUpdateHasherTypeless(h, i, keyCols, smallSideKeyColumnsIds, smallSideColumnsWidths);
         return h.finalize();
     }
 
@@ -591,7 +594,7 @@ public:
     {
         userDataStore = u;
     }
-    
+
     const CHARSET_INFO* getCharset(uint32_t col) const;
 
 private:
@@ -946,7 +949,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const
 }
 
 
-inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const
+inline void Row::colUpdateHasher(datatypes::MariaDBHasher& hM,
+                                 const utils::Hasher_r& h,
+                                 const uint32_t col,
+                                 uint32_t& intermediateHash) const
 {
     switch (getColType(col))
     {
@@ -956,17 +962,19 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
         case execplan::CalpontSystemCatalog::TEXT:
         {
             CHARSET_INFO *cs = getCharset(col);
-            h.add(cs, getConstString(col));
+            hM.add(cs, getConstString(col));
             break;
         }
         default:
-            h.add(&my_charset_bin, getShortConstString(col));
+        {
+            intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash);
             break;
+        }
     }
 }
 
 
-inline void Row::colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &h, uint32_t keyColsIdx,
+inline void Row::colUpdateHasherTypeless(datatypes::MariaDBHasher &h, uint32_t keyColsIdx,
                                                 const std::vector<uint32_t>& keyCols,
                                                 const std::vector<uint32_t>* smallSideKeyColumnsIds,
                                                 const std::vector<uint32_t>* smallSideColumnsWidths) const
@@ -1472,7 +1480,12 @@ inline uint64_t Row::hash() const
 
 inline uint64_t Row::hash(uint32_t lastCol) const
 {
-    datatypes::MariaDBHasher h;
+    // Use two hash classes. MariaDBHasher for text-based
+    // collation-aware data types and Hasher_r for all other data types.
+    // We deliver a hash that is a combination of both hashers' results.
+    utils::Hasher_r h;
+    datatypes::MariaDBHasher hM;
+    uint32_t intermediateHash = 0;
 
     // Sometimes we ask this to hash 0 bytes, and it comes through looking like
     // lastCol = -1.  Return 0.
@@ -1480,9 +1493,9 @@ inline uint64_t Row::hash(uint32_t lastCol) const
         return 0;
 
     for (uint32_t i = 0; i <= lastCol; i++)
-      colUpdateMariaDBHasher(h, i);
+        colUpdateHasher(hM, h, i, intermediateHash);
 
-    return h.finalize();
+    return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize();
 }
 
 inline bool Row::equals(const Row& r2) const
@@ -1661,7 +1674,7 @@ public:
                             uint16_t* blockNum);
 
     inline void setStringStore(boost::shared_ptr<StringStore>);
-    
+
     const CHARSET_INFO* getCharset(uint32_t col);
 
 private:
@@ -1682,7 +1695,7 @@ private:
     // For string collation
     std::vector<uint32_t> charsetNumbers;
     std::vector<CHARSET_INFO*> charsets;
-    
+
     // DECIMAL support.  For non-decimal fields, the values are 0.
     std::vector<uint32_t> scale;
     std::vector<uint32_t> precision;