MCOL-4173 This patch adds support for wide-DECIMAL INNER, OUTER, SEMI, functional JOINs

based on top of TypelessData
2025-07-29 08:21:15 +03:00 · 2021-02-16 10:23:49 +00:00
parent 4ecd561878
commit bed0b7c6bc
22 changed files with 2347 additions and 228 deletions
--- a/datatypes/mcs_datatype.h
+++ b/datatypes/mcs_datatype.h
@ -411,6 +411,33 @@ inline bool isNumeric(const datatypes::SystemCatalog::ColDataType type)
  }
 }
 // Reports whether the given column data type is one of the integer types:
 // TINYINT, SMALLINT, MEDINT, INT, BIGINT or their unsigned (U-prefixed)
 // counterparts. Every other type yields false.
 inline bool isInteger(const datatypes::SystemCatalog::ColDataType type)
 {
  const bool isSignedInteger =
      type == datatypes::SystemCatalog::TINYINT ||
      type == datatypes::SystemCatalog::SMALLINT ||
      type == datatypes::SystemCatalog::MEDINT ||
      type == datatypes::SystemCatalog::INT ||
      type == datatypes::SystemCatalog::BIGINT;

  const bool isUnsignedInteger =
      type == datatypes::SystemCatalog::UTINYINT ||
      type == datatypes::SystemCatalog::USMALLINT ||
      type == datatypes::SystemCatalog::UMEDINT ||
      type == datatypes::SystemCatalog::UINT ||
      type == datatypes::SystemCatalog::UBIGINT;

  return isSignedInteger || isUnsignedInteger;
 }
 // Reports whether the given column data type is LONGDOUBLE.
 inline bool isLongDouble(const datatypes::SystemCatalog::ColDataType type)
 {
  const bool matchesLongDouble = (datatypes::SystemCatalog::LONGDOUBLE == type);
  return matchesLongDouble;
 }
 inline bool isDecimal(const datatypes::SystemCatalog::ColDataType type)
 {
  return (type == datatypes::SystemCatalog::DECIMAL ||
--- a/datatypes/mcs_decimal.h
+++ b/datatypes/mcs_decimal.h
@ -83,7 +83,7 @@ namespace datatypes
 constexpr uint32_t MAXDECIMALWIDTH = 16U;
 constexpr uint8_t INT64MAXPRECISION = 18U;
 constexpr uint8_t INT128MAXPRECISION = 38U;
-constexpr uint8_t MAXLEGACYWIDTH = 8U;
+constexpr uint32_t MAXLEGACYWIDTH = 8U;
 constexpr uint8_t MAXSCALEINC4AVG = 4U;
 constexpr int8_t IGNOREPRECISION = -1;
--- a/datatypes/mcs_int128.h
+++ b/datatypes/mcs_int128.h
@ -285,6 +285,21 @@ class TSInt128
      return TSInt128(s128Value + rhs.s128Value);
    }
    // True when this object's s128Value is strictly greater than rhs's.
    inline bool operator>(const TSInt128& rhs) const
    {
      // Written as the mirrored "less than" on the same two operands.
      return rhs.s128Value < s128Value;
    }
    // True when this object's s128Value is strictly less than rhs's.
    inline bool operator<(const TSInt128& rhs) const
    {
      const bool lhsIsSmaller = s128Value < rhs.s128Value;
      return lhsIsSmaller;
    }
    // True when the two objects hold different s128Value values.
    // Reads rhs.s128Value directly, the same way the neighbouring
    // comparison operators do.
    inline bool operator!=(const TSInt128& rhs) const
    {
      return s128Value != rhs.s128Value;
    }
    inline TFloat128 toTFloat128() const
    {
      return TFloat128(s128Value);
--- a/datatypes/mcs_int64.h
+++ b/datatypes/mcs_int64.h
@ -51,6 +51,11 @@ public:
  {
    return mValue;
  }
  // Writes mValue, converted to uint64_t, into the 8 bytes starting at dst,
  // in the host's native byte order.
  //
  // dst must point to at least sizeof(uint64_t) writable bytes. It does not
  // have to be 8-byte aligned: the value is copied byte by byte instead of
  // being written through a uint64_t* cast, which would be undefined
  // behaviour for a misaligned destination and would violate strict aliasing.
  void store(uint8_t* dst) const
  {
    // Convert first so exactly sizeof(uint64_t) bytes are written regardless
    // of mValue's declared integer type.
    const uint64_t raw = mValue;
    const uint8_t* src = reinterpret_cast<const uint8_t*>(&raw);
    for (unsigned i = 0; i < sizeof(raw); ++i)
      dst[i] = src[i];
  }
 };
--- a/dbcon/joblist/batchprimitiveprocessor-jl.cpp
+++ b/dbcon/joblist/batchprimitiveprocessor-jl.cpp
@ -1093,6 +1093,7 @@ void BatchPrimitiveProcessorJL::createBPP(ByteStream& bs) const
            cout << "PMJoinerCount = " << PMJoinerCount << endl;
 #endif
            bool smallSideRGSent = false;
            for (i = 0; i < PMJoinerCount; i++)
            {
                bs << (uint32_t) tJoiners[i]->size();
@ -1121,6 +1122,17 @@ void BatchPrimitiveProcessorJL::createBPP(ByteStream& bs) const
                {
                    serializeVector<uint32_t>(bs, tJoiners[i]->getLargeKeyColumns());
                    bs << (uint32_t) tJoiners[i]->getKeyLength();
                    // MCOL-4173 Notify PP if smallSide and largeSide have different column widths
                    // and send smallSide RG to PP.
                    bool joinHasSkewedKeyColumn = tJoiners[i]->joinHasSkewedKeyColumn();
                    bs << joinHasSkewedKeyColumn;
                    if (!smallSideRGSent && joinHasSkewedKeyColumn)
                    {
                        idbassert(!smallSideRGs.empty());
                        bs << smallSideRGs[0];
                        serializeVector<uint32_t>(bs, tJoiners[i]->getSmallKeyColumns());
                        smallSideRGSent = true;
                    }
                }
            }
@ -1606,17 +1618,6 @@ bool BatchPrimitiveProcessorJL::nextTupleJoinerMsg(ByteStream& bs)
        smallSide.setRowCount(toSend);
        tmpData.serialize(bs, smallSide.getDataSize());
        /*
        uint32_t lpos;
        uint8_t *buf;
        bs.needAtLeast(r.getSize() * toSend);
        buf = (uint8_t *) bs.getInputPtr();
        //for (i = pos, lpos = 0; i < pos + toSend; i++, lpos += r.getSize())
        //	memcpy(&buf[lpos], (*tSmallSide)[i], r.getSize());
        bs.advanceInputPtr(r.getSize() * toSend);
        */
    }
    pos += toSend;
--- a/dbcon/joblist/jlf_tuplejoblist.cpp
+++ b/dbcon/joblist/jlf_tuplejoblist.cpp
@ -1,5 +1,5 @@
 /* Copyright (C) 2014 InfiniDB, Inc.
-   Copyright (C) 2019 MariaDB Corporation
+   Copyright (C) 2019-2021 MariaDB Corporation
   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
@ -1480,10 +1480,10 @@ bool addFunctionJoin(vector<uint32_t>& joinedTables, JobStepVector& joinSteps,
            TupleInfo ti1 = getTupleInfo(key1, jobInfo);
            TupleInfo ti2 = getTupleInfo(key2, jobInfo);
-            if (ti1.dtype == CalpontSystemCatalog::CHAR 
+            // Enable Typeless JOIN for char and wide decimal types.
-             || ti1.dtype == CalpontSystemCatalog::VARCHAR 
+            if (datatypes::isCharType(ti1.dtype) ||
-             || ti1.dtype == CalpontSystemCatalog::TEXT)
+                (datatypes::isWideDecimalType(ti1.dtype, ti1.width) ||
-//             || ti1.dtype == CalpontSystemCatalog::LONGDOUBLE)
+                 datatypes::isWideDecimalType(ti2.dtype, ti2.width)))
                m1->second.fTypeless = m2->second.fTypeless = true;  // ti2 is compatible
            else
                m1->second.fTypeless = m2->second.fTypeless = false;
--- a/dbcon/joblist/tuplehashjoin.cpp
+++ b/dbcon/joblist/tuplehashjoin.cpp
@ -1736,13 +1736,13 @@ void TupleHashJoinStep::joinOneRG(uint32_t threadID, vector<RGData>* out,
        {
            (*tjoiners)[j]->match(largeSideRow, k, threadID, &joinMatches[j]);
            /* Debugging code to print the matches
-            	Row r;
+               Row r;
-            	smallRGs[j].initRow(&r);
+               smallRGs[j].initRow(&r);
-            	cout << joinMatches[j].size() << " matches: \n";
+               cout << joinMatches[j].size() << " matches: \n";
-            	for (uint32_t z = 0; z < joinMatches[j].size(); z++) {
+               for (uint32_t z = 0; z < joinMatches[j].size(); z++) {
-            		r.setData(joinMatches[j][z]);
+                   r.setData(joinMatches[j][z]);
-            		cout << "  " << r.toString() << endl;
+                   cout << "  " << r.toString() << endl;
-            	}
+               }
            */
            matchCount = joinMatches[j].size();
--- a/mysql-test/columnstore/future/mcol641-joins.test
+++ b/mysql-test/columnstore/future/mcol641-joins.test
@ -7,6 +7,10 @@ DROP DATABASE IF EXISTS mcol641_joins_db;
 CREATE DATABASE mcol641_joins_db;
 USE mcol641_joins_db;
 --disable_query_log
 SET default_storage_engine=ColumnStore;
 --enable_query_log
 CREATE TABLE cs1 (d1 DECIMAL(38), d2 DECIMAL(38,10), d3 DECIMAL(38,38)) ENGINE=columnstore;
 CREATE TABLE cs2 (de1 DECIMAL(38,38), de2 DECIMAL(38,10)) ENGINE=columnstore;
--- a/mysql-test/columnstore/future/mcol641-skewed-joins.result
+++ b/mysql-test/columnstore/future/mcol641-skewed-joins.result
--- a/mysql-test/columnstore/future/mcol641-skewed-joins.test
+++ b/mysql-test/columnstore/future/mcol641-skewed-joins.test
@ -0,0 +1,328 @@
 -- source ../include/have_columnstore.inc
 -- source ../include/enable_ordered_only.inc
 --disable_warnings
 DROP DATABASE IF EXISTS mcol641_joins_db;
 --enable_warnings
 CREATE DATABASE mcol641_joins_db;
 USE mcol641_joins_db;
 --disable_query_log
 SET default_storage_engine=ColumnStore;
 --enable_query_log
 CREATE TABLE cs1 (d1 DECIMAL(38), d2 DECIMAL(37), id TINYINT);
 CREATE TABLE cs2 (i1 SMALLINT, i2 MEDIUMINT, i3 INT, i4 BIGINT);
 INSERT INTO cs1 VALUES
 (99,0,1),
 (255,254,2),
 (254,253,3),
 (252,253,4),
 (65535,2147483647,5),
 (65534,2147483646,6),
 (65533,65532,7),
 (2147483647,2147483636,8),
 (2147483646,2147483635,9),
 (2147483645,2147483634,10),
 (2147483645,9223372036854775804,11),
 (9223372036854775807,0,12),
 (9223372036854775807,2147483627,13),
 (9223372036854775806,2147483626,14),
 (9223372036854775805,9223372036854775704,15);
 INSERT INTO cs2 VALUES
 (255,254,NULL,NULL),
 (254,253,NULL,NULL),
 (251,251,NULL,NULL),
 (NULL,65535,NULL,NULL),
 (NULL,65535,2147483647,NULL),
 (NULL,65534,2147483646,NULL),
 (NULL,0,2147483641,NULL),
 (NULL,NULL,2147483647,NULL),
 (NULL,NULL,2147483647,2147483636),
 (NULL,NULL,2147483646,2147483635),
 (NULL,NULL,0,2147483641),
 (NULL,NULL,NULL,9223372036854775807),
 (NULL,NULL,2147483627,9223372036854775807),
 (NULL,NULL,2147483626,9223372036854775806),
 (NULL,NULL,0,1);
 # Distributed PrimProc-based JOINs
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i2 ORDER BY id,i3;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i2 ORDER BY id,i3;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i3 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i3 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i4 AND cs2.i3 IS NOT NULL ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i4 AND cs2.i3 IS NOT NULL ORDER BY id;
 # PrimProc-based composite key JOINs
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i1 AND cs1.d2 = cs2.i2 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i1 AND cs1.d2 = cs2.i2 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i2 AND cs1.d2 = cs2.i3 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i2 AND cs1.d2 = cs2.i3 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i3 AND cs1.d2 = cs2.i4 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i3 AND cs1.d2 = cs2.i4 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1 = cs2.i4 AND cs1.d2 = cs2.i3 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1 = cs2.i4 AND cs1.d2 = cs2.i3 ORDER BY id;
 # ExeMgr-based JOINs
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i1 ORDER BY id,i2;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i1 ORDER BY id,i2;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i2 ORDER BY id,i3;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i2 ORDER BY id,i3;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i3 ORDER BY id,i2,i4;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i3 ORDER BY id,i2,i4;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i4 AND s2.i3 IS NOT NULL ORDER BY id,i3;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i4 AND s2.i3 IS NOT NULL ORDER BY id,i3;
 # Functional JOIN
 # Distributed PrimProc-based functional JOINs
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1 = cs2.i1-1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1 = cs2.i1-1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1 = cs2.i2-1 ORDER BY id,i3;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1 = cs2.i2-1 ORDER BY id,i3;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1 = cs2.i3-1 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1 = cs2.i3-1 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1 = cs2.i4-1 AND cs2.i3 IS NOT NULL ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1 = cs2.i4-1 AND cs2.i3 IS NOT NULL ORDER BY id;
 # PrimProc-based composite key JOINs
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1= cs2.i1-1 AND cs1.d2-1= cs2.i2-1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1= cs2.i1-1 AND cs1.d2-1= cs2.i2-1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1= cs2.i2-1 AND cs1.d2-1= cs2.i3-1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1= cs2.i2-1 AND cs1.d2-1= cs2.i3-1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1= cs2.i3-1 AND cs1.d2-1= cs2.i4-1 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1= cs2.i3-1 AND cs1.d2-1= cs2.i4-1 ORDER BY id,i2,i4;
 SELECT cs1.*, cs2.* FROM cs1 INNER JOIN cs2 ON cs1.d1-1= cs2.i4-1 AND cs1.d2-1= cs2.i3-1 ORDER BY id;
 SELECT cs1.*, cs2.* FROM cs2 INNER JOIN cs1 ON cs1.d1-1= cs2.i4-1 AND cs1.d2-1= cs2.i3-1 ORDER BY id;
 # ExeMgr-based JOINs
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i1-1 ORDER BY id,i2;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i1-1 ORDER BY id,i2;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i2-1 ORDER BY id,i3;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i2-1 ORDER BY id,i3;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i3-1 ORDER BY id,i2,i4;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i3-1 ORDER BY id,i2,i4;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 INNER JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i4-1 AND s2.i3 IS NOT NULL ORDER BY id,i3;
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 INNER JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i4-1 AND s2.i3 IS NOT NULL ORDER BY id,i3;
 -- source ../include/disable_ordered_only.inc
 # Skewed OUTER JOIN
 TRUNCATE cs1;
 TRUNCATE cs2;
 INSERT INTO cs1 VALUES
 (99,0,1),
 (255,254,2),
 (254,253,3),
 (252,253,4),
 (-252,253,5),
 (65535,2147483647,5),
 (65534,2147483646,6),
 (65533,65532,7),
 (2147483647,2147483636,8),
 (2147483646,2147483635,9),
 (2147483645,2147483634,10),
 (2147483645,9223372036854775804,11),
 (9223372036854775807,0,12),
 (9223372036854775807,2147483627,13),
 (9223372036854775806,2147483626,14),
 (9223372036854775805,9223372036854775704,15);
 INSERT INTO cs2 VALUES
 (255,254,NULL,NULL),
 (254,253,NULL,NULL),
 (251,251,NULL,NULL),
 (-252,253,NULL,NULL),
 (-250,253,NULL,NULL),
 (NULL,65535,NULL,NULL),
 (NULL,65535,2147483647,NULL),
 (NULL,65534,2147483646,NULL),
 (NULL,0,2147483641,NULL),
 (NULL,NULL,2147483647,NULL),
 (NULL,NULL,2147483647,2147483636),
 (NULL,NULL,2147483646,2147483635),
 (NULL,NULL,0,2147483641),
 (NULL,NULL,NULL,9223372036854775807),
 (NULL,NULL,2147483627,9223372036854775807),
 (NULL,NULL,2147483626,9223372036854775806),
 (NULL,NULL,0,1);
 # Distributed PrimProc-based JOINs
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i2 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i2 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i3 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i3 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i4 AND cs2.i3 IS NOT NULL ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i4 AND cs2.i3 IS NOT NULL ;
 # PrimProc-based composite key JOINs
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i1 AND cs1.d2 = cs2.i2 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i1 AND cs1.d2 = cs2.i2 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i2 AND cs1.d2 = cs2.i3 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i2 AND cs1.d2 = cs2.i3 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i3 AND cs1.d2 = cs2.i4 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i3 AND cs1.d2 = cs2.i4 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1 = cs2.i4 AND cs1.d2 = cs2.i3 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1 = cs2.i4 AND cs1.d2 = cs2.i3 ;
 # ExeMgr-based JOINs
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i2 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i2 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i3 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i3 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1=s2.i4 AND s2.i3 IS NOT NULL ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1=s2.i4 AND s2.i3 IS NOT NULL ;
 # Functional JOIN
 # Distributed PrimProc-based functional JOINs
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1 = cs2.i1-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1 = cs2.i1-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1 = cs2.i2-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1 = cs2.i2-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1 = cs2.i3-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1 = cs2.i3-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1 = cs2.i4-1 AND cs2.i3 IS NOT NULL ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1 = cs2.i4-1 AND cs2.i3 IS NOT NULL ;
 # PrimProc-based composite key JOINs
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1= cs2.i1-1 AND cs1.d2-1= cs2.i2-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1= cs2.i1-1 AND cs1.d2-1= cs2.i2-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1= cs2.i2-1 AND cs1.d2-1= cs2.i3-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1= cs2.i2-1 AND cs1.d2-1= cs2.i3-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1= cs2.i3-1 AND cs1.d2-1= cs2.i4-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1= cs2.i3-1 AND cs1.d2-1= cs2.i4-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs1 LEFT JOIN cs2 ON cs1.d1-1= cs2.i4-1 AND cs1.d2-1= cs2.i3-1 ;
 --sorted_result
 SELECT cs1.*, cs2.* FROM cs2 LEFT JOIN cs1 ON cs1.d1-1= cs2.i4-1 AND cs1.d2-1= cs2.i3-1 ;
 # ExeMgr-based JOINs
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i1-1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i1-1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i2-1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i2-1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i3-1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i3-1 ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs1)s1 LEFT JOIN (SELECT * FROM cs2)s2 ON s1.d1-1=s2.i4-1 AND s2.i3 IS NOT NULL ;
 --sorted_result
 SELECT s1.*,s2.* FROM (SELECT * FROM cs2)s2 LEFT JOIN (SELECT * FROM cs1)s1 ON s1.d1-1=s2.i4-1 AND s2.i3 IS NOT NULL ;
 # Misc skewed JOIN
 CREATE TABLE t1 (a DECIMAL(10,1), b DECIMAL(20,1));
 INSERT INTO t1 VALUES (10.1,20.1);
 CREATE TABLE t2 (a DECIMAL(20,1), b DECIMAL(10,1));
 INSERT INTO t2 VALUES (10.1,20.1);
 SELECT * FROM t1,t2 WHERE t1.a=t2.a AND t1.b=t2.b;
 DROP TABLE t1,t2;
 CREATE TABLE t1 (a CHAR(10), b DECIMAL(10,1));
 INSERT INTO t1 VALUES (10.1,20.1);
 CREATE TABLE t2 (a CHAR(10), b DECIMAL(20,1));
 INSERT INTO t2 VALUES (10.1,20.1);
 SELECT * FROM t1,t2 WHERE t1.a=t2.a AND t1.b=t2.b;
 SELECT * FROM t2,t1 WHERE t1.a=t2.a AND t1.b=t2.b;
 DROP TABLE t1,t2;
 CREATE TABLE t1 (a DECIMAL(10,1), b CHAR(10));
 INSERT INTO t1 VALUES (10.1,20.1);
 CREATE TABLE t2 (a DECIMAL(20,1), b CHAR(10));
 INSERT INTO t2 VALUES (10.1,20.1);
 SELECT * FROM t1,t2 WHERE t1.a=t2.a AND t1.b=t2.b;
 SELECT * FROM t2,t1 WHERE t1.a=t2.a AND t1.b=t2.b;
 SELECT * FROM t2,t1 WHERE (t1.a,t1.b)=(t2.a,t2.b);
 SELECT * FROM t1,t2 WHERE (t1.a,t1.b)=(t2.a,t2.b);
 SELECT * FROM t1 JOIN t2 USING (a,b);
 SELECT * FROM t2 JOIN t1 USING (a,b);
 # Testing the max number of skewed columns in a join.
 DROP TABLE t1,t2;
 CREATE TABLE t1 (a DECIMAL(10,1), b DECIMAL(20,1),a1 DECIMAL(10,1), b1 DECIMAL(20,1),a2 DECIMAL(10,1), b2 DECIMAL(20,1),a3 DECIMAL(10,1), b3 DECIMAL(20,1),a4 DECIMAL(10,1), b4 DECIMAL(20,1),a5 DECIMAL(10,1));
 INSERT INTO t1 VALUES (10.1,20.1,10.1,20.1,10.1,20.1,10.1,20.1,10.1,20.1,10.1);
 CREATE TABLE t2 (a DECIMAL(20,1), b DECIMAL(10,1),a1 DECIMAL(20,1), b1 DECIMAL(10,1),a2 DECIMAL(20,1), b2 DECIMAL(10,1),a3 DECIMAL(20,1), b3 DECIMAL(10,1),a4 DECIMAL(20,1), b4 DECIMAL(10,1),a5 DECIMAL(20,1));
 INSERT INTO t2 VALUES (10.1,20.1,10.1,20.1,10.1,20.1,10.1,20.1,10.1,20.1,10.1);
 # These work b/c the max is 10 skewed columns.
 SELECT * FROM t1 INNER JOIN t2 USING(a,b,a1,b1,a2,b2,a3,b3,a4);
 SELECT * FROM t2 INNER JOIN t1 USING(a,b,a1,b1,a2,b2,a3,b3,a4);
 # These do not.
 #SELECT * FROM t1 INNER JOIN t2 USING(a,b,a1,b1,a2,b2,a3,b3,a4,b4,a5);
 #SELECT * FROM t2 INNER JOIN t1 USING(a,b,a1,b1,a2,b2,a3,b3,a4,b4,a5);
 # Mixing skewed columns with non-skewed.
 DROP TABLE t1,t2;
 CREATE TABLE t1 (a DECIMAL(10,1), t text, b DECIMAL(20,1), i1 int, a1 DECIMAL(10,1), b1 DECIMAL(20,1),a2 DECIMAL(10,1), b2 DECIMAL(20,1),a3 DECIMAL(10,1), b3 DECIMAL(20,1),a4 DECIMAL(10,1), b4 DECIMAL(20,1),a5 DECIMAL(10,1));
 INSERT INTO t1 VALUES (10.1,'some',20.1,42,10.1,20.1,10.1,20.1,10.1,20.1,10.1,20.1,10.1);
 CREATE TABLE t2 (a DECIMAL(20,1), b DECIMAL(10,1), t text, a1 DECIMAL(20,1), i1 int, b1 DECIMAL(10,1),a2 DECIMAL(20,1), b2 DECIMAL(10,1),a3 DECIMAL(20,1), b3 DECIMAL(10,1),a4 DECIMAL(20,1), b4 DECIMAL(10,1),a5 DECIMAL(20,1));
 INSERT INTO t2 VALUES (10.1,20.1,'some',10.1,42,20.1,10.1,20.1,10.1,20.1,10.1,20.1,10.1);
 # These work b/c only 10 of the 12 join columns are skewed (t and i1 are not), and the max is 10 skewed columns.
 SELECT * FROM t1 INNER JOIN t2 USING(a,b,a1,b1,a2,b2,a3,b3,a4,b4,t,i1);
 SELECT * FROM t2 INNER JOIN t1 USING(a,b,a1,b1,a2,b2,a3,b3,a4,b4,t,i1);
 # These do not.
 #SELECT * FROM t1 INNER JOIN t2 USING(a,b,a1,b1,a2,b2,a3,b3,a4,b4,a5,t,i1);
 #SELECT * FROM t2 INNER JOIN t1 USING(a,b,a1,b1,a2,b2,a3,b3,a4,b4,a5,t,i1);
 SELECT t1.a,t1.t,t1.i1 FROM t1 INNER JOIN (SELECT * from t2) s1 USING(a,b);
 SELECT t2.a,t2.t,s1.i1 FROM t2 INNER JOIN (SELECT * from t1) s1 USING(a,b);
 SELECT t1.a,t1.t,t1.i1 FROM t1 INNER JOIN (SELECT * from t2) s1 where t1.a+1=s1.a+1 and t1.b+1=s1.b+1;
 SELECT t2.a,t2.t,t2.i1 FROM t2 INNER JOIN (SELECT * from t1) s1 where t2.a+1=s1.a+1 and t2.b+1=s1.b+1;
 # Clean UP
 DROP DATABASE mcol641_joins_db;
--- a/mysql-test/columnstore/include/disable_ordered_only.inc
+++ b/mysql-test/columnstore/include/disable_ordered_only.inc
@ -0,0 +1,3 @@
 --disable_query_log
 set global columnstore_ordered_only=off;
 --enable_query_log
--- a/mysql-test/columnstore/include/enable_ordered_only.inc
+++ b/mysql-test/columnstore/include/enable_ordered_only.inc
@ -0,0 +1,3 @@
 --disable_query_log
 set global columnstore_ordered_only=on;
 --enable_query_log
--- a/primitives/primproc/batchprimitiveprocessor.cpp
+++ b/primitives/primproc/batchprimitiveprocessor.cpp
@ -129,6 +129,9 @@ BatchPrimitiveProcessor::BatchPrimitiveProcessor() :
    hasFilterStep(false),
    filtOnString(false),
    prefetchThreshold(0),
    mJOINHasSkewedKeyColumn(false),
    mSmallSideRGPtr(nullptr),
    mSmallSideKeyColumnsPtr(nullptr),
    hasDictStep(false),
    sockIndex(0),
    endOfJoinerRan(false),
@ -175,6 +178,9 @@ BatchPrimitiveProcessor::BatchPrimitiveProcessor(ByteStream& b, double prefetch,
    hasFilterStep(false),
    filtOnString(false),
    prefetchThreshold(prefetch),
    mJOINHasSkewedKeyColumn(false),
    mSmallSideRGPtr(nullptr),
    mSmallSideKeyColumnsPtr(nullptr),
    hasDictStep(false),
    sockIndex(0),
    endOfJoinerRan(false),
@ -297,7 +303,6 @@ void BatchPrimitiveProcessor::initBPP(ByteStream& bs)
            for (uint j = 0; j < joinerCount; ++j)
                tJoiners[j].reset(new boost::shared_ptr<TJoiner>[processorThreads]);
            //_pools.reset(new boost::shared_ptr<utils::SimplePool>[joinerCount]);
            tlJoiners.reset(new boost::shared_array<boost::shared_ptr<TLJoiner> >[joinerCount]);
            for (uint j = 0; j < joinerCount; ++j)
                tlJoiners[j].reset(new boost::shared_ptr<TLJoiner>[processorThreads]);
@ -310,8 +315,9 @@ void BatchPrimitiveProcessor::initBPP(ByteStream& bs)
            tJoinerSizes.reset(new std::atomic<uint32_t>[joinerCount]);
            largeSideKeyColumns.reset(new uint32_t[joinerCount]);
            tlLargeSideKeyColumns.reset(new vector<uint32_t>[joinerCount]);
            tlSmallSideKeyColumns.reset(new std::vector<uint32_t>);
            typelessJoin.reset(new bool[joinerCount]);
-            tlKeyLengths.reset(new uint32_t[joinerCount]);
+            tlSmallSideKeyLengths.reset(new uint32_t[joinerCount]);
            storedKeyAllocators.reset(new PoolAllocator[joinerCount]);
            for (uint j = 0; j < joinerCount; ++j)
@ -322,6 +328,7 @@ void BatchPrimitiveProcessor::initBPP(ByteStream& bs)
            joinFEFilters.reset(new scoped_ptr<FuncExpWrapper>[joinerCount]);
            hasJoinFEFilters = false;
            hasSmallOuterJoin = false;
            bool smallSideRGRecvd = false;
            for (i = 0; i < joinerCount; i++)
            {
@ -356,14 +363,31 @@ void BatchPrimitiveProcessor::initBPP(ByteStream& bs)
                else
                {
                    deserializeVector<uint32_t>(bs, tlLargeSideKeyColumns[i]);
-                    bs >> tlKeyLengths[i];
+                    bs >> tlSmallSideKeyLengths[i];
-                    //storedKeyAllocators[i] = PoolAllocator();
+                    bs >> mJOINHasSkewedKeyColumn;
                    // Deser smallSideRG if key data types are different, e.g. INT vs wide-DECIMAL.
                    if (mJOINHasSkewedKeyColumn && !smallSideRGRecvd)
                    {
                        smallSideRGs.emplace_back(rowgroup::RowGroup(bs));
                        // LargeSide key columns number equals to SmallSide key columns number.
                        deserializeVector<uint32_t>(bs, *tlSmallSideKeyColumns);
                        mSmallSideRGPtr = &smallSideRGs[0];
                        mSmallSideKeyColumnsPtr = &(*tlSmallSideKeyColumns);
                        smallSideRGRecvd = true;
                    }
                    for (uint j = 0; j < processorThreads; ++j)
-                        tlJoiners[i][j].reset(new TLJoiner(10,
+                    {
-                                                           TupleJoiner::TypelessDataHasher(&outputRG,
+                        auto tlHasher = TupleJoiner::TypelessDataHasher(&outputRG,
-                                                                               &tlLargeSideKeyColumns[i]),
+                                                                        &tlLargeSideKeyColumns[i],
-                                                           TupleJoiner::TypelessDataComparator(&outputRG,
+                                                                        mSmallSideKeyColumnsPtr,
-                                                                                   &tlLargeSideKeyColumns[i])));
+                                                                        mSmallSideRGPtr);
                        auto tlComparator = TupleJoiner::TypelessDataComparator(&outputRG,
                                                                                &tlLargeSideKeyColumns[i],
                                                                                mSmallSideKeyColumnsPtr,
                                                                                mSmallSideRGPtr);
                        tlJoiners[i][j].reset(new TLJoiner(10, tlHasher, tlComparator));
                    }
                }
            }
@ -610,7 +634,6 @@ void BatchPrimitiveProcessor::addToJoiner(ByteStream& bs)
        if (typelessJoin[joinerNum])
        {
            utils::VLArray<vector<pair<TypelessData, uint32_t> > > tmpBuckets(processorThreads);
            TypelessData tlLargeKey;
            uint8_t nullFlag;
            PoolAllocator &storedKeyAllocator = storedKeyAllocators[joinerNum];
            // this first loop hashes incoming values into vectors that parallel the hash tables.
@ -620,10 +643,20 @@ void BatchPrimitiveProcessor::addToJoiner(ByteStream& bs)
                bs >> nullFlag;
                if (nullFlag == 0)
                {
-                    tlLargeKey.deserialize(bs, storedKeyAllocator);
+                    TypelessData tlSmallSideKey(bs, storedKeyAllocator);
                    if (mJOINHasSkewedKeyColumn)
                        tlSmallSideKey.setSmallSideWithSkewedData();
                    else
                        tlSmallSideKey.setSmallSide();
                    bs >> tlIndex;
-                    bucket = tlLargeKey.hash(outputRG, tlLargeSideKeyColumns[joinerNum]) & ptMask;
+                    // The bucket number corresponds with the index used later inserting TL keys into permanent JOIN hash map.
-                    tmpBuckets[bucket].push_back(make_pair(tlLargeKey, tlIndex));
+                    auto ha = tlSmallSideKey.hash(outputRG,
                                                  tlLargeSideKeyColumns[joinerNum],
                                                  mSmallSideKeyColumnsPtr,
                                                  mSmallSideRGPtr);
                    bucket = ha & ptMask;
                    tmpBuckets[bucket].push_back(make_pair(tlSmallSideKey, tlIndex));
                }
                else
                    ++nullCount;
@ -914,11 +947,6 @@ void BatchPrimitiveProcessor::initProcessor()
        {
            outputRG.initRow(&oldRow);
            outputRG.initRow(&newRow);
            tmpKeyAllocators.reset(new FixedAllocator[joinerCount]);
            for (i = 0; i < joinerCount; i++)
                if (typelessJoin[i])
                    tmpKeyAllocators[i] = FixedAllocator(tlKeyLengths[i], true);
            tSmallSideMatches.reset(new MatchedData[joinerCount]);
            keyColumnProj.reset(new bool[projectCount]);
@ -1126,7 +1154,6 @@ void BatchPrimitiveProcessor::executeTupleJoin()
    uint32_t newRowCount = 0, i, j;
    vector<uint32_t> matches;
    uint64_t largeKey;
    TypelessData tlLargeKey;
    outputRG.getRow(0, &oldRow);
    outputRG.getRow(0, &newRow);
@ -1195,8 +1222,10 @@ void BatchPrimitiveProcessor::executeTupleJoin()
            {
                //cout << " typeless join\n";
                // the null values are not sent by UM in typeless case.  null -> !found
-                tlLargeKey = TypelessData(&oldRow);
+                TypelessData tlLargeKey(&oldRow);
-                uint bucket = oldRow.hashTypeless(tlLargeSideKeyColumns[j]) & ptMask;
+                uint bucket = oldRow.hashTypeless(tlLargeSideKeyColumns[j],
                                                  mSmallSideKeyColumnsPtr,
                                                  mSmallSideRGPtr ? &mSmallSideRGPtr->getColWidths() : nullptr) & ptMask;
                found = tlJoiners[j][bucket]->find(tlLargeKey) != tlJoiners[j][bucket]->end();
                if ((!found && !(joinTypes[j] & (LARGEOUTER | ANTI))) ||
@ -1335,21 +1364,23 @@ void BatchPrimitiveProcessor::executeTupleJoin()
            /* Finally, copy the row into the output */
            if (j == joinerCount)
            {
                // We need to update 8 and 16 bytes in values and wide128Values buffers
                // otherwise unrelated values will be observed in the JOIN-ed output RGData.
                if (i != newRowCount)
                {
                    values[newRowCount] = values[i];
                    if (mJOINHasSkewedKeyColumn)
                        wide128Values[newRowCount] = wide128Values[i];
                    relRids[newRowCount] = relRids[i];
                    copyRow(oldRow, &newRow);
-                    //cout << "joined row: " << newRow.toString() << endl;
+                    //cout << "joined row: " << newRow.toString() << endl; 
                    //memcpy(newRow.getData(), oldRow.getData(), oldRow.getSize());
                }
                newRowCount++;
                newRow.nextRow();
            }
            //else
-            //	cout << "j != joinerCount\n";
+            // cout << "j != joinerCount\n";
        }
    }
@ -2220,7 +2251,6 @@ int BatchPrimitiveProcessor::operator()()
        }
        catch (std::exception& e)
        {
            cerr << "BPP::sendResponse(): " << e.what() << endl;
            break;  // If we make this throw, be sure to do the cleanup at the end
        }
@ -2382,13 +2412,22 @@ SBPP BatchPrimitiveProcessor::duplicate()
        //bpp->_pools = _pools;
        bpp->typelessJoin = typelessJoin;
        bpp->tlLargeSideKeyColumns = tlLargeSideKeyColumns;
        bpp->tlSmallSideKeyColumns = tlSmallSideKeyColumns;
        bpp->tlJoiners = tlJoiners;
-        bpp->tlKeyLengths = tlKeyLengths;
+        bpp->tlSmallSideKeyLengths = tlSmallSideKeyLengths;
        bpp->storedKeyAllocators = storedKeyAllocators;
        bpp->joinNullValues = joinNullValues;
        bpp->doMatchNulls = doMatchNulls;
        bpp->hasJoinFEFilters = hasJoinFEFilters;
        bpp->hasSmallOuterJoin = hasSmallOuterJoin;
        bpp->mJOINHasSkewedKeyColumn = mJOINHasSkewedKeyColumn;
        bpp->mSmallSideRGPtr = mSmallSideRGPtr;
        bpp->mSmallSideKeyColumnsPtr = mSmallSideKeyColumnsPtr;
        if (!getTupleJoinRowGroupData && mJOINHasSkewedKeyColumn)
        {
            idbassert(!smallSideRGs.empty());
            bpp->smallSideRGs.push_back(smallSideRGs[0]);
        }
        if (hasJoinFEFilters)
        {
@ -2714,7 +2753,9 @@ inline void BatchPrimitiveProcessor::getJoinResults(const Row& r, uint32_t jInde
        }
        TypelessData largeKey(&r);
-        bucket = r.hashTypeless(tlLargeSideKeyColumns[jIndex]) & ptMask;
+        bucket = r.hashTypeless(tlLargeSideKeyColumns[jIndex],
                                mSmallSideKeyColumnsPtr,
                                mSmallSideRGPtr ? &mSmallSideRGPtr->getColWidths() : nullptr) & ptMask;
        pair<TLJoiner::iterator, TLJoiner::iterator> range =
            tlJoiners[jIndex][bucket]->equal_range(largeKey);
        for (; range.first != range.second; ++range.first)
--- a/primitives/primproc/batchprimitiveprocessor.h
+++ b/primitives/primproc/batchprimitiveprocessor.h
@ -87,7 +87,6 @@ public:
        std::runtime_error(s) { }
 };
 class BatchPrimitiveProcessor
 {
 public:
@ -184,7 +183,6 @@ private:
    void writeProjectionPreamble();
    void makeResponse();
    void sendResponse();
    /* Used by scan operations to increment the LBIDs in successive steps */
    void nextLBID();
@ -348,13 +346,17 @@ private:
    /* extra typeless join vars & fcns*/
    boost::shared_array<bool> typelessJoin;
    boost::shared_array<std::vector<uint32_t> > tlLargeSideKeyColumns;
    std::shared_ptr<std::vector<uint32_t>> tlSmallSideKeyColumns;
    boost::shared_array<boost::shared_array<boost::shared_ptr<TLJoiner> > > tlJoiners;
-    boost::shared_array<uint32_t> tlKeyLengths;
+    boost::shared_array<uint32_t> tlSmallSideKeyLengths;
    // True if the smallSide and largeSide TypelessData key columns differ, e.g. BIGINT vs DECIMAL(38).
    bool mJOINHasSkewedKeyColumn;
    const rowgroup::RowGroup* mSmallSideRGPtr;
    const std::vector<uint32_t>* mSmallSideKeyColumnsPtr;
    inline void getJoinResults(const rowgroup::Row& r, uint32_t jIndex, std::vector<uint32_t>& v);
    // these allocators hold the memory for the keys stored in tlJoiners
    boost::shared_array<utils::PoolAllocator> storedKeyAllocators;
    // these allocators hold the memory for the large side keys which are short-lived
    boost::scoped_array<utils::FixedAllocator> tmpKeyAllocators;
    /* PM Aggregation */
    rowgroup::RowGroup joinedRG;  // if there's a join, the rows are formatted with this
--- a/utils/common/collation.h
+++ b/utils/common/collation.h
@ -126,7 +126,7 @@ public:
    }
    uint32_t finalize() const
    {
-        return (uint32_t) mPart1;
+        return (uint32_t)mPart1;
    }
 };
--- a/utils/joiner/tuplejoiner.cpp
+++ b/utils/joiner/tuplejoiner.cpp
@ -30,7 +30,6 @@
 #include "lbidlist.h"
 #include "spinlock.h"
 #include "vlarray.h"
 #include "mcs_string.h"
 using namespace std;
@ -42,6 +41,7 @@ using namespace joblist;
 namespace joiner
 {
 // Typed joiner ctor
 TupleJoiner::TupleJoiner(
    const rowgroup::RowGroup& smallInput,
    const rowgroup::RowGroup& largeInput,
@ -145,6 +145,7 @@ TupleJoiner::TupleJoiner(
    nullValueForJoinColumn = smallNullRow.getSignedNullValue(smallJoinColumn);
 }
 // Typeless joiner ctor
 TupleJoiner::TupleJoiner(
    const rowgroup::RowGroup& smallInput,
    const rowgroup::RowGroup& largeInput,
@ -182,67 +183,31 @@ TupleJoiner::TupleJoiner(
        smallNullRow.initToNull();
    }
-    for (i = keyLength = 0; i < smallKeyColumns.size(); i++)
+    keyLength = calculateKeyLength(smallKeyColumns, smallRG, &largeKeyColumns, &largeRG);
    {
        if (smallRG.getColTypes()[smallKeyColumns[i]] == CalpontSystemCatalog::CHAR ||
                smallRG.getColTypes()[smallKeyColumns[i]] == CalpontSystemCatalog::VARCHAR
                ||
                smallRG.getColTypes()[smallKeyColumns[i]] == CalpontSystemCatalog::TEXT)
        {
            keyLength += smallRG.getColumnWidth(smallKeyColumns[i]) + 2;  // +2 for length
            // MCOL-698: if we don't do this LONGTEXT allocates 32TB RAM
            if (keyLength > 65536)
                keyLength = 65536;
        }
        else if (smallRG.getColTypes()[smallKeyColumns[i]] == CalpontSystemCatalog::LONGDOUBLE)
        {
            keyLength += sizeof(long double);
        }
        else
        {
            keyLength += 8;
        }
        // Set bSignedUnsignedJoin if one or more join columns are signed to unsigned compares.
        if (smallRG.isUnsigned(smallKeyColumns[i]) != largeRG.isUnsigned(largeKeyColumns[i]))
        {
            bSignedUnsignedJoin = true;
        }
    }
    // note, 'numcores' is implied by tuplehashjoin on calls to insertRGData().
    // TODO: make it explicit to avoid future confusion.
    storedKeyAlloc.reset(new FixedAllocator[numCores]);
    for (i = 0; i < (uint) numCores; i++)
        storedKeyAlloc[i].setAllocSize(keyLength);
    discreteValues.reset(new bool[smallKeyColumns.size()]);
    cpValues.reset(new vector<int128_t>[smallKeyColumns.size()]);
-    for (i = 0; i < smallKeyColumns.size(); i++)
+    for (i = 0; i < smallKeyColumns.size(); ++i)
    {
-        discreteValues[i] = false;
+        uint32_t smallKeyColumnsIdx = smallKeyColumns[i]; 
-        if (isUnsigned(smallRG.getColTypes()[smallKeyColumns[i]]))
+        auto smallSideColType = smallRG.getColTypes()[smallKeyColumnsIdx];
        // Set bSignedUnsignedJoin if one or more join columns are signed to unsigned compares.
        if (smallRG.isUnsigned(smallKeyColumnsIdx) != largeRG.isUnsigned(largeKeyColumns[i]))
        {
-            if (datatypes::isWideDecimalType(
+            bSignedUnsignedJoin = true;
-                smallRG.getColType(smallKeyColumns[i]),
+        }
-                smallRG.getColumnWidth(smallKeyColumns[i])))
+
-            {
+        discreteValues[i] = false;
-                cpValues[i].push_back((int128_t) -1);
+        if (isUnsigned(smallSideColType))
-                cpValues[i].push_back(0);
+        {
-            }
+            cpValues[i].push_back((int128_t) numeric_limits<uint64_t>::max());
-            else
+            cpValues[i].push_back(0);
            {
                cpValues[i].push_back((int128_t) numeric_limits<uint64_t>::max());
                cpValues[i].push_back(0);
            }
        }
        else
        {
-            if (datatypes::isWideDecimalType(
+            if (datatypes::isWideDecimalType(smallSideColType,
-                smallRG.getColType(smallKeyColumns[i]),
+                                             smallRG.getColumnWidth(smallKeyColumnsIdx)))
                smallRG.getColumnWidth(smallKeyColumns[i])))
            {
                cpValues[i].push_back(utils::maxInt128);
                cpValues[i].push_back(utils::minInt128);
@ -254,6 +219,12 @@ TupleJoiner::TupleJoiner(
            }
        }
    }
    // note, 'numcores' is implied by tuplehashjoin on calls to insertRGData().
    // TODO: make it explicit to avoid future confusion.
    storedKeyAlloc.reset(new FixedAllocator[numCores]);
    for (i = 0; i < (uint) numCores; i++)
        storedKeyAlloc[i].setAllocSize(keyLength);
 }
 TupleJoiner::TupleJoiner() { }
@ -730,10 +701,12 @@ void TupleJoiner::doneInserting()
        typelesshash_t::iterator thit;
        uint32_t i, pmpos = 0, rowCount;
        Row smallRow;
        auto smallSideColIdx = smallKeyColumns[col];
        auto smallSideColType = smallRG.getColType(smallSideColIdx);
        smallRG.initRow(&smallRow);
-        if (smallRow.isCharType(smallKeyColumns[col]))
+        if (smallRow.isCharType(smallSideColIdx))
            continue;
        rowCount = size();
@ -743,7 +716,7 @@ void TupleJoiner::doneInserting()
            pmpos = 0;
        else if (typelessJoin)
            thit = ht[bucket]->begin();
-        else if (smallRG.getColType(smallKeyColumns[0]) == CalpontSystemCatalog::LONGDOUBLE)
+        else if (isLongDouble(smallRG.getColType(smallKeyColumns[0])))
            ldit = ld[bucket]->begin();
        else if (!smallRG.usesStringTable())
            hit = h[bucket]->begin();
@ -761,7 +734,7 @@ void TupleJoiner::doneInserting()
                smallRow.setPointer(thit->second);
                ++thit;
            }
-            else if (smallRG.getColType(smallKeyColumns[col]) == CalpontSystemCatalog::LONGDOUBLE)
+            else if (isLongDouble(smallSideColType))
            {
                while (ldit == ld[bucket]->end())
                    ldit = ld[++bucket]->begin();
@ -783,9 +756,9 @@ void TupleJoiner::doneInserting()
                ++sthit;
            }
-            if (smallRow.getColType(smallKeyColumns[col]) == CalpontSystemCatalog::LONGDOUBLE)
+            if (isLongDouble(smallSideColType))
            {
-                double dval = (double)roundl(smallRow.getLongDoubleField(smallKeyColumns[col]));
+                double dval = (double)roundl(smallRow.getLongDoubleField(smallSideColIdx));
                switch (largeRG.getColType(largeKeyColumns[col]))
                {
                    case CalpontSystemCatalog::DOUBLE:
@ -802,19 +775,18 @@ void TupleJoiner::doneInserting()
                    }
                }
            }
-            else if (datatypes::isWideDecimalType(
+            else if (datatypes::isWideDecimalType(smallSideColType,
-                     smallRow.getColType(smallKeyColumns[col]),
+                                                  smallRow.getColumnWidth(smallSideColIdx)))
                     smallRow.getColumnWidth(smallKeyColumns[col])))
            {
-                uniquer.insert(*((int128_t*)smallRow.getBinaryField<int128_t>(smallKeyColumns[col])));
+                uniquer.insert(smallRow.getTSInt128Field(smallSideColIdx).getValue());
            }
-            else if (smallRow.isUnsigned(smallKeyColumns[col]))
+            else if (smallRow.isUnsigned(smallSideColIdx))
            {
-                uniquer.insert((int64_t)smallRow.getUintField(smallKeyColumns[col]));
+                uniquer.insert((int64_t)smallRow.getUintField(smallSideColIdx));
            }
            else
            {
-                uniquer.insert(smallRow.getIntField(smallKeyColumns[col]));
+                uniquer.insert(smallRow.getIntField(smallSideColIdx));
            }
            CHECKSIZE;
@ -1170,7 +1142,8 @@ void TupleJoiner::updateCPData(const Row& r)
                     r.getColType(colIdx),
                     r.getColumnWidth(colIdx)))
            {
-                uval = *((int128_t*)r.getBinaryField<int128_t>(colIdx));
+    
                uval = r.getTSInt128Field(colIdx).getValue();
            }
            else
            {
@ -1210,7 +1183,7 @@ void TupleJoiner::updateCPData(const Row& r)
                     r.getColType(colIdx),
                     r.getColumnWidth(colIdx)))
            {
-                val = *((int128_t*)r.getBinaryField<int128_t>(colIdx));
+                val = r.getTSInt128Field(colIdx).getValue();
            }
            else
            {
@ -1283,66 +1256,134 @@ public:
    }
 };
-
+class WideDecimalKeyConverter
 class TypelessDataDecoder
 {
-    const uint8_t *mPtr;
+    const Row* mR;
-    const uint8_t *mEnd;
+    uint64_t convertedValue;
-    void checkAvailableData(uint32_t nbytes) const
+    const uint32_t mKeyColId;
-    {
+    uint16_t width;
-        if (mPtr + nbytes > mEnd)
+  public:
-            throw runtime_error("TypelessData is too short");
+    WideDecimalKeyConverter(const Row& r,
-    }
+                            const uint32_t keyColId): mR(&r),
-public:
+                                                      mKeyColId(keyColId),
-    TypelessDataDecoder(const uint8_t* ptr, size_t length)
+                                                      width(datatypes::MAXDECIMALWIDTH)
        :mPtr(ptr), mEnd(ptr + length)
    { }
-    TypelessDataDecoder(const TypelessData &data)
+    bool isConvertedToSmallSideType() const { return width == datatypes::MAXLEGACYWIDTH; }
-        :TypelessDataDecoder(data.data, data.len)
+    int64_t getConvertedTInt64() const { return (int64_t)convertedValue; }
-    { }
+    // Returns true if the value doesn't fit into allowed range for a type.
-    ConstString scanGeneric(uint32_t length)
+    template <typename T, typename AT>
    bool numericRangeCheckAndConvert(const AT& value)
    {
-        checkAvailableData(length);
+        if (value > AT(std::numeric_limits<T>::max()) ||
-        ConstString res((const char *) mPtr, length);
+            value < AT(std::numeric_limits<T>::min()))
-        mPtr += length;
+            return true;
-        return res;
+
        convertedValue = (uint64_t) static_cast<T>(value);
        return false;
    }
-    uint32_t scanStringLength()
+    // As of MCS 6.x there is an assumption that MCS can't join
    // INTEGER keys to non-INTEGER, potentially fractional keys,
    // e.g. BIGINT to DECIMAL(38,1). It can only join BIGINT to DECIMAL(38).
    // convert() checks whether the wide-DECIMAL value overflows the INTEGER
    // type range and sets the internal width to 0 if it does. Otherwise the
    // width is set to 8 and convertedValue is cast to the INTEGER type.
    // This convert() is called in EM to cast smallSide TypelessData
    // if the key columns have a skew, e.g. INT to DECIMAL(38).
    inline WideDecimalKeyConverter&
    convert(const bool otherSideIsIntOrNarrow,
            const execplan::CalpontSystemCatalog::ColDataType otherSideType)
    {
-        checkAvailableData(2);
+        if (otherSideIsIntOrNarrow)
-        uint32_t res = ((uint32_t) mPtr[0]) * 255 + mPtr[1];
+        {
-        mPtr += 2;
+            datatypes::TSInt128 integralPart = mR->getTSInt128Field(mKeyColId);
-        return res;
+
            bool isUnsigned = datatypes::isUnsigned(otherSideType);
            if (isUnsigned)
            {
                width = (numericRangeCheckAndConvert<uint64_t>(integralPart)) ? 0 : datatypes::MAXLEGACYWIDTH;
                return *this;
            }
            width = (numericRangeCheckAndConvert<int64_t>(integralPart)) ? 0 : datatypes::MAXLEGACYWIDTH;
        }
        return *this;
    }
-    ConstString scanString()
+    // Stores the value that might have been converted.
    inline bool store(TypelessData& typelessData,
                      uint32_t& off,
                      const uint32_t keylen) const
    {
-        return scanGeneric(scanStringLength());
+        // convert() sets width to 0 when the value overflows the otherSide
        // column type range; in that case store() returns a TD with len=0.
        // This tells EM to skip this key b/c it won't match at PP. It is
        // possible to skip a smallSide TD this way, but the same can't be
        // done with largeSide b/c of OUTER joins.
        if (!width)
        {
            typelessData.len = 0;
            return true;
        }
        if (off + width > keylen)
            return true;
        switch (width)
        {
            case datatypes::MAXDECIMALWIDTH:
            {
                mR->storeInt128FieldIntoPtr(mKeyColId, &typelessData.data[off]);
                break;
            }
            default:
            {
                datatypes::TUInt64(convertedValue).store(&typelessData.data[off]);
            }
        }
        off += width;
        return false;
    }
 };
-
+// smallSideColWidths is non-nullptr valid pointer only
 // if there is a skew b/w small and large side columns widths.
 uint32 TypelessData::hash(const RowGroup& r,
-                          const std::vector<uint32_t>& keyCols) const
+                          const std::vector<uint32_t>& keyCols,
                          const std::vector<uint32_t>* smallSideKeyColumnsIds,
                          const rowgroup::RowGroup* smallSideRG) const
 {
-    if (mRowPtr)
+    // This part is for largeSide hashing using Row at PP.
-        return mRowPtr->hashTypeless(keyCols);
+    if (!isSmallSide())
    {
        return mRowPtr->hashTypeless(keyCols,
                                     smallSideKeyColumnsIds,
                                     (smallSideRG) ? &smallSideRG->getColWidths() : nullptr);
    }
    // This part is for smallSide hashing at PP.
    TypelessDataDecoder decoder(*this);
    datatypes::MariaDBHasher hasher;
-    for (uint32_t i = 0; i < keyCols.size(); i++)
+    for (auto keyColId: keyCols)
    {
-        switch (r.getColTypes()[keyCols[i]])
+        switch (r.getColTypes()[keyColId])
        {
            case CalpontSystemCatalog::VARCHAR:
            case CalpontSystemCatalog::CHAR:
            case CalpontSystemCatalog::TEXT:
            {
-                CHARSET_INFO *cs= const_cast<RowGroup&>(r).getCharset(keyCols[i]);
+                CHARSET_INFO *cs= const_cast<RowGroup&>(r).getCharset(keyColId);
                hasher.add(cs, decoder.scanString());
                break;
            }
            case CalpontSystemCatalog::DECIMAL:
            {
                const uint32_t width = std::max(r.getColWidths()[keyColId], datatypes::MAXLEGACYWIDTH);
                if (isSmallSideWithSkewedData() || width == datatypes::MAXLEGACYWIDTH)
                {
                    int64_t val = decoder.scanTInt64();
                    hasher.add(&my_charset_bin, reinterpret_cast<const char*>(&val), datatypes::MAXLEGACYWIDTH);
                }
                else
                    hasher.add(&my_charset_bin, decoder.scanGeneric(width));
                break;
            }
            default:
            {
-                hasher.add(&my_charset_bin, decoder.scanGeneric(8));
+                hasher.add(&my_charset_bin, decoder.scanGeneric(datatypes::MAXLEGACYWIDTH));
                break;
            }
        }
@ -1350,41 +1391,84 @@ uint32 TypelessData::hash(const RowGroup& r,
    return hasher.finalize();
 }
-
+// this is smallSide, Row represents largeSide record.
 int TypelessData::cmpToRow(const RowGroup& r,
                           const std::vector<uint32_t>& keyCols,
-                           const rowgroup::Row &row) const
+                           const rowgroup::Row &row,
                           const std::vector<uint32_t> *smallSideKeyColumnsIds,
                           const rowgroup::RowGroup *smallSideRG) const
 {
    TypelessDataDecoder a(*this);
    for (uint32_t i = 0; i < keyCols.size(); i++)
    {
-        switch (r.getColTypes()[keyCols[i]])
+        auto largeSideKeyColRowIdx = keyCols[i];
        switch (r.getColType(largeSideKeyColRowIdx))
        {
            case CalpontSystemCatalog::VARCHAR:
            case CalpontSystemCatalog::CHAR:
            case CalpontSystemCatalog::TEXT:
            {
-                datatypes::Charset cs(*const_cast<RowGroup&>(r).getCharset(keyCols[i]));
+                datatypes::Charset cs(*const_cast<RowGroup&>(r).getCharset(largeSideKeyColRowIdx));
                ConstString ta = a.scanString();
-                ConstString tb = row.getConstString(keyCols[i]);
+                ConstString tb = row.getConstString(largeSideKeyColRowIdx);
                if (int rc= cs.strnncollsp(ta, tb))
                    return rc;
                break;
            }
            case CalpontSystemCatalog::DECIMAL:
            {
                auto largeSideWidth = row.getColumnWidth(largeSideKeyColRowIdx);
                // First branch processes skewed JOIN, e.g. INT to DECIMAL(38)
                // else branch processes decimal with common width at both small- and largeSide.
                if (isSmallSideWithSkewedData() &&
                        largeSideWidth != smallSideRG->getColumnWidth(smallSideKeyColumnsIds->operator[](i)))
                {
                    if (largeSideWidth == datatypes::MAXLEGACYWIDTH)
                    {
                        if (int rc = a.scanTInt64() != row.getIntField(largeSideKeyColRowIdx))
                            return rc;
                    }
                    else
                    {
                        WideDecimalKeyConverter cv(row, largeSideKeyColRowIdx);
                        if (!cv.convert(true,
                                        smallSideRG->getColType(smallSideKeyColumnsIds->operator[](i)))
                               .isConvertedToSmallSideType())
                            return 1;
                        if (int rc = a.scanTInt64() != cv.getConvertedTInt64())
                            return rc;
                    }
                }
                else
                {
                    // There is an assumption that both sides here are equal and are either 8 or 16 bytes.
                    if (largeSideWidth == datatypes::MAXDECIMALWIDTH)
                    {
                        if (int rc = a.scanTInt128() != row.getTSInt128Field(largeSideKeyColRowIdx))
                            return rc;
                    }
                    else
                    {
                        if (int rc = a.scanTInt64() != row.getIntField(largeSideKeyColRowIdx))
                            return rc;
                    }
                }
                break;
            }
            default:
            {
                ConstString ta = a.scanGeneric(datatypes::MAXLEGACYWIDTH);
-                if (r.isUnsigned(keyCols[i]))
+                if (r.isUnsigned(largeSideKeyColRowIdx))
                {
-                    uint64_t tb = row.getUintField(keyCols[i]);
+                    uint64_t tb = row.getUintField(largeSideKeyColRowIdx);
-                    if (int rc= memcmp(ta.str(), &tb , datatypes::MAXLEGACYWIDTH))
+                    if (int rc = memcmp(ta.str(), &tb , datatypes::MAXLEGACYWIDTH))
                        return rc;
                }
                else
                {
-                    int64_t tb = row.getIntField(keyCols[i]);
+                    int64_t tb = row.getIntField(largeSideKeyColRowIdx);
-                    if (int rc= memcmp(ta.str(), &tb , datatypes::MAXLEGACYWIDTH))
+                    if (int rc = memcmp(ta.str(), &tb , datatypes::MAXLEGACYWIDTH))
                        return rc;
                }
                break;
@ -1394,39 +1478,60 @@ int TypelessData::cmpToRow(const RowGroup& r,
    return 0; // Equal
 }
 int TypelessData::cmp(const RowGroup& r, const std::vector<uint32_t>& keyCols,
-                      const TypelessData &da, const TypelessData &db)
+                      const TypelessData &da, const TypelessData &db,
                      const std::vector<uint32_t> *smallSideKeyColumnsIds,
                      const rowgroup::RowGroup *smallSideRG)
 {
-    idbassert((da.mRowPtr == nullptr) + (db.mRowPtr == nullptr) > 0);
+    idbassert(da.isSmallSide() || db.isSmallSide());
-    if (da.mRowPtr)
+    if (!da.isSmallSide() && db.isSmallSide())
-        return -db.cmpToRow(r, keyCols, da.mRowPtr[0]);
+        return -db.cmpToRow(r, keyCols, da.mRowPtr[0], smallSideKeyColumnsIds, smallSideRG);
-    if (db.mRowPtr)
+    if (da.isSmallSide() && !db.isSmallSide())
-        return da.cmpToRow(r, keyCols, db.mRowPtr[0]);
+        return da.cmpToRow(r, keyCols, db.mRowPtr[0], smallSideKeyColumnsIds, smallSideRG);
    // This case happens in BPP::addToJoiner when it populates the final
    // hashmap with multiple smallSide TDs from temp hashmaps.
    idbassert(da.isSmallSide() && db.isSmallSide());
    TypelessDataDecoder a(da);
    TypelessDataDecoder b(db);
-    for (uint32_t i = 0; i < keyCols.size(); i++)
+    for (uint32_t i = 0; i < keyCols.size(); ++i)
    {
-        switch (r.getColTypes()[keyCols[i]])
+        auto keyColIdx = keyCols[i];
        switch (r.getColTypes()[keyColIdx])
        {
            case CalpontSystemCatalog::VARCHAR:
            case CalpontSystemCatalog::CHAR:
            case CalpontSystemCatalog::TEXT:
            {
-                datatypes::Charset cs(*const_cast<RowGroup&>(r).getCharset(keyCols[i]));
+                datatypes::Charset cs(*const_cast<RowGroup&>(r).getCharset(keyColIdx));
                ConstString ta = a.scanString();
                ConstString tb = b.scanString();
                if (int rc= cs.strnncollsp(ta, tb))
                    return rc;
                break;
            }
            case CalpontSystemCatalog::DECIMAL:
            {
                auto largeSideWidth = r.getColumnWidth(keyColIdx);
                // First and second branches processes skewed JOIN, e.g. INT to DECIMAL(38)
                // Third processes decimal with common width at both small- and largeSide.
                auto width = (da.isSmallSideWithSkewedData() &&
                             largeSideWidth != smallSideRG->getColumnWidth(smallSideKeyColumnsIds->operator[](i))) ? datatypes::MAXLEGACYWIDTH : std::max(r.getColWidths()[keyColIdx], datatypes::MAXLEGACYWIDTH);
                ConstString ta = a.scanGeneric(width);
                ConstString tb = b.scanGeneric(width);
                if (int rc= memcmp(ta.str(), tb.str(), width))
                    return rc;
                break;
            }
            default:
            {
-                ConstString ta = a.scanGeneric(8);
+                ConstString ta = a.scanGeneric(datatypes::MAXLEGACYWIDTH);
-                ConstString tb = b.scanGeneric(8);
+                ConstString tb = b.scanGeneric(datatypes::MAXLEGACYWIDTH);
                idbassert(ta.length() == tb.length());
                // It is impossible to join signed to unsigned types now
                // but there is a potential error, e.g. uint64 vs negative int64.
                if (int rc= memcmp(ta.str(), tb.str() , ta.length()))
                    return rc;
                break;
@ -1438,23 +1543,24 @@ int TypelessData::cmp(const RowGroup& r, const std::vector<uint32_t>& keyCols,
 // Called in joblist code to produce SmallSide TypelessData to be sent to PP.
 TypelessData makeTypelessKey(const Row& r, const vector<uint32_t>& keyCols,
                             uint32_t keylen, FixedAllocator* fa,
-                             const rowgroup::RowGroup& otherSideRG, const std::vector<uint32_t>& otherKeyCols)
+                             const rowgroup::RowGroup& otherSideRG,
                             const std::vector<uint32_t>& otherKeyCols)
 {
    TypelessData ret;
    uint32_t off = 0, i;
    execplan::CalpontSystemCatalog::ColDataType type;
    ret.data = (uint8_t*) fa->allocate();
    idbassert(keyCols.size() == otherKeyCols.size());
    for (i = 0; i < keyCols.size(); i++)
    {
        type = r.getColTypes()[keyCols[i]];
-        if (type == CalpontSystemCatalog::VARCHAR ||
+        if (datatypes::isCharType(type))
                type == CalpontSystemCatalog::CHAR ||
                type == CalpontSystemCatalog::TEXT)
        {
            // this is a string, copy a normalized version
            const uint8_t* str = r.getStringPointer(keyCols[i]);
@ -1462,7 +1568,19 @@ TypelessData makeTypelessKey(const Row& r, const vector<uint32_t>& keyCols,
            if (TypelessDataStringEncoder(str, width).store(ret.data, off, keylen))
                goto toolong;
        }
-        else if (r.getColType(keyCols[i]) == CalpontSystemCatalog::LONGDOUBLE)
+        else if (datatypes::isWideDecimalType(type, r.getColumnWidth(keyCols[i])))
        {
            bool otherSideIsIntOrNarrow = otherSideRG.getColumnWidth(otherKeyCols[i]) <= datatypes::MAXLEGACYWIDTH;
            // useless if otherSideIsInt is false
            auto otherSideType = (otherSideIsIntOrNarrow) ? otherSideRG.getColType(otherKeyCols[i])
                                                          : datatypes::SystemCatalog::UNDEFINED;
            if (WideDecimalKeyConverter(r, keyCols[i]).convert(otherSideIsIntOrNarrow, otherSideType)
                                                      .store(ret, off, keylen))
            {
                goto toolong;
            }
        }
        else if (datatypes::isLongDouble(type))
        {
            if (off + sizeof(long double) > keylen)
                goto toolong;
@ -1546,7 +1664,7 @@ toolong:
    return ret;
 }
-
+// The method is used by disk-based JOIN and it is not collation or wide DECIMAL aware.
 uint64_t getHashOfTypelessKey(const Row& r, const vector<uint32_t>& keyCols, uint32_t seed)
 {
    Hasher_r hasher;
@ -1620,14 +1738,7 @@ void TypelessData::serialize(messageqcpp::ByteStream& b) const
 {
    b << len;
    b.append(data, len);
-}
+    // Flags are not sent b/c they are locally significant now.
 void TypelessData::deserialize(messageqcpp::ByteStream& b, utils::FixedAllocator& fa)
 {
    b >> len;
    data = (uint8_t*) fa.allocate(len);
    memcpy(data, b.buf(), len);
    b.advance(len);
 }
 void TypelessData::deserialize(messageqcpp::ByteStream& b, utils::PoolAllocator& fa)
@ -1789,9 +1900,87 @@ boost::shared_ptr<TupleJoiner> TupleJoiner::copyForDiskJoin()
    return ret;
 }
 // Used for Typeless JOIN to detect if there is a JOIN when largeSide is wide-DECIMAL and
 // smallSide is a smaller data type, e.g. INT or narrow-DECIMAL.
 bool TupleJoiner::joinHasSkewedKeyColumn()
 {
    std::vector<uint32_t>::const_iterator largeSideKeyColumnsIter = getLargeKeyColumns().begin();
    std::vector<uint32_t>::const_iterator smallSideKeyColumnsIter = getSmallKeyColumns().begin();
    idbassert(getLargeKeyColumns().size() == getSmallKeyColumns().size());
    while (largeSideKeyColumnsIter != getLargeKeyColumns().end())
    {
        auto smallSideColumnWidth = smallRG.getColumnWidth(*smallSideKeyColumnsIter);
        auto largeSideColumnWidth = largeRG.getColumnWidth(*largeSideKeyColumnsIter);
        bool widthIsDifferent = smallSideColumnWidth != largeSideColumnWidth;
        if (widthIsDifferent && (datatypes::isWideDecimalType(smallRG.getColTypes()[*smallSideKeyColumnsIter], smallSideColumnWidth) ||
                                 datatypes::isWideDecimalType(largeRG.getColTypes()[*largeSideKeyColumnsIter], largeSideColumnWidth)))
        {
            return true;
        }
        ++largeSideKeyColumnsIter;
        ++smallSideKeyColumnsIter;
    }
    return false;
 }
 // Raises the _convertToDiskJoin flag on this joiner. The flag is only set
 // here; nothing else is changed by this call.
 void TupleJoiner::setConvertToDiskJoin()
 {
    _convertToDiskJoin = true;
 }
 // The method is made to reuse the code from Typeless TupleJoiner ctor.
 // It is used in the mentioned ctor and in initBPP() to calculate
 // Typeless key length in case of a JOIN when large side column is INT
 // and small side column is wide-DECIMAL.
 // An important assumption is that if the type is DECIMAL then it must
 // be wide-DECIMAL b/c MCS calls the function running Typeless TupleJoiner
 // ctor.
 //
 // aKeyColumnsIds      - key column ids of the small side.
 // aSmallRowGroup      - small side RowGroup the ids above refer to.
 // aLargeKeyColumnsIds - key column ids of the large side; only read when
 //                       aLargeRowGroup is not null.
 // aLargeRowGroup      - large side RowGroup, or null (that happens in PrimProc).
 //
 // Returns the sum of the per-column key sizes. The result is capped at 65536
 // as soon as a character column pushes the running total above that value.
 uint32_t calculateKeyLength(const std::vector<uint32_t>& aKeyColumnsIds,
                             const rowgroup::RowGroup& aSmallRowGroup,
                             const std::vector<uint32_t>* aLargeKeyColumnsIds,
                             const rowgroup::RowGroup* aLargeRowGroup)
 {
    uint32_t keyLength = 0;
    for (size_t keyColumnIdx = 0; keyColumnIdx < aKeyColumnsIds.size(); ++keyColumnIdx)
    {
        const auto smallSideKeyColumnId = aKeyColumnsIds[keyColumnIdx];
        const auto& smallKeyColumnType = aSmallRowGroup.getColTypes()[smallSideKeyColumnId];

        if (datatypes::isCharType(smallKeyColumnType))
        {
            keyLength += aSmallRowGroup.getColumnWidth(smallSideKeyColumnId) + 2;  // +2 for encoded length
            // MCOL-698: if we don't do this LONGTEXT allocates 32TB RAM
            if (keyLength > 65536)
                return 65536;
        }
        else if (datatypes::isLongDouble(smallKeyColumnType))
        {
            keyLength += sizeof(long double);
        }
        else if (datatypes::isWideDecimalType(smallKeyColumnType,
                                              aSmallRowGroup.getColumnWidth(smallSideKeyColumnId)))
        {
            // Without a large side RowGroup the key keeps the full wide width.
            bool largeSideIsWide = true;
            if (aLargeRowGroup)
            {
                // Both the type and the width of the large side column must be
                // looked up with the large side column id; the small side id
                // indexes a different RowGroup.
                const auto largeSideKeyColumnId = (*aLargeKeyColumnsIds)[keyColumnIdx];
                largeSideIsWide = datatypes::isWideDecimalType(
                    aLargeRowGroup->getColTypes()[largeSideKeyColumnId],
                    aLargeRowGroup->getColumnWidth(largeSideKeyColumnId));
            }
            keyLength += largeSideIsWide
                       ? datatypes::MAXDECIMALWIDTH    // Small=Wide, Large=Wide
                       : datatypes::MAXLEGACYWIDTH;    // Small=Wide, Large=Narrow/xINT
        }
        else
        // The branch covers all datatypes left including skewed DECIMAL JOIN case
        // Small=Wide, Large=Narrow
        {
            keyLength += datatypes::MAXLEGACYWIDTH;
        }
    }
    return keyLength;
 }
 };
--- a/utils/joiner/tuplejoiner.h
+++ b/utils/joiner/tuplejoiner.h
@ -40,32 +40,77 @@
 #include "hasher.h"
 #include "threadpool.h"
 #include "columnwidth.h"
 #include "mcs_string.h"
 namespace joiner
 {
 uint32_t calculateKeyLength(const std::vector<uint32_t>& aKeyColumnsIds,
                            const rowgroup::RowGroup& aRowGroup,
                            const std::vector<uint32_t>* aLargeKeyColumnsIds = nullptr,
                            const rowgroup::RowGroup* aLargeRowGroup = nullptr);
 constexpr uint8_t IS_SMALLSIDE = 0x01; // SmallSide of a JOIN w/o a skew in key columns widths
 constexpr uint8_t IS_SMALLSIDE_SKEWED = 0x02; // SmallSide of a JOIN with a skew in key cols widths
 class TypelessDataDecoder;
 // Key used by Typeless JOINs. The payload is either a byte buffer
 // (data, len) or a pointer to a Row (mRowPtr); both pointers live in the
 // same anonymous union, so only one of them is meaningful at a time.
 class TypelessData
 {
 public:
-    uint8_t* data;
+    union {
        // Byte-buffer form of the key; len holds its size in bytes.
        uint8_t* data;
        // Row-pointer form of the key.
        const rowgroup::Row *mRowPtr;
    };
    uint32_t len;
-    const rowgroup::Row *mRowPtr;
+    // The flags are locally significant in PP now so serialize doesn't send it over the wire.
    uint32_t mFlags;
-    TypelessData() : data(NULL), len(0), mRowPtr(nullptr) { }
+    TypelessData() : data(nullptr), len(0), mFlags(0) { }
-    TypelessData(const rowgroup::Row *rowPtr) : data(NULL), len(0), mRowPtr(rowPtr) { }
+    TypelessData(const rowgroup::Row *rowPtr) : mRowPtr(rowPtr), len(0), mFlags(0) { }
    // Builds the key by deserializing it from bs, using memAllocator for storage.
    TypelessData(messageqcpp::ByteStream& bs, utils::PoolAllocator& memAllocator) : data(nullptr), len(0), mFlags(0)
    {
        deserialize(bs, memAllocator);
    }
    inline bool operator==(const TypelessData&) const;
    void serialize(messageqcpp::ByteStream&) const;
    void deserialize(messageqcpp::ByteStream&, utils::FixedAllocator&);
    void deserialize(messageqcpp::ByteStream&, utils::PoolAllocator&);
    std::string toString() const;
-    uint32_t hash(const rowgroup::RowGroup&, const std::vector<uint32_t>& keyCols) const;
+    uint32_t hash(const rowgroup::RowGroup&,
-    static int cmp(const rowgroup::RowGroup&, const std::vector<uint32_t>& keyCols,
+                  const std::vector<uint32_t>& keyCols,
                  const std::vector<uint32_t> *smallSideKeyColumnsIds,
                  const rowgroup::RowGroup *smallSideRG) const;
    static int cmp(const rowgroup::RowGroup&,
                   const std::vector<uint32_t>& keyCols,
                   const TypelessData &a,
-                   const TypelessData &b);
+                   const TypelessData &b,
-    int cmpToRow(const rowgroup::RowGroup& r, const std::vector<uint32_t>& keyCols,
+                   const std::vector<uint32_t> *smallSideKeyColumnsIds,
-                 const rowgroup::Row &db) const;
+                   const rowgroup::RowGroup *smallSideRG);
    int cmpToRow(const rowgroup::RowGroup& r,
                 const std::vector<uint32_t>& keyCols,
                 const rowgroup::Row &row,
                 const std::vector<uint32_t> *smallSideKeyColumnsIds,
                 const rowgroup::RowGroup *smallSideRG) const;
    // Sets the IS_SMALLSIDE bit in mFlags.
    inline void setSmallSide()
    {
        mFlags |= IS_SMALLSIDE;
    }
    // Sets the IS_SMALLSIDE_SKEWED bit in mFlags.
    inline void setSmallSideWithSkewedData()
    {
        mFlags |= IS_SMALLSIDE_SKEWED;
    }
    // True when either small-side bit (skewed or not) is set.
    inline bool isSmallSide() const
    {
        return mFlags & (IS_SMALLSIDE_SKEWED | IS_SMALLSIDE);
    }
    // True only when the IS_SMALLSIDE_SKEWED bit is set.
    inline bool isSmallSideWithSkewedData() const
    {
        return mFlags & IS_SMALLSIDE_SKEWED;
    }
 };
 // This operator is used in EM only so it doesn't support TD cmp operation
 // using Row pointers.
 inline bool TypelessData::operator==(const TypelessData& t) const
 {
    if (len != t.len)
@ -77,6 +122,57 @@ inline bool TypelessData::operator==(const TypelessData& t) const
    return (memcmp(data, t.data, len) == 0);
 }
 // Sequential reader over the bytes of a TypelessData key.
 // Each scan*() method first verifies that enough bytes remain, throwing
 // runtime_error("TypelessData is too short") otherwise, then returns the
 // decoded value and advances the read position past it.
 class TypelessDataDecoder
 {
    const uint8_t *mPtr;  // current read position
    const uint8_t *mEnd;  // one past the last readable byte
    void checkAvailableData(uint32_t nbytes) const
    {
        // Compare against the number of remaining bytes instead of forming
        // mPtr + nbytes: a pointer beyond one-past-the-end of the buffer is
        // undefined behaviour and may defeat the check.
        if (nbytes > static_cast<size_t>(mEnd - mPtr))
            throw std::runtime_error("TypelessData is too short");
    }
 public:
    // Decodes the byte range [ptr, ptr + length).
    TypelessDataDecoder(const uint8_t* ptr, size_t length)
        :mPtr(ptr), mEnd(ptr + length)
    { }
    // Decodes the byte-buffer form (data, len) of a TypelessData.
    TypelessDataDecoder(const TypelessData &data)
        :TypelessDataDecoder(data.data, data.len)
    { }
    // Returns a view of the next `length` bytes and skips over them.
    utils::ConstString scanGeneric(uint32_t length)
    {
        checkAvailableData(length);
        utils::ConstString res(reinterpret_cast<const char*>(mPtr), length);
        mPtr += length;
        return res;
    }
    // Reads the two-byte string length prefix.
    // NOTE(review): the first byte is weighted by 255 (not 256); this has to
    // mirror whatever the string encoder writes - confirm against the encoder.
    uint32_t scanStringLength()
    {
        checkAvailableData(2);
        uint32_t res = static_cast<uint32_t>(mPtr[0]) * 255 + mPtr[1];
        mPtr += 2;
        return res;
    }
    // Reads a length prefix followed by that many bytes of string data.
    utils::ConstString scanString()
    {
        return scanGeneric(scanStringLength());
    }
    // Reads the next 8 bytes as a native-endian int64_t.
    int64_t scanTInt64()
    {
        checkAvailableData(sizeof(int64_t));
        // The key buffer gives no alignment guarantee at this offset, so copy
        // the bytes out instead of dereferencing a casted pointer.
        int64_t res;
        memcpy(&res, mPtr, sizeof(res));
        mPtr += sizeof(int64_t);
        return res;
    }
    // Reads the next MAXDECIMALWIDTH bytes as a TSInt128.
    datatypes::TSInt128 scanTInt128()
    {
        checkAvailableData(datatypes::MAXDECIMALWIDTH);
        datatypes::TSInt128 res(mPtr);
        mPtr += datatypes::MAXDECIMALWIDTH;
        return res;
    }
 };
 // Comparator for long double in the hash
 class LongDoubleEq
 {
@ -104,10 +200,16 @@ class TypelessDataStructure
 public:
   const rowgroup::RowGroup *mRowGroup;
   const std::vector<uint32_t> *mMap;
   const std::vector<uint32_t> *mSmallSideKeyColumnsIds;
   const rowgroup::RowGroup *mSmallSideRG;
   TypelessDataStructure(const rowgroup::RowGroup *rg,
-                         const std::vector<uint32_t> *map)
+                         const std::vector<uint32_t> *map,
                         const std::vector<uint32_t> *smallSideKeyColumnsIds,
                         const rowgroup::RowGroup *smallSideRG)
       :mRowGroup(rg),
-        mMap(map)
+        mMap(map),
        mSmallSideKeyColumnsIds(smallSideKeyColumnsIds),
        mSmallSideRG(smallSideRG)
   { }
 };
@ -150,12 +252,14 @@ public:
    // Hash functor for TypelessData keys. It stores the RowGroup, the key
    // column map and the small side key columns/RowGroup in its
    // TypelessDataStructure base and forwards them to TypelessData::hash().
    struct TypelessDataHasher: public TypelessDataStructure
    {
        TypelessDataHasher(const rowgroup::RowGroup *rg,
-                           const std::vector<uint32_t> *map)
+                           const std::vector<uint32_t> *map,
-           :TypelessDataStructure(rg, map)
+                           const std::vector<uint32_t> *smallSideKeyColumnsIds,
                           const rowgroup::RowGroup *smallSideRG)
           :TypelessDataStructure(rg, map, smallSideKeyColumnsIds, smallSideRG)
        { }
        // Returns e.hash(...) computed with the stored context.
        inline size_t operator()(const TypelessData& e) const
        {
-            return e.hash(*mRowGroup, *mMap);
+            return e.hash(*mRowGroup, *mMap, mSmallSideKeyColumnsIds, mSmallSideRG);
        }
    };
@ -163,12 +267,14 @@ public:
    {
    public:
        TypelessDataComparator(const rowgroup::RowGroup *rg,
-                               const std::vector<uint32_t> *map)
+                               const std::vector<uint32_t> *map,
-           :TypelessDataStructure(rg, map)
+                               const std::vector<uint32_t> *smallSideKeyColumnsIds,
                               const rowgroup::RowGroup *smallSideRG)
           :TypelessDataStructure(rg, map, smallSideKeyColumnsIds, smallSideRG)
        { }
        bool operator()(const TypelessData& a, const TypelessData& b) const
        {
-            return !TypelessData::cmp(*mRowGroup, *mMap, a, b);
+            return !TypelessData::cmp(*mRowGroup, *mMap, a, b, mSmallSideKeyColumnsIds, mSmallSideRG);
        }
    };
@ -365,6 +471,12 @@ public:
        return nullValueForJoinColumn;
    }
    // Wide-DECIMAL JOIN
    bool joinHasSkewedKeyColumn();
    // Returns the column widths of the small side RowGroup (smallRG),
    // by const reference.
    inline const vector<uint32_t>& getSmallSideColumnsWidths() const
    {
        return smallRG.getColWidths();
    }
    // Disk-based join support
    void clearData();
    boost::shared_ptr<TupleJoiner> copyForDiskJoin();
--- a/utils/messageqcpp/bytestream.cpp
+++ b/utils/messageqcpp/bytestream.cpp
@ -170,6 +170,14 @@ ByteStream& ByteStream::operator<<(const uint8_t b)
    return *this;
 }
 // Appends a bool to the stream via add() and returns *this so that
 // insertions can be chained.
 ByteStream& ByteStream::operator<<(const bool b)
 {
    add(b);
    return *this;
 }
 ByteStream& ByteStream::operator<<(const int16_t d)
 {
    if (fBuf == 0 || (fCurInPtr - fBuf + 2U > fMaxLen + ISSOverhead))
@ -296,6 +304,14 @@ ByteStream& ByteStream::operator>>(uint8_t& b)
    return *this;
 }
 // Extracts a bool from the front of the stream: the value is read with
 // peek(bool&), then the read position moves forward by one byte.
 ByteStream& ByteStream::operator>>(bool& b)
 {
    peek(b);      // throws if the stream holds no data
    ++fCurOutPtr; // consume the byte that peek() just inspected
    return *this;
 }
 ByteStream& ByteStream::operator>>(int16_t& d)
 {
    peek(d);
@ -382,6 +398,15 @@ void ByteStream::peek(uint8_t& b) const
    b = *((int8_t*)fCurOutPtr);
 }
 // Reads a bool from the front of the stream without consuming it.
 // Throws underflow_error when the stream holds less than one byte.
 void ByteStream::peek(bool& b) const
 {
    if (length() < 1)
        throw underflow_error("ByteStream::peek(bool): not enough data in stream to fill datatype");
    // Read the raw byte and normalise it: loading a bool object whose stored
    // byte is neither 0 nor 1 is undefined behaviour, and the stream content
    // is not guaranteed to have been written by the bool inserter.
    b = (*reinterpret_cast<const uint8_t*>(fCurOutPtr) != 0);
 }
 void ByteStream::peek(int16_t& d) const
 {
    if (length() < 2)
--- a/utils/messageqcpp/bytestream.h
+++ b/utils/messageqcpp/bytestream.h
@ -113,6 +113,7 @@ public:
     *	push a uint8_t onto the end of the stream
     */
    EXPORT ByteStream& operator<<(const uint8_t b);
    /**
     *	push a bool onto the end of the stream
     */
    EXPORT ByteStream& operator<<(const bool b);
    /**
     *	push a int16_t onto the end of the stream. The byte order is whatever the native byte order is.
     */
@ -195,6 +196,7 @@ public:
     *	extract a uint8_t from the front of the stream.
     */
    EXPORT ByteStream& operator>>(uint8_t& b);
    /**
     *	extract a bool from the front of the stream.
     */
    EXPORT ByteStream& operator>>(bool& b);
    /**
     *	extract a int16_t from the front of the stream. The byte order is whatever the native byte order is.
     */
@ -273,6 +275,7 @@ public:
     *	Peek at a uint8_t from the front of the stream.
     */
    EXPORT void peek(uint8_t& b) const;
    /**
     *	Peek at a bool from the front of the stream.
     */
    EXPORT void peek(bool& b) const;
    /**
     *	Peek at a int16_t from the front of the stream. The byte order is whatever the native byte order is.
     */
--- a/utils/rowgroup/rowaggregation.cpp
+++ b/utils/rowgroup/rowaggregation.cpp
@ -1068,7 +1068,7 @@ void RowAggregation::makeAggFieldsNull(Row& row)
            case execplan::CalpontSystemCatalog::DECIMAL:
            case execplan::CalpontSystemCatalog::UDECIMAL:
            {
-                int colWidth = fRowGroupOut->getColumnWidth(colOut);
+                uint32_t colWidth = fRowGroupOut->getColumnWidth(colOut);
                if (LIKELY(colWidth == datatypes::MAXDECIMALWIDTH))
                {
                    uint32_t offset = row.getOffset(colOut);
@ -1095,7 +1095,7 @@ void RowAggregation::makeAggFieldsNull(Row& row)
            case execplan::CalpontSystemCatalog::VARBINARY:
            case execplan::CalpontSystemCatalog::BLOB:
            {
-                int colWidth = fRowGroupOut->getColumnWidth(colOut);
+                uint32_t colWidth = fRowGroupOut->getColumnWidth(colOut);
                if (colWidth <= datatypes::MAXLEGACYWIDTH)
                {
--- a/utils/rowgroup/rowgroup.cpp
+++ b/utils/rowgroup/rowgroup.cpp
@ -1320,6 +1320,12 @@ RowGroup& RowGroup::operator=(const RowGroup& r)
    return *this;
 }
 // Constructs a RowGroup from a ByteStream: members first get the defaults
 // listed in the initializer list, then deserialize(bs) is called.
 RowGroup::RowGroup(ByteStream& bs): columnCount(0), data(nullptr), rgData(nullptr), strings(nullptr),
    useStringTable(true), hasCollation(false), hasLongStringField(false), sTableThreshold(20)
 {
    this->deserialize(bs);
 }
 // Empty body: no explicit cleanup is performed here.
 RowGroup::~RowGroup()
 {
 }
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@ -131,6 +131,16 @@ const int16_t rgCommonSize = 8192;
 #pragma warning (disable : 4200)
 #endif
 // Two-step lookup through a pair of vectors: the element of `inner` at
 // position innerIdx is used as the position to read from `outer`.
 // Neither pointer is checked for null and no bounds checks are made.
 template<typename T>
 inline T derefFromTwoVectorPtrs(const std::vector<T>* outer,
                         const std::vector<T>* inner,
                         const T innerIdx)
 {
    const std::vector<T>& indirection = *inner;
    const std::vector<T>& values = *outer;
    return values[indirection[innerIdx]];
 }
 class StringStore
 {
 public:
@ -434,6 +444,7 @@ public:
                                  getPrecision(colIndex));
    }
    inline long double getLongDoubleField(uint32_t colIndex) const;
    inline void storeInt128FieldIntoPtr(uint32_t colIndex, uint8_t* x) const;
    inline void getInt128Field(uint32_t colIndex, int128_t& x) const;
    inline datatypes::TSInt128 getTSInt128Field(uint32_t colIndex) const;
@ -559,12 +570,17 @@ public:
    inline uint64_t hash(uint32_t lastCol) const;  // generates a hash for cols [0-lastCol]
    inline uint64_t hash() const;  // generates a hash for all cols
    inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const;
-    inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t col) const;
+    inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t keyColsIdx,
-    inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols) const
+                                               const std::vector<uint32_t>& keyCols,
                                               const std::vector<uint32_t>* smallSideKeyColumnsIds,
                                               const std::vector<uint32_t>* smallSideColumnsWidths) const;
    // Hashes the key columns of this row for a Typeless JOIN: every entry of
    // keyCols is fed, in order, into one MariaDBHasher and the finalized
    // value is returned. The two small side pointers are passed through
    // unchanged to colUpdateMariaDBHasherTypeless().
    inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols,
                                 const std::vector<uint32_t>* smallSideKeyColumnsIds,
                                 const std::vector<uint32_t>* smallSideColumnsWidths) const
    {
        datatypes::MariaDBHasher h;
        for (uint32_t i = 0; i < keyCols.size(); i++)
-            colUpdateMariaDBHasherTypeless(h, keyCols[i]);
+            colUpdateMariaDBHasherTypeless(h, i, keyCols, smallSideKeyColumnsIds, smallSideColumnsWidths);
        return h.finalize();
    }
@ -950,30 +966,65 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
 }
-inline void Row::colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &h, uint32_t col) const
+inline void Row::colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &h, uint32_t keyColsIdx,
                                                const std::vector<uint32_t>& keyCols,
                                                const std::vector<uint32_t>* smallSideKeyColumnsIds,
                                                const std::vector<uint32_t>* smallSideColumnsWidths) const
 {
-    switch (getColType(col))
+    auto rowKeyColIdx = keyCols[keyColsIdx];
    auto largeSideColType = getColType(rowKeyColIdx);
    switch (largeSideColType)
    {
        case datatypes::SystemCatalog::CHAR:
        case datatypes::SystemCatalog::VARCHAR:
        case datatypes::SystemCatalog::BLOB:
        case datatypes::SystemCatalog::TEXT:
        {
-            CHARSET_INFO *cs = getCharset(col);
+            CHARSET_INFO *cs = getCharset(rowKeyColIdx);
-            h.add(cs, getConstString(col));
+            h.add(cs, getConstString(rowKeyColIdx));
            break;
        }
        case datatypes::SystemCatalog::DECIMAL:
        {
            auto width = getColumnWidth(rowKeyColIdx);
            if (datatypes::isWideDecimalType(largeSideColType,
                                             width))
            {
                bool joinHasSkewedKeyColumn = (smallSideColumnsWidths);
                datatypes::TSInt128 val = getTSInt128Field(rowKeyColIdx);
                if (joinHasSkewedKeyColumn &&
                    width != derefFromTwoVectorPtrs(smallSideColumnsWidths, smallSideKeyColumnsIds, keyColsIdx))
                {
                    if (val.getValue() >= std::numeric_limits<int64_t>::min() &&
                        val.getValue() <= std::numeric_limits<uint64_t>::max())
                    {
                        h.add(&my_charset_bin, (const char*)&val.getValue(), datatypes::MAXLEGACYWIDTH);
                    }
                    else
                        h.add(&my_charset_bin, (const char*)&val.getValue(), datatypes::MAXDECIMALWIDTH);
                }
                else
                    h.add(&my_charset_bin, (const char*)&val.getValue(), datatypes::MAXDECIMALWIDTH);
            }
            else
            {
                int64_t val = getIntField(rowKeyColIdx);
                h.add(&my_charset_bin, (const char*) &val, datatypes::MAXLEGACYWIDTH);
            }
            break;
        }
        default:
        {
-            if (isUnsigned(col))
+            if (isUnsigned(rowKeyColIdx))
            {
-                uint64_t tb = getUintField(col);
+                uint64_t val = getUintField(rowKeyColIdx);
-                h.add(&my_charset_bin, (const char*) &tb, 8);
+                h.add(&my_charset_bin, (const char*) &val, datatypes::MAXLEGACYWIDTH);
            }
            else
            {
-                int64_t val = getIntField(col);
+                int64_t val = getIntField(rowKeyColIdx);
-                h.add(&my_charset_bin, (const char*) &val, 8);
+                h.add(&my_charset_bin, (const char*) &val, datatypes::MAXLEGACYWIDTH);
            }
            break;
@ -981,7 +1032,6 @@ inline void Row::colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &h, uin
    }
 }
 inline void Row::setStringField(const uint8_t* strdata, uint32_t length, uint32_t colIndex)
 {
    uint64_t offset;
@ -1096,6 +1146,11 @@ inline long double Row::getLongDoubleField(uint32_t colIndex) const
    return *((long double*) &data[offsets[colIndex]]);
 }
 // Copies the 128-bit field of column colIndex from the row buffer into the
 // memory x points to, using TSInt128::assignPtrPtr.
 inline void Row::storeInt128FieldIntoPtr(uint32_t colIndex, uint8_t* x) const
 {
    auto* fieldPtr = &data[offsets[colIndex]];
    datatypes::TSInt128::assignPtrPtr(x, fieldPtr);
 }
 inline void Row::getInt128Field(uint32_t colIndex, int128_t& x) const
 {
    datatypes::TSInt128::assignPtrPtr(&x, &data[offsets[colIndex]]);
@ -1489,6 +1544,8 @@ public:
    /** @brief Assignment operator.  It copies metadata, not the row data */
    RowGroup& operator=(const RowGroup&);
    explicit RowGroup(messageqcpp::ByteStream& bs);
    ~RowGroup();
    inline void initRow(Row*, bool forceInlineData = false) const;